smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,811 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
import glob
|
|
7
|
+
import time
|
|
8
|
+
from typing import Dict, List, Any, Tuple, Union, Optional, Iterable
|
|
9
|
+
import re
|
|
10
|
+
from itertools import zip_longest
|
|
11
|
+
import pysam
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import concurrent.futures
|
|
15
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
16
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
17
|
+
|
|
18
|
+
from tqdm import tqdm
|
|
19
|
+
from collections import defaultdict, Counter
|
|
20
|
+
|
|
21
|
+
from ..readwrite import make_dirs, time_string, date_string
|
|
22
|
+
|
|
23
|
+
def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
    """
    Minimal BAM->FASTQ using pysam. Writes unmapped or unaligned reads as-is.

    Each record becomes one standard four-line FASTQ entry. Records that carry
    no stored base qualities are emitted with a uniform low-quality string.
    """

    def _encode_qualities(scores) -> str:
        # Phred+33 encoding, clamped to [0, 93] so every emitted character
        # stays within the printable ASCII range.
        return "".join(chr(min(max(int(score), 0), 93) + 33) for score in scores)

    src = str(bam_path)
    dst = str(fastq_path)
    with pysam.AlignmentFile(src, "rb", check_sq=False) as bam, open(dst, "w", encoding="utf-8") as out:
        for record in bam.fetch(until_eof=True):
            # Optionally skip secondary/supplementary:
            # if record.is_secondary or record.is_supplementary:
            #     continue

            read_name = record.query_name or ""
            bases = record.query_sequence or ""

            # Numeric qualities may be absent (None) on some records.
            scores = record.query_qualities
            if scores is None:
                # Fallback: fill with the lowest quality character ("!").
                quality = "!" * len(bases)
            else:
                quality = _encode_qualities(scores)

            out.write(f"@{read_name}\n{bases}\n+\n{quality}\n")
|
|
54
|
+
|
|
55
|
+
def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
    """Coordinate-sort a BAM via pysam's samtools-style ``sort`` wrapper."""
    # Build samtools-style CLI arguments; "-@" is only passed when a
    # thread count was supplied.
    cli_args: List[str] = ["-@", str(threads)] if threads else []
    cli_args.extend(["-o", str(out_bam), str(in_bam)])
    pysam.sort(*cli_args)
|
|
62
|
+
|
|
63
|
+
def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
    """Create a BAM index via pysam; passes ``-@ <threads>`` when given."""
    target = str(bam_path)
    # pysam.index accepts samtools-style positional arguments.
    extra = ["-@", str(threads)] if threads else []
    pysam.index(*extra, target)
|
|
70
|
+
|
|
71
|
+
def align_and_sort_BAM(fasta,
                       input,
                       cfg,
                       ):
    """
    A wrapper for running an aligner (minimap2 or dorado) and pysam sort/index functions.

    Parameters:
        fasta (Path): File path to the reference genome to align to.
        input (Path): File path to the basecalled file to align. Works for .bam and .fastq files
        cfg: The configuration object. Reads: output_directory, bam_suffix, threads,
             aligner, aligner_args, align_from_bam.

    Returns:
        None
        The function writes out files for: 1) An aligned BAM, 2) an aligned_sorted BAM, 3) an index file for the aligned_sorted BAM.

    Raises:
        subprocess.CalledProcessError: if the aligner exits with a non-zero status.
    """
    # Temporary FASTQ path used when minimap2 input must be converted from BAM.
    input_as_fastq = input.with_name(input.stem + '.fastq')

    output_path_minus_suffix = cfg.output_directory / input.stem

    aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
    aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
    aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
    aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)

    # Normalize the thread count to a CLI-friendly string (None -> tool default).
    threads = str(cfg.threads) if cfg.threads else None

    if cfg.aligner == 'minimap2':
        if not cfg.align_from_bam:
            # minimap2 cannot read BAM input directly, so round-trip through FASTQ.
            print(f"Converting BAM to FASTQ: {input}")
            _bam_to_fastq_with_pysam(input, input_as_fastq)
            print(f"Aligning FASTQ to Reference: {input_as_fastq}")
            mm_input = input_as_fastq
        else:
            print(f"Aligning BAM to Reference: {input}")
            mm_input = input

        minimap_command = ['minimap2'] + cfg.aligner_args
        if threads:
            minimap_command += ['-t', threads]
        minimap_command += [str(fasta), str(mm_input)]
        # BUGFIX: the output handle is now closed deterministically (it previously
        # leaked via stdout=open(...)), and check=True surfaces aligner failures
        # instead of silently continuing with an empty/partial BAM.
        with open(aligned_output, "wb") as out_fh:
            subprocess.run(minimap_command, stdout=out_fh, check=True)

        if not cfg.align_from_bam:
            # Clean up the temporary FASTQ produced above.
            os.remove(input_as_fastq)

    elif cfg.aligner == 'dorado':
        # Run dorado aligner
        print(f"Aligning BAM to Reference: {input}")
        alignment_command = ["dorado", "aligner"]
        if threads:
            alignment_command += ["-t", threads]
        alignment_command += cfg.aligner_args + [str(fasta), str(input)]
        with open(aligned_output, "wb") as out_fh:
            subprocess.run(alignment_command, stdout=out_fh, check=True)

    else:
        print(f'Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado')
        return

    # --- Sort & Index with pysam ---
    print(f"[pysam] Sorting: {aligned_output} -> {aligned_sorted_output}")
    _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)

    print(f"[pysam] Indexing: {aligned_sorted_output}")
    _index_bam_with_pysam(aligned_sorted_output, threads=threads)
|
|
141
|
+
|
|
142
|
+
def bam_qc(
    bam_files: Iterable[str | Path],
    bam_qc_dir: str | Path,
    threads: Optional[int],
    modality: str,
    stats: bool = True,
    flagstats: bool = True,
    idxstats: bool = True,
) -> None:
    """
    QC for BAM/CRAMs: stats, flagstat, idxstats.
    Prefers pysam; falls back to `samtools` if needed.
    Runs BAMs in parallel (up to `threads`, default serial).

    Parameters:
        bam_files: Paths of BAM/CRAM files to QC.
        bam_qc_dir: Output directory for per-file *_stats.txt / *_flagstat.txt /
            *_idxstats.txt reports (created if missing).
        threads: Max files processed concurrently; None/0 -> serial.
        modality: Experiment modality; only validated against {"conversion", "direct"}
            with a warning (kept for signature stability).
        stats / flagstats / idxstats: Toggle each report type.

    Returns:
        None. Writes report files and prints a per-file summary.

    Raises:
        RuntimeError: if neither pysam nor a samtools executable is available.
    """
    import subprocess
    import shutil

    # Try to import pysam once
    try:
        import pysam
        HAVE_PYSAM = True
    except Exception:
        HAVE_PYSAM = False

    # BUGFIX: validate tooling ONCE, before any work is scheduled. Previously this
    # check lived inside the per-file worker, so it executed repeatedly in threads
    # and only after indexing had already been attempted.
    if not HAVE_PYSAM and not shutil.which("samtools"):
        raise RuntimeError("Neither pysam nor samtools is available in PATH.")

    bam_qc_dir = Path(bam_qc_dir)
    bam_qc_dir.mkdir(parents=True, exist_ok=True)

    bam_files = [Path(b) for b in bam_files]

    def _has_index(p: Path) -> bool:
        # Accept either name.bai or name.bam.bai for BAMs; name.cram.crai for CRAMs.
        if p.suffix.lower() == ".bam":
            bai = p.with_suffix(p.suffix + ".bai")
            bai_alt = Path(str(p) + ".bai")
            return bai.exists() or bai_alt.exists()
        if p.suffix.lower() == ".cram":
            crai = Path(str(p) + ".crai")
            return crai.exists()
        return False

    def _ensure_index(p: Path) -> None:
        # idxstats needs an index; stats/flagstat do not.
        if _has_index(p):
            return
        if HAVE_PYSAM:
            # pysam.index supports both BAM & CRAM
            pysam.index(str(p))
        else:
            cmd = ["samtools", "index", str(p)]
            subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    def _run_one(bam: Path) -> Tuple[Path, List[Tuple[str, int]]]:
        # outputs + return (file, [(task_name, returncode)])
        results: List[Tuple[str, int]] = []
        base = bam.stem  # filename without .bam
        out_stats = bam_qc_dir / f"{base}_stats.txt"
        out_flag = bam_qc_dir / f"{base}_flagstat.txt"
        out_idx = bam_qc_dir / f"{base}_idxstats.txt"

        # Make sure index exists (samtools stats/flagstat don't require, idxstats does)
        try:
            _ensure_index(bam)
        except Exception as e:
            # Still attempt stats/flagstat if requested
            print(f"[warn] Indexing failed for {bam}: {e}")

        # Choose runner per task: pysam API when available, samtools CLI otherwise.
        def run_stats():
            if not stats:
                return
            if HAVE_PYSAM and hasattr(pysam, "stats"):
                txt = pysam.stats(str(bam))
                out_stats.write_text(txt)
                results.append(("stats(pysam)", 0))
            else:
                cmd = ["samtools", "stats", str(bam)]
                with open(out_stats, "w") as fh:
                    cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
                results.append(("stats(samtools)", cp.returncode))
                if cp.returncode != 0:
                    raise RuntimeError(cp.stderr.decode(errors="replace"))

        def run_flagstat():
            if not flagstats:
                return
            if HAVE_PYSAM and hasattr(pysam, "flagstat"):
                txt = pysam.flagstat(str(bam))
                out_flag.write_text(txt)
                results.append(("flagstat(pysam)", 0))
            else:
                cmd = ["samtools", "flagstat", str(bam)]
                with open(out_flag, "w") as fh:
                    cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
                results.append(("flagstat(samtools)", cp.returncode))
                if cp.returncode != 0:
                    raise RuntimeError(cp.stderr.decode(errors="replace"))

        def run_idxstats():
            if not idxstats:
                return
            if HAVE_PYSAM and hasattr(pysam, "idxstats"):
                txt = pysam.idxstats(str(bam))
                out_idx.write_text(txt)
                results.append(("idxstats(pysam)", 0))
            else:
                cmd = ["samtools", "idxstats", str(bam)]
                with open(out_idx, "w") as fh:
                    cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
                results.append(("idxstats(samtools)", cp.returncode))
                if cp.returncode != 0:
                    raise RuntimeError(cp.stderr.decode(errors="replace"))

        # Execute tasks (serial per file; parallelized across files)
        run_stats()
        run_flagstat()
        run_idxstats()
        return bam, results

    # Parallel across BAMs
    max_workers = int(threads) if threads and int(threads) > 0 else 1
    futures = []
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        for b in bam_files:
            futures.append(ex.submit(_run_one, b))

        for fut in as_completed(futures):
            try:
                bam, res = fut.result()
                summary = ", ".join(f"{name}:{rc}" for name, rc in res) or "no-op"
                print(f"[qc] {bam.name}: {summary}")
            except Exception as e:
                print(f"[error] QC failed: {e}")

    # Placeholders to keep your signature stable
    if modality not in {"conversion", "direct"}:
        print(f"[warn] Unknown modality '{modality}', continuing.")

    print("QC processing completed.")
|
|
283
|
+
|
|
284
|
+
def concatenate_fastqs_to_bam(
    fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
    output_bam: Union[str, Path],
    barcode_tag: str = "BC",
    barcode_map: Optional[Dict[Union[str, Path], str]] = None,
    add_read_group: bool = True,
    rg_sample_field: Optional[str] = None,
    progress: bool = True,
    auto_pair: bool = True,
) -> Dict[str, Any]:
    """
    Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.

    Parameters
    ----------
    fastq_files : list[Path|str] or list[(Path|str, Path|str)]
        Either explicit pairs (R1,R2) or a flat list of FASTQs (auto-paired if auto_pair=True).
    output_bam : Path|str
        Output BAM path (parent directory will be created).
    barcode_tag : str
        SAM tag used to store barcode on each read (default 'BC').
    barcode_map : dict or None
        Optional mapping {path: barcode} to override automatic filename-based barcode extraction.
    add_read_group : bool
        If True, add @RG header lines (ID = barcode) and set each read's RG tag.
    rg_sample_field : str or None
        If set, include SM=<value> in @RG.
    progress : bool
        Show tqdm progress bars.
    auto_pair : bool
        Auto-pair R1/R2 based on filename patterns if given a flat list.

    Returns
    -------
    dict
        {'total_reads','per_file','paired_pairs_written','singletons_written','barcodes'}
    """

    # ---------- helpers (Pathlib-only) ----------
    def _strip_fastq_ext(p: Path) -> str:
        """
        Remove common FASTQ multi-suffixes; return stem-like name.
        """
        name = p.name
        lowers = name.lower()
        for ext in (".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq", ".fq"):
            if lowers.endswith(ext):
                return name[: -len(ext)]
        return p.stem  # fallback: remove last suffix only

    def _extract_barcode_from_filename(p: Path) -> str:
        # Barcode = last underscore-delimited token of the extension-stripped name.
        stem = _strip_fastq_ext(p)
        if "_" in stem:
            token = stem.split("_")[-1]
            if token:
                return token
        return stem

    def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
        # return (prefix, readnum) if matches; else (None, None)
        patterns = [
            r"(?i)(.*?)[._-]r?([12])$",  # prefix_R1 / prefix.r2 / prefix-1
            r"(?i)(.*?)[._-]read[_-]?([12])$",  # prefix_read1
        ]
        for pat in patterns:
            m = re.match(pat, stem)
            if m:
                return m.group(1), int(m.group(2))
        return None, None

    def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
        # Group files sharing a prefix into (R1, R2) pairs; everything else is a leftover.
        pref_map: Dict[str, Dict[int, Path]] = {}
        unpaired: List[Path] = []
        for pth in paths:
            stem = _strip_fastq_ext(pth)
            pref, num = _classify_read_token(stem)
            if pref is None:
                unpaired.append(pth)
            else:
                entry = pref_map.setdefault(pref, {})
                entry[num] = pth
        pairs: List[Tuple[Path, Path]] = []
        leftovers: List[Path] = []
        for d in pref_map.values():
            if 1 in d and 2 in d:
                pairs.append((d[1], d[2]))
            else:
                leftovers.extend(d.values())
        leftovers.extend(unpaired)
        return pairs, leftovers

    def _fastq_iter(p: Path):
        # pysam.FastxFile handles compressed extensions transparently
        with pysam.FastxFile(str(p)) as fx:
            for rec in fx:
                yield rec  # rec.name, rec.sequence, rec.quality

    def _clean(n: Optional[str]) -> Optional[str]:
        # Strip trailing "/1", "/2", or " 1"/" 2" mate markers from read names.
        # BUGFIX/perf: hoisted out of the per-record pairing loop below, where it
        # was re-defined for every read pair.
        if n is None:
            return None
        return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)

    def _make_unaligned_segment(
        name: str,
        seq: str,
        qual: Optional[str],
        bc: str,
        read1: bool,
        read2: bool,
    ) -> pysam.AlignedSegment:
        # Build a fully-unmapped record carrying the barcode (and RG) tags.
        a = pysam.AlignedSegment()
        a.query_name = name
        a.query_sequence = seq
        if qual is not None:
            a.query_qualities = pysam.qualitystring_to_array(qual)
        a.is_unmapped = True
        a.is_paired = read1 or read2
        a.is_read1 = read1
        a.is_read2 = read2
        a.mate_is_unmapped = a.is_paired
        a.reference_id = -1
        a.reference_start = -1
        a.next_reference_id = -1
        a.next_reference_start = -1
        a.template_length = 0
        a.set_tag(barcode_tag, str(bc), value_type="Z")
        if add_read_group:
            a.set_tag("RG", str(bc), value_type="Z")
        return a

    # ---------- normalize inputs to Path ----------
    def _to_path_pair(x) -> Tuple[Path, Path]:
        a, b = x
        return Path(a), Path(b)

    explicit_pairs: List[Tuple[Path, Path]] = []
    singles: List[Path] = []

    if not isinstance(fastq_files, (list, tuple)):
        raise ValueError("fastq_files must be a list of paths or list of (R1,R2) tuples.")

    if all(isinstance(x, (list, tuple)) and len(x) == 2 for x in fastq_files):
        explicit_pairs = [_to_path_pair(x) for x in fastq_files]
    else:
        flat_paths = [Path(x) for x in fastq_files if x is not None]
        if auto_pair:
            explicit_pairs, leftovers = _pair_by_filename(flat_paths)
            singles = leftovers
        else:
            singles = flat_paths

    output_bam = Path(output_bam)
    output_bam.parent.mkdir(parents=True, exist_ok=True)

    # ---------- barcodes ----------
    # Explicit barcode_map entries win; otherwise derive from the filename.
    barcode_map = {Path(k): v for k, v in (barcode_map or {}).items()}
    per_path_barcode: Dict[Path, str] = {}
    barcodes_in_order: List[str] = []

    for r1, r2 in explicit_pairs:
        bc = barcode_map.get(r1) or barcode_map.get(r2) or _extract_barcode_from_filename(r1)
        per_path_barcode[r1] = bc
        per_path_barcode[r2] = bc
        if bc not in barcodes_in_order:
            barcodes_in_order.append(bc)
    for pth in singles:
        bc = barcode_map.get(pth) or _extract_barcode_from_filename(pth)
        per_path_barcode[pth] = bc
        if bc not in barcodes_in_order:
            barcodes_in_order.append(bc)

    # ---------- BAM header ----------
    header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
    if add_read_group:
        header["RG"] = [{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})} for bc in barcodes_in_order]
    header.setdefault("PG", []).append(
        {"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
    )

    # ---------- counters ----------
    per_file_counts: Dict[Path, int] = {}
    total_written = 0
    paired_pairs_written = 0
    singletons_written = 0

    # ---------- write BAM ----------
    with pysam.AlignmentFile(str(output_bam), "wb", header=header) as bam_out:
        # Paired
        it_pairs = explicit_pairs
        if progress and it_pairs:
            it_pairs = tqdm(it_pairs, desc="Paired FASTQ→BAM")
        for r1_path, r2_path in it_pairs:
            if not (r1_path.exists() and r2_path.exists()):
                raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
            bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"

            it1 = _fastq_iter(r1_path)
            it2 = _fastq_iter(r2_path)

            # zip_longest keeps writing when one file has more reads than the other;
            # the odd reads become singletons.
            for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
                name = (
                    _clean(getattr(rec1, "name", None))
                    or _clean(getattr(rec2, "name", None))
                    or getattr(rec1, "name", None)
                    or getattr(rec2, "name", None)
                )

                if rec1 is not None:
                    a1 = _make_unaligned_segment(name, rec1.sequence, rec1.quality, bc, read1=True, read2=False)
                    bam_out.write(a1)
                    per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
                    total_written += 1
                if rec2 is not None:
                    a2 = _make_unaligned_segment(name, rec2.sequence, rec2.quality, bc, read1=False, read2=True)
                    bam_out.write(a2)
                    per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
                    total_written += 1

                if rec1 is not None and rec2 is not None:
                    paired_pairs_written += 1
                else:
                    if rec1 is not None:
                        singletons_written += 1
                    if rec2 is not None:
                        singletons_written += 1

        # Singles
        it_singles = singles
        if progress and it_singles:
            it_singles = tqdm(it_singles, desc="Single FASTQ→BAM")
        for pth in it_singles:
            if not pth.exists():
                raise FileNotFoundError(pth)
            bc = per_path_barcode.get(pth, "barcode")
            for rec in _fastq_iter(pth):
                a = _make_unaligned_segment(rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False)
                bam_out.write(a)
                per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
                total_written += 1
                singletons_written += 1

    return {
        "total_reads": total_written,
        "per_file": {str(k): v for k, v in per_file_counts.items()},
        "paired_pairs_written": paired_pairs_written,
        "singletons_written": singletons_written,
        "barcodes": barcodes_in_order,
    }
|
|
532
|
+
|
|
533
|
+
def count_aligned_reads(bam_file):
    """
    Counts the number of aligned reads in a bam file that map to each reference record.

    Parameters:
        bam_file (str): A string representing the path to an aligned BAM file.

    Returns:
        aligned_reads_count (int): The total number of reads aligned in the BAM.
        unaligned_reads_count (int): The total number of reads not aligned in the BAM.
        record_counts (dict): A dictionary keyed by reference record name that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.

    """
    print('{0}: Counting aligned reads in BAM > {1}'.format(time_string(), bam_file))
    aligned_reads_count = 0
    unaligned_reads_count = 0
    # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
    record_counts = defaultdict(int)

    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
        # NOTE(review): bam.mapped/bam.unmapped require a BAM index to be present;
        # the total is used only to size the progress bar below.
        total_reads = bam.mapped + bam.unmapped
        # Iterate over reads to get the total mapped read counts and the reads that map to each reference
        for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
            if read.is_unmapped:
                unaligned_reads_count += 1
            else:
                aligned_reads_count += 1
                record_counts[read.reference_name] += 1  # Automatically increments if key exists, adds if not

    # reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
    # (safe when no reads aligned: record_counts is empty, so this loop does not run)
    for reference in record_counts:
        proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
        record_counts[reference] = (record_counts[reference], proportion_mapped_reads_in_record)

    return aligned_reads_count, unaligned_reads_count, dict(record_counts)
|
|
568
|
+
|
|
569
|
+
def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, threads):
    """
    Demultiplex an aligned, sorted BAM with dorado and return the renamed split BAMs.

    Parameters:
        aligned_sorted_BAM (Path): Path of the aligned_sorted BAM file (suffix replaced by bam_suffix).
        split_dir (Path): Directory to write the demultiplexed BAMs into.
        bam_suffix (str): Suffix of the BAM files (e.g. ".bam").
        barcode_kit (str): Name of the barcoding kit.
        barcode_both_ends (bool): Whether to require both ends to be barcoded.
        trim (bool): Whether to trim off barcodes after demultiplexing.
        threads (int): Number of threads to use; falsy lets dorado choose.

    Returns:
        bam_files (list): List of split (and prefixed) BAM file Paths.

    Raises:
        subprocess.CalledProcessError: If the dorado command fails.
        FileNotFoundError: If no BAM files are produced in split_dir.
    """
    input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
    command = ["dorado", "demux", "--kit-name", barcode_kit]
    if barcode_both_ends:
        command.append("--barcode-both-ends")
    if not trim:
        command.append("--no-trim")
    if threads:
        command += ["-t", str(threads)]
    command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
    command.append(str(input_bam))

    print(f"Running: {' '.join(command)}")
    # check=True: fail loudly if dorado errors out instead of continuing with stale files.
    subprocess.run(command, check=True)

    bam_files = sorted(
        p for p in split_dir.glob(f"*{bam_suffix}")
        if p.is_file() and p.suffix == bam_suffix
    )

    if not bam_files:
        raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")

    # ---- Rename outputs with a prefix marking double-end ("de") vs single-end ("se") barcoding ----
    renamed_bams = []
    prefix = "de" if barcode_both_ends else "se"

    for bam in bam_files:
        bam = Path(bam)
        bai = bam.with_suffix(bam_suffix + ".bai")  # dorado's --sort-bam produces .bam.bai

        new_bam = bam.with_name(f"{prefix}_{bam.name}")
        bam.rename(new_bam)

        # Keep the index name in sync with the renamed BAM, if one exists.
        if bai.exists():
            bai.rename(new_bam.with_suffix(bam_suffix + ".bai"))

        renamed_bams.append(new_bam)

    return renamed_bams
|
|
633
|
+
|
|
634
|
+
def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
    """
    Efficiently extracts base identities from mapped reads at reference coordinates.

    Parameters:
        bam_file (str): Path to the BAM file.
        chromosome (str): Name of the reference chromosome to fetch reads from.
        positions (iterable): Reference positions to extract (0-based).
        max_reference_length (int): Maximum reference length, used to size per-read arrays.
        sequence (str): Reference sequence of the record fasta (used for mismatch calls).

    Returns:
        dict: read name -> base identity array for forward mapped reads.
        dict: read name -> base identity array for reverse mapped reads.
        dict: read name -> {ref_base: Counter(read_base -> count)} mismatch counts.
        dict: read name -> dominant mismatch trend ("C->T", "G->A", "equal", or "none").
    """
    positions = set(positions)
    # Per-read arrays default-filled with 'N'; observed bases overwrite at their reference index.
    fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
    rev_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
    mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))

    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
        ref_seq = sequence.upper()
        for read in bam.fetch(chromosome):
            if not read.is_mapped:
                continue  # Skip unmapped reads

            read_name = read.query_name
            query_sequence = read.query_sequence
            base_dict = rev_base_identities if read.is_reverse else fwd_base_identities

            # matches_only=True skips indels, so both coordinates are always valid ints.
            for read_position, reference_position in read.get_aligned_pairs(matches_only=True):
                if reference_position in positions:
                    read_base = query_sequence[read_position]
                    ref_base = ref_seq[reference_position]

                    base_dict[read_name][reference_position] = read_base

                    # Track mismatches (excluding Ns)
                    if read_base != ref_base and read_base != 'N' and ref_base != 'N':
                        mismatch_counts_per_read[read_name][ref_base][read_base] += 1

    # Determine C->T vs G->A dominance per read
    mismatch_trend_per_read = {}
    for read_name, ref_dict in mismatch_counts_per_read.items():
        c_to_t = ref_dict.get("C", {}).get("T", 0)
        g_to_a = ref_dict.get("G", {}).get("A", 0)

        # Counts are ints, so abs(...) < 0.01 is an equality test; require nonzero counts.
        if abs(c_to_t - g_to_a) < 0.01 and c_to_t > 0:
            mismatch_trend_per_read[read_name] = "equal"
        elif c_to_t > g_to_a:
            mismatch_trend_per_read[read_name] = "C->T"
        elif g_to_a > c_to_t:
            mismatch_trend_per_read[read_name] = "G->A"
        else:
            mismatch_trend_per_read[read_name] = "none"

    return dict(fwd_base_identities), dict(rev_base_identities), dict(mismatch_counts_per_read), mismatch_trend_per_read
|
|
698
|
+
|
|
699
|
+
def extract_read_features_from_bam(bam_file_path):
    """
    Make a dict of mapped reads from a BAM, each pointing to a list of read metrics:
    read length, median Q-score, reference length, mapped length, mapping quality.

    Parameters:
        bam_file_path (str): Path to the input BAM file.

    Returns:
        read_metrics (dict): read name -> [query_length, median_read_quality,
            reference_length, mapped_length, mapping_quality]
    """
    print(f'Extracting read features from BAM: {bam_file_path}')
    with pysam.AlignmentFile(bam_file_path, "rb") as bam_file:
        read_metrics = {}
        # Precompute reference name -> length once, instead of an O(#refs) .index() per read.
        reference_length_by_name = dict(zip(bam_file.references, bam_file.lengths))
        for read in bam_file:
            # Skip unmapped reads
            if read.is_unmapped:
                continue
            # query_qualities is None when the BAM stores no base qualities ('*').
            read_quality = read.query_qualities
            median_read_quality = np.median(read_quality) if read_quality is not None else np.nan
            reference_length = reference_length_by_name[read.reference_name]
            # Sum of aligned block lengths = read bases aligned to the reference.
            mapped_length = sum(end - start for start, end in read.get_blocks())
            mapping_quality = read.mapping_quality  # Phred-scaled MAPQ
            read_metrics[read.query_name] = [read.query_length, median_read_quality, reference_length, mapped_length, mapping_quality]

    return read_metrics
|
|
728
|
+
|
|
729
|
+
def extract_readnames_from_bam(aligned_BAM):
    """
    Takes a BAM and writes out a txt file containing read names from the BAM.

    The output file is named "<bam path without .bam>_read_names.txt".

    Parameters:
        aligned_BAM (str): Path to an input aligned_BAM to extract read names from.

    Returns:
        None

    Raises:
        subprocess.CalledProcessError: If samtools or cut exits non-zero.
    """
    import subprocess
    # Make a text file of reads for the BAM
    txt_output = aligned_BAM.split('.bam')[0] + '_read_names.txt'
    # Pipeline: samtools view <bam> | cut -f1 > txt_output (SAM column 1 is the read name).
    samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
    with open(txt_output, "w") as output_file:
        cut_process = subprocess.Popen(["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file)
        # Close our copy so samtools receives SIGPIPE if cut exits early.
        samtools_view.stdout.close()
        cut_process.wait()
        samtools_view.wait()
    # Surface pipeline failures instead of silently leaving a truncated file behind.
    if samtools_view.returncode != 0:
        raise subprocess.CalledProcessError(samtools_view.returncode, "samtools view")
    if cut_process.returncode != 0:
        raise subprocess.CalledProcessError(cut_process.returncode, "cut")
|
|
749
|
+
|
|
750
|
+
def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
    """
    Separates an input BAM file on the BC SAM tag values.

    Parameters:
        input_bam (Path): Path of the BAM file to split.
        output_prefix (str): A prefix to prepend to each output BAM name.
        bam_suffix (str): A suffix to add to the bam file.
        split_dir (Path): Directory to split BAMs into.

    Returns:
        None
        Writes out one BAM per observed BC tag value.
    """
    bam_base_minus_suffix = input_bam.stem

    # Open the input BAM file for reading
    with pysam.AlignmentFile(str(input_bam), "rb") as bam:
        # One output AlignmentFile per barcode, opened lazily on first sight.
        output_files = {}
        try:
            for read in bam:
                try:
                    # Get the barcode tag value
                    bc_tag = read.get_tag("BC", with_value_type=True)[0]
                except KeyError:
                    # Reads without a BC tag are reported and skipped (best-effort demux).
                    print(f"BC tag not present for read: {read.query_name}")
                    continue
                if bc_tag not in output_files:
                    output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
                    output_files[bc_tag] = pysam.AlignmentFile(str(output_path), "wb", header=bam.header)
                # Write the read to the corresponding output BAM file
                output_files[bc_tag].write(read)
        finally:
            # Always close output BAMs, even if iteration fails midway.
            for output_file in output_files.values():
                output_file.close()
|
|
788
|
+
|
|
789
|
+
def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
    """
    A wrapper function for splitting BAMs on barcode and indexing them.

    Parameters:
        aligned_sorted_BAM (str | Path): File path stem of the aligned_sorted BAM file;
            bam_suffix is appended to form the actual file name.
        split_dir (Path): Directory to split the BAMs into.
        bam_suffix (str): A suffix to add to the bam file (e.g. ".bam").

    Returns:
        bam_files (list): Paths (str) of the split BAM files that were indexed.
    """
    # separate_bam_by_bc requires a Path (it uses .stem / .name), so coerce here
    # instead of passing a plain string concatenation.
    aligned_sorted_output = Path(f"{aligned_sorted_BAM}{bam_suffix}")
    file_prefix = date_string()
    separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
    # Make a BAM index file for each split BAM in the directory (skip existing .bai files).
    bam_files = [str(p) for p in split_dir.glob(f"*{bam_suffix}") if '.bai' not in str(p)]
    for input_file in bam_files:
        pysam.index(input_file)

    return bam_files
|