smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py

@@ -0,0 +1,259 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Dict, List, Any, Tuple, Union, Optional
+import re
+from itertools import zip_longest
+
+import pysam
+from tqdm import tqdm
+
+
+def concatenate_fastqs_to_bam(
+    fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
+    output_bam: Union[str, Path],
+    barcode_tag: str = "BC",
+    barcode_map: Optional[Dict[Union[str, Path], str]] = None,
+    add_read_group: bool = True,
+    rg_sample_field: Optional[str] = None,
+    progress: bool = True,
+    auto_pair: bool = True,
+) -> Dict[str, Any]:
+    """
+    Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.
+
+    Parameters
+    ----------
+    fastq_files : list[Path|str] or list[(Path|str, Path|str)]
+        Either explicit pairs (R1,R2) or a flat list of FASTQs (auto-paired if auto_pair=True).
+    output_bam : Path|str
+        Output BAM path (parent directory will be created).
+    barcode_tag : str
+        SAM tag used to store barcode on each read (default 'BC').
+    barcode_map : dict or None
+        Optional mapping {path: barcode} to override automatic filename-based barcode extraction.
+    add_read_group : bool
+        If True, add @RG header lines (ID = barcode) and set each read's RG tag.
+    rg_sample_field : str or None
+        If set, include SM=<value> in @RG.
+    progress : bool
+        Show tqdm progress bars.
+    auto_pair : bool
+        Auto-pair R1/R2 based on filename patterns if given a flat list.
+
+    Returns
+    -------
+    dict
+        {'total_reads','per_file','paired_pairs_written','singletons_written','barcodes'}
+    """
+
+    # ---------- helpers (Pathlib-only) ----------
+    def _strip_fastq_ext(p: Path) -> str:
+        """
+        Remove common FASTQ multi-suffixes; return stem-like name.
+        """
+        name = p.name
+        lowers = name.lower()
+        for ext in (".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq", ".fq"):
+            if lowers.endswith(ext):
+                return name[: -len(ext)]
+        return p.stem  # fallback: remove last suffix only
+
+    def _extract_barcode_from_filename(p: Path) -> str:
+        stem = _strip_fastq_ext(p)
+        if "_" in stem:
+            token = stem.split("_")[-1]
+            if token:
+                return token
+        return stem
+
+    def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
+        # return (prefix, readnum) if matches; else (None, None)
+        patterns = [
+            r"(?i)(.*?)[._-]r?([12])$",  # prefix_R1 / prefix.r2 / prefix-1
+            r"(?i)(.*?)[._-]read[_-]?([12])$",  # prefix_read1
+        ]
+        for pat in patterns:
+            m = re.match(pat, stem)
+            if m:
+                return m.group(1), int(m.group(2))
+        return None, None
+
+    def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
+        pref_map: Dict[str, Dict[int, Path]] = {}
+        unpaired: List[Path] = []
+        for pth in paths:
+            stem = _strip_fastq_ext(pth)
+            pref, num = _classify_read_token(stem)
+            if pref is None:
+                unpaired.append(pth)
+            else:
+                entry = pref_map.setdefault(pref, {})
+                entry[num] = pth
+        pairs: List[Tuple[Path, Path]] = []
+        leftovers: List[Path] = []
+        for d in pref_map.values():
+            if 1 in d and 2 in d:
+                pairs.append((d[1], d[2]))
+            else:
+                leftovers.extend(d.values())
+        leftovers.extend(unpaired)
+        return pairs, leftovers
+
+    def _fastq_iter(p: Path):
+        # pysam.FastxFile handles compressed extensions transparently
+        with pysam.FastxFile(str(p)) as fx:
+            for rec in fx:
+                yield rec  # rec.name, rec.sequence, rec.quality
+
+    def _make_unaligned_segment(
+        name: str,
+        seq: str,
+        qual: Optional[str],
+        bc: str,
+        read1: bool,
+        read2: bool,
+    ) -> pysam.AlignedSegment:
+        a = pysam.AlignedSegment()
+        a.query_name = name
+        a.query_sequence = seq
+        if qual is not None:
+            a.query_qualities = pysam.qualitystring_to_array(qual)
+        a.is_unmapped = True
+        a.is_paired = read1 or read2
+        a.is_read1 = read1
+        a.is_read2 = read2
+        a.mate_is_unmapped = a.is_paired
+        a.reference_id = -1
+        a.reference_start = -1
+        a.next_reference_id = -1
+        a.next_reference_start = -1
+        a.template_length = 0
+        a.set_tag(barcode_tag, str(bc), value_type="Z")
+        if add_read_group:
+            a.set_tag("RG", str(bc), value_type="Z")
+        return a
+
+    # ---------- normalize inputs to Path ----------
+    def _to_path_pair(x) -> Tuple[Path, Path]:
+        a, b = x
+        return Path(a), Path(b)
+
+    explicit_pairs: List[Tuple[Path, Path]] = []
+    singles: List[Path] = []
+
+    if not isinstance(fastq_files, (list, tuple)):
+        raise ValueError("fastq_files must be a list of paths or list of (R1,R2) tuples.")
+
+    if all(isinstance(x, (list, tuple)) and len(x) == 2 for x in fastq_files):
+        explicit_pairs = [_to_path_pair(x) for x in fastq_files]
+    else:
+        flat_paths = [Path(x) for x in fastq_files if x is not None]
+        if auto_pair:
+            explicit_pairs, leftovers = _pair_by_filename(flat_paths)
+            singles = leftovers
+        else:
+            singles = flat_paths
+
+    output_bam = Path(output_bam)
+    output_bam.parent.mkdir(parents=True, exist_ok=True)
+
+    # ---------- barcodes ----------
+    barcode_map = {Path(k): v for k, v in (barcode_map or {}).items()}
+    per_path_barcode: Dict[Path, str] = {}
+    barcodes_in_order: List[str] = []
+
+    for r1, r2 in explicit_pairs:
+        bc = barcode_map.get(r1) or barcode_map.get(r2) or _extract_barcode_from_filename(r1)
+        per_path_barcode[r1] = bc
+        per_path_barcode[r2] = bc
+        if bc not in barcodes_in_order:
+            barcodes_in_order.append(bc)
+    for pth in singles:
+        bc = barcode_map.get(pth) or _extract_barcode_from_filename(pth)
+        per_path_barcode[pth] = bc
+        if bc not in barcodes_in_order:
+            barcodes_in_order.append(bc)
+
+    # ---------- BAM header ----------
+    header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
+    if add_read_group:
+        header["RG"] = [{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})} for bc in barcodes_in_order]
+    header.setdefault("PG", []).append(
+        {"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
+    )
+
+    # ---------- counters ----------
+    per_file_counts: Dict[Path, int] = {}
+    total_written = 0
+    paired_pairs_written = 0
+    singletons_written = 0
+
+    # ---------- write BAM ----------
+    with pysam.AlignmentFile(str(output_bam), "wb", header=header) as bam_out:
+        # Paired
+        it_pairs = explicit_pairs
+        if progress and it_pairs:
+            it_pairs = tqdm(it_pairs, desc="Paired FASTQ→BAM")
+        for r1_path, r2_path in it_pairs:
+            if not (r1_path.exists() and r2_path.exists()):
+                raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
+            bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
+
+            it1 = _fastq_iter(r1_path)
+            it2 = _fastq_iter(r2_path)
+
+            for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
+                def _clean(n: Optional[str]) -> Optional[str]:
+                    if n is None:
+                        return None
+                    return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
+
+                name = (
+                    _clean(getattr(rec1, "name", None))
+                    or _clean(getattr(rec2, "name", None))
+                    or getattr(rec1, "name", None)
+                    or getattr(rec2, "name", None)
+                )
+
+                if rec1 is not None:
+                    a1 = _make_unaligned_segment(name, rec1.sequence, rec1.quality, bc, read1=True, read2=False)
+                    bam_out.write(a1)
+                    per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
+                    total_written += 1
+                if rec2 is not None:
+                    a2 = _make_unaligned_segment(name, rec2.sequence, rec2.quality, bc, read1=False, read2=True)
+                    bam_out.write(a2)
+                    per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
+                    total_written += 1
+
+                if rec1 is not None and rec2 is not None:
+                    paired_pairs_written += 1
+                else:
+                    if rec1 is not None:
+                        singletons_written += 1
+                    if rec2 is not None:
+                        singletons_written += 1
+
+        # Singles
+        it_singles = singles
+        if progress and it_singles:
+            it_singles = tqdm(it_singles, desc="Single FASTQ→BAM")
+        for pth in it_singles:
+            if not pth.exists():
+                raise FileNotFoundError(pth)
+            bc = per_path_barcode.get(pth, "barcode")
+            for rec in _fastq_iter(pth):
+                a = _make_unaligned_segment(rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False)
+                bam_out.write(a)
+                per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
+                total_written += 1
+                singletons_written += 1
+
+    return {
+        "total_reads": total_written,
+        "per_file": {str(k): v for k, v in per_file_counts.items()},
+        "paired_pairs_written": paired_pairs_written,
+        "singletons_written": singletons_written,
+        "barcodes": barcodes_in_order,
+    }
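Below is a brief usage sketch for the new `concatenate_fastqs_to_bam` helper. The import path assumes the archived module is importable from its new location in the wheel, and the file names and barcode shown are hypothetical:

```python
from pathlib import Path

# Hypothetical import path, mirroring the file's new location in the wheel
from smftools.informatics.archived.helpers.archived.concatenate_fastqs_to_bam import (
    concatenate_fastqs_to_bam,
)

# A flat list is auto-paired on _R1/_R2 filename tokens; explicit (R1, R2) tuples also work.
stats = concatenate_fastqs_to_bam(
    fastq_files=[Path("run/sample1_R1.fastq.gz"), Path("run/sample1_R2.fastq.gz")],
    output_bam="run/unaligned.bam",
    barcode_map={Path("run/sample1_R1.fastq.gz"): "barcode01"},  # optional override
    rg_sample_field="sample1",
)
print(stats["total_reads"], stats["paired_pairs_written"], stats["barcodes"])
```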
smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py

@@ -14,7 +14,7 @@ def count_aligned_reads(bam_file):
         record_counts (dict): A dictionary keyed by reference record instance that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
 
     """
-    from
+    from ... import readwrite
     import pysam
     from tqdm import tqdm
     from collections import defaultdict
@@ -25,7 +25,7 @@ def count_aligned_reads(bam_file):
     # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
     record_counts = defaultdict(int)
 
-    with pysam.AlignmentFile(bam_file, "rb") as bam:
+    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
         total_reads = bam.mapped + bam.unmapped
         # Iterate over reads to get the total mapped read counts and the reads that map to each reference
         for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py

@@ -18,13 +18,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         bam_files (list): List of split BAM file path strings
     Splits an input BAM file on barcode value and makes a BAM index file.
     """
-    from
+    from ...readwrite import make_dirs
     import os
     import subprocess
     import glob
-    from .make_dirs import make_dirs
 
-    input_bam = aligned_sorted_BAM
+    input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
     command = ["dorado", "demux", "--kit-name", barcode_kit]
     if barcode_both_ends:
         command.append("--barcode-both-ends")
@@ -34,17 +33,16 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         command += ["-t", str(threads)]
     else:
         pass
-    command += ["--emit-summary", "--sort-bam", "--output-dir", split_dir]
-    command.append(input_bam)
+    command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
+    command.append(str(input_bam))
     command_string = ' '.join(command)
     print(f"Running: {command_string}")
     subprocess.run(command)
 
-
-
-
-
-    bam_files.sort()
+    bam_files = sorted(
+        p for p in split_dir.glob(f"*{bam_suffix}")
+        if p.is_file() and p.suffix == bam_suffix and "unclassified" not in p.name
+    )
 
     if not bam_files:
         raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
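The demux rewrite replaces the old in-place `bam_files.sort()` with a single `sorted()` over a `Path.glob`, which discovers and orders the split BAMs in one expression while skipping unclassified reads. A standalone sketch of the same pattern, with an illustrative directory and suffix:

```python
from pathlib import Path

split_dir = Path("demux_output")  # hypothetical dorado demux output directory
bam_suffix = ".bam"

# Keep only regular files whose final suffix matches, and drop
# dorado's unclassified-reads BAM; sorted() gives a deterministic order.
bam_files = sorted(
    p for p in split_dir.glob(f"*{bam_suffix}")
    if p.is_file() and p.suffix == bam_suffix and "unclassified" not in p.name
)
```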
smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py

@@ -27,7 +27,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
     mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
 
     #print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
-    with pysam.AlignmentFile(bam_file, "rb") as bam:
+    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
         total_reads = bam.mapped
         ref_seq = sequence.upper()
         for read in bam.fetch(chromosome):
smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py

@@ -23,9 +23,9 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
    import glob
    import zipfile
 
-    os.chdir(mod_tsv_dir)
    filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    bam_files = glob.glob(
+    bam_files = glob.glob(split_dir / f"*{bam_suffix}")
+    print(f"Running modkit extract for the following bam files: {bam_files}")
 
    if threads:
        threads = str(threads)
@@ -35,20 +35,20 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
    for input_file in bam_files:
        print(input_file)
        # Extract the file basename
-        file_name =
+        file_name = input_file.name
        if skip_unclassified and "unclassified" in file_name:
            print("Skipping modkit extract on unclassified reads")
        else:
            # Construct the output TSV file path
-
-
-            if
-            print(f"{
+            output_tsv = mod_tsv_dir / file_name.stem + "_extract.tsv"
+            output_tsv_gz = output_tsv + '.gz'
+            if output_tsv_gz.exists():
+                print(f"{output_tsv_gz} already exists, skipping modkit extract")
            else:
                print(f"Extracting modification data from {input_file}")
                if modkit_summary:
                    # Run modkit summary
-                    subprocess.run(["modkit", "summary", input_file])
+                    subprocess.run(["modkit", "summary", str(input_file)])
                else:
                    pass
                # Run modkit extract
@@ -61,7 +61,7 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                        "--mod-thresholds", f"a:{m6A_threshold}",
                        "--mod-thresholds", f"h:{hm5C_threshold}",
                        "-t", threads,
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                    ]
                else:
                    extract_command = [
@@ -71,13 +71,15 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                        "--mod-thresholds", f"m:{m5C_threshold}",
                        "--mod-thresholds", f"a:{m6A_threshold}",
                        "--mod-thresholds", f"h:{hm5C_threshold}",
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                    ]
                subprocess.run(extract_command)
                # Zip the output TSV
                print(f'zipping {output_tsv}')
                if threads:
-                    zip_command = ["pigz", "-f", "-p", threads, output_tsv]
+                    zip_command = ["pigz", "-f", "-p", threads, str(output_tsv)]
                else:
-                    zip_command = ["pigz", "-f", output_tsv]
-                subprocess.run(zip_command, check=True)
+                    zip_command = ["pigz", "-f", str(output_tsv)]
+                subprocess.run(zip_command, check=True)
+
+    return
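For reference, a minimal sketch of the per-BAM output naming this loop aims at, assuming `Path` inputs rather than the `str`/`Path` mix in the diff above; the parentheses make the string concatenation happen before the `Path` join, and `with_suffix` builds the gzipped name:

```python
from pathlib import Path

mod_tsv_dir = Path("mod_tsvs")           # hypothetical TSV output directory
input_file = Path("demux/sample1.bam")   # one demuxed BAM from split_dir

# <mod_tsv_dir>/<bam stem>_extract.tsv, and the file pigz -f will leave behind
output_tsv = mod_tsv_dir / (input_file.stem + "_extract.tsv")
output_tsv_gz = output_tsv.with_suffix(output_tsv.suffix + ".gz")

if output_tsv_gz.exists():
    print(f"{output_tsv_gz} already exists, skipping modkit extract")
```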
smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py

@@ -67,6 +67,8 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
         None (Writes the converted FASTA file).
     """
     unconverted = modification_types[0]
+    input_fasta = str(input_fasta)
+    output_fasta = str(output_fasta)
 
     # Detect if input is gzipped
     open_func = gzip.open if input_fasta.endswith('.gz') else open
smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py

@@ -8,25 +8,26 @@ def get_chromosome_lengths(fasta):
         fasta (str): Path to the input fasta
     """
     import os
+    from pathlib import Path
     import subprocess
     from .index_fasta import index_fasta
 
     # Make a fasta index file if one isn't already available
-    index_path =
-    if
+    index_path = fasta / '.fai'
+    if index_path.exists():
         print(f'Using existing fasta index file: {index_path}')
     else:
         index_fasta(fasta)
 
-    parent_dir =
-    fasta_basename =
-    chrom_basename =
-    chrom_path =
+    parent_dir = fasta.parent
+    fasta_basename = fasta.name
+    chrom_basename = fasta.stem + '.chrom.sizes'
+    chrom_path = parent_dir / chrom_basename
 
     # Make a chromosome length file
-    if
+    if chrom_path.exists():
         print(f'Using existing chrom length index file: {chrom_path}')
     else:
         with open(chrom_path, 'w') as outfile:
-            command = ["cut", "-f1,2", index_path]
+            command = ["cut", "-f1,2", str(index_path)]
             subprocess.run(command, stdout=outfile)
smftools/informatics/archived/helpers/archived/index_fasta.py

@@ -0,0 +1,24 @@
+import pysam
+from pathlib import Path
+
+def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
+    """
+    Index a FASTA and optionally write <fasta>.chrom.sizes for bigwig/bedgraph work.
+
+    Returns
+    -------
+    Path: path to chrom.sizes file (if requested), else .fai
+    """
+    fasta = Path(fasta)
+    pysam.faidx(str(fasta))  # makes fasta.fai
+
+    if write_chrom_sizes:
+        fai = fasta.with_suffix(fasta.suffix + ".fai")
+        chrom_sizes = fasta.with_suffix(".chrom.sizes")
+        with open(fai) as f_in, open(chrom_sizes, "w") as out:
+            for line in f_in:
+                chrom, size = line.split()[:2]
+                out.write(f"{chrom}\t{size}\n")
+        return chrom_sizes
+
+    return fasta.with_suffix(fasta.suffix + ".fai")
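A quick usage sketch for the new `index_fasta` helper; the FASTA path is illustrative and the import path assumes the module's new archived location:

```python
from smftools.informatics.archived.helpers.archived.index_fasta import index_fasta

# Writes reference.fasta.fai via pysam.faidx, then derives reference.chrom.sizes
chrom_sizes = index_fasta("reference.fasta")
print(chrom_sizes)  # reference.chrom.sizes

# Index only, returning the .fai path instead
fai = index_fasta("reference.fasta", write_chrom_sizes=False)
```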
smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py

@@ -13,10 +13,9 @@ def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
     import os
     import subprocess
 
-    os.chdir(mod_bed_dir)
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
     command = [
-        "modkit", "pileup", aligned_sorted_output, mod_bed_dir,
+        "modkit", "pileup", str(aligned_sorted_output), str(mod_bed_dir),
         "--partition-tag", "BC",
         "--only-tabs",
         "--filter-threshold", f'{filter_threshold}',
smftools/informatics/{helpers → archived/helpers/archived}/modQC.py

@@ -16,9 +16,9 @@ def modQC(aligned_sorted_output, thresholds):
     import subprocess
 
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
+    subprocess.run(["modkit", "sample-probs", str(aligned_sorted_output)])
     command = [
-        "modkit", "summary", aligned_sorted_output,
+        "modkit", "summary", str(aligned_sorted_output),
         "--filter-threshold", f"{filter_threshold}",
         "--mod-thresholds", f"m:{m5C_threshold}",
         "--mod-thresholds", f"a:{m6A_threshold}",
smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py

@@ -1,24 +1,5 @@
 # plot_bed_histograms
 
-def plot_bed_histograms(bed_file, plotting_directory, fasta):
-    """
-    Plots read length, coverage, mapq, read quality stats for each record.
-
-    Parameters:
-        bed_file (str): Path to the bed file to derive metrics from.
-        plot_directory (str): Path to the directory to write out historgrams.
-        fasta (str): Path to FASTA corresponding to bed
-
-    Returns:
-        None
-    """
-    import pandas as pd
-    import matplotlib.pyplot as plt
-    import numpy as np
-    import os
-
-    # plot_bed_histograms.py
-
 def plot_bed_histograms(
     bed_file,
     plotting_directory,
smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py

@@ -15,13 +15,14 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
     Writes out split BAM files.
     """
     import pysam
+    from pathlib import Path
     import os
 
-    bam_base =
-    bam_base_minus_suffix =
+    bam_base = input_bam.name
+    bam_base_minus_suffix = input_bam.stem
 
     # Open the input BAM file for reading
-    with pysam.AlignmentFile(input_bam, "rb") as bam:
+    with pysam.AlignmentFile(str(input_bam), "rb") as bam:
         # Create a dictionary to store output BAM files
         output_files = {}
         # Iterate over each read in the BAM file
@@ -32,8 +33,8 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
                 #bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
                 # Open the output BAM file corresponding to the barcode
                 if bc_tag not in output_files:
-                    output_path =
-                    output_files[bc_tag] = pysam.AlignmentFile(output_path, "wb", header=bam.header)
+                    output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
+                    output_files[bc_tag] = pysam.AlignmentFile(str(output_path), "wb", header=bam.header)
                 # Write the read to the corresponding output BAM file
                 output_files[bc_tag].write(read)
             except KeyError:
smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py

@@ -12,21 +12,21 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
         None
     Splits an input BAM file on barcode value and makes a BAM index file.
     """
-    from
+    from ...readwrite import date_string, make_dirs
+    from pathlib import Path
     import os
-    import
+    import pysam
     import glob
     from .separate_bam_by_bc import separate_bam_by_bc
-    from .make_dirs import make_dirs
 
     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
-    file_prefix =
+    file_prefix = date_string()
     separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
     # Make a BAM index file for the BAMs in that directory
     bam_pattern = '*' + bam_suffix
-    bam_files = glob.glob(
-    bam_files = [bam for bam in bam_files if '.bai' not in bam]
+    bam_files = glob.glob(split_dir / bam_pattern)
+    bam_files = [str(bam) for bam in bam_files if '.bai' not in str(bam)]
     for input_file in bam_files:
-
+        pysam.index(input_file)
 
     return bam_files
smftools/informatics/archived/subsample_fasta_from_bed.py

@@ -0,0 +1,49 @@
+from pathlib import Path
+from pyfaidx import Fasta
+
+def subsample_fasta_from_bed(
+    input_FASTA: str | Path,
+    input_bed: str | Path,
+    output_directory: str | Path,
+    output_FASTA: str | Path
+) -> None:
+    """
+    Take a genome-wide FASTA file and a BED file containing
+    coordinate windows of interest. Outputs a subsampled FASTA.
+    """
+
+    # Normalize everything to Path
+    input_FASTA = Path(input_FASTA)
+    input_bed = Path(input_bed)
+    output_directory = Path(output_directory)
+    output_FASTA = Path(output_FASTA)
+
+    # Ensure output directory exists
+    output_directory.mkdir(parents=True, exist_ok=True)
+
+    output_FASTA_path = output_directory / output_FASTA
+
+    # Load the FASTA file using pyfaidx
+    fasta = Fasta(str(input_FASTA))  # pyfaidx requires string paths
+
+    # Open BED + output FASTA
+    with input_bed.open("r") as bed, output_FASTA_path.open("w") as out_fasta:
+        for line in bed:
+            fields = line.strip().split()
+            chrom = fields[0]
+            start = int(fields[1])  # BED is 0-based
+            end = int(fields[2])  # BED is 0-based and end is exclusive
+            desc = " ".join(fields[3:]) if len(fields) > 3 else ""
+
+            if chrom not in fasta:
+                print(f"Warning: {chrom} not found in FASTA")
+                continue
+
+            # pyfaidx is 1-based indexing internally, but [start:end] works with BED coords
+            sequence = fasta[chrom][start:end].seq
+
+            header = f">{chrom}:{start}-{end}"
+            if desc:
+                header += f" {desc}"
+
+            out_fasta.write(f"{header}\n{sequence}\n")
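Finally, a usage sketch for the archived subsampler, assuming a standard BED with optional name columns; all paths here are hypothetical:

```python
from smftools.informatics.archived.subsample_fasta_from_bed import subsample_fasta_from_bed

# Writes windows/regions.fasta with one record per BED interval,
# headers formatted as >chrom:start-end [optional description]
subsample_fasta_from_bed(
    input_FASTA="genome.fa",
    input_bed="regions.bed",
    output_directory="windows",
    output_FASTA="regions.fasta",
)
```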