split3c 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- split3c/__init__.py +0 -0
- split3c/cli.py +336 -0
- split3c/nssite/__init__.py +0 -0
- split3c/nssite/auxiliary.py +190 -0
- split3c/nssite/bam.py +299 -0
- split3c/nssite/fastq.py +148 -0
- split3c/nssite/main.py +368 -0
- split3c/nssite/processmanager.py +51 -0
- split3c/nssite/split.py +849 -0
- split3c/resite/__init__.py +33 -0
- split3c/resite/frag.py +576 -0
- split3c/resite/header.py +91 -0
- split3c/resite/index.py +236 -0
- split3c/resite/main.py +506 -0
- split3c/resite/pretreatment.py +299 -0
- split3c/resite/read.py +91 -0
- split3c/resite/write_control.py +111 -0
- split3c/resolve/__init__.py +0 -0
- split3c/resolve/bam.py +129 -0
- split3c/resolve/io_utils.py +77 -0
- split3c/resolve/main.py +506 -0
- split3c/resolve/pairs.py +56 -0
- split3c/resolve/parse.py +1218 -0
- split3c-0.0.1.dist-info/METADATA +100 -0
- split3c-0.0.1.dist-info/RECORD +29 -0
- split3c-0.0.1.dist-info/WHEEL +5 -0
- split3c-0.0.1.dist-info/entry_points.txt +5 -0
- split3c-0.0.1.dist-info/licenses/LICENSE +235 -0
- split3c-0.0.1.dist-info/top_level.txt +1 -0
split3c/resolve/parse.py
ADDED
|
@@ -0,0 +1,1218 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from typing import Literal
|
|
3
|
+
|
|
4
|
+
import pysam
|
|
5
|
+
|
|
6
|
+
from .bam import (
|
|
7
|
+
chromsizes_from_header,
|
|
8
|
+
get_bam_header_single,
|
|
9
|
+
get_bam_headers,
|
|
10
|
+
iter_bam_pairs,
|
|
11
|
+
iter_bam_pairs_single,
|
|
12
|
+
)
|
|
13
|
+
from .io_utils import open_text_output
|
|
14
|
+
from .pairs import make_pairs_header, write_pairs_header
|
|
15
|
+
|
|
16
|
+
# Minimal mutable alignment layout.
# Alignments are carried around as plain 9-element lists; the constants below
# name each position so call sites can index by meaning:
# [chrom, pos5, pos3, strand, mapq, anchor_tag, origin, merged_count, sam]
CHROM = 0  # reference name
POS5 = 1  # 1-based reference coordinate of the 5' end
POS3 = 2  # 1-based reference coordinate of the 3' end
STRAND = 3  # "+" or "-"
MAPQ = 4  # mapping quality
ANCHOR = 5  # fragment tag of the slot, e.g. "F1" or "R2"
ORIGIN = 6  # first character of the tag
MERGED = 7  # how many fragments have been merged into this slot
SAM = 8  # raw SAM line, or None when SAM output is disabled

# BAM CIGAR operation codes that advance the reference coordinate.
REF_CONSUMING_OPS = {0, 2, 3, 7, 8}  # M, D, N, =, X
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse_qname_any(qname: str) -> tuple[str, str, str, int, int]:
    """
    Parse a query name in either the enriched or the classic layout.

    Supports:
      - enriched: base:[F1,R2:FT3,RT2]
      - classic : base -> treated as FT1/RT1 with tags F1,R1
        (everything from the first ":SP" onwards is stripped from the base)

    Returns
    -------
    (base_name, tag1, tag2, ft, rt)

    Raises
    ------
    ValueError
        If the name contains ":[" but does not follow the enriched layout,
        including when the closing "]" is missing.

    Examples
    --------
    >>> parse_qname_any("READ")
    ('READ', 'F1', 'R1', 1, 1)
    >>> parse_qname_any("readA:[F1,R2:FT3,RT2]")
    ('readA', 'F1', 'R2', 3, 2)
    >>> parse_qname_any("READ:SP")
    ('READ', 'F1', 'R1', 1, 1)
    >>> parse_qname_any("READ:SP_4")
    ('READ', 'F1', 'R1', 1, 1)
    >>> parse_qname_any("readA:[F1,R2:FT3,RT25")
    Traceback (most recent call last):
        ...
    ValueError: Invalid qname format: 'readA:[F1,R2:FT3,RT25'
    """
    # Fast path for classic names: they carry no fragment information, so
    # they always describe a single F1/R1 pair. split() returns the whole
    # string when ":SP" is absent, so no membership pre-check is needed.
    if ":[" not in qname:
        return qname.split(":SP", 1)[0], "F1", "R1", 1, 1

    try:
        base_name, rest = qname.rsplit(":[", 1)
        # Blindly slicing off the last character would silently eat the last
        # digit of the RT count when the closing bracket is missing.
        if not rest.endswith("]"):
            raise ValueError("missing closing ']'")
        tags_part, counts_part = rest[:-1].split(":FT", 1)
        tag1, tag2 = tags_part.split(",", 1)
        ft_str, rt_str = counts_part.split(",RT", 1)
        return base_name, tag1, tag2, int(ft_str), int(rt_str)
    except ValueError as exc:
        # Failed unpacking and failed int() conversion both raise ValueError.
        raise ValueError(f"Invalid qname format: {qname!r}") from exc
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def parse_qname(qname: str) -> tuple[str, str, str, int, int]:
    """
    Parse a microsplit qname without regex.

    Expected format:
        <base_name>:[<tag1>,<tag2>:FT<ft>,RT<rt>]

    Returns
    -------
    (base_name, tag1, tag2, ft, rt)

    Raises
    ------
    ValueError
        If the name does not follow the expected format, including when the
        closing "]" is missing.

    Examples
    --------
    >>> parse_qname("readA:[F1,R2:FT3,RT2]")
    ('readA', 'F1', 'R2', 3, 2)

    >>> parse_qname("bad_qname")
    Traceback (most recent call last):
        ...
    ValueError: Invalid microsplit qname: 'bad_qname'

    >>> parse_qname("readA:[F1,R2:FT3,RT25")
    Traceback (most recent call last):
        ...
    ValueError: Invalid microsplit qname: 'readA:[F1,R2:FT3,RT25'
    """
    try:
        base_name, rest = qname.rsplit(":[", 1)
        # Blindly slicing off the last character would silently eat the last
        # digit of the RT count when the closing bracket is missing.
        if not rest.endswith("]"):
            raise ValueError("missing closing ']'")
        tags_part, counts_part = rest[:-1].split(":FT", 1)
        tag1, tag2 = tags_part.split(",", 1)
        ft_str, rt_str = counts_part.split(",RT", 1)
        return base_name, tag1, tag2, int(ft_str), int(rt_str)
    except (AttributeError, TypeError, ValueError) as exc:
        # ValueError covers failed unpacking and int(); the other two keep
        # non-string input reported as ValueError, as before.
        raise ValueError(f"Invalid microsplit qname: {qname!r}") from exc
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def cigar_ref_span(read: pysam.AlignedSegment) -> int:
    """
    Sum the lengths of the CIGAR operations that consume the reference.

    A read whose ``cigartuples`` is empty or ``None`` has a span of 0.

    Examples
    --------
    >>> class FakeRead:
    ...     cigartuples = [(0, 10), (1, 3), (2, 2), (7, 5), (8, 1)]
    >>> cigar_ref_span(FakeRead())
    18
    """
    operations = read.cigartuples or []
    return sum(length for op, length in operations if op in REF_CONSUMING_OPS)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def read_to_minimal_alignment(
    read: pysam.AlignedSegment,
    tag: str,
    sam_output: bool = False,
) -> list | None:
    """
    Build the 9-field minimal alignment list for one read.

    Returns
    -------
    ``[chrom, pos5, pos3, strand, mapq, tag, tag[0], 1, sam]`` with 1-based
    coordinates, or None when the read is unmapped or has no reference name.

    Notes
    -----
    The trailing field holds ``read.to_string()`` when `sam_output` is True
    and None otherwise.

    Examples
    --------
    >>> class FakeRead:
    ...     is_unmapped = False
    ...     reference_name = "chr1"
    ...     mapping_quality = 42
    ...     cigartuples = [(0, 10)]
    ...     is_reverse = False
    ...     reference_start = 99
    ...     def to_string(self):
    ...         return "read1\\t0\\tchr1\\t100\\t42\\t10M\\t*\\t0\\t0\\tACGT\\tFFFF"
    >>> read_to_minimal_alignment(FakeRead(), "F1", sam_output=False)
    ['chr1', 100, 109, '+', 42, 'F1', 'F', 1, None]
    >>> read_to_minimal_alignment(FakeRead(), "F1", sam_output=True)[-1].startswith("read1\\t0\\tchr1")
    True
    """
    if read.is_unmapped or read.reference_name is None:
        return None

    # Leftmost / rightmost aligned reference positions, 1-based inclusive.
    leftmost = read.reference_start + 1
    rightmost = leftmost + cigar_ref_span(read) - 1

    # On the reverse strand the 5' end is the rightmost coordinate.
    if read.is_reverse:
        strand, pos5, pos3 = "-", rightmost, leftmost
    else:
        strand, pos5, pos3 = "+", leftmost, rightmost

    sam_line = None
    if sam_output:
        sam_line = read.to_string()

    mapq = int(read.mapping_quality)
    return [read.reference_name, pos5, pos3, strand, mapq, tag, tag[0], 1, sam_line]
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def observed_tags_in_block(
    block: list[
        tuple[
            tuple[str, str, str, int, int],
            pysam.AlignedSegment,
            pysam.AlignedSegment,
        ]
    ],
) -> set[str]:
    """
    Collect every distinct fragment tag named by the entries of a block.

    Only the parsed-qname tuple of each entry is inspected; the two reads
    are ignored.

    Examples
    --------
    >>> block = [
    ...     (("readA", "F1", "R1", 2, 1), None, None),
    ...     (("readA", "F2", "R1", 2, 1), None, None),
    ... ]
    >>> observed_tags_in_block(block) == {"F1", "F2", "R1"}
    True
    """
    tags: set[str] = set()
    for (_, first_tag, second_tag, _, _), _, _ in block:
        tags.update((first_tag, second_tag))
    return tags
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def tag_index(tag: str) -> int:
    """
    Convert everything after the first character of a tag to an integer.

    Examples
    --------
    >>> tag_index("F1")
    1
    >>> tag_index("R12")
    12
    """
    digits = tag[1:]
    return int(digits)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def valid_slot(slot: list | None, min_mapq: int) -> bool:
    """
    Tell whether a slot is present and its MAPQ is at least `min_mapq`.

    Examples
    --------
    >>> valid_slot(['chr1', 10, 20, '+', 5, 'F1', 'F', 1, None], 10)
    False
    >>> valid_slot(['chr1', 10, 20, '+', 20, 'F1', 'F', 1, None], 10)
    True
    >>> valid_slot(None, 10)
    False
    """
    if slot is None:
        return False
    return slot[MAPQ] >= min_mapq
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def slot_len(slot: list) -> int:
    """
    Number of reference positions between the 5' and 3' ends, inclusive.

    Examples
    --------
    >>> slot_len(['chr1', 100, 109, '+', 30, 'F1', 'F', 1, None])
    10
    >>> slot_len(['chr1', 219, 200, '-', 30, 'R1', 'R', 1, None])
    20
    """
    three_prime, five_prime = slot[POS3], slot[POS5]
    distance = abs(three_prime - five_prime)
    return distance + 1
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def adjacent_gap(a: list | None, b: list | None) -> int | None:
    """
    Oriented distance between the 3' end of `a` and the 5' end of `b`.

    On the '+' strand:
        gap = b.pos5 - a.pos3 - 1
    On the '-' strand:
        gap = a.pos3 - b.pos5 - 1

    Returns
    -------
    The gap, or None when either slot is missing or the two slots differ in
    chromosome or strand.

    Examples
    --------
    >>> a = ['chr1', 100, 120, '+', 30, 'F1', 'F', 1, None]
    >>> b = ['chr1', 126, 140, '+', 30, 'F2', 'F', 1, None]
    >>> adjacent_gap(a, b)
    5
    """
    if a is None or b is None:
        return None

    same_chrom = a[CHROM] == b[CHROM]
    if not same_chrom:
        return None
    same_strand = a[STRAND] == b[STRAND]
    if not same_strand:
        return None

    end_of_a, start_of_b = a[POS3], b[POS5]
    if a[STRAND] == "+":
        return start_of_b - end_of_a - 1
    return end_of_a - start_of_b - 1
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def merge_two_same_origin(a: list, b: list) -> list:
    """
    Combine two slots into a new single slot; neither input is modified.

    Rules
    -----
    - chromosome, strand and origin are taken from `a`
    - the span is extended to cover both slots (direction follows a's strand)
    - MAPQ is the larger of the two
    - merged_count is the sum of both counts
    - anchor tag and SAM line come from the slot with the smaller tag
      index (`a` on a tie)

    Examples
    --------
    >>> a = ['chr1', 100, 120, '+', 20, 'F1', 'F', 1, 'SAM_F1']
    >>> b = ['chr1', 126, 140, '+', 30, 'F2', 'F', 1, 'SAM_F2']
    >>> merge_two_same_origin(a, b)
    ['chr1', 100, 140, '+', 30, 'F1', 'F', 2, 'SAM_F1']
    """
    # On '+', the 5' end is the smallest coordinate and the 3' end the
    # largest; on '-' it is the other way round.
    pick5, pick3 = (min, max) if a[STRAND] == "+" else (max, min)
    new_pos5 = pick5(a[POS5], b[POS5])
    new_pos3 = pick3(a[POS3], b[POS3])

    survivor = a if tag_index(a[ANCHOR]) <= tag_index(b[ANCHOR]) else b

    best_mapq = max(a[MAPQ], b[MAPQ])
    total_merged = a[MERGED] + b[MERGED]
    return [
        a[CHROM],
        new_pos5,
        new_pos3,
        a[STRAND],
        best_mapq,
        survivor[ANCHOR],
        a[ORIGIN],
        total_merged,
        survivor[SAM],
    ]
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def collapse_adjacent_in_place(
    slots: list[list | None], max_gap: int, min_mapq: int
) -> int:
    """
    Collapse adjacent alive slots in place while gap is compatible.

    A slot is "alive" when `valid_slot` accepts it (present and
    MAPQ >= `min_mapq`). Each alive slot is compared with the next alive
    slot; when `adjacent_gap` yields a value within ``0 <= gap <= max_gap``
    the two are merged into the earlier position and the later position is
    set to None. Slots that are not alive are skipped and left untouched.

    Parameters
    ----------
    slots
        Slot list, modified in place.
    max_gap
        Largest gap (inclusive) that still allows a merge. Negative gaps
        never merge.
    min_mapq
        MAPQ threshold passed to `valid_slot`.

    Returns the number of merges.

    Examples
    --------
    >>> slots = [
    ...     ['chr1', 1, 10, '+', 20, 'F1', 'F', 1, None],
    ...     ['chr1', 12, 21, '+', 20, 'F2', 'F', 1, None],
    ... ]
    >>> collapse_adjacent_in_place(slots, 5, 1)
    1
    >>> slots[0]
    ['chr1', 1, 21, '+', 20, 'F1', 'F', 2, None]
    >>> slots[1] is None
    True
    """
    n = len(slots)
    merges = 0

    # Repeat full passes until one completes without any merge.
    while True:
        changed = False
        i = 0
        while i < n:
            # Move on to the next alive slot.
            if not valid_slot(slots[i], min_mapq):
                i += 1
                continue

            # Find the next alive slot after i.
            j = i + 1
            while j < n and not valid_slot(slots[j], min_mapq):
                j += 1
            if j >= n:
                # No alive slot remains after i: this pass is finished.
                break

            gap = adjacent_gap(slots[i], slots[j])
            if gap is not None and 0 <= gap <= max_gap:
                # i is deliberately not advanced: the merged slot is compared
                # against the following alive slot on the next iteration.
                slots[i] = merge_two_same_origin(slots[i], slots[j])
                slots[j] = None
                merges += 1
                changed = True
            else:
                i = j

        if not changed:
            break

    return merges
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def terminal_gap_center(f_slot: list | None, r_slot: list | None) -> int | None:
    """
    Distance between the 3' ends of a forward slot and a reverse slot.

    Returns
    -------
    ``r.pos3 - f.pos3 - 1`` for a (+, -) pair, ``f.pos3 - r.pos3 - 1`` for a
    (-, +) pair, and None when a slot is missing, the chromosomes differ, or
    both slots are on the same strand.

    Examples
    --------
    >>> f = ['chr1', 100, 150, '+', 20, 'F2', 'F', 1, None]
    >>> r = ['chr1', 200, 160, '-', 20, 'R1', 'R', 1, None]
    >>> terminal_gap_center(f, r)
    9
    """
    if f_slot is None or r_slot is None:
        return None
    if f_slot[CHROM] != r_slot[CHROM]:
        return None

    f_strand, r_strand = f_slot[STRAND], r_slot[STRAND]
    f_end, r_end = f_slot[POS3], r_slot[POS3]

    if f_strand == "+" and r_strand == "-":
        return r_end - f_end - 1
    if f_strand == "-" and r_strand == "+":
        return f_end - r_end - 1
    return None
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def last_alive_index(slots: list[list | None], min_mapq: int) -> int | None:
    """
    Find the highest index whose slot passes `valid_slot`.

    Returns None when no slot qualifies.

    Examples
    --------
    >>> last_alive_index([None, ['chr1', 1, 10, '+', 20, 'F2', 'F', 1, None]], 1)
    1
    """
    position = len(slots)
    # Walk backwards so the first hit is the last alive slot.
    while position > 0:
        position -= 1
        if valid_slot(slots[position], min_mapq):
            return position
    return None
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def collapse_terminal_fr_in_place(
    forward: list[list | None],
    reverse: list[list | None],
    max_center_gap: int,
    overlap_tolerance: int,
    min_mapq: int,
) -> tuple[bool, int | None]:
    """
    Drop one of the two terminal slots when they sit close enough together.

    The last alive forward slot and the last alive reverse slot are compared
    with `terminal_gap_center`. When the gap lies within
    ``-overlap_tolerance <= gap <= max_center_gap``, one of them is set to
    None in its list.

    Policy
    ------
    The slot with the higher MAPQ is kept; on a tie the forward slot is kept.

    Returns
    -------
    (collapsed, gap)
        `gap` is None when either list has no alive slot or the gap is
        undefined for the pair.

    Examples
    --------
    >>> F = [['chr1', 100, 119, '+', 30, 'F1', 'F', 1, 'SAM_F1']]
    >>> R = [['chr1', 140, 125, '-', 20, 'R1', 'R', 1, 'SAM_R1']]
    >>> collapse_terminal_fr_in_place(F, R, 10, 0, 1)
    (True, 5)
    >>> F[0] is not None, R[0] is None
    (True, True)
    """
    f_idx = last_alive_index(forward, min_mapq)
    r_idx = last_alive_index(reverse, min_mapq)
    if f_idx is None or r_idx is None:
        return False, None

    f_terminal, r_terminal = forward[f_idx], reverse[r_idx]
    gap = terminal_gap_center(f_terminal, r_terminal)
    if gap is None:
        return False, None

    too_much_overlap = gap < -overlap_tolerance
    too_far_apart = gap > max_center_gap
    if too_much_overlap or too_far_apart:
        return False, gap

    # Strictly better reverse MAPQ is the only case where forward is dropped.
    if r_terminal[MAPQ] > f_terminal[MAPQ]:
        forward[f_idx] = None
    else:
        reverse[r_idx] = None
    return True, gap
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def alive_slots(slots: list[list | None], min_mapq: int) -> list[list]:
    """
    Keep, in order, only the slots that pass `valid_slot`.

    Examples
    --------
    >>> alive_slots([None, ['chr1', 1, 10, '+', 20, 'F2', 'F', 1, None]], 1)
    [['chr1', 1, 10, '+', 20, 'F2', 'F', 1, None]]
    """
    kept: list[list] = []
    for candidate in slots:
        if valid_slot(candidate, min_mapq):
            kept.append(candidate)
    return kept
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
def multiplicity_status(initial_count: int, final_count: int) -> str:
    """
    Classify a molecule from its fragment count before and after resolution.

    Returns
    -------
    One of ``"true_multiplex"``, ``"resolved_from_multiplex"``,
    ``"dropped_from_multiplex"``, ``"simple"`` or ``"dropped"``.

    Examples
    --------
    >>> multiplicity_status(4, 3)
    'true_multiplex'
    >>> multiplicity_status(4, 2)
    'resolved_from_multiplex'
    """
    if initial_count > 2:
        # Started with more than two fragments: outcome depends on what is left.
        if final_count > 2:
            return "true_multiplex"
        if final_count == 2:
            return "resolved_from_multiplex"
        return "dropped_from_multiplex"

    started_and_stayed_pair = initial_count == 2 and final_count == 2
    return "simple" if started_and_stayed_pair else "dropped"
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
def build_pairs_columns(
    sam_output: bool = False,
    qual_stats: bool = False,
    filter_stats: bool = False,
) -> list[str]:
    """
    List the output column names for the enabled options.

    The thirteen base columns always come first; each enabled option then
    appends its own group, in the order sam, qual, filter.

    Examples
    --------
    >>> build_pairs_columns()
    ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type', 'tag1', 'tag2', 'ft', 'rt', 'status']
    >>> build_pairs_columns(sam_output=True)
    ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type', 'tag1', 'tag2', 'ft', 'rt', 'status', 'sam1', 'sam2']
    >>> build_pairs_columns(sam_output=True, qual_stats=True, filter_stats=True)
    ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type', 'tag1', 'tag2', 'ft', 'rt', 'status', 'sam1', 'sam2', 'len1', 'len2', 'mapq1', 'mapq2', 'hard_merged', 'terminal_aliased']
    """
    columns = [
        "readID", "chrom1", "pos1", "chrom2", "pos2",
        "strand1", "strand2", "pair_type",
        "tag1", "tag2", "ft", "rt", "status",
    ]
    optional_groups = (
        (sam_output, ("sam1", "sam2")),
        (qual_stats, ("len1", "len2", "mapq1", "mapq2")),
        (filter_stats, ("hard_merged", "terminal_aliased")),
    )
    for enabled, names in optional_groups:
        if enabled:
            columns.extend(names)
    return columns
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def serialize_pair_line(
    read_id: str,
    a: list,
    b: list,
    ft: int,
    rt: int,
    status: str,
    sam_output: bool = False,
    qual_stats: bool = False,
    filter_stats: bool = False,
    terminal_aliased: bool = False,
) -> str:
    """
    Render one pair of slots as a tab-separated line (no trailing newline).

    The 5' position of each slot is written as its coordinate and the pair
    type is always ``"UU"``. Optional field groups follow the base fields in
    the order sam, qual, filter. A missing SAM line is written as ``"."``;
    ``hard_merged`` is 1 when either slot has a merged count above 1.

    Examples
    --------
    >>> a = ['chr1', 10, 20, '+', 30, 'F1', 'F', 2, 'SAM_A']
    >>> b = ['chr2', 40, 30, '-', 25, 'R1', 'R', 1, 'SAM_B']
    >>> serialize_pair_line("read1", a, b, 2, 1, "resolved_from_multiplex")
    'read1\\tchr1\\t10\\tchr2\\t40\\t+\\t-\\tUU\\tF1\\tR1\\t2\\t1\\tresolved_from_multiplex'
    >>> serialize_pair_line("read1", a, b, 2, 1, "resolved_from_multiplex", sam_output=True)
    'read1\\tchr1\\t10\\tchr2\\t40\\t+\\t-\\tUU\\tF1\\tR1\\t2\\t1\\tresolved_from_multiplex\\tSAM_A\\tSAM_B'
    """
    parts = [read_id]
    parts += [a[CHROM], str(a[POS5]), b[CHROM], str(b[POS5])]
    parts += [a[STRAND], b[STRAND], "UU"]
    parts += [a[ANCHOR], b[ANCHOR], str(ft), str(rt), status]

    if sam_output:
        for slot in (a, b):
            sam_line = slot[SAM]
            parts.append("." if sam_line is None else sam_line)

    if qual_stats:
        parts += [str(slot_len(a)), str(slot_len(b))]
        parts += [str(a[MAPQ]), str(b[MAPQ])]

    if filter_stats:
        any_merged = a[MERGED] > 1 or b[MERGED] > 1
        parts.append("1" if any_merged else "0")
        parts.append("1" if terminal_aliased else "0")

    return "\t".join(parts)
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
def build_pair_lines(
    base_name: str,
    forward: list[list | None],
    reverse: list[list | None],
    ft: int,
    rt: int,
    min_mapq: int,
    status: str,
    chrom_order: dict[str, int],
    flip: bool,
    sam_output: bool = False,
    qual_stats: bool = False,
    filter_stats: bool = False,
    terminal_aliased: bool = False,
) -> list[str]:
    """
    Emit one serialized line for every unordered pair of alive slots.

    Alive forward slots are listed before alive reverse slots, and every
    combination of two is serialized. With `flip` enabled, the two slots of
    a pair are ordered by ``(chrom_order rank, pos5)``; chromosomes missing
    from `chrom_order` rank as 10**9. Without `flip` the listing order is
    kept.

    Examples
    --------
    >>> F = [['chr1', 1, 10, '+', 20, 'F1', 'F', 1, 'SAM_F1']]
    >>> R = [['chr2', 50, 41, '-', 20, 'R1', 'R', 1, 'SAM_R1']]
    >>> lines = build_pair_lines("q", F, R, 1, 1, 1, "simple", {"chr1": 0, "chr2": 1}, True, sam_output=True)
    >>> len(lines)
    1
    >>> lines[0].startswith('q\\tchr1\\t1\\tchr2\\t50')
    True
    >>> '\\tSAM_F1\\tSAM_R1' in lines[0]
    True
    """

    def sort_key(node: list) -> tuple:
        # Rank by chromosome order first, then by 5' coordinate.
        return (chrom_order.get(node[CHROM], 10**9), node[POS5])

    candidates = alive_slots(forward, min_mapq) + alive_slots(reverse, min_mapq)
    out: list[str] = []

    for position, left in enumerate(candidates):
        for right in candidates[position + 1:]:
            first, second = left, right
            if flip and sort_key(left) > sort_key(right):
                first, second = right, left
            out.append(
                serialize_pair_line(
                    base_name,
                    first,
                    second,
                    ft,
                    rt,
                    status,
                    sam_output=sam_output,
                    qual_stats=qual_stats,
                    filter_stats=filter_stats,
                    terminal_aliased=terminal_aliased,
                )
            )

    return out
|
|
688
|
+
|
|
689
|
+
|
|
690
|
+
def parse_block_to_lines(
    block: list[
        tuple[
            tuple[str, str, str, int, int], pysam.AlignedSegment, pysam.AlignedSegment
        ]
    ],
    min_mapq: int,
    adjacent_gap_max: int,
    terminal_center_gap_max: int,
    terminal_overlap_tolerance: int,
    chrom_order: dict[str, int],
    flip: bool = True,
    sam_output: bool = False,
    qual_stats: bool = False,
    filter_stats: bool = False,
    strict_complete_cover: bool = False,
) -> tuple[list[str], str, dict]:
    """
    Turn the alignments of one molecule into serialized pair lines.

    Steps
    -----
    1. Read ``ft``/``rt`` from the first entry and allocate that many
       forward/reverse slots.
    2. Fill each slot from the first read seen for its tag.
    3. When ``ft + rt > 2``, merge adjacent slots on each side and try to
       collapse the terminal forward/reverse pair.
    4. Emit every pair of the remaining alive slots.

    Because slots are rebuilt per tag before pairing, the same code path
    serves all-pairs and cover-pairs BAM inputs.

    Returns
    -------
    (lines, status, core)
        `lines` is empty for an empty block, for an incomplete block when
        `strict_complete_cover` is set, and when fewer than two slots stay
        alive. `status` is ``"empty"``, ``"incomplete_cover"`` or the value
        of `multiplicity_status`. `core` is a dict of per-molecule counters.

    Examples
    --------
    >>> class FakeRead:
    ...     def __init__(self, qname, chrom, start0, span, mapq, is_rev):
    ...         self.query_name = qname
    ...         self.is_unmapped = False
    ...         self.reference_name = chrom
    ...         self.reference_start = start0
    ...         self.mapping_quality = mapq
    ...         self.is_reverse = is_rev
    ...         self.cigartuples = [(0, span)]
    ...     def to_string(self):
    ...         return self.query_name
    ...
    >>> rF = FakeRead("q", "chr1", 99, 20, 30, False)
    >>> rR = FakeRead("q", "chr1", 124, 15, 20, True)
    >>> block = [(("q","F1","R1",1,1), rF, rR)]
    >>> lines, status, core = parse_block_to_lines(block, min_mapq=1, adjacent_gap_max=5, terminal_center_gap_max=10, terminal_overlap_tolerance=0, chrom_order={"chr1":0})
    >>> status
    'simple'
    >>> core["terminal_aliased"]
    False
    >>> len(lines)
    1
    >>> rF1 = FakeRead("q", "chr1", 99, 10, 30, False)
    >>> rF2 = FakeRead("q", "chr1", 111, 10, 30, False)
    >>> rR1 = FakeRead("q", "chr2", 199, 10, 30, True)
    >>> block = [
    ...     (("q","F1","R1",2,1), rF1, rR1),
    ...     (("q","F2","R1",2,1), rF2, rR1),
    ... ]
    >>> lines, status, core = parse_block_to_lines(block, min_mapq=1, adjacent_gap_max=5, terminal_center_gap_max=300, terminal_overlap_tolerance=1, chrom_order={"chr1":0,"chr2":1})
    >>> status
    'resolved_from_multiplex'
    >>> core["n_forward_merges"] >= 1
    True
    >>> len(lines) >= 1
    True
    """

    def make_core(**known) -> dict:
        # Summary dict with a fixed key order; unspecified entries keep the
        # "nothing happened" defaults.
        summary = {
            "base_name": None,
            "ft": 0,
            "rt": 0,
            "initial_count": 0,
            "observed_tag_count": 0,
            "valid_tag_count": 0,
            "final_count": 0,
            "n_forward_merges": 0,
            "n_reverse_merges": 0,
            "terminal_aliased": False,
            "terminal_gap": None,
            "input_mode": "unknown",
            "complete_cover": False,
        }
        summary.update(known)
        return summary

    if not block:
        return [], "empty", make_core()

    base_name, _, _, ft, rt = block[0][0]
    initial_count = ft + rt

    observed_tag_count = len(observed_tags_in_block(block))
    complete_cover = observed_tag_count == initial_count

    if strict_complete_cover and not complete_cover:
        partial = make_core(
            base_name=base_name,
            ft=ft,
            rt=rt,
            initial_count=initial_count,
            observed_tag_count=observed_tag_count,
            input_mode="cover_or_partial",
        )
        return [], "incomplete_cover", partial

    forward: list[list | None] = [None] * ft
    reverse: list[list | None] = [None] * rt

    def place(tag: str, index: int, read) -> None:
        # First read seen for a tag wins; later ones are ignored.
        lane = forward if tag[0] == "F" else reverse
        if lane[index] is None:
            lane[index] = read_to_minimal_alignment(read, tag, sam_output=sam_output)

    for (_, tag1, tag2, _, _), read1, read2 in block:
        # Tags are 1-based; slots are 0-based.
        index1 = tag_index(tag1) - 1
        index2 = tag_index(tag2) - 1
        place(tag1, index1, read1)
        place(tag2, index2, read2)

    valid_tag_count = len(alive_slots(forward, min_mapq)) + len(
        alive_slots(reverse, min_mapq)
    )

    n_forward_merges = 0
    n_reverse_merges = 0
    terminal_aliased = False
    terminal_gap = None
    if initial_count > 2:
        # Merging and terminal collapse only apply to multiplex molecules.
        n_forward_merges = collapse_adjacent_in_place(
            forward, adjacent_gap_max, min_mapq
        )
        n_reverse_merges = collapse_adjacent_in_place(
            reverse, adjacent_gap_max, min_mapq
        )
        terminal_aliased, terminal_gap = collapse_terminal_fr_in_place(
            forward,
            reverse,
            terminal_center_gap_max,
            terminal_overlap_tolerance,
            min_mapq,
        )

    final_count = len(alive_slots(forward, min_mapq)) + len(
        alive_slots(reverse, min_mapq)
    )
    status = multiplicity_status(initial_count, final_count)

    core = make_core(
        base_name=base_name,
        ft=ft,
        rt=rt,
        initial_count=initial_count,
        observed_tag_count=observed_tag_count,
        valid_tag_count=valid_tag_count,
        final_count=final_count,
        n_forward_merges=n_forward_merges,
        n_reverse_merges=n_reverse_merges,
        terminal_aliased=terminal_aliased,
        terminal_gap=terminal_gap,
        input_mode="all_or_complete_cover" if complete_cover else "cover_or_partial",
        complete_cover=complete_cover,
    )

    if final_count < 2:
        # A pair needs two alive slots.
        return [], status, core

    lines = build_pair_lines(
        base_name=base_name,
        forward=forward,
        reverse=reverse,
        ft=ft,
        rt=rt,
        min_mapq=min_mapq,
        status=status,
        chrom_order=chrom_order,
        flip=flip,
        sam_output=sam_output,
        qual_stats=qual_stats,
        filter_stats=filter_stats,
        terminal_aliased=terminal_aliased,
    )
    return lines, status, core
|
|
926
|
+
|
|
927
|
+
|
|
928
|
+
def _simple_pair_to_line(
    read1: pysam.AlignedSegment,
    read2: pysam.AlignedSegment,
    *,
    min_mapq: int,
    chrom_order: dict[str, int],
    flip: bool,
    sam_output: bool,
    qual_stats: bool,
    filter_stats: bool,
) -> str | None:
    """
    Serialize one read pair as a single `.pairs` line (simple mode).

    Both reads are converted with `read_to_minimal_alignment` (anchor tags
    "F1" for `read1` and "R1" for `read2`). No merging and no terminal
    collapse is attempted: the line is always emitted with ft=1, rt=1,
    status "simple" and terminal_aliased=False.

    Parameters
    ----------
    read1, read2
        The two alignments of the pair. `read1.query_name` is used as the
        read id of the emitted line.
    min_mapq
        Threshold handed to `valid_slot` for each of the two alignments.
    chrom_order
        Chromosome name -> rank. Names missing from the mapping are ranked
        with the sentinel 10**9, i.e. after every known chromosome.
    flip
        When true, the two sides are swapped if needed so that the side with
        the smaller (chromosome rank, POS5) key comes first.
    sam_output, qual_stats, filter_stats
        Forwarded unchanged to `read_to_minimal_alignment` (sam_output only)
        and `serialize_pair_line`.

    Returns
    -------
    str | None
        The serialized line, or None when either alignment is rejected by
        `valid_slot`.

    Examples
    --------
    >>> class FakeRead:
    ...     def __init__(self, qname, chrom, start0, span, mapq, is_rev):
    ...         self.query_name = qname
    ...         self.is_unmapped = False
    ...         self.reference_name = chrom
    ...         self.reference_start = start0
    ...         self.mapping_quality = mapq
    ...         self.is_reverse = is_rev
    ...         self.cigartuples = [(0, span)]
    ...     def to_string(self):
    ...         return "x"
    >>> r1 = FakeRead("q", "chr2", 10, 10, 30, False)
    >>> r2 = FakeRead("q", "chr1", 10, 10, 30, False)
    >>> line = _simple_pair_to_line(r1, r2, min_mapq=1, chrom_order={"chr1":0,"chr2":1}, flip=True, sam_output=False, qual_stats=False, filter_stats=False)
    >>> line.split("\\t")[1]  # chrom1 after flip
    'chr1'
    """
    first = read_to_minimal_alignment(read1, "F1", sam_output=sam_output)
    second = read_to_minimal_alignment(read2, "R1", sam_output=sam_output)

    # Both sides must pass the slot filter; a single failure drops the pair.
    if not (valid_slot(first, min_mapq) and valid_slot(second, min_mapq)):
        return None

    if flip:

        def order_key(aln):
            # Unknown chromosomes sort after all known ones.
            return (chrom_order.get(aln[CHROM], 10**9), aln[POS5])

        if order_key(first) > order_key(second):
            first, second = second, first

    # No merges, no terminal collapse: fixed tags and a plain "simple" status.
    return serialize_pair_line(
        read_id=read1.query_name,  # the pair's shared query name
        a=first,
        b=second,
        ft=1,
        rt=1,
        status="simple",
        sam_output=sam_output,
        qual_stats=qual_stats,
        filter_stats=filter_stats,
        terminal_aliased=False,
    )
|
|
981
|
+
|
|
982
|
+
|
|
983
|
+
def parse_to_pairs(
    bam_for_path: str,
    bam_rev_path: str | None = None,
    mode: Literal["simple", "split"] = "split",
    out_pairs: str | None = None,
    out_duplex: str | None = None,
    out_true_multiplex_pairs: str | None = None,
    min_mapq: int = 1,
    adjacent_gap_max: int = 5,
    terminal_center_gap_max: int = 300,
    terminal_overlap_tolerance: int = 1,
    assembly: str | None = None,
    bam_threads: int = 1,
    out_threads: int = 1,
    flip: bool = True,
    write_batch_size: int = 50000,
    sam_output: bool = False,
    qual_stats: bool = False,
    filter_stats: bool = False,
    strict_complete_cover: bool = False,
    single_bam: bool = False,
    version: str = "0.0.0",
) -> dict[str, int]:
    """
    Parse remapped split BAM input into `.pairs` outputs.

    Supports:
    - two synchronized BAM files (`bam_for_path`, `bam_rev_path`)
    - one interleaved BAM file (`single_bam=True`)

    Parameters
    ----------
    bam_for_path
        Forward BAM, or the single interleaved BAM when `single_bam` is true.
    bam_rev_path
        Reverse BAM; required unless `single_bam` is true.
    mode
        "simple": every read pair yields zero or one line through
        `_simple_pair_to_line`. Any other value: consecutive pairs sharing the
        base name returned by `parse_qname_any` are grouped into a block and
        handed to `parse_block_to_lines`.
    out_pairs
        Destination for every emitted line. Mandatory in simple mode.
    out_duplex
        Destination for the lines of blocks whose status is
        "resolved_from_multiplex" (split mode only).
    out_true_multiplex_pairs
        Destination for the lines of blocks whose status is "true_multiplex"
        (split mode only).
    min_mapq, adjacent_gap_max, terminal_center_gap_max,
    terminal_overlap_tolerance, strict_complete_cover
        Forwarded to `parse_block_to_lines` (`min_mapq` also to
        `_simple_pair_to_line`).
    assembly
        Forwarded to `make_pairs_header`.
    bam_threads
        Forwarded to the BAM pair iterator.
    out_threads
        Forwarded to `open_text_output` as `nproc`.
    flip
        Forwarded to the line builders; also selects the header shape
        ("upper triangle" when true, "whole matrix" otherwise).
    write_batch_size
        Number of buffered lines in any one output that triggers a flush.
    sam_output, qual_stats, filter_stats
        Forwarded to `build_pairs_columns` and to the line builders.
    single_bam
        Read pairs from one interleaved BAM instead of two files.
    version
        Written to the header as the program version.

    Returns
    -------
    dict[str, int]
        Counters. Simple mode fills "pairs_seen", "pairs_dropped" and
        "pairs_written". Split mode fills "molecules", "status_<status>",
        "pairs_total", "complete_cover_blocks" / "incomplete_cover_blocks",
        "observed_tags_total", "valid_tags_total" and "final_nodes_total".

    Raises
    ------
    ValueError
        If no output is given, if simple mode is requested without
        `out_pairs`, if `bam_rev_path` is missing while `single_bam` is
        false, or if the two reads of a pair carry different query names.

    Notes
    -----
    - first-seen policy per tag
    - no heavy stats
    - direct line serialization
    - buffered writes
    - buffers are only flushed on success; on error the outputs are closed
      with whatever had already been written
    """
    # Function-scope import so the module-level import section is untouched.
    from contextlib import ExitStack

    if out_pairs is None and out_duplex is None and out_true_multiplex_pairs is None:
        raise ValueError(
            "Provide at least one output: out_pairs or out_duplex or out_true_multiplex_pairs"
        )
    # Checked before any file is touched, so a bad call cannot leave
    # header-only output files behind.
    if mode == "simple" and out_pairs is None:
        raise ValueError("--simple requires out_pairs")

    if single_bam:
        header_for_dict = get_bam_header_single(bam_for_path)
    else:
        if bam_rev_path is None:
            raise ValueError("BAM Reverse File is required unless single_bam=True")
        header_for_dict, _ = get_bam_headers(bam_for_path, bam_rev_path)

    chromsizes = chromsizes_from_header(header_for_dict)
    chrom_order = {chrom: i for i, (chrom, _) in enumerate(chromsizes)}

    columns = build_pairs_columns(
        sam_output=sam_output,
        qual_stats=qual_stats,
        filter_stats=filter_stats,
    )

    header_lines = make_pairs_header(
        chromsizes=chromsizes,
        columns=columns,
        assembly=assembly,
        shape="upper triangle" if flip else "whole matrix",
        sorted_by="none",
        program_id="splitparse",  # optional but recommended
        program_version=version,
    )

    counts: dict[str, int] = defaultdict(int)
    all_buffer: list[str] = []
    resolved_buffer: list[str] = []
    multiplex_buffer: list[str] = []

    # The ExitStack closes every handle that was successfully opened, even if
    # a later open (or anything after it) raises, and keeps closing the
    # remaining handles when one close() itself fails.
    with ExitStack() as stack:

        def open_output(path: str | None):
            """Open one optional output and register its close(); None if unset."""
            if not path:
                return None
            handle = open_text_output(path, nproc=out_threads)
            stack.callback(handle.close)
            return handle

        all_handle = open_output(out_pairs)
        resolved_handle = open_output(out_duplex)
        multiplex_handle = open_output(out_true_multiplex_pairs)

        # (handle, pending lines) for each output, in write order.
        sinks = (
            (all_handle, all_buffer),
            (resolved_handle, resolved_buffer),
            (multiplex_handle, multiplex_buffer),
        )

        def flush_buffers() -> None:
            """Write and empty every non-empty buffer that has an open handle."""
            for handle, pending in sinks:
                if handle is not None and pending:
                    handle.write("".join(pending))
                    pending.clear()

        def flush_block(
            block: list[
                tuple[
                    tuple[str, str, str, int, int],
                    pysam.AlignedSegment,
                    pysam.AlignedSegment,
                ]
            ],
        ) -> None:
            """Resolve one molecule block, update counters and buffer its lines."""
            if not block:
                return

            lines, status, core = parse_block_to_lines(
                block=block,
                min_mapq=min_mapq,
                adjacent_gap_max=adjacent_gap_max,
                terminal_center_gap_max=terminal_center_gap_max,
                terminal_overlap_tolerance=terminal_overlap_tolerance,
                chrom_order=chrom_order,
                flip=flip,
                sam_output=sam_output,
                qual_stats=qual_stats,
                filter_stats=filter_stats,
                strict_complete_cover=strict_complete_cover,
            )

            counts["molecules"] += 1
            counts[f"status_{status}"] += 1
            counts["pairs_total"] += len(lines)

            if core["complete_cover"]:
                counts["complete_cover_blocks"] += 1
            else:
                counts["incomplete_cover_blocks"] += 1

            counts["observed_tags_total"] += core["observed_tag_count"]
            counts["valid_tags_total"] += core["valid_tag_count"]
            counts["final_nodes_total"] += core["final_count"]

            # Terminate the lines once, then share them across the outputs.
            terminated = [line + "\n" for line in lines]

            if all_handle is not None:
                all_buffer.extend(terminated)
            if status == "resolved_from_multiplex" and resolved_handle is not None:
                resolved_buffer.extend(terminated)
            if status == "true_multiplex" and multiplex_handle is not None:
                multiplex_buffer.extend(terminated)

            if any(len(pending) >= write_batch_size for _, pending in sinks):
                flush_buffers()

        for handle, _ in sinks:
            if handle is not None:
                write_pairs_header(handle, header_lines)

        if single_bam:
            pair_iter = iter_bam_pairs_single(
                bam_for_path,
                bam_threads=bam_threads,
            )
        else:
            pair_iter = iter_bam_pairs(
                bam_for_path,
                bam_rev_path,
                bam_threads=bam_threads,
            )

        if mode == "simple":
            # Simple loop: one pair -> zero or one line.
            for read1, read2 in pair_iter:
                if read2.query_name != read1.query_name:
                    raise ValueError("Unsynchronized pair: query_name mismatch")

                line = _simple_pair_to_line(
                    read1,
                    read2,
                    min_mapq=min_mapq,
                    chrom_order=chrom_order,
                    flip=flip,
                    sam_output=sam_output,
                    qual_stats=qual_stats,
                    filter_stats=filter_stats,
                )
                counts["pairs_seen"] += 1
                if line is None:
                    counts["pairs_dropped"] += 1
                    continue

                all_buffer.append(line + "\n")
                counts["pairs_written"] += 1
                if len(all_buffer) >= write_batch_size:
                    flush_buffers()
        else:
            current_base: str | None = None
            current_block: list[
                tuple[
                    tuple[str, str, str, int, int],
                    pysam.AlignedSegment,
                    pysam.AlignedSegment,
                ]
            ] = []

            for read1, read2 in pair_iter:
                if read2.query_name != read1.query_name:
                    raise ValueError("Unsynchronized pair: query_name mismatch")

                info = parse_qname_any(read1.query_name)
                base_name = info[0]

                if current_base is None:
                    current_base = base_name
                elif base_name != current_base:
                    # A new base name closes the block collected so far.
                    flush_block(current_block)
                    current_block = []
                    current_base = base_name

                current_block.append((info, read1, read2))

            # The last block has no following pair to trigger its flush.
            flush_block(current_block)

        flush_buffers()

    return dict(counts)
|