split3c 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1218 @@
1
+ from collections import defaultdict
2
+ from typing import Literal
3
+
4
+ import pysam
5
+
6
+ from .bam import (
7
+ chromsizes_from_header,
8
+ get_bam_header_single,
9
+ get_bam_headers,
10
+ iter_bam_pairs,
11
+ iter_bam_pairs_single,
12
+ )
13
+ from .io_utils import open_text_output
14
+ from .pairs import make_pairs_header, write_pairs_header
15
+
16
# Field positions inside the mutable "minimal alignment" list:
#   [chrom, pos5, pos3, strand, mapq, anchor_tag, origin, merged_count, sam]
(
    CHROM,
    POS5,
    POS3,
    STRAND,
    MAPQ,
    ANCHOR,
    ORIGIN,
    MERGED,
    SAM,
) = range(9)

# CIGAR operation codes that advance along the reference: M, D, N, =, X.
REF_CONSUMING_OPS = {0, 2, 3, 7, 8}
29
+
30
+
31
def parse_qname_any(qname: str) -> tuple[str, str, str, int, int]:
    """
    Parse either an enriched or a classic query name.

    Supports:
      - enriched: ``base:[F1,R2:FT3,RT2]``
      - classic : ``base`` -> treated as FT1/RT1 with tags F1,R1; everything
        from the first ``:SP`` onwards is stripped from the name.

    Returns
    -------
    (base_name, tag1, tag2, ft, rt)

    Raises
    ------
    ValueError
        If the name contains ``:[`` but does not follow the enriched layout,
        including when the closing ``]`` is missing.

    >>> parse_qname_any("READ")
    ('READ', 'F1', 'R1', 1, 1)
    >>> parse_qname_any("readA:[F1,R2:FT3,RT2]")
    ('readA', 'F1', 'R2', 3, 2)
    >>> parse_qname_any("READ:SP")
    ('READ', 'F1', 'R1', 1, 1)
    >>> parse_qname_any("READ:SP_4")
    ('READ', 'F1', 'R1', 1, 1)
    >>> parse_qname_any("readA:[F1,R2:FT3,RT25")
    Traceback (most recent call last):
    ...
    ValueError: Invalid qname format: 'readA:[F1,R2:FT3,RT25'
    """
    # Fast path for classic names: no bracketed annotation at all.
    if ":[" not in qname:
        if ":SP" in qname:
            qname = qname.split(":SP", 1)[0]
        return qname, "F1", "R1", 1, 1

    # Enriched path.
    try:
        base_name, rest = qname.rsplit(":[", 1)
        # Check the closing bracket instead of blindly dropping the last
        # character: otherwise a truncated name such as "...,RT25" would
        # silently lose its final digit and parse as rt=2.
        if not rest.endswith("]"):
            raise ValueError("missing closing ']'")
        tags_part, counts_part = rest[:-1].split(":FT", 1)
        tag1, tag2 = tags_part.split(",", 1)
        ft_str, rt_str = counts_part.split(",RT", 1)
        return base_name, tag1, tag2, int(ft_str), int(rt_str)
    except (ValueError, AttributeError, TypeError) as exc:
        raise ValueError(f"Invalid qname format: {qname!r}") from exc
62
+
63
+
64
def parse_qname(qname: str) -> tuple[str, str, str, int, int]:
    """
    Parse a microsplit qname without regex.

    Expected format:
        <base_name>:[<tag1>,<tag2>:FT<ft>,RT<rt>]

    Returns
    -------
    (base_name, tag1, tag2, ft, rt)

    Raises
    ------
    ValueError
        If the name does not follow the expected layout, including when the
        closing ``]`` is missing.

    Examples
    --------
    >>> parse_qname("readA:[F1,R2:FT3,RT2]")
    ('readA', 'F1', 'R2', 3, 2)

    >>> parse_qname("bad_qname")
    Traceback (most recent call last):
    ...
    ValueError: Invalid microsplit qname: 'bad_qname'

    >>> parse_qname("readA:[F1,R2:FT3,RT25")
    Traceback (most recent call last):
    ...
    ValueError: Invalid microsplit qname: 'readA:[F1,R2:FT3,RT25'
    """
    try:
        base_name, rest = qname.rsplit(":[", 1)
        # Check the closing bracket instead of blindly dropping the last
        # character: otherwise a truncated name such as "...,RT25" would
        # silently lose its final digit and parse as rt=2.
        if not rest.endswith("]"):
            raise ValueError("missing closing ']'")
        tags_part, counts_part = rest[:-1].split(":FT", 1)
        tag1, tag2 = tags_part.split(",", 1)
        ft_str, rt_str = counts_part.split(",RT", 1)
        return base_name, tag1, tag2, int(ft_str), int(rt_str)
    except (ValueError, AttributeError, TypeError) as exc:
        # Non-string input keeps surfacing as ValueError, as before.
        raise ValueError(f"Invalid microsplit qname: {qname!r}") from exc
90
+
91
+
92
def cigar_ref_span(read: pysam.AlignedSegment) -> int:
    """
    Sum the lengths of the CIGAR operations that consume the reference.

    Only operations listed in ``REF_CONSUMING_OPS`` contribute. A read whose
    ``cigartuples`` is empty or None yields 0.

    Examples
    --------
    >>> class FakeRead:
    ...     cigartuples = [(0, 10), (1, 3), (2, 2), (7, 5), (8, 1)]
    >>> cigar_ref_span(FakeRead())
    18
    """
    operations = read.cigartuples or []
    return sum(
        length for op, length in operations if op in REF_CONSUMING_OPS
    )
108
+
109
+
110
def read_to_minimal_alignment(
    read: pysam.AlignedSegment,
    tag: str,
    sam_output: bool = False,
) -> list | None:
    """
    Build the minimal mutable alignment list for one read.

    The result follows the module layout
    ``[chrom, pos5, pos3, strand, mapq, anchor_tag, origin, merged_count, sam]``
    with 1-based coordinates. ``pos5`` is the leftmost base on '+' reads and
    the rightmost base on '-' reads; ``pos3`` is the opposite end.

    Returns None when the read is unmapped or has no reference name.

    Notes
    -----
    The last field holds ``read.to_string()`` only when `sam_output=True`;
    otherwise it is None.

    Examples
    --------
    >>> class FakeRead:
    ...     is_unmapped = False
    ...     reference_name = "chr1"
    ...     mapping_quality = 42
    ...     cigartuples = [(0, 10)]
    ...     is_reverse = False
    ...     reference_start = 99
    ...     def to_string(self):
    ...         return "read1\\t0\\tchr1\\t100\\t42\\t10M\\t*\\t0\\t0\\tACGT\\tFFFF"
    >>> read_to_minimal_alignment(FakeRead(), "F1", sam_output=False)
    ['chr1', 100, 109, '+', 42, 'F1', 'F', 1, None]
    >>> read_to_minimal_alignment(FakeRead(), "F1", sam_output=True)[-1].startswith("read1\\t0\\tchr1")
    True
    """
    if read.is_unmapped or read.reference_name is None:
        return None

    if read.is_reverse:
        strand = "-"
    else:
        strand = "+"

    # Inclusive 1-based interval covered on the reference.
    leftmost = read.reference_start + 1
    rightmost = leftmost + cigar_ref_span(read) - 1

    # The 5' end sits on the left for '+' reads and on the right for '-'.
    if strand == "+":
        five_prime, three_prime = leftmost, rightmost
    else:
        five_prime, three_prime = rightmost, leftmost

    raw_sam = None
    if sam_output:
        raw_sam = read.to_string()

    alignment = [
        read.reference_name,
        five_prime,
        three_prime,
        strand,
        int(read.mapping_quality),
        tag,
        tag[0],  # origin letter of the tag, e.g. 'F' or 'R'
        1,  # a fresh node stands for exactly one fragment
        raw_sam,
    ]
    return alignment
167
+
168
+
169
def observed_tags_in_block(
    block: list[
        tuple[
            tuple[str, str, str, int, int],
            pysam.AlignedSegment,
            pysam.AlignedSegment,
        ]
    ],
) -> set[str]:
    """
    Collect every distinct fragment tag mentioned by the rows of a block.

    Each row contributes both its first and its second tag.

    Examples
    --------
    >>> block = [
    ...     (("readA", "F1", "R1", 2, 1), None, None),
    ...     (("readA", "F2", "R1", 2, 1), None, None),
    ... ]
    >>> observed_tags_in_block(block) == {"F1", "F2", "R1"}
    True
    """
    return {
        tag
        for (_, first_tag, second_tag, _, _), _, _ in block
        for tag in (first_tag, second_tag)
    }
196
+
197
+
198
def tag_index(tag: str) -> int:
    """
    Extract the integer that follows the one-letter prefix of a tag.

    Examples
    --------
    >>> tag_index("F1")
    1
    >>> tag_index("R12")
    12
    """
    numeric_part = tag[1:]
    return int(numeric_part)
210
+
211
+
212
def valid_slot(slot: list | None, min_mapq: int) -> bool:
    """
    Tell whether a slot is usable: present and with MAPQ >= `min_mapq`.

    Examples
    --------
    >>> valid_slot(['chr1', 10, 20, '+', 5, 'F1', 'F', 1, None], 10)
    False
    >>> valid_slot(['chr1', 10, 20, '+', 20, 'F1', 'F', 1, None], 10)
    True
    >>> valid_slot(None, 10)
    False
    """
    if slot is None:
        return False
    return slot[MAPQ] >= min_mapq
226
+
227
+
228
def slot_len(slot: list) -> int:
    """
    Number of reference bases covered by a slot, both ends included.

    The strand does not matter: only the distance between the 5' and 3'
    positions is used.

    Examples
    --------
    >>> slot_len(['chr1', 100, 109, '+', 30, 'F1', 'F', 1, None])
    10
    >>> slot_len(['chr1', 219, 200, '-', 30, 'R1', 'R', 1, None])
    20
    """
    five_prime = slot[POS5]
    three_prime = slot[POS3]
    return abs(three_prime - five_prime) + 1
240
+
241
+
242
def adjacent_gap(a: list | None, b: list | None) -> int | None:
    """
    Oriented distance between the 3' end of `a` and the 5' end of `b`.

    For '+' strand:
        gap = b.pos5 - a.pos3 - 1
    Otherwise:
        gap = a.pos3 - b.pos5 - 1

    Returns None when either slot is missing, or when the two slots are on
    different chromosomes or different strands.

    Examples
    --------
    >>> a = ['chr1', 100, 120, '+', 30, 'F1', 'F', 1, None]
    >>> b = ['chr1', 126, 140, '+', 30, 'F2', 'F', 1, None]
    >>> adjacent_gap(a, b)
    5
    """
    if a is None or b is None:
        return None

    same_chrom = a[CHROM] == b[CHROM]
    if not same_chrom or a[STRAND] != b[STRAND]:
        return None

    upstream_end = a[POS3]
    downstream_start = b[POS5]
    if a[STRAND] == "+":
        return downstream_start - upstream_end - 1
    return upstream_end - downstream_start - 1
270
+
271
+
272
def merge_two_same_origin(a: list, b: list) -> list:
    """
    Fuse `b` into `a` and return the single resulting node (a new list).

    Rules
    -----
    - chromosome, strand and origin are taken from `a`
    - geometry is extended to cover both nodes
    - MAPQ is the larger of the two
    - merged_count is the sum of both counts
    - the anchor tag with the smaller index wins (ties go to `a`), and the
      SAM line follows the winning anchor

    Examples
    --------
    >>> a = ['chr1', 100, 120, '+', 20, 'F1', 'F', 1, 'SAM_F1']
    >>> b = ['chr1', 126, 140, '+', 30, 'F2', 'F', 1, 'SAM_F2']
    >>> merge_two_same_origin(a, b)
    ['chr1', 100, 140, '+', 30, 'F1', 'F', 2, 'SAM_F1']
    """
    on_plus = a[STRAND] == "+"
    # On '+' the 5' end is the smallest coordinate; on '-' it is the largest.
    pick_5prime = min if on_plus else max
    pick_3prime = max if on_plus else min

    # The node carrying the earliest anchor provides both anchor and SAM.
    survivor = a if tag_index(a[ANCHOR]) <= tag_index(b[ANCHOR]) else b

    merged = [None] * 9
    merged[CHROM] = a[CHROM]
    merged[POS5] = pick_5prime(a[POS5], b[POS5])
    merged[POS3] = pick_3prime(a[POS3], b[POS3])
    merged[STRAND] = a[STRAND]
    merged[MAPQ] = max(a[MAPQ], b[MAPQ])
    merged[ANCHOR] = survivor[ANCHOR]
    merged[ORIGIN] = a[ORIGIN]
    merged[MERGED] = a[MERGED] + b[MERGED]
    merged[SAM] = survivor[SAM]
    return merged
313
+
314
+
315
def collapse_adjacent_in_place(
    slots: list[list | None], max_gap: int, min_mapq: int
) -> int:
    """
    Merge neighbouring alive slots, in place, while their gap allows it.

    Two consecutive alive slots (slots failing `valid_slot` are skipped over)
    are merged when `adjacent_gap` is defined and lies in ``[0, max_gap]``.
    The merged node replaces the left slot and the right slot becomes None.
    Passes are repeated until a full pass performs no merge.

    Returns the total number of merges performed.

    Examples
    --------
    >>> slots = [
    ...     ['chr1', 1, 10, '+', 20, 'F1', 'F', 1, None],
    ...     ['chr1', 12, 21, '+', 20, 'F2', 'F', 1, None],
    ... ]
    >>> collapse_adjacent_in_place(slots, 5, 1)
    1
    >>> slots[0]
    ['chr1', 1, 21, '+', 20, 'F1', 'F', 2, None]
    >>> slots[1] is None
    True
    """
    size = len(slots)
    total_merges = 0
    merged_this_pass = True

    while merged_this_pass:
        merged_this_pass = False
        left = 0
        while left < size:
            if not valid_slot(slots[left], min_mapq):
                left += 1
                continue

            # Next alive slot to the right of `left`, if any.
            right = next(
                (
                    k
                    for k in range(left + 1, size)
                    if valid_slot(slots[k], min_mapq)
                ),
                None,
            )
            if right is None:
                break

            gap = adjacent_gap(slots[left], slots[right])
            mergeable = gap is not None and 0 <= gap <= max_gap
            if not mergeable:
                left = right
                continue

            # Stay on `left` after a merge so the grown node is compared
            # with its new right-hand neighbour.
            slots[left] = merge_two_same_origin(slots[left], slots[right])
            slots[right] = None
            total_merges += 1
            merged_this_pass = True

    return total_merges
366
+
367
+
368
def terminal_gap_center(f_slot: list | None, r_slot: list | None) -> int | None:
    """
    Gap between the 3' ends of the terminal forward and reverse slots.

    Defined only when both slots exist, share a chromosome and have opposite
    strands ('+'/'-' or '-'/'+'); otherwise None is returned.

    Examples
    --------
    >>> f = ['chr1', 100, 150, '+', 20, 'F2', 'F', 1, None]
    >>> r = ['chr1', 200, 160, '-', 20, 'R1', 'R', 1, None]
    >>> terminal_gap_center(f, r)
    9
    """
    if f_slot is None or r_slot is None:
        return None
    if f_slot[CHROM] != r_slot[CHROM]:
        return None

    orientation = (f_slot[STRAND], r_slot[STRAND])
    f_end = f_slot[POS3]
    r_end = r_slot[POS3]

    if orientation == ("+", "-"):
        return r_end - f_end - 1
    if orientation == ("-", "+"):
        return f_end - r_end - 1
    return None
388
+
389
+
390
def last_alive_index(slots: list[list | None], min_mapq: int) -> int | None:
    """
    Position of the right-most slot that passes `valid_slot`, or None.

    Examples
    --------
    >>> last_alive_index([None, ['chr1', 1, 10, '+', 20, 'F2', 'F', 1, None]], 1)
    1
    """
    backwards = reversed(range(len(slots)))
    return next(
        (idx for idx in backwards if valid_slot(slots[idx], min_mapq)),
        None,
    )
403
+
404
+
405
def collapse_terminal_fr_in_place(
    forward: list[list | None],
    reverse: list[list | None],
    max_center_gap: int,
    overlap_tolerance: int,
    min_mapq: int,
) -> tuple[bool, int | None]:
    """
    Drop one of the two terminal F/R slots, in place, when they are compatible.

    The last alive forward slot and the last alive reverse slot are compared
    with `terminal_gap_center`. They collapse when the gap lies in
    ``[-overlap_tolerance, max_center_gap]``.

    Policy
    ------
    The slot with the higher MAPQ survives; the other is set to None.
    On a tie, the forward slot is kept.

    Returns
    -------
    (collapsed, gap)
        `gap` is None when no gap could be computed.

    Examples
    --------
    >>> F = [['chr1', 100, 119, '+', 30, 'F1', 'F', 1, 'SAM_F1']]
    >>> R = [['chr1', 140, 125, '-', 20, 'R1', 'R', 1, 'SAM_R1']]
    >>> collapse_terminal_fr_in_place(F, R, 10, 0, 1)
    (True, 5)
    >>> F[0] is not None, R[0] is None
    (True, True)
    """
    f_idx = last_alive_index(forward, min_mapq)
    r_idx = last_alive_index(reverse, min_mapq)
    if f_idx is None or r_idx is None:
        return False, None

    last_f = forward[f_idx]
    last_r = reverse[r_idx]

    gap = terminal_gap_center(last_f, last_r)
    if gap is None:
        return False, None

    within_window = -overlap_tolerance <= gap <= max_center_gap
    if not within_window:
        return False, gap

    forward_survives = last_f[MAPQ] >= last_r[MAPQ]
    if forward_survives:
        reverse[r_idx] = None
    else:
        forward[f_idx] = None
    return True, gap
452
+
453
+
454
def alive_slots(slots: list[list | None], min_mapq: int) -> list[list]:
    """
    Keep, in order, only the slots that pass `valid_slot`.

    Examples
    --------
    >>> alive_slots([None, ['chr1', 1, 10, '+', 20, 'F2', 'F', 1, None]], 1)
    [['chr1', 1, 10, '+', 20, 'F2', 'F', 1, None]]
    """
    kept: list[list] = []
    for candidate in slots:
        if valid_slot(candidate, min_mapq):
            kept.append(candidate)
    return kept
464
+
465
+
466
def multiplicity_status(initial_count: int, final_count: int) -> str:
    """
    Classify a molecule from its fragment count before and after collapsing.

    - more than 2 initially: 'true_multiplex' (still > 2),
      'resolved_from_multiplex' (exactly 2) or 'dropped_from_multiplex' (< 2)
    - exactly 2 initially and finally: 'simple'
    - anything else: 'dropped'

    Examples
    --------
    >>> multiplicity_status(4, 3)
    'true_multiplex'
    >>> multiplicity_status(4, 2)
    'resolved_from_multiplex'
    """
    if initial_count > 2:
        if final_count > 2:
            return "true_multiplex"
        if final_count == 2:
            return "resolved_from_multiplex"
        if final_count < 2:
            return "dropped_from_multiplex"
    elif initial_count == 2 and final_count == 2:
        return "simple"
    return "dropped"
486
+
487
+
488
def build_pairs_columns(
    sam_output: bool = False,
    qual_stats: bool = False,
    filter_stats: bool = False,
) -> list[str]:
    """
    List the output column names for the selected optional groups.

    The thirteen core columns always come first; the optional groups are
    appended in the fixed order SAM, quality stats, filter stats.

    Examples
    --------
    >>> build_pairs_columns()
    ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type', 'tag1', 'tag2', 'ft', 'rt', 'status']
    >>> build_pairs_columns(sam_output=True)
    ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type', 'tag1', 'tag2', 'ft', 'rt', 'status', 'sam1', 'sam2']
    >>> build_pairs_columns(sam_output=True, qual_stats=True, filter_stats=True)
    ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type', 'tag1', 'tag2', 'ft', 'rt', 'status', 'sam1', 'sam2', 'len1', 'len2', 'mapq1', 'mapq2', 'hard_merged', 'terminal_aliased']
    """
    columns = [
        "readID",
        "chrom1",
        "pos1",
        "chrom2",
        "pos2",
        "strand1",
        "strand2",
        "pair_type",
        "tag1",
        "tag2",
        "ft",
        "rt",
        "status",
    ]
    optional_groups = (
        (sam_output, ["sam1", "sam2"]),
        (qual_stats, ["len1", "len2", "mapq1", "mapq2"]),
        (filter_stats, ["hard_merged", "terminal_aliased"]),
    )
    for enabled, names in optional_groups:
        if enabled:
            columns += names
    return columns
527
+
528
+
529
def serialize_pair_line(
    read_id: str,
    a: list,
    b: list,
    ft: int,
    rt: int,
    status: str,
    sam_output: bool = False,
    qual_stats: bool = False,
    filter_stats: bool = False,
    terminal_aliased: bool = False,
) -> str:
    """
    Render one pair as a tab-separated line (no trailing newline).

    The core fields are always written; the optional groups follow in the
    same order as in `build_pairs_columns`. The pair type is always "UU".
    A missing SAM line is written as ".".

    Examples
    --------
    >>> a = ['chr1', 10, 20, '+', 30, 'F1', 'F', 2, 'SAM_A']
    >>> b = ['chr2', 40, 30, '-', 25, 'R1', 'R', 1, 'SAM_B']
    >>> serialize_pair_line("read1", a, b, 2, 1, "resolved_from_multiplex")
    'read1\\tchr1\\t10\\tchr2\\t40\\t+\\t-\\tUU\\tF1\\tR1\\t2\\t1\\tresolved_from_multiplex'
    >>> serialize_pair_line("read1", a, b, 2, 1, "resolved_from_multiplex", sam_output=True)
    'read1\\tchr1\\t10\\tchr2\\t40\\t+\\t-\\tUU\\tF1\\tR1\\t2\\t1\\tresolved_from_multiplex\\tSAM_A\\tSAM_B'
    """
    parts = [
        read_id,
        a[CHROM],
        str(a[POS5]),
        b[CHROM],
        str(b[POS5]),
        a[STRAND],
        b[STRAND],
        "UU",
        a[ANCHOR],
        b[ANCHOR],
        str(ft),
        str(rt),
        status,
    ]

    if sam_output:
        for node in (a, b):
            sam_text = node[SAM]
            parts.append("." if sam_text is None else sam_text)

    if qual_stats:
        parts += [str(slot_len(a)), str(slot_len(b))]
        parts += [str(a[MAPQ]), str(b[MAPQ])]

    if filter_stats:
        # Flag pairs in which at least one side results from a merge.
        any_merged = a[MERGED] > 1 or b[MERGED] > 1
        parts.append(str(1 if any_merged else 0))
        parts.append("1" if terminal_aliased else "0")

    return "\t".join(parts)
597
+
598
+
599
def build_pair_lines(
    base_name: str,
    forward: list[list | None],
    reverse: list[list | None],
    ft: int,
    rt: int,
    min_mapq: int,
    status: str,
    chrom_order: dict[str, int],
    flip: bool,
    sam_output: bool = False,
    qual_stats: bool = False,
    filter_stats: bool = False,
    terminal_aliased: bool = False,
) -> list[str]:
    """
    Emit one serialized line for every unordered pair of alive nodes.

    Alive forward nodes come first, then alive reverse nodes; pairs are
    produced in that index order. With `flip`, the two sides of each pair are
    ordered by (chromosome rank from `chrom_order`, pos5); chromosomes absent
    from `chrom_order` rank last.

    Examples
    --------
    >>> F = [['chr1', 1, 10, '+', 20, 'F1', 'F', 1, 'SAM_F1']]
    >>> R = [['chr2', 50, 41, '-', 20, 'R1', 'R', 1, 'SAM_R1']]
    >>> lines = build_pair_lines("q", F, R, 1, 1, 1, "simple", {"chr1": 0, "chr2": 1}, True, sam_output=True)
    >>> len(lines)
    1
    >>> lines[0].startswith('q\\tchr1\\t1\\tchr2\\t50')
    True
    >>> '\\tSAM_F1\\tSAM_R1' in lines[0]
    True
    """
    survivors = alive_slots(forward, min_mapq) + alive_slots(reverse, min_mapq)
    unknown_rank = 10**9
    rendered: list[str] = []

    for idx, first in enumerate(survivors):
        for second in survivors[idx + 1:]:
            left, right = first, second
            if flip:
                rank_first = (
                    chrom_order.get(first[CHROM], unknown_rank),
                    first[POS5],
                )
                rank_second = (
                    chrom_order.get(second[CHROM], unknown_rank),
                    second[POS5],
                )
                if not (rank_first <= rank_second):
                    left, right = second, first

            rendered.append(
                serialize_pair_line(
                    base_name,
                    left,
                    right,
                    ft,
                    rt,
                    status,
                    sam_output=sam_output,
                    qual_stats=qual_stats,
                    filter_stats=filter_stats,
                    terminal_aliased=terminal_aliased,
                )
            )

    return rendered
688
+
689
+
690
def parse_block_to_lines(
    block: list[
        tuple[
            tuple[str, str, str, int, int], pysam.AlignedSegment, pysam.AlignedSegment
        ]
    ],
    min_mapq: int,
    adjacent_gap_max: int,
    terminal_center_gap_max: int,
    terminal_overlap_tolerance: int,
    chrom_order: dict[str, int],
    flip: bool = True,
    sam_output: bool = False,
    qual_stats: bool = False,
    filter_stats: bool = False,
    strict_complete_cover: bool = False,
) -> tuple[list[str], str, dict]:
    """
    Turn the rows of one molecule into serialized pair lines.

    Steps: rebuild one slot per fragment tag (first mapped occurrence wins),
    collapse adjacent and terminal slots when the molecule starts with more
    than two fragments, then emit every combination of the surviving nodes.

    Returns
    -------
    (lines, status, core)
        `lines` is empty when fewer than two nodes survive, when the block is
        empty (status "empty"), or when `strict_complete_cover` rejects the
        block (status "incomplete_cover"). `core` is a summary dict.

    Notes
    -----
    This works both from:
      - all-pairs BAM inputs
      - cover-pairs BAM inputs

    because tags are first reconstructed into per-fragment slots, then all
    final alive combinations are emitted.

    >>> class FakeRead:
    ...     def __init__(self, qname, chrom, start0, span, mapq, is_rev):
    ...         self.query_name = qname
    ...         self.is_unmapped = False
    ...         self.reference_name = chrom
    ...         self.reference_start = start0
    ...         self.mapping_quality = mapq
    ...         self.is_reverse = is_rev
    ...         self.cigartuples = [(0, span)]
    ...     def to_string(self):
    ...         return f"{self.query_name}\\t0\\t{self.reference_name}\\t{self.reference_start+1}\\t{self.mapping_quality}\\t{self.cigartuples[0][1]}M\\t*\\t0\\t0\\tACGT\\tFFFF"
    ...
    >>> rF = FakeRead("q", "chr1", 99, 20, 30, False)   # pos5=100 pos3=119
    >>> rR = FakeRead("q", "chr1", 124, 15, 20, True)   # pos3=125 => gap center = 125-119-1 = 5
    >>> block = [(("q","F1","R1",1,1), rF, rR)]
    >>> lines, status, core = parse_block_to_lines(block, min_mapq=1, adjacent_gap_max=5, terminal_center_gap_max=10, terminal_overlap_tolerance=0, chrom_order={"chr1":0})
    >>> status
    'simple'
    >>> core["terminal_aliased"]
    False
    >>> len(lines)
    1
    >>> rF1 = FakeRead("q", "chr1", 99, 10, 30, False)   # 100-109
    >>> rF2 = FakeRead("q", "chr1", 111, 10, 30, False)  # 112-121 gap = 112-109-1=2
    >>> rR1 = FakeRead("q", "chr2", 199, 10, 30, True)
    >>> block = [
    ...     (("q","F1","R1",2,1), rF1, rR1),
    ...     (("q","F2","R1",2,1), rF2, rR1),
    ... ]
    >>> lines, status, core = parse_block_to_lines(block, min_mapq=1, adjacent_gap_max=5, terminal_center_gap_max=300, terminal_overlap_tolerance=1, chrom_order={"chr1":0,"chr2":1})
    >>> status
    'resolved_from_multiplex'
    >>> core["n_forward_merges"] >= 1
    True
    >>> len(lines) >= 1
    True
    """

    def summarize(
        *,
        name,
        n_fwd,
        n_rev,
        input_mode,
        covered,
        observed=0,
        valid=0,
        final=0,
        fwd_merges=0,
        rev_merges=0,
        aliased=False,
        gap=None,
    ) -> dict:
        # Single place where the summary dict (and its key order) is defined.
        return {
            "base_name": name,
            "ft": n_fwd,
            "rt": n_rev,
            "initial_count": n_fwd + n_rev,
            "observed_tag_count": observed,
            "valid_tag_count": valid,
            "final_count": final,
            "n_forward_merges": fwd_merges,
            "n_reverse_merges": rev_merges,
            "terminal_aliased": aliased,
            "terminal_gap": gap,
            "input_mode": input_mode,
            "complete_cover": covered,
        }

    if not block:
        empty_core = summarize(
            name=None, n_fwd=0, n_rev=0, input_mode="unknown", covered=False
        )
        return [], "empty", empty_core

    # Fragment totals are read from the first row of the block.
    base_name, _, _, ft, rt = block[0][0]
    initial_count = ft + rt

    observed_tag_count = len(observed_tags_in_block(block))
    complete_cover = observed_tag_count == initial_count

    if strict_complete_cover and not complete_cover:
        rejected_core = summarize(
            name=base_name,
            n_fwd=ft,
            n_rev=rt,
            input_mode="cover_or_partial",
            covered=False,
            observed=observed_tag_count,
        )
        return [], "incomplete_cover", rejected_core

    forward: list[list | None] = [None] * ft
    reverse: list[list | None] = [None] * rt

    def place(tag, position, read) -> None:
        # First-seen policy: a slot is filled only while it is still empty.
        # Tags starting with 'F' go to `forward`, every other tag to `reverse`.
        target = forward if tag[0] == "F" else reverse
        if target[position] is None:
            target[position] = read_to_minimal_alignment(
                read, tag, sam_output=sam_output
            )

    for (_, tag1, tag2, _, _), read1, read2 in block:
        # Both indices are resolved before any slot is touched.
        position1 = tag_index(tag1) - 1
        position2 = tag_index(tag2) - 1
        place(tag1, position1, read1)
        place(tag2, position2, read2)

    valid_tag_count = len(
        [slot for slot in forward if valid_slot(slot, min_mapq)]
    ) + len([slot for slot in reverse if valid_slot(slot, min_mapq)])

    n_forward_merges = 0
    n_reverse_merges = 0
    terminal_aliased = False
    terminal_gap = None

    # Collapsing only applies to molecules announced with more than 2 fragments.
    if initial_count > 2:
        n_forward_merges = collapse_adjacent_in_place(
            forward, adjacent_gap_max, min_mapq
        )
        n_reverse_merges = collapse_adjacent_in_place(
            reverse, adjacent_gap_max, min_mapq
        )
        terminal_aliased, terminal_gap = collapse_terminal_fr_in_place(
            forward,
            reverse,
            terminal_center_gap_max,
            terminal_overlap_tolerance,
            min_mapq,
        )

    final_count = len(alive_slots(forward, min_mapq)) + len(
        alive_slots(reverse, min_mapq)
    )
    status = multiplicity_status(initial_count, final_count)

    core = summarize(
        name=base_name,
        n_fwd=ft,
        n_rev=rt,
        input_mode=(
            "all_or_complete_cover" if complete_cover else "cover_or_partial"
        ),
        covered=complete_cover,
        observed=observed_tag_count,
        valid=valid_tag_count,
        final=final_count,
        fwd_merges=n_forward_merges,
        rev_merges=n_reverse_merges,
        aliased=terminal_aliased,
        gap=terminal_gap,
    )

    # A pair needs at least two surviving nodes.
    if final_count < 2:
        return [], status, core

    lines = build_pair_lines(
        base_name=base_name,
        forward=forward,
        reverse=reverse,
        ft=ft,
        rt=rt,
        min_mapq=min_mapq,
        status=status,
        chrom_order=chrom_order,
        flip=flip,
        sam_output=sam_output,
        qual_stats=qual_stats,
        filter_stats=filter_stats,
        terminal_aliased=terminal_aliased,
    )
    return lines, status, core
926
+
927
+
928
def _simple_pair_to_line(
    read1: pysam.AlignedSegment,
    read2: pysam.AlignedSegment,
    *,
    min_mapq: int,
    chrom_order: dict[str, int],
    flip: bool,
    sam_output: bool,
    qual_stats: bool,
    filter_stats: bool,
) -> str | None:
    """
    Serialize one classic read pair, or return None if either side is unusable.

    `read1` is tagged "F1" and `read2` "R1". A side is unusable when it is
    unmapped or below `min_mapq`. With `flip`, the two sides are ordered by
    (chromosome rank, pos5). The line uses `read1.query_name` as read ID,
    ft=rt=1 and status "simple".

    >>> class FakeRead:
    ...     def __init__(self, qname, chrom, start0, span, mapq, is_rev):
    ...         self.query_name = qname
    ...         self.is_unmapped = False
    ...         self.reference_name = chrom
    ...         self.reference_start = start0
    ...         self.mapping_quality = mapq
    ...         self.is_reverse = is_rev
    ...         self.cigartuples = [(0, span)]
    ...     def to_string(self):
    ...         return "x"
    >>> r1 = FakeRead("q", "chr2", 10, 10, 30, False)
    >>> r2 = FakeRead("q", "chr1", 10, 10, 30, False)
    >>> line = _simple_pair_to_line(r1, r2, min_mapq=1, chrom_order={"chr1":0,"chr2":1}, flip=True, sam_output=False, qual_stats=False, filter_stats=False)
    >>> line.split("\\t")[1]  # chrom1 after flip
    'chr1'
    """
    left = read_to_minimal_alignment(read1, "F1", sam_output=sam_output)
    right = read_to_minimal_alignment(read2, "R1", sam_output=sam_output)
    if not (valid_slot(left, min_mapq) and valid_slot(right, min_mapq)):
        return None

    # No merges and no terminal collapse here: the status is always "simple".
    if flip:
        unknown_rank = 10**9
        rank_left = (chrom_order.get(left[CHROM], unknown_rank), left[POS5])
        rank_right = (chrom_order.get(right[CHROM], unknown_rank), right[POS5])
        if rank_left > rank_right:
            left, right = right, left

    return serialize_pair_line(
        read_id=read1.query_name,
        a=left,
        b=right,
        ft=1,
        rt=1,
        status="simple",
        sam_output=sam_output,
        qual_stats=qual_stats,
        filter_stats=filter_stats,
        terminal_aliased=False,
    )
981
+
982
+
983
+ def parse_to_pairs(
984
+ bam_for_path: str,
985
+ bam_rev_path: str | None = None,
986
+ mode: Literal["simple", "split"] = "split",
987
+ out_pairs: str | None = None,
988
+ out_duplex: str | None = None,
989
+ out_true_multiplex_pairs: str | None = None,
990
+ min_mapq: int = 1,
991
+ adjacent_gap_max: int = 5,
992
+ terminal_center_gap_max: int = 300,
993
+ terminal_overlap_tolerance: int = 1,
994
+ assembly: str | None = None,
995
+ bam_threads: int = 1,
996
+ out_threads: int = 1,
997
+ flip: bool = True,
998
+ write_batch_size: int = 50000,
999
+ sam_output: bool = False,
1000
+ qual_stats: bool = False,
1001
+ filter_stats: bool = False,
1002
+ strict_complete_cover: bool = False,
1003
+ single_bam: bool = False,
1004
+ version: str = "0.0.0",
1005
+ ) -> dict[str, int]:
1006
+ """
1007
+ Parse remapped split BAM input into `.pairs` outputs.
1008
+
1009
+ Supports:
1010
+ - two synchronized BAM files (`bam_for_path`, `bam_rev_path`)
1011
+ - one interleaved BAM file (`single_bam=True`)
1012
+
1013
+ Notes
1014
+ -----
1015
+ - first-seen policy per tag
1016
+ - no heavy stats
1017
+ - direct line serialization
1018
+ - buffered writes
1019
+ """
1020
+ if out_pairs is None and out_duplex is None and out_true_multiplex_pairs is None:
1021
+ raise ValueError(
1022
+ "Provide at least one output: out_pairs or out_duplex or out_true_multiplex_pairs"
1023
+ )
1024
+
1025
+ if single_bam:
1026
+ header_for_dict = get_bam_header_single(bam_for_path)
1027
+ else:
1028
+ if bam_rev_path is None:
1029
+ raise ValueError("BAM Reverse File is required unless single_bam=True")
1030
+ header_for_dict, _ = get_bam_headers(bam_for_path, bam_rev_path)
1031
+
1032
+ chromsizes = chromsizes_from_header(header_for_dict)
1033
+ chrom_order = {chrom: i for i, (chrom, _) in enumerate(chromsizes)}
1034
+
1035
+ columns = build_pairs_columns(
1036
+ sam_output=sam_output,
1037
+ qual_stats=qual_stats,
1038
+ filter_stats=filter_stats,
1039
+ )
1040
+
1041
+ header_lines = make_pairs_header(
1042
+ chromsizes=chromsizes,
1043
+ columns=columns,
1044
+ assembly=assembly,
1045
+ shape="upper triangle" if flip else "whole matrix",
1046
+ sorted_by="none",
1047
+ program_id="splitparse", # optionnel mais conseillé
1048
+ program_version=version,
1049
+ )
1050
+
1051
+ all_handle = open_text_output(out_pairs, nproc=out_threads) if out_pairs else None
1052
+ resolved_handle = (
1053
+ open_text_output(out_duplex, nproc=out_threads) if out_duplex else None
1054
+ )
1055
+ multiplex_handle = (
1056
+ open_text_output(out_true_multiplex_pairs, nproc=out_threads)
1057
+ if out_true_multiplex_pairs
1058
+ else None
1059
+ )
1060
+
1061
+ counts: dict[str, int] = defaultdict(int)
1062
+ all_buffer: list[str] = []
1063
+ resolved_buffer: list[str] = []
1064
+ multiplex_buffer: list[str] = []
1065
+
1066
+ def flush_buffers() -> None:
1067
+ if all_handle is not None and all_buffer:
1068
+ all_handle.write("".join(all_buffer))
1069
+ all_buffer.clear()
1070
+ if resolved_handle is not None and resolved_buffer:
1071
+ resolved_handle.write("".join(resolved_buffer))
1072
+ resolved_buffer.clear()
1073
+ if multiplex_handle is not None and multiplex_buffer:
1074
+ multiplex_handle.write("".join(multiplex_buffer))
1075
+ multiplex_buffer.clear()
1076
+
1077
+ try:
1078
+ if all_handle is not None:
1079
+ write_pairs_header(all_handle, header_lines)
1080
+ if resolved_handle is not None:
1081
+ write_pairs_header(resolved_handle, header_lines)
1082
+ if multiplex_handle is not None:
1083
+ write_pairs_header(multiplex_handle, header_lines)
1084
+
1085
+ current_base: str | None = None
1086
+ current_block: list[
1087
+ tuple[
1088
+ tuple[str, str, str, int, int],
1089
+ pysam.AlignedSegment,
1090
+ pysam.AlignedSegment,
1091
+ ]
1092
+ ] = []
1093
+
1094
+ def flush_block() -> None:
1095
+ nonlocal current_block
1096
+ if not current_block:
1097
+ return
1098
+
1099
+ lines, status, core = parse_block_to_lines(
1100
+ block=current_block,
1101
+ min_mapq=min_mapq,
1102
+ adjacent_gap_max=adjacent_gap_max,
1103
+ terminal_center_gap_max=terminal_center_gap_max,
1104
+ terminal_overlap_tolerance=terminal_overlap_tolerance,
1105
+ chrom_order=chrom_order,
1106
+ flip=flip,
1107
+ sam_output=sam_output,
1108
+ qual_stats=qual_stats,
1109
+ filter_stats=filter_stats,
1110
+ strict_complete_cover=strict_complete_cover,
1111
+ )
1112
+
1113
+ counts["molecules"] += 1
1114
+ counts[f"status_{status}"] += 1
1115
+ counts["pairs_total"] += len(lines)
1116
+
1117
+ if core["complete_cover"]:
1118
+ counts["complete_cover_blocks"] += 1
1119
+ else:
1120
+ counts["incomplete_cover_blocks"] += 1
1121
+
1122
+ counts["observed_tags_total"] += core["observed_tag_count"]
1123
+ counts["valid_tags_total"] += core["valid_tag_count"]
1124
+ counts["final_nodes_total"] += core["final_count"]
1125
+
1126
+ if all_handle is not None:
1127
+ for line in lines:
1128
+ all_buffer.append(line + "\n")
1129
+
1130
+ if status == "resolved_from_multiplex" and resolved_handle is not None:
1131
+ for line in lines:
1132
+ resolved_buffer.append(line + "\n")
1133
+
1134
+ if status == "true_multiplex" and multiplex_handle is not None:
1135
+ for line in lines:
1136
+ multiplex_buffer.append(line + "\n")
1137
+
1138
+ if (
1139
+ len(all_buffer) >= write_batch_size
1140
+ or len(resolved_buffer) >= write_batch_size
1141
+ or len(multiplex_buffer) >= write_batch_size
1142
+ ):
1143
+ flush_buffers()
1144
+
1145
+ pair_iter = (
1146
+ iter_bam_pairs_single(
1147
+ bam_for_path,
1148
+ bam_threads=bam_threads,
1149
+ )
1150
+ if single_bam
1151
+ else iter_bam_pairs(
1152
+ bam_for_path,
1153
+ bam_rev_path,
1154
+ bam_threads=bam_threads,
1155
+ )
1156
+ )
1157
+
1158
+ if mode == "simple":
1159
+ if out_pairs is None:
1160
+ raise ValueError("--simple requires out_pairs")
1161
+
1162
+ # boucle simple: 1 paire -> 0/1 ligne
1163
+ for read1, read2 in pair_iter:
1164
+ if read2.query_name != read1.query_name:
1165
+ raise ValueError("Unsynchronized pair: query_name mismatch")
1166
+
1167
+ line = _simple_pair_to_line(
1168
+ read1,
1169
+ read2,
1170
+ min_mapq=min_mapq,
1171
+ chrom_order=chrom_order,
1172
+ flip=flip,
1173
+ sam_output=sam_output,
1174
+ qual_stats=qual_stats,
1175
+ filter_stats=filter_stats,
1176
+ )
1177
+ counts["pairs_seen"] += 1
1178
+ if line is None:
1179
+ counts["pairs_dropped"] += 1
1180
+ continue
1181
+
1182
+ all_buffer.append(line + "\n")
1183
+ counts["pairs_written"] += 1
1184
+ if len(all_buffer) >= write_batch_size:
1185
+ flush_buffers()
1186
+
1187
+ flush_buffers()
1188
+ return dict(counts)
1189
+
1190
+ else:
1191
+ for read1, read2 in pair_iter:
1192
+ if read2.query_name != read1.query_name:
1193
+ raise ValueError("Unsynchronized pair: query_name mismatch")
1194
+
1195
+ info = parse_qname_any(read1.query_name)
1196
+ base_name = info[0]
1197
+
1198
+ if current_base is None:
1199
+ current_base = base_name
1200
+ elif base_name != current_base:
1201
+ flush_block()
1202
+ current_block = []
1203
+ current_base = base_name
1204
+
1205
+ current_block.append((info, read1, read2))
1206
+
1207
+ flush_block()
1208
+ flush_buffers()
1209
+
1210
+ finally:
1211
+ if all_handle is not None:
1212
+ all_handle.close()
1213
+ if resolved_handle is not None:
1214
+ resolved_handle.close()
1215
+ if multiplex_handle is not None:
1216
+ multiplex_handle.close()
1217
+
1218
+ return dict(counts)