split3c 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,849 @@
1
+ import logging
2
+ import os
3
+ import signal
4
+ import sys
5
+ import time
6
+ from multiprocessing import Queue
7
+
8
+ from .auxiliary import check_data, handle_write_cmd, partitionning
9
+ from .bam import (
10
+ get_bam_header_single,
11
+ get_bam_headers,
12
+ read_bam_interleaved,
13
+ read_bam_pair,
14
+ write_bam_interleaved_from_sam,
15
+ write_bam_pair_from_sam,
16
+ )
17
+ from .fastq import write_fastq_pair
18
+ from .processmanager import ProcessManager
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
def build_cigar_tuple(cigar):
    """
    Split a CIGAR string into parallel lists of operations and lengths.

    Only ``<digits><op>`` tokens whose op is one of ``MIDNSHP=X`` are
    recognised; any other text in the string is skipped.

    Returns
    -------
    list
        ``[ops, lengths]`` where ``ops`` is a list of one-letter codes and
        ``lengths`` the matching list of ints.

    Examples
    --------
    >>> build_cigar_tuple("10M1I5M2D3S")
    [['M', 'I', 'M', 'D', 'S'], [10, 1, 5, 2, 3]]
    >>> build_cigar_tuple("5S95M")
    [['S', 'M'], [5, 95]]
    >>> build_cigar_tuple("100M")
    [['M'], [100]]
    >>> build_cigar_tuple("")
    [[], []]
    >>> build_cigar_tuple("5H10M5H")
    [['H', 'M', 'H'], [5, 10, 5]]
    >>> build_cigar_tuple("10MXXX")  # non-CIGAR ignored
    [['M'], [10]]
    """
    import re

    ops = []
    lengths = []
    # findall yields one (digits, op) pair per recognised token, in order.
    for count, op in re.findall(r"(\d+)([MIDNSHP=X])", cigar):
        ops.append(op)
        lengths.append(int(count))
    return [ops, lengths]
52
+
53
+
54
def write_read(name, sequence, quality, start, stop):
    """
    Format the ``[start:stop]`` slice of a read as a four-line FastQ record.

    The same slice is applied to ``sequence`` and ``quality``; ``name`` is
    written unchanged as the header line.

    Examples
    --------
    >>> write_read("@r1", "ACGT", "IIII", 1, 3)
    '@r1\\nCG\\n+\\nII\\n'
    >>> write_read("@r1", "ACGT", "IIII", 0, 4)
    '@r1\\nACGT\\n+\\nIIII\\n'
    """
    bases = sequence[start:stop]
    scores = quality[start:stop]
    # The trailing empty item produces the final newline of the record.
    return "\n".join((name, bases, "+", scores, ""))
75
+
76
+
77
def count_len(cigar_ops, cigar_lens):
    """
    Walk a CIGAR and track how many read bases each operation consumes.

    Operations M, I, S, = and X advance the read position; every other
    operation (D, N, H, P) leaves it unchanged.

    Parameters
    ----------
    cigar_ops : list of str
        CIGAR operations, e.g. ['S', 'M', 'S'].
    cigar_lens : list of int
        Corresponding lengths, e.g. [1, 2, 1].

    Returns
    -------
    qpos_before : list of int
        Read position at the start of each CIGAR operation.
    qpos : int
        Total number of read bases consumed.

    Examples
    --------
    >>> qpos_before, qpos = count_len(['M', 'I', 'D', 'S'], [5, 2, 3, 4])
    >>> qpos_before
    [0, 5, 7, 7]
    >>> qpos
    11
    >>> count_len(['N', 'H', 'P'], [10, 5, 2])
    ([0, 0, 0], 0)
    """
    read_consuming = ("M", "I", "S", "=", "X")
    starts = []
    cursor = 0
    for op, length in zip(cigar_ops, cigar_lens):
        starts.append(cursor)
        if op not in read_consuming:
            continue
        cursor += length
    return starts, cursor
115
+
116
+
117
def process_cigard(name, sequence, quality, cigar, seed_size, len_add):
    """
    Split a read into FastQ fragments at its soft-clip boundaries.

    With no soft clip the whole read is returned as a single fragment.
    With one or two soft clips the read is cut at each clip boundary and
    every piece is extended by ``len_add`` bases across the cut.

    Parameters
    ----------
    name : str
        Header line written on every fragment (e.g. '@read1').
    sequence : str
        Read bases.
    quality : str
        Per-base qualities, sliced with the same coordinates as ``sequence``.
    cigar : str or None
        CIGAR string; ``None``, ``""`` and ``"*"`` yield no fragment.
    seed_size : int
        Minimum fragment length; shorter pieces are dropped (0 disables
        the filter).
    len_add : int
        Number of bases added to each piece on the other side of a cut.

    Returns
    -------
    tuple
        ``(name, fragments)`` where ``fragments`` is a list of FastQ
        records. If every piece is filtered out, the full read is returned
        instead, provided it passes ``seed_size``; otherwise the list is
        empty.

    Raises
    ------
    ValueError
        If the CIGAR contains more than two soft-clipped segments.

    Examples
    --------
    No soft clip:
    >>> name, seq, qual, cig = "@r", "ACGT", "IIII", "4M"
    >>> n, frags = process_cigard(name, seq, qual, cig, seed_size=0, len_add=0)
    >>> n
    '@r'
    >>> frags == ['@r\\nACGT\\n+\\nIIII\\n']
    True

    One leading soft clip (2S2M, len_add=1):
    >>> name, seq, qual, cig = "@r", "ACGT", "?@AB", "2S2M"
    >>> n, frags = process_cigard(name, seq, qual, cig, seed_size=0, len_add=1)
    >>> len(frags)
    2
    >>> frags
    ['@r\\nACG\\n+\\n?@A\\n', '@r\\nCGT\\n+\\n@AB\\n']
    >>> all(f.startswith('@r\\n') for f in frags)
    True

    Leading soft clip preceded by a hard clip (5H2S4M):
    >>> process_cigard("@r", "ACGTAC", "IIIIII", "5H2S4M", seed_size=0, len_add=0)[1]
    ['@r\\nAC\\n+\\nII\\n', '@r\\nGTAC\\n+\\nIIII\\n']

    Two soft clips (1S2M1S), seed_size=1 -> 3 fragments:
    >>> name, seq, qual, cig = "@r", "ACGT", "????", "1S2M1S"
    >>> n, frags = process_cigard(name, seq, qual, cig, seed_size=1, len_add=0)
    >>> len(frags)
    3

    Two soft clips (1S2M1S), seed_size=2 -> only the central piece is kept:
    >>> n, frags = process_cigard(name, seq, qual, cig, seed_size=2, len_add=0)
    >>> frags
    ['@r\\nCG\\n+\\n??\\n']

    Two soft clips (1S2M1S), seed_size=3 -> fallback to the full read:
    >>> n, frags = process_cigard(name, seq, qual, cig, seed_size=3, len_add=0)
    >>> frags
    ['@r\\nACGT\\n+\\n????\\n']
    """
    if cigar in (None, "", "*"):
        return name, []

    # build_cigar_tuple returns [ops_list, lens_list].
    cigar_ops, cigar_lens = build_cigar_tuple(cigar)
    # Read coordinate at which each CIGAR operation starts.
    qpos_before, _ = count_len(cigar_ops, cigar_lens)
    read_len = len(sequence)

    soft_clip_indices = [i for i, op in enumerate(cigar_ops) if op == "S"]

    # Guard: more than two soft clips is not a layout this splitter handles.
    if len(soft_clip_indices) > 2:
        raise ValueError(
            f"More than two soft clipped segments found in CIGAR string ({cigar}) for read {name}. problem with mapping ?"
        )

    # Case 0: no soft clipping, the read is emitted whole and unfiltered.
    if not soft_clip_indices:
        return name, [write_read(name, sequence, quality, 0, read_len)]

    def make_frag(start, stop):
        """Clamp [start, stop) to the read; return None if empty or too short."""
        s = max(0, min(read_len, start))
        e = max(0, min(read_len, stop))
        if e <= s:
            return None
        if seed_size and (e - s) < seed_size:
            return None
        return write_read(name, sequence, quality, s, e)

    # Cut positions, in read coordinates.
    if len(soft_clip_indices) == 1:
        i = soft_clip_indices[0]
        clip_start = qpos_before[i]
        if clip_start == 0:
            # Leading clip: nothing before it consumes read bases (it may
            # still be preceded by hard clips), so cut where the clip ends.
            cuts = [clip_start + cigar_lens[i]]
        else:
            # Trailing clip: cut where the clip starts.
            cuts = [clip_start]
    else:
        i0, i1 = soft_clip_indices
        # End of the first clip, start of the second one.
        cuts = [qpos_before[i0] + cigar_lens[i0], qpos_before[i1]]

    # Piece k spans from the previous cut (minus len_add) to the next cut
    # (plus len_add); the outermost bounds are the read ends.
    starts = [0] + [c - len_add for c in cuts]
    stops = [c + len_add for c in cuts] + [read_len]
    frags = [f for f in map(make_frag, starts, stops) if f is not None]

    # Everything was filtered: fall back to the whole read if it is long enough.
    if not frags and (not seed_size or read_len >= seed_size):
        return name, [write_read(name, sequence, quality, 0, read_len)]
    return name, frags
255
+
256
+
257
def read_name(base_name, tag_i, tag_j, tot_for, tot_rev, tags=None):
    """
    Build the header shared by the two mates of a fragment pair.

    Parameters
    ----------
    base_name : logical read name (e.g. '@READ')
    tag_i, tag_j : fragment identifiers, typically 'F1', 'R1', ... (1-based)
    tot_for, tot_rev : total number of forward / reverse fragments
    tags : 'no_tag' or 'nt' returns ``base_name`` alone; any other value
        (including None) appends the tag suffix.

    Returns
    -------
    str
        '<base_name>:[<tag_i>,<tag_j>:FT<tot_for>,RT<tot_rev>]', or just
        '<base_name>' in no-tag mode.

    Examples
    --------
    >>> read_name("@READ", "F1", "R1", 3, 1, "o")
    '@READ:[F1,R1:FT3,RT1]'
    >>> read_name("@READ", "F1", "F2", 3, 1, "o")
    '@READ:[F1,F2:FT3,RT1]'
    >>> read_name("@READ", "R1", "R2", 0, 2, "o")
    '@READ:[R1,R2:FT0,RT2]'
    >>> read_name("@READ", "F1", "F2", 3, 1)
    '@READ:[F1,F2:FT3,RT1]'
    >>> read_name("@READ", "X", "Y", 3, 1, "nt")
    '@READ'
    >>> read_name("@READ", "X", "Y", 3, 1, "no_tag")
    '@READ'
    """
    header = f"{base_name}"
    if tags not in ("no_tag", "nt"):
        header += f":[{tag_i},{tag_j}:FT{tot_for},RT{tot_rev}]"
    return header
287
+
288
+
289
+ def _fraglist_to_entries(frag_list, origin):
290
+ """
291
+ Transforme une liste de FastQ en tuples (origin, idx, seq, qual).
292
+
293
+ Examples
294
+ --------
295
+ >>> _fraglist_to_entries(["@x\\nAC\\n+\\n!!\\n"], "F")
296
+ [('F', 0, 'AC', '!!')]
297
+ """
298
+ entries = []
299
+ for idx, frag in enumerate(frag_list):
300
+ lines = frag.strip().split("\n")
301
+ if len(lines) != 4 or lines[2] != "+":
302
+ raise ValueError("Fragment FastQ invalide dans process_cigard.")
303
+ seq = lines[1]
304
+ qual = lines[3]
305
+ entries.append((origin, idx, seq, qual))
306
+ return entries
307
+
308
+
309
def _emit_pair(base_name, e1, e2, tot_for, tot_rev, tags=None):
    """
    Render two fragment entries as a pair of FastQ records sharing one header.

    Each entry is ``(origin, idx, seq, qual)``; the 0-based ``idx`` is turned
    into a 1-based tag such as 'F1' or 'R2' before being passed to
    ``read_name``.

    Returns
    -------
    tuple
        ``(record_for_e1, record_for_e2)``.
    """
    origin_a, index_a, seq_a, qual_a = e1
    origin_b, index_b, seq_b, qual_b = e2

    header = read_name(
        base_name,
        f"{origin_a}{index_a + 1}",
        f"{origin_b}{index_b + 1}",
        tot_for,
        tot_rev,
        tags=tags,
    )

    mates = ((seq_a, qual_a), (seq_b, qual_b))
    return tuple(f"{header}\n{seq}\n+\n{qual}\n" for seq, qual in mates)
325
+
326
+
327
def gen_read_pairs_from_frags_cover(base_name, frags_f, frags_r, tags=None):
    """
    Build a minimal (or near-minimal) set of pairs in which every fragment
    appears at least once.

    Strategy:
      1. pair forward with reverse fragments for as long as both sides last;
      2. pair the leftovers of the longer side with each other;
      3. if one fragment is still alone, attach it to an already used anchor
         (the first fragment of the opposite side, else the first leftover).

    Returns
    -------
    list
        ``[forward_fastq, reverse_fastq]`` as two concatenated strings;
        ``["", ""]`` when fewer than two fragments are available.

    Examples
    --------
    1F / 1R
    >>> frags_f = ["@x\\nAC\\n+\\n??\\n"]
    >>> frags_r = ["@x\\nTG\\n+\\n!!\\n"]
    >>> F, R = gen_read_pairs_from_frags_cover("@READ", frags_f, frags_r)
    >>> F
    '@READ:[F1,R1:FT1,RT1]\\nAC\\n+\\n??\\n'
    >>> R
    '@READ:[F1,R1:FT1,RT1]\\nTG\\n+\\n!!\\n'

    2F / 1R -> minimum = ceil(3/2) = 2
    >>> frags_f = ["@x\\nA\\n+\\n=\\n", "@x\\nBC\\n+\\n==\\n"]
    >>> frags_r = ["@x\\nW\\n+\\n!\\n"]
    >>> F, R = gen_read_pairs_from_frags_cover("@READ", frags_f, frags_r, tags="o")
    >>> F.count('@READ:['), R.count('@READ:[')
    (2, 2)
    >>> '@READ:[F1,R1:FT2,RT1]' in F
    True

    2F / 2R -> minimum = 2
    >>> frags_f = ["@x\\nA\\n+\\n!\\n", "@x\\nB\\n+\\n!\\n"]
    >>> frags_r = ["@x\\nC\\n+\\n!\\n", "@x\\nD\\n+\\n!\\n"]
    >>> F, R = gen_read_pairs_from_frags_cover("@READ", frags_f, frags_r)
    >>> F.count('@READ:['), R.count('@READ:[')
    (2, 2)

    4F / 1R -> minimum = ceil(5/2) = 3
    >>> frags_f = ["@x\\nA\\n+\\n!\\n", "@x\\nB\\n+\\n!\\n", "@x\\nC\\n+\\n!\\n", "@x\\nD\\n+\\n!\\n"]
    >>> frags_r = ["@x\\nZ\\n+\\n!\\n"]
    >>> F, R = gen_read_pairs_from_frags_cover("@READ", frags_f, frags_r)
    >>> F.count('@READ:['), R.count('@READ:[')
    (3, 3)
    """
    forward = _fraglist_to_entries(frags_f, "F")
    reverse = _fraglist_to_entries(frags_r, "R")
    n_for, n_rev = len(forward), len(reverse)

    if n_for + n_rev < 2:
        return ["", ""]

    # 1) Cross pairs F-R; zip stops at the shorter side.
    pairs = list(zip(forward, reverse))
    n_cross = len(pairs)

    # 2) Leftovers all come from a single side.
    if n_for > n_rev:
        remaining, other_side = forward[n_cross:], reverse
    else:
        remaining, other_side = reverse[n_cross:], forward
    anchor = other_side[0] if other_side else None

    # Pair leftovers two by two (0-1, 2-3, ...).
    pairs.extend(zip(remaining[0::2], remaining[1::2]))

    # 3) An odd count leaves the last leftover alone: hook it to an anchor.
    if len(remaining) % 2 == 1:
        lone = remaining[-1]
        if anchor is not None:
            pairs.append((lone, anchor))
        elif len(remaining) >= 2:
            pairs.append((lone, remaining[0]))
        else:
            # Degenerate case: a single fragment in total.
            return ["", ""]

    rendered = [
        _emit_pair(base_name, first, second, n_for, n_rev, tags=tags)
        for first, second in pairs
    ]
    forward_text = "".join(fq_first for fq_first, _ in rendered)
    reverse_text = "".join(fq_second for _, fq_second in rendered)
    return [forward_text, reverse_text]
424
+
425
+
426
def gen_read_pairs_from_frags_all(base_name, frags_f, frags_r, tags=None):
    """
    Build forward/reverse FastQ strings holding every unique pair of fragments.

    ``frags_f`` and ``frags_r`` are lists of complete FastQ records
    ('@name\\nSEQ\\n+\\nQUAL\\n'). Forward fragments are listed before
    reverse ones and every combination (i < j) of that combined list is
    emitted, the first member going to the forward output and the second
    to the reverse output.

    Parsing and record formatting are delegated to the module helpers
    ``_fraglist_to_entries`` and ``_emit_pair`` so that this mode and the
    "cover" mode cannot drift apart.

    Returns
    -------
    list
        ``[forward_fastq, reverse_fastq]`` as two concatenated strings
        (both empty when fewer than two fragments are given).

    Raises
    ------
    ValueError
        If a fragment is not a well-formed four-line FastQ record.

    Examples
    --------
    Simple case: 1 forward fragment, 1 reverse fragment (a single combination).
    >>> frags_f = ["@x\\nAC\\n+\\n??\\n"]
    >>> frags_r = ["@x\\nTG\\n+\\n!!\\n"]
    >>> F, R = gen_read_pairs_from_frags_all("@READ", frags_f, frags_r)
    >>> F
    '@READ:[F1,R1:FT1,RT1]\\nAC\\n+\\n??\\n'
    >>> R
    '@READ:[F1,R1:FT1,RT1]\\nTG\\n+\\n!!\\n'

    Same case, without tag (nt).
    >>> F, R = gen_read_pairs_from_frags_all("@READ", frags_f, frags_r, tags="nt")
    >>> F
    '@READ\\nAC\\n+\\n??\\n'
    >>> R
    '@READ\\nTG\\n+\\n!!\\n'

    Combinatorial case:
      - forward: 2 fragments (F0, F1)
      - reverse: 1 fragment (R0)
      -> combinations: (F0,F1), (F0,R0), (F1,R0)
    >>> frags_f = ["@x\\nA\\n+\\n=\\n", "@x\\nBCD\\n+\\n===\\n"]
    >>> frags_r = ["@x\\nWXYZ\\n+\\n>>>>\\n"]
    >>> F, R = gen_read_pairs_from_frags_all("@READ", frags_f, frags_r, tags="o")
    >>> F.count('@READ:['), R.count('@READ:[')
    (3, 3)
    >>> '@READ:[F1,F2:FT2,RT1]' in F
    True
    >>> '@READ:[F1,R1:FT2,RT1]' in F
    True
    >>> '@READ:[F2,R1:FT2,RT1]' in F
    True
    >>> '@@' in F or '@@' in R
    False
    >>> F
    '@READ:[F1,F2:FT2,RT1]\\nA\\n+\\n=\\n@READ:[F1,R1:FT2,RT1]\\nA\\n+\\n=\\n@READ:[F2,R1:FT2,RT1]\\nBCD\\n+\\n===\\n'
    >>> R
    '@READ:[F1,F2:FT2,RT1]\\nBCD\\n+\\n===\\n@READ:[F1,R1:FT2,RT1]\\nWXYZ\\n+\\n>>>>\\n@READ:[F2,R1:FT2,RT1]\\nWXYZ\\n+\\n>>>>\\n'

    Symmetric combinatorial case (2 forward fragments, 2 reverse fragments).
    >>> frags_f = ["@x\\nA\\n+\\n!\\n", "@x\\nBC\\n+\\n!!\\n"]
    >>> frags_r = ["@x\\nD\\n+\\n#\\n", "@x\\nEF\\n+\\n##\\n"]
    >>> F, R = gen_read_pairs_from_frags_all("@READ", frags_f, frags_r)
    >>> # 4 fragments total => C(4,2) = 6 pairs
    >>> F.count('@READ:['), R.count('@READ:[')
    (6, 6)
    """
    from itertools import combinations

    # Annotate each fragment with its origin (F/R) and its index on that side.
    for_entries = _fraglist_to_entries(frags_f, "F")
    rev_entries = _fraglist_to_entries(frags_r, "R")
    tot_for = len(for_entries)
    tot_rev = len(rev_entries)

    fastq_forward = []
    fastq_reverse = []

    # All unique combinations (i < j) over forward-then-reverse entries.
    for first, second in combinations(for_entries + rev_entries, 2):
        fq_first, fq_second = _emit_pair(
            base_name, first, second, tot_for, tot_rev, tags=tags
        )
        fastq_forward.append(fq_first)
        fastq_reverse.append(fq_second)

    return ["".join(fastq_forward), "".join(fastq_reverse)]
517
+
518
+
519
def gen_read_pairs_from_frags(
    base_name, frags_f, frags_r, tags=None, pairing_mode="all"
):
    """
    Dispatch to one of the pair-generation strategies.

    ``pairing_mode`` selects ``gen_read_pairs_from_frags_all`` ("all") or
    ``gen_read_pairs_from_frags_cover`` ("cover"); the other arguments are
    forwarded unchanged and the strategy's result is returned as is.

    Raises
    ------
    ValueError
        If ``pairing_mode`` is neither "all" nor "cover".
    """
    if pairing_mode not in ("all", "cover"):
        raise ValueError(f"Unknown pairing_mode: {pairing_mode}")

    if pairing_mode == "all":
        build_pairs = gen_read_pairs_from_frags_all
    else:
        build_pairs = gen_read_pairs_from_frags_cover
    return build_pairs(base_name, frags_f, frags_r, tags=tags)
531
+
532
+
533
# TODO: think about how seq == "*" should be handled.
def sam_fields(sam_line: str):
    """
    Extract the fields needed for splitting from one SAM record.

    The line is split on tabs after removing trailing newlines; columns
    1 (QNAME), 6 (CIGAR), 10 (SEQ) and 11 (QUAL) are read. An '@' is
    prepended to the name when it does not already start with one.

    Returns
    -------
    tuple
        ``(name, seq, qual, cigar)``.

    Raises
    ------
    ValueError
        If the line has fewer than 11 tab-separated fields.
    """
    columns = sam_line.rstrip("\n").split("\t")
    if len(columns) < 11:
        raise ValueError(
            f"Malformed SAM line: expected >= 11 fields : \n {sam_line}\n\n"
        )

    qname, cigar, seq, qual = columns[0], columns[5], columns[9], columns[10]
    if not qname.startswith("@"):
        qname = f"@{qname}"
    return qname, seq, qual, cigar
550
+
551
+
552
def process_items(
    input_queue,
    output_queue,
    bam_queue,
    seed_size,
    len_add,
    tags=None,
    bam_batch_size=1000,
    fastq_batch_items=1250,
    pairing_mode="cover",
):
    """
    Worker loop: split read pairs on their soft clips and route the results.

    Batches are read from ``input_queue`` until a ``None`` sentinel arrives.
    Pairs that yield at least two fragments are turned into FastQ text and
    sent to ``output_queue``; every other pair that passes ``check_data`` is
    forwarded unchanged to ``bam_queue``.

    Parameters:
        input_queue (Queue): yields batches, each an iterable of
            ``(sam_f, sam_r)`` SAM line pairs, then ``None``.
        output_queue (Queue): receives ``(forward_fastq, reverse_fastq)``
            string tuples, then ``None``.
        bam_queue (Queue): receives lists of ``(sam_f, sam_r)`` pairs that
            were not split, then ``None``.
        seed_size (int): minimum size of a fragment to be kept.
        len_add (int): number of bases added across each soft-clip cut.
        tags: forwarded to the pair generator (header tagging mode).
        bam_batch_size (int): number of unsplit pairs buffered per message.
        fastq_batch_items (int): number of split pairs buffered per message.
        pairing_mode (str): "all" or "cover".

    On any exception the worker makes a best-effort attempt to post the
    ``None`` sentinel on both output queues, reports the error on stderr and
    exits the process with status 1.
    """

    def flush_bam(buf):
        if buf:
            # Send a snapshot, never the live buffer: the queues created by
            # cut() are multiprocessing queues, which serialise objects in a
            # background feeder thread, so clearing the same list right after
            # put() could ship an empty (or partial) batch.
            bam_queue.put(list(buf))
            buf.clear()

    def flush_fastq(f_chunks, r_chunks):
        if f_chunks:
            # The joined strings are new immutable objects, safe to enqueue.
            output_queue.put(("".join(f_chunks), "".join(r_chunks)))
            f_chunks.clear()
            r_chunks.clear()

    bam_buf = []
    fq_f_chunks = []
    fq_r_chunks = []

    def keep_unsplit(pair):
        """Buffer a pair for the BAM writer, flushing when the batch is full."""
        bam_buf.append(pair)
        if len(bam_buf) >= bam_batch_size:
            flush_bam(bam_buf)

    try:
        while True:
            batch = input_queue.get()
            if batch is None:
                # Flush what is left, then signal the end to each writer.
                flush_bam(bam_buf)
                flush_fastq(fq_f_chunks, fq_r_chunks)
                output_queue.put(None)
                bam_queue.put(None)
                break

            # batch = list[(sam_f, sam_r)]
            for sam_f, sam_r in batch:
                name_f, seq_f, qual_f, cig_f = sam_fields(sam_f)
                name_r, seq_r, qual_r, cig_r = sam_fields(sam_r)
                if name_f != name_r:
                    raise ValueError("File / Pairs unsynchronized")

                # Pairs rejected by check_data are dropped (written nowhere).
                # NOTE(review): check_data is defined in .auxiliary; confirm
                # that discarding is the intended handling.
                if not check_data(
                    (name_f, seq_f, qual_f, cig_f, name_r, seq_r, qual_r, cig_r)
                ):
                    continue

                # Fast path: without any soft clip on either mate there is
                # nothing to split (this also covers two "*" CIGARs).
                if "S" not in str(cig_f) and "S" not in str(cig_r):
                    keep_unsplit((sam_f, sam_r))
                    continue

                base_name, frags_f = process_cigard(
                    name=name_f,
                    sequence=seq_f,
                    quality=qual_f,
                    cigar=cig_f,
                    seed_size=seed_size,
                    len_add=len_add,
                )
                _, frags_r = process_cigard(
                    name=name_r,
                    sequence=seq_r,
                    quality=qual_r,
                    cigar=cig_r,
                    seed_size=seed_size,
                    len_add=len_add,
                )

                # One fragment per mate, or fewer than two fragments overall:
                # the pair is not splittable and goes to the BAM output.
                if (len(frags_f) == 1 and len(frags_r) == 1) or (
                    len(frags_f) + len(frags_r) < 2
                ):
                    keep_unsplit((sam_f, sam_r))
                    continue

                fastq_forward, fastq_reverse = gen_read_pairs_from_frags(
                    base_name=base_name,
                    frags_f=frags_f,
                    frags_r=frags_r,
                    tags=tags,
                    pairing_mode=pairing_mode,
                )

                # The generator produced nothing usable: keep the original pair.
                if not fastq_forward or not fastq_reverse:
                    keep_unsplit((sam_f, sam_r))
                    continue

                fq_f_chunks.append(fastq_forward)
                fq_r_chunks.append(fastq_reverse)
                if len(fq_f_chunks) >= fastq_batch_items:
                    flush_fastq(fq_f_chunks, fq_r_chunks)

        # NOTE(review): the previous version also flushed both buffers
        # unconditionally near the end of the loop. The sentinel branch above
        # already flushes everything before the only normal exit, so that
        # extra flush is not needed for correctness - confirm it was not
        # meant to bound latency per input batch.

    except Exception as e:
        # Best effort: unblock the writers before dying; a failure here must
        # not mask the original error.
        try:
            output_queue.put(None)
            bam_queue.put(None)
        except Exception:
            pass
        print(f"Error in process_items: {e}", file=sys.stderr)
        sys.exit(1)
687
+
688
+
689
def _unsplit_bam_path(fastq_path):
    """Return '<dir>/<stem>_unsplit.bam', stem being the file name up to its first dot."""
    directory, filename = os.path.split(str(fastq_path))
    # Only the file name is split: dots in the directory part ("./out",
    # "../x", "run.1/") must not truncate the path.
    stem = filename.split(".")[0] or filename
    return os.path.join(directory, stem + "_unsplit.bam")


def cut(args):
    """
    Orchestrate reading BAM input, splitting reads and writing FastQ/BAM output.

    A reader process feeds ``input_queue``; ``compute_processes`` workers run
    ``process_items``; one FastQ writer and one BAM writer drain the two
    output queues. The function then polls the process manager once per
    second until no worker is running.

    Parameters:
        args (argparse.Namespace): Namespace object containing command-line
            arguments. Attributes read here: ``bam_1``, ``bam_2``,
            ``single_bam``, ``output_forward``, ``output_reverse``,
            ``num_threads``, ``seed_size``, ``lenght_added``, ``tags`` and
            ``pairing_mode``.

    Returns:
        ``2`` when thread partitioning fails or an exception is raised while
        the workers run; ``None`` otherwise.

    Exits the interpreter with status 1 when an input BAM is missing, when
    headers cannot be read, or when the manager reports a failed process.
    """
    bam_for_file = args.bam_1
    bam_rev_file = args.bam_2
    single_bam = args.single_bam

    output_forward = args.output_forward
    output_reverse = args.output_reverse
    num_threads = args.num_threads
    seed_size = args.seed_size
    len_add = args.lenght_added  # attribute name as spelled on the namespace
    tags = args.tags
    pairing_mode = args.pairing_mode

    if single_bam:
        if not os.path.exists(bam_for_file):
            logger.error("Single BAM file does not exist.")
            sys.exit(1)
    elif not os.path.exists(bam_for_file) or not os.path.exists(bam_rev_file):
        logger.error("BAM file does not exist.")
        sys.exit(1)

    try:
        print("Extracting BAM headers...", flush=True)
        if single_bam:
            header_single = get_bam_header_single(bam_for_file)
        else:
            header_for, header_rev = get_bam_headers(bam_for_file, bam_rev_file)
    except Exception as e:
        print(f"Error reading BAM headers: {e}", file=sys.stderr)
        sys.exit(1)

    # Unsplit pairs are written next to the FastQ outputs.
    output_bam_1 = _unsplit_bam_path(output_forward)
    output_bam_2 = None if single_bam else _unsplit_bam_path(output_reverse)

    input_queue = Queue(maxsize=750)
    output_queue = Queue(maxsize=100)
    unsplittable_queue = Queue(maxsize=100)

    try:
        pigz_per_file, compute_processes, bam_threads = partitionning(
            num_threads,
            single_bam=single_bam,
        )
    except ValueError as e:
        print(f"ERROR: {e}", file=sys.stderr)
        return 2

    handle_write_cmd(
        bam_1=bam_for_file,
        bam_2=bam_rev_file if not single_bam else None,
        output_fq1=output_forward,
        output_fq2=output_reverse,
        output_bam1=output_bam_1,
        output_bam2=output_bam_2,
        args=args,
    )

    manager = ProcessManager()
    signal.signal(signal.SIGINT, manager.handle_signal)
    signal.signal(signal.SIGTERM, manager.handle_signal)

    try:
        # Reader
        if single_bam:
            manager.start_worker(
                target=read_bam_interleaved,
                args=(
                    bam_for_file,
                    input_queue,
                    compute_processes,
                    bam_threads,
                ),
            )
        else:
            manager.start_worker(
                target=read_bam_pair,
                args=(
                    bam_for_file,
                    bam_rev_file,
                    input_queue,
                    compute_processes,
                    bam_threads,
                ),
            )

        # Compute workers
        for _ in range(compute_processes):
            manager.start_worker(
                target=process_items,
                args=(
                    input_queue,
                    output_queue,
                    unsplittable_queue,
                    seed_size,
                    len_add,
                    tags,
                    1000,  # bam_batch_size
                    1250,  # fastq_batch_items
                    pairing_mode,
                ),
            )

        # FASTQ writer
        manager.start_worker(
            target=write_fastq_pair,
            args=(
                output_queue,
                output_forward,
                output_reverse,
                compute_processes,
                pigz_per_file,
            ),
        )

        # BAM writer
        if single_bam:
            manager.start_worker(
                target=write_bam_interleaved_from_sam,
                args=(
                    unsplittable_queue,
                    output_bam_1,
                    header_single,
                    compute_processes,
                    bam_threads,
                ),
            )
        else:
            manager.start_worker(
                target=write_bam_pair_from_sam,
                args=(
                    unsplittable_queue,
                    output_bam_1,
                    output_bam_2,
                    header_for,
                    header_rev,
                    compute_processes,
                    bam_threads,
                ),
            )

        # Poll until every worker is done; bail out as soon as the manager
        # reports a problem.
        while manager.running():
            if not manager.check_processes():
                sys.exit(1)
            time.sleep(1)

    except Exception as e:
        print(f"ERROR: {e}", file=sys.stderr)
        return 2
    finally:
        # Runs on success, on error and on sys.exit(): single shutdown point.
        manager.shutdown()