split3c 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- split3c/__init__.py +0 -0
- split3c/cli.py +336 -0
- split3c/nssite/__init__.py +0 -0
- split3c/nssite/auxiliary.py +190 -0
- split3c/nssite/bam.py +299 -0
- split3c/nssite/fastq.py +148 -0
- split3c/nssite/main.py +368 -0
- split3c/nssite/processmanager.py +51 -0
- split3c/nssite/split.py +849 -0
- split3c/resite/__init__.py +33 -0
- split3c/resite/frag.py +576 -0
- split3c/resite/header.py +91 -0
- split3c/resite/index.py +236 -0
- split3c/resite/main.py +506 -0
- split3c/resite/pretreatment.py +299 -0
- split3c/resite/read.py +91 -0
- split3c/resite/write_control.py +111 -0
- split3c/resolve/__init__.py +0 -0
- split3c/resolve/bam.py +129 -0
- split3c/resolve/io_utils.py +77 -0
- split3c/resolve/main.py +506 -0
- split3c/resolve/pairs.py +56 -0
- split3c/resolve/parse.py +1218 -0
- split3c-0.0.1.dist-info/METADATA +100 -0
- split3c-0.0.1.dist-info/RECORD +29 -0
- split3c-0.0.1.dist-info/WHEEL +5 -0
- split3c-0.0.1.dist-info/entry_points.txt +5 -0
- split3c-0.0.1.dist-info/licenses/LICENSE +235 -0
- split3c-0.0.1.dist-info/top_level.txt +1 -0
split3c/nssite/split.py
ADDED
|
@@ -0,0 +1,849 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import signal
|
|
4
|
+
import sys
|
|
5
|
+
import time
|
|
6
|
+
from multiprocessing import Queue
|
|
7
|
+
|
|
8
|
+
from .auxiliary import check_data, handle_write_cmd, partitionning
|
|
9
|
+
from .bam import (
|
|
10
|
+
get_bam_header_single,
|
|
11
|
+
get_bam_headers,
|
|
12
|
+
read_bam_interleaved,
|
|
13
|
+
read_bam_pair,
|
|
14
|
+
write_bam_interleaved_from_sam,
|
|
15
|
+
write_bam_pair_from_sam,
|
|
16
|
+
)
|
|
17
|
+
from .fastq import write_fastq_pair
|
|
18
|
+
from .processmanager import ProcessManager
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def build_cigar_tuple(cigar):
    """
    Split a CIGAR string into parallel lists of operations and lengths.

    Only ``<digits><op>`` groups whose op is one of ``MIDNSHP=X`` are kept;
    any other text in the string is skipped.

    Parameters
    ----------
    cigar : str
        CIGAR string, e.g. ``"5S95M"``.

    Returns
    -------
    list
        ``[ops, lens]`` where ``ops`` is a list of one-letter operations and
        ``lens`` the matching list of integer lengths.

    Examples
    --------
    >>> build_cigar_tuple("10M1I5M2D3S")
    [['M', 'I', 'M', 'D', 'S'], [10, 1, 5, 2, 3]]
    >>> build_cigar_tuple("")
    [[], []]
    >>> build_cigar_tuple("10MXXX")  # non-CIGAR text is ignored
    [['M'], [10]]
    """
    import re

    # Each hit is a (length_text, operation) tuple, in order of appearance.
    hits = re.findall(r"(\d+)([MIDNSHP=X])", cigar)
    operations = [op for _, op in hits]
    lengths = [int(length_text) for length_text, _ in hits]
    return [operations, lengths]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def write_read(name, sequence, quality, start, stop):
    """
    Format the slice ``[start:stop]`` of a read as a four-line FastQ record.

    Parameters
    ----------
    name : str
        Header line, written as-is (no ``@`` is added here).
    sequence, quality : str
        Full read bases and quality string; both are sliced identically.
    start, stop : int
        Slice bounds in read coordinates (Python slice semantics).

    Returns
    -------
    str
        ``name``, sliced sequence, ``+`` and sliced quality, each followed
        by a newline.

    Examples
    --------
    >>> write_read("@r1", "ACGT", "IIII", 1, 3)
    '@r1\\nCG\\n+\\nII\\n'
    >>> write_read("@r1", "ACGT", "IIII", 0, 4)
    '@r1\\nACGT\\n+\\nIIII\\n'
    """
    bases = sequence[start:stop]
    scores = quality[start:stop]
    # The trailing empty item yields the final newline after the quality line.
    return "\n".join([name, bases, "+", scores, ""])
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def count_len(cigar_ops, cigar_lens):
    """
    Walk a CIGAR and report where each operation starts on the read.

    Operations M, I, S, = and X advance the read position; every other
    operation (D, N, H, P, ...) leaves it unchanged.

    Parameters
    ----------
    cigar_ops : list of str
        CIGAR operations, e.g. ``['S', 'M', 'S']``.
    cigar_lens : list of int
        Matching lengths, e.g. ``[1, 2, 1]``.

    Returns
    -------
    qpos_before : list of int
        Read coordinate at the start of each operation.
    qpos : int
        Total number of read bases consumed by the CIGAR.

    Examples
    --------
    >>> count_len(['M', 'I', 'D', 'S'], [5, 2, 3, 4])
    ([0, 5, 7, 7], 11)
    >>> count_len(['N', 'H', 'P'], [10, 5, 2])
    ([0, 0, 0], 0)
    """
    read_consuming = ("M", "I", "S", "=", "X")
    offsets = []
    consumed = 0
    for operation, length in zip(cigar_ops, cigar_lens):
        # Record the position before this operation is applied.
        offsets.append(consumed)
        if operation not in read_consuming:
            continue
        consumed += length
    return offsets, consumed
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def process_cigard(name, sequence, quality, cigar, seed_size, len_add):
    """
    Cut a read into FastQ fragments at its soft-clip boundaries.

    The read is split where a soft-clipped segment meets the rest of the
    read. Each fragment is extended by ``len_add`` bases across the boundary,
    clamped to the read, and dropped when shorter than ``seed_size``. If
    every fragment is dropped, the whole read is returned instead, provided
    it is itself at least ``seed_size`` long.

    Parameters
    ----------
    name : str
        FastQ header used for every fragment.
    sequence, quality : str
        Read bases and quality string.
    cigar : str or None
        CIGAR string of the alignment.
    seed_size : int
        Minimum fragment length; a falsy value disables the filter.
    len_add : int
        Number of bases each fragment overlaps past a clip boundary.

    Returns
    -------
    tuple
        ``(name, fragments)`` where ``fragments`` is a list of FastQ records.
        The list is empty when ``cigar`` is ``None``, ``""`` or ``"*"``.

    Raises
    ------
    ValueError
        If the CIGAR contains more than two soft-clipped segments.

    Examples
    --------
    No soft clip: the whole read is returned.

    >>> process_cigard("@r", "ACGT", "IIII", "4M", seed_size=0, len_add=0)
    ('@r', ['@r\\nACGT\\n+\\nIIII\\n'])

    Leading soft clip with a one-base overlap.

    >>> process_cigard("@r", "ACGT", "?@AB", "2S2M", seed_size=0, len_add=1)[1]
    ['@r\\nACG\\n+\\n?@A\\n', '@r\\nCGT\\n+\\n@AB\\n']

    Leading soft clip preceded by a hard clip (H consumes no read base).

    >>> process_cigard("@r", "ACGTAC", "IIIIII", "2H2S4M", 0, 0)[1]
    ['@r\\nAC\\n+\\nII\\n', '@r\\nGTAC\\n+\\nIIII\\n']

    Two soft clips, seed_size=2: only the central fragment survives.

    >>> process_cigard("@r", "ACGT", "????", "1S2M1S", seed_size=2, len_add=0)[1]
    ['@r\\nCG\\n+\\n??\\n']

    Two soft clips, seed_size=3: everything is filtered, fall back to the read.

    >>> process_cigard("@r", "ACGT", "????", "1S2M1S", seed_size=3, len_add=0)[1]
    ['@r\\nACGT\\n+\\n????\\n']
    """
    if cigar in (None, "", "*"):
        return name, []

    cigar_ops, cigar_lens = build_cigar_tuple(cigar)
    # Read coordinate at the start of every CIGAR operation.
    qpos_before, _ = count_len(cigar_ops, cigar_lens)
    read_len = len(sequence)

    soft_clip_indices = [i for i, op in enumerate(cigar_ops) if op == "S"]

    if len(soft_clip_indices) > 2:
        raise ValueError(
            f"More than two soft clipped segments found in CIGAR string ({cigar}) for read {name}. problem with mapping ?"
        )

    if not soft_clip_indices:
        return name, [write_read(name, sequence, quality, 0, read_len)]

    def make_frag(start, stop):
        """Return the FastQ record for [start, stop), or None if rejected."""
        s = max(0, min(read_len, start))
        e = max(0, min(read_len, stop))
        if e <= s:
            return None
        if seed_size and (e - s) < seed_size:
            return None
        return write_read(name, sequence, quality, s, e)

    if len(soft_clip_indices) == 1:
        i = soft_clip_indices[0]
        if qpos_before[i] == 0:
            # Leading clip: nothing on the read precedes it, so the cut is at
            # its end. Testing the read offset rather than the CIGAR index
            # also covers a clip that follows a hard clip ("5H10S90M").
            boundary = cigar_lens[i]
        else:
            # Trailing clip: the cut is where the clip starts.
            boundary = qpos_before[i]
        bounds = [
            (0, boundary + len_add),
            (boundary - len_add, read_len),
        ]
    else:
        i0, i1 = soft_clip_indices
        first_clip_end = qpos_before[i0] + cigar_lens[i0]
        second_clip_start = qpos_before[i1]
        bounds = [
            (0, first_clip_end + len_add),
            (first_clip_end - len_add, second_clip_start + len_add),
            (second_clip_start - len_add, read_len),
        ]

    candidates = (make_frag(start, stop) for start, stop in bounds)
    frags = [frag for frag in candidates if frag is not None]

    # Everything was filtered out: keep the untouched read when it is itself
    # long enough, otherwise report no fragment at all.
    if not frags and (not seed_size or read_len >= seed_size):
        return name, [write_read(name, sequence, quality, 0, read_len)]
    return name, frags
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def read_name(base_name, tag_i, tag_j, tot_for, tot_rev, tags=None):
    """
    Build the FastQ header shared by the two mates of a fragment pair.

    Parameters
    ----------
    base_name : str
        Read name, e.g. ``'@READ'``.
    tag_i, tag_j : str
        Fragment identifiers such as ``'F1'`` or ``'R2'``.
    tot_for, tot_rev : int
        Total number of forward / reverse fragments of the read pair.
    tags : str, optional
        ``"no_tag"`` or ``"nt"`` returns the bare name; any other value
        (including ``None``) produces the annotated header.

    Returns
    -------
    str
        ``'<base_name>'`` or
        ``'<base_name>:[<tag_i>,<tag_j>:FT<tot_for>,RT<tot_rev>]'``.

    Examples
    --------
    >>> read_name("@READ", "F1", "R1", 3, 1, "o")
    '@READ:[F1,R1:FT3,RT1]'
    >>> read_name("@READ", "F1", "F2", 3, 1)
    '@READ:[F1,F2:FT3,RT1]'
    >>> read_name("@READ", "X", "Y", 3, 1, "nt")
    '@READ'
    """
    header = f"{base_name}"
    if tags not in ("no_tag", "nt"):
        header = f"{base_name}:[{tag_i},{tag_j}:FT{tot_for},RT{tot_rev}]"
    return header
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _fraglist_to_entries(frag_list, origin):
|
|
290
|
+
"""
|
|
291
|
+
Transforme une liste de FastQ en tuples (origin, idx, seq, qual).
|
|
292
|
+
|
|
293
|
+
Examples
|
|
294
|
+
--------
|
|
295
|
+
>>> _fraglist_to_entries(["@x\\nAC\\n+\\n!!\\n"], "F")
|
|
296
|
+
[('F', 0, 'AC', '!!')]
|
|
297
|
+
"""
|
|
298
|
+
entries = []
|
|
299
|
+
for idx, frag in enumerate(frag_list):
|
|
300
|
+
lines = frag.strip().split("\n")
|
|
301
|
+
if len(lines) != 4 or lines[2] != "+":
|
|
302
|
+
raise ValueError("Fragment FastQ invalide dans process_cigard.")
|
|
303
|
+
seq = lines[1]
|
|
304
|
+
qual = lines[3]
|
|
305
|
+
entries.append((origin, idx, seq, qual))
|
|
306
|
+
return entries
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _emit_pair(base_name, e1, e2, tot_for, tot_rev, tags=None):
    """
    Render two fragment entries as a pair of FastQ records sharing one header.

    Parameters
    ----------
    base_name : str
        Read name passed to ``read_name``.
    e1, e2 : tuple
        Entries of the form ``(origin, idx, seq, qual)``; ``idx`` is 0-based
        and is written 1-based in the header tag.
    tot_for, tot_rev : int
        Total number of forward / reverse fragments, written in the header.
    tags : str, optional
        Header mode forwarded to ``read_name``.

    Returns
    -------
    tuple of str
        ``(record_for_e1, record_for_e2)``.
    """
    origin_a, index_a, seq_a, qual_a = e1
    origin_b, index_b, seq_b, qual_b = e2

    header = read_name(
        base_name,
        f"{origin_a}{index_a + 1}",
        f"{origin_b}{index_b + 1}",
        tot_for,
        tot_rev,
        tags=tags,
    )

    records = tuple(
        f"{header}\n{seq}\n+\n{qual}\n"
        for seq, qual in ((seq_a, qual_a), (seq_b, qual_b))
    )
    return records[0], records[1]
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def gen_read_pairs_from_frags_cover(base_name, frags_f, frags_r, tags=None):
    """
    Build a small set of fragment pairs in which every fragment appears.

    Strategy:
      1. pair forward and reverse fragments index by index while both exist;
      2. pair the fragments left over on the longer side two by two;
      3. if one leftover remains, pair it with the first fragment of the
         other side, or with the first leftover when the other side is empty.

    The work is linear in the number of fragments.

    Parameters
    ----------
    base_name : str
        Read name used to build each pair header.
    frags_f, frags_r : list of str
        Forward / reverse FastQ records (``'@name\\nSEQ\\n+\\nQUAL\\n'``).
    tags : str, optional
        Header mode forwarded to ``read_name`` through ``_emit_pair``.

    Returns
    -------
    list of str
        ``[forward_fastq, reverse_fastq]``: the concatenated first and second
        mates of every pair. ``["", ""]`` when fewer than two fragments are
        supplied in total.

    Examples
    --------
    >>> frags_f = ["@x\\nAC\\n+\\n??\\n"]
    >>> frags_r = ["@x\\nTG\\n+\\n!!\\n"]
    >>> F, R = gen_read_pairs_from_frags_cover("@READ", frags_f, frags_r)
    >>> F
    '@READ:[F1,R1:FT1,RT1]\\nAC\\n+\\n??\\n'
    >>> R
    '@READ:[F1,R1:FT1,RT1]\\nTG\\n+\\n!!\\n'

    Four forward and one reverse fragment need ceil(5/2) = 3 pairs.

    >>> frags_f = ["@x\\nA\\n+\\n!\\n", "@x\\nB\\n+\\n!\\n", "@x\\nC\\n+\\n!\\n", "@x\\nD\\n+\\n!\\n"]
    >>> frags_r = ["@x\\nZ\\n+\\n!\\n"]
    >>> F, R = gen_read_pairs_from_frags_cover("@READ", frags_f, frags_r)
    >>> F.count('@READ:['), R.count('@READ:[')
    (3, 3)
    """
    for_entries = _fraglist_to_entries(frags_f, "F")
    rev_entries = _fraglist_to_entries(frags_r, "R")

    tot_for = len(for_entries)
    tot_rev = len(rev_entries)

    # A pair needs two fragments.
    if tot_for + tot_rev < 2:
        return ["", ""]

    # 1) Forward/reverse pairs; zip stops at the shorter side.
    n_cross = min(tot_for, tot_rev)
    pairs = list(zip(for_entries, rev_entries))

    # 2) Whatever remains lives on one side only.
    if tot_for > tot_rev:
        leftovers = for_entries[n_cross:]
        opposite_side = rev_entries
    else:
        leftovers = rev_entries[n_cross:]
        opposite_side = for_entries

    for j in range(0, len(leftovers) - 1, 2):
        pairs.append((leftovers[j], leftovers[j + 1]))

    # 3) An odd leftover is attached to an already used fragment. When the
    # opposite side is empty, the guard above guarantees at least two
    # leftovers (hence at least three here), so leftovers[0] is a distinct
    # fragment.
    if len(leftovers) % 2:
        partner = opposite_side[0] if opposite_side else leftovers[0]
        pairs.append((leftovers[-1], partner))

    emitted = [
        _emit_pair(base_name, e1, e2, tot_for, tot_rev, tags=tags)
        for e1, e2 in pairs
    ]
    return [
        "".join(first for first, _ in emitted),
        "".join(second for _, second in emitted),
    ]
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def gen_read_pairs_from_frags_all(base_name, frags_f, frags_r, tags=None):
    """
    Build every unordered pair of fragments of a read pair.

    Forward fragments are listed first, then reverse fragments, and all
    2-combinations of that list are emitted in ``itertools.combinations``
    order. The first member of each pair goes to the forward output, the
    second to the reverse output.

    Parameters
    ----------
    base_name : str
        Read name used to build each pair header.
    frags_f, frags_r : list of str
        Forward / reverse FastQ records (``'@name\\nSEQ\\n+\\nQUAL\\n'``).
    tags : str, optional
        Header mode forwarded to ``read_name`` through ``_emit_pair``.

    Returns
    -------
    list of str
        ``[forward_fastq, reverse_fastq]``, both empty strings when fewer
        than two fragments are supplied in total.

    Examples
    --------
    >>> frags_f = ["@x\\nAC\\n+\\n??\\n"]
    >>> frags_r = ["@x\\nTG\\n+\\n!!\\n"]
    >>> F, R = gen_read_pairs_from_frags_all("@READ", frags_f, frags_r)
    >>> F
    '@READ:[F1,R1:FT1,RT1]\\nAC\\n+\\n??\\n'
    >>> R
    '@READ:[F1,R1:FT1,RT1]\\nTG\\n+\\n!!\\n'

    Without tags:

    >>> F, R = gen_read_pairs_from_frags_all("@READ", frags_f, frags_r, tags="nt")
    >>> F
    '@READ\\nAC\\n+\\n??\\n'

    Two forward and one reverse fragment give (F1,F2), (F1,R1), (F2,R1).

    >>> frags_f = ["@x\\nA\\n+\\n=\\n", "@x\\nBCD\\n+\\n===\\n"]
    >>> frags_r = ["@x\\nWXYZ\\n+\\n>>>>\\n"]
    >>> F, R = gen_read_pairs_from_frags_all("@READ", frags_f, frags_r, tags="o")
    >>> F
    '@READ:[F1,F2:FT2,RT1]\\nA\\n+\\n=\\n@READ:[F1,R1:FT2,RT1]\\nA\\n+\\n=\\n@READ:[F2,R1:FT2,RT1]\\nBCD\\n+\\n===\\n'
    >>> R
    '@READ:[F1,F2:FT2,RT1]\\nBCD\\n+\\n===\\n@READ:[F1,R1:FT2,RT1]\\nWXYZ\\n+\\n>>>>\\n@READ:[F2,R1:FT2,RT1]\\nWXYZ\\n+\\n>>>>\\n'
    """
    from itertools import combinations

    # Parsing and record formatting are shared with the "cover" strategy
    # through the module-level helpers.
    for_entries = _fraglist_to_entries(frags_f, "F")
    rev_entries = _fraglist_to_entries(frags_r, "R")
    tot_for = len(for_entries)
    tot_rev = len(rev_entries)

    fastq_forward = []
    fastq_reverse = []

    # All unique combinations (i < j) over forward-then-reverse entries.
    for e1, e2 in combinations(for_entries + rev_entries, 2):
        fq_f, fq_r = _emit_pair(base_name, e1, e2, tot_for, tot_rev, tags=tags)
        fastq_forward.append(fq_f)
        fastq_reverse.append(fq_r)

    return ["".join(fastq_forward), "".join(fastq_reverse)]
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def gen_read_pairs_from_frags(
    base_name, frags_f, frags_r, tags=None, pairing_mode="all"
):
    """
    Dispatch to the pair-generation strategy named by ``pairing_mode``.

    Parameters
    ----------
    base_name : str
        Read name used to build each pair header.
    frags_f, frags_r : list of str
        Forward / reverse FastQ records.
    tags : str, optional
        Header mode forwarded to the selected strategy.
    pairing_mode : str
        ``"all"`` selects ``gen_read_pairs_from_frags_all``; ``"cover"``
        selects ``gen_read_pairs_from_frags_cover``.

    Returns
    -------
    list of str
        Whatever the selected strategy returns.

    Raises
    ------
    ValueError
        If ``pairing_mode`` is neither ``"all"`` nor ``"cover"``.
    """
    if pairing_mode not in ("all", "cover"):
        raise ValueError(f"Unknown pairing_mode: {pairing_mode}")

    strategy = (
        gen_read_pairs_from_frags_all
        if pairing_mode == "all"
        else gen_read_pairs_from_frags_cover
    )
    return strategy(base_name, frags_f, frags_r, tags=tags)
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
# TODO: handle the case where seq == "*" (sequence not stored in the SAM record)
|
|
534
|
+
def sam_fields(sam_line: str):
    """
    Extract the name, sequence, quality and CIGAR of one SAM record.

    Parameters
    ----------
    sam_line : str
        Tab-separated SAM alignment line; a trailing newline is tolerated.

    Returns
    -------
    tuple of str
        ``(name, seq, qual, cigar)`` taken from columns 1, 10, 11 and 6.
        ``name`` is prefixed with ``@`` unless it already starts with one.

    Raises
    ------
    ValueError
        If the line has fewer than 11 tab-separated fields.
    """
    columns = sam_line.rstrip("\n").split("\t")
    if len(columns) < 11:
        raise ValueError(
            f"Malformed SAM line: expected >= 11 fields : \n {sam_line}\n\n"
        )

    name, cigar, seq, qual = columns[0], columns[5], columns[9], columns[10]
    if not name.startswith("@"):
        name = f"@{name}"
    return name, seq, qual, cigar
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def process_items(
    input_queue,
    output_queue,
    bam_queue,
    seed_size,
    len_add,
    tags=None,
    bam_batch_size=1000,
    fastq_batch_items=1250,
    pairing_mode="cover",
):
    """
    Worker loop: split read pairs on their CIGAR and route the results.

    Batches of ``(sam_forward, sam_reverse)`` lines are read from
    ``input_queue`` until a ``None`` sentinel arrives. Pairs that cannot be
    split are buffered and sent to ``bam_queue`` as lists of SAM-line tuples;
    split pairs are sent to ``output_queue`` as ``(forward_fastq,
    reverse_fastq)`` strings. Pairs rejected by ``check_data`` are skipped.
    On the sentinel, pending buffers are flushed and ``None`` is put on both
    output queues.

    Parameters
    ----------
    input_queue : Queue
        Source of batches; ``None`` ends the loop.
    output_queue : Queue
        Destination for FastQ text of split pairs.
    bam_queue : Queue
        Destination for unsplit SAM pairs.
    seed_size : int
        Minimum fragment length, forwarded to ``process_cigard``.
    len_add : int
        Overlap added across clip boundaries, forwarded to ``process_cigard``.
    tags : str, optional
        Header mode forwarded to ``gen_read_pairs_from_frags``.
    bam_batch_size : int
        Number of buffered unsplit pairs that triggers a flush.
    fastq_batch_items : int
        Number of buffered split pairs that triggers a flush.
    pairing_mode : str
        Pairing strategy forwarded to ``gen_read_pairs_from_frags``.

    Notes
    -----
    Any exception puts ``None`` on both output queues (best effort), prints
    the error to stderr and exits the process with status 1.
    """
    bam_buf = []
    fq_f_chunks = []
    fq_r_chunks = []

    def flush_bam():
        if bam_buf:
            # Enqueue a snapshot: multiprocessing.Queue pickles objects in a
            # background feeder thread, so clearing the very list that was
            # put could send an empty batch and lose the pairs.
            bam_queue.put(list(bam_buf))
            bam_buf.clear()

    def flush_fastq():
        if fq_f_chunks:
            # The joined strings are new immutable objects, safe to enqueue.
            output_queue.put(("".join(fq_f_chunks), "".join(fq_r_chunks)))
            fq_f_chunks.clear()
            fq_r_chunks.clear()

    def keep_unsplit(pair):
        """Buffer a pair for the BAM writer, flushing when the batch is full."""
        bam_buf.append(pair)
        if len(bam_buf) >= bam_batch_size:
            flush_bam()

    try:
        while True:
            batch = input_queue.get()
            if batch is None:
                # Flush what is pending, then signal the end to each writer.
                flush_bam()
                flush_fastq()
                output_queue.put(None)
                bam_queue.put(None)
                break

            # batch = list[(sam_f, sam_r)]
            for sam_f, sam_r in batch:
                name_f, seq_f, qual_f, cig_f = sam_fields(sam_f)
                name_r, seq_r, qual_r, cig_r = sam_fields(sam_r)
                if name_f != name_r:
                    raise ValueError("File / Pairs unsynchronized")

                if not check_data(
                    (name_f, seq_f, qual_f, cig_f, name_r, seq_r, qual_r, cig_r)
                ):
                    continue

                # Fast path: without any soft clip nothing can be split.
                # This also covers two unmapped mates (CIGAR "*").
                if "S" not in str(cig_f) and "S" not in str(cig_r):
                    keep_unsplit((sam_f, sam_r))
                    continue

                base_name, frags_f = process_cigard(
                    name=name_f,
                    sequence=seq_f,
                    quality=qual_f,
                    cigar=cig_f,
                    seed_size=seed_size,
                    len_add=len_add,
                )
                _, frags_r = process_cigard(
                    name=name_r,
                    sequence=seq_r,
                    quality=qual_r,
                    cigar=cig_r,
                    seed_size=seed_size,
                    len_add=len_add,
                )

                # One fragment per mate, or fewer than two fragments overall:
                # the pair is kept as it was.
                n_f, n_r = len(frags_f), len(frags_r)
                if (n_f == 1 and n_r == 1) or n_f + n_r < 2:
                    keep_unsplit((sam_f, sam_r))
                    continue

                fastq_forward, fastq_reverse = gen_read_pairs_from_frags(
                    base_name=base_name,
                    frags_f=frags_f,
                    frags_r=frags_r,
                    tags=tags,
                    pairing_mode=pairing_mode,
                )

                if not fastq_forward or not fastq_reverse:
                    keep_unsplit((sam_f, sam_r))
                    continue

                fq_f_chunks.append(fastq_forward)
                fq_r_chunks.append(fastq_reverse)

                if len(fq_f_chunks) >= fastq_batch_items:
                    flush_fastq()

    except Exception as e:
        # Best effort: try to send the end-of-stream sentinels before exiting;
        # a failure here must not hide the original error.
        try:
            output_queue.put(None)
            bam_queue.put(None)
        except Exception:
            pass
        print(f"Error in process_items: {e}", file=sys.stderr)
        sys.exit(1)
|
|
687
|
+
|
|
688
|
+
|
|
689
|
+
def _unsplit_bam_path(fastq_path):
    """Return '<dir>/<stem>_unsplit.bam' for a FastQ output path.

    Only the file name is truncated at its first dot, so dots in directory
    names (``./out``, ``/data/run.1``) are preserved.
    """
    directory, filename = os.path.split(str(fastq_path))
    # Fall back to the full file name when it starts with a dot.
    stem = filename.split(".", 1)[0] or filename
    return os.path.join(directory, stem + "_unsplit.bam")


def cut(args):
    """
    Orchestrate reading BAM input, splitting reads and writing the outputs.

    A reader process, ``compute_processes`` instances of ``process_items``,
    a FastQ writer and a BAM writer are started, connected by three bounded
    queues, and monitored until the process manager reports completion.

    Parameters
    ----------
    args : argparse.Namespace
        Reads ``bam_1``, ``bam_2``, ``single_bam``, ``output_forward``,
        ``output_reverse``, ``num_threads``, ``seed_size``, ``lenght_added``,
        ``tags`` and ``pairing_mode``.

    Returns
    -------
    int or None
        ``2`` when thread partitioning fails or an exception occurs while
        workers are running; ``None`` otherwise.

    Notes
    -----
    Exits the interpreter with status 1 when an input BAM is missing, when
    headers cannot be read, or when the process check fails.
    """
    bam_for_file = args.bam_1
    bam_rev_file = args.bam_2
    single_bam = args.single_bam

    output_forward = args.output_forward
    output_reverse = args.output_reverse
    num_threads = args.num_threads
    seed_size = args.seed_size
    len_add = args.lenght_added
    tags = args.tags
    pairing_mode = args.pairing_mode

    if single_bam:
        if not os.path.exists(bam_for_file):
            logger.error("Single BAM file does not exist.")
            sys.exit(1)
    elif not os.path.exists(bam_for_file) or not os.path.exists(bam_rev_file):
        logger.error("BAM file does not exist.")
        sys.exit(1)

    try:
        print("Extracting BAM headers...", flush=True)
        if single_bam:
            header_single = get_bam_header_single(bam_for_file)
        else:
            header_for, header_rev = get_bam_headers(bam_for_file, bam_rev_file)
    except Exception as e:
        print(f"Error reading BAM headers: {e}", file=sys.stderr)
        sys.exit(1)

    # Unsplit pairs are written next to the FastQ outputs.
    if single_bam:
        output_bam_single = _unsplit_bam_path(args.output_forward)
    else:
        output_bam_f = _unsplit_bam_path(args.output_forward)
        output_bam_r = _unsplit_bam_path(args.output_reverse)

    input_queue = Queue(maxsize=750)
    output_queue = Queue(maxsize=100)
    unsplittable_queue = Queue(maxsize=100)

    try:
        pigz_per_file, compute_processes, bam_threads = partitionning(
            num_threads,
            single_bam=single_bam,
        )
    except ValueError as e:
        print(f"ERROR: {e}", file=sys.stderr)
        return 2

    handle_write_cmd(
        bam_1=bam_for_file,
        bam_2=bam_rev_file if not single_bam else None,
        output_fq1=output_forward,
        output_fq2=output_reverse,
        output_bam1=output_bam_single if single_bam else output_bam_f,
        output_bam2=None if single_bam else output_bam_r,
        args=args,
    )

    manager = ProcessManager()
    signal.signal(signal.SIGINT, manager.handle_signal)
    signal.signal(signal.SIGTERM, manager.handle_signal)

    try:
        # Reader
        if single_bam:
            manager.start_worker(
                target=read_bam_interleaved,
                args=(
                    bam_for_file,
                    input_queue,
                    compute_processes,
                    bam_threads,
                ),
            )
        else:
            manager.start_worker(
                target=read_bam_pair,
                args=(
                    bam_for_file,
                    bam_rev_file,
                    input_queue,
                    compute_processes,
                    bam_threads,
                ),
            )

        # Compute workers
        for _ in range(compute_processes):
            manager.start_worker(
                target=process_items,
                args=(
                    input_queue,
                    output_queue,
                    unsplittable_queue,
                    seed_size,
                    len_add,
                    tags,
                    1000,  # bam_batch_size
                    1250,  # fastq_batch_items
                    pairing_mode,
                ),
            )

        # FASTQ writer
        manager.start_worker(
            target=write_fastq_pair,
            args=(
                output_queue,
                output_forward,
                output_reverse,
                compute_processes,
                pigz_per_file,
            ),
        )

        # BAM writer
        if single_bam:
            manager.start_worker(
                target=write_bam_interleaved_from_sam,
                args=(
                    unsplittable_queue,
                    output_bam_single,
                    header_single,
                    compute_processes,
                    bam_threads,
                ),
            )
        else:
            manager.start_worker(
                target=write_bam_pair_from_sam,
                args=(
                    unsplittable_queue,
                    output_bam_f,
                    output_bam_r,
                    header_for,
                    header_rev,
                    compute_processes,
                    bam_threads,
                ),
            )

        # Poll once per second until the manager reports completion.
        while manager.running():
            if not manager.check_processes():
                sys.exit(1)
            time.sleep(1)

    except Exception as e:
        print(f"ERROR: {e}", file=sys.stderr)
        return 2
    finally:
        # Runs on success, on error and on sys.exit, so the except branch
        # does not need its own shutdown call.
        manager.shutdown()
|