split3c 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- split3c/__init__.py +0 -0
- split3c/cli.py +336 -0
- split3c/nssite/__init__.py +0 -0
- split3c/nssite/auxiliary.py +190 -0
- split3c/nssite/bam.py +299 -0
- split3c/nssite/fastq.py +148 -0
- split3c/nssite/main.py +368 -0
- split3c/nssite/processmanager.py +51 -0
- split3c/nssite/split.py +849 -0
- split3c/resite/__init__.py +33 -0
- split3c/resite/frag.py +576 -0
- split3c/resite/header.py +91 -0
- split3c/resite/index.py +236 -0
- split3c/resite/main.py +506 -0
- split3c/resite/pretreatment.py +299 -0
- split3c/resite/read.py +91 -0
- split3c/resite/write_control.py +111 -0
- split3c/resolve/__init__.py +0 -0
- split3c/resolve/bam.py +129 -0
- split3c/resolve/io_utils.py +77 -0
- split3c/resolve/main.py +506 -0
- split3c/resolve/pairs.py +56 -0
- split3c/resolve/parse.py +1218 -0
- split3c-0.0.1.dist-info/METADATA +100 -0
- split3c-0.0.1.dist-info/RECORD +29 -0
- split3c-0.0.1.dist-info/WHEEL +5 -0
- split3c-0.0.1.dist-info/entry_points.txt +5 -0
- split3c-0.0.1.dist-info/licenses/LICENSE +235 -0
- split3c-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
|
|
3
|
+
|
|
4
|
+
Copyright © 2024 Samir Bertache
|
|
5
|
+
|
|
6
|
+
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
7
|
+
|
|
8
|
+
===============================================================================
|
|
9
|
+
|
|
10
|
+
This program is free software: you can redistribute it and/or modify it under
|
|
11
|
+
the terms of the GNU Affero General Public License as published by the
|
|
12
|
+
Free Software Foundation, either version 3 of the License, or (at your option)
|
|
13
|
+
any later version.
|
|
14
|
+
|
|
15
|
+
This program is distributed in the hope that it will be useful,
|
|
16
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
18
|
+
See the GNU Affero General Public License for more details.
|
|
19
|
+
|
|
20
|
+
You should have received a copy of the GNU Affero General Public License
|
|
21
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import logging
|
|
25
|
+
import re
|
|
26
|
+
import sys
|
|
27
|
+
from typing import List, Tuple
|
|
28
|
+
|
|
29
|
+
from Bio.Restriction import RestrictionBatch
|
|
30
|
+
from Bio.Seq import Seq
|
|
31
|
+
|
|
32
|
+
logging.basicConfig(level=logging.INFO)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
############################### Common part #################################
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def case_adaptation(List_Enzyme):
    """
    Normalise enzyme names to the spelling used for the known enzymes.

    Every entry is converted to ``str`` and stripped of surrounding
    whitespace, then looked up case-insensitively.  ``"arima"`` expands to
    two enzymes (DpnII and HinfI); a name that is not in the table is kept
    as given, apart from the stripping.

    Examples
    --------
    >>> case_adaptation(["hindiii"])
    ['HindIII']
    >>> case_adaptation(["dpnii", "bglii", "mboi"])
    ['DpnII', 'BglII', 'MboI']
    >>> case_adaptation(["arima"])
    ['DpnII', 'HinfI']
    >>> case_adaptation([" Foo ", "arima", "DpnII"])
    ['Foo', 'DpnII', 'HinfI', 'DpnII']
    """
    canonical_names = {
        "hindiii": ("HindIII",),
        "dpnii": ("DpnII",),
        "bglii": ("BglII",),
        "mboi": ("MboI",),
        # Double enzyme
        "arima": ("DpnII", "HinfI"),
    }

    adapted = []
    for raw_name in List_Enzyme:
        cleaned = str(raw_name).strip()
        # Unknown names fall back to the stripped input itself.
        adapted.extend(canonical_names.get(cleaned.lower(), (cleaned,)))
    return adapted
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def find_liga_sites(
    List_Enzyme: List[str], borderless: bool = False
) -> List[Tuple[re.Pattern, int]]:
    """
    Build the ligation-site patterns for a given list of enzymes, together
    with a length for each pattern.

    For each enzyme, the string returned by ``elucidate()`` is split into a
    "give" site (everything before the ``_`` marker, ``^`` removed, leading
    ``N`` dropped) and an "accept" site (everything after the ``^`` marker,
    ``_`` removed, trailing ``N`` dropped).  Every give/accept combination is
    concatenated into a ligation site in which ``N`` becomes the regex
    wildcard ``.``.  When a ligation site differs from its reverse
    complement, the reverse complement is added as an extra pattern.

    Parameters:
        List_Enzyme (List[str]): A list of enzymes for which to find the ligation
                                 sites.

        borderless (bool, optional): If True, the total length of the give and
                                     accept sites is used.
                                     If False, only the length of the give site
                                     is used (the accept site for a
                                     reverse-complement pattern).
                                     Default is False.

    Returns:
        List[Tuple[re.Pattern, int]]: A list of tuples, where each tuple contains
                                      a compiled regular expression
                                      pattern for the ligation site and the length
                                      of the site.

    Raises:
        ValueError: If an enzyme's cut-site description lacks the ``^`` or
                    ``_`` marker, or if its give or accept site is empty once
                    the ``N`` padding is removed.

    Examples
    --------
    >>> out = find_liga_sites(["DpnII"])
    >>> isinstance(out, list) and len(out) >= 1
    True
    >>> any(p.pattern == "GATCGATC" and off == 4 for p, off in out)
    True

    >>> out_b = find_liga_sites(["DpnII"], borderless=True)
    >>> any(p.pattern == "GATCGATC" and off == 8 for p, off in out_b)
    True

    # This function is inspired by and adapted from the Cutsite function in Hicstuff
    # (https://github.com/koszullab/hicstuff), originally under BSD license.
    # See https://github.com/koszullab/hicstuff/blob/main/LICENSE for the full license.
    """
    restriction_batch = RestrictionBatch(List_Enzyme)
    give_list = []
    accept_list = []
    ligation_site_list = []

    for enz in restriction_batch:
        site = enz.elucidate()
        fw_cut = site.find("^")
        rev_cut = site.find("_")

        # str.find() returns -1 when a marker is absent; slicing with -1 would
        # silently produce a meaningless site, so refuse such descriptions.
        if fw_cut == -1 or rev_cut == -1:
            raise ValueError(
                f"Cannot derive a ligation site for enzyme {enz}: "
                f"cut-site description {site!r} has no '^'/'_' markers"
            )

        # Purify give site
        give_site = site[:rev_cut].replace("^", "").lstrip("N")
        # Purify accept site
        accept_site = site[fw_cut + 1 :].replace("_", "").rstrip("N")

        if not give_site or not accept_site:
            raise ValueError(
                f"Cannot derive a ligation site for enzyme {enz}: "
                f"empty give/accept site in {site!r}"
            )

        give_list.append(give_site)
        accept_list.append(accept_site)

    # Find ligation site
    for give_site in give_list:
        for accept_site in accept_list:
            ligation_site = (give_site + accept_site).replace("N", ".")
            total_length = len(give_site) + len(accept_site)

            # Use total length for borderless
            length = total_length if borderless else len(give_site)
            ligation_site_list.append((re.compile(ligation_site), length))

            # If ligation site is not palindromic
            reverse_complement_site = str(Seq(ligation_site).reverse_complement())

            if ligation_site != reverse_complement_site:
                # Use length of accept site for reverse complement site
                length = total_length if borderless else len(accept_site)
                ligation_site_list.append(
                    (re.compile(reverse_complement_site), length)
                )

    return ligation_site_list
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def search_in_database(enzymes, borderless=False):
    """
    Resolve a comma-separated enzyme string into its ligation sites.

    The special value ``"No restriction enzyme found"`` is printed and the
    program exits with status 0.  Otherwise the string is split on commas,
    the names are normalised with ``case_adaptation`` and handed to
    ``find_liga_sites``; the resulting sites are printed and returned.  Any
    exception raised while doing so is re-raised as ``RuntimeError`` with the
    original exception as its cause.

    Examples
    --------
    >>> import io, contextlib
    >>> buf = io.StringIO()
    >>> with contextlib.redirect_stdout(buf):
    ...     out = search_in_database("DpnII")
    >>> isinstance(out, list) and len(out) >= 1
    True
    >>> any(p.pattern == "GATCGATC" and off == 4 for p, off in out)
    True

    >>> buf = io.StringIO()
    >>> with contextlib.redirect_stdout(buf):
    ...     out = search_in_database("DpnII", borderless=True)
    >>> any(p.pattern == "GATCGATC" and off == 8 for p, off in out_b) if False else any(p.pattern == "GATCGATC" and off == 8 for p, off in out)
    True
    >>> "Mode Borderless" in buf.getvalue()
    True

    >>> buf = io.StringIO()
    >>> try:
    ...     with contextlib.redirect_stdout(buf):
    ...         search_in_database("No restriction enzyme found")
    ... except SystemExit as e:
    ...     code = e.code
    >>> code
    0
    >>> "No restriction enzyme found" in buf.getvalue()
    True
    """
    # Sentinel input: report it and stop the program.
    if enzymes == "No restriction enzyme found":
        print(enzymes)
        sys.exit(0)

    if borderless:
        print("Mode Borderless")

    requested = enzymes.split(",")
    try:
        sites = find_liga_sites(case_adaptation(requested), borderless)
        if len(sites) > 1:
            for entry in sites:
                print(f"Ligation sites: {entry[0]}", flush=True)
        else:
            print(f"Ligation sites: {sites[0]}", flush=True)
        return sites

    except Exception as err:
        raise RuntimeError(
            f"Error in enzyme identification for input={enzymes!r}"
        ) from err
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _split_two(total: int) -> tuple[int, int]:
    """
    Split ``total`` into two integer shares that differ by at most 1,
    each at least 1.  The second share receives the odd unit.

    Examples
    --------
    >>> _split_two(2)
    (1, 1)
    >>> _split_two(5)
    (2, 3)
    >>> _split_two(8)
    (4, 4)
    >>> _split_two(1)
    Traceback (most recent call last):
    ...
    ValueError: total doit être ≥ 2, ici 1
    """
    if total < 2:
        raise ValueError(f"total doit être ≥ 2, ici {total}")
    # With total >= 2 the floor half is already >= 1, so no clamping is needed.
    half, leftover = divmod(total, 2)
    return half, half + leftover
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def partition_threads(
    num_threads: int, oversubscribe_factor: float = 1.35
) -> tuple[int, int, int]:
    """
    Return (TRead_total, TFrag, TWrite_total), with TRead/TWrite always EVEN.

    Minimum tier of 5 threads: 1 reader per stream, 1 fragmenter, 1 writer
    per stream.  Threads beyond that are handed out in pairs, first to the
    writers, then to the readers, alternating; an odd leftover goes to
    fragmentation.

    The totals are then over-allocated up to
    ``floor(num_threads * oversubscribe_factor)`` using the same
    write/read/fragmentation order (program doesn't use 1 CPU for 1 thread).

    Raises ``ValueError`` when ``num_threads`` is below 5.
    """
    import math

    if num_threads < 5:
        raise ValueError(f"num_threads doit être ≥ 5, ici {num_threads}")

    # Pairs alternate write, read, write, ... so writers get the rounded-up
    # half of the pairs and readers the rounded-down half.
    spare_pairs, odd_spare = divmod(num_threads - 5, 2)
    writers_per_file = 1 + (spare_pairs + 1) // 2
    readers_per_file = 1 + spare_pairs // 2

    tread = 2 * readers_per_file
    twrite = 2 * writers_per_file
    tfrag = 1 + odd_spare

    target_total = math.floor(num_threads * oversubscribe_factor)
    surplus = target_total - (tread + twrite + tfrag)

    # Nothing is added (or removed) when the target does not exceed the
    # nominal allocation.
    if surplus > 0:
        bonus_pairs, odd_bonus = divmod(surplus, 2)
        twrite += 2 * ((bonus_pairs + 1) // 2)
        tread += 2 * (bonus_pairs // 2)
        tfrag += odd_bonus

    return tread, tfrag, twrite
|
split3c/resite/read.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
|
|
3
|
+
|
|
4
|
+
Copyright © 2024 Samir Bertache
|
|
5
|
+
|
|
6
|
+
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
7
|
+
|
|
8
|
+
===============================================================================
|
|
9
|
+
|
|
10
|
+
This program is free software: you can redistribute it and/or modify it under
|
|
11
|
+
the terms of the GNU Affero General Public License as published by the
|
|
12
|
+
Free Software Foundation, either version 3 of the License, or (at your option)
|
|
13
|
+
any later version.
|
|
14
|
+
|
|
15
|
+
This program is distributed in the hope that it will be useful,
|
|
16
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
18
|
+
See the GNU Affero General Public License for more details.
|
|
19
|
+
|
|
20
|
+
You should have received a copy of the GNU Affero General Public License
|
|
21
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import logging
|
|
25
|
+
import subprocess
|
|
26
|
+
|
|
27
|
+
logging.basicConfig(level=logging.INFO)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def stop_signal(Queue, NumThreadFragmentation):
    """
    Put one ``None`` stop marker on the queue per fragmentation thread.
    """
    for marker in [None] * NumThreadFragmentation:
        Queue.put(marker)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def read_fastq_gzip_simultaneously(
    fileA: str, fileB: str, Queue, num_threads, NumThreadFragmentation
):
    """
    Read two gzipped FASTQ files in lockstep and queue the read pairs.

    Both files are decompressed by ``pigz`` subprocesses, with
    ``num_threads`` split between them.  Records are read four lines at a
    time from each stream; each pair is stored as
    ``[[nameA, nameB], [seqA, seqB], [qualA, qualB]]`` and pairs are put on
    ``Queue`` in blocks.  Reading stops as soon as either stream yields an
    empty sequence line.

    Errors raised while reading are logged, not propagated.  In every case
    ``NumThreadFragmentation`` stop markers are queued afterwards and both
    pigz processes are reaped.
    """
    from .pretreatment import _split_two

    def next_record(stream):
        # One FASTQ record: name, sequence, '+' separator (discarded), quality.
        name = stream.readline().rstrip()
        seq = stream.readline().rstrip()
        stream.readline()  # Skip +
        qual = stream.readline().rstrip()
        return name, seq, qual

    tA, tB = _split_two(num_threads)
    # Use pigz to decompress the input files
    procA = subprocess.Popen(
        ["pigz", "-dc", "-p", str(tA), fileA],
        stdout=subprocess.PIPE,
        text=True,
    )
    procB = subprocess.Popen(
        ["pigz", "-dc", "-p", str(tB), fileB],
        stdout=subprocess.PIPE,
        text=True,
    )

    Stacker = []
    try:
        while True:
            NomA, seqA, qualA = next_record(procA.stdout)
            NomB, seqB, qualB = next_record(procB.stdout)

            if not seqA or not seqB:
                if bool(seqA) != bool(seqB):
                    # Only one stream ran dry: the remaining reads of the
                    # other file are dropped, so make that visible.
                    logging.warning(
                        "Inputs %s and %s do not yield the same number of "
                        "reads; stopping at the shorter one",
                        fileA,
                        fileB,
                    )
                break

            Stacker.append([[NomA, NomB], [seqA, seqB], [qualA, qualB]])

            if len(Stacker) > 256:
                Queue.put(Stacker)
                Stacker = []

        if len(Stacker) > 0:
            Queue.put(Stacker)

    except Exception as e:
        logging.error(f"Error in read_fastq_gzip_simultaneously: {e}")

    finally:
        stop_signal(Queue, NumThreadFragmentation)
        # Close our ends of the pipes before waiting: if we stopped early,
        # pigz may still be blocked writing to a full pipe and wait() would
        # never return.
        for proc in (procA, procB):
            if proc.stdout is not None:
                proc.stdout.close()
        procA.wait()
        procB.wait()
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
|
|
3
|
+
|
|
4
|
+
Copyright © 2024 Samir Bertache
|
|
5
|
+
|
|
6
|
+
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
7
|
+
|
|
8
|
+
===============================================================================
|
|
9
|
+
|
|
10
|
+
This program is free software: you can redistribute it and/or modify it under
|
|
11
|
+
the terms of the GNU Affero General Public License as published by the
|
|
12
|
+
Free Software Foundation, either version 3 of the License, or (at your option)
|
|
13
|
+
any later version.
|
|
14
|
+
|
|
15
|
+
This program is distributed in the hope that it will be useful,
|
|
16
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
18
|
+
See the GNU Affero General Public License for more details.
|
|
19
|
+
|
|
20
|
+
You should have received a copy of the GNU Affero General Public License
|
|
21
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import logging
|
|
25
|
+
import signal
|
|
26
|
+
import subprocess
|
|
27
|
+
import sys
|
|
28
|
+
|
|
29
|
+
# Setup logging
|
|
30
|
+
logging.basicConfig(level=logging.INFO)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def signal_handler(sig, frame, outF, outR):
    """
    Handle a termination signal: announce it, terminate both output
    processes, log the request and exit with status 0.

    ``frame`` is accepted for compatibility with the signal-handler calling
    convention and is not used.
    """
    print(f"\nReceived signal {sig}. Terminating gracefully...")
    # Terminate the pigz processes, forward first and then reverse.
    for compressor in (outF, outR):
        compressor.terminate()
    logging.info("\nProcess termination requested by signal")
    sys.exit(0)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def open_output(TWrite, output_forward, output_reverse):
    """
    Start the two ``pigz`` compressors that write the output files.

    ``TWrite`` threads are split between the two compressors.  Each
    compressor reads from a pipe (its ``stdin``) and writes straight into
    its output file.  Handlers for SIGINT and SIGTSTP are installed that
    terminate both compressors through ``signal_handler``.

    Returns the pair ``(outF, outR)`` of ``subprocess.Popen`` objects for the
    forward and reverse outputs.
    """
    from .pretreatment import _split_two

    tF, tR = _split_two(TWrite)

    # Open output files for writing.  The child process keeps its own copy of
    # the descriptor, so our file object is closed as soon as Popen returns
    # instead of staying open for the lifetime of the program.
    with open(output_forward, "wb") as forward_file:
        outF = subprocess.Popen(
            ["pigz", "-c", "-p", str(tF)],
            stdin=subprocess.PIPE,
            stdout=forward_file,
        )

    with open(output_reverse, "wb") as reverse_file:
        outR = subprocess.Popen(
            ["pigz", "-c", "-p", str(tR)],
            stdin=subprocess.PIPE,
            stdout=reverse_file,
        )

    def on_signal(sig, frame):
        signal_handler(sig, frame, outF, outR)

    # Register signal handlers
    signal.signal(signal.SIGINT, on_signal)  # Ctrl+C
    signal.signal(signal.SIGTSTP, on_signal)  # Ctrl+Z

    return outF, outR
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def manage_pigz_problems(outF, outR, output_forward, output_reverse):
    """
    Finish both compressor processes and report any failure on stdout.

    Both ``stdin`` pipes are closed first so the two compressors can finish
    at the same time, then each process is waited for.  A problem is
    reported when a process exits with a non-zero status or when
    ``communicate()`` returns error output; nothing is raised.

    ``output_forward`` / ``output_reverse`` are only used in the messages.
    """
    outF.stdin.close()
    outR.stdin.close()

    exit_forward = outF.wait()
    exit_reverse = outR.wait()

    for proc, exit_code, path in (
        (outF, exit_forward, output_forward),
        (outR, exit_reverse, output_reverse),
    ):
        _, stderr = proc.communicate()
        # stderr is None unless the process was started with stderr=PIPE, so
        # the exit status is the check that works in every configuration.
        if stderr or exit_code != 0:
            detail = stderr if stderr else f"exit code {exit_code}"
            print(
                f"Error in pigz command for file {path}: {detail}",
                flush=True,
            )
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def write_pairs(
    Output_buffer,
    outF: subprocess.Popen,
    outR: subprocess.Popen,
    TFrag,
) -> None:
    """
    Drain ``Output_buffer`` into the two output processes.

    Each item taken from the buffer is either ``None`` (one producer has
    finished) or an indexable pair whose element 0 is joined, UTF-8 encoded
    and written to ``outF.stdin`` and whose element 1 goes to ``outR.stdin``
    the same way.  The loop ends once ``TFrag`` ``None`` markers have been
    seen.  Any exception is logged and the loop carries on.
    """
    sentinels_seen = 0
    while sentinels_seen < TFrag:
        try:
            item = Output_buffer.get()
            if item is None:
                sentinels_seen += 1
                continue
            # Forward output first, then reverse.
            for index, sink in enumerate((outF, outR)):
                sink.stdin.write("".join(item[index]).encode("utf-8"))

        except Exception as e:
            logging.error(f"Error in write_pairs: {e}")
|
|
File without changes
|
split3c/resolve/bam.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
from typing import Iterator
|
|
2
|
+
|
|
3
|
+
import pysam
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_bam_headers(bam_for_path: str, bam_rev_path: str) -> tuple[dict, dict]:
    """
    Open the forward BAM, then the reverse BAM, and return their headers as
    a ``(forward, reverse)`` pair of dictionaries.

    Examples
    --------
    No doctest here because it requires real BAM files.
    """
    headers = []
    for bam_path in (bam_for_path, bam_rev_path):
        with pysam.AlignmentFile(bam_path, "rb") as bam:
            headers.append(bam.header.to_dict())
    header_for, header_rev = headers
    return header_for, header_rev
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_bam_header_single(bam_path: str) -> dict:
    """
    Open one BAM file and return its header converted to a dictionary.

    Examples
    --------
    No doctest here because it requires a real BAM file.
    """
    with pysam.AlignmentFile(bam_path, "rb") as bam_file:
        header_dict = bam_file.header.to_dict()
    return header_dict
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def iter_bam_pairs(
    bam_for_path: str,
    bam_rev_path: str,
    bam_threads: int = 1,
) -> Iterator[tuple[pysam.AlignedSegment, pysam.AlignedSegment]]:
    """
    Iterate over synchronized forward/reverse BAM records.

    The two BAMs must contain the same qname-sorted records in the same order.
    Records are read from both files in lockstep and yielded as
    ``(forward, reverse)`` tuples.

    Raises
    ------
    ValueError
        If one BAM runs out of records before the other, or if the two
        records at the same position have different query names.

    Examples
    --------
    No doctest here because it requires real BAM files.
    """
    # Local import: zip_longest keeps going after the shorter file ends, which
    # is what lets the record-count check below detect a length mismatch
    # (plain zip() would stop silently at the shorter file).
    from itertools import zip_longest

    with (
        pysam.AlignmentFile(bam_for_path, "rb", threads=bam_threads) as bam_for,
        pysam.AlignmentFile(bam_rev_path, "rb", threads=bam_threads) as bam_rev,
    ):
        paired = zip_longest(bam_for, bam_rev, fillvalue=None)
        for idx, (read_for, read_rev) in enumerate(paired, start=1):
            if read_for is None or read_rev is None:
                raise ValueError(
                    "Forward and reverse BAMs do not have the same number of records "
                    f"(first mismatch at record {idx})."
                )
            if read_for.query_name != read_rev.query_name:
                raise ValueError(
                    "Forward and reverse BAMs are not synchronized by qname "
                    f"at record {idx}: {read_for.query_name!r} != {read_rev.query_name!r}."
                )
            yield read_for, read_rev
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def iter_bam_pairs_single(
    bam_path: str,
    bam_threads: int = 1,
) -> Iterator[tuple[pysam.AlignedSegment, pysam.AlignedSegment]]:
    """
    Iterate over pairs from one interleaved BAM.

    Records are consumed two at a time and yielded as ``(read1, read2)``.
    The file is opened with ``int(bam_threads * 2)`` threads.

    Assumptions
    -----------
    - records are written as consecutive pairs
    - the two mates of one logical pair have the same query_name
    - the BAM contains an even number of records

    Raises
    ------
    ValueError
        If the file ends after the first record of a pair, or if the two
        records of a pair have different query names.

    Examples
    --------
    No doctest here because it requires real BAM files.
    """
    with pysam.AlignmentFile(bam_path, "rb", threads=int(bam_threads * 2)) as bam:
        records = iter(bam)

        # The for loop takes the first mate; the second mate is pulled from
        # the same iterator inside the body.
        for pair_idx, read1 in enumerate(records, start=1):
            try:
                read2 = next(records)
            except StopIteration as exc:
                raise ValueError(
                    "Single BAM contains an odd number of records; "
                    f"dangling read at pair index {pair_idx}: "
                    f"{read1.query_name!r}."
                ) from exc

            if read1.query_name != read2.query_name:
                raise ValueError(
                    "Single BAM is not properly interleaved by qname "
                    f"at pair index {pair_idx}: "
                    f"{read1.query_name!r} != {read2.query_name!r}."
                )

            yield read1, read2
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def chromsizes_from_header(header_dict: dict) -> list[tuple[str, int]]:
    """
    Collect ``(name, length)`` tuples from the ``"SQ"`` entries of a BAM
    header dictionary, in header order.

    Entries whose ``"SN"`` or ``"LN"`` value is missing or ``None`` are
    skipped; a header without ``"SQ"`` gives an empty list.

    Examples
    --------
    >>> chromsizes_from_header({"SQ": [{"SN": "chr1", "LN": 100}, {"SN": "chr2", "LN": 50}]})
    [('chr1', 100), ('chr2', 50)]
    """
    return [
        (str(entry.get("SN")), int(entry.get("LN")))
        for entry in header_dict.get("SQ", [])
        if entry.get("SN") is not None and entry.get("LN") is not None
    ]
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import os
|
|
3
|
+
import shlex
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TextIO
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
|
|
12
|
+
class TextWriter:
|
|
13
|
+
handle: TextIO
|
|
14
|
+
process: subprocess.Popen | None = None
|
|
15
|
+
outfile: None = None
|
|
16
|
+
|
|
17
|
+
def write(self, text: str) -> int:
|
|
18
|
+
return self.handle.write(text)
|
|
19
|
+
|
|
20
|
+
def flush(self) -> None:
|
|
21
|
+
self.handle.flush()
|
|
22
|
+
if self.outfile is not None:
|
|
23
|
+
self.outfile.flush()
|
|
24
|
+
|
|
25
|
+
def close(self) -> None:
|
|
26
|
+
try:
|
|
27
|
+
self.handle.close()
|
|
28
|
+
finally:
|
|
29
|
+
if self.process is not None:
|
|
30
|
+
ret = self.process.wait()
|
|
31
|
+
if ret != 0:
|
|
32
|
+
raise RuntimeError(
|
|
33
|
+
f"Compression command failed with exit code {ret}."
|
|
34
|
+
)
|
|
35
|
+
if self.outfile is not None and not self.outfile.closed:
|
|
36
|
+
self.outfile.close()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _pick_gzip_command(nproc: int) -> list[str] | None:
|
|
40
|
+
candidates = []
|
|
41
|
+
if shutil.which("pbgzip"):
|
|
42
|
+
candidates.append(["pbgzip", "-c", "-n", str(max(1, nproc))])
|
|
43
|
+
if shutil.which("bgzip"):
|
|
44
|
+
candidates.append(["bgzip", "-c", "-@", str(max(1, nproc))])
|
|
45
|
+
if shutil.which("pigz"):
|
|
46
|
+
candidates.append(["pigz", "-c", "-p", str(max(1, nproc))])
|
|
47
|
+
if shutil.which("gzip"):
|
|
48
|
+
candidates.append(["gzip", "-c"])
|
|
49
|
+
return candidates[0] if candidates else None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def open_text_output(path: str | Path, nproc: int = 1) -> TextWriter:
    """
    Open ``path`` for UTF-8 text output and return a ``TextWriter``.

    - ``"-"`` writes to a duplicate of file descriptor 1.
    - A path ending in ``.gz`` is written through the compressor chosen by
      ``_pick_gzip_command(nproc)``, whose output goes to the file.
    - Any other path is opened as a plain text file.

    Raises:
        RuntimeError: If a ``.gz`` path is requested but no compressor is
                      available, or if the compressor has no stdin pipe.
    """
    path = str(path)
    if path == "-":
        return TextWriter(
            handle=io.TextIOWrapper(os.fdopen(os.dup(1), "wb"), encoding="utf-8")
        )

    if not path.endswith(".gz"):
        return TextWriter(handle=open(path, "w", encoding="utf-8"))

    cmd = _pick_gzip_command(nproc)
    if cmd is None:
        raise RuntimeError(
            "No gzip-compatible compressor found (tried pbgzip, bgzip, pigz, gzip)."
        )

    outfile = open(path, "wb")
    try:
        proc = subprocess.Popen(
            cmd, stdin=subprocess.PIPE, stdout=outfile, stderr=subprocess.PIPE
        )
    except Exception:
        # Do not leave the output file open when the compressor cannot start.
        outfile.close()
        raise

    if proc.stdin is None:
        outfile.close()
        raise RuntimeError(
            f"Failed to open compressor stdin for: {shlex.join(cmd)}"
        )
    handle = io.TextIOWrapper(proc.stdin, encoding="utf-8")
    return TextWriter(handle=handle, process=proc, outfile=outfile)
|