split3c 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- split3c/__init__.py +0 -0
- split3c/cli.py +336 -0
- split3c/nssite/__init__.py +0 -0
- split3c/nssite/auxiliary.py +190 -0
- split3c/nssite/bam.py +299 -0
- split3c/nssite/fastq.py +148 -0
- split3c/nssite/main.py +368 -0
- split3c/nssite/processmanager.py +51 -0
- split3c/nssite/split.py +849 -0
- split3c/resite/__init__.py +33 -0
- split3c/resite/frag.py +576 -0
- split3c/resite/header.py +91 -0
- split3c/resite/index.py +236 -0
- split3c/resite/main.py +506 -0
- split3c/resite/pretreatment.py +299 -0
- split3c/resite/read.py +91 -0
- split3c/resite/write_control.py +111 -0
- split3c/resolve/__init__.py +0 -0
- split3c/resolve/bam.py +129 -0
- split3c/resolve/io_utils.py +77 -0
- split3c/resolve/main.py +506 -0
- split3c/resolve/pairs.py +56 -0
- split3c/resolve/parse.py +1218 -0
- split3c-0.0.1.dist-info/METADATA +100 -0
- split3c-0.0.1.dist-info/RECORD +29 -0
- split3c-0.0.1.dist-info/WHEEL +5 -0
- split3c-0.0.1.dist-info/entry_points.txt +5 -0
- split3c-0.0.1.dist-info/licenses/LICENSE +235 -0
- split3c-0.0.1.dist-info/top_level.txt +1 -0
split3c/nssite/bam.py
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
def get_bam_headers(bam_for_path, bam_rev_path):
    """
    Read the headers of a forward/reverse pair of BAM files.

    Each file is opened in binary read mode, its header is converted to a
    plain dictionary, and the file is closed again before the next one is
    opened.

    Parameters:
        bam_for_path: Path to the forward BAM file.
        bam_rev_path: Path to the reverse BAM file.

    Returns:
        tuple: (header_dict_forward, header_dict_reverse)
    """
    import pysam

    header_dicts = []
    # Forward first, then reverse: same order as the returned tuple.
    for bam_path in (bam_for_path, bam_rev_path):
        with pysam.AlignmentFile(bam_path, "rb") as bam:
            header_dicts.append(bam.header.to_dict())

    forward_header, reverse_header = header_dicts
    return forward_header, reverse_header
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def read_bam_pair(
    bam_for_file,
    bam_rev_file,
    input_queue,
    num_processes,
    bam_threads=1,
    batch_size=500,
):
    """
    Read two BAM files in lockstep and put batches of read pairs on a queue.

    Record i of the forward file is paired with record i of the reverse
    file. Each pair is stored as a tuple of SAM text lines
    ``(forward, reverse)`` obtained with ``to_string()``, so batches travel
    through the queue as plain strings.

    Parameters:
        bam_for_file (str): Path to the forward BAM file.
        bam_rev_file (str): Path to the reverse BAM file.
        input_queue (Queue): Queue receiving the batches (lists of pairs).
        num_processes (int): Number of ``None`` sentinels put on the queue
            once reading stops, whether it succeeded or failed.
        bam_threads (int): Value passed as ``threads`` to both
            ``pysam.AlignmentFile`` readers.
        batch_size (int): Number of pairs accumulated before a batch is put
            on the queue; the last batch may be smaller.

    Raises:
        SystemExit: With exit code 1 after reporting any error on stderr,
            including the two files holding different numbers of records.
    """
    import sys
    from itertools import zip_longest

    import pysam

    # Fill value identifying the file that ran out of records first.
    exhausted = object()

    batch = []
    try:
        with (
            pysam.AlignmentFile(bam_for_file, "rb", threads=bam_threads) as bam_for,
            pysam.AlignmentFile(bam_rev_file, "rb", threads=bam_threads) as bam_rev,
        ):
            # A plain zip() would stop at the shorter file and silently drop
            # the remaining records of the other one. A length mismatch means
            # the inputs are not properly paired, so it is reported as an
            # error, like a dangling record in the interleaved reader.
            for read_for, read_rev in zip_longest(
                bam_for, bam_rev, fillvalue=exhausted
            ):
                if read_for is exhausted or read_rev is exhausted:
                    raise ValueError(
                        "forward and reverse BAM files do not contain "
                        "the same number of records"
                    )

                # NOTE(review): guard kept from the original code; iteration
                # only yields real records, so it presumably always passes.
                if read_for and read_rev:
                    # Convert read objects to serializable format
                    batch.append((read_for.to_string(), read_rev.to_string()))

                if len(batch) >= batch_size:
                    input_queue.put(batch)
                    batch = []

            if batch:
                input_queue.put(batch)

    except Exception as e:
        # Errors go to stderr, like the other BAM readers/writers of this file.
        print(f"Error: with {bam_for_file} or {bam_rev_file}, {e}", file=sys.stderr)
        sys.exit(1)
    finally:
        # Always release the consumers, even when reading failed.
        for _ in range(num_processes):
            input_queue.put(None)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def write_bam_pair_from_sam(
    queue,
    out_bam_f_path,
    out_bam_r_path,
    header_for_dict,
    header_rev_dict,
    num_procs_finished_signal,
    bam_threads=1,
):
    """
    Write queued SAM-text read pairs to a forward and a reverse BAM file.

    The two header dictionaries are turned back into pysam headers, both
    output files are opened for BAM writing, and batches are consumed from
    the queue until ``num_procs_finished_signal`` ``None`` sentinels have
    been received.

    Parameters:
        queue: Queue yielding batches ``list[(sam_f, sam_r)]`` or ``None``.
        out_bam_f_path: Path of the forward output BAM.
        out_bam_r_path: Path of the reverse output BAM.
        header_for_dict (dict): Header of the forward BAM as a dictionary.
        header_rev_dict (dict): Header of the reverse BAM as a dictionary.
        num_procs_finished_signal (int): Number of ``None`` sentinels to
            wait for before closing the outputs.
        bam_threads (int): Value passed as ``threads`` to both writers.

    Raises:
        SystemExit: With exit code 1 after reporting any error on stderr.
    """
    import sys

    import pysam

    try:
        fwd_header = pysam.AlignmentHeader.from_dict(header_for_dict)
        rev_header = pysam.AlignmentHeader.from_dict(header_rev_dict)

        with pysam.AlignmentFile(
            out_bam_f_path, "wb", header=fwd_header, threads=bam_threads
        ) as fwd_bam:
            with pysam.AlignmentFile(
                out_bam_r_path, "wb", header=rev_header, threads=bam_threads
            ) as rev_bam:
                sentinels_seen = 0
                while sentinels_seen < num_procs_finished_signal:
                    item = queue.get()

                    if item is None:
                        # One producer has finished.
                        sentinels_seen += 1
                    else:
                        # item = list[(sam_f, sam_r)]
                        for fwd_sam, rev_sam in item:
                            # Rebuild both records before writing either one.
                            fwd_read = pysam.AlignedSegment.fromstring(
                                fwd_sam, fwd_header
                            )
                            rev_read = pysam.AlignedSegment.fromstring(
                                rev_sam, rev_header
                            )

                            fwd_bam.write(fwd_read)
                            rev_bam.write(rev_read)

    except Exception as e:
        print(f"Error in write_bam_pair_from_sam: {e}", file=sys.stderr)
        sys.exit(1)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
#########
|
|
119
|
+
# One bam option
|
|
120
|
+
########
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def get_bam_header_single(bam_path):
    """
    Read the header of a single BAM file.

    The file is opened in binary read mode, its header is converted to a
    dictionary, and the file is closed before returning.

    Parameters
    ----------
    bam_path
        Path to the BAM file.

    Returns
    -------
    dict
        BAM header as a dictionary.
    """
    import pysam

    with pysam.AlignmentFile(bam_path, "rb") as bam_handle:
        header_dict = bam_handle.header.to_dict()
    return header_dict
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _pair_reads_from_single_bam(read_a, read_b, strict=True):
    """
    Order two consecutive records of an interleaved BAM as (read1, read2).

    Parameters
    ----------
    read_a, read_b : pysam.AlignedSegment
        Two consecutive records, in their order of appearance in the file.
        Only ``query_name``, ``is_read1`` and ``is_read2`` are read.
    strict : bool
        Controls what happens when both records carry the same mate flag
        (both read1 or both read2). If True, a ValueError is raised. If
        False, the order of appearance is used.

    Returns
    -------
    tuple
        (read_for, read_rev)

    Raises
    ------
    ValueError
        If one record is None, if the two query names differ (whatever the
        value of ``strict``), or if ``strict`` is True and the mate flags
        contradict each other.

    Notes
    -----
    The 0x40 / 0x80 flags (is_read1 / is_read2) take priority: when they
    unambiguously say that the records appear as (read2, read1), the pair
    is swapped. When the flags are absent or not conclusive, the order of
    appearance is kept.
    """
    if read_a is None or read_b is None:
        raise ValueError("Pairing failure: one of the reads is None.")

    if read_a.query_name != read_b.query_name:
        raise ValueError(
            f"Interleaved BAM not synchronized: "
            f"{read_a.query_name!r} != {read_b.query_name!r}"
        )

    a_first, a_second = bool(read_a.is_read1), bool(read_a.is_read2)
    b_first, b_second = bool(read_b.is_read1), bool(read_b.is_read2)

    # Flags unambiguously describe a reversed pair: put read1 first.
    if a_second and b_first and not a_first and not b_second:
        return read_b, read_a

    # Both records claim to be the same mate: the flags cannot be trusted.
    if strict and ((a_first and b_first) or (a_second and b_second)):
        raise ValueError(
            f"Inconsistent mate flags for {read_a.query_name!r}: "
            f"both records are flagged as the same mate"
        )

    # Permissive fallback: order of appearance
    return read_a, read_b
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def read_bam_interleaved(
    bam_file,
    input_queue,
    num_processes,
    bam_threads=1,
    batch_size=500,
    strict=True,
):
    """
    Read one interleaved BAM and put batches of SAM-text pairs on a queue.

    Records are consumed two at a time: the first record of each pair is
    held back until the next one arrives, then both are handed to
    ``_pair_reads_from_single_bam`` and stored as SAM text.

    Output contract identical to read_bam_pair:
        batch = list[(sam_f, sam_r)]

    Parameters
    ----------
    bam_file : str
        Path to a single interleaved BAM.
    input_queue : multiprocessing.Queue
        Queue receiving the batches.
    num_processes : int
        Number of ``None`` sentinels put on the queue once reading stops,
        whether it succeeded or failed.
    bam_threads : int
        Value passed as ``threads`` to ``pysam.AlignmentFile``.
    batch_size : int
        Number of pairs per batch; the last batch may be smaller.
    strict : bool
        Forwarded unchanged to ``_pair_reads_from_single_bam``.

    Requirements
    ------------
    The BAM must be name-ordered, or at least have both mates of a pair
    on consecutive records.

    Raises
    ------
    SystemExit
        With exit code 1 after reporting any error on stderr, including a
        record left without a partner at the end of the file.
    """
    import sys

    import pysam

    pairs = []
    held_back = None

    try:
        with pysam.AlignmentFile(bam_file, "rb", threads=bam_threads) as bam:
            for record in bam:
                if held_back is not None:
                    mate_for, mate_rev = _pair_reads_from_single_bam(
                        held_back, record, strict=strict
                    )
                    pairs.append((mate_for.to_string(), mate_rev.to_string()))
                    held_back = None

                    if len(pairs) >= batch_size:
                        input_queue.put(pairs)
                        pairs = []
                else:
                    # First record of a pair: wait for the next one.
                    held_back = record

            if held_back is not None:
                raise ValueError(
                    f"Odd number of records in interleaved BAM or dangling read: "
                    f"{held_back.query_name!r}"
                )

            if pairs:
                input_queue.put(pairs)

    except Exception as e:
        print(f"Error with interleaved BAM {bam_file}: {e}", file=sys.stderr)
        sys.exit(1)
    finally:
        # Always release the consumers, even when reading failed.
        for _ in range(num_processes):
            input_queue.put(None)
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def write_bam_interleaved_from_sam(
    queue,
    out_bam_path,
    header_dict,
    num_procs_finished_signal,
    bam_threads=1,
):
    """
    Write queued SAM-text read pairs to one interleaved BAM file.

    Input contract:
        queue yields batches list[(sam_f, sam_r)], or ``None`` sentinels.

    Output:
        a single BAM in which each read1 is immediately followed by its
        read2.

    Parameters
    ----------
    queue : multiprocessing.Queue
        Source of batches and sentinels.
    out_bam_path : str
        Path of the output BAM.
    header_dict : dict
        BAM header as a dictionary, used for the output file and to rebuild
        every record.
    num_procs_finished_signal : int
        Number of ``None`` sentinels to wait for before closing the output.
    bam_threads : int
        Value passed as ``threads`` to ``pysam.AlignmentFile``.

    Raises
    ------
    SystemExit
        With exit code 1 after reporting any error on stderr.
    """
    import sys

    import pysam

    try:
        bam_header = pysam.AlignmentHeader.from_dict(header_dict)
        writer = pysam.AlignmentFile(
            out_bam_path,
            "wb",
            header=bam_header,
            threads=bam_threads,
        )

        with writer as out_bam:
            sentinels_seen = 0
            while sentinels_seen < num_procs_finished_signal:
                item = queue.get()

                if item is None:
                    # One producer has finished.
                    sentinels_seen += 1
                else:
                    for first_sam, second_sam in item:
                        # Rebuild both records before writing either one.
                        first_read = pysam.AlignedSegment.fromstring(
                            first_sam, bam_header
                        )
                        second_read = pysam.AlignedSegment.fromstring(
                            second_sam, bam_header
                        )

                        out_bam.write(first_read)
                        out_bam.write(second_read)

    except Exception as e:
        print(f"Error in write_bam_interleaved_from_sam: {e}", file=sys.stderr)
        sys.exit(1)
|
split3c/nssite/fastq.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
def open_output(output_forward, output_reverse, write_processes):
    """
    Start two pigz processes compressing to the forward and reverse outputs.

    Each pigz process reads from a pipe (its ``stdin``) and writes the
    compressed stream to the corresponding output file. SIGINT and SIGTSTP
    handlers are then installed; they forward the signal, together with
    both processes, to ``signal_handler`` from the ``auxiliary`` module.

    Parameters:
        output_forward (str): Path to the forward output file.
        output_reverse (str): Path to the reverse output file.
        write_processes (int): Value passed to pigz with ``-p``.

    Returns:
        out_f (subprocess.Popen): Process for the forward output.
        out_r (subprocess.Popen): Process for the reverse output.
    """
    import signal
    import subprocess

    from .auxiliary import signal_handler

    pigz_cmd = ["pigz", "-c", "-p", str(write_processes)]

    # The child process gets its own copy of the output descriptor, so the
    # parent's file object is closed as soon as pigz is started (also when
    # starting pigz fails) instead of being leaked.
    with open(file=output_forward, mode="wb") as sink_f:
        out_f = subprocess.Popen(
            args=pigz_cmd,
            stdin=subprocess.PIPE,
            stdout=sink_f,
        )

    try:
        with open(file=output_reverse, mode="wb") as sink_r:
            out_r = subprocess.Popen(
                args=pigz_cmd,
                stdin=subprocess.PIPE,
                stdout=sink_r,
            )
    except Exception:
        # Do not leave the forward pigz running when the reverse one could
        # not be started.
        if out_f.stdin is not None:
            out_f.stdin.close()
        out_f.terminate()
        out_f.wait()
        raise

    def _forward_signal(sig, frame):
        # Give the project-level handler access to both pigz processes.
        return signal_handler(sig=sig, frame=frame, out_f=out_f, out_r=out_r)

    # Register signal handlers
    signal.signal(signal.SIGINT, _forward_signal)
    signal.signal(signal.SIGTSTP, _forward_signal)
    return out_f, out_r
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def write_fastq_pair(
    output_queue, output_forward, output_reverse, num_process, write_processes
):
    """
    Stream FastQ text blocks from a queue into two pigz-compressed files.

    The outputs are opened with ``open_output``. Every queue item is
    indexed as ``(forward_block, reverse_block)``; both blocks are encoded
    as UTF-8 and written to the stdin of the matching pigz process. The
    loop ends once ``num_process`` ``None`` sentinels have been received,
    after which ``ensure_ending`` shuts both processes down.

    Parameters:
        output_queue (Queue): Queue to get processed read pairs.
        output_forward (str): Path to the forward output file.
        output_reverse (str): Path to the reverse output file.
        num_process (int): Number of ``None`` sentinels to wait for.
        write_processes (int): Forwarded to ``open_output``.

    Raises:
        SystemExit: With exit code 1 when reading the queue or writing
            fails; the error is printed and ``manage_pigz_errors`` is
            called first.
    """
    import sys

    out_f, out_r = open_output(output_forward, output_reverse, write_processes)
    producers_left = num_process
    try:
        while producers_left > 0:
            item = output_queue.get()
            if item is None:
                # One producer has finished.
                producers_left -= 1
                continue

            forward_text = item[0]
            reverse_text = item[1]

            if out_f.stdin is None or out_r.stdin is None:
                raise ValueError("pigz stdin closed")

            out_f.stdin.write(forward_text.encode("utf-8"))
            out_r.stdin.write(reverse_text.encode("utf-8"))
    except Exception as e:
        print(f"Error in write_pairs: {e}")
        manage_pigz_errors(out_f, out_r, output_forward, output_reverse)
        sys.exit(1)
    ensure_ending(out_f, out_r)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def ensure_ending(out_f, out_r):
    """
    Shut down the forward and reverse output processes.

    The shutdown is done in three phases, each applied to ``out_f`` and
    then to ``out_r``: close ``stdin`` when it is not None, ``wait()``,
    then ``terminate()``.

    Examples
    --------
    >>> class _P:
    ...     def __init__(self):
    ...         self.stdin = self
    ...         self.closed = False
    ...         self.waited = False
    ...         self.terminated = False
    ...     def close(self):
    ...         self.closed = True
    ...     def wait(self):
    ...         self.waited = True
    ...     def terminate(self):
    ...         self.terminated = True
    ...
    >>> pf, pr = _P(), _P()
    >>> ensure_ending(pf, pr)
    >>> (pf.closed, pr.closed, pf.waited, pr.waited, pf.terminated, pr.terminated)
    (True, True, True, True, True, True)
    """
    processes = (out_f, out_r)

    # Phase 1: close the input pipes that exist.
    for proc in processes:
        if proc.stdin is not None:
            proc.stdin.close()

    # Phase 2: wait for both processes.
    for proc in processes:
        proc.wait()

    # Phase 3: terminate() is still called after wait(), as in the
    # original shutdown sequence.
    for proc in processes:
        proc.terminate()
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def manage_pigz_errors(out_f, out_r, output_forward, output_reverse):
    """
    Shut down both pigz processes and report the ones that failed.

    Both ``stdin`` pipes are closed when they are not None, both processes
    are waited for, a message is printed for each process whose
    ``returncode`` is not 0, and ``terminate()`` is finally called on both.

    Parameters:
        out_f: Process for the forward output.
        out_r: Process for the reverse output.
        output_forward: Forward output path, used in the error message.
        output_reverse: Reverse output path, used in the error message.

    Examples
    --------
    >>> class _P:
    ...     def __init__(self, rc): self.stdin=None; self.returncode=rc; self.waited=False; self.terminated=False
    ...     def wait(self): self.waited=True
    ...     def terminate(self): self.terminated=True
    ...
    >>> pf, pr = _P(0), _P(0)
    >>> manage_pigz_errors(pf, pr, 'F.fq.gz', 'R.fq.gz')
    >>> pf.waited and pr.waited and pf.terminated and pr.terminated
    True
    >>> pf, pr = _P(1), _P(2)
    >>> manage_pigz_errors(pf, pr, 'F.fq.gz', 'R.fq.gz')  # doctest: +ELLIPSIS
    Error in pigz command for file F.fq.gz
    Error in pigz command for file R.fq.gz
    """
    targets = ((out_f, output_forward), (out_r, output_reverse))

    # Close the input pipes that exist.
    for proc, _ in targets:
        if proc.stdin is not None:
            proc.stdin.close()

    for proc, _ in targets:
        proc.wait()

    # Report each process that did not exit with status 0.
    for proc, destination in targets:
        if proc.returncode != 0:
            print(f"Error in pigz command for file {destination}")

    for proc, _ in targets:
        proc.terminate()
|