split3c 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
split3c/nssite/bam.py ADDED
@@ -0,0 +1,299 @@
1
def get_bam_headers(bam_for_path, bam_rev_path):
    """
    Return the headers of a forward/reverse BAM pair as dictionaries.

    Each file is opened in "rb" mode, its header is converted with
    ``header.to_dict()``, and the file is closed again before the next one
    is opened (forward first, then reverse).

    Parameters:
        bam_for_path (str): Path to the forward BAM file.
        bam_rev_path (str): Path to the reverse BAM file.

    Returns:
        tuple: (header_dict_forward, header_dict_reverse)
    """
    import pysam

    def _load_header(path):
        # The context manager closes the file as soon as the header is copied.
        with pysam.AlignmentFile(path, "rb") as handle:
            return handle.header.to_dict()

    forward_header = _load_header(bam_for_path)
    reverse_header = _load_header(bam_rev_path)
    return forward_header, reverse_header
20
+
21
+
22
def read_bam_pair(
    bam_for_file,
    bam_rev_file,
    input_queue,
    num_processes,
    bam_threads=1,
    batch_size=500,
):
    """
    Read two BAM files in lockstep and put batches of read pairs on a queue.

    Records are paired purely by position: the i-th record of the forward
    file is paired with the i-th record of the reverse file. Because ``zip``
    drives the iteration, reading stops at the end of the shorter file and
    extra records in the longer file are not reported.

    Parameters:
        bam_for_file (str): Path to the forward BAM file.
        bam_rev_file (str): Path to the reverse BAM file.
        input_queue (Queue): Queue receiving the batches. Each batch is a
            list of ``(forward_sam, reverse_sam)`` string tuples.
        num_processes (int): Number of ``None`` sentinels put on the queue
            once reading stops, whether it succeeded or failed.
        bam_threads (int): ``threads`` value given to each
            ``pysam.AlignmentFile``.
        batch_size (int): Number of pairs accumulated before a batch is put
            on the queue.

    Raises:
        SystemExit: With status 1 when any exception is raised while opening
            or reading the files; the message is written to stderr.
    """
    import sys

    import pysam

    batch = []
    try:
        with (
            pysam.AlignmentFile(bam_for_file, "rb", threads=bam_threads) as bam_for,
            pysam.AlignmentFile(bam_rev_file, "rb", threads=bam_threads) as bam_rev,
        ):
            for read_for, read_rev in zip(bam_for, bam_rev):
                if read_for and read_rev:
                    # Convert read objects to a serializable format (SAM text).
                    batch.append((read_for.to_string(), read_rev.to_string()))

                if len(batch) >= batch_size:
                    input_queue.put(batch)
                    batch = []

            # Flush the last, possibly partial, batch.
            if batch:
                input_queue.put(batch)

    except Exception as e:
        # Diagnostics go to stderr, like the other BAM readers/writers in
        # this module, so they never mix with data written to stdout.
        print(f"Error: with {bam_for_file} or {bam_rev_file}, {e}", file=sys.stderr)
        sys.exit(1)
    finally:
        # Always emit one sentinel per worker, even on failure.
        for _ in range(num_processes):
            input_queue.put(None)
67
+
68
+
69
def write_bam_pair_from_sam(
    queue,
    out_bam_f_path,
    out_bam_r_path,
    header_for_dict,
    header_rev_dict,
    num_procs_finished_signal,
    bam_threads=1,
):
    """
    Write queued SAM-text read pairs to a forward and a reverse BAM file.

    The queue is expected to deliver batches shaped ``list[(sam_f, sam_r)]``
    and ``None`` sentinels. Writing stops once ``num_procs_finished_signal``
    sentinels have been received.

    Parameters:
        queue (Queue): Source of batches and ``None`` sentinels.
        out_bam_f_path (str): Path of the forward output BAM.
        out_bam_r_path (str): Path of the reverse output BAM.
        header_for_dict (dict): Header dictionary for the forward BAM.
        header_rev_dict (dict): Header dictionary for the reverse BAM.
        num_procs_finished_signal (int): Number of sentinels to wait for.
        bam_threads (int): ``threads`` value given to each output file.

    Raises:
        SystemExit: With status 1 on any exception; the message is written
            to stderr.
    """
    import sys

    import pysam

    try:
        fwd_header = pysam.AlignmentHeader.from_dict(header_for_dict)
        rev_header = pysam.AlignmentHeader.from_dict(header_rev_dict)
        parse_sam = pysam.AlignedSegment.fromstring

        with pysam.AlignmentFile(
            out_bam_f_path, "wb", header=fwd_header, threads=bam_threads
        ) as fwd_out:
            with pysam.AlignmentFile(
                out_bam_r_path, "wb", header=rev_header, threads=bam_threads
            ) as rev_out:
                sentinels_left = num_procs_finished_signal
                while sentinels_left > 0:
                    item = queue.get()

                    if item is None:
                        sentinels_left -= 1
                        continue

                    # item = list[(sam_f, sam_r)]
                    for fwd_text, rev_text in item:
                        # Parse both mates before writing either of them.
                        fwd_read = parse_sam(fwd_text, fwd_header)
                        rev_read = parse_sam(rev_text, rev_header)

                        fwd_out.write(fwd_read)
                        rev_out.write(rev_read)

    except Exception as exc:
        print(f"Error in write_bam_pair_from_sam: {exc}", file=sys.stderr)
        sys.exit(1)
116
+
117
+
118
+ #########
119
+ # One bam option
120
+ ########
121
+
122
+
123
def get_bam_header_single(bam_path):
    """
    Return the header of a single BAM file as a dictionary.

    The file is opened in "rb" mode and closed again before returning.

    Parameters
    ----------
    bam_path : str
        Path to the BAM file.

    Returns
    -------
    dict
        BAM header as produced by ``header.to_dict()``.
    """
    import pysam

    with pysam.AlignmentFile(bam_path, "rb") as alignment:
        header_dict = alignment.header.to_dict()
    return header_dict
137
+
138
+
139
def _pair_reads_from_single_bam(read_a, read_b, strict=True):
    """
    Validate two consecutive records of an interleaved BAM and return them
    as a (forward, reverse) pair.

    Parameters
    ----------
    read_a, read_b : pysam.AlignedSegment
        Two consecutive records; only ``query_name`` is inspected.
    strict : bool
        Accepted for interface compatibility. It is not consulted by the
        current implementation.

    Returns
    -------
    tuple
        (read_for, read_rev), always in order of appearance
        ``(read_a, read_b)``.

    Raises
    ------
    ValueError
        If either read is None, or if the two query names differ.

    Notes
    -----
    NOTE(review): the read1/read2 flags (0x40 / 0x80) are not examined
    here; the pair is ordered by position in the file only.
    """
    if any(candidate is None for candidate in (read_a, read_b)):
        raise ValueError("Pairing failure: one of the reads is None.")

    name_a = read_a.query_name
    name_b = read_b.query_name
    if name_a != name_b:
        raise ValueError(
            f"Interleaved BAM not synchronized: {name_a!r} != {name_b!r}"
        )

    # Permissive fallback: keep the order of appearance.
    ordered_pair = (read_a, read_b)
    return ordered_pair
170
+
171
+
172
def read_bam_interleaved(
    bam_file,
    input_queue,
    num_processes,
    bam_threads=1,
    batch_size=500,
    strict=True,
):
    """
    Read a single interleaved BAM (one forward/read1 record followed by its
    reverse/read2 record) and put batches of SAM-text pairs on input_queue.

    The output contract is the same as read_bam_pair:
        batch = list[(sam_f, sam_r)]

    Parameters
    ----------
    bam_file : str
        Path to a single interleaved BAM.
    input_queue : multiprocessing.Queue
        Queue receiving the batches and the final ``None`` sentinels.
    num_processes : int
        Number of compute workers; one ``None`` sentinel is sent per worker.
    bam_threads : int
        pysam/htslib threads.
    batch_size : int
        Number of pairs per batch.
    strict : bool
        Forwarded unchanged to ``_pair_reads_from_single_bam``.

    Requirements
    ------------
    The BAM must be name-sorted, or at least have both mates on consecutive
    records.

    Raises
    ------
    SystemExit
        With status 1 on any exception (including an odd number of records);
        the message is written to stderr. Sentinels are still sent.
    """
    import sys

    import pysam

    pairs = []

    try:
        with pysam.AlignmentFile(bam_file, "rb", threads=bam_threads) as bam:
            records = iter(bam)
            for first in records:
                # Pull the mate straight from the same iterator.
                second = next(records, None)
                if second is None:
                    # The file ended between two mates.
                    raise ValueError(
                        f"Odd number of records in interleaved BAM or dangling read: "
                        f"{first.query_name!r}"
                    )

                read_for, read_rev = _pair_reads_from_single_bam(
                    first, second, strict=strict
                )
                pairs.append((read_for.to_string(), read_rev.to_string()))

                if len(pairs) >= batch_size:
                    input_queue.put(pairs)
                    pairs = []

            # Flush the last, possibly partial, batch.
            if pairs:
                input_queue.put(pairs)

    except Exception as exc:
        print(f"Error with interleaved BAM {bam_file}: {exc}", file=sys.stderr)
        sys.exit(1)
    finally:
        # One sentinel per worker, on success and on failure alike.
        for _ in range(num_processes):
            input_queue.put(None)
243
+
244
+
245
def write_bam_interleaved_from_sam(
    queue,
    out_bam_path,
    header_dict,
    num_procs_finished_signal,
    bam_threads=1,
):
    """
    Write queued SAM-text read pairs to a single interleaved BAM.

    Input contract:
        queue delivers batches shaped list[(sam_f, sam_r)] and ``None``
        sentinels.

    Output:
        a single BAM in which each read1 is immediately followed by its
        read2.

    Parameters
    ----------
    queue : multiprocessing.Queue
        Source of batches and ``None`` sentinels.
    out_bam_path : str
        Path of the output BAM.
    header_dict : dict
        Header dictionary used for the output file and for parsing records.
    num_procs_finished_signal : int
        Number of ``None`` sentinels to receive before stopping.
    bam_threads : int
        ``threads`` value given to the output file.

    Raises
    ------
    SystemExit
        With status 1 on any exception; the message is written to stderr.
    """
    import sys

    import pysam

    try:
        bam_header = pysam.AlignmentHeader.from_dict(header_dict)
        parse_sam = pysam.AlignedSegment.fromstring

        with pysam.AlignmentFile(
            out_bam_path,
            "wb",
            header=bam_header,
            threads=bam_threads,
        ) as sink:
            sentinels_left = num_procs_finished_signal
            while sentinels_left > 0:
                item = queue.get()

                if item is None:
                    sentinels_left -= 1
                    continue

                for fwd_text, rev_text in item:
                    # Parse both mates before writing either of them.
                    fwd_read = parse_sam(fwd_text, bam_header)
                    rev_read = parse_sam(rev_text, bam_header)

                    sink.write(fwd_read)
                    sink.write(rev_read)

    except Exception as exc:
        print(f"Error in write_bam_interleaved_from_sam: {exc}", file=sys.stderr)
        sys.exit(1)
@@ -0,0 +1,148 @@
1
def open_output(output_forward, output_reverse, write_processes):
    """
    Open output files for writing with pigz compression.

    Two ``pigz -c -p <write_processes>`` processes are started. Each reads
    from a pipe on its stdin and writes its compressed stream to the
    corresponding output file, which is opened in "wb" mode.

    Parameters:
        output_forward (str): Path to the forward output file.
        output_reverse (str): Path to the reverse output file.
        write_processes (int): Value passed to pigz ``-p``.

    Returns:
        out_f (subprocess.Popen): Process for the forward output.
        out_r (subprocess.Popen): Process for the reverse output.

    Side effects:
        Installs handlers for SIGINT and SIGTSTP that delegate to
        ``signal_handler`` from ``.auxiliary`` with both processes.
    """
    import signal
    import subprocess

    from .auxiliary import signal_handler

    pigz_cmd = ["pigz", "-c", "-p", str(write_processes)]

    def _spawn(path):
        # The child receives its own copy of the descriptor, so the parent's
        # handle is closed as soon as Popen returns instead of leaking until
        # garbage collection.
        with open(file=path, mode="wb") as sink:
            return subprocess.Popen(
                args=pigz_cmd,
                stdin=subprocess.PIPE,
                stdout=sink,
            )

    out_f = _spawn(output_forward)
    try:
        out_r = _spawn(output_reverse)
    except Exception:
        # Do not leave the first pigz running if the second cannot start.
        out_f.kill()
        out_f.wait()
        raise

    def _on_signal(sig, frame):
        return signal_handler(sig=sig, frame=frame, out_f=out_f, out_r=out_r)

    # Register signal handlers
    signal.signal(signal.SIGINT, _on_signal)
    signal.signal(signal.SIGTSTP, _on_signal)
    return out_f, out_r
45
+
46
+
47
def write_fastq_pair(
    output_queue, output_forward, output_reverse, num_process, write_processes
):
    """
    Write FastQ file pairs to the output using data from the output queue.

    Each queue item is either ``None`` (one producer has finished) or an
    indexable object whose element 0 is the forward text block and element 1
    the reverse text block. Blocks are UTF-8 encoded and written to the
    stdin of the pigz processes returned by ``open_output``.

    Parameters:
        output_queue (Queue): Queue to get processed read pairs.
        output_forward (str): Path to the forward output file.
        output_reverse (str): Path to the reverse output file.
        num_process (int): Number of ``None`` sentinels to wait for.
        write_processes (int): Forwarded to ``open_output``.

    Raises:
        SystemExit: With status 1 after any exception; the error is printed
            and ``manage_pigz_errors`` is called first.
    """
    import sys

    out_f, out_r = open_output(output_forward, output_reverse, write_processes)
    producers_left = num_process
    while producers_left > 0:
        try:
            item = output_queue.get()
            if item is None:
                producers_left -= 1
                continue

            forward_text = item[0]
            reverse_text = item[1]

            if any(proc.stdin is None for proc in (out_f, out_r)):
                raise ValueError("pigz stdin closed")

            out_f.stdin.write(forward_text.encode("utf-8"))
            out_r.stdin.write(reverse_text.encode("utf-8"))
        except Exception as exc:
            print(f"Error in write_pairs: {exc}")
            manage_pigz_errors(out_f, out_r, output_forward, output_reverse)
            sys.exit(1)
    ensure_ending(out_f, out_r)
82
+
83
+
84
def ensure_ending(out_f, out_r):
    """
    Close the stdin of both processes, wait for them, then terminate them.

    For each step the forward process is handled before the reverse one:
    both stdin pipes are closed (when not None), then both processes are
    waited on, then ``terminate()`` is called on both.

    Examples
    --------
    >>> class _P:
    ...     def __init__(self):
    ...         self.stdin = self
    ...         self.closed = False
    ...         self.waited = False
    ...         self.terminated = False
    ...     def close(self):
    ...         self.closed = True
    ...     def wait(self):
    ...         self.waited = True
    ...     def terminate(self):
    ...         self.terminated = True
    ...
    >>> pf, pr = _P(), _P()
    >>> ensure_ending(pf, pr)
    >>> (pf.closed, pr.closed, pf.waited, pr.waited, pf.terminated, pr.terminated)
    (True, True, True, True, True, True)
    """
    procs = (out_f, out_r)

    for proc in procs:
        pipe = proc.stdin
        if pipe is not None:
            pipe.close()

    for proc in procs:
        proc.wait()

    for proc in procs:
        proc.terminate()
114
+
115
+
116
def manage_pigz_errors(out_f, out_r, output_forward, output_reverse):
    """
    Manage pigz process termination and check for errors.

    Both stdin pipes are closed (when not None) and both processes are
    waited on. A message is printed for each process whose ``returncode``
    is not 0, and finally ``terminate()`` is called on both.

    Parameters:
        out_f: Process for the forward output.
        out_r: Process for the reverse output.
        output_forward (str): Forward output path, used in the message.
        output_reverse (str): Reverse output path, used in the message.

    Examples
    --------
    >>> class _P:
    ...     def __init__(self, rc): self.stdin=None; self.returncode=rc; self.waited=False; self.terminated=False
    ...     def wait(self): self.waited=True
    ...     def terminate(self): self.terminated=True
    ...
    >>> pf, pr = _P(0), _P(0)
    >>> manage_pigz_errors(pf, pr, 'F.fq.gz', 'R.fq.gz')
    >>> pf.waited and pr.waited and pf.terminated and pr.terminated
    True
    >>> pf, pr = _P(1), _P(2)
    >>> manage_pigz_errors(pf, pr, 'F.fq.gz', 'R.fq.gz')  # doctest: +ELLIPSIS
    Error in pigz command for file F.fq.gz
    Error in pigz command for file R.fq.gz
    """
    procs = (out_f, out_r)
    targets = (output_forward, output_reverse)

    for proc in procs:
        pipe = proc.stdin
        if pipe is not None:
            pipe.close()

    for proc in procs:
        proc.wait()

    # Report failures only after both processes have been waited on.
    for proc, target in zip(procs, targets):
        if proc.returncode != 0:
            print(f"Error in pigz command for file {target}")

    for proc in procs:
        proc.terminate()