split3c 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ # SPDX-FileCopyrightText: 2025 2024 Samir Bertache
2
+ #
3
+ # SPDX-License-Identifier: AGPL-3.0-or-later
4
+
5
+ """
6
+ This script is part of the Parasplit project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
7
+
8
+ Copyright © 2024 Samir Bertache
9
+
10
+ SPDX-License-Identifier: AGPL-3.0-or-later
11
+
12
+ ===============================================================================
13
+
14
+ This program is free software: you can redistribute it and/or modify it under
15
+ the terms of the GNU Affero General Public License as published by the
16
+ Free Software Foundation, either version 3 of the License, or (at your option)
17
+ any later version.
18
+
19
+ This program is distributed in the hope that it will be useful,
20
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
+ See the GNU Affero General Public License for more details.
23
+
24
+ You should have received a copy of the GNU Affero General Public License
25
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
26
+ """
27
+
28
+ from .frag import process_items
29
+ from .pretreatment import partition_threads, search_in_database
30
+ from .read import read_fastq_gzip_simultaneously
31
+ from .write_control import manage_pigz_problems, open_output, write_pairs
32
+
33
+ __version__ = "1.1.5"
split3c/resite/frag.py ADDED
@@ -0,0 +1,576 @@
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+ """
23
+
24
+ import logging
25
+ import re
26
+ from typing import Generator, List, Tuple
27
+
28
+ from .header import _tag_from_global_index, build_pair_header
29
+ from .index import index_list, index_list_borderless
30
+
31
+ logging.basicConfig(level=logging.INFO)
32
+
33
+
34
+ ################################ Mode All #####################################
35
+
36
+
37
def create_pairs_all(sequence, seed_size, ligation_site_list, indexation) -> Generator:
    """
    Yield every unordered pair of fragments located on the two mates of a read.

    ``indexation(sequence, ligation_site_list, seed_size)`` is called once and
    must return two fragment lists (first mate, second mate). Conceptually the
    two lists are concatenated, first mate first, and every combination
    ``i < j`` of that concatenation is emitted in ascending ``(i, j)`` order.

    Parameters:
        sequence (List[str]): The two sequences of the read pair.
        seed_size (int): Passed through unchanged to ``indexation``.
        ligation_site_list (List[Tuple[re.Pattern, int]]): Passed through
            unchanged to ``indexation``.
        indexation (Callable): Fragment locator described above.

    Yields:
        List: ``[pair_id, tag_i, tag_j, [fragI, which_i], [fragJ, which_j],
        NbFragFor, NbFragRev]`` where
            - ``pair_id`` is ``"{i}_{j}"``; the separator keeps the identifier
              unique even when an index has several digits,
            - ``tag_i`` / ``tag_j`` are ``"F<n>"`` or ``"R<n>"`` (1-based rank
              of the fragment on its own mate),
            - ``which_i`` / ``which_j`` are 0 for a first-mate fragment and 1
              for a second-mate fragment,
            - ``NbFragFor`` / ``NbFragRev`` are the fragment counts per mate.

    >>> # 1 fragment on R1, 1 fragment on R2 => 1 pair (F1,R1)
    >>> def idx_1_1(Seq, sites, seed): return ([[0,2]], [[0,2]])
    >>> pairs = list(create_pairs_all(["AB", "CD"], 0, [], idx_1_1))
    >>> len(pairs)
    1
    >>> pair_id, tag_i, tag_j, fragA, fragB, tot_for, tot_rev = pairs[0]
    >>> (tag_i, tag_j, tot_for, tot_rev)
    ('F1', 'R1', 1, 1)
    >>> # fragA comes from R1 (which=0), fragB comes from R2 (which=1)
    >>> (fragA[1], fragB[1])
    (0, 1)

    >>> # 2 fragments on R1, 2 fragments on R2 => 4 fragments => C(4,2)=6 pairs
    >>> def idx_2_2(Seq, sites, seed): return ([[0,6],[6,12]], [[0,6],[6,12]])
    >>> seqs = ["AAAACCCCGGGG", "TTTTGGGGCCCC"]
    >>> pairs = list(create_pairs_all(seqs, 0, [], idx_2_2))
    >>> len(pairs)
    6
    >>> # Last pair = (R1,R2) (ordering i<j over F1,F2,R1,R2)
    >>> pairs[-1][1], pairs[-1][2]
    ('R1', 'R2')
    >>> # and both fragments come from the reverse sequence (which=1)
    >>> pairs[-1][3][1], pairs[-1][4][1]
    (1, 1)

    >>> # Through processing_all: tags=None must behave like tags="o"
    >>> quals = ["IIIIIIIIIIII", "JJJJJJJJJJJJ"]
    >>> lig = [(re.compile("CCCC"),4)]
    >>> bufF2, bufR2 = processing_all("readY", seqs, quals, lig, 0, idx_2_2, tags=None)
    >>> len(bufF2), len(bufR2)
    (6, 6)
    >>> any("readY:[F1,F2:FT2,RT2]\\n" in x for x in bufF2)
    True
    >>> any("readY:[R1,R2:FT2,RT2]\\n" in x for x in bufF2)
    True
    """
    # Local import: keeps this block self-sufficient without touching the
    # module-level import list.
    from itertools import combinations

    frags_for, frags_rev = indexation(sequence, ligation_site_list, seed_size)
    n_for = len(frags_for)
    n_rev = len(frags_rev)

    # One (tag, fragment, which) record per fragment, first mate first, so the
    # global index of a record matches its position in the concatenated list.
    tagged = [(f"F{k}", frag, 0) for k, frag in enumerate(frags_for, start=1)]
    tagged += [(f"R{k}", frag, 1) for k, frag in enumerate(frags_rev, start=1)]

    # combinations() walks the positions in ascending (i, j) order with i < j.
    for (i, first), (j, second) in combinations(enumerate(tagged), 2):
        tag_i, frag_i, which_i = first
        tag_j, frag_j, which_j = second
        yield [
            f"{i}_{j}",
            tag_i,
            tag_j,
            [frag_i, which_i],
            [frag_j, which_j],
            n_for,
            n_rev,
        ]
133
+
134
+
135
def processing_all(
    Name: str,
    Sequence: List[str],
    Quality: List[str],
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
    indexation,
    tags: str | None = None,
) -> Tuple[List[str], List[str]]:
    """
    Build the forward and reverse FASTQ records for ALL fragment pairs of a read.

    Every pair produced by ``create_pairs_all`` becomes one record in each
    returned list: the first fragment of the pair goes to the forward list,
    the second fragment to the reverse list, and both records share the header
    returned by ``build_pair_header``. Whether exactly one pair exists is
    forwarded to ``build_pair_header`` as ``single_pair``.

    Parameters:
        Name: Read name used as the raw header.
        Sequence: The two sequences of the read pair.
        Quality: The two quality strings, sliced with the same coordinates as
            the sequences.
        ligation_site_list: Forwarded to ``create_pairs_all``.
        seed_size: Forwarded to ``create_pairs_all``.
        indexation: Forwarded to ``create_pairs_all``.
        tags: Forwarded to ``build_pair_header``.

    Returns:
        bufferF, bufferR: two lists of 4-line FASTQ records (as strings).

    Doctests:
    >>> seqs = ["AAAACCCCGGGG", "TTTTGGGGCCCC"]
    >>> quals = ["IIIIIIIIIIII", "JJJJJJJJJJJJ"]
    >>> def fake_index(Seq, sites, seed): return ([[0,12]], [[0,12]])
    >>> lig = [(re.compile("CCCC"),4)]
    >>> bufF, bufR = processing_all("readX", seqs, quals, lig, 0, fake_index, tags="o")
    >>> bufF[0].startswith("readX\\n")
    True
    >>> bufR[0].startswith("readX\\n")
    True

    >>> def fake_index2(Seq, sites, seed): return ([[0,6],[6,12]], [[0,6],[6,12]])
    >>> bufF2, bufR2 = processing_all("readY", seqs, quals, lig, 0, fake_index2, tags="o")
    >>> # 4 fragments in total => C(4,2)=6 pairs, hence 6 entries
    >>> len(bufF2), len(bufR2)
    (6, 6)
    >>> any("readY:[F1,F2:FT2,RT2]\\n" in x for x in bufF2)
    True
    >>> any("readY:[R1,R2:FT2,RT2]\\n" in x for x in bufF2)
    True
    >>> # 1st pair = (F1,F2) so forward=seqs[0][0:6], reverse=seqs[0][6:12]
    >>> bufF2[0].splitlines()[1], bufR2[0].splitlines()[1]
    ('AAAACC', 'CCGGGG')
    >>> # last pair = (R1,R2) so forward=seqs[1][0:6], reverse=seqs[1][6:12]
    >>> bufF2[-1].splitlines()[0].startswith("readY:[R1,R2:FT2,RT2]")
    True
    >>> bufF2[-1].splitlines()[1], bufR2[-1].splitlines()[1]
    ('TTTTGG', 'GGCCCC')


    >>> from itertools import combinations
    >>> def microsplit_like_all(name, seqs, quals, fr_for, fr_rev):
    ...     entries = []
    ...     for k, (s,e) in enumerate(fr_for):
    ...         entries.append(("F", k, seqs[0][s:e], quals[0][s:e]))
    ...     for k, (s,e) in enumerate(fr_rev):
    ...         entries.append(("R", k, seqs[1][s:e], quals[1][s:e]))
    ...     tot_for = len(fr_for); tot_rev = len(fr_rev)
    ...     outF, outR = [], []
    ...     for (o1,i1,s1,q1), (o2,i2,s2,q2) in combinations(entries, 2):
    ...         tag_i = f"{o1}{i1+1}"
    ...         tag_j = f"{o2}{i2+1}"
    ...         header = f"{name}:[{tag_i},{tag_j}:FT{tot_for},RT{tot_rev}]"
    ...         outF.append(f"{header}\\n{s1}\\n+\\n{q1}\\n")
    ...         outR.append(f"{header}\\n{s2}\\n+\\n{q2}\\n")
    ...     return outF, outR
    >>> seqs = ["AAAACCCCGGGG", "TTTTGGGGCCCC"]
    >>> quals = ["IIIIIIIIIIII", "JJJJJJJJJJJJ"]
    >>> def idx_2_2(Seq, sites, seed): return ([[0,6],[6,12]], [[0,6],[6,12]])
    >>> bufF3, bufR3 = processing_all("readY", seqs, quals, lig, 0, idx_2_2, tags="o")
    >>> expF, expR = microsplit_like_all("readY", seqs, quals, [[0,6],[6,12]], [[0,6],[6,12]])
    >>> bufF3 == expF and bufR3 == expR
    True
    """

    def fastq_record(header, span, mate):
        # Cut the same window out of the chosen mate's sequence and quality.
        start, stop = span[0], span[1]
        return f"{header}\n{Sequence[mate][start:stop]}\n+\n{Quality[mate][start:stop]}\n"

    # Materialise the generator first: the number of pairs decides the header.
    pairs = list(
        create_pairs_all(Sequence, seed_size, ligation_site_list, indexation)
    )
    lone_pair = len(pairs) == 1

    records_f = []
    records_r = []
    for _, first_tag, second_tag, left, right, n_for, n_rev in pairs:
        header = build_pair_header(
            raw_name=Name,
            tag_i=first_tag,
            tag_j=second_tag,
            tot_for=n_for,
            tot_rev=n_rev,
            tags=tags,
            single_pair=lone_pair,
        )
        span_left, mate_left = left
        span_right, mate_right = right
        records_f.append(fastq_record(header, span_left, mate_left))
        records_r.append(fastq_record(header, span_right, mate_right))

    return records_f, records_r
240
+
241
+
242
+ ################################ Mode FR #####################################
243
+
244
+
245
def create_pairs_fr(fragments):
    """
    Yield every (forward fragment, reverse fragment) combination.

    Parameters:
        fragments (List[List[List[int]]]): ``fragments[0]`` holds the forward
            fragments and ``fragments[1]`` the reverse fragments. Each
            fragment is a ``[start, end]`` pair of indices.

    Yields:
        List: ``[pair_id, tag_i, tag_j, forward_fragment, reverse_fragment,
        nb_forward, nb_reverse]`` where
            - ``pair_id`` is ``"{i}_{j}"`` (0-based ranks); the separator keeps
              the identifier unique when a rank has more than one digit,
            - ``tag_i`` is ``"F<i+1>"`` and ``tag_j`` is ``"R<j+1>"``,
            - ``nb_forward`` / ``nb_reverse`` are the lengths of the two
              fragment lists.

    >>> list(create_pairs_fr([[[0, 6]], [[6, 12]]]))
    [['0_0', 'F1', 'R1', [0, 6], [6, 12], 1, 1]]
    >>> [(p[1], p[2]) for p in create_pairs_fr([[[0, 6], [6, 12]], [[0, 6], [6, 12]]])]
    [('F1', 'R1'), ('F1', 'R2'), ('F2', 'R1'), ('F2', 'R2')]
    """
    forward_fragments = fragments[0]
    reverse_fragments = fragments[1]
    for i, forward_span in enumerate(forward_fragments):
        for j, reverse_span in enumerate(reverse_fragments):
            yield [
                # Plain concatenation would map (1, 11) and (11, 1) to "111".
                f"{i}_{j}",
                f"F{i + 1}",
                f"R{j + 1}",
                forward_span,
                reverse_span,
                len(forward_fragments),
                len(reverse_fragments),
            ]
275
+
276
+
277
def processing_fr(
    TNom: str,
    TSeq: List[str],
    TQual: List[str],
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
    indexation,
    tags: str | None = None,
) -> Tuple[List[str], List[str]]:
    """
    Build the forward and reverse FASTQ records in FR mode.

    In FR mode every pair combines one fragment of the forward sequence
    (``TSeq[0]``) with one fragment of the reverse sequence (``TSeq[1]``).
    The fragments come from ``indexation(TSeq, ligation_site_list, seed_size)``
    and are combined by ``create_pairs_fr``. Whether exactly one pair exists
    is forwarded to ``build_pair_header`` as ``single_pair``.

    Returns:
        bufferF, bufferR: two lists of 4-line FASTQ records (as strings).

    Doctests:
    >>> seqs = ["AAAACCCCGGGG", "TTTTGGGGCCCC"]
    >>> quals = ["IIIIIIIIIIII", "JJJJJJJJJJJJ"]
    >>> def idx_fr(seq, lig, sd): return ([[0,6]], [[6,12]])
    >>> bufF, bufR = processing_fr("readZ", seqs, quals, [], 0, idx_fr, tags="o")
    >>> bufF
    ['readZ\\nAAAACC\\n+\\nIIIIII\\n']
    >>> bufR
    ['readZ\\nGGCCCC\\n+\\nJJJJJJ\\n']

    >>> # With the tag suffix
    >>> def idx_fr2(seq, lig, sd): return ([[0,6],[6,12]], [[0,6],[6,12]])
    >>> bufF2, bufR2 = processing_fr("readW", seqs, quals, [], 0, idx_fr2)  # tags=None => "o"
    >>> len(bufF2), len(bufR2)
    (4, 4)
    >>> any(x.startswith("readW:[F1,R1:FT2,RT2]\\n") for x in bufF2)
    True
    >>> any(x.startswith("readW:[F2,R2:FT2,RT2]\\n") for x in bufF2)
    True

    >>> bufFna, bufRna = processing_fr("readW", seqs, quals, [], 0, idx_fr2, tags="na")
    >>> len(bufFna), len(bufRna)
    (4, 4)
    >>> all(x.startswith("readW\\n") for x in bufFna)
    True

    >>> seqs = ["ABCDEFGH", "IJKLMNOP"]
    >>> quals = ["!!!!!!!!", "????????"]
    >>> def idx_fr(seq, lig, sd): return ([[0,2]], [[2,4]])
    >>> bufF, bufR = processing_fr("readZ", seqs, quals, [], 0, idx_fr, tags="o")
    >>> bufF[0].splitlines()[1], bufR[0].splitlines()[1]
    ('AB', 'KL')
    """
    # Fragment coordinates for each mate, then every forward/reverse combination.
    spans_for, spans_rev = indexation(TSeq, ligation_site_list, seed_size)
    pairs = list(create_pairs_fr([spans_for, spans_rev]))
    lone_pair = len(pairs) == 1

    records_f = []
    records_r = []
    for _, first_tag, second_tag, span_f, span_r, n_for, n_rev in pairs:
        header = build_pair_header(
            raw_name=TNom,
            tag_i=first_tag,
            tag_j=second_tag,
            tot_for=n_for,
            tot_rev=n_rev,
            tags=tags,
            single_pair=lone_pair,
        )

        # The forward fragment is always cut from mate 0.
        f_start, f_stop = span_f[0], span_f[1]
        records_f.append(
            f"{header}\n{TSeq[0][f_start:f_stop]}\n+\n{TQual[0][f_start:f_stop]}\n"
        )

        # The reverse fragment is always cut from mate 1.
        r_start, r_stop = span_r[0], span_r[1]
        records_r.append(
            f"{header}\n{TSeq[1][r_start:r_stop]}\n+\n{TQual[1][r_start:r_stop]}\n"
        )

    return records_f, records_r
359
+
360
+
361
+ ############################## Mode Cover ###################################
362
+
363
+
364
def create_pairs_cover(
    sequence, seed_size, ligation_site_list, indexation
) -> Generator:
    """
    Yield a small set of pairs in which every fragment appears at least once.

    Strategy
    --------
    1. Pair the k-th forward fragment with the k-th reverse fragment for as
       long as both mates still have a fragment.
    2. Pair the fragments left on the larger side two by two.
    3. If a single fragment is still left, attach it to the first fragment of
       the opposite mate; when the opposite mate has no fragment, attach it to
       the first fragment of its own mate.

    Nothing is yielded when fewer than two fragments exist in total.

    Yields
    ------
    Same layout as create_pairs_all:
        [pair_id, tag_i, tag_j, [fragI, which_i], [fragJ, which_j], NbFragFor, NbFragRev]
    Here ``pair_id`` is the 0-based rank of the pair, as a string.

    Examples
    --------
    >>> def idx_2_1(Seq, sites, seed): return ([[0,2],[2,4]], [[0,2]])
    >>> pairs = list(create_pairs_cover(["ABCD", "EF"], 0, [], idx_2_1))
    >>> len(pairs)
    2
    >>> [(p[1], p[2]) for p in pairs]
    [('F1', 'R1'), ('F2', 'R1')]

    >>> def idx_2_2(Seq, sites, seed): return ([[0,2],[2,4]], [[0,2],[2,4]])
    >>> pairs = list(create_pairs_cover(["ABCD", "EFGH"], 0, [], idx_2_2))
    >>> len(pairs)
    2
    >>> [(p[1], p[2]) for p in pairs]
    [('F1', 'R1'), ('F2', 'R2')]
    """
    frags_for, frags_rev = indexation(sequence, ligation_site_list, seed_size)
    n_for = len(frags_for)
    n_rev = len(frags_rev)

    # A pair needs at least two fragments overall.
    if n_for + n_rev < 2:
        return

    # (tag, [fragment, which]) records; which is 0 for forward, 1 for reverse.
    fwd = [(f"F{rank}", [frag, 0]) for rank, frag in enumerate(frags_for, start=1)]
    rev = [(f"R{rank}", [frag, 1]) for rank, frag in enumerate(frags_rev, start=1)]

    # 1) One-to-one forward/reverse pairs; zip stops at the shorter side.
    selected = list(zip(fwd, rev))
    n_cross = len(selected)

    # 2) Whatever is left sits on the larger side (reverse side on a tie,
    #    in which case nothing is left).
    if n_for > n_rev:
        larger, smaller = fwd, rev
    else:
        larger, smaller = rev, fwd
    remaining = larger[n_cross:]
    cross_anchor = smaller[0] if smaller else None
    own_anchor = larger[0] if larger else None

    paired_upto = len(remaining) - len(remaining) % 2
    for k in range(0, paired_upto, 2):
        selected.append((remaining[k], remaining[k + 1]))

    # 3) An odd leftover is attached to an anchor fragment.
    if paired_upto < len(remaining):
        odd_one = remaining[paired_upto]
        if cross_anchor is not None:
            selected.append((odd_one, cross_anchor))
        elif own_anchor is not None and own_anchor[0] != odd_one[0]:
            selected.append((odd_one, own_anchor))
        else:
            return

    for rank, ((tag_a, frag_a), (tag_b, frag_b)) in enumerate(selected):
        yield [str(rank), tag_a, tag_b, frag_a, frag_b, n_for, n_rev]
455
+
456
+
457
def processing_cover(
    Name: str,
    Sequence: List[str],
    Quality: List[str],
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
    indexation,
    tags: str | None = None,
) -> Tuple[List[str], List[str]]:
    """
    Build the forward and reverse FASTQ records in COVER mode.

    Only the pairs selected by ``create_pairs_cover`` are written, i.e. just
    enough pairs for each fragment to appear at least once. For every pair the
    first fragment goes to the forward list and the second fragment to the
    reverse list, both under the header returned by ``build_pair_header``.

    Returns:
        bufferF, bufferR: two lists of 4-line FASTQ records (as strings).

    Examples
    --------
    >>> seqs = ["AAAACCCC", "TTTTGGGG"]
    >>> quals = ["IIIIIIII", "JJJJJJJJ"]
    >>> def idx_2_1(Seq, sites, seed): return ([[0,4],[4,8]], [[0,4]])
    >>> bufF, bufR = processing_cover("readC", seqs, quals, [], 0, idx_2_1, tags="o")
    >>> len(bufF), len(bufR)
    (2, 2)
    >>> any("readC:[F1,R1:FT2,RT1]\\n" in x for x in bufF)
    True
    >>> any("readC:[F2,R1:FT2,RT1]\\n" in x for x in bufF)
    True
    """

    def fastq_record(header, span, mate):
        # Cut the same window out of the chosen mate's sequence and quality.
        start, stop = span[0], span[1]
        return f"{header}\n{Sequence[mate][start:stop]}\n+\n{Quality[mate][start:stop]}\n"

    # Materialise the generator first: the number of pairs decides the header.
    pairs = list(
        create_pairs_cover(Sequence, seed_size, ligation_site_list, indexation)
    )
    lone_pair = len(pairs) == 1

    records_f = []
    records_r = []
    for _, first_tag, second_tag, left, right, n_for, n_rev in pairs:
        header = build_pair_header(
            raw_name=Name,
            tag_i=first_tag,
            tag_j=second_tag,
            tot_for=n_for,
            tot_rev=n_rev,
            tags=tags,
            single_pair=lone_pair,
        )
        span_left, mate_left = left
        span_right, mate_right = right
        records_f.append(fastq_record(header, span_left, mate_left))
        records_r.append(fastq_record(header, span_right, mate_right))

    return records_f, records_r
513
+
514
+
515
+ ################################ Common #####################################
516
+ def process_items(
517
+ Input_Buffer,
518
+ Output_buffer,
519
+ ligation_site_list,
520
+ seed_size,
521
+ buffer_size,
522
+ mode,
523
+ borderless,
524
+ tags,
525
+ ):
526
+ """
527
+ Read batches from Input_Buffer until a None item is received, build the paired FastQ records of every item with the treatment selected by `mode` ("all", "fr", anything else -> cover), and put [forward, reverse] record lists on Output_buffer, followed by a final None.
528
+ """
529
+ BigBufferF = []
530
+ BigBufferR = []
531
+
532
+ if borderless:
533
+ indexation = index_list_borderless
534
+ else:
535
+ indexation = index_list
536
+
537
+ if mode == "all":
538
+ Treatment = processing_all
539
+ elif mode == "fr":
540
+ Treatment = processing_fr
541
+ else:
542
+ Treatment = processing_cover
543
+
544
+ while True:
545
+ try:
546
+ Items = Input_Buffer.get()
547
+ if Items is None:
548
+ break
549
+
550
+ for item in Items:
551
+ # Treatment is called directly here: item[0][0] is passed as the name, item[1] as the sequences, item[2] as the qualities
552
+ bufferF, bufferR = Treatment(
553
+ item[0][0],
554
+ item[1],
555
+ item[2],
556
+ ligation_site_list,
557
+ seed_size,
558
+ indexation,
559
+ tags,
560
+ )
561
+
562
+ BigBufferF.extend(bufferF)
563
+ BigBufferR.extend(bufferR)
564
+
565
+ if len(BigBufferF) > buffer_size:
566
+ Output_buffer.put([BigBufferF, BigBufferR])
567
+ BigBufferF = []
568
+ BigBufferR = []
569
+
570
+ except Exception as e:
571
+ logging.exception(f"Error in process items : {e}")
572
+ raise
573
+
574
+ if BigBufferF or BigBufferR:
575
+ Output_buffer.put([BigBufferF, BigBufferR])
576
+ Output_buffer.put(None)
@@ -0,0 +1,91 @@
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+ """
23
+
24
+
25
+ def _tag_from_global_index(k: int, n_for: int) -> str:
26
+ # k est l’index dans AllFrag = for + rev
27
+ if k < n_for:
28
+ return f"F{k + 1}"
29
+ return f"R{k - n_for + 1}"
30
+
31
+
32
+ def _base_name(raw_name: str) -> str:
33
+ return raw_name.split(" ")[0]
34
+
35
+
36
+ def read_name(
37
+ base_name: str,
38
+ tag_i: str,
39
+ tag_j: str,
40
+ tot_for: int,
41
+ tot_rev: int,
42
+ tags: str | None = None,
43
+ ) -> str:
44
+ if tags in ("no_tag", "nt"):
45
+ return base_name
46
+ return f"{base_name}:[{tag_i},{tag_j}:FT{tot_for},RT{tot_rev}]"
47
+
48
+
49
def build_pair_header(
    raw_name: str,
    tag_i: str,
    tag_j: str,
    tot_for: int,
    tot_rev: int,
    tags: str | None,
    single_pair: bool,
) -> str:
    """
    Build the header line shared by the two records of a fragment pair.

    Contract:
      - the base name is ``raw_name`` cut at its first space
      - single_pair=True -> base only
      - tags=None is treated as "o"
      - tags in ("na", "no_annot") -> base only
      - tags in ("nt", "no_tag") -> base only (decided by ``read_name``)
      - otherwise -> base:[tag_i,tag_j:FTx,RTy]

    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags="o", single_pair=True)
    'readX'
    >>> build_pair_header("readX comment blah", "F1", "R1", 1, 1, tags="o", single_pair=True)
    'readX'

    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags=None, single_pair=False)
    'readX:[F1,R1:FT1,RT1]'
    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags="o", single_pair=False)
    'readX:[F1,R1:FT1,RT1]'

    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags="na", single_pair=False)
    'readX'
    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags="no_annot", single_pair=False)
    'readX'
    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags="no_tag", single_pair=False)
    'readX'
    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags="nt", single_pair=False)
    'readX'
    """
    base = _base_name(raw_name)
    effective_tags = "o" if tags is None else tags

    # A lone pair, or an explicit "no annotation" request, keeps the bare name.
    if single_pair or effective_tags in ("no_annot", "na"):
        return base

    return read_name(base, tag_i, tag_j, tot_for, tot_rev, tags=effective_tags)