split3c 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- split3c/__init__.py +0 -0
- split3c/cli.py +336 -0
- split3c/nssite/__init__.py +0 -0
- split3c/nssite/auxiliary.py +190 -0
- split3c/nssite/bam.py +299 -0
- split3c/nssite/fastq.py +148 -0
- split3c/nssite/main.py +368 -0
- split3c/nssite/processmanager.py +51 -0
- split3c/nssite/split.py +849 -0
- split3c/resite/__init__.py +33 -0
- split3c/resite/frag.py +576 -0
- split3c/resite/header.py +91 -0
- split3c/resite/index.py +236 -0
- split3c/resite/main.py +506 -0
- split3c/resite/pretreatment.py +299 -0
- split3c/resite/read.py +91 -0
- split3c/resite/write_control.py +111 -0
- split3c/resolve/__init__.py +0 -0
- split3c/resolve/bam.py +129 -0
- split3c/resolve/io_utils.py +77 -0
- split3c/resolve/main.py +506 -0
- split3c/resolve/pairs.py +56 -0
- split3c/resolve/parse.py +1218 -0
- split3c-0.0.1.dist-info/METADATA +100 -0
- split3c-0.0.1.dist-info/RECORD +29 -0
- split3c-0.0.1.dist-info/WHEEL +5 -0
- split3c-0.0.1.dist-info/entry_points.txt +5 -0
- split3c-0.0.1.dist-info/licenses/LICENSE +235 -0
- split3c-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 2024 Samir Bertache
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
This script is part of the Parasplit project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
|
|
7
|
+
|
|
8
|
+
Copyright © 2024 Samir Bertache
|
|
9
|
+
|
|
10
|
+
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
11
|
+
|
|
12
|
+
===============================================================================
|
|
13
|
+
|
|
14
|
+
This program is free software: you can redistribute it and/or modify it under
|
|
15
|
+
the terms of the GNU Affero General Public License as published by the
|
|
16
|
+
Free Software Foundation, either version 3 of the License, or (at your option)
|
|
17
|
+
any later version.
|
|
18
|
+
|
|
19
|
+
This program is distributed in the hope that it will be useful,
|
|
20
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
21
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
22
|
+
See the GNU Affero General Public License for more details.
|
|
23
|
+
|
|
24
|
+
You should have received a copy of the GNU Affero General Public License
|
|
25
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from .frag import process_items
|
|
29
|
+
from .pretreatment import partition_threads, search_in_database
|
|
30
|
+
from .read import read_fastq_gzip_simultaneously
|
|
31
|
+
from .write_control import manage_pigz_problems, open_output, write_pairs
|
|
32
|
+
|
|
33
|
+
# NOTE(review): the wheel shipping this module is versioned 0.0.1 while this
# constant says 1.1.5 — confirm which value is intended.
__version__ = "1.1.5"
|
split3c/resite/frag.py
ADDED
|
@@ -0,0 +1,576 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
|
|
3
|
+
|
|
4
|
+
Copyright © 2024 Samir Bertache
|
|
5
|
+
|
|
6
|
+
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
7
|
+
|
|
8
|
+
===============================================================================
|
|
9
|
+
|
|
10
|
+
This program is free software: you can redistribute it and/or modify it under
|
|
11
|
+
the terms of the GNU Affero General Public License as published by the
|
|
12
|
+
Free Software Foundation, either version 3 of the License, or (at your option)
|
|
13
|
+
any later version.
|
|
14
|
+
|
|
15
|
+
This program is distributed in the hope that it will be useful,
|
|
16
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
18
|
+
See the GNU Affero General Public License for more details.
|
|
19
|
+
|
|
20
|
+
You should have received a copy of the GNU Affero General Public License
|
|
21
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import logging
|
|
25
|
+
import re
|
|
26
|
+
from typing import Generator, List, Tuple
|
|
27
|
+
|
|
28
|
+
from .header import _tag_from_global_index, build_pair_header
|
|
29
|
+
from .index import index_list, index_list_borderless
|
|
30
|
+
|
|
31
|
+
# Configure the root logger at INFO level as soon as this module is imported.
# NOTE(review): this is an import-time side effect of a library module;
# basicConfig is a no-op when the root logger already has handlers — confirm
# that configuring logging here (rather than in the CLI entry point) is intended.
logging.basicConfig(level=logging.INFO)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
################################ Mode All #####################################
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def create_pairs_all(sequence, seed_size, ligation_site_list, indexation) -> Generator:
    """
    Yield every unordered pair of fragments found on the two mates of a read.

    The forward and reverse fragment lists returned by ``indexation`` are
    concatenated (forward first) and every combination ``(i, j)`` with
    ``i < j`` in that concatenated list is yielded exactly once, in
    lexicographic order of ``(i, j)``.

    Parameters:
        sequence (List[str]): List containing forward and reverse sequences.
        seed_size (int): Minimum size of the fragment to be considered
            (forwarded unchanged to ``indexation``).
        ligation_site_list (List[Tuple[re.Pattern, int]]): List of ligation
            sites with regex patterns and offsets (forwarded unchanged to
            ``indexation``).
        indexation (Callable): Called as
            ``indexation(sequence, ligation_site_list, seed_size)``; must
            return ``(forward_fragments, reverse_fragments)``.

    Yields:
        List: A 7-item list
        ``[pair_id, tag_i, tag_j, [fragI, which_i], [fragJ, which_j], NbFragFor, NbFragRev]``
        where:
            - ``pair_id`` is the two global indices concatenated (``f"{i}{j}"``).
              It is not guaranteed to be unique once there are more than ten
              fragments (e.g. (1, 12) and (11, 2) share the same digits).
            - ``tag_i`` / ``tag_j`` are the ``F<n>`` / ``R<n>`` tags of both
              fragments.
            - ``which_i`` / ``which_j`` are 0 when the fragment comes from the
              forward sequence and 1 when it comes from the reverse sequence.
            - ``NbFragFor`` / ``NbFragRev`` are the fragment counts per mate.

    >>> # 1 fragment on read 1, 1 fragment on read 2 => 1 pair (F1,R1)
    >>> def idx_1_1(Seq, sites, seed): return ([[0,2]], [[0,2]])
    >>> pairs = list(create_pairs_all(["AB", "CD"], 0, [], idx_1_1))
    >>> len(pairs)
    1
    >>> pair_id, tag_i, tag_j, fragA, fragB, tot_for, tot_rev = pairs[0]
    >>> (tag_i, tag_j, tot_for, tot_rev)
    ('F1', 'R1', 1, 1)
    >>> # fragA comes from read 1 (which=0), fragB comes from read 2 (which=1)
    >>> (fragA[1], fragB[1])
    (0, 1)

    >>> # 2 fragments on read 1, 2 on read 2 => 4 fragments => C(4,2)=6 pairs
    >>> def idx_2_2(Seq, sites, seed): return ([[0,6],[6,12]], [[0,6],[6,12]])
    >>> seqs = ["AAAACCCCGGGG", "TTTTGGGGCCCC"]
    >>> pairs = list(create_pairs_all(seqs, 0, [], idx_2_2))
    >>> len(pairs)
    6
    >>> # Last pair = (R1,R2) (order i<j over all fragments = F1,F2,R1,R2)
    >>> pairs[-1][1], pairs[-1][2]
    ('R1', 'R2')
    >>> # and both fragments do come from the reverse sequence (which=1)
    >>> pairs[-1][3][1], pairs[-1][4][1]
    (1, 1)
    """
    # Function-scope import: keeps this block self-contained without touching
    # the module's import section.
    from itertools import combinations

    forward_frags, reverse_frags = indexation(sequence, ligation_site_list, seed_size)
    n_for = len(forward_frags)
    n_rev = len(reverse_frags)
    all_frags = forward_frags + reverse_frags

    # combinations() over the enumerated list yields exactly the (i, j) pairs
    # with i < j, in the same order as a nested loop filtered on i < j, but
    # without visiting the discarded half of the index grid.
    for (i, frag_i), (j, frag_j) in combinations(enumerate(all_frags), 2):
        yield [
            f"{i}{j}",
            _tag_from_global_index(i, n_for),
            _tag_from_global_index(j, n_for),
            # Global indices below n_for belong to the forward sequence.
            [frag_i, 0 if i < n_for else 1],
            [frag_j, 0 if j < n_for else 1],
            n_for,
            n_rev,
        ]
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def processing_all(
    Name: str,
    Sequence: List[str],
    Quality: List[str],
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
    indexation,
    tags: str | None = None,
) -> Tuple[List[str], List[str]]:
    """
    Build the forward and reverse FastQ buffers for one read in ALL mode,
    i.e. one entry per possible pair of fragments (see ``create_pairs_all``).

    The first fragment of each pair goes to the forward buffer, the second
    one to the reverse buffer; both entries share the same header. When only
    one pair exists, the header is the bare read name (no pair suffix).

    Parameters:
        Name (str): Raw read name, passed to ``build_pair_header``.
        Sequence (List[str]): Forward and reverse sequences.
        Quality (List[str]): Forward and reverse quality strings.
        ligation_site_list (List[Tuple[re.Pattern, int]]): Forwarded to ``create_pairs_all``.
        seed_size (int): Forwarded to ``create_pairs_all``.
        indexation (Callable): Forwarded to ``create_pairs_all``.
        tags (str | None): Header annotation mode, passed to ``build_pair_header``.

    Returns:
        bufferF, bufferR: two lists of FastQ records (4 lines each, as one string).

    Doctests:
    >>> seqs = ["AAAACCCCGGGG", "TTTTGGGGCCCC"]
    >>> quals = ["IIIIIIIIIIII", "JJJJJJJJJJJJ"]
    >>> def fake_index(Seq, sites, seed): return ([[0,12]], [[0,12]])
    >>> lig = [(re.compile("CCCC"),4)]
    >>> bufF, bufR = processing_all("readX", seqs, quals, lig, 0, fake_index, tags="o")
    >>> bufF[0].startswith("readX\\n")
    True
    >>> bufR[0].startswith("readX\\n")
    True

    >>> def fake_index2(Seq, sites, seed): return ([[0,6],[6,12]], [[0,6],[6,12]])
    >>> bufF2, bufR2 = processing_all("readY", seqs, quals, lig, 0, fake_index2, tags="o")
    >>> # 4 fragments in total => C(4,2)=6 pairs, hence 6 entries
    >>> len(bufF2), len(bufR2)
    (6, 6)
    >>> any("readY:[F1,F2:FT2,RT2]\\n" in x for x in bufF2)
    True
    >>> any("readY:[R1,R2:FT2,RT2]\\n" in x for x in bufF2)
    True
    >>> # first pair = (F1,F2) so forward=seqs[0][0:6], reverse=seqs[0][6:12]
    >>> bufF2[0].splitlines()[1], bufR2[0].splitlines()[1]
    ('AAAACC', 'CCGGGG')
    >>> # last pair = (R1,R2) so forward=seqs[1][0:6], reverse=seqs[1][6:12]
    >>> bufF2[-1].splitlines()[0].startswith("readY:[R1,R2:FT2,RT2]")
    True
    >>> bufF2[-1].splitlines()[1], bufR2[-1].splitlines()[1]
    ('TTTTGG', 'GGCCCC')

    >>> # tags=None must behave like tags="o" when several pairs are produced
    >>> bufF_none, bufR_none = processing_all("readY", seqs, quals, lig, 0, fake_index2, tags=None)
    >>> bufF_none == bufF2 and bufR_none == bufR2
    True

    >>> from itertools import combinations
    >>> def microsplit_like_all(name, seqs, quals, fr_for, fr_rev):
    ...     entries = []
    ...     for k, (s,e) in enumerate(fr_for):
    ...         entries.append(("F", k, seqs[0][s:e], quals[0][s:e]))
    ...     for k, (s,e) in enumerate(fr_rev):
    ...         entries.append(("R", k, seqs[1][s:e], quals[1][s:e]))
    ...     tot_for = len(fr_for); tot_rev = len(fr_rev)
    ...     outF, outR = [], []
    ...     for (o1,i1,s1,q1), (o2,i2,s2,q2) in combinations(entries, 2):
    ...         tag_i = f"{o1}{i1+1}"
    ...         tag_j = f"{o2}{i2+1}"
    ...         header = f"{name}:[{tag_i},{tag_j}:FT{tot_for},RT{tot_rev}]"
    ...         outF.append(f"{header}\\n{s1}\\n+\\n{q1}\\n")
    ...         outR.append(f"{header}\\n{s2}\\n+\\n{q2}\\n")
    ...     return outF, outR
    >>> seqs = ["AAAACCCCGGGG", "TTTTGGGGCCCC"]
    >>> quals = ["IIIIIIIIIIII", "JJJJJJJJJJJJ"]
    >>> def idx_2_2(Seq, sites, seed): return ([[0,6],[6,12]], [[0,6],[6,12]])
    >>> bufF3, bufR3 = processing_all("readY", seqs, quals, lig, 0, idx_2_2, tags="o")
    >>> expF, expR = microsplit_like_all("readY", seqs, quals, [[0,6],[6,12]], [[0,6],[6,12]])
    >>> bufF3 == expF and bufR3 == expR
    True
    """

    def _fastq_entry(header, located_fragment):
        # located_fragment is [fragment, which]: `which` selects the mate
        # (index into Sequence / Quality), `fragment` holds start and end.
        bounds, mate = located_fragment
        window = slice(bounds[0], bounds[1])
        return f"{header}\n{Sequence[mate][window]}\n+\n{Quality[mate][window]}\n"

    # Materialise the generator first: the header format depends on whether
    # exactly one pair was produced.
    pairs = list(create_pairs_all(Sequence, seed_size, ligation_site_list, indexation))
    only_one_pair = len(pairs) == 1

    bufferF = []
    bufferR = []
    for _pair_id, tag_i, tag_j, first_frag, second_frag, tot_for, tot_rev in pairs:
        header = build_pair_header(
            raw_name=Name,
            tag_i=tag_i,
            tag_j=tag_j,
            tot_for=tot_for,
            tot_rev=tot_rev,
            tags=tags,
            single_pair=only_one_pair,
        )
        bufferF.append(_fastq_entry(header, first_frag))
        bufferR.append(_fastq_entry(header, second_frag))

    return bufferF, bufferR
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
################################ Mode FR #####################################
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def create_pairs_fr(fragments):
    """
    Yield every (forward fragment, reverse fragment) combination.

    Parameters:
        fragments (List[List[List[int]]]): ``fragments[0]`` holds the forward
            fragments and ``fragments[1]`` the reverse fragments. Each
            fragment is a list of start and end indices.

    Yields:
        List: A 7-item list
        ``[pair_id, tag_i, tag_j, forward_fragment, reverse_fragment, n_forward, n_reverse]``
        where:
            - ``pair_id`` is the forward and reverse positions (0-based)
              concatenated as a string.
            - ``tag_i`` is ``F<n>`` and ``tag_j`` is ``R<n>`` (1-based).
            - ``n_forward`` / ``n_reverse`` are the fragment counts per side.

    >>> list(create_pairs_fr([[[0, 6]], [[6, 12]]]))
    [['00', 'F1', 'R1', [0, 6], [6, 12], 1, 1]]
    """
    forward_fragments, reverse_fragments = fragments[0], fragments[1]

    for f_rank, forward_span in enumerate(forward_fragments, start=1):
        for r_rank, reverse_span in enumerate(reverse_fragments, start=1):
            yield [
                f"{f_rank - 1}{r_rank - 1}",
                f"F{f_rank}",
                f"R{r_rank}",
                forward_span,
                reverse_span,
                len(forward_fragments),
                len(reverse_fragments),
            ]
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def processing_fr(
    TNom: str,
    TSeq: List[str],
    TQual: List[str],
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
    indexation,
    tags: str | None = None,
) -> Tuple[List[str], List[str]]:
    """
    Build the forward and reverse FastQ buffers for one read in FR mode
    (one forward fragment paired with one reverse fragment).

    The forward buffer is always cut from ``TSeq[0]`` / ``TQual[0]`` and the
    reverse buffer from ``TSeq[1]`` / ``TQual[1]``. When only one pair
    exists, the header is the bare read name (no pair suffix).

    Parameters:
        TNom (str): Raw read name, passed to ``build_pair_header``.
        TSeq (List[str]): Forward and reverse sequences.
        TQual (List[str]): Forward and reverse quality strings.
        ligation_site_list (List[Tuple[re.Pattern, int]]): Forwarded to ``indexation``.
        seed_size (int): Forwarded to ``indexation``.
        indexation (Callable): Called as
            ``indexation(TSeq, ligation_site_list, seed_size)``; returns the
            forward and reverse fragment lists.
        tags (str | None): Header annotation mode, passed to ``build_pair_header``.

    Returns:
        bufferF, bufferR: two lists of FastQ records (4 lines each, as one string).

    Doctests:
    >>> seqs = ["AAAACCCCGGGG", "TTTTGGGGCCCC"]
    >>> quals = ["IIIIIIIIIIII", "JJJJJJJJJJJJ"]
    >>> def idx_fr(seq, lig, sd): return ([[0,6]], [[6,12]])
    >>> bufF, bufR = processing_fr("readZ", seqs, quals, [], 0, idx_fr, tags="o")
    >>> bufF
    ['readZ\\nAAAACC\\n+\\nIIIIII\\n']
    >>> bufR
    ['readZ\\nGGCCCC\\n+\\nJJJJJJ\\n']

    >>> # With a suffix
    >>> def idx_fr2(seq, lig, sd): return ([[0,6],[6,12]], [[0,6],[6,12]])
    >>> bufF2, bufR2 = processing_fr("readW", seqs, quals, [], 0, idx_fr2)  # tags=None => "o"
    >>> len(bufF2), len(bufR2)
    (4, 4)
    >>> any(x.startswith("readW:[F1,R1:FT2,RT2]\\n") for x in bufF2)
    True
    >>> any(x.startswith("readW:[F2,R2:FT2,RT2]\\n") for x in bufF2)
    True

    >>> bufFna, bufRna = processing_fr("readW", seqs, quals, [], 0, idx_fr2, tags="na")
    >>> len(bufFna), len(bufRna)
    (4, 4)
    >>> all(x.startswith("readW\\n") for x in bufFna)
    True

    >>> seqs = ["ABCDEFGH", "IJKLMNOP"]
    >>> quals = ["!!!!!!!!", "????????"]
    >>> def idx_fr(seq, lig, sd): return ([[0,2]], [[2,4]])
    >>> bufF, bufR = processing_fr("readZ", seqs, quals, [], 0, idx_fr, tags="o")
    >>> bufF[0].splitlines()[1], bufR[0].splitlines()[1]
    ('AB', 'KL')
    """
    # Forward and reverse fragment lists for this read.
    forward_spans, reverse_spans = indexation(TSeq, ligation_site_list, seed_size)

    # Materialise the pairs first: the header format depends on whether
    # exactly one pair was produced.
    pairs = list(create_pairs_fr([forward_spans, reverse_spans]))
    only_one_pair = len(pairs) == 1

    bufferF = []
    bufferR = []
    for _pair_id, tag_i, tag_j, span_f, span_r, tot_for, tot_rev in pairs:
        header = build_pair_header(
            raw_name=TNom,
            tag_i=tag_i,
            tag_j=tag_j,
            tot_for=tot_for,
            tot_rev=tot_rev,
            tags=tags,
            single_pair=only_one_pair,
        )

        window_f = slice(span_f[0], span_f[1])
        bufferF.append(f"{header}\n{TSeq[0][window_f]}\n+\n{TQual[0][window_f]}\n")

        window_r = slice(span_r[0], span_r[1])
        bufferR.append(f"{header}\n{TSeq[1][window_r]}\n+\n{TQual[1][window_r]}\n")

    return bufferF, bufferR
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
############################## Mode Cover ###################################
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def create_pairs_cover(
    sequence, seed_size, ligation_site_list, indexation
) -> Generator:
    """
    Yield a small set of fragment pairs in which every fragment appears at
    least once.

    Strategy
    --------
    1. Pair forward and reverse fragments one-to-one (F1/R1, F2/R2, ...) for
       as long as both sides have a fragment left.
    2. Pair the surplus fragments of the larger side two-by-two.
    3. If a single surplus fragment remains, pair it with the first fragment
       of the opposite side, or, when the opposite side is empty, with the
       first fragment of its own side.

    Nothing is yielded when fewer than two fragments exist in total.

    Yields
    ------
    Same layout as create_pairs_all:
    [pair_id, tag_i, tag_j, [fragI, which_i], [fragJ, which_j], NbFragFor, NbFragRev]
    where pair_id is the 0-based position of the pair, as a string.

    Examples
    --------
    >>> def idx_2_1(Seq, sites, seed): return ([[0,2],[2,4]], [[0,2]])
    >>> pairs = list(create_pairs_cover(["ABCD", "EF"], 0, [], idx_2_1))
    >>> len(pairs)
    2
    >>> [(p[1], p[2]) for p in pairs]
    [('F1', 'R1'), ('F2', 'R1')]

    >>> def idx_2_2(Seq, sites, seed): return ([[0,2],[2,4]], [[0,2],[2,4]])
    >>> pairs = list(create_pairs_cover(["ABCD", "EFGH"], 0, [], idx_2_2))
    >>> len(pairs)
    2
    >>> [(p[1], p[2]) for p in pairs]
    [('F1', 'R1'), ('F2', 'R2')]
    """
    forward_frags, reverse_frags = indexation(sequence, ligation_site_list, seed_size)
    n_for = len(forward_frags)
    n_rev = len(reverse_frags)

    # A pair needs two fragments.
    if n_for + n_rev < 2:
        return

    # Each entry is (tag, [fragment, which]) with which = 0 forward, 1 reverse.
    forward = [(f"F{rank}", [frag, 0]) for rank, frag in enumerate(forward_frags, start=1)]
    reverse = [(f"R{rank}", [frag, 1]) for rank, frag in enumerate(reverse_frags, start=1)]

    # 1) One-to-one forward/reverse pairs; zip stops at the shorter side.
    selected = list(zip(forward, reverse))
    n_cross = len(selected)

    # 2) Whatever the larger side has beyond the cross pairs.
    if n_for > n_rev:
        surplus, other_side, own_side = forward[n_cross:], reverse, forward
    else:
        surplus, other_side, own_side = reverse[n_cross:], forward, reverse

    # Even-indexed surplus entries paired with their odd-indexed successor.
    selected.extend(zip(surplus[0::2], surplus[1::2]))

    # 3) An odd surplus leaves exactly one unpaired fragment: the last one.
    if len(surplus) % 2:
        orphan = surplus[-1]
        if other_side:
            selected.append((orphan, other_side[0]))
        elif own_side and own_side[0][0] != orphan[0]:
            selected.append((orphan, own_side[0]))
        else:
            # No distinct partner available: give up without yielding anything.
            return

    for number, ((tag_i, frag_a), (tag_j, frag_b)) in enumerate(selected):
        yield [str(number), tag_i, tag_j, frag_a, frag_b, n_for, n_rev]
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def processing_cover(
    Name: str,
    Sequence: List[str],
    Quality: List[str],
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
    indexation,
    tags: str | None = None,
) -> Tuple[List[str], List[str]]:
    """
    Build the forward and reverse FastQ buffers for one read in COVER mode:
    only the pairs produced by ``create_pairs_cover`` are written, so that
    each fragment appears at least once.

    The first fragment of each pair goes to the forward buffer, the second
    one to the reverse buffer; both entries share the same header. When only
    one pair exists, the header is the bare read name (no pair suffix).

    Parameters
    ----------
    Name : raw read name, passed to ``build_pair_header``.
    Sequence : forward and reverse sequences.
    Quality : forward and reverse quality strings.
    ligation_site_list, seed_size, indexation : forwarded to ``create_pairs_cover``.
    tags : header annotation mode, passed to ``build_pair_header``.

    Returns
    -------
    bufferF, bufferR : two lists of FastQ records (4 lines each, as one string).

    Examples
    --------
    >>> seqs = ["AAAACCCC", "TTTTGGGG"]
    >>> quals = ["IIIIIIII", "JJJJJJJJ"]
    >>> def idx_2_1(Seq, sites, seed): return ([[0,4],[4,8]], [[0,4]])
    >>> bufF, bufR = processing_cover("readC", seqs, quals, [], 0, idx_2_1, tags="o")
    >>> len(bufF), len(bufR)
    (2, 2)
    >>> any("readC:[F1,R1:FT2,RT1]\\n" in x for x in bufF)
    True
    >>> any("readC:[F2,R1:FT2,RT1]\\n" in x for x in bufF)
    True
    """

    def _fastq_entry(header, located_fragment):
        # located_fragment is [fragment, which]: `which` selects the mate
        # (index into Sequence / Quality), `fragment` holds start and end.
        bounds, mate = located_fragment
        window = slice(bounds[0], bounds[1])
        return f"{header}\n{Sequence[mate][window]}\n+\n{Quality[mate][window]}\n"

    # Materialise the generator first: the header format depends on whether
    # exactly one pair was produced.
    pairs = list(create_pairs_cover(Sequence, seed_size, ligation_site_list, indexation))
    only_one_pair = len(pairs) == 1

    bufferF = []
    bufferR = []
    for _pair_id, tag_i, tag_j, first_frag, second_frag, tot_for, tot_rev in pairs:
        header = build_pair_header(
            raw_name=Name,
            tag_i=tag_i,
            tag_j=tag_j,
            tot_for=tot_for,
            tot_rev=tot_rev,
            tags=tags,
            single_pair=only_one_pair,
        )
        bufferF.append(_fastq_entry(header, first_frag))
        bufferR.append(_fastq_entry(header, second_frag))

    return bufferF, bufferR
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
################################ Common #####################################
|
|
516
|
+
def process_items(
    Input_Buffer,
    Output_buffer,
    ligation_site_list,
    seed_size,
    buffer_size,
    mode,
    borderless,
    tags,
):
    """
    Consume batches of reads from ``Input_Buffer``, fragment and pair them,
    and push the resulting FastQ records to ``Output_buffer``.

    Parameters:
        Input_Buffer: Object whose ``get()`` returns either an iterable of
            items or ``None`` to signal the end of input (presumably a
            queue — confirm against the caller). For each item,
            ``item[0][0]`` is used as the read name, ``item[1]`` as the
            sequences and ``item[2]`` as the qualities.
        Output_buffer: Object whose ``put()`` receives
            ``[forward_records, reverse_records]`` lists, then a final
            ``None`` once all input has been consumed.
        ligation_site_list: Forwarded to the pairing function.
        seed_size: Forwarded to the pairing function.
        buffer_size (int): Records are flushed to ``Output_buffer`` once more
            than this many forward records are pending; the check is made
            after each input batch.
        mode (str): ``"all"`` selects ``processing_all``, ``"fr"`` selects
            ``processing_fr``; any other value selects ``processing_cover``.
        borderless: Truthy selects ``index_list_borderless`` as the
            indexation function, otherwise ``index_list``.
        tags: Forwarded to the pairing function.

    Raises:
        Exception: Any error raised while reading or processing is logged
            and re-raised; in that case the final ``None`` is NOT pushed to
            ``Output_buffer``.
    """
    indexation = index_list_borderless if borderless else index_list

    # COVER is the fallback for every mode other than "all" and "fr".
    Treatment = processing_cover
    if mode == "all":
        Treatment = processing_all
    elif mode == "fr":
        Treatment = processing_fr

    pending_f = []
    pending_r = []

    try:
        # A None batch is the end-of-input sentinel.
        while (batch := Input_Buffer.get()) is not None:
            for record in batch:
                # Processed synchronously, in the current worker.
                made_f, made_r = Treatment(
                    record[0][0],
                    record[1],
                    record[2],
                    ligation_site_list,
                    seed_size,
                    indexation,
                    tags,
                )
                pending_f.extend(made_f)
                pending_r.extend(made_r)

            # Flush once per batch, and hand over fresh lists so the ones
            # already queued are never mutated afterwards.
            if len(pending_f) > buffer_size:
                Output_buffer.put([pending_f, pending_r])
                pending_f = []
                pending_r = []
    except Exception as e:
        logging.exception(f"Error in process items : {e}")
        raise

    # Push the remainder, then propagate the end-of-stream sentinel.
    if pending_f or pending_r:
        Output_buffer.put([pending_f, pending_r])
    Output_buffer.put(None)
|
split3c/resite/header.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
|
|
3
|
+
|
|
4
|
+
Copyright © 2024 Samir Bertache
|
|
5
|
+
|
|
6
|
+
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
7
|
+
|
|
8
|
+
===============================================================================
|
|
9
|
+
|
|
10
|
+
This program is free software: you can redistribute it and/or modify it under
|
|
11
|
+
the terms of the GNU Affero General Public License as published by the
|
|
12
|
+
Free Software Foundation, either version 3 of the License, or (at your option)
|
|
13
|
+
any later version.
|
|
14
|
+
|
|
15
|
+
This program is distributed in the hope that it will be useful,
|
|
16
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
18
|
+
See the GNU Affero General Public License for more details.
|
|
19
|
+
|
|
20
|
+
You should have received a copy of the GNU Affero General Public License
|
|
21
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _tag_from_global_index(k: int, n_for: int) -> str:
|
|
26
|
+
# k est l’index dans AllFrag = for + rev
|
|
27
|
+
if k < n_for:
|
|
28
|
+
return f"F{k + 1}"
|
|
29
|
+
return f"R{k - n_for + 1}"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _base_name(raw_name: str) -> str:
|
|
33
|
+
return raw_name.split(" ")[0]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def read_name(
|
|
37
|
+
base_name: str,
|
|
38
|
+
tag_i: str,
|
|
39
|
+
tag_j: str,
|
|
40
|
+
tot_for: int,
|
|
41
|
+
tot_rev: int,
|
|
42
|
+
tags: str | None = None,
|
|
43
|
+
) -> str:
|
|
44
|
+
if tags in ("no_tag", "nt"):
|
|
45
|
+
return base_name
|
|
46
|
+
return f"{base_name}:[{tag_i},{tag_j}:FT{tot_for},RT{tot_rev}]"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def build_pair_header(
    raw_name: str,
    tag_i: str,
    tag_j: str,
    tot_for: int,
    tot_rev: int,
    tags: str | None,
    single_pair: bool,
) -> str:
    """
    Build the FastQ header shared by both mates of a fragment pair.

    Contract:
    - the raw name is always reduced to its base (text before the first space)
    - single_pair=True -> base only
    - tags=None defaults to "o"
    - tags in ("nt","no_tag","na","no_annot") -> base only
    - otherwise -> base:[tag_i,tag_j:FTx,RTy]

    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags="o", single_pair=True)
    'readX'
    >>> build_pair_header("readX comment blah", "F1", "R1", 1, 1, tags="o", single_pair=True)
    'readX'

    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags=None, single_pair=False)
    'readX:[F1,R1:FT1,RT1]'
    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags="o", single_pair=False)
    'readX:[F1,R1:FT1,RT1]'

    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags="na", single_pair=False)
    'readX'
    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags="no_annot", single_pair=False)
    'readX'
    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags="no_tag", single_pair=False)
    'readX'
    >>> build_pair_header("readX", "F1", "R1", 1, 1, tags="nt", single_pair=False)
    'readX'
    """
    base = _base_name(raw_name)
    effective_tags = "o" if tags is None else tags

    # "na"/"no_annot" are handled here; "nt"/"no_tag" are handled by read_name.
    if single_pair or effective_tags in ("no_annot", "na"):
        return base

    return read_name(base, tag_i, tag_j, tot_for, tot_rev, tags=effective_tags)
|