split3c 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,236 @@
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+ """
23
+
24
+ import logging
25
+ import re
26
+ from typing import List, Tuple
27
+
28
+ logging.basicConfig(level=logging.INFO)
29
+
30
+
31
+ #################### Specific part of Borderless option #####################
32
+
33
+
34
def find_positions_for_one_site_borderless(
    text: str, Enzyme: Tuple[re.Pattern, int]
) -> List[List[int]]:
    """
    Locate every non-overlapping match of one site pattern inside ``text``.

    Parameters:
        text (str): The sequence to scan.
        Enzyme (Tuple[re.Pattern, int]): A compiled regex paired with the
            offset value that is attached to each of its matches.

    Returns:
        One ``[match_start, offset]`` pair per match, in left-to-right order.

    Examples:
        >>> find_positions_for_one_site_borderless("", (re.compile("A"), 1))
        []
        >>> find_positions_for_one_site_borderless("AAAA", (re.compile("AA"), 2))
        [[0, 2], [2, 2]]
        >>> find_positions_for_one_site_borderless("XYZ", (re.compile("A"), 1))
        []
        >>> find_positions_for_one_site_borderless("GATCGATC", (re.compile("GATC"), 4))
        [[0, 4], [4, 4]]
    """
    pattern, site_offset = Enzyme
    hits: List[List[int]] = []
    # finditer yields non-overlapping matches, scanning left to right.
    for match in pattern.finditer(text):
        hits.append([match.start(), site_offset])
    return hits
59
+
60
+
61
def find_all_pos_borderless(
    text: str,
    ligation_site_list: List[Tuple[re.Pattern, int]],
) -> List[List[int]]:
    """
    Collect the ``[position, offset]`` pairs of every site in ``text``.

    Each entry of ``ligation_site_list`` is searched in turn; a final
    ``[len(text), 0]`` entry is always added to mark the end of the text.
    The result is ordered by position (entries sharing a position keep the
    order in which they were collected).

    Examples:
        >>> find_all_pos_borderless("", [])
        [[0, 0]]
        >>> find_all_pos_borderless("AAAA", [])
        [[4, 0]]
        >>> find_all_pos_borderless("GAATTC", [(re.compile("GAATTC"), 6)])
        [[0, 6], [6, 0]]
        >>> find_all_pos_borderless("XXGAATTCYYGATCZZ", [(re.compile("GAATTC"), 6), (re.compile("GATC"), 4)])
        [[2, 6], [10, 4], [16, 0]]
    """
    collected: List[List[int]] = []
    for site in ligation_site_list:
        collected.extend(find_positions_for_one_site_borderless(text, site))
    # End-of-sequence marker: closes the last fragment.
    collected.append([len(text), 0])
    # Stable sort on the position only, so ties keep their collection order.
    collected.sort(key=lambda entry: entry[0])
    return collected
84
+
85
+
86
def IndexFragList_borderless(
    all_index_list: List[List[int]], seed_size: int
) -> List[List[int]]:
    """
    Build the fragments lying between consecutive sites, excluding the sites.

    Parameters:
        all_index_list (List[List[int]]): ``[position, offset]`` pairs,
            expected to be ordered by position. A fragment ends at a site's
            ``position`` and the following fragment cannot start before
            ``position + offset``.
        seed_size (int): Minimum length (``end - start``) a fragment must
            have to be kept.

    Returns:
        The ``[start, end]`` pairs of the kept fragments, in order.

    Notes:
        The start of the next fragment is the furthest site end seen so far,
        not merely the end of the immediately preceding site. This matters
        when sites are nested or share a start position: a fragment never
        begins inside an earlier, longer site. For non-overlapping sites
        both rules give the same result.

    Examples:
        >>> # no fragment (text too short)
        >>> IndexFragList_borderless([[3,2],[5,2]], 10)
        []
        >>> # exact seed_size
        >>> IndexFragList_borderless([[0,1],[5,1],[10,0]], 4)
        [[1, 5], [6, 10]]
        >>> # a short site nested inside a longer one does not pull the
        >>> # next fragment start back inside the longer site
        >>> IndexFragList_borderless([[2,6],[3,2],[12,0]], 1)
        [[0, 2], [8, 12]]
    """
    fragments: List[List[int]] = []
    fragment_start = 0  # first position not covered by any site seen so far
    for site_position, site_offset in all_index_list:
        if site_position - fragment_start >= seed_size:
            fragments.append([fragment_start, site_position])
        # Never move the start backwards: an earlier site may reach further
        # than the current one.
        fragment_start = max(fragment_start, site_position + site_offset)
    return fragments
111
+
112
+
113
def index_list_single_borderless(
    sequence: str,
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
) -> List[List[int]]:
    """
    Return the borderless-mode fragment indices of a single sequence.

    The site positions found by ``find_all_pos_borderless`` are passed
    straight to ``IndexFragList_borderless`` together with ``seed_size``.

    Examples:
        >>> # no enzyme sites => single fragment from 0 to len
        >>> index_list_single_borderless("AAAA", [], 0)
        [[0, 4]]
    """
    return IndexFragList_borderless(
        find_all_pos_borderless(sequence, ligation_site_list),
        seed_size,
    )
128
+
129
+
130
def index_list_borderless(
    Sequences: List[str],
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
) -> Tuple[List[List[int]], List[List[int]]]:
    """
    Return borderless-mode fragment indices for a forward/reverse pair.

    ``Sequences[0]`` is processed first and ``Sequences[1]`` second, each
    through ``index_list_single_borderless`` with the same site list and
    ``seed_size``; the two results are returned as a tuple in that order.

    Examples:
        >>> f,r = index_list_borderless(["AAAA","BBBB"], [], 0)
        >>> f, r
        ([[0, 4]], [[0, 4]])
    """
    return (
        index_list_single_borderless(Sequences[0], ligation_site_list, seed_size),
        index_list_single_borderless(Sequences[1], ligation_site_list, seed_size),
    )
146
+
147
+
148
+ # ------------------- Classic part (simple positions) -----------------------
149
+
150
+
151
def find_positions_for_one_site(text: str, Enzyme: Tuple[re.Pattern, int]) -> List[int]:
    """
    Return ``match_start + offset`` for every match of one site pattern.

    Parameters:
        text (str): The sequence to scan.
        Enzyme (Tuple[re.Pattern, int]): A compiled regex and the offset
            added to the start of each match.

    Returns:
        The shifted positions, in left-to-right match order.

    Examples:
        >>> find_positions_for_one_site("", (re.compile("A"),1))
        []
        >>> find_positions_for_one_site("AZAA", (re.compile("ZAA"),3))
        [4]
        >>> find_positions_for_one_site("XYZ", (re.compile("A"),1))
        []
    """
    pattern, shift = Enzyme
    shifted_positions: List[int] = []
    for match in pattern.finditer(text):
        shifted_positions.append(match.start() + shift)
    return shifted_positions
165
+
166
+
167
def find_all_pos(
    text: str, ligation_site_list: List[Tuple[re.Pattern, int]]
) -> List[int]:
    """
    Return the sorted, de-duplicated boundary positions of ``text``.

    The result always contains ``0`` and ``len(text)``, plus every position
    reported by ``find_positions_for_one_site`` for each entry of
    ``ligation_site_list``.

    Examples:
        >>> find_all_pos("AAAA", [])
        [0, 4]
        >>> find_all_pos("AXA", [(re.compile("XA"),2)])
        [0, 3]
    """
    # A set removes duplicates, e.g. a site position equal to len(text).
    boundaries = {0, len(text)}
    for site in ligation_site_list:
        boundaries.update(find_positions_for_one_site(text, site))
    return sorted(boundaries)
184
+
185
+
186
def IndexFragList(index_list: List[int], seed_size: int) -> List[List[int]]:
    """
    Pair up consecutive positions and keep the pairs that are long enough.

    Parameters:
        index_list (List[int]): Boundary positions; consecutive entries
            delimit one candidate fragment.
        seed_size (int): Minimum length; a pair ``[prev, curr]`` is kept
            when ``curr - prev >= seed_size``.

    Returns:
        The kept ``[prev, curr]`` pairs, in input order.

    Examples:
        >>> IndexFragList([0,5,10], 0)
        [[0, 5], [5, 10]]
        >>> IndexFragList([0,3,5], 4)
        []
    """
    return [
        [left, right]
        for left, right in zip(index_list, index_list[1:])
        if right - left >= seed_size
    ]
201
+
202
+
203
def index_list_single(
    sequence: str,
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
) -> List[List[int]]:
    """
    Return the classic-mode fragment indices of a single sequence.

    The boundary positions from ``find_all_pos`` are handed to
    ``IndexFragList`` together with ``seed_size``.

    Examples:
        >>> index_list_single("AAAA", [], 0)
        [[0, 4]]
        >>> index_list_single("AXA", [(re.compile("XA"),2)], 0)
        [[0, 3]]
    """
    return IndexFragList(find_all_pos(sequence, ligation_site_list), seed_size)
219
+
220
+
221
def index_list(
    Sequences: List[str],
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
) -> Tuple[List[List[int]], List[List[int]]]:
    """
    Return classic-mode fragment indices for a forward/reverse pair.

    ``Sequences[0]`` is processed first and ``Sequences[1]`` second, each
    through ``index_list_single`` with the same site list and ``seed_size``.

    Examples:
        >>> index_list(["AAAA","BBBB"], [], 0)
        ([[0, 4]], [[0, 4]])
    """
    forward_fragments = index_list_single(Sequences[0], ligation_site_list, seed_size)
    reverse_fragments = index_list_single(Sequences[1], ligation_site_list, seed_size)
    return forward_fragments, reverse_fragments