mgnify-pipelines-toolkit 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit has been flagged as potentially problematic; consult the registry's advisory page for details.

@@ -1,327 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
- import argparse
18
- from collections import defaultdict
19
- import os
20
-
21
- from Bio.Seq import Seq
22
- import regex
23
-
24
- from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
25
- primer_regex_query_builder,
26
- get_read_count,
27
- fetch_mcp,
28
- )
29
-
30
-
31
def parse_args():
    """
    Parse command-line arguments for the standard-primer detection script.

    Returns:
        tuple: (input_path, primers_dir, sample_id, output_path) as strings.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-i",
        "--input",
        required=True,
        type=str,
        help="Path to merged FASTQ to look for primers",
    )
    parser.add_argument(
        "-p",
        "--primers",
        required=True,
        type=str,
        help="Path to directory containing standard primers fasta files",
    )
    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
    parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
    args = parser.parse_args()

    # Local renamed from "input" to avoid shadowing the builtin input()
    input_path = args.input
    primers_dir = args.primers
    sample_id = args.sample
    output_path = args.output

    return input_path, primers_dir, sample_id, output_path
59
-
60
-
61
def parse_std_primers(primers):
    """
    Parse the library of standard primers.

    Reads every fasta file in the given directory. Primer names (the fasta
    headers) are labeled with F or R for 5'-3' and 3'-5' primers respectively.

    :param primers: Path to a directory containing one fasta file per region;
        the region name is taken from each file's basename (before the first dot).
    :return: Tuple of two dictionaries, both keyed by region then primer name:
        std_primer_dict_regex
            val: fuzzy-regex pattern built from the primer sequence (5' to 3')
        std_primer_dict
            val: primer sequence, 5' to 3' for forward primers, 3' to 5'
            (complemented, not reversed) for reverse primers
    """

    std_primer_dict_regex = defaultdict(defaultdict)
    std_primer_dict = defaultdict(defaultdict)

    # "primer_files" instead of the original "dir" (shadowed the builtin)
    primer_files = [os.path.join(primers, name) for name in os.listdir(primers)]

    for path in primer_files:
        region = os.path.basename(path).split(".")[0]
        rev_flag = False  # reset per file so a stale flag can't leak across files
        with open(path, "r") as fr:
            key = ""
            for line in fr:
                line = line.strip()
                if not line:
                    continue  # skip blank lines (original crashed on line[0])
                if line.startswith(">"):
                    # Headers containing "R" mark reverse primers; a forward
                    # header now explicitly clears the flag (the original only
                    # ever set it, relying on a sequence line to reset it)
                    rev_flag = "R" in line
                    key = line[1:]
                else:
                    if rev_flag:
                        # Store reverse primers complemented (3' to 5')
                        line = str(Seq(line).complement())
                        rev_flag = False

                    primer = primer_regex_query_builder(line)
                    std_primer_dict_regex[region][key] = primer
                    std_primer_dict[region][key] = line

    return std_primer_dict_regex, std_primer_dict
106
-
107
-
108
def run_primer_matching_once(input_path, input_primer, rev=False):
    """
    Run primer matching using the regex package.

    Takes one primer, strand, and fastq input. Fuzzy matching (encoded in the
    primer pattern) tolerates sequencing errors.

    :param input_path: Path to the fastq file to search.
    :param input_primer: Fuzzy regex pattern for one primer.
    :param rev: Match against the reverse end of the reads when True.
    :return: Total count (float) of reads whose MCP matches the primer.
    """

    match_count = 0.0

    # MCPs (first 50 bases of each read, or last 50 when rev=True) -> counts
    mcp_count_dict = fetch_mcp(input_path, 50, rev=rev)

    for mcp, count in mcp_count_dict.items():
        # Match against the stripped prefix but keep the count paired with its
        # original key. The original rebound the loop variable to mcp.strip()
        # and then indexed the dict with it, which would raise KeyError if a
        # key ever carried surrounding whitespace.
        if regex.match(input_primer, mcp.strip()) is not None:
            match_count += count

    return match_count
128
-
129
-
130
def get_primer_props(std_primer_dict_regex, input_path):
    """
    Look for the standard primers in the input fastq file.

    Will loop through the dictionary of primers, using fuzzy regex matching to find matching primers.
    If a std primer is present above a set threshold proportion, it is collected. Both strands are searched for.
    If there is an std primer for both the F and R strands, the maximum prop for each strand is chosen and the pair
    is output as a combination.

    Returns a list containing two elements:
        max_region: the amplified region the chosen primers belong to
        max_primers: dictionary containing the F and/or R primers that were chosen
    (Returns an empty list when no primer clears the threshold; returns three
    elements [region, {F}, {R}] when a pair is found.)
    """

    threshold = 0.60  # Arbitrary threshold for collecting a matched primer
    read_count = get_read_count(
        input_path, file_type="fastq"
    )  # Get read count of fastq file to calculate proportion with
    res_dict = defaultdict(defaultdict)

    # Loop through every primer region
    for region, primer in std_primer_dict_regex.items():
        # "F" is inserted before "R": later code relies on this insertion
        # order when pairing doubles[region][0]/[1] as forward/reverse
        res_dict[region]["F"] = {}
        res_dict[region]["R"] = {}

        # Loop through every primer of a certain region
        for primer_name, primer_seq in primer.items():

            region_name_str = f"{region};{primer_name}"
            primer_count = 0.0

            # NOTE(review): primer names containing both letters are treated
            # as forward because "F" is tested first — confirm naming scheme
            if "F" in primer_name:
                primer_count = run_primer_matching_once(
                    input_path, primer_seq, rev=False
                )  # Get proportion of a F primer with fuzzy regex matching
            elif "R" in primer_name:
                primer_count = run_primer_matching_once(
                    input_path, primer_seq, rev=True
                )  # Get proportion of a R primer with fuzzy regex matching

            try:
                primer_prop = primer_count / read_count
            except ZeroDivisionError:
                # Empty fastq: treat the proportion as zero rather than crash
                primer_prop = 0

            if "F" in primer_name:
                if (
                    primer_prop > threshold
                ):  # Only collect primer if it's above threshold
                    res_dict[region]["F"][primer_name] = primer_prop
            elif "R" in primer_name:
                if (
                    primer_prop > threshold
                ):  # Only collect primer if it's above threshold
                    res_dict[region]["R"][primer_name] = primer_prop

            print(f"{region_name_str}: {primer_prop}")

        # If an F or/and R primer wasn't found then just remove it from the dictionary
        if res_dict[region]["F"] == {}:
            res_dict[region].pop("F")
        if res_dict[region]["R"] == {}:
            res_dict[region].pop("R")

    singles = defaultdict(str)
    doubles = defaultdict(list)

    double_status = False  # Flag for whether primers were found on both strands

    # Loop through every collected primer and put primers in singles or doubles
    for region in res_dict.keys():
        strands = res_dict[region]

        for strand in strands.keys():
            primers = strands[strand]
            # Keep only the best-scoring primer per strand per region
            max_prop = 0
            max_name = ""
            for primer_name, prop in primers.items():
                if prop > max_prop:
                    max_prop = prop
                    max_name = primer_name

            if len(strands.keys()) == 2:
                double_status = True
                # Appended F first, then R (insertion order of res_dict[region])
                doubles[region].append({max_name: max_prop})
            elif len(strands.keys()) == 1:
                singles[region] = {max_name: max_prop}

    max_region = ""
    max_primers = {}
    max_mean_prop = 0

    # if at least one pair of primers was collected
    # NOTE(review): when ANY region yields a pair, single-strand hits in other
    # regions are ignored entirely — appears intentional, confirm
    if double_status:
        for (
            region
        ) in doubles:  # Loop through all pairs of primers and choose the best one
            primers = doubles[region]

            # primers[0] is the forward entry, primers[1] the reverse (see above)
            f_primer_name = list(primers[0].keys())[0]
            r_primer_name = list(primers[1].keys())[0]
            f_primer_prop = primers[0][f_primer_name]
            r_primer_prop = primers[1][r_primer_name]

            # Rank pairs by the mean of their two proportions
            mean_prop = (f_primer_prop + r_primer_prop) / 2.0
            if mean_prop > max_mean_prop:
                max_mean_prop = mean_prop
                max_region = region
                max_primers = [
                    {f_primer_name: f_primer_prop},
                    {r_primer_name: r_primer_prop},
                ]

    else:
        for region in singles:  # Choose the best single primer
            primer = singles[region]
            primer_name = list(primer.keys())[0]
            prop = primer[primer_name]
            if prop > max_mean_prop:
                max_mean_prop = prop
                max_region = region
                max_primers = {primer_name: prop}

    if max_region == "":
        print("No standard library primers!")
        return []
    elif double_status:
        print("Standard library primers found!")
        print(f"Region: {max_region}")
        print(f"Forward Primer: {max_primers[0]}")
        print(f"Reverse Primer: {max_primers[1]}")

        return [max_region, max_primers[0], max_primers[1]]
    else:
        print("Standard library primer found on one strand!")
        print(f"Region: {max_region}")
        print(f"Primer: {max_primers}")

        return [max_region, max_primers]
269
-
270
-
271
def save_out(results, sample_id, output, std_primer_dict):
    """
    Save found std primers into a fasta file.

    `results` is [] (nothing found), [region, {name: prop}] for a single
    strand, or [region, {f_name: prop}, {r_name: prop}] for a pair.
    """

    txt_path = f"{output}/{sample_id}_std_primer_out.txt"
    fasta_path = f"{output}/{sample_id}_std_primers.fasta"

    with (
        open(txt_path, "w") as fw_out,
        open(fasta_path, "w") as fw_seq,
    ):
        if results == []:
            # Still create both (empty) files so downstream steps find them
            fw_out.write("")
            fw_seq.write("")

        elif len(results) == 2:
            region = results[0]
            primer_name, primer_prop = next(iter(results[1].items()))
            seq = std_primer_dict[region][primer_name]
            # Reverse primers are stored complemented; complement back for output
            if "R" in primer_name:
                seq = str(Seq(seq).complement())
            fw_out.write(f"{region}\n{primer_name}: {primer_prop}")
            fw_seq.write(f">{primer_name}\n{seq}")

        elif len(results) == 3:
            region = results[0]
            f_primer_name, f_primer_prop = next(iter(results[1].items()))
            f_seq = std_primer_dict[region][f_primer_name]
            r_primer_name, r_primer_prop = next(iter(results[2].items()))
            r_seq = str(Seq(std_primer_dict[region][r_primer_name]).complement())

            fw_out.write(f"{region}\n")
            fw_out.write(f"{f_primer_name}: {f_primer_prop}\n")
            fw_out.write(f"{r_primer_name}: {r_primer_prop}")

            fw_seq.write(f">{f_primer_name}\n{f_seq}\n")
            fw_seq.write(f">{r_primer_name}\n{r_seq}\n")
312
-
313
-
314
def main():
    """Entry point: detect standard primers in a fastq and write the outputs."""

    input_path, primer_dir, sample_id, output_dir = parse_args()

    # Parse the std primer library into {region: {primer_name: ...}} dicts
    regex_lib, seq_lib = parse_std_primers(primer_dir)

    # Find all the std primers in the input and select the most common
    chosen = get_primer_props(regex_lib, input_path)

    save_out(chosen, sample_id, output_dir, seq_lib)


if __name__ == "__main__":
    main()
@@ -1,43 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
# IUPAC ambiguous-nucleotide codes mapped to the regex character class of the
# concrete bases each code can stand for (used when building primer patterns).
_AMBIGUOUS_BASES_DICT = {
    "R": "[AG]",
    "Y": "[CT]",
    "S": "[GC]",
    "W": "[AT]",
    "K": "[GT]",
    "M": "[AC]",
    "B": "[CGT]",
    "D": "[AGT]",
    "H": "[ACT]",
    "V": "[ACG]",
    "N": "[ACTG]",
}
30
-
31
# Inverse mapping: a comma-separated, alphabetically ordered list of concrete
# bases back to its single-letter IUPAC ambiguity code.
_AMBIGUOUS_BASES_DICT_REV = {
    "A,G": "R",
    "C,T": "Y",
    "C,G": "S",
    "A,T": "W",
    "G,T": "K",
    "A,C": "M",
    "C,G,T": "B",
    "A,G,T": "D",
    "A,C,T": "H",
    "A,C,G": "V",
    "A,C,G,T": "N",
}