mgnify-pipelines-toolkit 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mgnify-pipelines-toolkit might be problematic; consult the package registry's advisory page for more details.
- mgnify_pipelines_toolkit/constants/thresholds.py +0 -4
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/METADATA +1 -2
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/RECORD +7 -14
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/entry_points.txt +0 -5
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +0 -221
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -164
- mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -214
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +0 -175
- mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -111
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -327
- mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -43
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/top_level.txt +0 -0
|
@@ -1,327 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
|
|
4
|
-
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
|
-
#
|
|
6
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
-
# you may not use this file except in compliance with the License.
|
|
8
|
-
# You may obtain a copy of the License at
|
|
9
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
-
#
|
|
11
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
-
# See the License for the specific language governing permissions and
|
|
15
|
-
# limitations under the License.
|
|
16
|
-
|
|
17
|
-
import argparse
|
|
18
|
-
from collections import defaultdict
|
|
19
|
-
import os
|
|
20
|
-
|
|
21
|
-
from Bio.Seq import Seq
|
|
22
|
-
import regex
|
|
23
|
-
|
|
24
|
-
from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
|
|
25
|
-
primer_regex_query_builder,
|
|
26
|
-
get_read_count,
|
|
27
|
-
fetch_mcp,
|
|
28
|
-
)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def parse_args():
    """Parse command-line arguments for the standard-primer matching script.

    Returns:
        Tuple of (input_path, primers_dir, sample_id, output_path), all strings.
    """

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-i",
        "--input",
        required=True,
        type=str,
        help="Path to merged FASTQ to look for primers",
    )
    parser.add_argument(
        "-p",
        "--primers",
        required=True,
        type=str,
        help="Path to directory containing standard primers fasta files",
    )
    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
    parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
    args = parser.parse_args()

    # Return the parsed values directly — the old intermediate local `input`
    # shadowed the builtin of the same name.
    return args.input, args.primers, args.sample, args.output
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def parse_std_primers(primers):
    """
    Parse the library of standard primers.

    Reads the fasta files in the given directory.
    Primer names (which are the fasta headers) are labeled with F or R for 5'-3' and 3'-5' primers respectively

    :param primers: Path to a directory containing standard-primer fasta files;
        each file's basename (before the first ".") is used as the region name.

    Returns two dictionaries:
    std_primer_dict_regex
        key: region+primer name
        val: fuzzy-matching regex pattern built from the primer sequence
    std_primer_dict
        key: region+primer name
        val: primer sequence from 5' to 3' for forward primers, 3' to 5' for reverse
    """

    std_primer_dict_regex = defaultdict(defaultdict)
    std_primer_dict = defaultdict(defaultdict)

    # Renamed from `dir` to avoid shadowing the builtin
    fasta_paths = [f"{primers}/{path}" for path in os.listdir(primers)]

    rev_flag = False

    for path in fasta_paths:
        region = path.split("/")[-1].split(".")[0]
        with open(path, "r") as fr:
            key = ""
            for line in fr:
                line = line.strip()
                if not line:
                    # Skip blank lines — indexing line[0] on "" would raise IndexError
                    continue
                if line.startswith(">"):
                    if "R" in line:  # If a primer is a reverse primer
                        rev_flag = True
                    key = line[1:]
                else:
                    if rev_flag:
                        # Reverse primer sequences are complemented before storage
                        line = str(Seq(line).complement())
                        rev_flag = False

                    primer = primer_regex_query_builder(line)
                    std_primer_dict_regex[region][key] = primer
                    std_primer_dict[region][key] = line

    return std_primer_dict_regex, std_primer_dict
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
def run_primer_matching_once(input_path, input_primer, rev=False):
    """
    Run primer matching using the regex package.

    Takes one primer, strand, and fastq input
    Uses fuzzy matching to allow for at most one error (for sequencing errors)

    :param input_path: Path to the fastq file to search.
    :param input_primer: Fuzzy regex pattern for one primer.
    :param rev: If True, search the reverse strand prefixes.

    Returns number of reads matching given primer
    """

    match_count = 0.0

    # Counts of the most-common prefixes (first 50 bases) of the reads
    mcp_count_dict = fetch_mcp(input_path, 50, rev=rev)

    # Iterate items() so each count arrives alongside its prefix; the old code
    # stripped the key and then re-looked it up, which both did a redundant
    # lookup and risked a KeyError if a key carried surrounding whitespace.
    for mcp, count in mcp_count_dict.items():
        if regex.match(input_primer, mcp.strip()) is not None:
            match_count += count

    return match_count
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
def get_primer_props(std_primer_dict_regex, input_path):
    """
    Look for the standard primers in the input fastq file.

    Will loop through the dictionary of primers, using fuzzy regex matching to find matching primers.
    If a std primer is present above a set threshold proportion, it is collected. Both strands are searched for.
    If there is an std primer for both the F and R strands, the maximum prop for each strand is chosen and the pair
    is output as a combination.

    Returns an empty list if nothing was found, otherwise a list of two or three elements:
        max_region: the amplified region the chosen primers belong to
        followed by one dict ({name: proportion}) per chosen primer — one for a
        single-strand hit, two (forward then reverse) for a pair.
    """

    threshold = 0.60  # Arbitrary threshold for collecting a matched primer
    read_count = get_read_count(
        input_path, file_type="fastq"
    )  # Get read count of fastq file to calculate proportion with
    res_dict = defaultdict(defaultdict)

    # Loop through every primer region
    for region, primer in std_primer_dict_regex.items():
        # "F" is inserted before "R" on purpose: the pairing loop below relies
        # on this insertion order to emit forward before reverse.
        res_dict[region]["F"] = {}
        res_dict[region]["R"] = {}

        # Loop through every primer of a certain region
        for primer_name, primer_seq in primer.items():

            region_name_str = f"{region};{primer_name}"
            primer_count = 0.0

            # Strand is inferred from the primer name; a name containing
            # neither "F" nor "R" keeps primer_count at 0 and is never stored.
            if "F" in primer_name:
                primer_count = run_primer_matching_once(
                    input_path, primer_seq, rev=False
                )  # Get proportion of a F primer with fuzzy regex matching
            elif "R" in primer_name:
                primer_count = run_primer_matching_once(
                    input_path, primer_seq, rev=True
                )  # Get proportion of a R primer with fuzzy regex matching

            try:
                primer_prop = primer_count / read_count
            except ZeroDivisionError:
                # Empty fastq: treat as "primer not present"
                primer_prop = 0

            if "F" in primer_name:
                if (
                    primer_prop > threshold
                ):  # Only collect primer if it's above threshold
                    res_dict[region]["F"][primer_name] = primer_prop
            elif "R" in primer_name:
                if (
                    primer_prop > threshold
                ):  # Only collect primer if it's above threshold
                    res_dict[region]["R"][primer_name] = primer_prop

            print(f"{region_name_str}: {primer_prop}")

        # If an F or/and R primer wasn't found then just remove it from the dictionary
        if res_dict[region]["F"] == {}:
            res_dict[region].pop("F")
        if res_dict[region]["R"] == {}:
            res_dict[region].pop("R")

    singles = defaultdict(str)
    doubles = defaultdict(list)

    double_status = False  # Flag for whether primers were found on both strands

    # Loop through every collected primer and put primers in singles or doubles
    for region in res_dict.keys():
        strands = res_dict[region]

        for strand in strands.keys():
            primers = strands[strand]
            # Keep only the highest-proportion primer per strand
            max_prop = 0
            max_name = ""
            for primer_name, prop in primers.items():
                if prop > max_prop:
                    max_prop = prop
                    max_name = primer_name

            if len(strands.keys)() == 2 if False else len(strands.keys()) == 2:
                double_status = True
                doubles[region].append({max_name: max_prop})
            elif len(strands.keys()) == 1:
                singles[region] = {max_name: max_prop}

    max_region = ""
    max_primers = {}
    max_mean_prop = 0

    # if at least one pair of primers was collected
    if double_status:
        for (
            region
        ) in doubles:  # Loop through all pairs of primers and choose the best one
            primers = doubles[region]

            # NOTE(review): primers[0]/primers[1] are forward/reverse by the
            # F-before-R insertion order established above — confirm if the
            # strand keys are ever populated another way.
            f_primer_name = list(primers[0].keys())[0]
            r_primer_name = list(primers[1].keys())[0]
            f_primer_prop = primers[0][f_primer_name]
            r_primer_prop = primers[1][r_primer_name]

            # Pairs compete on the mean of their two proportions
            mean_prop = (f_primer_prop + r_primer_prop) / 2.0
            if mean_prop > max_mean_prop:
                max_mean_prop = mean_prop
                max_region = region
                max_primers = [
                    {f_primer_name: f_primer_prop},
                    {r_primer_name: r_primer_prop},
                ]

    else:
        for region in singles:  # Choose the best single primer
            primer = singles[region]
            primer_name = list(primer.keys())[0]
            prop = primer[primer_name]
            if prop > max_mean_prop:
                max_mean_prop = prop
                max_region = region
                max_primers = {primer_name: prop}

    if max_region == "":
        print("No standard library primers!")
        return []
    elif double_status:
        print("Standard library primers found!")
        print(f"Region: {max_region}")
        print(f"Forward Primer: {max_primers[0]}")
        print(f"Reverse Primer: {max_primers[1]}")

        return [max_region, max_primers[0], max_primers[1]]
    else:
        print("Standard library primer found on one strand!")
        print(f"Region: {max_region}")
        print(f"Primer: {max_primers}")

        return [max_region, max_primers]
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
def save_out(results, sample_id, output, std_primer_dict):
    """
    Save found std primers into a fasta file plus a summary text file.

    :param results: [] when no primers were found; [region, {name: prop}] for a
        single primer; [region, {f_name: prop}, {r_name: prop}] for a pair.
    :param sample_id: Sample ID used to name the two output files.
    :param output: Output directory path.
    :param std_primer_dict: region -> primer name -> primer sequence.
    """

    with (
        open(f"{output}/{sample_id}_std_primer_out.txt", "w") as fw_out,
        open(f"{output}/{sample_id}_std_primers.fasta", "w") as fw_seq,
    ):
        if results == []:
            # Nothing found: still create both files, but empty
            fw_out.write("")
            fw_seq.write("")

        elif len(results) == 2:
            region = results[0]
            # Extract the single primer's name and proportion once, instead of
            # repeating list(results[1].keys())[0] for every lookup
            primer_name, primer_prop = next(iter(results[1].items()))
            seq = std_primer_dict[region][primer_name]
            if "R" in primer_name:
                # Reverse primers are stored complemented; undo for output
                seq = str(Seq(seq).complement())
            fw_out.write(f"{region}\n")
            fw_out.write(f"{primer_name}: {primer_prop}")

            # Trailing newline added for consistency with the paired case below
            fw_seq.write(f">{primer_name}\n{seq}\n")

        elif len(results) == 3:
            region = results[0]
            f_primer_name, f_primer_prop = next(iter(results[1].items()))
            f_seq = std_primer_dict[region][f_primer_name]
            r_primer_name, r_primer_prop = next(iter(results[2].items()))
            r_seq = std_primer_dict[region][r_primer_name]
            r_seq = str(Seq(r_seq).complement())

            fw_out.write(f"{region}\n")
            fw_out.write(f"{f_primer_name}: {f_primer_prop}\n")
            fw_out.write(f"{r_primer_name}: {r_primer_prop}")

            fw_seq.write(f">{f_primer_name}\n{f_seq}\n")
            fw_seq.write(f">{r_primer_name}\n{r_seq}\n")
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
def main():
    """Entry point: locate standard primers in a fastq file and write the results."""

    fastq_path, primer_dir, sample_id, out_dir = parse_args()
    # Build the primer lookups (regex patterns and raw sequences) from the library
    regex_lookup, seq_lookup = parse_std_primers(primer_dir)
    # Search the fastq for every standard primer and pick the best match(es)
    chosen = get_primer_props(regex_lookup, fastq_path)
    save_out(chosen, sample_id, out_dir, seq_lookup)


if __name__ == "__main__":
    main()
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
|
|
4
|
-
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
|
-
#
|
|
6
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
-
# you may not use this file except in compliance with the License.
|
|
8
|
-
# You may obtain a copy of the License at
|
|
9
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
-
#
|
|
11
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
-
# See the License for the specific language governing permissions and
|
|
15
|
-
# limitations under the License.
|
|
16
|
-
|
|
17
|
-
# IUPAC ambiguity code -> regex character class of the concrete bases it
# represents, for building primer-matching regex patterns.
_AMBIGUOUS_BASES_DICT = {
    "R": "[AG]",
    "Y": "[CT]",
    "S": "[GC]",
    "W": "[AT]",
    "K": "[GT]",
    "M": "[AC]",
    "B": "[CGT]",
    "D": "[AGT]",
    "H": "[ACT]",
    "V": "[ACG]",
    "N": "[ACTG]",
}

# Reverse mapping: comma-joined base set -> IUPAC ambiguity code.
# NOTE(review): keys appear to assume alphabetically sorted, comma-joined
# bases (e.g. "A,G") — confirm against the code that constructs the lookup key.
_AMBIGUOUS_BASES_DICT_REV = {
    "A,G": "R",
    "C,T": "Y",
    "C,G": "S",
    "A,T": "W",
    "G,T": "K",
    "A,C": "M",
    "C,G,T": "B",
    "A,G,T": "D",
    "A,C,T": "H",
    "A,C,G": "V",
    "A,C,G,T": "N",
}
|
|
File without changes
|
{mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/top_level.txt
RENAMED
|
File without changes
|