mgnify-pipelines-toolkit 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of mgnify-pipelines-toolkit might be problematic.
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +74 -54
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +69 -42
- mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +120 -66
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +74 -45
- mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +277 -148
- mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +45 -28
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +53 -32
- mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +54 -16
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +29 -12
- mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +29 -19
- mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +23 -13
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +127 -89
- mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +138 -0
- mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +55 -26
- mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +19 -13
- mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +66 -0
- mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +2 -2
- mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +3 -5
- mgnify_pipelines_toolkit/constants/regex_fasta_header.py +20 -0
- mgnify_pipelines_toolkit/constants/tax_ranks.py +21 -2
- mgnify_pipelines_toolkit/constants/thresholds.py +4 -1
- mgnify_pipelines_toolkit/constants/var_region_coordinates.py +4 -4
- mgnify_pipelines_toolkit/utils/__init__.py +0 -0
- mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +144 -0
- mgnify_pipelines_toolkit/utils/get_mpt_version.py +26 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/METADATA +18 -1
- mgnify_pipelines_toolkit-0.1.6.dist-info/RECORD +34 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/entry_points.txt +4 -0
- mgnify_pipelines_toolkit-0.1.4.dist-info/RECORD +0 -28
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py

@@ -17,31 +17,48 @@
 import argparse
 from collections import defaultdict
 import os
-import subprocess
 
 from Bio.Seq import Seq
 import regex
 
-from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import primer_regex_query_builder, get_read_count, fetch_mcp
+from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
+    primer_regex_query_builder,
+    get_read_count,
+    fetch_mcp,
+)
+
 
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("-i", "--input", required=True, type=str, help="Path to merged FASTQ to look for primers")
-    parser.add_argument("-p", "--primers", required=True, type=str, help="Path to directory containing standard primers fasta files")
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to merged FASTQ to look for primers",
+    )
+    parser.add_argument(
+        "-p",
+        "--primers",
+        required=True,
+        type=str,
+        help="Path to directory containing standard primers fasta files",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
     args = parser.parse_args()
-
-    _INPUT = args.input
-    _PRIMERS = args.primers
-    _SAMPLE = args.sample
-    _OUTPUT = args.output
 
-
+    input = args.input
+    primers = args.primers
+    sample = args.sample
+    output = args.output
+
+    return input, primers, sample, output
+
 
-def parse_std_primers(_PRIMERS):
+def parse_std_primers(primers):
     """
     Parse the library of standard primers.
 
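The parse_args() change above is more than reformatting: the old function assigned _INPUT, _PRIMERS, _SAMPLE and _OUTPUT as function locals and never returned them, so callers had nothing to unpack. The new version hands the four values back explicitly, and the rest of the script consumes them as:

    input, primers, sample, output = parse_args()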
@@ -60,19 +77,19 @@ def parse_std_primers(_PRIMERS):
     std_primer_dict_regex = defaultdict(defaultdict)
     std_primer_dict = defaultdict(defaultdict)
 
-    dir = os.listdir(_PRIMERS)
-    dir = [f'{_PRIMERS}/{path}' for path in dir]
-
+    dir = os.listdir(primers)
+    dir = [f"{primers}/{path}" for path in dir]
+
     rev_flag = False
 
     for path in dir:
-        region = path.split('/')[-1].split('.')[0]
-        with open(path, 'r') as fr:
-            key = ''
+        region = path.split("/")[-1].split(".")[0]
+        with open(path, "r") as fr:
+            key = ""
             for line in fr:
                 line = line.strip()
-                if line[0] == '>':
-                    if 'R' in line:  # If a primer is a reverse primer
+                if line[0] == ">":
+                    if "R" in line:  # If a primer is a reverse primer
                         rev_flag = True
                     key = line[1:]
                 else:
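parse_std_primers() takes the region name from each file name and flags any header containing "R" as a reverse primer, so the primer library directory is expected to hold FASTA files shaped roughly like the sketch below (the file name and sequences are illustrative, not taken from the package):

    # standard_primers/16S_V3-V4.fasta  ->  region "16S_V3-V4"
    >341F
    CCTACGGGNGGCWGCAG
    >805R
    GACTACHVGGGTATCTAATCC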
@@ -87,6 +104,7 @@ def parse_std_primers(_PRIMERS):
 
     return std_primer_dict_regex, std_primer_dict
 
+
 def run_primer_matching_once(input_path, input_primer, rev=False):
     """
     Run primer matching using the regex package.
@@ -103,11 +121,12 @@ def run_primer_matching_once(input_path, input_primer, rev=False):
     for mcp in mcp_count_dict.keys():
         mcp = mcp.strip()
         res = regex.match(input_primer, mcp)
-        if res != None:
+        if res is not None:
             match_count += mcp_count_dict[mcp]
 
     return match_count
 
+
 def get_primer_props(std_primer_dict_regex, input_path):
     """
     Look for the standard primers in the input fastq file.
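The `regex.match(input_primer, mcp)` call uses the third-party regex package, whose fuzzy-matching syntax lets a primer pattern tolerate sequencing errors; the actual query string is built by primer_regex_query_builder(), which this diff does not show. A minimal sketch with a hypothetical pattern:

    import regex

    # {e<=1} permits at most one error (substitution, insertion or deletion)
    res = regex.match("(?:CCTACGGG){e<=1}", "CCTACGGT")
    print(res is not None)  # True: one substitution is within tolerance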
@@ -122,93 +141,107 @@ def get_primer_props(std_primer_dict_regex, input_path):
     max_primers: dictionary containing the F and/or R primers that were chosen
     """
 
-    threshold = 0.60
-    read_count = get_read_count(input_path, 'fastq') # Get read count of fastq file to calculate proportion with
+    threshold = 0.60  # Arbitrary threshold for collecting a matched primer
+    read_count = get_read_count(
+        input_path, "fastq"
+    )  # Get read count of fastq file to calculate proportion with
     res_dict = defaultdict(defaultdict)
 
     # Loop through every primer region
     for region, primer in std_primer_dict_regex.items():
-        res_dict[region]['F'] = {}
-        res_dict[region]['R'] = {}
+        res_dict[region]["F"] = {}
+        res_dict[region]["R"] = {}
 
         # Loop through every primer of a certain region
         for primer_name, primer_seq in primer.items():
-
-            region_name_str = f'{region};{primer_name}'
+
+            region_name_str = f"{region};{primer_name}"
             primer_count = 0.0
 
-            if 'F' in primer_name:
-                primer_count = run_primer_matching_once(input_path, primer_seq, rev=False) # Get proportion of a F primer with fuzzy regex matching
-            elif 'R' in primer_name:
-                primer_count = run_primer_matching_once(input_path, primer_seq, rev=True) # Get proportion of a R primer with fuzzy regex matching
+            if "F" in primer_name:
+                primer_count = run_primer_matching_once(
+                    input_path, primer_seq, rev=False
+                )  # Get proportion of a F primer with fuzzy regex matching
+            elif "R" in primer_name:
+                primer_count = run_primer_matching_once(
+                    input_path, primer_seq, rev=True
+                )  # Get proportion of a R primer with fuzzy regex matching
 
             try:
                 primer_prop = primer_count / read_count
             except ZeroDivisionError:
                 primer_prop = 0
 
-            if 'F' in primer_name:
-                if primer_prop > threshold: # Only collect primer if it's above threshold
-                    res_dict[region]['F'][primer_name] = primer_prop
-            elif 'R' in primer_name:
-                if primer_prop > threshold: # Only collect primer if it's above threshold
-                    res_dict[region]['R'][primer_name] = primer_prop
+            if "F" in primer_name:
+                if (
+                    primer_prop > threshold
+                ):  # Only collect primer if it's above threshold
+                    res_dict[region]["F"][primer_name] = primer_prop
+            elif "R" in primer_name:
+                if (
+                    primer_prop > threshold
+                ):  # Only collect primer if it's above threshold
+                    res_dict[region]["R"][primer_name] = primer_prop
+
+            print(f"{region_name_str}: {primer_prop}")
 
-            print(f'{region_name_str}: {primer_prop}')
-
         # If an F or/and R primer wasn't found then just remove it from the dictionary
-        if res_dict[region]['F'] == {}:
-            res_dict[region].pop('F')
-        if res_dict[region]['R'] == {}:
-            res_dict[region].pop('R')
-
+        if res_dict[region]["F"] == {}:
+            res_dict[region].pop("F")
+        if res_dict[region]["R"] == {}:
+            res_dict[region].pop("R")
 
     singles = defaultdict(str)
     doubles = defaultdict(list)
 
-    double_status = False
+    double_status = False  # Flag for whether primers were found on both strands
 
     # Loop through every collected primer and put primers in singles or doubles
     for region in res_dict.keys():
         strands = res_dict[region]
-
+
         for strand in strands.keys():
            primers = strands[strand]
            max_prop = 0
-            max_name = ''
+            max_name = ""
            for primer_name, prop in primers.items():
                if prop > max_prop:
                    max_prop = prop
                    max_name = primer_name
-
+
            if len(strands.keys()) == 2:
                double_status = True
                doubles[region].append({max_name: max_prop})
            elif len(strands.keys()) == 1:
                singles[region] = {max_name: max_prop}
 
-    max_region = ''
+    max_region = ""
     max_primers = {}
     max_mean_prop = 0
-
+
     # if at least one pair of primers was collected
     if double_status:
-        for region in doubles: # Loop through all pairs of primers and choose the best one
+        for (
+            region
+        ) in doubles:  # Loop through all pairs of primers and choose the best one
             primers = doubles[region]
 
             f_primer_name = list(primers[0].keys())[0]
             r_primer_name = list(primers[1].keys())[0]
             f_primer_prop = primers[0][f_primer_name]
             r_primer_prop = primers[1][r_primer_name]
-
+
             mean_prop = (f_primer_prop + r_primer_prop) / 2.0
             if mean_prop > max_mean_prop:
                 max_mean_prop = mean_prop
                 max_region = region
-                max_primers = [{f_primer_name: f_primer_prop}, {r_primer_name: r_primer_prop}]
+                max_primers = [
+                    {f_primer_name: f_primer_prop},
+                    {r_primer_name: r_primer_prop},
+                ]
 
     else:
-        for region in singles:
+        for region in singles:  # Choose the best single primer
             primer = singles[region]
             primer_name = list(primer.keys())[0]
             prop = primer[primer_name]
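To make the threshold concrete: if get_read_count() reports 10,000 reads and 7,200 of the counted MCPs match a forward primer, primer_prop = 7200 / 10000 = 0.72, which clears the 0.60 cutoff and the primer is stored under res_dict[region]["F"]; a primer with 5,000 matches (0.50) would be printed but discarded. (These counts are illustrative, not from a real run.)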
@@ -217,23 +250,22 @@ def get_primer_props(std_primer_dict_regex, input_path):
                max_region = region
                max_primers = {primer_name: prop}
 
-    if max_region == '':
-        print('No standard library primers!')
-        return([])
+    if max_region == "":
+        print("No standard library primers!")
+        return []
     elif double_status:
-        print('Standard library primers found!')
-        print(f'Region: {max_region}')
-        print(f'Forward Primer: {max_primers[0]}')
-        print(f'Reverse Primer: {max_primers[1]}')
+        print("Standard library primers found!")
+        print(f"Region: {max_region}")
+        print(f"Forward Primer: {max_primers[0]}")
+        print(f"Reverse Primer: {max_primers[1]}")
 
-        return([max_region, max_primers[0], max_primers[1]])
+        return [max_region, max_primers[0], max_primers[1]]
     else:
-        print('Standard library primer found on one strand!')
-        print(f'Region: {max_region}')
-        print(f'Primer: {max_primers}')
-
-        return([max_region, max_primers])
+        print("Standard library primer found on one strand!")
+        print(f"Region: {max_region}")
+        print(f"Primer: {max_primers}")
 
+        return [max_region, max_primers]
 
 
 def save_out(results, sample_id, output, std_primer_dict):
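get_primer_props() therefore returns a list in one of three shapes: [] when nothing clears the threshold, [max_region, max_primers] when only one strand produced a winner, and [max_region, f_primer, r_primer] when both did. save_out() below distinguishes the cases purely by len(results) (0, 2 or 3).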
@@ -241,24 +273,26 @@ def save_out(results, sample_id, output, std_primer_dict):
     Save found std primers into a fasta file.
     """
 
-    with open(f'{output}/{sample_id}_std_primer_out.txt', 'w') as fw_out, open(f'{output}/{sample_id}_std_primers.fasta', 'w') as fw_seq:
+    with (
+        open(f"{output}/{sample_id}_std_primer_out.txt", "w") as fw_out,
+        open(f"{output}/{sample_id}_std_primers.fasta", "w") as fw_seq,
+    ):
         if results == []:
-            fw_out.write('')
-            fw_seq.write('')
-
+            fw_out.write("")
+            fw_seq.write("")
+
         elif len(results) == 2:
             region = results[0]
             primer_name = list(results[1].keys())[0]
             primer_prop = results[1][list(results[1].keys())[0]]
             seq = std_primer_dict[region][primer_name]
-            if 'R' in primer_name:
+            if "R" in primer_name:
                 seq = str(Seq(seq).complement())
-            fw_out.write(f'{region}\n')
-            fw_out.write(f'{primer_name}: {primer_prop}')
+            fw_out.write(f"{region}\n")
+            fw_out.write(f"{primer_name}: {primer_prop}")
+
+            fw_seq.write(f">{primer_name}\n{seq}")
 
-            fw_seq.write(f'>{primer_name}\n{seq}')
-
-
         elif len(results) == 3:
             region = results[0]
             f_primer_name = list(results[1].keys())[0]
@@ -268,22 +302,26 @@ def save_out(results, sample_id, output, std_primer_dict):
             r_primer_prop = results[2][list(results[2].keys())[0]]
             r_seq = std_primer_dict[region][r_primer_name]
             r_seq = str(Seq(r_seq).complement())
-
 
-            fw_out.write(f'{region}\n')
-            fw_out.write(f'{f_primer_name}: {f_primer_prop}\n')
-            fw_out.write(f'{r_primer_name}: {r_primer_prop}')
+            fw_out.write(f"{region}\n")
+            fw_out.write(f"{f_primer_name}: {f_primer_prop}\n")
+            fw_out.write(f"{r_primer_name}: {r_primer_prop}")
+
+            fw_seq.write(f">{f_primer_name}\n{f_seq}\n")
+            fw_seq.write(f">{r_primer_name}\n{r_seq}\n")
 
-            fw_seq.write(f'>{f_primer_name}\n{f_seq}\n')
-            fw_seq.write(f'>{r_primer_name}\n{r_seq}\n')
 
-
 def main():
-
-    parse_args()
-    std_primer_dict_regex, std_primer_dict = parse_std_primers(_PRIMERS) # Parse std primer library into dictionaries
-    results = get_primer_props(std_primer_dict_regex, _INPUT) # Find all the std primers in the input and select most common
-    save_out(results, _SAMPLE, _OUTPUT, std_primer_dict)
+
+    input, primers, sample, output = parse_args()
+    std_primer_dict_regex, std_primer_dict = parse_std_primers(
+        primers
+    )  # Parse std primer library into dictionaries
+    results = get_primer_props(
+        std_primer_dict_regex, input
+    )  # Find all the std primers in the input and select most common
+    save_out(results, sample, output, std_primer_dict)
+
 
 if __name__ == "__main__":
-    main()
+    main()
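Since the module keeps its `if __name__ == "__main__"` guard, the rewritten script can be exercised end to end with something like the following (paths and sample ID are hypothetical):

    python -m mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching \
        -i merged_reads.fastq -p standard_primers/ -s SRR0000001 -o out

On success it writes out/SRR0000001_std_primer_out.txt and out/SRR0000001_std_primers.fasta, the latter holding the chosen primer sequences (reverse primers complemented via Bio.Seq).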
mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py

@@ -0,0 +1,138 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from collections import defaultdict
+import gzip
+import json
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+def parse_args():
+
+    parser = argparse.ArgumentParser(
+        description="Script that sanity checks whether the strand suffix of a FASTQ file matches the headers inside the FASTQ file."
+    )
+    parser.add_argument(
+        "-f",
+        "--fwd",
+        required=True,
+        type=str,
+        help="Input forward read headers file (PE) OR SE read file",
+    )
+    parser.add_argument(
+        "-r",
+        "--rev",
+        required=False,
+        type=str,
+        help="Input reverse read headers file (PE)",
+    )
+    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
+    parser.add_argument("-o", "--output", required=True, type=str, help="Output")
+
+    args = parser.parse_args()
+
+    fwd = args.fwd
+    rev = args.rev
+    sample = args.sample
+    output = args.output
+
+    return fwd, rev, sample, output
+
+
+def choose_open_func(file_path):
+
+    open_func = open
+
+    if file_path[-2:] == "gz":
+        open_func = gzip.open
+
+    return open_func
+
+
+def main():
+
+    fwd, rev, sample, output = parse_args()
+
+    files_to_parse = []
+
+    if "_1" in fwd:
+        if not rev:
+            logging.error(
+                'No reverse file given, yet given forward file has the "_1" suffix implying it\'s paired-end. '
+                + "Either supply the reverse file, or supply a single-end file."
+            )
+        elif "_2" not in rev:
+            logging.error(
+                'The expected suffix "_2" for a supplied reverse file is missing. Please verify your inputs.'
+            )
+        else:
+            files_to_parse = [fwd, rev]
+
+    else:
+        files_to_parse = [fwd]
+
+    open_func = choose_open_func(
+        fwd
+    )  # Choose between gzip.open() and open() by checking the file extension
+    reads_with_err = defaultdict(list)
+
+    for file in files_to_parse:
+
+        header_str = ""
+
+        if "_1" in file:
+            header_str = "/1"
+        elif "_2" in file:
+            header_str = "/2"
+        else:
+            header_str = "/1"  # SE files still have "/1" in the headers
+
+        for counter, line in enumerate(open_func(file)):
+
+            if counter % 4 == 0:  # Only do stuff every four lines to hit the header
+                line = line.decode("ascii").strip()
+                curr_read_strand = line[-2:]
+
+                if curr_read_strand != header_str:
+                    reads_with_err[file].append(line)
+                    reads_with_err["total"].append(1)
+
+    if len(reads_with_err) != 0:
+
+        num_of_reads_with_err = len(reads_with_err["total"])
+        reads_with_err["total"] = num_of_reads_with_err
+
+        logging.error(
+            f"Found {num_of_reads_with_err} reads with header strands that don't match file suffix. See log file at {output}/{sample}_suffix_header_err.json"  # noqa: E501
+        )
+
+        with open(
+            f"{output}/{sample}_suffix_header_err.json", "w"
+        ) as fw:  # Writes JSON file containing the headers of reads with errors
+            json.dump(reads_with_err, fw)
+
+    else:
+        with open(
+            f"{output}/{sample}_suffix_header_err.json", "w"
+        ) as fw:  # Creates an empty file if there are no errors
+            print("No errors.")
+
+
+if __name__ == "__main__":
+    main()
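The new check assumes ENA-style FASTQ headers whose last two characters encode the strand, matching the file's _1/_2 suffix; e.g. a record in SRR0000001_1.fastq.gz (hypothetical read) would be:

    @SRR0000001.1 1/1
    GATTACA
    +
    IIIIIII

Every fourth line (counter % 4 == 0) is treated as a header, and any mismatching header is collected into {output}/{sample}_suffix_header_err.json. Note that line.decode("ascii") presumes byte input, i.e. a gzipped file opened by gzip.open; an uncompressed file opened in text mode by plain open() would raise AttributeError on str.decode.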
mgnify_pipelines_toolkit/analysis/shared/get_subunits.py

@@ -36,17 +36,31 @@ LSU_rRNA_eukarya = "LSU_rRNA_eukarya"
 
 def set_model_names(prefix, name, directory):
     pattern_dict = {}
-    pattern_dict[SSU] = os.path.join(directory, f'{name}_SSU.fasta')
-    pattern_dict[SSU_rRNA_archaea] = os.path.join(directory, f'{prefix}{name}_{SSU_rRNA_archaea}.RF01959.fa')
-    pattern_dict[SSU_rRNA_bacteria] = os.path.join(directory, f'{prefix}{name}_{SSU_rRNA_bacteria}.RF00177.fa')
-    pattern_dict[SSU_rRNA_eukarya] = os.path.join(directory, f'{prefix}{name}_{SSU_rRNA_eukarya}.RF01960.fa')
-    pattern_dict[SSU_rRNA_microsporidia] = os.path.join(directory, f'{prefix}{name}_{SSU_rRNA_microsporidia}.RF02542.fa')
-    pattern_dict[LSU] = os.path.join(directory, f'{name}_LSU.fasta')
-    pattern_dict[LSU_rRNA_archaea] = os.path.join(directory, f'{prefix}{name}_{LSU_rRNA_archaea}.RF02540.fa')
-    pattern_dict[LSU_rRNA_bacteria] = os.path.join(directory, f'{prefix}{name}_{LSU_rRNA_bacteria}.RF02541.fa')
-    pattern_dict[LSU_rRNA_eukarya] = os.path.join(directory, f'{prefix}{name}_{LSU_rRNA_eukarya}.RF02543.fa')
-    pattern_dict[Seq5S] = os.path.join(directory, f'{name}_5S.fa')
-    pattern_dict[Seq5_8S] = os.path.join(directory, f'{name}_5_8S.fa')
+    pattern_dict[SSU] = os.path.join(directory, f"{name}_SSU.fasta")
+    pattern_dict[SSU_rRNA_archaea] = os.path.join(
+        directory, f"{prefix}{name}_{SSU_rRNA_archaea}.RF01959.fa"
+    )
+    pattern_dict[SSU_rRNA_bacteria] = os.path.join(
+        directory, f"{prefix}{name}_{SSU_rRNA_bacteria}.RF00177.fa"
+    )
+    pattern_dict[SSU_rRNA_eukarya] = os.path.join(
+        directory, f"{prefix}{name}_{SSU_rRNA_eukarya}.RF01960.fa"
+    )
+    pattern_dict[SSU_rRNA_microsporidia] = os.path.join(
+        directory, f"{prefix}{name}_{SSU_rRNA_microsporidia}.RF02542.fa"
+    )
+    pattern_dict[LSU] = os.path.join(directory, f"{name}_LSU.fasta")
+    pattern_dict[LSU_rRNA_archaea] = os.path.join(
+        directory, f"{prefix}{name}_{LSU_rRNA_archaea}.RF02540.fa"
+    )
+    pattern_dict[LSU_rRNA_bacteria] = os.path.join(
+        directory, f"{prefix}{name}_{LSU_rRNA_bacteria}.RF02541.fa"
+    )
+    pattern_dict[LSU_rRNA_eukarya] = os.path.join(
+        directory, f"{prefix}{name}_{LSU_rRNA_eukarya}.RF02543.fa"
+    )
+    pattern_dict[Seq5S] = os.path.join(directory, f"{name}_5S.fa")
+    pattern_dict[Seq5_8S] = os.path.join(directory, f"{name}_5_8S.fa")
     return pattern_dict
 
 
@@ -69,42 +83,57 @@ def main():
     directory = "sequence-categorisation"
     if not os.path.exists(directory):
         os.makedirs(directory)
-
-    if not os.path.exists("sequence-categorisation/ncRNA"):
-        os.makedirs("sequence-categorisation/ncRNA")
+    directory_ncrna = os.path.join("sequence-categorisation", "ncRNA")
+    if not os.path.exists(directory_ncrna):
+        os.makedirs(directory_ncrna)
 
-    print('Start fasta mode')
+    print("Start fasta mode")
     pattern_dict = set_model_names(prefix, name, directory)
-    coding_rna = [SSU_rRNA_archaea, SSU_rRNA_bacteria, SSU_rRNA_eukarya, SSU_rRNA_microsporidia,
-                  LSU_rRNA_archaea, LSU_rRNA_bacteria, LSU_rRNA_eukarya, Seq5S, Seq5_8S]
+    coding_rna = [
+        SSU_rRNA_archaea,
+        SSU_rRNA_bacteria,
+        SSU_rRNA_eukarya,
+        SSU_rRNA_microsporidia,
+        LSU_rRNA_archaea,
+        LSU_rRNA_bacteria,
+        LSU_rRNA_eukarya,
+        Seq5S,
+        Seq5_8S,
+    ]
     open_files = {}
     for record in SeqIO.parse(args.input, "fasta"):
-        model = '-'.join(record.id.split('/')[0].split('-')[1:])
+        model = "-".join(record.id.split("/")[0].split("-")[1:])
         if model in coding_rna:
             filename = pattern_dict[model]
         else:
-            filename = os.path.join("sequence-categorisation/ncRNA", f'{prefix}{name}_{model}.fasta')
+            filename = os.path.join(directory_ncrna, f"{prefix}{name}_{model}.fasta")
         if model not in open_files:
-            file_out = open(filename, 'w')
+            file_out = open(filename, "w")
             open_files[model] = file_out
         SeqIO.write(record, open_files[model], "fasta")
 
-        if model in (SSU_rRNA_archaea, SSU_rRNA_bacteria, SSU_rRNA_eukarya, SSU_rRNA_microsporidia):
+        if model in (
+            SSU_rRNA_archaea,
+            SSU_rRNA_bacteria,
+            SSU_rRNA_eukarya,
+            SSU_rRNA_microsporidia,
+        ):
             if SSU not in open_files:
-                file_out = open(pattern_dict[SSU], 'w')
+                file_out = open(pattern_dict[SSU], "w")
                 open_files[SSU] = file_out
             SeqIO.write(record, open_files[SSU], "fasta")
         if model in (LSU_rRNA_archaea, LSU_rRNA_bacteria, LSU_rRNA_eukarya):
             if LSU not in open_files:
-                file_out = open(pattern_dict[LSU], 'w')
+                file_out = open(pattern_dict[LSU], "w")
                 open_files[LSU] = file_out
             SeqIO.write(record, open_files[LSU], "fasta")
 
     for item in open_files:
         open_files[item].close()
 
-    if len(os.listdir("sequence-categorisation/ncRNA")) == 0:
-        os.rmdir("sequence-categorisation/ncRNA")
+    if len(os.listdir(directory_ncrna)) == 0:
+        os.rmdir(directory_ncrna)
+
 
 if __name__ == "__main__":
-    main()
+    main()
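The model is parsed out of each record id by dropping the coordinate part after "/" and the read-name prefix before the first "-". A quick illustration with a made-up id in the layout this expression expects:

    record_id = "read42-SSU_rRNA_bacteria/120-1560"  # hypothetical id layout
    model = "-".join(record_id.split("/")[0].split("-")[1:])
    print(model)  # SSU_rRNA_bacteria

The "-".join() keeps model names that themselves contain hyphens intact.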
mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py

@@ -16,41 +16,47 @@
 
 import argparse
 import sys
-import os
-import gzip
-from Bio import SeqIO
 
 
 def main():
     parser = argparse.ArgumentParser(description="Extract lsu, ssu and 5s")
-    parser.add_argument("-i", "--input", dest="input", help="Input fasta file", required=True)
+    parser.add_argument(
+        "-i", "--input", dest="input", help="Input fasta file", required=True
+    )
     parser.add_argument("-l", "--lsu", dest="lsu", help="LSU pattern", required=True)
     parser.add_argument("-s", "--ssu", dest="ssu", help="SSU pattern", required=True)
 
-
-
-
-
+    ssu_coords = "SSU_coords"
+    lsu_coords = "LSU_coords"
+    ssu_count = 0
+    lsu_count = 0
 
     if len(sys.argv) == 1:
         parser.print_help()
     else:
         args = parser.parse_args()
 
-        with
+        with (
+            open(ssu_coords, "w") as out_ssu,
+            open(lsu_coords, "w") as out_lsu,
+            open(args.input, "r") as input,
+        ):
             for line in input:
                 if args.lsu in line:
                     out_lsu.write(line)
-
+                    lsu_count += 1
                 elif args.ssu in line:
                     out_ssu.write(line)
-
-        with open("RNA-counts", 'w') as count:
-            count.write(
+                    ssu_count += 1
+        with open("RNA-counts", "w") as count:
+            count.write(
+                "LSU count\t" + str(lsu_count) + "\nSSU count\t" + str(ssu_count)
+            )
 
         out_ssu.close()
         out_lsu.close()
         count.close()
 
+
 if __name__ == "__main__":
     main()