mgnify-pipelines-toolkit 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mgnify-pipelines-toolkit has been flagged as potentially problematic; review the changes below before upgrading.

Files changed (32)
  1. mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +74 -54
  2. mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +69 -42
  3. mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +120 -66
  4. mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +74 -45
  5. mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +277 -148
  6. mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +45 -28
  7. mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +53 -32
  8. mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +54 -16
  9. mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +29 -12
  10. mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +29 -19
  11. mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +23 -13
  12. mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +127 -89
  13. mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +138 -0
  14. mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +55 -26
  15. mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +19 -13
  16. mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +66 -0
  17. mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +2 -2
  18. mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +3 -5
  19. mgnify_pipelines_toolkit/constants/regex_fasta_header.py +20 -0
  20. mgnify_pipelines_toolkit/constants/tax_ranks.py +21 -2
  21. mgnify_pipelines_toolkit/constants/thresholds.py +4 -1
  22. mgnify_pipelines_toolkit/constants/var_region_coordinates.py +4 -4
  23. mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  24. mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +144 -0
  25. mgnify_pipelines_toolkit/utils/get_mpt_version.py +26 -0
  26. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/METADATA +18 -1
  27. mgnify_pipelines_toolkit-0.1.6.dist-info/RECORD +34 -0
  28. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/WHEEL +1 -1
  29. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/entry_points.txt +4 -0
  30. mgnify_pipelines_toolkit-0.1.4.dist-info/RECORD +0 -28
  31. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/LICENSE +0 -0
  32. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py

@@ -17,31 +17,48 @@
 import argparse
 from collections import defaultdict
 import os
-import subprocess
 
 from Bio.Seq import Seq
 import regex
 
-from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import primer_regex_query_builder, get_read_count, fetch_mcp
+from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
+    primer_regex_query_builder,
+    get_read_count,
+    fetch_mcp,
+)
+
 
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("-i", "--input", required=True, type=str, help="Path to merged FASTQ to look for primers")
-    parser.add_argument("-p", "--primers", required=True, type=str, help="Path to directory containing standard primers fasta files")
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to merged FASTQ to look for primers",
+    )
+    parser.add_argument(
+        "-p",
+        "--primers",
+        required=True,
+        type=str,
+        help="Path to directory containing standard primers fasta files",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
     args = parser.parse_args()
-
-    _INPUT = args.input
-    _PRIMERS = args.primers
-    _SAMPLE = args.sample
-    _OUTPUT = args.output
 
-    return _INPUT, _PRIMERS, _SAMPLE, _OUTPUT
+    input = args.input
+    primers = args.primers
+    sample = args.sample
+    output = args.output
+
+    return input, primers, sample, output
+
 
-def parse_std_primers(_PRIMERS):
+def parse_std_primers(primers):
     """
     Parse the library of standard primers.
 
@@ -60,19 +77,19 @@ def parse_std_primers(_PRIMERS):
     std_primer_dict_regex = defaultdict(defaultdict)
     std_primer_dict = defaultdict(defaultdict)
 
-    dir = os.listdir(_PRIMERS)
-    dir = [ f'{_PRIMERS}/{path}' for path in dir ]
-
+    dir = os.listdir(primers)
+    dir = [f"{primers}/{path}" for path in dir]
+
     rev_flag = False
 
     for path in dir:
-        region = path.split('/')[-1].split('.')[0]
-        with open(path, 'r') as fr:
-            key = ''
+        region = path.split("/")[-1].split(".")[0]
+        with open(path, "r") as fr:
+            key = ""
             for line in fr:
                 line = line.strip()
-                if line[0] == '>':
-                    if 'R' in line: # If a primer is a reverse primer
+                if line[0] == ">":
+                    if "R" in line:  # If a primer is a reverse primer
                         rev_flag = True
                     key = line[1:]
                 else:
@@ -87,6 +104,7 @@ def parse_std_primers(_PRIMERS):
 
     return std_primer_dict_regex, std_primer_dict
 
+
 def run_primer_matching_once(input_path, input_primer, rev=False):
     """
     Run primer matching using the regex package.
@@ -103,11 +121,12 @@ def run_primer_matching_once(input_path, input_primer, rev=False):
     for mcp in mcp_count_dict.keys():
         mcp = mcp.strip()
         res = regex.match(input_primer, mcp)
-        if res != None:
+        if res is not None:
             match_count += mcp_count_dict[mcp]
 
     return match_count
 
+
 def get_primer_props(std_primer_dict_regex, input_path):
     """
     Look for the standard primers in the input fastq file.
@@ -122,93 +141,107 @@ def get_primer_props(std_primer_dict_regex, input_path):
         max_primers: dictionary containing the F and/or R primers that were chosen
     """
 
-    threshold = 0.60 # Arbitrary threshold for collecting a matched primer
-    read_count = get_read_count(input_path, 'fastq') # Get read count of fastq file to calculate proportion with
+    threshold = 0.60  # Arbitrary threshold for collecting a matched primer
+    read_count = get_read_count(
+        input_path, "fastq"
+    )  # Get read count of fastq file to calculate proportion with
     res_dict = defaultdict(defaultdict)
 
     # Loop through every primer region
     for region, primer in std_primer_dict_regex.items():
-        res_dict[region]['F'] = {}
-        res_dict[region]['R'] = {}
+        res_dict[region]["F"] = {}
+        res_dict[region]["R"] = {}
 
         # Loop through every primer of a certain region
         for primer_name, primer_seq in primer.items():
-
-            region_name_str = f'{region};{primer_name}'
+
+            region_name_str = f"{region};{primer_name}"
             primer_count = 0.0
 
-            if 'F' in primer_name:
-                primer_count = run_primer_matching_once(input_path, primer_seq, rev=False) # Get proportion of a F primer with fuzzy regex matching
-            elif 'R' in primer_name:
-                primer_count = run_primer_matching_once(input_path, primer_seq, rev=True) # Get proportion of a R primer with fuzzy regex matching
+            if "F" in primer_name:
+                primer_count = run_primer_matching_once(
+                    input_path, primer_seq, rev=False
+                )  # Get proportion of a F primer with fuzzy regex matching
+            elif "R" in primer_name:
+                primer_count = run_primer_matching_once(
+                    input_path, primer_seq, rev=True
+                )  # Get proportion of a R primer with fuzzy regex matching
 
             try:
                 primer_prop = primer_count / read_count
             except ZeroDivisionError:
                 primer_prop = 0
 
-            if 'F' in primer_name:
-                if primer_prop > threshold: # Only collect primer if it's above threshold
-                    res_dict[region]['F'][primer_name] = primer_prop
-            elif 'R' in primer_name:
-                if primer_prop > threshold: # Only collect primer if it's above threshold
-                    res_dict[region]['R'][primer_name] = primer_prop
+            if "F" in primer_name:
+                if (
+                    primer_prop > threshold
+                ):  # Only collect primer if it's above threshold
+                    res_dict[region]["F"][primer_name] = primer_prop
+            elif "R" in primer_name:
+                if (
+                    primer_prop > threshold
+                ):  # Only collect primer if it's above threshold
+                    res_dict[region]["R"][primer_name] = primer_prop
+
+            print(f"{region_name_str}: {primer_prop}")
 
-            print(f'{region_name_str}: {primer_prop}')
-
         # If an F or/and R primer wasn't found then just remove it from the dictionary
-        if res_dict[region]['F'] == {}:
-            res_dict[region].pop('F')
-        if res_dict[region]['R'] == {}:
-            res_dict[region].pop('R')
-
+        if res_dict[region]["F"] == {}:
+            res_dict[region].pop("F")
+        if res_dict[region]["R"] == {}:
+            res_dict[region].pop("R")
 
     singles = defaultdict(str)
     doubles = defaultdict(list)
 
-    double_status = False # Flag for whether primers were found on both strands
+    double_status = False  # Flag for whether primers were found on both strands
 
     # Loop through every collected primer and put primers in singles or doubles
     for region in res_dict.keys():
         strands = res_dict[region]
-
+
         for strand in strands.keys():
            primers = strands[strand]
            max_prop = 0
-            max_name = ''
+            max_name = ""
            for primer_name, prop in primers.items():
                if prop > max_prop:
                    max_prop = prop
                    max_name = primer_name
-
+
            if len(strands.keys()) == 2:
                double_status = True
                doubles[region].append({max_name: max_prop})
            elif len(strands.keys()) == 1:
                singles[region] = {max_name: max_prop}
 
-    max_region = ''
+    max_region = ""
     max_primers = {}
     max_mean_prop = 0
-
+
     # if at least one pair of primers was collected
     if double_status:
-        for region in doubles: # Loop through all pairs of primers and choose the best one
+        for (
+            region
+        ) in doubles:  # Loop through all pairs of primers and choose the best one
            primers = doubles[region]
 
            f_primer_name = list(primers[0].keys())[0]
            r_primer_name = list(primers[1].keys())[0]
            f_primer_prop = primers[0][f_primer_name]
            r_primer_prop = primers[1][r_primer_name]
-
+
            mean_prop = (f_primer_prop + r_primer_prop) / 2.0
            if mean_prop > max_mean_prop:
                max_mean_prop = mean_prop
                max_region = region
-                max_primers = [{f_primer_name: f_primer_prop}, {r_primer_name: r_primer_prop}]
+                max_primers = [
+                    {f_primer_name: f_primer_prop},
+                    {r_primer_name: r_primer_prop},
+                ]
 
     else:
-        for region in singles: # Choose the best single primer
+        for region in singles:  # Choose the best single primer
            primer = singles[region]
            primer_name = list(primer.keys())[0]
            prop = primer[primer_name]
@@ -217,23 +250,22 @@ def get_primer_props(std_primer_dict_regex, input_path):
                 max_region = region
                 max_primers = {primer_name: prop}
 
-    if max_region == '':
-        print('No standard library primers!')
-        return([])
+    if max_region == "":
+        print("No standard library primers!")
+        return []
     elif double_status:
-        print('Standard library primers found!')
-        print(f'Region: {max_region}')
-        print(f'Forward Primer: {max_primers[0]}')
-        print(f'Reverse Primer: {max_primers[1]}')
+        print("Standard library primers found!")
+        print(f"Region: {max_region}")
+        print(f"Forward Primer: {max_primers[0]}")
+        print(f"Reverse Primer: {max_primers[1]}")
 
-        return([max_region, max_primers[0], max_primers[1]])
+        return [max_region, max_primers[0], max_primers[1]]
     else:
-        print('Standard library primer found on one strand!')
-        print(f'Region: {max_region}')
-        print(f'Primer: {max_primers}')
-
-        return([max_region, max_primers])
+        print("Standard library primer found on one strand!")
+        print(f"Region: {max_region}")
+        print(f"Primer: {max_primers}")
 
+        return [max_region, max_primers]
 
 
 def save_out(results, sample_id, output, std_primer_dict):
@@ -241,24 +273,26 @@ def save_out(results, sample_id, output, std_primer_dict):
     Save found std primers into a fasta file.
     """
 
-    with open(f'{output}/{sample_id}_std_primer_out.txt', 'w') as fw_out, open(f'{output}/{sample_id}_std_primers.fasta', 'w') as fw_seq:
+    with (
+        open(f"{output}/{sample_id}_std_primer_out.txt", "w") as fw_out,
+        open(f"{output}/{sample_id}_std_primers.fasta", "w") as fw_seq,
+    ):
         if results == []:
-            fw_out.write(f'')
-            fw_seq.write(f'')
-
+            fw_out.write("")
+            fw_seq.write("")
+
 
         elif len(results) == 2:
             region = results[0]
             primer_name = list(results[1].keys())[0]
             primer_prop = results[1][list(results[1].keys())[0]]
             seq = std_primer_dict[region][primer_name]
-            if 'R' in primer_name:
+            if "R" in primer_name:
                 seq = str(Seq(seq).complement())
-            fw_out.write(f'{region}\n')
-            fw_out.write(f'{primer_name}: {primer_prop}')
+            fw_out.write(f"{region}\n")
+            fw_out.write(f"{primer_name}: {primer_prop}")
+
+            fw_seq.write(f">{primer_name}\n{seq}")
 
-            fw_seq.write(f'>{primer_name}\n{seq}')
-
-
         elif len(results) == 3:
             region = results[0]
@@ -268,22 +302,26 @@ def save_out(results, sample_id, output, std_primer_dict):
             r_primer_prop = results[2][list(results[2].keys())[0]]
             r_seq = std_primer_dict[region][r_primer_name]
             r_seq = str(Seq(r_seq).complement())
-
 
-            fw_out.write(f'{region}\n')
-            fw_out.write(f'{f_primer_name}: {f_primer_prop}\n')
-            fw_out.write(f'{r_primer_name}: {r_primer_prop}')
+            fw_out.write(f"{region}\n")
+            fw_out.write(f"{f_primer_name}: {f_primer_prop}\n")
+            fw_out.write(f"{r_primer_name}: {r_primer_prop}")
+
+            fw_seq.write(f">{f_primer_name}\n{f_seq}\n")
+            fw_seq.write(f">{r_primer_name}\n{r_seq}\n")
 
-            fw_seq.write(f'>{f_primer_name}\n{f_seq}\n')
-            fw_seq.write(f'>{r_primer_name}\n{r_seq}\n')
 
-
 def main():
-
-    _INPUT, _PRIMERS, _SAMPLE, _OUTPUT = parse_args()
-    std_primer_dict_regex, std_primer_dict = parse_std_primers(_PRIMERS) # Parse std primer library into dictionaries
-    results = get_primer_props(std_primer_dict_regex, _INPUT) # Find all the std primers in the input and select most common
-    save_out(results, _SAMPLE, _OUTPUT, std_primer_dict)
+
+    input, primers, sample, output = parse_args()
+    std_primer_dict_regex, std_primer_dict = parse_std_primers(
+        primers
+    )  # Parse std primer library into dictionaries
+    results = get_primer_props(
+        std_primer_dict_regex, input
+    )  # Find all the std primers in the input and select most common
+    save_out(results, sample, output, std_primer_dict)
+
 
 if __name__ == "__main__":
-    main()
+    main()
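
Most of the churn in standard_primer_matching.py is mechanical requoting and line-wrapping; the functional changes are the removal of the unused subprocess import, the switch from _UPPERCASE locals to plain lowercase names, and the `is not None` comparison fix. The matching itself relies on the fuzzy quantifiers of the third-party regex package. A minimal sketch of that mechanism, with a hypothetical pattern standing in for the output of primer_regex_query_builder (the real builder lives in amplicon_utils.py and is not part of this diff):

    import regex

    # Hypothetical pattern: the IUPAC primer CCTACGGGNGGCWGCAG expanded into
    # character classes, allowing at most one substitution ({s<=1}).
    pattern = "(?:CCTACGGG[ACGT]GGC[AT]GCAG){s<=1}"

    read_prefix = "CCTACGGGAGGCTGCAGTGGGGAATATTG"
    if regex.match(pattern, read_prefix) is not None:
        print("primer matched within one substitution")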
mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py (new file)

@@ -0,0 +1,138 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from collections import defaultdict
+import gzip
+import json
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+def parse_args():
+
+    parser = argparse.ArgumentParser(
+        description="Script that sanity checks whether the strand suffix of a FASTQ file matches the headers inside the FASTQ file."
+    )
+    parser.add_argument(
+        "-f",
+        "--fwd",
+        required=True,
+        type=str,
+        help="Input forward read headers file (PE) OR SE read file",
+    )
+    parser.add_argument(
+        "-r",
+        "--rev",
+        required=False,
+        type=str,
+        help="Input reverse read headers file (PE)",
+    )
+    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
+    parser.add_argument("-o", "--output", required=True, type=str, help="Output")
+
+    args = parser.parse_args()
+
+    fwd = args.fwd
+    rev = args.rev
+    sample = args.sample
+    output = args.output
+
+    return fwd, rev, sample, output
+
+
+def choose_open_func(file_path):
+
+    open_func = open
+
+    if file_path[-2:] == "gz":
+        open_func = gzip.open
+
+    return open_func
+
+
+def main():
+
+    fwd, rev, sample, output = parse_args()
+
+    files_to_parse = []
+
+    if "_1" in fwd:
+        if not rev:
+            logging.error(
+                'No reverse file given, yet given forward file has the "_1" suffix implying it\'s paired-end. '
+                + "Either supply the reverse file, or supply a single-end file."
+            )
+        elif "_2" not in rev:
+            logging.error(
+                'The expected suffix "_2" for a supplied reverse file is missing. Please verify your inputs.'
+            )
+        else:
+            files_to_parse = [fwd, rev]
+
+    else:
+        files_to_parse = [fwd]
+
+    open_func = choose_open_func(
+        fwd
+    )  # Choose between gzip.open() and open() by checking the file extension
+    reads_with_err = defaultdict(list)
+
+    for file in files_to_parse:
+
+        header_str = ""
+
+        if "_1" in file:
+            header_str = "/1"
+        elif "_2" in file:
+            header_str = "/2"
+        else:
+            header_str = "/1"  # SE files still have "/1" in the headers
+
+        for counter, line in enumerate(open_func(file)):
+
+            if counter % 4 == 0:  # Only do stuff every four lines to hit the header
+                line = line.decode("ascii").strip()
+                curr_read_strand = line[-2:]
+
+                if curr_read_strand != header_str:
+                    reads_with_err[file].append(line)
+                    reads_with_err["total"].append(1)
+
+    if len(reads_with_err) != 0:
+
+        num_of_reads_with_err = len(reads_with_err["total"])
+        reads_with_err["total"] = num_of_reads_with_err
+
+        logging.error(
+            f"Found {num_of_reads_with_err} reads with header strands that don't match file suffix. See log file at {output}/{sample}_suffix_header_err.json"  # noqa: E501
+        )
+
+        with open(
+            f"{output}/{sample}_suffix_header_err.json", "w"
+        ) as fw:  # Writes JSON file containing the headers of reads with errors
+            json.dump(reads_with_err, fw)
+
+    else:
+        with open(
+            f"{output}/{sample}_suffix_header_err.json", "w"
+        ) as fw:  # Creates an empty file if there are no errors
+            print("No errors.")
+
+
+if __name__ == "__main__":
+    main()
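
fastq_suffix_header_check.py is entirely new. The convention it enforces is simple: in paired-end FASTQs named *_1.fastq(.gz) and *_2.fastq(.gz), every fourth line is a record header whose last two characters should read "/1" or "/2" respectively, and single-end files also carry "/1". A stripped-down sketch of the core loop (the file name is illustrative):

    import gzip

    expected = "/1"  # implied by the "_1" suffix of the illustrative file below
    with gzip.open("ERR0000_1.fastq.gz") as handle:
        for counter, line in enumerate(handle):
            if counter % 4 == 0:  # a FASTQ record is 4 lines; line 0 is the header
                header = line.decode("ascii").strip()
                if header[-2:] != expected:
                    print(f"mismatched header: {header}")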
mgnify_pipelines_toolkit/analysis/shared/get_subunits.py

@@ -36,17 +36,31 @@ LSU_rRNA_eukarya = "LSU_rRNA_eukarya"
 
 def set_model_names(prefix, name, directory):
     pattern_dict = {}
-    pattern_dict[SSU] = os.path.join(directory, f'{name}_SSU.fasta')
-    pattern_dict[SSU_rRNA_archaea] = os.path.join(directory, f'{prefix}{name}_{SSU_rRNA_archaea}.RF01959.fa')
-    pattern_dict[SSU_rRNA_bacteria] = os.path.join(directory, f'{prefix}{name}_{SSU_rRNA_bacteria}.RF00177.fa')
-    pattern_dict[SSU_rRNA_eukarya] = os.path.join(directory, f'{prefix}{name}_{SSU_rRNA_eukarya}.RF01960.fa')
-    pattern_dict[SSU_rRNA_microsporidia] = os.path.join(directory, f'{prefix}{name}_{SSU_rRNA_microsporidia}.RF02542.fa')
-    pattern_dict[LSU] = os.path.join(directory, f'{name}_LSU.fasta')
-    pattern_dict[LSU_rRNA_archaea] = os.path.join(directory, f'{prefix}{name}_{LSU_rRNA_archaea}.RF02540.fa')
-    pattern_dict[LSU_rRNA_bacteria] = os.path.join(directory, f'{prefix}{name}_{LSU_rRNA_bacteria}.RF02541.fa')
-    pattern_dict[LSU_rRNA_eukarya] = os.path.join(directory, f'{prefix}{name}_{LSU_rRNA_eukarya}.RF02543.fa')
-    pattern_dict[Seq5S] = os.path.join(directory, f'{name}_5S.fa')
-    pattern_dict[Seq5_8S] = os.path.join(directory, f'{name}_5_8S.fa')
+    pattern_dict[SSU] = os.path.join(directory, f"{name}_SSU.fasta")
+    pattern_dict[SSU_rRNA_archaea] = os.path.join(
+        directory, f"{prefix}{name}_{SSU_rRNA_archaea}.RF01959.fa"
+    )
+    pattern_dict[SSU_rRNA_bacteria] = os.path.join(
+        directory, f"{prefix}{name}_{SSU_rRNA_bacteria}.RF00177.fa"
+    )
+    pattern_dict[SSU_rRNA_eukarya] = os.path.join(
+        directory, f"{prefix}{name}_{SSU_rRNA_eukarya}.RF01960.fa"
+    )
+    pattern_dict[SSU_rRNA_microsporidia] = os.path.join(
+        directory, f"{prefix}{name}_{SSU_rRNA_microsporidia}.RF02542.fa"
+    )
+    pattern_dict[LSU] = os.path.join(directory, f"{name}_LSU.fasta")
+    pattern_dict[LSU_rRNA_archaea] = os.path.join(
+        directory, f"{prefix}{name}_{LSU_rRNA_archaea}.RF02540.fa"
+    )
+    pattern_dict[LSU_rRNA_bacteria] = os.path.join(
+        directory, f"{prefix}{name}_{LSU_rRNA_bacteria}.RF02541.fa"
+    )
+    pattern_dict[LSU_rRNA_eukarya] = os.path.join(
+        directory, f"{prefix}{name}_{LSU_rRNA_eukarya}.RF02543.fa"
+    )
+    pattern_dict[Seq5S] = os.path.join(directory, f"{name}_5S.fa")
+    pattern_dict[Seq5_8S] = os.path.join(directory, f"{name}_5_8S.fa")
     return pattern_dict
 
 
@@ -69,42 +83,57 @@ def main():
     directory = "sequence-categorisation"
     if not os.path.exists(directory):
         os.makedirs(directory)
-    directory_ncRNA = os.path.join("sequence-categorisation", "ncRNA")
-    if not os.path.exists(directory_ncRNA):
-        os.makedirs(directory_ncRNA)
+    directory_ncrna = os.path.join("sequence-categorisation", "ncRNA")
+    if not os.path.exists(directory_ncrna):
+        os.makedirs(directory_ncrna)
 
-    print('Start fasta mode')
+    print("Start fasta mode")
     pattern_dict = set_model_names(prefix, name, directory)
-    coding_rna = [SSU_rRNA_archaea, SSU_rRNA_bacteria, SSU_rRNA_eukarya, SSU_rRNA_microsporidia,
-                  LSU_rRNA_archaea, LSU_rRNA_bacteria, LSU_rRNA_eukarya, Seq5S, Seq5_8S]
+    coding_rna = [
+        SSU_rRNA_archaea,
+        SSU_rRNA_bacteria,
+        SSU_rRNA_eukarya,
+        SSU_rRNA_microsporidia,
+        LSU_rRNA_archaea,
+        LSU_rRNA_bacteria,
+        LSU_rRNA_eukarya,
+        Seq5S,
+        Seq5_8S,
+    ]
     open_files = {}
     for record in SeqIO.parse(args.input, "fasta"):
-        model = '-'.join(record.id.split('/')[0].split('-')[1:])
+        model = "-".join(record.id.split("/")[0].split("-")[1:])
         if model in coding_rna:
             filename = pattern_dict[model]
         else:
-            filename = os.path.join(directory_ncRNA, f'{prefix}{name}_{model}.fasta')
+            filename = os.path.join(directory_ncrna, f"{prefix}{name}_{model}.fasta")
         if model not in open_files:
-            file_out = open(filename, 'w')
+            file_out = open(filename, "w")
             open_files[model] = file_out
         SeqIO.write(record, open_files[model], "fasta")
 
-        if model in (SSU_rRNA_archaea, SSU_rRNA_bacteria, SSU_rRNA_eukarya, SSU_rRNA_microsporidia):
+        if model in (
+            SSU_rRNA_archaea,
+            SSU_rRNA_bacteria,
+            SSU_rRNA_eukarya,
+            SSU_rRNA_microsporidia,
+        ):
             if SSU not in open_files:
-                file_out = open(pattern_dict[SSU], 'w')
+                file_out = open(pattern_dict[SSU], "w")
                 open_files[SSU] = file_out
             SeqIO.write(record, open_files[SSU], "fasta")
         if model in (LSU_rRNA_archaea, LSU_rRNA_bacteria, LSU_rRNA_eukarya):
             if LSU not in open_files:
-                file_out = open(pattern_dict[LSU], 'w')
+                file_out = open(pattern_dict[LSU], "w")
                 open_files[LSU] = file_out
             SeqIO.write(record, open_files[LSU], "fasta")
 
     for item in open_files:
         open_files[item].close()
 
-    if len(os.listdir(directory_ncRNA)) == 0:
-        os.rmdir(directory_ncRNA)
+    if len(os.listdir(directory_ncrna)) == 0:
+        os.rmdir(directory_ncrna)
+
 
 if __name__ == "__main__":
-    main()
+    main()
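
The renames in get_subunits.py (directory_ncRNA to directory_ncrna) are cosmetic and the splitting logic is untouched. The code appears to assume record IDs of roughly the form "<read>-<model>/<start>-<end>", where the "-".join(...) preserves any hyphens that fall inside the model portion, provided the leading read identifier contains none itself. A worked example of that parse (the ID layout is inferred from the code, not documented in this diff):

    record_id = "read_1-SSU_rRNA_bacteria/3-1510"  # illustrative ID
    model = "-".join(record_id.split("/")[0].split("-")[1:])
    print(model)  # -> SSU_rRNA_bacteria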
mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py

@@ -16,41 +16,47 @@
 
 import argparse
 import sys
-import os
-import gzip
-from Bio import SeqIO
 
 
 def main():
     parser = argparse.ArgumentParser(description="Extract lsu, ssu and 5s")
-    parser.add_argument("-i", "--input", dest="input", help="Input fasta file", required=True)
+    parser.add_argument(
+        "-i", "--input", dest="input", help="Input fasta file", required=True
+    )
     parser.add_argument("-l", "--lsu", dest="lsu", help="LSU pattern", required=True)
     parser.add_argument("-s", "--ssu", dest="ssu", help="SSU pattern", required=True)
 
-    SSU_coords = "SSU_coords"
-    LSU_coords = "LSU_coords"
-    SSU_count = 0
-    LSU_count = 0
+    ssu_coords = "SSU_coords"
+    lsu_coords = "LSU_coords"
+    ssu_count = 0
+    lsu_count = 0
 
     if len(sys.argv) == 1:
         parser.print_help()
     else:
         args = parser.parse_args()
 
-        with open(SSU_coords, 'w') as out_ssu, open(LSU_coords, 'w') as out_lsu, open(args.input, 'r') as input:
+        with (
+            open(ssu_coords, "w") as out_ssu,
+            open(lsu_coords, "w") as out_lsu,
+            open(args.input, "r") as input,
+        ):
             for line in input:
                 if args.lsu in line:
                     out_lsu.write(line)
-                    LSU_count += 1
+                    lsu_count += 1
                 elif args.ssu in line:
                     out_ssu.write(line)
-                    SSU_count += 1
-        with open("RNA-counts", 'w') as count:
-            count.write("LSU count\t" + str(LSU_count) + "\nSSU count\t" + str(SSU_count))
+                    ssu_count += 1
+        with open("RNA-counts", "w") as count:
+            count.write(
+                "LSU count\t" + str(lsu_count) + "\nSSU count\t" + str(ssu_count)
+            )
 
         out_ssu.close()
         out_lsu.close()
         count.close()
 
+
 if __name__ == "__main__":
     main()
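
One portability note on the with-statement rewrites here and in save_out above: the parenthesized multi-item with block is official syntax only from Python 3.10 onward, so these hunks quietly assume a recent interpreter. On older Pythons the same grouping has to stay on one logical line, e.g.:

    # Pre-3.10 equivalent of the parenthesized form used above:
    with open("SSU_coords", "w") as out_ssu, open("LSU_coords", "w") as out_lsu:
        out_ssu.write("")  # ... write coordinate lines as in main() above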