mgnify-pipelines-toolkit 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

This version of mgnify-pipelines-toolkit has been flagged as potentially problematic.

Files changed (32)
  1. mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +74 -54
  2. mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +69 -42
  3. mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +120 -66
  4. mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +74 -45
  5. mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +277 -148
  6. mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +45 -28
  7. mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +53 -32
  8. mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +54 -16
  9. mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +29 -12
  10. mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +29 -19
  11. mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +23 -13
  12. mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +127 -89
  13. mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +138 -0
  14. mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +55 -26
  15. mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +19 -13
  16. mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +66 -0
  17. mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +2 -2
  18. mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +3 -5
  19. mgnify_pipelines_toolkit/constants/regex_fasta_header.py +20 -0
  20. mgnify_pipelines_toolkit/constants/tax_ranks.py +21 -2
  21. mgnify_pipelines_toolkit/constants/thresholds.py +4 -1
  22. mgnify_pipelines_toolkit/constants/var_region_coordinates.py +4 -4
  23. mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  24. mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +144 -0
  25. mgnify_pipelines_toolkit/utils/get_mpt_version.py +26 -0
  26. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/METADATA +18 -1
  27. mgnify_pipelines_toolkit-0.1.6.dist-info/RECORD +34 -0
  28. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/WHEEL +1 -1
  29. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/entry_points.txt +4 -0
  30. mgnify_pipelines_toolkit-0.1.4.dist-info/RECORD +0 -28
  31. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/LICENSE +0 -0
  32. {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py
@@ -20,21 +20,29 @@ import argparse
 import pandas as pd
 import numpy as np
 
+
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("-i", "--input", required=True, type=str, help="Path to mcp tsv file to find inflection points")
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to mcp tsv file to find inflection points",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
 
     args = parser.parse_args()
-
-    _PATH = args.input
-    _SAMPLE = args.sample
-    _OUTPUT = args.output
 
-    return _PATH, _SAMPLE, _OUTPUT
+    path = args.input
+    sample = args.sample
+    output = args.output
+
+    return path, sample, output
+
 
 def find_mcp_inf_points(mcp_df):
     """
@@ -50,45 +58,54 @@ def find_mcp_inf_points(mcp_df):
     """
 
     inf_point_dict = defaultdict(list)
-    start_indices = [ int(i) for i in mcp_df.columns.tolist() ]
+    start_indices = [int(i) for i in mcp_df.columns.tolist()]
 
-    for i in range(len(mcp_df)): # Loop through both possible strands of the mcp_df
+    for i in range(len(mcp_df)):  # Loop through both possible strands of the mcp_df
         strand = mcp_df.index[i]
         props = mcp_df.iloc[i].tolist()
-        props = [ -val for val in props ]
+        props = [-val for val in props]
 
-        prop_diff = np.diff(props)/np.diff(start_indices) # Get the derivative
-        infl_points = np.where(prop_diff > np.percentile(prop_diff, 80))[0] # Grab points above 80th percentile
+        prop_diff = np.diff(props) / np.diff(start_indices)  # Get the derivative
+        infl_points = np.where(prop_diff > np.percentile(prop_diff, 80))[
+            0
+        ]  # Grab points above 80th percentile
 
         for ind in infl_points:
            inf_point = start_indices[ind]
 
-            if inf_point < 10 or inf_point > 20: # Rule to facilitate results - won't accept
-                continue # points below index 10 or above index 20
-                # 10 means a cutoff of 15 and 20 a cutoff of 25
-                # literature points to no primers existing that are
-                # shorter or bigger than these lengths
+            if (
+                inf_point < 10 or inf_point > 20
+            ):  # Rule to facilitate results - won't accept
+                continue  # points below index 10 or above index 20
+                # 10 means a cutoff of 15 and 20 a cutoff of 25
+                # literature points to no primers existing that are
+                # shorter or bigger than these lengths
+
+            inf_point_dict["strand"].append(strand)
+            inf_point_dict["inf_point"].append(inf_point)
 
-            inf_point_dict['strand'].append(strand)
-            inf_point_dict['inf_point'].append(inf_point)
-
     return inf_point_dict
 
+
 def main():
 
-    _PATH, _SAMPLE, _OUTPUT = parse_args()
+    path, sample, output = parse_args()
 
-    mcp_df = pd.read_csv(_PATH, sep='\t', index_col=0) # Read mcp_df
-    inf_point_dict = find_mcp_inf_points(mcp_df) # Generate inflection points dict
+    mcp_df = pd.read_csv(path, sep="\t", index_col=0)  # Read mcp_df
+    inf_point_dict = find_mcp_inf_points(mcp_df)  # Generate inflection points dict
 
-    if len(inf_point_dict) > 0: # If the inf_point_dict isn't empty..
-        inf_point_df = pd.DataFrame.from_dict(inf_point_dict) # .. turn it into a dataframe
-        inf_point_df.to_csv(f'{_OUTPUT}/{_SAMPLE}_inf_points.tsv', sep='\t', index=False) # ..save it to a .tsv file
+    if len(inf_point_dict) > 0:  # If the inf_point_dict isn't empty..
+        inf_point_df = pd.DataFrame.from_dict(
+            inf_point_dict
+        )  # .. turn it into a dataframe
+        inf_point_df.to_csv(
+            f"{output}/{sample}_inf_points.tsv", sep="\t", index=False
+        )  # ..save it to a .tsv file
 
-    else: # If it is empty..
-        fw = open(f'{_OUTPUT}/{_SAMPLE}_inf_points.tsv', 'w') # ..make an empty file
+    else:  # If it is empty..
+        fw = open(f"{output}/{sample}_inf_points.tsv", "w")  # ..make an empty file
         fw.close()
 
 
 if __name__ == "__main__":
-    main()
+    main()
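
For reference, the unchanged logic of find_mcp_inf_points: negate the MCP proportion curve, take its discrete derivative with np.diff, keep the slope points above their own 80th percentile, then apply the index 10 to 20 window. A minimal sketch of that selection with made-up numbers (nothing below is real pipeline output):

    import numpy as np

    # Hypothetical MCP proportions at start indices 0, 5, ..., 30
    start_indices = np.array([0, 5, 10, 15, 20, 25, 30])
    props = np.array([0.90, 0.85, 0.80, 0.40, 0.35, 0.33, 0.32])

    slopes = np.diff(-props) / np.diff(start_indices)  # derivative of the negated curve
    candidates = np.where(slopes > np.percentile(slopes, 80))[0]  # steepest slopes only
    print(start_indices[candidates])  # [10] -> inside the accepted 10..20 window
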
mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
@@ -20,32 +20,48 @@ import logging
 
 import pandas as pd
 
-from mgnify_pipelines_toolkit.constants.tax_ranks import _SILVA_TAX_RANKS, _PR2_TAX_RANKS
+from mgnify_pipelines_toolkit.constants.tax_ranks import (
+    _SILVA_TAX_RANKS,
+    _PR2_TAX_RANKS,
+)
 
 logging.basicConfig(level=logging.DEBUG)
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to taxa file")
-    parser.add_argument("-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file")
-    parser.add_argument("-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file")
     parser.add_argument(
-        "-a", "--amp", required=True, type=str, help="Path to extracted amp_region reads from inference subworkflow"
+        "-t", "--taxa", required=True, type=str, help="Path to taxa file"
+    )
+    parser.add_argument(
+        "-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file"
+    )
+    parser.add_argument(
+        "-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file"
+    )
+    parser.add_argument(
+        "-a",
+        "--amp",
+        required=True,
+        type=str,
+        help="Path to extracted amp_region reads from inference subworkflow",
+    )
+    parser.add_argument(
+        "-hd", "--headers", required=True, type=str, help="Path to fastq headers"
     )
-    parser.add_argument("-hd", "--headers", required=True, type=str, help="Path to fastq headers")
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
 
     args = parser.parse_args()
-
-    _TAXA = args.taxa
-    _FWD = args.fwd
-    _REV = args.rev
-    _AMP = args.amp
-    _HEADERS = args.headers
-    _SAMPLE = args.sample
 
-    return _TAXA, _FWD, _REV, _AMP, _HEADERS, _SAMPLE
+    taxa = args.taxa
+    fwd = args.fwd
+    rev = args.rev
+    amp = args.amp
+    headers = args.headers
+    sample = args.sample
+
+    return taxa, fwd, rev, amp, headers, sample
 
 
 def order_df(taxa_df):
@@ -59,6 +75,7 @@ def order_df(taxa_df):
 
     return taxa_df
 
+
 def make_tax_assignment_dict_silva(taxa_df, asv_dict):
     tax_assignment_dict = defaultdict(int)
 
@@ -93,7 +110,7 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
                 k = "_".join(k.split(" "))
                 tax_assignment += f"\t{k}"
             elif sk != "0":
-                tax_assignment += f"\tk__"
+                tax_assignment += "\tk__"
             else:
                 break
 
@@ -136,9 +153,10 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
             continue
 
         tax_assignment_dict[tax_assignment] += asv_count
-
+
     return tax_assignment_dict
 
+
 def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
     tax_assignment_dict = defaultdict(int)
 
@@ -223,6 +241,7 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
 
     return tax_assignment_dict
 
+
 def generate_asv_count_dict(asv_dict):
 
     res_dict = defaultdict(list)
@@ -232,36 +251,35 @@ def generate_asv_count_dict(asv_dict):
         if count == 0:
             continue
 
-        res_dict['asv'].append(asv_id)
-        res_dict['count'].append(count)
+        res_dict["asv"].append(asv_id)
+        res_dict["count"].append(count)
 
     res_df = pd.DataFrame.from_dict(res_dict)
-    res_df = res_df.sort_values(by='asv', ascending=True)
-    res_df = res_df.sort_values(by='count', ascending=False)
+    res_df = res_df.sort_values(by="asv", ascending=True)
+    res_df = res_df.sort_values(by="count", ascending=False)
 
     return res_df
 
 
 def main():
-    _TAXA, _FWD, _REV, _AMP, _HEADERS, _SAMPLE = parse_args()
+    taxa, fwd, rev, amp, headers, sample = parse_args()
 
-    fwd_fr = open(_FWD, "r")
+    fwd_fr = open(fwd, "r")
     paired_end = True
 
-    if _REV == None:
+    if rev is None:
         paired_end = False
         rev_fr = [True]
     else:
-        rev_fr = open(_REV, "r")
+        rev_fr = open(rev, "r")
 
-    taxa_df = pd.read_csv(_TAXA, sep="\t", dtype=str)
+    taxa_df = pd.read_csv(taxa, sep="\t", dtype=str)
     taxa_df = taxa_df.fillna("0")
     taxa_df = order_df(taxa_df)
 
-    amp_reads = [read.strip() for read in list(open(_AMP, "r"))]
-    headers = [read.split(" ")[0][1:] for read in
-               list(open(_HEADERS, "r"))]
-    amp_region = ".".join(_AMP.split(".")[1:3])
+    amp_reads = [read.strip() for read in list(open(amp, "r"))]
+    headers = [read.split(" ")[0][1:] for read in list(open(headers, "r"))]
+    amp_region = ".".join(amp.split(".")[1:3])
 
     asv_dict = defaultdict(int)
 
@@ -270,7 +288,7 @@ def main():
         counter += 1
         line_fwd = line_fwd.strip()
 
-        if line_fwd == '0':
+        if line_fwd == "0":
            continue
 
        if headers[counter] in amp_reads:
@@ -289,12 +307,15 @@ def main():
         tax_assignment_dict = make_tax_assignment_dict_pr2(taxa_df, asv_dict)
         ref_db = "pr2"
 
-    with open(f"./{_SAMPLE}_{amp_region}_{ref_db}_asv_krona_counts.txt", "w") as fw:
+    with open(f"./{sample}_{amp_region}_{ref_db}_asv_krona_counts.txt", "w") as fw:
         for tax_assignment, count in tax_assignment_dict.items():
             fw.write(f"{count}\t{tax_assignment}\n")
 
     asv_count_df = generate_asv_count_dict(asv_dict)
-    asv_count_df.to_csv(f'./{_SAMPLE}_{amp_region}_asv_read_counts.tsv', sep='\t', index=False)
+    asv_count_df.to_csv(
+        f"./{sample}_{amp_region}_asv_read_counts.tsv", sep="\t", index=False
+    )
+
 
 if __name__ == "__main__":
     main()
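
One nuance in generate_asv_count_dict survives the reformat: sorting by "asv" and then re-sorting by "count" only preserves the asv order within equal counts if the second sort is stable, and pandas sort_values defaults to an unstable quicksort. If a count-then-asv ordering is the intent, a single multi-key call states it explicitly; a sketch with toy data (not pipeline output):

    import pandas as pd

    df = pd.DataFrame({"asv": ["seq_3", "seq_1", "seq_2"], "count": [7, 12, 7]})

    # Count descending, ties broken by asv ascending, in one deterministic step
    df = df.sort_values(by=["count", "asv"], ascending=[False, True])
    print(df["asv"].tolist())  # ['seq_1', 'seq_2', 'seq_3']
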
mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
@@ -22,9 +22,12 @@ import pandas as pd
 
 logging.basicConfig(level=logging.DEBUG)
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("-i", "--input", required=True, type=str, help="Input from MAPseq output")
+    parser.add_argument(
+        "-i", "--input", required=True, type=str, help="Input from MAPseq output"
+    )
     parser.add_argument(
         "-l",
         "--label",
@@ -37,18 +40,48 @@ def parse_args():
 
     args = parser.parse_args()
 
-    _INPUT = args.input
-    _LABEL = args.label
-    _SAMPLE = args.sample
+    input = args.input
+    label = args.label
+    sample = args.sample
+
+    return input, label, sample
 
-    return _INPUT, _LABEL, _SAMPLE
 
 def parse_label(label):
     silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
-    pr2_short_ranks = ["d__", "sg__", "dv__", "sdv__", "c__", "o__", "f__", "g__", "s__"]
-
-    silva_long_ranks = ["Superkingdom", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
-    pr2_long_ranks = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
+    pr2_short_ranks = [
+        "d__",
+        "sg__",
+        "dv__",
+        "sdv__",
+        "c__",
+        "o__",
+        "f__",
+        "g__",
+        "s__",
+    ]
+
+    silva_long_ranks = [
+        "Superkingdom",
+        "Kingdom",
+        "Phylum",
+        "Class",
+        "Order",
+        "Family",
+        "Genus",
+        "Species",
+    ]
+    pr2_long_ranks = [
+        "Domain",
+        "Supergroup",
+        "Division",
+        "Subdivision",
+        "Class",
+        "Order",
+        "Family",
+        "Genus",
+        "Species",
+    ]
 
     chosen_short_ranks = ""
     chosen_long_ranks = ""
@@ -65,6 +98,7 @@ def parse_label(label):
 
     return chosen_short_ranks, chosen_long_ranks
 
+
 def parse_mapseq(mseq_df, short_ranks, long_ranks):
     res_dict = defaultdict(list)
 
@@ -91,7 +125,8 @@ def parse_mapseq(mseq_df, short_ranks, long_ranks):
         res_dict[curr_rank].append(curr_tax)
     res_df = pd.DataFrame.from_dict(res_dict)
 
-    return(res_df)
+    return res_df
+
 
 def process_blank_tax_ends(res_df, ranks):
     # Necessary function as we want to replace consecutive blank assignments that start at the last rank as NAs
@@ -105,7 +140,9 @@ def process_blank_tax_ends(res_df, ranks):
         ):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
             curr_rank = res_df.iloc[i, j + 1]
             if curr_rank in ranks:
-                if last_empty_rank == "": # Last rank is empty, start window of consecutive blanks
+                if (
+                    last_empty_rank == ""
+                ):  # Last rank is empty, start window of consecutive blanks
                     last_empty_rank = j + 1
                     currently_empty = True
                elif (
@@ -124,16 +161,17 @@ def process_blank_tax_ends(res_df, ranks):
     return res_df
 
 
-def main():
-    _INPUT, _LABEL, _SAMPLE = parse_args()
+def main():
+    input, label, sample = parse_args()
 
-    mseq_df = pd.read_csv(_INPUT, header=1, delim_whitespace=True, usecols=[0, 12])
+    mseq_df = pd.read_csv(input, header=0, delim_whitespace=True, usecols=[0, 12])
 
-    short_ranks, long_ranks = parse_label(_LABEL)
+    short_ranks, long_ranks = parse_label(label)
     res_df = parse_mapseq(mseq_df, short_ranks, long_ranks)
     final_res_df = process_blank_tax_ends(res_df, short_ranks)
 
-    final_res_df.to_csv(f"./{_SAMPLE}_{_LABEL}_asv_taxa.tsv", sep="\t", index=False)
+    final_res_df.to_csv(f"./{sample}_{label}_asv_taxa.tsv", sep="\t", index=False)
+
 
 if __name__ == "__main__":
     main()
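
The one behavioural change in this file is easy to miss among the reformatting: pd.read_csv now uses header=0 instead of header=1, so the first line of the MAPseq table becomes the column names instead of being skipped. A sketch of the difference with inline toy data (delim_whitespace=True still works here, though newer pandas releases deprecate it in favour of sep=r"\s+"):

    import io
    import pandas as pd

    data = "read_id score\nr1 0.99\nr2 0.95\n"  # hypothetical whitespace-separated table

    df = pd.read_csv(io.StringIO(data), header=0, sep=r"\s+")  # line 0 -> column names
    print(df.columns.tolist())  # ['read_id', 'score']
    print(len(df))  # 2; header=1 would instead give columns ['r1', '0.99'] and 1 row
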
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py
@@ -21,24 +21,41 @@ import re
 from Bio import SeqIO
 import pandas as pd
 
-from mgnify_pipelines_toolkit.constants.var_region_coordinates import REGIONS_16S_BACTERIA, REGIONS_16S_ARCHAEA, REGIONS_18S
+from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
+    REGIONS_16S_BACTERIA,
+    REGIONS_16S_ARCHAEA,
+    REGIONS_18S,
+)
 
 STRAND_FWD = "fwd"
 STRAND_REV = "rev"
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("-i", "--input", required=True, type=str, help="Path to cmsearch_deoverlap_tblout file")
-    parser.add_argument("-f", "--fasta", required=True, type=str, help="Path to concatenated primers fasta file")
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to cmsearch_deoverlap_tblout file",
+    )
+    parser.add_argument(
+        "-f",
+        "--fasta",
+        required=True,
+        type=str,
+        help="Path to concatenated primers fasta file",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     args = parser.parse_args()
 
-    _INPUT = args.input
-    _FASTA = args.fasta
-    _SAMPLE = args.sample
+    input = args.input
+    fasta = args.fasta
+    sample = args.sample
 
-    return _INPUT, _FASTA, _SAMPLE
+    return input, fasta, sample
 
 
 def get_amp_region(beg, strand, model):
@@ -62,11 +79,11 @@ def get_amp_region(beg, strand, model):
 
 
 def main():
-    _INPUT, _FASTA, _SAMPLE = parse_args()
+    input, fasta, sample = parse_args()
     res_dict = defaultdict(list)
-    fasta_dict = SeqIO.to_dict(SeqIO.parse(_FASTA, "fasta"))
+    fasta_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
 
-    with open(_INPUT, "r") as fr:
+    with open(input, "r") as fr:
         for line in fr:
             line = line.strip()
             line = re.sub("[ \t]+", "\t", line)
@@ -88,7 +105,7 @@ def main():
             else:
                continue
 
-            res_dict["Run"].append(_SAMPLE)
+            res_dict["Run"].append(sample)
            res_dict["AssertionEvidence"].append("ECO_0000363")
            res_dict["AssertionMethod"].append("automatic assertion")
 
@@ -109,7 +126,7 @@ def main():
            res_dict["PrimerSeq"].append(primer_seq)
 
    res_df = pd.DataFrame.from_dict(res_dict)
-    res_df.to_csv(f"./{_SAMPLE}_primer_validation.tsv", sep="\t", index=False)
+    res_df.to_csv(f"./{sample}_primer_validation.tsv", sep="\t", index=False)
 
 
 if __name__ == "__main__":
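
The parsing trick this script relies on is unchanged: cmsearch tblout rows are aligned with variable runs of spaces, so every run of spaces or tabs is collapsed to a single tab before splitting into fields. A standalone sketch with a made-up row:

    import re

    line = "ERR0000001   16S-V3V4_F   cm    1   17   +"  # hypothetical tblout-style row
    fields = re.sub(r"[ \t]+", "\t", line.strip()).split("\t")
    print(fields)  # ['ERR0000001', '16S-V3V4_F', 'cm', '1', '17', '+']
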
mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
@@ -21,39 +21,48 @@ import gzip
 
 from Bio import SeqIO, bgzf
 
+
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("-f", "--fwd", required=True, type=str, help="Path to forward (or single-end) fastq file")
-    parser.add_argument("-r", "--rev", required=False, type=str, help="Path to reverse fastq file")
+    parser.add_argument(
+        "-f",
+        "--fwd",
+        required=True,
+        type=str,
+        help="Path to forward (or single-end) fastq file",
+    )
+    parser.add_argument(
+        "-r", "--rev", required=False, type=str, help="Path to reverse fastq file"
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     args = parser.parse_args()
-
-    _FWD = args.fwd
-    _REV = args.rev
-    _SAMPLE = args.sample
 
-    return _FWD, _REV, _SAMPLE
+    fwd = args.fwd
+    rev = args.rev
+    sample = args.sample
+
+    return fwd, rev, sample
 
 
 def main():
 
-    _FWD, _REV, _SAMPLE = parse_args()
+    fwd, rev, sample = parse_args()
 
-    fwd_handle = gzip.open(_FWD, "rt")
+    fwd_handle = gzip.open(fwd, "rt")
     fwd_reads = SeqIO.to_dict(SeqIO.parse(fwd_handle, "fastq"))
     fwd_handle.close()
 
     paired_end = True
 
-    if _REV == None:
+    if rev is None:
         paired_end = False
     else:
-        rev_handle = gzip.open(_REV, "rt")
+        rev_handle = gzip.open(rev, "rt")
         rev_reads = SeqIO.to_dict(SeqIO.parse(rev_handle, "fastq"))
         rev_handle.close()
-
+
     remove_set = set()
 
     for read_id in fwd_reads.keys():
@@ -78,23 +87,24 @@ def main():
            remove_set.add(read_id)
            continue
 
-    [ fwd_reads.pop(read_id) for read_id in remove_set ]
+    [fwd_reads.pop(read_id) for read_id in remove_set]
     if paired_end:
-        [ rev_reads.pop(read_id) for read_id in remove_set ]
+        [rev_reads.pop(read_id) for read_id in remove_set]
 
     if paired_end:
-        fwd_handle = bgzf.BgzfWriter(f"./{_SAMPLE}_noambig_1.fastq.gz", "wb")
-        rev_handle = bgzf.BgzfWriter(f"./{_SAMPLE}_noambig_2.fastq.gz", "wb")
-
+        fwd_handle = bgzf.BgzfWriter(f"./{sample}_noambig_1.fastq.gz", "wb")
+        rev_handle = bgzf.BgzfWriter(f"./{sample}_noambig_2.fastq.gz", "wb")
+
         SeqIO.write(sequences=fwd_reads.values(), handle=fwd_handle, format="fastq")
         SeqIO.write(sequences=rev_reads.values(), handle=rev_handle, format="fastq")
 
         fwd_handle.close()
         rev_handle.close()
     else:
-        fwd_handle = bgzf.BgzfWriter(f"./{_SAMPLE}_noambig.fastq.gz", "wb")
+        fwd_handle = bgzf.BgzfWriter(f"./{sample}_noambig.fastq.gz", "wb")
         SeqIO.write(sequences=fwd_reads.values(), handle=fwd_handle, format="fastq")
         fwd_handle.close()
 
+
 if __name__ == "__main__":
-    main()
+    main()
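
The reformat keeps the list comprehensions used purely for their pop side effect; only their spacing changes. A plain loop would perform the same deletion without building a throwaway list. A possible alternative (with toy stand-ins for the script's SeqRecord dicts), not what this release ships:

    remove_set = {"r2"}
    fwd_reads = {"r1": "ACGT", "r2": "ANGT"}

    for read_id in remove_set:
        fwd_reads.pop(read_id, None)  # default None tolerates ids already removed
    print(list(fwd_reads))  # ['r1']
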
mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
@@ -16,29 +16,37 @@
 
 import argparse
 
-from Bio import Seq, SeqIO
+from Bio import SeqIO
+
 
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("-i", "--input", required=True, type=str, help="Path to finalised primer list fasta file")
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to finalised primer list fasta file",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
     args = parser.parse_args()
-
-    _INPUT = args.input
-    _SAMPLE = args.sample
-    _OUTPUT = args.output
 
-    return _INPUT, _SAMPLE, _OUTPUT
+    input = args.input
+    sample = args.sample
+    output = args.output
+
+    return input, sample, output
+
 
 def main():
-
-    _INPUT, _SAMPLE, _OUTPUT = parse_args()
 
-    primers_dict = SeqIO.to_dict(SeqIO.parse(_INPUT, "fasta"))
-
+    input, sample, output = parse_args()
+
+    primers_dict = SeqIO.to_dict(SeqIO.parse(input, "fasta"))
+
     for primer_key in primers_dict.keys():
 
         primer = primers_dict[primer_key]
@@ -47,8 +55,10 @@ def main():
         if "R" in primer_name:
             primers_dict[primer_key].seq = primer.seq.reverse_complement()
 
-    SeqIO.write(primers_dict.values(), f"{_OUTPUT}/{_SAMPLE}_rev_comp_se_primers.fasta", "fasta")
+    SeqIO.write(
+        primers_dict.values(), f"{output}/{sample}_rev_comp_se_primers.fasta", "fasta"
+    )
 
 
 if __name__ == "__main__":
-    main()
+    main()
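
The dropped Seq import appears to have been unused: records parsed with SeqIO already carry Seq objects, whose reverse_complement() also handles IUPAC ambiguity codes. A sketch of the operation applied to a reverse primer (the primer sequence is hypothetical):

    from Bio.Seq import Seq

    primer = Seq("TACGGRAGGCAGCAG")  # hypothetical primer with ambiguous base R
    print(primer.reverse_complement())  # CTGCTGCCTYCCGTA
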