mgnify-pipelines-toolkit 1.2.3__py3-none-any.whl → 1.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

@@ -0,0 +1,87 @@
1
+ import argparse
2
+ from itertools import product
3
+ from pathlib import Path
4
+
5
+ from Bio import SeqIO
6
+
7
+
8
def parse_args():
    """Parse the command-line options of the primer permutation script.

    Both options are mandatory; argparse exits with a usage error when one
    of them is missing.

    Returns:
        A ``(input_path, prefix)`` tuple of strings: the value passed to
        ``-i/--input_primers`` followed by the value passed to
        ``-p/--prefix``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-i",
        "--input_primers",
        required=True,
        type=str,
        help="Input primers to generate permutations for due to IUPAC ambiguous codes",
    )
    cli.add_argument(
        "-p",
        "--prefix",
        required=True,
        type=str,
        help="Output prefix",
    )

    parsed = cli.parse_args()
    return parsed.input_primers, parsed.prefix
26
+
27
+
28
def permute_seq(seq):
    """Expand a sequence with IUPAC ambiguity codes into concrete sequences.

    Every ambiguous position is replaced by each base it stands for, and
    the cartesian product over all positions is returned. Input is handled
    case-insensitively; the returned sequences are upper-case.

    Args:
        seq: The primer sequence, as a ``str`` or any object whose ``str()``
            is the sequence text.

    Returns:
        A list of strings, one per combination. A sequence without
        ambiguous bases yields a single-element list; an empty sequence
        yields ``[""]``.

    Raises:
        KeyError: If the sequence contains a character that is neither
            A/C/G/T nor one of the supported IUPAC ambiguity codes.
    """
    # Unambiguous bases map to themselves so every position goes through the
    # same lookup. The order of the alternatives determines the order (and
    # therefore the numbering) of the generated variants, so it is kept
    # exactly as originally listed (note "S" -> G, C and "N" -> A, C, T, G).
    iupac_expansions = {
        "A": "A",
        "C": "C",
        "G": "G",
        "T": "T",
        "R": "AG",
        "Y": "CT",
        "S": "GC",
        "W": "AT",
        "K": "GT",
        "M": "AC",
        "B": "CGT",
        "D": "AGT",
        "H": "ACT",
        "V": "ACG",
        "N": "ACTG",
    }

    seq_text = str(seq)
    options_per_position = []
    # FASTA files may carry lower-case bases; normalise before the lookup.
    for base in seq_text.upper():
        if base not in iupac_expansions:
            raise KeyError(
                f"Unsupported base {base!r} in primer sequence {seq_text!r}"
            )
        options_per_position.append(iupac_expansions[base])

    return ["".join(combo) for combo in product(*options_per_position)]
57
+
58
+
59
def make_primer_permutations(primers_dict, prefix):
    """Write every primer and its expanded variants to a FASTA file.

    The output file is ``{prefix}_permuted_primers.fasta``. For each primer
    the original record is written first, followed by one record per
    sequence returned by ``permute_seq``, named
    ``{primer_name}_variant_{n}_{strand}`` with ``n`` starting at 1.

    Args:
        primers_dict: Mapping of primer name to a record object exposing the
            sequence as its ``.seq`` attribute.
        prefix: Output path prefix.

    Raises:
        ValueError: If a primer name is neither ``F_auto``/``R_auto`` nor
            ends in ``F`` or ``R``, so its strand cannot be determined.
    """
    # Resolve every strand before touching the output file. Previously an
    # unrecognised name either crashed on an unbound ``strand`` or silently
    # inherited the strand of the preceding primer; failing up front also
    # avoids leaving a half-written FASTA behind.
    strand_by_primer = {}
    for primer_name in primers_dict:
        if primer_name == "F_auto" or primer_name.endswith("F"):
            strand_by_primer[primer_name] = "F"
        elif primer_name == "R_auto" or primer_name.endswith("R"):
            strand_by_primer[primer_name] = "R"
        else:
            raise ValueError(
                f"Cannot determine strand for primer {primer_name!r}: "
                "expected 'F_auto', 'R_auto' or a name ending in 'F' or 'R'"
            )

    with open(f"{prefix}_permuted_primers.fasta", "w") as fw:
        for primer_name, record in primers_dict.items():
            primer_seq = record.seq
            strand = strand_by_primer[primer_name]

            # The unexpanded primer is kept alongside its variants.
            fw.write(f">{primer_name}\n{primer_seq}\n")

            for counter, permuted_seq in enumerate(permute_seq(primer_seq), 1):
                variant_name = f"{primer_name}_variant_{counter}_{strand}"
                fw.write(f">{variant_name}\n{permuted_seq}\n")
77
+
78
+
79
def main():
    """Read the input primer FASTA and write its permuted variants.

    Takes the input path and output prefix from the command line, loads the
    FASTA records into a dictionary via ``SeqIO.to_dict`` and passes them to
    ``make_primer_permutations``.
    """
    primers_fasta, out_prefix = parse_args()
    records = SeqIO.parse(Path(primers_fasta), "fasta")
    make_primer_permutations(SeqIO.to_dict(records), out_prefix)
84
+
85
+
86
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
@@ -97,6 +97,8 @@ def main():
97
97
  fwd_primers_fw = open("./fwd_primers.fasta", "w")
98
98
  rev_primers_fw = open("./rev_primers.fasta", "w")
99
99
 
100
+ matched_primers_list = []
101
+
100
102
  with open(input, "r") as fr:
101
103
  for line in fr:
102
104
  line = line.strip()
@@ -108,6 +110,10 @@ def main():
108
110
  beg = float(line_lst[5])
109
111
  end = float(line_lst[6])
110
112
 
113
+ cleaned_primer_name = "_".join(primer_name.split("_")[0:-3])
114
+ if cleaned_primer_name in matched_primers_list:
115
+ continue
116
+
111
117
  if rfam == "RF00177":
112
118
  gene = "16S"
113
119
  model = REGIONS_16S_BACTERIA
@@ -118,7 +124,7 @@ def main():
118
124
  gene = "18S"
119
125
  model = REGIONS_18S
120
126
  else: # For cases when it's a std primer but for some reason hasn't matched the model
121
- if primer_name == "F_auto" or primer_name == "R_auto":
127
+ if cleaned_primer_name == "F_auto" or cleaned_primer_name == "R_auto":
122
128
  continue
123
129
  gene = "Unknown"
124
130
  amp_region = "Unknown"
@@ -130,27 +136,32 @@ def main():
130
136
 
131
137
  strand = ""
132
138
 
133
- if primer_name == "F_auto" or primer_name[-1] == "F":
139
+ if primer_name[-1] == "F":
134
140
  strand = STRAND_FWD
135
- elif primer_name == "R_auto" or primer_name[-1] == "R":
141
+ elif primer_name[-1] == "R":
136
142
  strand = STRAND_REV
143
+ else:
144
+ print(f"Not sure what strand this is, exiting: {primer_name}")
137
145
 
138
146
  if model:
139
147
  amp_region = get_amp_region(beg, end, strand, model)
140
- primer_seq = str(fasta_dict[primer_name].seq)
148
+
149
+ primer_seq = str(fasta_dict[cleaned_primer_name].seq)
141
150
 
142
151
  res_dict["Gene"].append(gene)
143
152
  res_dict["VariableRegion"].append(amp_region)
144
- res_dict["PrimerName"].append(primer_name)
153
+ res_dict["PrimerName"].append(cleaned_primer_name)
145
154
  res_dict["PrimerStrand"].append(strand)
146
155
  res_dict["PrimerSeq"].append(primer_seq)
147
156
 
148
157
  if strand == STRAND_FWD:
149
- fwd_primers_fw.write(f">{primer_name}\n{primer_seq}\n")
158
+ fwd_primers_fw.write(f">{cleaned_primer_name}\n{primer_seq}\n")
150
159
  elif strand == STRAND_REV:
151
160
  if single_end:
152
161
  primer_seq = Seq(primer_seq).reverse_complement()
153
- rev_primers_fw.write(f">{primer_name}\n{primer_seq}\n")
162
+ rev_primers_fw.write(f">{cleaned_primer_name}\n{primer_seq}\n")
163
+
164
+ matched_primers_list.append(cleaned_primer_name)
154
165
 
155
166
  res_df = pd.DataFrame.from_dict(res_dict)
156
167
  res_df.to_csv(f"./{sample}_primer_validation.tsv", sep="\t", index=False)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 1.2.3
3
+ Version: 1.2.4
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -3,7 +3,8 @@ mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
3
3
  mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=nUvboLz08RyqRE8Thfh8NRlmgJk0kVdXcSvgmAfKip0,20649
4
4
  mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=ohguvrMSg7GuiiZ5aHj1DvCnfThKFUG4s13LUSMM0mo,8892
5
5
  mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=BLqhflblUegCvuQic16PrFXfIXlFWmGkmWJyl4wJoLQ,5040
6
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=aJiKzp2uLdVeobCOF2ySkofXPN5NMaQ7esc4oGDx_h4,4841
6
+ mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py,sha256=1aGOJX9tC7M1rnd0U2PeJ681sUo02wxk7_ycJqeVt6s,2216
7
+ mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=I9JfkM_o6Wp4VINOMO6ff9mHqghdJw1kDIfiF37JtLo,5185
7
8
  mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=Wu4tRtuRkgd3hoeuwPl_E5ghxIW7e_1vrcvFGWv_U4A,3173
8
9
  mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=yLpzkRJXAeXRUNgz60zopEwHcdprM2UDjquE-GkrFys,1722
9
10
  mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py,sha256=epVClL10QcllL8yu7YGjx0rXNVHL2GxHi-Ek0MOjsjo,13859
@@ -42,9 +43,9 @@ mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pyDZvCuWbwccQF0D7c5BN1vv36wQd
42
43
  mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
44
  mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
44
45
  mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
45
- mgnify_pipelines_toolkit-1.2.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
46
- mgnify_pipelines_toolkit-1.2.3.dist-info/METADATA,sha256=CJCRbR2EbaS2qjpqqjRJHQ7jDjg7jZc9SLDbkZMjwcc,5775
47
- mgnify_pipelines_toolkit-1.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
48
- mgnify_pipelines_toolkit-1.2.3.dist-info/entry_points.txt,sha256=d7r4_VUS1hWNMnTJOy8u2kTRSFcy-sDN5NLRUXz-IhU,3041
49
- mgnify_pipelines_toolkit-1.2.3.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
50
- mgnify_pipelines_toolkit-1.2.3.dist-info/RECORD,,
46
+ mgnify_pipelines_toolkit-1.2.4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
47
+ mgnify_pipelines_toolkit-1.2.4.dist-info/METADATA,sha256=UXCHFcEcjuPMZvUgtzITSY_iIG-j_nfGVBMGCWjBjjA,5775
48
+ mgnify_pipelines_toolkit-1.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
49
+ mgnify_pipelines_toolkit-1.2.4.dist-info/entry_points.txt,sha256=sHDxlHizt_iZPtkNp0EDuohDGvC4O12B57JtpUmHwYk,3123
50
+ mgnify_pipelines_toolkit-1.2.4.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
51
+ mgnify_pipelines_toolkit-1.2.4.dist-info/RECORD,,
@@ -23,6 +23,7 @@ make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count
23
23
  mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main
24
24
  mapseq_to_asv_table = mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main
25
25
  markergene_study_summary = mgnify_pipelines_toolkit.analysis.shared.markergene_study_summary:main
26
+ permute_primers = mgnify_pipelines_toolkit.analysis.amplicon.permute_primers:main
26
27
  primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main
27
28
  process_dbcan_cazys = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_cazys:main
28
29
  process_dbcan_clusters = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_clusters:main