mgnify-pipelines-toolkit 1.2.2__py3-none-any.whl → 1.2.4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic; review the release details on the registry page for more information.

@@ -22,12 +22,12 @@ import os
22
22
  import logging
23
23
  import json
24
24
  import time
25
+ import numpy as np
25
26
 
26
27
  from mgnify_pipelines_toolkit.constants.thresholds import (
27
28
  MIN_OVERLAP,
28
29
  MIN_SEQ_COUNT,
29
30
  MAX_ERROR_PROPORTION,
30
- MAX_INTERNAL_PRIMER_PROPORTION,
31
31
  )
32
32
  from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
33
33
  REGIONS_16S_BACTERIA,
@@ -62,8 +62,16 @@ def get_multiregion(raw_sequence_coords, regions):
62
62
 
63
63
  Returns:
64
64
  amplified_region: Amplified variable regions.
65
+ region_coverages: Coverage of all detected variable regions
65
66
 
66
67
  """
68
+
69
+ region_coverages = defaultdict(float)
70
+
71
+ for region, limits in regions.items():
72
+ overlap = calc_overlap(raw_sequence_coords, limits)
73
+ region_coverages[region] = overlap
74
+
67
75
  # check if any of the coords are inside the region
68
76
  matched_regions = [
69
77
  region
@@ -76,7 +84,7 @@ def get_multiregion(raw_sequence_coords, regions):
76
84
  amplified_region = matched_regions[0]
77
85
  else:
78
86
  amplified_region = ""
79
- return amplified_region
87
+ return amplified_region, region_coverages
80
88
 
81
89
 
82
90
  def check_primer_position(raw_sequence_coords, regions):
@@ -90,7 +98,7 @@ def check_primer_position(raw_sequence_coords, regions):
90
98
 
91
99
  """
92
100
  result_flag = False
93
- margin = 3 # allowed margin of error
101
+ margin = 10 # allowed margin of error
94
102
  for coord in raw_sequence_coords:
95
103
  for region in regions.values():
96
104
  if coord in range(region[0] + margin, region[1] - margin):
@@ -342,22 +350,30 @@ def retrieve_regions(
342
350
  per_read_info = (
343
351
  dict()
344
352
  ) # dictionary will contain read names for each variable region
353
+ all_region_coverages = defaultdict(lambda: defaultdict(list))
345
354
  for read in data:
355
+ # Example structure of `read`
356
+ # ('ERR14650515.1', 'SSU_rRNA_archaea', 'RF01959', 'hmm', '3', '525', '1', '518', '+', '-', '6', '0.55', '0.6', '363.6', '7.8e-107')
346
357
  regions = determine_cm(read[2])
347
358
  sequence_counter_total += 1
348
359
  limits = list(map(int, read[4:6]))
349
360
  domain = determine_domain(read[2])
350
361
  marker_gene = determine_marker_gene(domain)
351
362
  if not regions == "unsupported":
352
- multiregion_matches.setdefault(read[2], []).append(
353
- get_multiregion(limits, regions)
354
- )
363
+ matches, coverages = get_multiregion(limits, regions)
364
+
365
+ [
366
+ all_region_coverages[domain][region].append(coverage)
367
+ for region, coverage in coverages.items()
368
+ ]
369
+
370
+ multiregion_matches.setdefault(read[2], []).append(matches)
355
371
  if check_primer_position(limits, regions):
356
372
  primer_inside_vr += 1
357
373
  sequence_counter_useful += 1
358
- per_read_info.setdefault(
359
- marker_gene + "." + get_multiregion(limits, regions), []
360
- ).append(read[0])
374
+ per_read_info.setdefault(marker_gene + "." + matches, []).append(
375
+ read[0]
376
+ )
361
377
  else:
362
378
  unsupported_matches += 1
363
379
 
@@ -385,18 +401,6 @@ def retrieve_regions(
385
401
  )
386
402
  continue
387
403
 
388
- # filter out runs with too many sequences starting/ending inside variable regions
389
- internal_seq_fract = primer_inside_vr / len(data)
390
- if internal_seq_fract > MAX_INTERNAL_PRIMER_PROPORTION:
391
- failed_run_counter += 1
392
- logging.info("No output will be produced - too many internal mappings")
393
- logging.info(
394
- "Excluded due to high proportion of internal primers:\t{}\t{}\n".format(
395
- tblout_file, "{0:.2f}".format(internal_seq_fract)
396
- )
397
- )
398
- continue
399
-
400
404
  normalised_matches[run_id] = dict()
401
405
  region_counter = defaultdict(int)
402
406
 
@@ -432,14 +436,12 @@ def retrieve_regions(
432
436
  multiregion_matches[model] = new_value
433
437
 
434
438
  [multiregion_matches.pop(model) for model in models_to_remove]
435
- print(multiregion_matches)
436
439
 
437
440
  run_status = "one"
438
441
  run_result = dict()
439
442
  total_useful_sequences = 0.0
440
443
  temp_seq_counter = dict()
441
444
  for model, model_regions in multiregion_matches.items():
442
- print(model)
443
445
  result = normalise_results(model_regions)
444
446
  if result is None:
445
447
  run_status = "ambiguous"
@@ -469,6 +471,16 @@ def retrieve_regions(
469
471
  logging.info("No output will be produced - the run is ambiguous.")
470
472
  continue
471
473
 
474
+ coverage_fw = open(f"{outfile_prefix}_all_coverages.txt", "w")
475
+
476
+ for domain, regions in all_region_coverages.items():
477
+ for region in regions:
478
+ if len(regions[region]) < MIN_SEQ_COUNT:
479
+ continue
480
+ region_coverage = float(np.mean(regions[region]))
481
+ if region_coverage > 0:
482
+ coverage_fw.write(f"{domain}:{region}: {region_coverage}\n")
483
+
472
484
  json_outfile = "{}.json".format(outfile_prefix)
473
485
  tsv_outfile = "{}.tsv".format(outfile_prefix)
474
486
  with open(json_outfile, "w") as f:
@@ -0,0 +1,87 @@
1
+ import argparse
2
+ from itertools import product
3
+ from pathlib import Path
4
+
5
+ from Bio import SeqIO
6
+
7
+
8
+ def parse_args():
9
+
10
+ parser = argparse.ArgumentParser()
11
+ parser.add_argument(
12
+ "-i",
13
+ "--input_primers",
14
+ required=True,
15
+ type=str,
16
+ help="Input primers to generate permutations for due to IUPAC ambiguous codes",
17
+ )
18
+ parser.add_argument("-p", "--prefix", required=True, type=str, help="Output prefix")
19
+
20
+ args = parser.parse_args()
21
+
22
+ input_path = args.input_primers
23
+ prefix = args.prefix
24
+
25
+ return input_path, prefix
26
+
27
+
28
+ def permute_seq(seq):
29
+
30
+ ambiguous_bases_dict = {
31
+ "R": ["A", "G"],
32
+ "Y": ["C", "T"],
33
+ "S": ["G", "C"],
34
+ "W": ["A", "T"],
35
+ "K": ["G", "T"],
36
+ "M": ["A", "C"],
37
+ "B": ["C", "G", "T"],
38
+ "D": ["A", "G", "T"],
39
+ "H": ["A", "C", "T"],
40
+ "V": ["A", "C", "G"],
41
+ "N": ["A", "C", "T", "G"],
42
+ }
43
+
44
+ seq_template = []
45
+
46
+ for base in seq:
47
+ if base in ["A", "C", "T", "G"]:
48
+ seq_template.append(base)
49
+ else:
50
+ seq_template.append(ambiguous_bases_dict[base])
51
+
52
+ seq_permutations = []
53
+ for combo in product(*seq_template):
54
+ seq_permutations.append("".join(combo))
55
+
56
+ return seq_permutations
57
+
58
+
59
+ def make_primer_permutations(primers_dict, prefix):
60
+
61
+ with open(f"{prefix}_permuted_primers.fasta", "w") as fw:
62
+ for primer_name, seq in primers_dict.items():
63
+
64
+ primer_seq = seq.seq
65
+ fw.write(f">{primer_name}\n{primer_seq}\n")
66
+
67
+ if primer_name == "F_auto" or primer_name[-1] == "F":
68
+ strand = "F"
69
+ elif primer_name == "R_auto" or primer_name[-1] == "R":
70
+ strand = "R"
71
+
72
+ seq_permutations = permute_seq(primer_seq)
73
+
74
+ for counter, permuted_seq in enumerate(seq_permutations, 1):
75
+ variant_name = f"{primer_name}_variant_{counter}_{strand}"
76
+ fw.write(f">{variant_name}\n{permuted_seq}\n")
77
+
78
+
79
+ def main():
80
+
81
+ input_path, prefix = parse_args()
82
+ primers_dict = SeqIO.to_dict(SeqIO.parse(Path(input_path), "fasta"))
83
+ make_primer_permutations(primers_dict, prefix)
84
+
85
+
86
+ if __name__ == "__main__":
87
+ main()
@@ -19,6 +19,7 @@ from collections import defaultdict
19
19
  import re
20
20
 
21
21
  from Bio import SeqIO
22
+ from Bio.Seq import Seq
22
23
  import pandas as pd
23
24
 
24
25
  from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
@@ -49,18 +50,26 @@ def parse_args():
49
50
  help="Path to concatenated primers fasta file",
50
51
  )
51
52
  parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
53
+ parser.add_argument(
54
+ "--se",
55
+ action=argparse.BooleanOptionalAction,
56
+ help="Flag for if run is single-end",
57
+ )
52
58
  args = parser.parse_args()
53
59
 
54
60
  input = args.input
55
61
  fasta = args.fasta
56
62
  sample = args.sample
63
+ single_end = args.se
57
64
 
58
- return input, fasta, sample
65
+ return input, fasta, sample, single_end
59
66
 
60
67
 
61
68
  def get_amp_region(beg, end, strand, model):
62
69
  prev_region = ""
63
70
 
71
+ margin = -10
72
+
64
73
  for region, region_coords in model.items():
65
74
 
66
75
  region_beg = region_coords[0]
@@ -68,10 +77,10 @@ def get_amp_region(beg, end, strand, model):
68
77
  end_diff = region_beg - end
69
78
 
70
79
  if strand == STRAND_FWD:
71
- if beg_diff > 0 and end_diff > 0:
80
+ if beg_diff >= margin and end_diff >= margin:
72
81
  return region
73
82
  else:
74
- if beg_diff > 0 and end_diff > 0:
83
+ if beg_diff >= margin and end_diff >= margin:
75
84
  return prev_region
76
85
 
77
86
  prev_region = region
@@ -80,10 +89,16 @@ def get_amp_region(beg, end, strand, model):
80
89
 
81
90
 
82
91
  def main():
83
- input, fasta, sample = parse_args()
92
+
93
+ input, fasta, sample, single_end = parse_args()
84
94
  res_dict = defaultdict(list)
85
95
  fasta_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
86
96
 
97
+ fwd_primers_fw = open("./fwd_primers.fasta", "w")
98
+ rev_primers_fw = open("./rev_primers.fasta", "w")
99
+
100
+ matched_primers_list = []
101
+
87
102
  with open(input, "r") as fr:
88
103
  for line in fr:
89
104
  line = line.strip()
@@ -95,6 +110,10 @@ def main():
95
110
  beg = float(line_lst[5])
96
111
  end = float(line_lst[6])
97
112
 
113
+ cleaned_primer_name = "_".join(primer_name.split("_")[0:-3])
114
+ if cleaned_primer_name in matched_primers_list:
115
+ continue
116
+
98
117
  if rfam == "RF00177":
99
118
  gene = "16S"
100
119
  model = REGIONS_16S_BACTERIA
@@ -104,8 +123,12 @@ def main():
104
123
  elif rfam == "RF01960":
105
124
  gene = "18S"
106
125
  model = REGIONS_18S
107
- else:
108
- continue
126
+ else: # For cases when it's a std primer but for some reason hasn't matched the model
127
+ if cleaned_primer_name == "F_auto" or cleaned_primer_name == "R_auto":
128
+ continue
129
+ gene = "Unknown"
130
+ amp_region = "Unknown"
131
+ model = ""
109
132
 
110
133
  res_dict["Run"].append(sample)
111
134
  res_dict["AssertionEvidence"].append("ECO_0000363")
@@ -113,23 +136,39 @@ def main():
113
136
 
114
137
  strand = ""
115
138
 
116
- if "F" in primer_name:
139
+ if primer_name[-1] == "F":
117
140
  strand = STRAND_FWD
118
- elif "R" in primer_name:
141
+ elif primer_name[-1] == "R":
119
142
  strand = STRAND_REV
143
+ else:
144
+ print(f"Not sure what strand this is, exiting: {primer_name}")
145
+
146
+ if model:
147
+ amp_region = get_amp_region(beg, end, strand, model)
120
148
 
121
- amp_region = get_amp_region(beg, end, strand, model)
122
- primer_seq = str(fasta_dict[primer_name].seq)
149
+ primer_seq = str(fasta_dict[cleaned_primer_name].seq)
123
150
 
124
151
  res_dict["Gene"].append(gene)
125
152
  res_dict["VariableRegion"].append(amp_region)
126
- res_dict["PrimerName"].append(primer_name)
153
+ res_dict["PrimerName"].append(cleaned_primer_name)
127
154
  res_dict["PrimerStrand"].append(strand)
128
155
  res_dict["PrimerSeq"].append(primer_seq)
129
156
 
157
+ if strand == STRAND_FWD:
158
+ fwd_primers_fw.write(f">{cleaned_primer_name}\n{primer_seq}\n")
159
+ elif strand == STRAND_REV:
160
+ if single_end:
161
+ primer_seq = Seq(primer_seq).reverse_complement()
162
+ rev_primers_fw.write(f">{cleaned_primer_name}\n{primer_seq}\n")
163
+
164
+ matched_primers_list.append(cleaned_primer_name)
165
+
130
166
  res_df = pd.DataFrame.from_dict(res_dict)
131
167
  res_df.to_csv(f"./{sample}_primer_validation.tsv", sep="\t", index=False)
132
168
 
169
+ fwd_primers_fw.close()
170
+ rev_primers_fw.close()
171
+
133
172
 
134
173
  if __name__ == "__main__":
135
174
  main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 1.2.2
3
+ Version: 1.2.4
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -1,9 +1,10 @@
1
1
  mgnify_pipelines_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=W0ob9z_8sjrB1Ck48Ac-_5Vw2kyoRFalcxhrR6KSXpI,20196
3
+ mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=nUvboLz08RyqRE8Thfh8NRlmgJk0kVdXcSvgmAfKip0,20649
4
4
  mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=ohguvrMSg7GuiiZ5aHj1DvCnfThKFUG4s13LUSMM0mo,8892
5
5
  mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=BLqhflblUegCvuQic16PrFXfIXlFWmGkmWJyl4wJoLQ,5040
6
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=Bmc4Yu8inpT6AVTG1zwxp9F9mknIDLY33-UuFdaZuq0,3756
6
+ mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py,sha256=1aGOJX9tC7M1rnd0U2PeJ681sUo02wxk7_ycJqeVt6s,2216
7
+ mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=I9JfkM_o6Wp4VINOMO6ff9mHqghdJw1kDIfiF37JtLo,5185
7
8
  mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=Wu4tRtuRkgd3hoeuwPl_E5ghxIW7e_1vrcvFGWv_U4A,3173
8
9
  mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=yLpzkRJXAeXRUNgz60zopEwHcdprM2UDjquE-GkrFys,1722
9
10
  mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py,sha256=epVClL10QcllL8yu7YGjx0rXNVHL2GxHi-Ek0MOjsjo,13859
@@ -42,9 +43,9 @@ mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pyDZvCuWbwccQF0D7c5BN1vv36wQd
42
43
  mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
44
  mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
44
45
  mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
45
- mgnify_pipelines_toolkit-1.2.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
46
- mgnify_pipelines_toolkit-1.2.2.dist-info/METADATA,sha256=MRDDDp1xhJhoIc-rU1IKWASfleAvFXsJLyhaoG8FLUg,5775
47
- mgnify_pipelines_toolkit-1.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
48
- mgnify_pipelines_toolkit-1.2.2.dist-info/entry_points.txt,sha256=d7r4_VUS1hWNMnTJOy8u2kTRSFcy-sDN5NLRUXz-IhU,3041
49
- mgnify_pipelines_toolkit-1.2.2.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
50
- mgnify_pipelines_toolkit-1.2.2.dist-info/RECORD,,
46
+ mgnify_pipelines_toolkit-1.2.4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
47
+ mgnify_pipelines_toolkit-1.2.4.dist-info/METADATA,sha256=UXCHFcEcjuPMZvUgtzITSY_iIG-j_nfGVBMGCWjBjjA,5775
48
+ mgnify_pipelines_toolkit-1.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
49
+ mgnify_pipelines_toolkit-1.2.4.dist-info/entry_points.txt,sha256=sHDxlHizt_iZPtkNp0EDuohDGvC4O12B57JtpUmHwYk,3123
50
+ mgnify_pipelines_toolkit-1.2.4.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
51
+ mgnify_pipelines_toolkit-1.2.4.dist-info/RECORD,,
@@ -23,6 +23,7 @@ make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count
23
23
  mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main
24
24
  mapseq_to_asv_table = mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main
25
25
  markergene_study_summary = mgnify_pipelines_toolkit.analysis.shared.markergene_study_summary:main
26
+ permute_primers = mgnify_pipelines_toolkit.analysis.amplicon.permute_primers:main
26
27
  primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main
27
28
  process_dbcan_cazys = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_cazys:main
28
29
  process_dbcan_clusters = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_clusters:main