mgnify-pipelines-toolkit 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic; see the release's advisory details on the registry page for more information.

@@ -449,7 +449,7 @@ def parse_args(argv):
449
449
  return parser.parse_args(argv)
450
450
 
451
451
 
452
- def main(argv):
452
+ def main(argv=None):
453
453
  t_start = time.perf_counter() # time the run
454
454
  args = parse_args(argv)
455
455
  if not os.path.isdir(args.output_dir):
@@ -481,7 +481,7 @@ def main(argv):
481
481
 
482
482
 
483
483
  if __name__ == '__main__':
484
- main(sys.argv[1:])
484
+ main()
485
485
 
486
486
  # don't print json
487
487
  # name the tsv file better
@@ -16,8 +16,14 @@
16
16
 
17
17
  import argparse
18
18
  from collections import defaultdict
19
+ import logging
20
+
19
21
  import pandas as pd
20
22
 
23
+ from mgnify_pipelines_toolkit.constants.tax_ranks import _SILVA_TAX_RANKS, _PR2_TAX_RANKS
24
+
25
+ logging.basicConfig(level=logging.DEBUG)
26
+
21
27
  def parse_args():
22
28
 
23
29
  parser = argparse.ArgumentParser()
@@ -43,10 +49,13 @@ def parse_args():
43
49
 
44
50
  def order_df(taxa_df):
45
51
 
46
- if len(taxa_df.columns) == 7:
47
- taxa_df = taxa_df.sort_values(["Kingdom", "Phylum", "Class", "Order", "Family", "Genus"], ascending=True)
52
+ if len(taxa_df.columns) == 8:
53
+ taxa_df = taxa_df.sort_values(_SILVA_TAX_RANKS, ascending=True)
48
54
  elif len(taxa_df.columns) == 10:
49
- taxa_df = taxa_df.sort_values(["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"], ascending=True)
55
+ taxa_df = taxa_df.sort_values(_PR2_TAX_RANKS, ascending=True)
56
+ else:
57
+ logging.error("Data frame not the right size, something wrong.")
58
+ exit(1)
50
59
 
51
60
  return taxa_df
52
61
 
@@ -68,6 +77,7 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
68
77
  o = taxa_df.loc[sorted_index, "Order"]
69
78
  f = taxa_df.loc[sorted_index, "Family"]
70
79
  g = taxa_df.loc[sorted_index, "Genus"]
80
+ s = taxa_df.loc[sorted_index, "Species"]
71
81
 
72
82
  tax_assignment = ""
73
83
 
@@ -109,6 +119,11 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
109
119
  if g != "0":
110
120
  g = "_".join(g.split(" "))
111
121
  tax_assignment += f"\tg__{g}"
122
+ else:
123
+ break
124
+ if s != "0":
125
+ s = "_".join(s.split(" "))
126
+ tax_assignment += f"\ts__{s}"
112
127
  break
113
128
 
114
129
  if tax_assignment == "":
@@ -147,10 +162,6 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
147
162
  if d != "0":
148
163
  d = "_".join(d.split(" "))
149
164
  tax_assignment += f"d__{d}"
150
- # if d == "Archaea" or d == "Bacteria":
151
- # tax_assignment += f"d__{d}"
152
- # elif d == "Eukaryota":
153
- # tax_assignment += f"d__Eukaryota"
154
165
  else:
155
166
  break
156
167
 
@@ -159,7 +170,6 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
159
170
  tax_assignment += f"\tsg__{sg}"
160
171
  else:
161
172
  break
162
-
163
173
  if dv != "0":
164
174
  dv = "_".join(dv.split(" "))
165
175
  tax_assignment += f"\tdv__{dv}"
@@ -167,7 +177,6 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
167
177
  if sdv != "0":
168
178
  sdv = "_".join(sdv.split(" "))
169
179
  tax_assignment += f"\tsdv__{sdv}"
170
-
171
180
  if c != "0":
172
181
  c = "_".join(c.split(" "))
173
182
  tax_assignment += f"\tc__{c}"
@@ -250,12 +259,16 @@ def main():
250
259
  if paired_end:
251
260
  rev_fr.close()
252
261
 
253
- if len(taxa_df.columns) == 7:
262
+ ref_db = ""
263
+
264
+ if len(taxa_df.columns) == 8:
254
265
  tax_assignment_dict = make_tax_assignment_dict_silva(taxa_df, asv_dict)
266
+ ref_db = "silva"
255
267
  elif len(taxa_df.columns) == 10:
256
268
  tax_assignment_dict = make_tax_assignment_dict_pr2(taxa_df, asv_dict)
269
+ ref_db = "pr2"
257
270
 
258
- with open(f"./{_SAMPLE}_{amp_region}_asv_krona_counts.txt", "w") as fw:
271
+ with open(f"./{_SAMPLE}_{amp_region}_{ref_db}_asv_krona_counts.txt", "w") as fw:
259
272
  for tax_assignment, count in tax_assignment_dict.items():
260
273
  fw.write(f"{count}\t{tax_assignment}\n")
261
274
 
@@ -21,14 +21,14 @@ import gzip
21
21
 
22
22
  from Bio import SeqIO, bgzf
23
23
 
24
- def parse_args(argv=None):
24
+ def parse_args():
25
25
 
26
26
  parser = argparse.ArgumentParser()
27
27
 
28
28
  parser.add_argument("-f", "--fwd", required=True, type=str, help="Path to forward (or single-end) fastq file")
29
29
  parser.add_argument("-r", "--rev", required=False, type=str, help="Path to reverse fastq file")
30
30
  parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
31
- args = parser.parse_args(argv)
31
+ args = parser.parse_args()
32
32
 
33
33
  _FWD = args.fwd
34
34
  _REV = args.rev
@@ -37,9 +37,9 @@ def parse_args(argv=None):
37
37
  return _FWD, _REV, _SAMPLE
38
38
 
39
39
 
40
- def main(argv=None):
40
+ def main():
41
41
 
42
- _FWD, _REV, _SAMPLE = parse_args(argv)
42
+ _FWD, _REV, _SAMPLE = parse_args()
43
43
 
44
44
  fwd_handle = gzip.open(_FWD, "rt")
45
45
  fwd_reads = SeqIO.to_dict(SeqIO.parse(fwd_handle, "fastq"))
@@ -54,25 +54,35 @@ def main(argv=None):
54
54
  rev_reads = SeqIO.to_dict(SeqIO.parse(rev_handle, "fastq"))
55
55
  rev_handle.close()
56
56
 
57
- remove_lst = []
57
+ remove_set = set()
58
58
 
59
59
  for read_id in fwd_reads.keys():
60
60
 
61
- if "N" in str(fwd_reads[read_id].seq):
62
- print(read_id)
63
- remove_lst.append(read_id)
61
+ fwd_read_seq = str(fwd_reads[read_id].seq)
62
+ if len(fwd_read_seq) < 100:
63
+ remove_set.add(read_id)
64
64
  continue
65
- elif paired_end and "N" in str(rev_reads[read_id].seq):
65
+ elif "N" in fwd_read_seq:
66
66
  print(read_id)
67
- remove_lst.append(read_id)
67
+ remove_set.add(read_id)
68
68
  continue
69
69
 
70
- [ fwd_reads.pop(read_id) for read_id in remove_lst ]
70
+ if paired_end:
71
+ rev_read_seq = str(rev_reads[read_id].seq)
72
+ if len(rev_read_seq) < 100:
73
+ print(read_id)
74
+ remove_set.add(read_id)
75
+ continue
76
+ elif "N" in rev_read_seq:
77
+ print(read_id)
78
+ remove_set.add(read_id)
79
+ continue
80
+
81
+ [ fwd_reads.pop(read_id) for read_id in remove_set ]
71
82
  if paired_end:
72
- [ rev_reads.pop(read_id) for read_id in remove_lst ]
83
+ [ rev_reads.pop(read_id) for read_id in remove_set ]
73
84
 
74
85
  if paired_end:
75
-
76
86
  fwd_handle = bgzf.BgzfWriter(f"./{_SAMPLE}_noambig_1.fastq.gz", "wb")
77
87
  rev_handle = bgzf.BgzfWriter(f"./{_SAMPLE}_noambig_2.fastq.gz", "wb")
78
88
 
@@ -15,18 +15,17 @@
15
15
  # limitations under the License.
16
16
 
17
17
  import argparse
18
- import sys
19
18
 
20
19
  from Bio import Seq, SeqIO
21
20
 
22
- def parse_args(argv=None):
21
+ def parse_args():
23
22
 
24
23
  parser = argparse.ArgumentParser()
25
24
 
26
25
  parser.add_argument("-i", "--input", required=True, type=str, help="Path to finalised primer list fasta file")
27
26
  parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
28
27
  parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
29
- args = parser.parse_args(argv)
28
+ args = parser.parse_args()
30
29
 
31
30
  _INPUT = args.input
32
31
  _SAMPLE = args.sample
@@ -34,9 +33,9 @@ def parse_args(argv=None):
34
33
 
35
34
  return _INPUT, _SAMPLE, _OUTPUT
36
35
 
37
- def main(argv=None):
36
+ def main():
38
37
 
39
- _INPUT, _SAMPLE, _OUTPUT = parse_args(argv)
38
+ _INPUT, _SAMPLE, _OUTPUT = parse_args()
40
39
 
41
40
  primers_dict = SeqIO.to_dict(SeqIO.parse(_INPUT, "fasta"))
42
41
 
@@ -101,7 +101,7 @@ def main():
101
101
  notaxidfile_header = []
102
102
 
103
103
  if args.taxid:
104
- if args.label.startswith("UNITE"):
104
+ if args.label.startswith("UNITE") or args.label.startswith("PR2"):
105
105
  output_header = [
106
106
  "# OTU ID",
107
107
  args.label,
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2024 EMBL - European Bioinformatics Institute
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ _SILVA_TAX_RANKS = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
18
+ _PR2_TAX_RANKS = ["Domain", "Supergroup", "Division", "Subdivision", "Class", "Order", "Family", "Genus", "Species"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -4,20 +4,21 @@ mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py,sha256=cF_7zsdJTzIC
4
4
  mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py,sha256=fNjzWpXjqpQNRyWG6CoROyHAvA4lZsvPh8sxDpjyMkY,5141
5
5
  mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py,sha256=leUjaD59EXA39HaMSDm909DGgNDuHVi-2EnUyBSNIWE,6847
6
6
  mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py,sha256=AUGeYWWpJ8sP5wpPbTq32G3Iai3NV5JQTZz6MQ0CUU4,4935
7
- mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=kmS5-2SSldXyGElQ11eC4IqwfvwtZXhDWgZQvbPCAgo,18904
7
+ mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=htoZyGuDV_64-4PRGpE2qjxw4VihmY4b-2ktuR03Pgg,18897
8
8
  mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py,sha256=5emeZjk8891IgyL5ygVcr8wMP-hGEJoEs2rcBbseWj0,3536
9
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=CMkD4Uyw1PUJg7npVORtDG7lRYM-MyW45qaQ2gpyEwc,8275
10
- mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=WsvqAQ_lL_L_XC4vb2lWe0AsYmf1TIsRF6ROfpMz8wU,2845
11
- mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=v5dgOmaj1FuAoZb9iOhbO_jTeLkfhurqtTlDfHw-7gI,1717
9
+ mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=3ODVM7YxYrPiM95UnbYiKmtN93PdlF_JmjZYTEyoCL8,8468
10
+ mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=yhDJFGD3t3TMfUlBCJGwzlK4IjFwm7Bporwp-aIM8uU,3139
11
+ mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=VoSbou3BAZL4bJes4FsYJvmd45_PjKj8F2sQDIyLDoI,1680
12
12
  mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=odGVde7Ct9dS2aqsySWgdgVLCOqfr_ZGeHFcXcuukxs,10846
13
13
  mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
14
  mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=vvhn8O9t1zzD8rIvQ5bDLLgdzogBGKqgKXuMybnHEXA,4551
15
15
  mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=hFXUTZb-etmJS7Si3mCCVCXV5ZYN0tP6FSbeiVxG1jo,1879
16
- mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=65nvJMEQAPaJnpWdFXeYSZkbi0xD6auWe9kdTA4OUaY,5533
16
+ mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=glvql2Y-BTyA1hTIZz2nlmST3SE6LJbep9sKdMH-vaI,5565
17
17
  mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=oVeeCy33bY1WJ-rffOULZ3ogi48Jz0FfTS73MPTur-A,1095
18
- mgnify_pipelines_toolkit-0.0.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
19
- mgnify_pipelines_toolkit-0.0.7.dist-info/METADATA,sha256=NKCi11tTiURD10pfskuBC5LPV_N526ou60E3J73obpM,4779
20
- mgnify_pipelines_toolkit-0.0.7.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
21
- mgnify_pipelines_toolkit-0.0.7.dist-info/entry_points.txt,sha256=UqG2YtR6sMIwrJHV3Zx9r-hEwkLt8j59ah6Xfs88YDQ,1014
22
- mgnify_pipelines_toolkit-0.0.7.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
23
- mgnify_pipelines_toolkit-0.0.7.dist-info/RECORD,,
18
+ mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=QOVkxZjMTYwIxCN7L4CLYFQjWoxsuZ4WKlKiTblD4tM,851
19
+ mgnify_pipelines_toolkit-0.0.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
20
+ mgnify_pipelines_toolkit-0.0.9.dist-info/METADATA,sha256=ZnCa-e-9xYNpGuhm9E4-rGK1GiCdmqQgdQkWRurvteA,4779
21
+ mgnify_pipelines_toolkit-0.0.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
22
+ mgnify_pipelines_toolkit-0.0.9.dist-info/entry_points.txt,sha256=UqG2YtR6sMIwrJHV3Zx9r-hEwkLt8j59ah6Xfs88YDQ,1014
23
+ mgnify_pipelines_toolkit-0.0.9.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
24
+ mgnify_pipelines_toolkit-0.0.9.dist-info/RECORD,,