mgnify-pipelines-toolkit 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)
  1. mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +5 -1
  2. mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +84 -21
  3. mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +11 -0
  4. mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +25 -7
  5. mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py +13 -9
  6. mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +211 -0
  7. mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py +162 -0
  8. mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +230 -0
  9. mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +119 -0
  10. mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +6 -3
  11. mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +1 -1
  12. mgnify_pipelines_toolkit/constants/thresholds.py +1 -1
  13. {mgnify_pipelines_toolkit-1.0.3.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/METADATA +19 -27
  14. {mgnify_pipelines_toolkit-1.0.3.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/RECORD +18 -14
  15. {mgnify_pipelines_toolkit-1.0.3.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/WHEEL +1 -1
  16. {mgnify_pipelines_toolkit-1.0.3.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/entry_points.txt +5 -0
  17. {mgnify_pipelines_toolkit-1.0.3.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/licenses/LICENSE +0 -0
  18. {mgnify_pipelines_toolkit-1.0.3.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py
@@ -0,0 +1,162 @@
+ #!/usr/bin/env python3
+
+ # Copyright 2023-2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import argparse
+ import fileinput
+ import logging
+ from pathlib import Path
+
+ logging.basicConfig(level=logging.INFO)
+
+
+ def main(standard_file, substrate_file, outfile, dbcan_version):
+     standard_path = Path(standard_file)
+     substrate_path = Path(substrate_file)
+
+     if not standard_path.exists():
+         raise FileNotFoundError(f"Input standards path does not exist: {standard_file}")
+
+     if not substrate_path.exists():
+         raise FileNotFoundError(
+             f"Input substrate path does not exist: {substrate_file}"
+         )
+
+     substrates = load_substrates(substrate_path)
+     cgc_locations = load_cgcs(standard_path)
+     print_gff(standard_path, outfile, dbcan_version, substrates, cgc_locations)
+
+
+ def load_cgcs(standard_path):
+     cgc_locations = dict()
+     with fileinput.hook_compressed(standard_path, "rt") as file_in:
+         for line in file_in:
+             if not line.startswith("CGC#"):
+                 cgc, _, contig, _, start, end, _, _ = line.strip().split("\t")
+                 cgc_id = f"{contig}_{cgc}"
+                 if cgc_id in cgc_locations:
+                     if cgc_locations[cgc_id]["start"] > int(start):
+                         cgc_locations[cgc_id]["start"] = int(start)
+                     if cgc_locations[cgc_id]["end"] < int(end):
+                         cgc_locations[cgc_id]["end"] = int(end)
+                 else:
+                     cgc_locations[cgc_id] = {
+                         "start": int(start),
+                         "end": int(end),
+                         "contig": contig,
+                     }
+     return cgc_locations
+
+
+ def print_gff(standard_path, outfile, dbcan_version, substrates, cgc_locations):
+     with open(outfile, "w") as file_out:
+         file_out.write("##gff-version 3\n")
+         cgcs_printed = list()
+         with fileinput.hook_compressed(standard_path, "rt") as file_in:
+             for line in file_in:
+                 if not line.startswith("CGC#"):
+                     cgc, gene_type, contig, prot_id, start, end, strand, protein_fam = (
+                         line.strip().split("\t")
+                     )
+                     cgc_id = f"{contig}_{cgc}"
+                     protein_fam = protein_fam.replace(" ", "")
+                     if cgc_id not in cgcs_printed:
+                         substrate = (
+                             substrates[cgc_id]
+                             if cgc_id in substrates
+                             else "substrate_dbcan-pul=N/A;substrate_dbcan-sub=N/A"
+                         )
+                         file_out.write(
+                             "{}\tdbCAN:{}\tpredicted PUL\t{}\t{}\t.\t.\t.\tID={};{}\n".format(
+                                 contig,
+                                 dbcan_version,
+                                 cgc_locations[cgc_id]["start"],
+                                 cgc_locations[cgc_id]["end"],
+                                 cgc_id,
+                                 substrate,
+                             )
+                         )
+                         cgcs_printed.append(cgc_id)
+                     file_out.write(
+                         (
+                             f"{contig}\tdbCAN:{dbcan_version}\t{gene_type}\t{start}"
+                             + f"\t{end}\t.\t{strand}\t.\tID={prot_id};Parent={cgc_id};protein_family={protein_fam}\n"
+                         )
+                     )
+
+
+ def load_substrates(substrate_path):
+     substrates = dict()
+     with fileinput.hook_compressed(substrate_path, "rt") as file_in:
+         for line in file_in:
+             if not line.startswith("#"):
+                 parts = line.strip().split("\t")
+                 cgc_parts = parts[0].rsplit("|", 1)
+                 cgc = "_".join(cgc_parts)
+                 try:
+                     substrate_pul = parts[2]
+                 except IndexError:
+                     substrate_pul = "N/A"
+                 try:
+                     substrate_ecami = parts[5]
+                 except IndexError:
+                     substrate_ecami = "N/A"
+                 if not substrate_pul:
+                     substrate_pul = "N/A"
+                 if not substrate_ecami:
+                     substrate_ecami = "N/A"
+                 substrates[cgc] = (
+                     f"substrate_dbcan-pul={substrate_pul};substrate_dbcan-sub={substrate_ecami}"
+                 )
+
+     return substrates
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(
+         description=(
+             "The script takes dbCAN output and parses it to create a standalone GFF."
+         )
+     )
+     parser.add_argument(
+         "-st",
+         dest="standard_file",
+         required=True,
+         help="Path to the standard file (*cgc_standard.out)",
+     )
+     parser.add_argument(
+         "-sb",
+         dest="substrate_file",
+         required=True,
+         help="Path to the substrate file (*substrate.out)",
+     )
+     parser.add_argument(
+         "-o",
+         dest="outfile",
+         required=True,
+         help="Path to the output file.",
+     )
+     parser.add_argument(
+         "-v",
+         dest="dbcan_ver",
+         required=True,
+         help="dbCAN version used.",
+     )
+     return parser.parse_args()
+
+
+ if __name__ == "__main__":
+     args = parse_args()
+     main(args.standard_file, args.substrate_file, args.outfile, args.dbcan_ver)
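A note on the join between the two inputs: the substrate file keys clusters as `<contig>|<CGC>`, while the standard file yields `<contig>_<CGC>` IDs, and `load_substrates` normalises the former to the latter. A minimal sketch of that normalisation, using an invented contig name:

```python
# Hedged sketch of the ID normalisation in load_substrates above; the
# contig name is invented. rsplit("|", 1) keeps any "|" inside the contig
# name intact and only splits off the trailing CGC label.
cgc_parts = "contig_1|CGC2".rsplit("|", 1)  # ['contig_1', 'CGC2']
cgc_id = "_".join(cgc_parts)                # 'contig_1_CGC2'
assert cgc_id == "contig_1_CGC2"            # matches the f"{contig}_{cgc}" keys
```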
mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py
@@ -0,0 +1,230 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2025 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ import fileinput
+ import logging
+ import pandas as pd
+
+
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(levelname)s: %(message)s"
+ )
+
+ ANTISMASH_VERSION = "7.1.x"
+
+ f"""
+ Script parses antiSMASH GFF output and adds descriptions from the pre-parsed glossary https://docs.antismash.secondarymetabolites.org/glossary/.
+ The glossary was taken from version {ANTISMASH_VERSION}, commit dbeeb0e: https://github.com/antismash/documentation/blob/master/docs/glossary.md
+ """
+
+ DESCRIPTIONS = {
+     "2dos": "2-deoxy-streptamine aminoglycoside",
+     "acyl_amino_acids": "N-acyl amino acid",
+     "amglyccycl": "Aminoglycoside/aminocyclitol",
+     "aminocoumarin": "Aminocoumarin",
+     "aminopolycarboxylic-acid": "Aminopolycarboxylic acid metallophores (doi:10.1039/C8MT00009C)",
+     "archaeal-ripp": "Archaeal RiPPs (doi:10.1021/jacs.2c00521 supplemental)",
+     "arylpolyene": "Aryl polyene",
+     "atropopeptide": "Atropopeptide RiPPs, e.g. scabrirubin and tryptorubin",
+     "azoxy-crosslink": "azoxy compounds formed by carboxylic cross-link",
+     "azoxy-dimer": "azoxy compounds formed by dimerisation",
+     "benzoxazole": "Benzoxazoles",
+     "betalactone": "Beta-lactone containing protease inhibitor",
+     "blactam": "&beta;-lactam",
+     "bottromycin": "Bottromycin",
+     "butyrolactone": "Butyrolactone",
+     "cdps": "tRNA-dependent cyclodipeptide synthases",
+     "crocagin": "Crocagin-like",
+     "cyanobactin": "Cyanobactins like patellamide (AY986476)",
+     "cyclic-lactone-autoinducer": "agrD-like cyclic lactone autoinducer peptides (AF001782)",
+     "cytokinin": "Adenine-type cytokinins, e.g. fusatin and trans-zeatin",
+     "darobactin": "Darobactin-like compounds",
+     "deazapurine": "Deazapurine",
+     "ectoine": "Ectoine",
+     "epipeptide": "D-amino-acid containing RiPPs such as yydF (D78193)",
+     "fungal_cdps": "Fungal cyclodipeptide synthases",
+     "fungal-ripp": "Fungal RiPP with POP or UstH peptidase types and a modification",
+     "furan": "Furan",
+     "glycocin": "Glycocin",
+     "guanidinotides": "Pheganomycin-style protein ligase-containing cluster",
+     "hgle-ks": "Heterocyst glycolipid synthase-like PKS",
+     "hr-t2pks": "Highly reducing type II PKS like ishigamide and skyllamycin",
+     "hserlactone": "Homoserine lactone",
+     "hydrogen-cyanide": "Hydrogen cyanide (AF208523, doi:10.1128/jb.182.24.6940-6949.2000)",
+     "hydroxy-tropolone": "7-hydroxytropolone-like cluster",
+     "indole": "Indole",
+     "isocyanide": "Isocyanides (doi:10.1093/nar/gkad573)",
+     "nrp with isocyanide": "Isocyanides (doi:10.1128/mBio.00785-18)",
+     "ladderane": "Ladderane",
+     "lanthipeptide class i": "Class I lanthipeptides like nisin",
+     "lanthipeptide class ii": "Class II lanthipeptides like mutacin II (U40620)",
+     "lanthipeptide class iii": "Class III lanthipeptides like labyrinthopeptin (FN178622)",
+     "lanthipeptide class iv": "Class IV lanthipeptides like venezuelin (HQ328852)",
+     "lanthipeptide class v": "Glycosylated lanthipeptide/linaridin hybrids like MT210103",
+     "lassopeptide": "Lasso peptide",
+     "leupeptin": "leupeptin-like compounds",
+     "linaridin": "Linear arid peptide such as cypemycin (HQ148718) and salinipeptin (MG788286)",
+     "lincosamides": "NRPS-adjacent biosynthesis of lincosamides",
+     "lipolanthine": "Lanthipeptide class containing N-terminal fatty acids such as MG673929",
+     "melanin": "Melanin",
+     "methanobactin": "Copper-chelating/transporting peptides (doi:10.1126/science.aap9437)",
+     "microviridin": "Microviridin",
+     "mycosporine": "Molecules containing mycosporine-like amino acid",
+     "naggn": "N-acetylglutaminylglutamine amide",
+     "napaa": "Non-alpha poly-amino acids like e-Polylysin",
+     "ni-siderophore": "NRPS-independent, IucA/IucC-like siderophores (*siderophore* prior to 7.0)",
+     "nitropropanoic-acid": "3-Nitropropanoic acid (neurotoxin)",
+     "nrps": "Non-ribosomal peptide synthetase",
+     "nrp-metallophore": "Non-ribosomal peptide metallophores",
+     "nucleoside": "Nucleoside",
+     "oligosaccharide": "Oligosaccharide",
+     "opine-like-metallophore": "Opine-like zincophores like staphylopine (doi:10.1128/mSystems.00554-20)",
+     "other": "Cluster containing a secondary metabolite-related protein that does not fit into any other category",
+     "pbde": "Polybrominated diphenyl ether",
+     "phenazine": "Phenazine",
+     "phosphoglycolipid": "Phosphoglycolipid",
+     "phosphonate": "Phosphonate",
+     "polyhalogenated-pyrrole": "Polyhalogenated pyrrole",
+     "polyyne": "Polyyne",
+     "ppys-ks": "PPY-like pyrone",
+     "prodigiosin": "Serratia-type non-traditional PKS prodigiosin biosynthesis pathway",
+     "proteusin": "Proteusin",
+     "pufa": "Polyunsaturated fatty acid",
+     "pyrrolidine": "Pyrrolidines like described in BGC0001510",
+     "ranthipeptide": "Cys-rich peptides (aka. SCIFF: six Cys in forty-five) like in CP001581:3481278-3502939",
+     "ras-ripp": "Streptide-like thioether-bond RiPPs",
+     "rcdps": "Fungal Arginine-containing cyclic dipeptides",
+     "redox-cofactor": "Redox-cofactors such as PQQ (NC_021985:1458906-1494876)",
+     "resorcinol": "Resorcinol",
+     "sactipeptide": "Sactipeptide",
+     "spliceotide": "RiPPs containing plpX type spliceases (NZ_KB235920:17899-42115)",
+     "t1pks": "Type I PKS (Polyketide synthase)",
+     "t2pks": "Type II PKS",
+     "t3pks": "Type III PKS",
+     "terpene": "Terpene",
+     "thioamitides": "Thioamitide RiPPs as found in JOBF01000011",
+     "thioamide-nrp": "Thioamide-containing non-ribosomal peptide",
+     "transat-pks": "Trans-AT PKS",
+     "triceptide": "Triceptides",
+     "tropodithietic-acid": "Tropodithietic acid",
+     "fungal-ripp-like": "Fungal RiPP-likes",
+     "nrps-like": "NRPS-like fragment",
+     "phosphonate-like": "Phosphonate-like (prior to 7.0 this was the phosphonate rule)",
+     "pks-like": "Other types of PKS",
+     "ripp-like": "Other unspecified ribosomally synthesised and post-translationally modified peptide product (RiPP)",
+     "rre-containing": "RRE-element containing cluster",
+     "terpene-precursor": "Compound likely used as a terpene precursor",
+     "transat-pks-like": "Trans-AT PKS fragment, with trans-AT domain not found",
+     "fatty_acid": "Fatty acid (loose strictness, likely from primary metabolism)",
+     "halogenated": "Halogenase-containing cluster, potentially generating a halogenated product",
+     "lysine": "Fungal lysine primary metabolism",
+     "saccharide": "Saccharide (loose strictness, likely from primary metabolism)",
+     "lap": "Linear azol(in)e-containing peptides",
+     "mycosporine-like": "Molecules containing mycosporine-like amino acid",
+     "thiopeptide": "Thiopeptide",
+     "siderophore": "Siderophore",
+     "bacteriocin": "Bacteriocin or other unspecified ribosomally synthesised and post-translationally modified peptide product (RiPP)",
+     "fused": "Pheganomycin-style protein ligase-containing cluster",
+     "head_to_tail": "Head-to-tail cyclised RiPP (subtilosin-like)",
+     "lanthidin": "Glycosylated lanthipeptide/linaridin hybrids like MT210103",
+     "lanthipeptide": "Lanthipeptides",
+     "tfua-related": "TfuA-related RiPPs",
+     "otherks": "Other types of PKS",
+     "microcin": "Microcin",
+     "cf_saccharide": "Possible saccharide",
+     "cf_fatty_acid": "Possible fatty acid",
+     "cf_putative": "Putative cluster of unknown type identified with the ClusterFinder algorithm",
+ }
+
+
+ def parse_args():
+     description = (
+         "antiSMASH output summary generator. "
+         "Script takes regions from the GFF and counts their occurrences in the annotation. "
+         "Output columns contain class ID, description and count. "
+         "Descriptions were taken from the pre-parsed glossary provided on the antiSMASH website. "
+         f"Current script supports antiSMASH results for version {ANTISMASH_VERSION} and older."
+     )
+     parser = argparse.ArgumentParser(description=description)
+     parser.add_argument("-i", "--antismash-gff", help="antiSMASH GFF", required=True)
+     parser.add_argument(
+         "-o", "--output", help="antiSMASH summary TSV output file.", required=True
+     )
+     parser.add_argument(
+         "-a",
+         "--antismash-version",
+         help="antiSMASH version that was used to generate the GFF",
+         required=False,
+         default=ANTISMASH_VERSION,
+     )
+     args = parser.parse_args()
+     if args.antismash_version > ANTISMASH_VERSION:
+         logging.error(
+             "Provided version of antiSMASH is newer than the supported one. "
+             "Please make sure you have updated the descriptions dictionary. Exiting."
+         )
+         exit(1)
+     return args.antismash_gff, args.output
+
+
+ def main():
+     input_gff, output_filename = parse_args()
+     dict_list = []
+     with fileinput.hook_compressed(input_gff, "r") as file_in:
+         # TODO: to be merged with the GFF toolkit
+         for line in file_in:
+             if line.startswith("#"):
+                 continue
+             info = line.strip().split("\t")[8].split(";")
+             entry_dict = {}
+             for pair in info:
+                 key, value = pair.split(
+                     "=", 1
+                 )  # Ensure split only occurs at the first '=' occurrence
+                 entry_dict[key] = value
+             dict_list.append(entry_dict)
+
+     # Convert to DataFrame
+     df = pd.DataFrame(dict_list)
+     df = df[df["product"].notna()]
+     df_grouped = (
+         df.groupby(["product"]).size().reset_index(name="Count")
+     ).sort_values(by="Count", ascending=False)
+
+     df_grouped = df_grouped.rename(
+         columns={
+             "product": "label",
+         }
+     )
+     df_grouped["Description"] = df_grouped["label"].apply(
+         lambda x: ",".join(
+             [
+                 DESCRIPTIONS.get(cls.strip().lower(), cls.strip())
+                 for cls in x.split(",")
+             ]
+         )
+     )
+     df_grouped = df_grouped[["label", "Description", "Count"]]
+     df_grouped = df_grouped.rename(columns={
+         "Description": "description",
+         "Count": "count"
+     })
+     df_grouped.to_csv(output_filename, sep="\t", index=False)
+
+
+ if __name__ == "__main__":
+     main()
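For readers skimming the diff, a hedged illustration of the lookup performed in `main` above: comma-separated hybrid product labels are expanded class by class, with unknown classes falling back to their raw name. The label is an invented example, and the dict here is a two-entry excerpt of the `DESCRIPTIONS` defined in the script:

```python
# Excerpt of the script's DESCRIPTIONS dict; the label below is invented.
DESCRIPTIONS = {
    "t1pks": "Type I PKS (Polyketide synthase)",
    "nrps": "Non-ribosomal peptide synthetase",
}

label = "T1PKS,NRPS"  # hypothetical hybrid region label from the GFF
description = ",".join(
    DESCRIPTIONS.get(cls.strip().lower(), cls.strip()) for cls in label.split(",")
)
print(description)  # Type I PKS (Polyketide synthase),Non-ribosomal peptide synthetase
```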
mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py
@@ -0,0 +1,119 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2025 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ import fileinput
+ import logging
+ import pandas as pd
+
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(levelname)s: %(message)s"
+ )
+
+ SANNTIS_VERSION = "0.9.4.1"
+
+ f"""
+ Script parses SanntiS GFF output and adds descriptions of annotated MIBiG classes.
+ Descriptions were pre-parsed for version {SANNTIS_VERSION} and stored as a dictionary.
+ """
+
+ DESCRIPTIONS = {
+     "Polyketide": "Built from iterative condensation of acetate units derived from acetyl-CoA",
+     "Terpene": "Composed of isoprene (C5) units derived from isopentenyl pyrophosphate",
+     "Alkaloid": "Nitrogen-containing compounds derived from amino acids (e.g., ornithine, lysine, tyrosine, tryptophan)",
+     "RiPP": "Ribosomally synthesised and Post-translationally modified Peptide",
+     "NRP": "Nonribosomal Peptide",
+     "Saccharide": "Carbohydrate-based natural products (e.g., aminoglycoside antibiotics)",
+     "Other": "Catch-all class for clusters encoding metabolites outside main classes (e.g., cyclitols, indolocarbazoles, and phosphonates)",
+ }
+
+
+ def parse_args():
+     description = (
+         "SanntiS output summary generator. "
+         "Script takes a SanntiS GFF and counts pairs of (nearest_MiBIG, nearest_MiBIG_class). "
+         "It also adds pre-parsed descriptions of classes stored in this script as a dictionary. "
+         f"Descriptions were taken from the SanntiS docs v{SANNTIS_VERSION}."
+     )
+     parser = argparse.ArgumentParser(description=description)
+     parser.add_argument("-i", "--sanntis-gff", help="SanntiS GFF", required=True)
+     parser.add_argument(
+         "-o", "--output", help="SanntiS summary TSV output file.", required=True
+     )
+     args = parser.parse_args()
+     return args.sanntis_gff, args.output
+
+
+ def main():
+     input_gff, output_filename = parse_args()
+     dict_list = []
+     with fileinput.hook_compressed(input_gff, "r") as file_in:
+         # TODO: to be merged with the GFF toolkit
+         for line in file_in:
+             if line.startswith("#"):
+                 continue
+             info = line.strip().split("\t")[8].split(";")
+             entry_dict = {}
+             # TODO: merge this with the GFF toolkit GFF reader
+             for pair in info:
+                 key, value = pair.split(
+                     "=", 1
+                 )  # Ensure split only occurs at the first '=' occurrence
+                 entry_dict[key] = value
+             dict_list.append(entry_dict)
+
+     # Convert to DataFrame
+     df = pd.DataFrame(dict_list)
+     df = df.rename(
+         columns={
+             "nearest_MiBIG": "nearest_MIBiG",
+             "nearest_MiBIG_class": "nearest_MIBiG_class",
+         }
+     )
+     df_grouped = (
+         df.groupby(["nearest_MIBiG", "nearest_MIBiG_class"])
+         .size()
+         .reset_index(name="Count")
+     )
+     df_grouped = df_grouped.sort_values(by="Count", ascending=False)
+
+     df_desc = pd.DataFrame(
+         list(DESCRIPTIONS.items()), columns=["MIBiG_class", "Description"]
+     )
+     df_desc = df_desc.set_index("MIBiG_class")
+     df_merged = df_grouped.merge(
+         df_desc, left_on="nearest_MIBiG_class", right_index=True, how="left"
+     )
+     df_merged["Description"] = df_merged.apply(
+         lambda row: row["nearest_MIBiG_class"].replace(
+             "NRP", df_desc.loc["NRP"]["Description"]
+         )
+         if pd.isna(row["Description"]) and "NRP" in row["nearest_MIBiG_class"]
+         else row["Description"],
+         axis=1,
+     )
+     df_merged = df_merged[
+         ["nearest_MIBiG", "nearest_MIBiG_class", "Description", "Count"]
+     ]
+     df_merged = df_merged.rename(columns={
+         "Description": "description",
+         "Count": "count"
+     })
+     df_merged.to_csv(output_filename, sep="\t", index=False)
+
+
+ if __name__ == "__main__":
+     main()
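A hedged illustration of the `NRP` fallback in `main` above: a composite class such as `NRP,Polyketide` has no direct `DESCRIPTIONS` entry, so the left merge leaves its description empty and the lambda splices the NRP definition into the class string. The class value below is an invented example, and the dict is a one-entry excerpt:

```python
# Excerpt of the script's DESCRIPTIONS dict; the composite class is invented.
DESCRIPTIONS = {"NRP": "Nonribosomal Peptide"}

nearest_class = "NRP,Polyketide"  # hypothetical nearest_MIBiG_class value
description = nearest_class.replace("NRP", DESCRIPTIONS["NRP"])
print(description)  # Nonribosomal Peptide,Polyketide
```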
mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py
@@ -62,9 +62,12 @@ class TableModifier:
          self.output_file = output_file
 
      def modify_table(self):
-         with fileinput.hook_compressed(self.input_file, "rt") as file_in, open(
-             self.output_file, "w"
-         ) as file_out:
+         with (
+             fileinput.hook_compressed(
+                 self.input_file, "r", encoding="utf-8"
+             ) as file_in,
+             open(self.output_file, "w") as file_out,
+         ):
              header_written = False
              separator_line, header = "", ""
              for line in file_in:
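The rewritten `with` statement uses parenthesised context managers, which require Python 3.10+ (as does the `encoding` keyword of `fileinput.hook_compressed`). A self-contained sketch of the same pattern, with placeholder file names:

```python
import fileinput

# Parenthesised context managers (Python 3.10+); both paths are illustrative.
with (
    fileinput.hook_compressed("table.tsv.gz", "r", encoding="utf-8") as file_in,
    open("table_out.tsv", "w") as file_out,
):
    for line in file_in:
        file_out.write(line)
```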
mgnify_pipelines_toolkit/analysis/shared/get_subunits.py
@@ -108,7 +108,7 @@ def main():
 
      open_files = {}
      for record in SeqIO.parse(args.input, "fasta"):
-         model = "-".join(record.id.split("/")[0].split("-")[-1:])
+         model = "-".join("/".join(record.id.split("/")[:-1]).split("-")[-1:])
          if model in SSU_MODELS:
              if SSU not in open_files:
                  file_out = open(pattern_dict[SSU], "w")
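A hedged before/after sketch of the one-line change above: both expressions pull the model name off an easel-style `<name>-<model>/<start>-<end>` record ID, but the old form kept only the part before the *first* `/`, which presumably broke on sequence names that themselves contain a `/`. The record ID below is invented:

```python
record_id = "ERZ123/contig_7-SSU_rRNA_bacteria/1-1500"  # hypothetical ID

old = "-".join(record_id.split("/")[0].split("-")[-1:])
new = "-".join("/".join(record_id.split("/")[:-1]).split("-")[-1:])

print(old)  # 'ERZ123' -- not a model name, so the SSU/LSU lookup would miss
print(new)  # 'SSU_rRNA_bacteria'
```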
mgnify_pipelines_toolkit/constants/thresholds.py
@@ -26,9 +26,9 @@ MAX_INTERNAL_PRIMER_PROPORTION = 0.2
  # used by library_strategy_checker in analysis.shared
  MIN_AMPLICON_STRATEGY_CHECK = 0.30
 
+
  # used by markergene_study_summary in analysis.shared
  MAJORITY_MARKER_PROPORTION = 0.45
-
  # used by gff_toolkit in analysis.assembly
  EVALUE_CUTOFF_IPS = 1e-10
  EVALUE_CUTOFF_EGGNOG = 1e-10
{mgnify_pipelines_toolkit-1.0.3.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mgnify_pipelines_toolkit
- Version: 1.0.3
+ Version: 1.0.5
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -11,33 +11,24 @@ Classifier: Operating System :: OS Independent
  Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: biopython==1.82
- Requires-Dist: numpy==1.26.0
- Requires-Dist: pandas==2.0.2
- Requires-Dist: regex==2023.12.25
- Requires-Dist: requests==2.32.3
- Requires-Dist: click==8.1.7
- Requires-Dist: pandera==0.22.1
- Requires-Dist: pyfastx>=2.2.0
- Requires-Dist: intervaltree==3.1.0
+ Requires-Dist: biopython>=1.85
+ Requires-Dist: numpy<3,>=2.2.4
+ Requires-Dist: pandas<3,>=2.2.3
+ Requires-Dist: regex>=2024.11.6
+ Requires-Dist: requests<3,>=2.32.3
+ Requires-Dist: click<9,>=8.1.8
+ Requires-Dist: pandera<0.24,>=0.23.1
+ Requires-Dist: pyfastx<3,>=2.2.0
+ Requires-Dist: intervaltree<4,>=3.1.0
  Provides-Extra: tests
- Requires-Dist: pytest==7.4.0; extra == "tests"
- Requires-Dist: pytest-md==0.2.0; extra == "tests"
- Requires-Dist: pytest-workflow==2.0.1; extra == "tests"
- Requires-Dist: biopython==1.82; extra == "tests"
- Requires-Dist: pandas==2.0.2; extra == "tests"
- Requires-Dist: numpy==1.26.0; extra == "tests"
- Requires-Dist: regex==2023.12.25; extra == "tests"
- Requires-Dist: requests==2.32.3; extra == "tests"
- Requires-Dist: click==8.1.7; extra == "tests"
- Requires-Dist: pandera==0.22.1; extra == "tests"
- Requires-Dist: pyfastx>=2.2.0; extra == "tests"
+ Requires-Dist: pytest<9,>=8.3.5; extra == "tests"
+ Requires-Dist: pytest-md>=0.2.0; extra == "tests"
+ Requires-Dist: pytest-workflow==2.1.0; extra == "tests"
  Provides-Extra: dev
- Requires-Dist: mgnify_pipelines_toolkit[tests]; extra == "dev"
- Requires-Dist: pre-commit==3.8.0; extra == "dev"
- Requires-Dist: black==24.8.0; extra == "dev"
- Requires-Dist: flake8==7.1.1; extra == "dev"
- Requires-Dist: pep8-naming==0.14.1; extra == "dev"
+ Requires-Dist: pre-commit>=4.2.0; extra == "dev"
+ Requires-Dist: black>=25.1.0; extra == "dev"
+ Requires-Dist: flake8>=7.1.2; extra == "dev"
+ Requires-Dist: pep8-naming>=0.14.1; extra == "dev"
  Dynamic: license-file
 
  # mgnify-pipelines-toolkit
@@ -74,8 +65,9 @@ Before starting any development, you should do these few steps:
  - Clone the repo if you haven't already and create a feature branch from the `dev` branch (NOT `main`).
  - Create a virtual environment with the tool of your choice (e.g. `conda create --name my_new_env`)
  - Activate your new environment (e.g. `conda activate my_new_env`)
- - Install dev dependencies `pip install -e '.[dev]'`
+ - Install dev dependencies `pip install -e '.[tests,dev]'`
  - Install pre-commit hooks `pre-commit install`
+ - Run unit tests `pytest`
 
  By following the steps above, you ensure that the code you add will be linted and formatted properly.