mgnify-pipelines-toolkit 0.1.8__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of mgnify-pipelines-toolkit has been flagged as a potentially problematic release.

Files changed (48)
  1. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/PKG-INFO +8 -2
  2. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +130 -0
  3. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +178 -0
  4. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/assembly/cgc_merge.py +424 -0
  5. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +117 -0
  6. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +135 -0
  7. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +181 -0
  8. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py +382 -0
  9. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/constants/db_labels.py +21 -0
  10. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/constants/tax_ranks.py +4 -0
  11. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/schemas/schemas.py +217 -0
  12. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +8 -2
  13. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +9 -0
  14. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +5 -0
  15. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit.egg-info/requires.txt +6 -0
  16. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/pyproject.toml +18 -4
  17. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/LICENSE +0 -0
  18. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/README.md +0 -0
  19. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/__init__.py +0 -0
  20. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  21. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +0 -0
  22. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -0
  23. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -0
  24. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +0 -0
  25. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
  26. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -0
  27. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
  28. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
  29. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +0 -0
  30. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  31. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  32. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -0
  33. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  34. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
  35. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
  36. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  37. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
  38. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  39. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -0
  40. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
  41. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
  42. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  43. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  44. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
  45. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
  46. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  47. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  48. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.2
  Name: mgnify_pipelines_toolkit
- Version: 0.1.8
+ Version: 0.2.0
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -15,6 +15,9 @@ Requires-Dist: biopython==1.82
  Requires-Dist: numpy==1.26.0
  Requires-Dist: pandas==2.0.2
  Requires-Dist: regex==2023.12.25
+ Requires-Dist: requests==2.32.3
+ Requires-Dist: click==8.1.7
+ Requires-Dist: pandera==0.22.1
  Provides-Extra: tests
  Requires-Dist: pytest==7.4.0; extra == "tests"
  Requires-Dist: pytest-md==0.2.0; extra == "tests"
@@ -23,6 +26,9 @@ Requires-Dist: biopython==1.82; extra == "tests"
  Requires-Dist: pandas==2.0.2; extra == "tests"
  Requires-Dist: numpy==1.26.0; extra == "tests"
  Requires-Dist: regex==2023.12.25; extra == "tests"
+ Requires-Dist: requests==2.32.3; extra == "tests"
+ Requires-Dist: click==8.1.7; extra == "tests"
+ Requires-Dist: pandera==0.22.1; extra == "tests"
  Provides-Extra: dev
  Requires-Dist: mgnify_pipelines_toolkit[tests]; extra == "dev"
  Requires-Dist: pre-commit==3.8.0; extra == "dev"
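
The metadata changes above add three runtime dependencies (requests, click, pandera), mirrored in the tests extra. For reference, the new pins can be confirmed from an installed copy using only the standard library; a minimal sketch:

    from importlib.metadata import requires

    # Prints the Requires-Dist entries of the installed distribution,
    # including the requests, click and pandera pins added in 0.2.0.
    for dep in requires("mgnify-pipelines-toolkit"):
        print(dep)
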
@@ -0,0 +1,130 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ import hashlib
+ import logging
+ import sys
+ from pathlib import Path
+
+ from Bio import SeqIO
+ import pandas as pd
+
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(levelname)s - %(message)s",
+     handlers=[logging.StreamHandler()],
+ )
+
+
+ def process_lines(lines, output_handler, rhea2reaction_dict, protein_hashes):
+     current_protein = None
+     for line in lines:
+         parts = line.strip().split("\t")
+         protein_id = parts[0]
+         if protein_id != current_protein:
+             current_protein = protein_id
+             protein_rheas = set()
+         rhea_list = parts[-1].split("RheaID=")[1].split()
+         # Only the first DIAMOND hit of each protein is flagged as the top hit
+         top_hit = "top hit" if rhea_list and not protein_rheas else ""
+
+         for rhea in rhea_list:
+             if rhea not in protein_rheas:
+                 chebi_reaction, reaction = rhea2reaction_dict[rhea]
+                 contig_id = protein_id.split("_")[0]
+                 protein_hash = protein_hashes[protein_id]
+
+                 print(
+                     contig_id,
+                     protein_id,
+                     protein_hash,
+                     rhea,
+                     chebi_reaction,
+                     reaction,
+                     top_hit,
+                     sep="\t",
+                     file=output_handler,
+                 )
+                 protein_rheas.add(rhea)
+
+
+ def main(input: str, output: Path, proteins: Path, rhea2chebi: Path):
+     logging.info(
+         f"Step 1/3: Parse protein fasta and calculate SHA256 hashes from {proteins.resolve()}"
+     )
+     protein_hashes = {}
+     with open(proteins, "r") as fasta_file:
+         for record in SeqIO.parse(fasta_file, "fasta"):
+             protein_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
+             protein_hashes[record.id] = protein_hash
+
+     logging.info(f"Step 2/3: Load reactions from provided file {rhea2chebi.resolve()}")
+     df = pd.read_csv(rhea2chebi, delimiter="\t")
+     rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))
+
+     logging.info(
+         f"Step 3/3: Read DIAMOND results from {'STDIN' if input == '-' else Path(input).resolve()} and write output"
+     )
+     with open(output, "w") as output_handler:
+         if input == "-":
+             process_lines(sys.stdin, output_handler, rhea2reaction_dict, protein_hashes)
+         else:
+             with open(input, "r") as input_file:
+                 process_lines(
+                     input_file, output_handler, rhea2reaction_dict, protein_hashes
+                 )
+
+     logging.info("Processed successfully. Exiting.")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         description=(
+             "Use a DIAMOND output file to create a table with Rhea and CHEBI "
+             "reaction annotations for every protein."
+         )
+     )
+     parser.add_argument(
+         "-i",
+         "--input",
+         required=True,
+         type=str,
+         help="DIAMOND results file, use '-' for stdin",
+     )
+     parser.add_argument(
+         "-o",
+         "--output",
+         required=True,
+         type=Path,
+         help="Output TSV file with columns: contig_id, protein_id, UniRef90 cluster, rhea_ids, CHEBI reaction participants",
+     )
+     parser.add_argument(
+         "-p",
+         "--proteins",
+         required=True,
+         type=Path,
+         help="Protein fasta file used as DIAMOND input",
+     )
+     parser.add_argument(
+         "--rhea2chebi",
+         required=True,
+         type=Path,
+         help="File that maps Rhea IDs to CHEBI",
+     )
+
+     args = parser.parse_args()
+     main(args.input, args.output, args.proteins, args.rhea2chebi)
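
The new add_rhea_chebi_annotation.py keys every protein by the SHA256 of its amino-acid sequence, so identical proteins hash identically across runs. A minimal sketch of that hashing step in isolation (the sequence below is a made-up example):

    import hashlib

    # Hash a protein sequence the way the script does in step 1/3:
    # encode the raw sequence string and take the hex digest.
    seq = "MKTAYIAKQR"  # hypothetical amino-acid sequence
    digest = hashlib.sha256(seq.encode("utf-8")).hexdigest()
    print(digest)

Invocation follows the argparse flags above; passing -i - streams the DIAMOND results from stdin instead of a file.
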
@@ -0,0 +1,178 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the 'License');
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an 'AS IS' BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ from collections import defaultdict
+ import json
+
+ import pandas as pd
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "-i", "--input", required=True, type=str, help="Input JSON from antiSMASH"
+     )
+     parser.add_argument(
+         "-o", "--output", required=True, type=str, help="Output GFF3 file name"
+     )
+     parser.add_argument(
+         "--cds_tag",
+         default="ID",
+         type=str,
+         help="Type of CDS ID tag to use in the GFF3 (default: ID)",
+     )  # The CDS identifier changes from tool to tool.
+
+     args = parser.parse_args()
+
+     return args.input, args.output, args.cds_tag
+
+
+ def main():
+     """Transform an antiSMASH JSON into a GFF3 with 'region' features and the CDS within those regions"""
+
+     json_input, output_file, cds_tag = parse_args()
+
+     with open(json_input, "r") as json_data:
+         antismash_analysis = json.load(json_data)
+
+     res_dict = defaultdict(list)
+     attributes_dict = defaultdict(dict)
+
+     antismash_ver = antismash_analysis["version"]
+
+     for record in antismash_analysis["records"]:
+         record_id = record["id"]
+
+         iter_cds = (
+             "antismash.detection.genefunctions" in record["modules"].keys()
+         )  # Flag to iterate over CDS features
+         region_name = None
+
+         for feature in record["features"]:
+             if feature["type"] == "region":
+                 # Annotate region features
+                 region_name = (
+                     f"{record_id}_region{feature['qualifiers']['region_number'][0]}"
+                 )
+                 region_start = int(feature["location"].split(":")[0].split("[")[1])
+                 region_end = int(feature["location"].split(":")[1].split("]")[0])
+
+                 res_dict["contig"].append(record_id)
+                 res_dict["version"].append(f"antiSMASH:{antismash_ver}")
+                 res_dict["type"].append("region")
+                 res_dict["start"].append(region_start + 1)
+                 res_dict["end"].append(region_end)
+                 res_dict["score"].append(".")
+                 res_dict["strand"].append(".")
+                 res_dict["phase"].append(".")
+
+                 product = ",".join(feature["qualifiers"].get("product", []))
+
+                 attributes_dict[region_name].update(
+                     {"ID": region_name, "product": product}
+                 )
+
+             if iter_cds and feature["type"] == "CDS":
+                 # Annotate CDS features
+                 start = int(feature["location"].split(":")[0][1:])
+                 end = int(feature["location"].split(":")[1].split("]")[0])
+                 strand = feature["location"].split("(")[1][0]  # + or -
+
+                 # Skip CDS that fall outside the current region
+                 if not region_name or not (region_start <= end and start <= region_end):
+                     continue
+
+                 res_dict["contig"].append(record_id)
+                 res_dict["version"].append(f"antiSMASH:{antismash_ver}")
+                 res_dict["type"].append("gene")
+                 res_dict["start"].append(start + 1)  # Correct for 1-based indexing
+                 res_dict["end"].append(end)
+                 res_dict["score"].append(".")
+                 res_dict["strand"].append(strand)
+                 res_dict["phase"].append(".")
+
+                 locus_tag = feature["qualifiers"][cds_tag][0]
+                 attributes_dict[locus_tag].update(
+                     {
+                         "ID": locus_tag,
+                         "as_type": ",".join(
+                             feature["qualifiers"].get("gene_kind", ["other"])
+                         ),
+                         "gene_functions": ",".join(
+                             feature["qualifiers"].get("gene_functions", [])
+                         )
+                         .replace(" ", "_")
+                         .replace(":_", ":")
+                         .replace(";_", "%3B"),
+                         "Parent": region_name,
+                     }
+                 )
+
+         # Extended CDS attributes
+         if "antismash.detection.hmm_detection" in record["modules"].keys():
+             cds_by_protocluster = record["modules"][
+                 "antismash.detection.hmm_detection"
+             ]["rule_results"]["cds_by_protocluster"]
+             if len(cds_by_protocluster) > 0:
+                 for feature in cds_by_protocluster[0][1]:
+                     if "cds_name" in feature.keys():
+                         locus_tag = feature["cds_name"]
+                         as_clusters = ",".join(
+                             list(feature["definition_domains"].keys())
+                         )
+                         if locus_tag in attributes_dict.keys():
+                             attributes_dict[locus_tag].update(
+                                 {"as_gene_clusters": as_clusters}
+                             )
+
+         if "antismash.detection.genefunctions" in record["modules"].keys():
+             for tool in record["modules"]["antismash.detection.genefunctions"]["tools"]:
+                 if tool["tool"] == "smcogs":
+                     for locus_tag in tool["best_hits"]:
+                         hit_id = tool["best_hits"][locus_tag]["hit_id"].split(":")[0]
+                         hit_desc = (
+                             tool["best_hits"][locus_tag]["hit_id"]
+                             .split(":")[1]
+                             .replace(" ", "_")
+                         )
+                         score = tool["best_hits"][locus_tag]["bitscore"]
+                         e_value = tool["best_hits"][locus_tag]["evalue"]
+
+                         smcog_note = f"smCOG:{hit_id}:{hit_desc}(Score:{score}%3BE-value:{e_value})"
+                         if locus_tag in attributes_dict.keys():
+                             attributes_dict[locus_tag].update({"as_notes": smcog_note})
+                     break
+
+     attributes = [
+         ";".join(f"{k}={v}" for k, v in attrib_data.items() if v)
+         for attrib_data in attributes_dict.values()
+     ]
+     res_dict["attributes"] = attributes
+
+     res_df = pd.DataFrame.from_dict(res_dict)
+
+     # Save data to the GFF3 file with the proper header
+     with open(output_file, "w") as f_out:
+         f_out.write("##gff-version 3\n")
+         res_df.to_csv(f_out, header=False, index=False, sep="\t")
+
+
+ if __name__ == "__main__":
+     main()
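
The coordinate handling in antismash_gff_builder.py is the subtle part: the start + 1 implies that antiSMASH location strings such as [340:1276](+) are 0-based with an exclusive end, while GFF3 coordinates are 1-based and inclusive. A small sketch of the same parsing on a made-up location string:

    # Hypothetical antiSMASH location string, as stored in record features
    location = "[340:1276](+)"

    start = int(location.split(":")[0][1:])          # 0-based start -> 340
    end = int(location.split(":")[1].split("]")[0])  # exclusive end -> 1276
    strand = location.split("(")[1][0]               # "+" or "-"

    # GFF3 is 1-based and inclusive: shift the start, keep the end
    print(start + 1, end, strand)  # 341 1276 +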