mgnify-pipelines-toolkit 0.1.7__tar.gz → 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.

Files changed (44)
  1. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/PKG-INFO +8 -2
  2. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +3 -1
  3. mgnify_pipelines_toolkit-0.1.9/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +130 -0
  4. mgnify_pipelines_toolkit-0.1.9/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +178 -0
  5. mgnify_pipelines_toolkit-0.1.9/mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py +382 -0
  6. mgnify_pipelines_toolkit-0.1.9/mgnify_pipelines_toolkit/constants/db_labels.py +21 -0
  7. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/constants/tax_ranks.py +4 -0
  8. mgnify_pipelines_toolkit-0.1.9/mgnify_pipelines_toolkit/schemas/schemas.py +217 -0
  9. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +8 -2
  10. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +5 -0
  11. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +2 -0
  12. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit.egg-info/requires.txt +6 -0
  13. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/pyproject.toml +15 -4
  14. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/LICENSE +0 -0
  15. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/README.md +0 -0
  16. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/__init__.py +0 -0
  17. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  18. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +0 -0
  19. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -0
  20. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -0
  21. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +0 -0
  22. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
  23. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -0
  24. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
  25. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +0 -0
  26. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  27. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  28. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -0
  29. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  30. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
  31. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
  32. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  33. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
  34. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  35. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -0
  36. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
  37. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
  38. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  39. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  40. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
  41. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
  42. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  43. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  44. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.2
  Name: mgnify_pipelines_toolkit
- Version: 0.1.7
+ Version: 0.1.9
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -15,6 +15,9 @@ Requires-Dist: biopython==1.82
  Requires-Dist: numpy==1.26.0
  Requires-Dist: pandas==2.0.2
  Requires-Dist: regex==2023.12.25
+ Requires-Dist: requests==2.32.3
+ Requires-Dist: click==8.1.7
+ Requires-Dist: pandera==0.22.1
  Provides-Extra: tests
  Requires-Dist: pytest==7.4.0; extra == "tests"
  Requires-Dist: pytest-md==0.2.0; extra == "tests"
@@ -23,6 +26,9 @@ Requires-Dist: biopython==1.82; extra == "tests"
  Requires-Dist: pandas==2.0.2; extra == "tests"
  Requires-Dist: numpy==1.26.0; extra == "tests"
  Requires-Dist: regex==2023.12.25; extra == "tests"
+ Requires-Dist: requests==2.32.3; extra == "tests"
+ Requires-Dist: click==8.1.7; extra == "tests"
+ Requires-Dist: pandera==0.22.1; extra == "tests"
  Provides-Extra: dev
  Requires-Dist: mgnify_pipelines_toolkit[tests]; extra == "dev"
  Requires-Dist: pre-commit==3.8.0; extra == "dev"
@@ -277,6 +277,8 @@ def main():
      taxa_df = taxa_df.fillna("0")
      taxa_df = order_df(taxa_df)

+     asv_list = taxa_df.ASV.to_list()
+
      amp_reads = [read.strip() for read in list(open(amp, "r"))]
      headers = [read.split(" ")[0][1:] for read in list(open(headers, "r"))]
      amp_region = ".".join(amp.split(".")[1:3])
@@ -288,7 +290,7 @@
          counter += 1
          line_fwd = line_fwd.strip()

-         if line_fwd == "0":
+         if line_fwd == "0" or f"seq_{line_fwd}" not in asv_list:
              continue

          if headers[counter] in amp_reads:
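
The new guard skips reads whose DADA2 sequence index was dropped from the filtered taxonomy table. A minimal, self-contained sketch of that check with made-up inputs (map_lines stands in for the DADA2 map file; a set makes the membership test O(1) rather than a scan of the list for every read):

    import pandas as pd

    # Hypothetical inputs: filtered ASV taxonomy table and DADA2 map-file lines
    taxa_df = pd.DataFrame({"ASV": ["seq_1", "seq_3"]})
    map_lines = ["1\n", "0\n", "2\n", "3\n"]

    asv_ids = set(taxa_df.ASV.to_list())

    kept = []
    for line_fwd in map_lines:
        line_fwd = line_fwd.strip()
        # "0" means the read matched no ASV; indices missing from the
        # taxonomy table were filtered out upstream, so skip those too
        if line_fwd == "0" or f"seq_{line_fwd}" not in asv_ids:
            continue
        kept.append(line_fwd)

    print(kept)  # ['1', '3']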
@@ -0,0 +1,130 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ import hashlib
+ import logging
+ import sys
+ from pathlib import Path
+
+ from Bio import SeqIO
+ import pandas as pd
+
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(levelname)s - %(message)s",
+     handlers=[logging.StreamHandler()],
+ )
+
+
+ def process_lines(lines, output_handler, rhea2reaction_dict, protein_hashes):
+     current_protein = None
+     for line in lines:
+         parts = line.strip().split("\t")
+         protein_id = parts[0]
+         if protein_id != current_protein:
+             current_protein = protein_id
+             protein_rheas = set()
+         rhea_list = parts[-1].split("RheaID=")[1].split()
+         top_hit = "top hit" if rhea_list and not protein_rheas else ""
+
+         for rhea in rhea_list:
+             if rhea not in protein_rheas:
+                 chebi_reaction, reaction = rhea2reaction_dict[rhea]
+                 contig_id = protein_id.split("_")[0]
+                 protein_hash = protein_hashes[protein_id]
+
+                 print(
+                     contig_id,
+                     protein_id,
+                     protein_hash,
+                     rhea,
+                     chebi_reaction,
+                     reaction,
+                     top_hit,
+                     sep="\t",
+                     file=output_handler,
+                 )
+                 protein_rheas.add(rhea)
+
+
+ def main(input: str, output: Path, proteins: Path, rhea2chebi: Path):
+     logging.info(
+         f"Step 1/3: Parsing protein fasta and calculating SHA256 hashes from {proteins.resolve()}"
+     )
+     protein_hashes = {}
+     with open(proteins, "r") as fasta_file:
+         for record in SeqIO.parse(fasta_file, "fasta"):
+             protein_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
+             protein_hashes[record.id] = protein_hash
+
+     logging.info(f"Step 2/3: Loading reactions from provided file {rhea2chebi.resolve()}")
+     df = pd.read_csv(rhea2chebi, delimiter="\t")
+     rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))
+
+     logging.info(
+         f"Step 3/3: Reading DIAMOND results from {'STDIN' if input == '-' else Path(input).resolve()} and writing output"
+     )
+     with open(output, "w") as output_handler:
+         if input == "-":
+             process_lines(sys.stdin, output_handler, rhea2reaction_dict, protein_hashes)
+         else:
+             with open(input, "r") as input_file:
+                 process_lines(
+                     input_file, output_handler, rhea2reaction_dict, protein_hashes
+                 )
+
+     logging.info("Processed successfully. Exiting.")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         """
+         Use diamond output file to create a table with Rhea and CHEBI
+         reaction annotation for every protein.
+         """
+     )
+     parser.add_argument(
+         "-i",
+         "--input",
+         required=True,
+         type=str,
+         help="DIAMOND results file, use '-' for stdin",
+     )
+     parser.add_argument(
+         "-o",
+         "--output",
+         required=True,
+         type=Path,
+         help="Output TSV file with columns: contig_id, protein_id, UniRef90 cluster, rhea_ids, CHEBI reaction participants",
+     )
+     parser.add_argument(
+         "-p",
+         "--proteins",
+         required=True,
+         type=Path,
+         help="Protein fasta file used as DIAMOND input",
+     )
+     parser.add_argument(
+         "--rhea2chebi",
+         default=None,
+         type=Path,
+         help="File that maps rhea_ids to CHEBI",
+     )
+
+     args = parser.parse_args()
+     main(args.input, args.output, args.proteins, args.rhea2chebi)
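
Judging from the parsing in process_lines above (this is inferred from the code, not from DIAMOND documentation), each input line carries the protein ID in its first tab-separated field and "RheaID=" followed by space-separated Rhea accessions in its last field. A hypothetical round-trip with made-up lookup tables:

    import io

    # Made-up stand-ins for what main() builds from the rhea2chebi table
    # and the protein fasta
    rhea2reaction = {"RHEA:12345": ("CHEBI:1 + CHEBI:2 = CHEBI:3", "A + B = C")}
    hashes = {"contig1_1": "ab12cd34"}

    line = "contig1_1\tUniRef90_A0A000\tRheaID=RHEA:12345\n"
    out = io.StringIO()
    process_lines([line], out, rhea2reaction, hashes)
    print(out.getvalue(), end="")
    # contig1  contig1_1  ab12cd34  RHEA:12345  CHEBI:1 + CHEBI:2 = CHEBI:3  A + B = C  top hit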
@@ -0,0 +1,178 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the 'License');
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an 'AS IS' BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ from collections import defaultdict
+ import json
+
+ import pandas as pd
+
+
+ def parse_args():
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "-i", "--input", required=True, type=str, help="Input JSON from antiSMASH"
+     )
+     parser.add_argument(
+         "-o", "--output", required=True, type=str, help="Output GFF3 file name"
+     )
+     parser.add_argument(
+         "--cds_tag",
+         default="ID",
+         type=str,
+         help="Type of CDS ID tag to use in the GFF3 (default: ID)",
+     )  # The CDS' identifier changes from tool to tool.
+
+     args = parser.parse_args()
+
+     return args.input, args.output, args.cds_tag
+
+
+ def main():
+     """Transform an antiSMASH JSON into a GFF3 with 'regions' and CDS within those regions"""
+
+     json_input, output_file, cds_tag = parse_args()
+
+     with open(json_input, "r") as json_data:
+         antismash_analysis = json.load(json_data)
+
+     res_dict = defaultdict(list)
+     attributes_dict = defaultdict(dict)
+
+     antismash_ver = antismash_analysis["version"]
+
+     for record in antismash_analysis["records"]:
+         record_id = record["id"]
+
+         iter_cds = (
+             "antismash.detection.genefunctions" in record["modules"].keys()
+         )  # Flag to iterate CDS
+         region_name = None
+
+         for feature in record["features"]:
+
+             if feature["type"] == "region":
+                 # Annotate region features
+                 region_name = (
+                     f"{record_id}_region{feature['qualifiers']['region_number'][0]}"
+                 )
+                 region_start = int(feature["location"].split(":")[0].split("[")[1])
+                 region_end = int(feature["location"].split(":")[1].split("]")[0])
+
+                 res_dict["contig"].append(record_id)
+                 res_dict["version"].append(f"antiSMASH:{antismash_ver}")
+                 res_dict["type"].append("region")
+                 res_dict["start"].append(region_start + 1)
+                 res_dict["end"].append(region_end)
+                 res_dict["score"].append(".")
+                 res_dict["strand"].append(".")
+                 res_dict["phase"].append(".")
+
+                 product = ",".join(feature["qualifiers"].get("product", []))
+
+                 attributes_dict[region_name].update(
+                     {"ID": region_name, "product": product}
+                 )
+
+             if iter_cds and feature["type"] == "CDS":
+                 # Annotate CDS features
+
+                 start = int(feature["location"].split(":")[0][1:])
+                 end = int(feature["location"].split(":")[1].split("]")[0])
+                 strand = feature["location"].split("(")[1][0]  # + or -
+
+                 if not region_name or not (region_start <= end and start <= region_end):
+                     continue
+
+                 res_dict["contig"].append(record_id)
+                 res_dict["version"].append(f"antiSMASH:{antismash_ver}")
+                 res_dict["type"].append("gene")
+                 res_dict["start"].append(start + 1)  # Correct for 1-based indexing
+                 res_dict["end"].append(end)
+                 res_dict["score"].append(".")
+                 res_dict["strand"].append(strand)
+                 res_dict["phase"].append(".")
+
+                 locus_tag = feature["qualifiers"][cds_tag][0]
+                 attributes_dict[locus_tag].update(
+                     {
+                         "ID": locus_tag,
+                         "as_type": ",".join(
+                             feature["qualifiers"].get("gene_kind", ["other"])
+                         ),
+                         "gene_functions": ",".join(
+                             feature["qualifiers"].get("gene_functions", [])
+                         )
+                         .replace(" ", "_")
+                         .replace(":_", ":")
+                         .replace(";_", "%3B"),
+                         "Parent": region_name,
+                     }
+                 )
+
+         # Extended CDS attributes
+         if "antismash.detection.hmm_detection" in record["modules"].keys():
+             cds_by_protocluster = record["modules"][
+                 "antismash.detection.hmm_detection"
+             ]["rule_results"]["cds_by_protocluster"]
+             if len(cds_by_protocluster) > 0:
+                 for feature in cds_by_protocluster[0][1]:
+                     if "cds_name" in feature.keys():
+                         locus_tag = feature["cds_name"]
+                         as_clusters = ",".join(
+                             list(feature["definition_domains"].keys())
+                         )
+                         if locus_tag in attributes_dict.keys():
+                             attributes_dict[locus_tag].update(
+                                 {"as_gene_clusters": as_clusters}
+                             )
+
+         if "antismash.detection.genefunctions" in record["modules"].keys():
+             for tool in record["modules"]["antismash.detection.genefunctions"]["tools"]:
+                 if tool["tool"] == "smcogs":
+                     for locus_tag in tool["best_hits"]:
+                         hit_id = tool["best_hits"][locus_tag]["hit_id"].split(":")[0]
+                         hit_desc = (
+                             tool["best_hits"][locus_tag]["hit_id"]
+                             .split(":")[1]
+                             .replace(" ", "_")
+                         )
+                         score = tool["best_hits"][locus_tag]["bitscore"]
+                         e_value = tool["best_hits"][locus_tag]["evalue"]
+
+                         smcog_note = f"smCOG:{hit_id}:{hit_desc}(Score:{score}%3BE-value:{e_value})"
+                         if locus_tag in attributes_dict.keys():
+                             attributes_dict[locus_tag].update({"as_notes": smcog_note})
+                     break
+
+     attributes = [
+         ";".join(f"{k}={v}" for k, v in attrib_data.items() if v)
+         for attrib_data in attributes_dict.values()
+     ]
+     res_dict["attributes"] = attributes
+
+     res_df = pd.DataFrame.from_dict(res_dict)
+
+     with open(output_file, "w") as f_out:
+         f_out.write(
+             "##gff-version 3\n"
+         )  # Save data to the GFF3 file with the proper header
+         res_df.to_csv(f_out, header=False, index=False, sep="\t")
+
+
+ if __name__ == "__main__":
+     main()
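
antiSMASH JSON encodes feature locations as strings like "[4160:5312](+)" with 0-based, end-exclusive coordinates, and the builder converts them to GFF3's 1-based inclusive convention. The parsing step, isolated with a hypothetical location value:

    location = "[4160:5312](+)"  # hypothetical antiSMASH feature location

    start = int(location.split(":")[0][1:])          # 4160
    end = int(location.split(":")[1].split("]")[0])  # 5312
    strand = location.split("(")[1][0]               # "+"

    print(start + 1, end, strand)  # GFF3 1-based start -> 4161 5312 +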
@@ -0,0 +1,382 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import click
+ from collections import defaultdict
+ import glob
+ import logging
+ from pathlib import Path
+ from typing import Union, List
+
+ import pandas as pd
+
+ from mgnify_pipelines_toolkit.constants.db_labels import TAXDB_LABELS, ASV_TAXDB_LABELS
+ from mgnify_pipelines_toolkit.constants.tax_ranks import (
+     _SILVA_TAX_RANKS,
+     _PR2_TAX_RANKS,
+ )
+ from mgnify_pipelines_toolkit.schemas.schemas import (
+     AmpliconPassedRunsSchema,
+     AmpliconNonINSDCPassedRunsSchema,
+     TaxonSchema,
+     PR2TaxonSchema,
+ )
+
+ logging.basicConfig(level=logging.DEBUG)
+
+
+ @click.group()
+ def cli():
+     pass
+
+
+ def get_tax_file(
+     run_acc: str, analyses_dir: Path, db_label: str
+ ) -> Union[Path, List[Path]]:
+     """Takes path information for a particular analysis and db_label combo, and returns any existing files.
+
+     :param run_acc: Run accession for the tax file that should be retrieved.
+     :type run_acc: str
+     :param analyses_dir: The path to the directory containing all of the analyses,
+         including the tax file corresponding to :param:`run_acc`.
+     :type analyses_dir: Path
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``TAXDB_LABELS`` and ``ASV_TAXDB_LABELS``.
+     :type db_label: str
+     :return: Either a :class:`Path` object if :param:`db_label` comes from ``TAXDB_LABELS``,
+         or a list of :class:`Path` objects if from ``ASV_TAXDB_LABELS``.
+     :rtype: Union[Path, List[Path]]
+     """
+
+     tax_file = None
+
+     db_path = Path(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}")
+
+     if not db_path.exists():
+         logging.debug(
+             f"DB {db_path} doesn't exist for {run_acc}. Skipping"
+         )  # or error?
+         return
+
+     if db_label in TAXDB_LABELS:
+         tax_file = Path(
+             f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/{run_acc}_{db_label}.txt"
+         )
+         if not tax_file.exists():
+             logging.error(
+                 f"DB path exists but file doesn't - exiting. Path: {tax_file}"
+             )
+             exit(1)
+
+         # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
+         # so need to skip those. Should probably fix that at some point
+         file_size = tax_file.stat().st_size
+         if file_size == 0:
+             logging.debug(
+                 f"File {tax_file} exists but is empty, so will be skipping it."
+             )
+             tax_file = None
+     elif db_label in ASV_TAXDB_LABELS:
+         # ASV tax files could have up to two files, one for each amplified region (maximum two from the pipeline).
+         # So will need to handle this differently to closed-reference files
+         asv_tax_files = glob.glob(
+             f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/*.txt"
+         )
+         asv_tax_files = [
+             Path(file) for file in asv_tax_files if "concat" not in file
+         ]  # Have to filter out concatenated file if it exists
+
+         tax_file = asv_tax_files
+
+     return tax_file
+
+
+ def parse_one_tax_file(
+     run_acc: str, tax_file: Path, long_tax_ranks: list
+ ) -> pd.DataFrame:
+     """Parses a taxonomy file, and returns it as a pandas DataFrame object.
+
+     :param run_acc: Run accession of the taxonomy file that will be parsed.
+     :type run_acc: str
+     :param tax_file: Taxonomy file that will be parsed.
+     :type tax_file: Path
+     :param long_tax_ranks: Either the imported list _SILVA_TAX_RANKS or _PR2_TAX_RANKS
+         to validate the taxonomic ranks of the file.
+     :type long_tax_ranks: list
+     :return: The parsed :param:`tax_file` as a :class:`pd.DataFrame` object
+     :rtype: pd.DataFrame
+     """
+
+     res_df = pd.read_csv(tax_file, sep="\t", names=["Count"] + long_tax_ranks)
+     res_df = res_df.fillna("")
+
+     # Two different schemas used for validation depending on the database
+     # because the PR2 schema has different taxonomic ranks than the standard
+     if len(long_tax_ranks) == 8:
+         TaxonSchema(res_df)
+     elif len(long_tax_ranks) == 9:
+         PR2TaxonSchema(res_df)
+
+     res_df["full_taxon"] = res_df.iloc[:, 1:].apply(
+         lambda x: ";".join(x).strip(";"), axis=1
+     )
+     final_df = res_df.iloc[:, [0, -1]]
+     final_df = final_df.set_index("full_taxon")
+     final_df.columns = [run_acc]
+
+     return final_df
+
+
+ def generate_db_summary(
+     db_label: str, tax_dfs: defaultdict[Path], output_prefix: str
+ ) -> None:
+     """Takes run accessions paired with their taxonomy files in the form of a
+     dictionary, together with the respective db_label, joins them together, and
+     generates a study-wide summary in the form of a .tsv file.
+
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``TAXDB_LABELS`` and ``ASV_TAXDB_LABELS``.
+     :param tax_dfs: Dictionary where the key is a run accession,
+         and values are either one taxonomy file if the :param:`db_label` comes from ``TAXDB_LABELS``,
+         or a list of at least 1 and at most 2 files if it comes from ``ASV_TAXDB_LABELS``.
+         These files are parsed by :func:`parse_one_tax_file`
+     :type tax_dfs: defaultdict[Path]
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+
+     if db_label in TAXDB_LABELS:
+         df_list = []
+
+         if "PR2" in db_label:
+             long_tax_ranks = _PR2_TAX_RANKS
+         else:
+             long_tax_ranks = _SILVA_TAX_RANKS
+
+         for run_acc, tax_df in tax_dfs.items():
+             res_df = parse_one_tax_file(run_acc, tax_df, long_tax_ranks)
+             df_list.append(res_df)
+
+         res_df = pd.concat(df_list, axis=1).fillna(0)
+         res_df = res_df.sort_index()
+         res_df = res_df.astype(int)
+
+         res_df.to_csv(
+             f"{output_prefix}_{db_label}_study_summary.tsv",
+             sep="\t",
+             index_label="taxonomy",
+         )
+
+     elif db_label in ASV_TAXDB_LABELS:
+
+         if "PR2" in db_label:
+             long_tax_ranks = _PR2_TAX_RANKS
+         else:
+             long_tax_ranks = _SILVA_TAX_RANKS
+
+         amp_region_dict = defaultdict(list)
+
+         # each value will be a list containing at most two files (one for each amp_region)
+         for run_acc, tax_df_asv_lst in tax_dfs.items():
+             for tax_df in tax_df_asv_lst:
+                 # there are a lot of underscores in these names... but it is consistent
+                 # e.g. ERR4334351_16S-V3-V4_DADA2-SILVA_asv_krona_counts.txt
+                 amp_region = str(tax_df).split("_")[-5]
+                 amp_region_df = parse_one_tax_file(run_acc, tax_df, long_tax_ranks)
+                 amp_region_dict[amp_region].append(amp_region_df)
+
+         for amp_region, amp_region_dfs in amp_region_dict.items():
+             # Need at least two analyses with this amp_region to bother with the summary
+             if len(amp_region_dfs) > 1:
+                 amp_res_df = amp_region_dfs[0]
+                 for amp_df in amp_region_dfs[1:]:
+                     amp_res_df = amp_res_df.join(amp_df, how="outer")
+                 amp_res_df = amp_res_df.fillna(0)
+                 amp_res_df = amp_res_df.astype(int)
+
+                 amp_res_df.to_csv(
+                     f"{output_prefix}_{db_label}_{amp_region}_asv_study_summary.tsv",
+                     sep="\t",
+                     index_label="taxonomy",
+                 )
+
+
+ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List]:
+     """Matches different summary files of the same database label and analysis
+     type (and amplified region for ASVs) into a dictionary to help merge
+     the correct summaries.
+
+     :param all_study_summaries: List of file paths to different summary files
+     :type all_study_summaries: List[str]
+     :return: Organised dictionary where each summary is paired to a specific
+         database label key to be merged together.
+     :rtype: defaultdict[List]
+     """
+     summaries_dict = defaultdict(list)
+
+     for summary in all_study_summaries:
+         summary_path = Path(summary)
+         summary_filename = summary_path.stem
+
+         temp_lst = summary_filename.split("_")
+         if "asv_study_summary" in summary_filename:
+             # For ASVs we need to include the amp_region in the label
+             summary_db_label = "_".join(temp_lst[1:3])
+         else:
+             # For closed reference, just the db_label is needed
+             summary_db_label = temp_lst[1]
+
+         summaries_dict[summary_db_label].append(summary_path)
+
+     return summaries_dict
+
+
+ @cli.command(
+     "summarise",
+     options_metavar="-r <runs> -a <analyses_dir> -p <output_prefix>",
+     short_help="Generate study-level analysis summaries.",
+ )
+ @click.option(
+     "-r",
+     "--runs",
+     required=True,
+     help="CSV file containing successful analyses generated by the pipeline",
+     type=click.Path(exists=True, path_type=Path, dir_okay=False),
+ )
+ @click.option(
+     "-a",
+     "--analyses_dir",
+     required=True,
+     help="Input directory containing all the individual analyses subdirectories to summarise",
+     type=click.Path(exists=True, path_type=Path, file_okay=False),
+ )
+ @click.option(
+     "-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
+ )
+ @click.option(
+     "--non_insdc",
+     default=False,
+     is_flag=True,
+     help="If run accessions aren't INSDC-formatted",
+ )
+ def summarise_analyses(
+     runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool
+ ) -> None:
+     """Function that will take a file of pipeline-successful run accessions
+     that should be used for the generation of the relevant db-specific
+     study-level summary files. For ASV results, these will also be on a
+     per-amplified-region basis.
+     \f
+
+     :param runs: Path to a qc_passed_runs file from the pipeline execution.
+         Contains the accessions of runs that should therefore be included in the generated
+         summaries.
+     :type runs: Path
+     :param analyses_dir: The path to the directory containing all of the analyses.
+     :type analyses_dir: Path
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+     runs_df = pd.read_csv(runs, names=["run", "status"])
+
+     # Run validation on the successful_runs .csv file
+     if not non_insdc:
+         AmpliconPassedRunsSchema(runs_df)
+     else:
+         AmpliconNonINSDCPassedRunsSchema(runs_df)
+
+     all_db_labels = TAXDB_LABELS + ASV_TAXDB_LABELS
+     for db_label in all_db_labels:
+
+         tax_files = defaultdict(Path)
+         for i in range(0, len(runs_df)):
+             run_acc = runs_df.loc[i, "run"]
+             tax_file = get_tax_file(run_acc, analyses_dir, db_label)
+
+             if tax_file:
+                 tax_files[run_acc] = tax_file
+
+         # If at least two analyses have results from the current DB,
+         # generate a study-level summary for it
+         if len(tax_files) > 1:
+             generate_db_summary(db_label, tax_files, output_prefix)
+
+
+ @cli.command(
+     "merge",
+     options_metavar="-a <analyses_dir> -p <output_prefix>",
+     short_help="Merge multiple study-level analysis summaries.",
+ )
+ @click.option(
+     "-a",
+     "--analyses_dir",
+     required=True,
+     help="Input directory containing the study-level summaries to merge",
+     type=click.Path(exists=True, file_okay=False),
+ )
+ @click.option(
+     "-p",
+     "--output_prefix",
+     required=True,
+     help="Prefix to merged summary files",
+     type=str,
+ )
+ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
+     """Function that will take a directory containing study-level
+     summaries that should be merged together on a per-db-per-amplified-region
+     basis.
+     \f
+
+     :param analyses_dir: The filepath to the directory containing all of the analyses.
+     :type analyses_dir: str
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+
+     # TODO: The way we grab all the summaries might change depending on how the prefect side does things
+     all_study_summaries = glob.glob(f"{analyses_dir}/*_study_summary.tsv")
+
+     summaries_dict = organise_study_summaries(all_study_summaries)
+
+     for db_label, summaries in summaries_dict.items():
+         if len(summaries) > 1:
+             res_df = pd.read_csv(summaries[0], sep="\t", index_col=0)
+             for summary in summaries[1:]:
+                 curr_df = pd.read_csv(summary, sep="\t", index_col=0)
+                 res_df = res_df.join(curr_df, how="outer")
+             res_df = res_df.fillna(0)
+             res_df = res_df.astype(int)
+
+             res_df = res_df.reindex(sorted(res_df.columns), axis=1)
+             res_df.to_csv(
+                 f"{output_prefix}_{db_label}_study_summary.tsv",
+                 sep="\t",
+                 index_label="taxonomy",
+             )
+
+
+ if __name__ == "__main__":
+     cli()
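
A quick sketch of how organise_study_summaries groups files, using hypothetical file names (ASV summaries keep the amplified region in their key, closed-reference ones use just the db label):

    paths = [
        "PRJX_SILVA-SSU_study_summary.tsv",
        "PRJY_SILVA-SSU_study_summary.tsv",
        "PRJX_DADA2-SILVA_16S-V3-V4_asv_study_summary.tsv",
    ]
    groups = organise_study_summaries(paths)
    print({k: len(v) for k, v in groups.items()})
    # {'SILVA-SSU': 2, 'DADA2-SILVA_16S-V3-V4': 1}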
@@ -0,0 +1,21 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # taxonomy_summary labels for the closed-reference method
+ TAXDB_LABELS = ["SILVA-SSU", "SILVA-LSU", "PR2", "UNITE", "ITSoneDB"]
+
+ # taxonomy_summary labels for the ASV method
+ ASV_TAXDB_LABELS = ["DADA2-SILVA", "DADA2-PR2"]
@@ -35,3 +35,7 @@ _PR2_TAX_RANKS = [
      "Genus",
      "Species",
  ]
+
+ SHORT_TAX_RANKS = ["sk", "k", "p", "c", "o", "f", "g", "s"]
+
+ SHORT_PR2_TAX_RANKS = ["d", "sg", "dv", "sdv", "c", "o", "f", "g", "s"]
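
These short prefixes are what the new TaxRank model in schemas.py (below) accepts on the left of the double underscore. A tiny illustrative check against a made-up lineage, mirroring the validator's rule:

    SHORT_TAX_RANKS = ["sk", "k", "p", "c", "o", "f", "g", "s"]

    lineage = "sk__Bacteria;k__;p__Bacillota"  # hypothetical lineage string
    for taxrank in lineage.split(";"):
        rank = taxrank.split("__")[0]
        # empty ranks and the literal "Unclassified" are allowed through
        assert rank == "" or rank == "Unclassified" or rank in SHORT_TAX_RANKS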
@@ -0,0 +1,217 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import re
+
+ from enum import Enum
+ from typing import ClassVar, Optional
+ import pandera as pa
+
+ from pydantic import (
+     Field,
+     BaseModel,
+     field_validator,
+     RootModel,
+ )
+ from pandera.engines.pandas_engine import PydanticModel
+
+ from mgnify_pipelines_toolkit.constants.tax_ranks import (
+     SHORT_TAX_RANKS,
+     SHORT_PR2_TAX_RANKS,
+ )
+
+
+ class INSDCRunAccession(RootModel):
+     """Class for modelling INSDC-specific run accessions.
+     Essentially just a special string with regex-based validation of the accession.
+     """
+
+     # RootModel example:
+     # https://stackoverflow.com/questions/78393675/how-to-make-a-custom-type-inheriting-from-uuid-work-as-a-pydantic-model
+
+     root: str = Field(
+         unique=True,
+         description="The run needs to be a valid ENA accession",
+         examples=["ERR123456", "DRR789012", "SRR345678"],
+     )
+
+     @field_validator("root", mode="after")
+     @classmethod
+     def run_validity_check(cls, run: str) -> str:
+         """Checks that the run string matches the regex of an INSDC run accession.
+         Throws a `ValueError` exception if not, which is what Pydantic prefers for validation errors.
+         """
+
+         run_accession_regex = "(E|D|S)RR[0-9]{6,}"
+         regex_res = re.match(run_accession_regex, run)
+
+         if regex_res is None:
+             raise ValueError(
+                 f"Accession `{run}` does not fit INSDC format [ERR*,SRR*,DRR*]."
+             )
+
+         return run
+
+
+ class AmpliconResultTypes(str, Enum):
+     """Class that models the two allowed statuses for successful amplicon analysis runs.
+     Pydantic validates Enums very simply without needing to declare a new function.
+     """
+
+     all_results = "all_results"
+     no_asvs = "no_asvs"
+
+
+ class AmpliconPassedRunsRecord(BaseModel):
+     """Class defining a Pydantic model for a single "row" of an amplicon passed runs file.
+     Uses the previous two classes.
+     """
+
+     run: INSDCRunAccession
+     status: AmpliconResultTypes
+
+
+ class AmpliconNonINSDCSPassedRunsRecord(BaseModel):
+     """Class modelling a very similar model to the preceding one, but with no INSDC validation.
+     This is achieved by replacing the type of the runs with just a simple string so no validation
+     happens.
+     """
+
+     run: str
+     status: AmpliconResultTypes
+
+
+ # This is the schema for the whole DF
+ class AmpliconPassedRunsSchema(pa.DataFrameModel):
+     """Class modelling a Pandera dataframe schema that uses the AmpliconPassedRunsRecord class as dtype.
+     This is what actually validates the generated dataframe when read by pandas.read_csv.
+     """
+
+     class Config:
+         """Config with dataframe-level data type."""
+
+         dtype = PydanticModel(AmpliconPassedRunsRecord)
+         coerce = True
+
+
+ class AmpliconNonINSDCPassedRunsSchema(pa.DataFrameModel):
+     """Class modelling the same dataframe schema as the preceding one, except with no INSDC validation.
+     Uses the AmpliconNonINSDCSPassedRunsRecord as a dtype to achieve this.
+     """
+
+     class Config:
+         """Config with dataframe-level data type."""
+
+         dtype = PydanticModel(AmpliconNonINSDCSPassedRunsRecord)
+         coerce = True
+
+
+ class TaxRank(RootModel):
+     """Class for modelling a single Taxonomic Rank.
+     Essentially just a special string with validation of the structure:
+     `${rank}__${taxon}`
+     Where `${rank}` is one of the allowed short ranks defined by the imported
+     `SHORT_TAX_RANKS` and `SHORT_PR2_TAX_RANKS` variables,
+     and `${taxon}` is the actual taxon for that rank (this isn't validated).
+     It will also validate if the whole string is the permitted "Unclassified".
+     """
+
+     valid_tax_ranks: ClassVar = SHORT_TAX_RANKS + SHORT_PR2_TAX_RANKS
+
+     root: str = Field(
+         unique=True,
+         description="A single taxon in a taxonomy record",
+         examples=["sk__Bacteria", "p__Bacillota", "g__Tundrisphaera"],
+     )
+
+     @field_validator("root", mode="after")
+     @classmethod
+     def rank_structure_validity_check(cls, taxrank: str) -> str:
+         taxrank_list = taxrank.split("__")
+         rank = taxrank_list[0]
+         if rank != "" and rank != "Unclassified" and rank not in cls.valid_tax_ranks:
+             raise ValueError(f"Invalid taxonomy rank {rank}.")
+
+         return taxrank
+
+
+ # TODO: see if we can simplify the declaration of two Taxon classes by using one of these solutions
+ # None of the solutions have a model-only way of doing it, but worth considering maybe
+ # https://stackoverflow.com/questions/76537360/initialize-one-of-two-pydantic-models-depending-on-an-init-parameter
+
+
+ class Taxon(BaseModel):
+     """Class for modelling an entire Taxon or taxonomic assignment.
+     All of the ranks are optional, to model for the taxon being "Unclassified".
+     """
+
+     Superkingdom: Optional[TaxRank] = None
+     Kingdom: Optional[TaxRank] = None
+     Phylum: Optional[TaxRank] = None
+     Class: Optional[TaxRank] = None
+     Order: Optional[TaxRank] = None
+     Family: Optional[TaxRank] = None
+     Genus: Optional[TaxRank] = None
+     Species: Optional[TaxRank] = None
+
+
+ class PR2Taxon(Taxon):
+     """Class for modelling the same thing as the preceding class, but for PR2 ranks."""
+
+     Domain: Optional[TaxRank] = None
+     Supergroup: Optional[TaxRank] = None
+     Division: Optional[TaxRank] = None
+     Subdivision: Optional[TaxRank] = None
+
+
+ class TaxonRecord(Taxon):
+     """Class for modelling a single taxon record in a taxonomy file.
+     It inherits the Taxon class, and simply adds a Count field, modelling the read counts
+     for that particular Taxon record.
+     """
+
+     Count: int
+
+
+ class PR2TaxonRecord(PR2Taxon):
+     """Class for modelling the same thing as the preceding class, but for PR2 ranks."""
+
+     Count: int
+
+
+ # This is the schema for the whole DF
+ class TaxonSchema(pa.DataFrameModel):
+     """Class modelling a Pandera dataframe schema that uses the TaxonRecord class as dtype.
+     This is what actually validates the generated dataframe when read by pandas.read_csv.
+     """
+
+     class Config:
+         """Config with dataframe-level data type."""
+
+         dtype = PydanticModel(TaxonRecord)
+         coerce = True
+
+
+ class PR2TaxonSchema(pa.DataFrameModel):
+     """Class modelling the same dataframe schema as the preceding one, except for the PR2 taxonomy.
+     Uses the PR2TaxonRecord as a dtype to achieve this.
+     """
+
+     class Config:
+         """Config with dataframe-level data type."""
+
+         dtype = PydanticModel(PR2TaxonRecord)
+         coerce = True
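
For reference, this is how the schemas are exercised: calling a pandera DataFrameModel validates the frame, the same way study_summary_generator calls them above. The accessions here are made up:

    import pandas as pd

    runs_df = pd.DataFrame(
        {"run": ["ERR123456", "SRR345678"], "status": ["all_results", "no_asvs"]}
    )
    AmpliconPassedRunsSchema(runs_df)  # validates row by row via the Pydantic model

    bad_df = pd.DataFrame({"run": ["not-an-accession"], "status": ["all_results"]})
    AmpliconPassedRunsSchema(bad_df)   # raises a pandera SchemaError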
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.2
  Name: mgnify_pipelines_toolkit
- Version: 0.1.7
+ Version: 0.1.9
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -15,6 +15,9 @@ Requires-Dist: biopython==1.82
  Requires-Dist: numpy==1.26.0
  Requires-Dist: pandas==2.0.2
  Requires-Dist: regex==2023.12.25
+ Requires-Dist: requests==2.32.3
+ Requires-Dist: click==8.1.7
+ Requires-Dist: pandera==0.22.1
  Provides-Extra: tests
  Requires-Dist: pytest==7.4.0; extra == "tests"
  Requires-Dist: pytest-md==0.2.0; extra == "tests"
@@ -23,6 +26,9 @@ Requires-Dist: biopython==1.82; extra == "tests"
  Requires-Dist: pandas==2.0.2; extra == "tests"
  Requires-Dist: numpy==1.26.0; extra == "tests"
  Requires-Dist: regex==2023.12.25; extra == "tests"
+ Requires-Dist: requests==2.32.3; extra == "tests"
+ Requires-Dist: click==8.1.7; extra == "tests"
+ Requires-Dist: pandera==0.22.1; extra == "tests"
  Provides-Extra: dev
  Requires-Dist: mgnify_pipelines_toolkit[tests]; extra == "dev"
  Requires-Dist: pre-commit==3.8.0; extra == "dev"
@@ -21,17 +21,22 @@ mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py
  mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
  mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
  mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py
+ mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py
+ mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py
  mgnify_pipelines_toolkit/analysis/shared/__init__.py
  mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py
  mgnify_pipelines_toolkit/analysis/shared/get_subunits.py
  mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py
  mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py
  mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py
+ mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py
+ mgnify_pipelines_toolkit/constants/db_labels.py
  mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py
  mgnify_pipelines_toolkit/constants/regex_fasta_header.py
  mgnify_pipelines_toolkit/constants/tax_ranks.py
  mgnify_pipelines_toolkit/constants/thresholds.py
  mgnify_pipelines_toolkit/constants/var_region_coordinates.py
+ mgnify_pipelines_toolkit/schemas/schemas.py
  mgnify_pipelines_toolkit/utils/__init__.py
  mgnify_pipelines_toolkit/utils/fasta_to_delimited.py
  mgnify_pipelines_toolkit/utils/get_mpt_version.py
@@ -1,4 +1,5 @@
  [console_scripts]
+ add_rhea_chebi_annotation = mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main
  are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main
  assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
  assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
@@ -17,3 +18,4 @@ primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_va
  remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
  rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
  standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
+ study_summary_generator = mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:main
@@ -2,6 +2,9 @@ biopython==1.82
  numpy==1.26.0
  pandas==2.0.2
  regex==2023.12.25
+ requests==2.32.3
+ click==8.1.7
+ pandera==0.22.1

  [dev]
  mgnify_pipelines_toolkit[tests]
@@ -18,3 +21,6 @@ biopython==1.82
  pandas==2.0.2
  numpy==1.26.0
  regex==2023.12.25
+ requests==2.32.3
+ click==8.1.7
+ pandera==0.22.1
@@ -1,6 +1,6 @@
  [project]
  name = "mgnify_pipelines_toolkit"
- version = "0.1.7"
+ version = "0.1.9"
  readme = "README.md"
  license = {text = "Apache Software License 2.0"}
  authors = [
@@ -19,7 +19,10 @@ dependencies = [
      "biopython==1.82",
      "numpy==1.26.0",
      "pandas==2.0.2",
-     "regex==2023.12.25"
+     "regex==2023.12.25",
+     "requests==2.32.3",
+     "click==8.1.7",
+     "pandera==0.22.1"
  ]

  [build-system]
@@ -31,8 +34,10 @@ packages = ["mgnify_pipelines_toolkit",
      "mgnify_pipelines_toolkit.analysis",
      "mgnify_pipelines_toolkit.constants",
      "mgnify_pipelines_toolkit.utils",
+     "mgnify_pipelines_toolkit.schemas",
      "mgnify_pipelines_toolkit.analysis.shared",
-     "mgnify_pipelines_toolkit.analysis.amplicon"
+     "mgnify_pipelines_toolkit.analysis.amplicon",
+     "mgnify_pipelines_toolkit.analysis.assembly",
  ]

  [project.scripts]
@@ -42,6 +47,7 @@ get_subunits_coords = "mgnify_pipelines_toolkit.analysis.shared.get_subunits_coo
  mapseq2biom = "mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main"
  fastq_suffix_header_check = "mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main"
  library_strategy_check = "mgnify_pipelines_toolkit.analysis.shared.library_strategy_check:main"
+ study_summary_generator = "mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:main"
  # analysis.amplicon
  are_there_primers = "mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main"
  assess_inflection_point_mcp = "mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main"
@@ -54,6 +60,8 @@ rev_comp_se_primers = "mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_pr
  standard_primer_matching = "mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main"
  mapseq_to_asv_table = "mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main"
  primer_val_classification = "mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main"
+ # analysis.assembly
+ add_rhea_chebi_annotation = "mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main"
  # utils
  fasta_to_delimited = "mgnify_pipelines_toolkit.utils.fasta_to_delimited:main"
  get_mpt_version = "mgnify_pipelines_toolkit.utils.get_mpt_version:main"
@@ -66,7 +74,10 @@ tests = [
      "biopython==1.82",
      "pandas==2.0.2",
      "numpy==1.26.0",
-     "regex==2023.12.25"
+     "regex==2023.12.25",
+     "requests==2.32.3",
+     "click==8.1.7",
+     "pandera==0.22.1"
  ]

  dev = [