mgnify-pipelines-toolkit 1.0.1.tar.gz → 1.0.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic.

Files changed (58)
  1. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/PKG-INFO +3 -2
  2. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +44 -45
  3. mgnify_pipelines_toolkit-1.0.3/mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py +127 -0
  4. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +56 -49
  5. mgnify_pipelines_toolkit-1.0.3/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +116 -0
  6. mgnify_pipelines_toolkit-1.0.3/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +141 -0
  7. mgnify_pipelines_toolkit-1.0.3/mgnify_pipelines_toolkit/constants/ncrna.py +84 -0
  8. mgnify_pipelines_toolkit-1.0.3/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  9. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +3 -2
  10. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +4 -0
  11. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +7 -1
  12. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/pyproject.toml +15 -6
  13. mgnify_pipelines_toolkit-1.0.1/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -139
  14. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/LICENSE +0 -0
  15. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/README.md +0 -0
  16. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/__init__.py +0 -0
  17. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  18. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +0 -0
  19. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -0
  20. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -0
  21. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +0 -0
  22. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
  23. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -0
  24. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
  25. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
  26. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +0 -0
  27. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  28. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  29. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -0
  30. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +0 -0
  31. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +0 -0
  32. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +0 -0
  33. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +0 -0
  34. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +0 -0
  35. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +0 -0
  36. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +0 -0
  37. {mgnify_pipelines_toolkit-1.0.1/mgnify_pipelines_toolkit/analysis/shared → mgnify_pipelines_toolkit-1.0.3/mgnify_pipelines_toolkit/analysis/genomes}/__init__.py +0 -0
  38. {mgnify_pipelines_toolkit-1.0.1/mgnify_pipelines_toolkit/utils → mgnify_pipelines_toolkit-1.0.3/mgnify_pipelines_toolkit/analysis/shared}/__init__.py +0 -0
  39. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +0 -0
  40. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
  41. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  42. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
  43. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  44. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +0 -0
  45. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py +0 -0
  46. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/constants/db_labels.py +0 -0
  47. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -0
  48. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
  49. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/constants/tax_ranks.py +0 -0
  50. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
  51. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  52. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/schemas/schemas.py +0 -0
  53. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
  54. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
  55. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  56. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit.egg-info/requires.txt +0 -0
  57. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  58. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.3}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: mgnify_pipelines_toolkit
- Version: 1.0.1
+ Version: 1.0.3
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -38,6 +38,7 @@ Requires-Dist: pre-commit==3.8.0; extra == "dev"
  Requires-Dist: black==24.8.0; extra == "dev"
  Requires-Dist: flake8==7.1.1; extra == "dev"
  Requires-Dist: pep8-naming==0.14.1; extra == "dev"
+ Dynamic: license-file

  # mgnify-pipelines-toolkit

mgnify_pipelines_toolkit/analysis/assembly/go_utils.py
@@ -84,52 +84,51 @@ def parse_interproscan_tsv(ips_file: Path, mapped_go_terms: dict = None) -> dict
  previous_protein_acc = None
  go_annotations_single_protein = set()

- fr = open(ips_file, "r")
  go_pattern = re.compile("GO:\\d+")

- for line in fr:
- # IPS files are parsed line by line - the same protein accession will appear multiple lines in a row with different annotation
- line_counter += 1
- line = line.strip()
- chunks = line.split("\t")
- # Get protein accession
- current_protein_acc = chunks[0]
-
- # TODO: not sure if this line is needed - do we ever have more than one protein in a single line of IPS?
- # Will keep just in case
- num_of_proteins = len(current_protein_acc.split("|"))
-
- # If we're at a new protein accession in the IPS file then we finally increment
- # the go2protein_count dictionary for each term that was found in that protein
- if current_protein_acc != previous_protein_acc:
- total_num_of_proteins += 1
- if len(go_annotations_single_protein) > 0:
- num_of_proteins_with_go += 1
- go2protein_count = count_and_assign_go_annotations(
- go2protein_count,
- go_annotations_single_protein,
- num_of_proteins,
- mapped_go_terms,
- )
- # reset GO id set because we hit a new protein accession
- go_annotations_single_protein = set()
- previous_protein_acc = current_protein_acc
-
- # Parse out GO annotations
- # GO annotations are associated to InterPro entries (InterPro entries start with 'IPR')
- # Than use the regex to extract the GO Ids (e.g. GO:0009842)
- if len(chunks) >= 13 and chunks[11].startswith("IPR"):
- for go_annotation in go_pattern.findall(line):
- go_annotations_single_protein.add(go_annotation)
-
- # Do final counting for the last protein
- go2protein_count = count_and_assign_go_annotations(
- go2protein_count,
- go_annotations_single_protein,
- num_of_proteins,
- mapped_go_terms,
- )
-
- fr.close()
+ with open(ips_file, "r") as fr:
+
+ for line in fr:
+ # IPS files are parsed line by line - the same protein accession will appear multiple lines in a row with different annotation
+ line_counter += 1
+ line = line.strip()
+ chunks = line.split("\t")
+ # Get protein accession
+ current_protein_acc = chunks[0]
+
+ # TODO: not sure if this line is needed - do we ever have more than one protein in a single line of IPS?
+ # Will keep just in case
+ num_of_proteins = len(current_protein_acc.split("|"))
+
+ # If we're at a new protein accession in the IPS file then we finally increment
+ # the go2protein_count dictionary for each term that was found in that protein
+ if current_protein_acc != previous_protein_acc:
+ total_num_of_proteins += 1
+ if len(go_annotations_single_protein) > 0:
+ num_of_proteins_with_go += 1
+ go2protein_count = count_and_assign_go_annotations(
+ go2protein_count,
+ go_annotations_single_protein,
+ num_of_proteins,
+ mapped_go_terms,
+ )
+ # reset GO id set because we hit a new protein accession
+ go_annotations_single_protein = set()
+ previous_protein_acc = current_protein_acc
+
+ # Parse out GO annotations
+ # GO annotations are associated to InterPro entries (InterPro entries start with 'IPR')
+ # Than use the regex to extract the GO Ids (e.g. GO:0009842)
+ if len(chunks) >= 13 and chunks[11].startswith("IPR"):
+ for go_annotation in go_pattern.findall(line):
+ go_annotations_single_protein.add(go_annotation)
+
+ # Do final counting for the last protein
+ go2protein_count = count_and_assign_go_annotations(
+ go2protein_count,
+ go_annotations_single_protein,
+ num_of_proteins,
+ mapped_go_terms,
+ )

  return go2protein_count
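
Note on the hunk above: the refactor only swaps the explicit open/close for a with block; the GO-extraction logic itself is unchanged. For orientation, a minimal, self-contained sketch of that extraction step (the row below is a fabricated InterProScan-style line, not package data):

import re

go_pattern = re.compile("GO:\\d+")
# Fabricated InterProScan-style row: protein accession in column 1,
# an InterPro entry in column 12, GO terms in the final column.
fields = ["PROT_1"] + ["-"] * 10 + ["IPR000123", "Nucleotide binding", "GO:0009842|GO:0005524"]
line = "\t".join(fields)
chunks = line.split("\t")
go_ids = set()
if len(chunks) >= 13 and chunks[11].startswith("IPR"):
    go_ids.update(go_pattern.findall(line))
print(sorted(go_ids))  # ['GO:0005524', 'GO:0009842']
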

mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py (new file)
@@ -0,0 +1,127 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2025 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ from collections import Counter
+ import csv
+ import logging
+
+ RANK_PREFIXES = {
+ "superkingdom": "sk__",
+ "kingdom": "k__",
+ "phylum": "p__",
+ "class": "c__",
+ "order": "o__",
+ "family": "f__",
+ "genus": "g__",
+ "species": "s__",
+ }
+
+ logging.basicConfig(
+ level=logging.INFO, format="[%(asctime)s] - %(levelname)s - %(message)s"
+ )
+
+
+ def import_nodes(nodes_dmp):
+ logging.info(f"Loading file {nodes_dmp}")
+ taxid2rank = {}
+
+ with open(nodes_dmp) as f1:
+ reader = csv.reader(f1, delimiter="\t")
+ for line in reader:
+ taxid = line[0]
+ rank = line[4]
+ taxid2rank[taxid] = rank
+
+ return taxid2rank
+
+
+ def import_names(names_dmp):
+ logging.info(f"Loading file {names_dmp}")
+ taxid2name = {}
+
+ with open(names_dmp, newline="") as f1:
+ reader = csv.reader(f1, delimiter="\t")
+ for line in reader:
+ if line[6] == "scientific name":
+ taxid = line[0]
+ name = line[2]
+ taxid2name[taxid] = name
+
+ return taxid2name
+
+
+ def convert_to_official_names(lineage, taxid2rank, taxid2name):
+ lineage_ranks = [taxid2rank[taxid.rstrip("*")] for taxid in lineage]
+ official_names = list(RANK_PREFIXES.values())
+ lowest_classification_index = -1
+
+ for i, rank in enumerate(RANK_PREFIXES):
+ if rank in lineage_ranks:
+ index = lineage_ranks.index(rank)
+ taxid = lineage[index].rstrip("*")
+ name = taxid2name[taxid]
+ official_names[i] = official_names[i] + name
+ lowest_classification_index = i
+
+ return official_names[: lowest_classification_index + 1]
+
+
+ def main():
+ parser = argparse.ArgumentParser(
+ description="Process TSV classification generated by CAT_pack contigs and write input file for Krona ktImportText"
+ )
+ parser.add_argument(
+ "-i", "--input", help="Path to the input TSV file from CAT_pack contigs"
+ )
+ parser.add_argument("-o", "--output", help="Name of the output Krona TXT file")
+ parser.add_argument(
+ "-n", "--names_dmp", help="Path to the nodes.dmp file from NCBI taxonomy"
+ )
+ parser.add_argument(
+ "-r", "--nodes_dmp", help="Path to the names.dmp file from NCBI taxonomy"
+ )
+ args = parser.parse_args()
+
+ taxid2rank = import_nodes(args.nodes_dmp)
+ taxid2name = import_names(args.names_dmp)
+
+ logging.info(f"Begin parsing of CAT_pack classiffication file {args.input}")
+ lineage_counter = Counter()
+ with open(args.input) as infile:
+ reader = csv.reader(infile, delimiter="\t")
+ next(reader) # Skip the header row
+ for row in reader:
+ if row[1] == "no taxid assigned":
+ lineage = "unclassified"
+ else:
+ taxid_lineage = row[3].split(";")
+ names_lineage = convert_to_official_names(
+ taxid_lineage, taxid2rank, taxid2name
+ )
+ lineage = "\t".join(names_lineage) if names_lineage else "unclassified"
+ lineage_counter[lineage] += 1
+
+ logging.info(f"Writting output to {args.output}")
+ with open(args.output, "w") as outfile:
+ for lineage, count in lineage_counter.most_common():
+ outfile.write(f"{count}\t{lineage}\n")
+
+ logging.info("Done")
+
+
+ if __name__ == "__main__":
+ main()
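
Note on the new script above: it collapses CAT_pack contig classifications into rank-prefixed lineages and writes one "count<TAB>lineage" line per lineage for Krona's ktImportText. A small runnable sketch of the same conversion logic, with the lookups hand-built here rather than loaded from nodes.dmp/names.dmp (the toy lineage is illustrative):

# Rank prefixes as defined in the script; taxid lookups below are stand-ins
# for what import_nodes()/import_names() would load from the NCBI dump files.
RANK_PREFIXES = {
    "superkingdom": "sk__", "kingdom": "k__", "phylum": "p__", "class": "c__",
    "order": "o__", "family": "f__", "genus": "g__", "species": "s__",
}
taxid2rank = {"2": "superkingdom", "1224": "phylum", "28211": "class"}
taxid2name = {"2": "Bacteria", "1224": "Pseudomonadota", "28211": "Alphaproteobacteria"}
lineage = ["2", "1224*", "28211"]  # CAT_pack lineages can carry a trailing '*'

# Same logic as convert_to_official_names(): prefix every classified rank,
# keep empty placeholders for gaps, cut the list at the lowest classified rank.
ranks = [t.rstrip("*") and taxid2rank[t.rstrip("*")] for t in lineage]
names = list(RANK_PREFIXES.values())
lowest = -1
for i, rank in enumerate(RANK_PREFIXES):
    if rank in ranks:
        names[i] += taxid2name[lineage[ranks.index(rank)].rstrip("*")]
        lowest = i
print("\t".join(names[: lowest + 1]))
# sk__Bacteria    k__    p__Pseudomonadota    c__Alphaproteobacteria
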

mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py
@@ -15,9 +15,10 @@
  # limitations under the License.

  import argparse
- from collections import defaultdict
+ import csv
  import logging
  import os
+ from collections import defaultdict
  from pathlib import Path

  from mgnify_pipelines_toolkit.analysis.assembly.go_utils import parse_interproscan_tsv
@@ -28,7 +29,6 @@ logging.basicConfig(


  def parse_args():
-
  description = "Go slim pipeline."
  parser = argparse.ArgumentParser(description=description)
  parser.add_argument(
@@ -59,46 +59,56 @@ def parse_args():


  def parse_mapped_gaf_file(gaf_file: Path) -> defaultdict[set]:
-
  mapped_go_dict = defaultdict(set)
  if os.path.exists(gaf_file):
- handle = open(gaf_file, "r")
- for line in handle:
- if not line.startswith("!"):
- line = line.strip()
- splitted_line = line.split("\t")
- go_id = splitted_line[1]
- mapped_go_id = splitted_line[4]
- mapped_go_dict[go_id].add(mapped_go_id)
-
+ with open(gaf_file, "r") as handle:
+ for line in handle:
+ if not line.startswith("!"):
+ line = line.strip()
+ splitted_line = line.split("\t")
+ go_id = splitted_line[1]
+ mapped_go_id = splitted_line[4]
+ mapped_go_dict[go_id].add(mapped_go_id)
  return mapped_go_dict


  def get_go_slim_summary(go_slim_banding_file, goslims2_protein_count):
  summary = []

- fr = open(go_slim_banding_file, "r")
-
- for line in fr:
- if line.startswith("GO"):
- line = line.strip()
- line_chunks = line.split("\t")
- go_id = line_chunks[0]
- term = line_chunks[1]
- category = line_chunks[2]
- # Default value for the count
- count = 0
- if go_id in goslims2_protein_count:
- count = goslims2_protein_count[go_id]
- summary.append((go_id, term, category, count))
+ with open(go_slim_banding_file, "r") as fr:
+ for line in fr:
+ if line.startswith("GO"):
+ line = line.strip()
+ line_chunks = line.split("\t")
+ go_id = line_chunks[0]
+ term = line_chunks[1]
+ category = line_chunks[2]
+ # Default value for the count
+ count = 0
+ if go_id in goslims2_protein_count:
+ count = goslims2_protein_count[go_id]
+ summary.append((go_id, term, category, count))
  return summary


  def write_go_summary_to_file(go_summary, output_file):
- fw = open(output_file, "w")
- for go, term, category, count in go_summary:
- fw.write('","'.join(['"' + go, term, category, str(count) + '"']) + "\n")
- fw.close()
+ """
+ Write a sorted GO summary to a TSV file.
+
+ :param go_summary: A list of tuples, where each tuple contains the following
+ elements:
+ - go (str): The GO identifier.
+ - term (str): The GO term description.
+ - category (str): The category of the GO term.
+ - count (int): The count associated with the GO term.
+ :param output_file: The path to the output TSV file where the sorted GO
+ """
+ sorted_go_summary = sorted(go_summary, key=lambda x: x[3], reverse=True)
+ with open(output_file, "w", newline="") as fw:
+ tsv_writer = csv.writer(fw, delimiter="\t")
+ tsv_writer.writerow(["go", "term", "category", "count"])
+ for go, term, category, count in sorted_go_summary:
+ tsv_writer.writerow([go, term, category, count])


  def parse_gene_ontology(obo_file):
@@ -108,23 +118,22 @@ def parse_gene_ontology(obo_file):
  :return:
  """
  go_term_tuples = []
- fr = open(obo_file, "r")
- id, term, category = "", "", ""
- for line in fr:
- line = line.strip()
- split_line = line.split(": ")
- if line.startswith("id:"):
- id = split_line[1]
- elif line.startswith("name:"):
- term = split_line[1]
- elif line.startswith("namespace"):
- category = split_line[1]
- else:
- if id.startswith("GO:") and id and term and category:
- item = (id, term, category)
- go_term_tuples.append(item)
- id, term, category = "", "", ""
- fr.close()
+ with open(obo_file, "r") as fr:
+ id, term, category = "", "", ""
+ for line in fr:
+ line = line.strip()
+ split_line = line.split(": ")
+ if line.startswith("id:"):
+ id = split_line[1]
+ elif line.startswith("name:"):
+ term = split_line[1]
+ elif line.startswith("namespace"):
+ category = split_line[1]
+ else:
+ if id.startswith("GO:") and id and term and category:
+ item = (id, term, category)
+ go_term_tuples.append(item)
+ id, term, category = "", "", ""
  return go_term_tuples


@@ -132,7 +141,6 @@ def get_full_go_summary(core_gene_ontology, go2protein_count_dict, top_level_go_
  summary = []

  for go_id, term, category in core_gene_ontology:
-
  if (go_id in go2protein_count_dict) and (
  go_id not in top_level_go_ids
  ): # make sure that top level terms are not included (they tell you nothing!)
@@ -143,7 +151,6 @@


  def main():
-
  go_obo, go_banding, gaf_input, ips_input, output = parse_args()

  logging.info("Parsing the InterProScan input: " + ips_input)

mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py (new file)
@@ -0,0 +1,116 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ # Copyright 2025 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ Script to convert cmscan-table to cmsearch-table (swap columns 1 and 2 with 3 and 4)
+
+ input example:
+ #target name accession query name accession mdl mdl from mdl to seq from seq to strand ..
+ #------------------- --------- -------------------- --------- --- -------- -------- -------- -------- ------ ..
+ SSU_rRNA_eukarya RF01960 SRR17062740.1 - cm 582 1025 1 452 + ..
+
+ expected output:
+ #------------------- --------- -------------------- --------- --- -------- -------- -------- -------- ------ ..
+ #target name accession query name accession mdl mdl from mdl to seq from seq to strand ..
+ SRR17062740.1 - SSU_rRNA_eukarya RF01960 cm 582 1025 1 452 + ..
+
+ """
+
+ import sys
+ import argparse
+ import fileinput
+ from itertools import accumulate
+
+
+ def parse_args(argv):
+ parser = argparse.ArgumentParser(
+ description="Convert cmscan table to cmsearch table"
+ )
+ parser.add_argument(
+ "-i", "--input", dest="input", help="Input cmscan file", required=True
+ )
+ parser.add_argument(
+ "-o", "--output", dest="output", help="Output filename", required=True
+ )
+ return parser.parse_args(argv)
+
+
+ class TableModifier:
+ def __init__(
+ self,
+ input_file: str,
+ output_file: str,
+ ):
+ """
+ Output of cmsearch-table has columns separated with different number of spaces (to keep humanreadable format)
+ :param input_file: output of cmscan-table
+ :param output_file: name of cmsearch table
+ """
+ self.input_file = input_file
+ self.output_file = output_file
+
+ def modify_table(self):
+ with fileinput.hook_compressed(self.input_file, "rt") as file_in, open(
+ self.output_file, "w"
+ ) as file_out:
+ header_written = False
+ separator_line, header = "", ""
+ for line in file_in:
+ if line.startswith("#"):
+ if "--" in line:
+ separator_line = line.split(" ")
+ separator_line[0] = separator_line[0].replace("#", "-")
+ lengths = [0] + list(
+ accumulate(len(s) + 1 for s in separator_line)
+ )
+ else:
+ header = line
+ else:
+ coord_to_keep = len(" ".join(separator_line[0:4]))
+ if not header_written:
+ file_out.write(header)
+ file_out.write(
+ " ".join(
+ [
+ "#" + separator_line[2][1:],
+ separator_line[3],
+ separator_line[0].replace("#", ""),
+ separator_line[1],
+ ]
+ + separator_line[4:]
+ )
+ )
+ header_written = True
+ new_line = (
+ line[lengths[2] : lengths[3]]
+ + line[lengths[3] : lengths[4]]
+ + line[lengths[0] : lengths[1]]
+ + line[lengths[1] : lengths[2]]
+ + line[coord_to_keep + 1 :]
+ )
+ file_out.write(new_line)
+
+
+ def main():
+ args = parse_args(sys.argv[1:])
+ table_modifier = TableModifier(
+ input_file=args.input,
+ output_file=args.output,
+ )
+ table_modifier.modify_table()
+
+
+ if __name__ == "__main__":
+ main()
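
Note on the new converter above: it derives column boundaries from the dashed separator row and swaps the first two columns with the next two using slices computed via itertools.accumulate. A toy, self-contained illustration of that slicing idea (the four-column fixed-width table here is invented, not real cmscan output):

from itertools import accumulate

# Toy separator row standing in for the cmscan '---' line (four columns).
separator = "----- --- ------- ----".split(" ")
lengths = [0] + list(accumulate(len(s) + 1 for s in separator))
row = "SSU   RF0 SRR1.1  -    "
# Swap columns 1-2 with columns 3-4, as modify_table() does for each data line.
swapped = (
    row[lengths[2]:lengths[3]]
    + row[lengths[3]:lengths[4]]
    + row[lengths[0]:lengths[1]]
    + row[lengths[1]:lengths[2]]
)
print(repr(swapped))  # 'SRR1.1  -    SSU   RF0 '
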

mgnify_pipelines_toolkit/analysis/shared/get_subunits.py (new file)
@@ -0,0 +1,141 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ import argparse
+ import os
+ from Bio import SeqIO
+ from mgnify_pipelines_toolkit.constants.ncrna import (
+ DIRECTORY_SEQ_CAT,
+ SSU,
+ LSU,
+ Seq5S,
+ Seq5_8S,
+ SSU_rRNA_archaea,
+ SSU_rRNA_bacteria,
+ SSU_rRNA_eukarya,
+ SSU_rRNA_microsporidia,
+ LSU_rRNA_archaea,
+ LSU_rRNA_bacteria,
+ LSU_rRNA_eukarya,
+ NON_CODING_RNA,
+ SSU_MODELS,
+ LSU_MODELS,
+ RFAM_MODELS,
+ )
+
+
+ def set_model_names(prefix, name, directory, separate_subunits):
+ pattern_dict = {}
+ pattern_dict[SSU] = os.path.join(directory, f"{name}_SSU.fasta")
+ pattern_dict[LSU] = os.path.join(directory, f"{name}_LSU.fasta")
+ pattern_dict[Seq5S] = os.path.join(directory, f"{name}_5S.fasta")
+ pattern_dict[Seq5_8S] = os.path.join(directory, f"{name}_5_8S.fasta")
+ if separate_subunits:
+ pattern_dict[SSU_rRNA_archaea] = os.path.join(
+ directory,
+ f"{prefix}{name}_{SSU_rRNA_archaea}.{RFAM_MODELS[SSU_rRNA_archaea]}.fasta",
+ )
+ pattern_dict[SSU_rRNA_bacteria] = os.path.join(
+ directory,
+ f"{prefix}{name}_{SSU_rRNA_bacteria}.{RFAM_MODELS[SSU_rRNA_bacteria]}.fasta",
+ )
+ pattern_dict[SSU_rRNA_eukarya] = os.path.join(
+ directory,
+ f"{prefix}{name}_{SSU_rRNA_eukarya}.{RFAM_MODELS[SSU_rRNA_eukarya]}.fasta",
+ )
+ pattern_dict[SSU_rRNA_microsporidia] = os.path.join(
+ directory,
+ f"{prefix}{name}_{SSU_rRNA_microsporidia}.{RFAM_MODELS[SSU_rRNA_microsporidia]}.fasta",
+ )
+ pattern_dict[LSU_rRNA_archaea] = os.path.join(
+ directory,
+ f"{prefix}{name}_{LSU_rRNA_archaea}.{RFAM_MODELS[LSU_rRNA_archaea]}.fasta",
+ )
+ pattern_dict[LSU_rRNA_bacteria] = os.path.join(
+ directory,
+ f"{prefix}{name}_{LSU_rRNA_bacteria}.{RFAM_MODELS[LSU_rRNA_bacteria]}.fasta",
+ )
+ pattern_dict[LSU_rRNA_eukarya] = os.path.join(
+ directory,
+ f"{prefix}{name}_{LSU_rRNA_eukarya}.{RFAM_MODELS[LSU_rRNA_eukarya]}.fasta",
+ )
+ return pattern_dict
+
+
+ def main():
+ parser = argparse.ArgumentParser(
+ description="Extract lsu, ssu and 5s and other models"
+ )
+ parser.add_argument(
+ "-i", "--input", dest="input", help="Input fasta file", required=True
+ )
+ parser.add_argument(
+ "-p", "--prefix", dest="prefix", help="prefix for models", required=False
+ )
+ parser.add_argument("-n", "--name", dest="name", help="Accession", required=True)
+ parser.add_argument(
+ "--separate-subunits-by-models",
+ action="store_true",
+ help="Create separate files for each kingdon example: sample_SSU_rRNA_eukarya.RF01960.fasta",
+ )
+
+ args = parser.parse_args()
+ prefix = args.prefix if args.prefix else ""
+ name = args.name if args.name else "accession"
+
+ directory = DIRECTORY_SEQ_CAT
+ if not os.path.exists(directory):
+ os.makedirs(directory)
+
+ print("Start fasta mode")
+ pattern_dict = set_model_names(
+ prefix, name, directory, args.separate_subunits_by_models
+ )
+
+ open_files = {}
+ for record in SeqIO.parse(args.input, "fasta"):
+ model = "-".join(record.id.split("/")[0].split("-")[-1:])
+ if model in SSU_MODELS:
+ if SSU not in open_files:
+ file_out = open(pattern_dict[SSU], "w")
+ open_files[SSU] = file_out
+ SeqIO.write(record, open_files[SSU], "fasta")
+ elif model in LSU_MODELS:
+ if LSU not in open_files:
+ file_out = open(pattern_dict[LSU], "w")
+ open_files[LSU] = file_out
+ SeqIO.write(record, open_files[LSU], "fasta")
+
+ if model in NON_CODING_RNA:
+ if model in pattern_dict:
+ filename = pattern_dict[model]
+ else:
+ filename = None
+ else:
+ filename = os.path.join(directory, f"{name}_other_ncRNA.fasta")
+ if filename:
+ if model not in open_files:
+ file_out = open(filename, "w")
+ open_files[model] = file_out
+ SeqIO.write(record, open_files[model], "fasta")
+
+ for item in open_files:
+ open_files[item].close()
+
+
+ if __name__ == "__main__":
+ main()
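
Note on the rewritten get_subunits.py above: the rRNA model name is taken from the final dash-separated token of the record id, before the coordinate suffix. A small sketch assuming a record id of the form "<read>-<model>/<start>-<end>" (the id below is hypothetical):

# Assumed record id layout: "<read>-<model>/<start>-<end>" (hypothetical example).
record_id = "SRR17062740.1-SSU_rRNA_bacteria/3-1500"
model = "-".join(record_id.split("/")[0].split("-")[-1:])
print(model)  # SSU_rRNA_bacteria
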

mgnify_pipelines_toolkit/constants/ncrna.py (new file)
@@ -0,0 +1,84 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ DIRECTORY_SEQ_CAT = "sequence-categorisation"
+
+ SSU = "SSU_rRNA"
+ LSU = "LSU_rRNA"
+ Seq5S = "mtPerm-5S"
+ Seq5_8S = "5_8S_rRNA"
+
+ SSU_rRNA_archaea = "SSU_rRNA_archaea"
+ SSU_rRNA_bacteria = "SSU_rRNA_bacteria"
+ SSU_rRNA_eukarya = "SSU_rRNA_eukarya"
+ SSU_rRNA_microsporidia = "SSU_rRNA_microsporidia"
+
+ LSU_rRNA_archaea = "LSU_rRNA_archaea"
+ LSU_rRNA_bacteria = "LSU_rRNA_bacteria"
+ LSU_rRNA_eukarya = "LSU_rRNA_eukarya"
+
+ NON_CODING_RNA = [
+ SSU_rRNA_archaea,
+ SSU_rRNA_bacteria,
+ SSU_rRNA_eukarya,
+ SSU_rRNA_microsporidia,
+ LSU_rRNA_archaea,
+ LSU_rRNA_bacteria,
+ LSU_rRNA_eukarya,
+ Seq5S,
+ Seq5_8S,
+ ]
+
+ SSU_MODELS = [
+ SSU_rRNA_archaea,
+ SSU_rRNA_bacteria,
+ SSU_rRNA_eukarya,
+ SSU_rRNA_microsporidia,
+ ]
+
+ LSU_MODELS = [LSU_rRNA_archaea, LSU_rRNA_bacteria, LSU_rRNA_eukarya]
+
+ RFAM_MODELS = {
+ SSU_rRNA_archaea: "RF01959",
+ SSU_rRNA_bacteria: "RF00177",
+ SSU_rRNA_eukarya: "RF01960",
+ SSU_rRNA_microsporidia: "RF02542",
+ LSU_rRNA_archaea: "RF02540",
+ LSU_rRNA_bacteria: "RF02541",
+ LSU_rRNA_eukarya: "RF02543",
+ }
+
+ TRNA = [
+ "Ala",
+ "Gly",
+ "Pro",
+ "Thr",
+ "Val",
+ "Ser",
+ "Arg",
+ "Leu",
+ "Phe",
+ "Asn",
+ "Lys",
+ "Asp",
+ "Glu",
+ "His",
+ "Gln",
+ "Ile",
+ "Tyr",
+ "Cys",
+ "Trp",
+ ]
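
Note on the new constants module above: RFAM_MODELS maps each rRNA model name to its Rfam accession, which get_subunits.py uses to build per-model output filenames when --separate-subunits-by-models is set. A sketch of the resulting filename pattern (ERZ123456 is a made-up accession):

# Filename pattern built from these constants (accession is hypothetical).
DIRECTORY_SEQ_CAT = "sequence-categorisation"
RFAM_MODELS = {"SSU_rRNA_bacteria": "RF00177"}
prefix, name, model = "", "ERZ123456", "SSU_rRNA_bacteria"
print(f"{DIRECTORY_SEQ_CAT}/{prefix}{name}_{model}.{RFAM_MODELS[model]}.fasta")
# sequence-categorisation/ERZ123456_SSU_rRNA_bacteria.RF00177.fasta
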

mgnify_pipelines_toolkit.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: mgnify_pipelines_toolkit
- Version: 1.0.1
+ Version: 1.0.3
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -38,6 +38,7 @@ Requires-Dist: pre-commit==3.8.0; extra == "dev"
  Requires-Dist: black==24.8.0; extra == "dev"
  Requires-Dist: flake8==7.1.1; extra == "dev"
  Requires-Dist: pep8-naming==0.14.1; extra == "dev"
+ Dynamic: license-file

  # mgnify-pipelines-toolkit

mgnify_pipelines_toolkit.egg-info/SOURCES.txt
@@ -29,8 +29,11 @@ mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py
  mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py
  mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py
  mgnify_pipelines_toolkit/analysis/assembly/go_utils.py
+ mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py
  mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py
+ mgnify_pipelines_toolkit/analysis/genomes/__init__.py
  mgnify_pipelines_toolkit/analysis/shared/__init__.py
+ mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py
  mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py
  mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py
  mgnify_pipelines_toolkit/analysis/shared/get_subunits.py
@@ -40,6 +43,7 @@ mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py
  mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py
  mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py
  mgnify_pipelines_toolkit/constants/db_labels.py
+ mgnify_pipelines_toolkit/constants/ncrna.py
  mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py
  mgnify_pipelines_toolkit/constants/regex_fasta_header.py
  mgnify_pipelines_toolkit/constants/tax_ranks.py

mgnify_pipelines_toolkit.egg-info/entry_points.txt
@@ -1,18 +1,24 @@
  [console_scripts]
  add_rhea_chebi_annotation = mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main
+ antismash_gff_builder = mgnify_pipelines_toolkit.analysis.assembly.antismash_gff_builder:main
  are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main
  assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
  assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
  classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
  combined_gene_caller_merge = mgnify_pipelines_toolkit.analysis.assembly.combined_gene_caller_merge:main
- dwc_summary_generator = mgnify_pipelines_toolkit.analysis.assembly.dwc_summary_generator:main
+ convert_cmscan_to_cmsearch_tblout = mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main
+ dwc_summary_generator = mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:main
  fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
  fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
  find_mcp_inflection_points = mgnify_pipelines_toolkit.analysis.amplicon.find_mcp_inflection_points:main
  generate_gaf = mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main
+ genomes_extract_bacterial_rrnas_as_tsv = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_bacterial_rrnas_as_tsv:main
+ genomes_extract_rrnas_as_fasta = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_rrnas_as_fasta:main
+ genomes_extract_trnas = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_trnas:main
  get_mpt_version = mgnify_pipelines_toolkit.utils.get_mpt_version:main
  get_subunits = mgnify_pipelines_toolkit.analysis.shared.get_subunits:main
  get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
+ krona_txt_from_cat_classification = mgnify_pipelines_toolkit.analysis.assembly.krona_txt_from_cat_classification:main
  library_strategy_check = mgnify_pipelines_toolkit.analysis.shared.library_strategy_check:main
  make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main
  mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main

pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "mgnify_pipelines_toolkit"
- version = "1.0.1"
+ version = "1.0.3"
  readme = "README.md"
  license = {text = "Apache Software License 2.0"}
  authors = [
@@ -40,10 +40,11 @@ packages = ["mgnify_pipelines_toolkit",
  "mgnify_pipelines_toolkit.analysis.shared",
  "mgnify_pipelines_toolkit.analysis.amplicon",
  "mgnify_pipelines_toolkit.analysis.assembly",
- ]
+ "mgnify_pipelines_toolkit.analysis.genomes"
+ ]

  [project.scripts]
- # analysis.shared
+ # analysis.shared #
  get_subunits = "mgnify_pipelines_toolkit.analysis.shared.get_subunits:main"
  get_subunits_coords = "mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main"
  mapseq2biom = "mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main"
@@ -51,7 +52,9 @@ fastq_suffix_header_check = "mgnify_pipelines_toolkit.analysis.shared.fastq_suff
  library_strategy_check = "mgnify_pipelines_toolkit.analysis.shared.library_strategy_check:main"
  study_summary_generator = "mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:cli"
  markergene_study_summary = "mgnify_pipelines_toolkit.analysis.shared.markergene_study_summary:main"
- # analysis.amplicon
+ convert_cmscan_to_cmsearch_tblout = "mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main"
+ dwc_summary_generator = "mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:main"
+ # analysis.amplicon #
  are_there_primers = "mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main"
  assess_inflection_point_mcp = "mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main"
  assess_mcp_proportions = "mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main"
@@ -63,12 +66,18 @@ rev_comp_se_primers = "mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_pr
  standard_primer_matching = "mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main"
  mapseq_to_asv_table = "mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main"
  primer_val_classification = "mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main"
- # analysis.assembly
+ # analysis.assembly #
+ krona_txt_from_cat_classification = "mgnify_pipelines_toolkit.analysis.assembly.krona_txt_from_cat_classification:main"
  add_rhea_chebi_annotation = "mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main"
  combined_gene_caller_merge = "mgnify_pipelines_toolkit.analysis.assembly.combined_gene_caller_merge:main"
  generate_gaf = "mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main"
  summarise_goslims = "mgnify_pipelines_toolkit.analysis.assembly.summarise_goslims:main"
- dwc_summary_generator = "mgnify_pipelines_toolkit.analysis.assembly.dwc_summary_generator:main"
+ antismash_gff_builder = "mgnify_pipelines_toolkit.analysis.assembly.antismash_gff_builder:main"
+ # genomes #
+ genomes_extract_bacterial_rrnas_as_tsv = "mgnify_pipelines_toolkit.analysis.genomes.rna.extract_bacterial_rrnas_as_tsv:main"
+ genomes_extract_rrnas_as_fasta = "mgnify_pipelines_toolkit.analysis.genomes.rna.extract_rrnas_as_fasta:main"
+ genomes_extract_trnas = "mgnify_pipelines_toolkit.analysis.genomes.rna.extract_trnas:main"
+
  # utils
  fasta_to_delimited = "mgnify_pipelines_toolkit.utils.fasta_to_delimited:main"
  get_mpt_version = "mgnify_pipelines_toolkit.utils.get_mpt_version:main"

mgnify_pipelines_toolkit-1.0.1/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py (removed)
@@ -1,139 +0,0 @@
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
-
- # Copyright 2024-2025 EMBL - European Bioinformatics Institute
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import argparse
- import os
- from Bio import SeqIO
-
-
- SSU = "SSU_rRNA"
- LSU = "LSU_rRNA"
- Seq5S = "mtPerm-5S"
- Seq5_8S = "5_8S_rRNA"
-
- SSU_rRNA_archaea = "SSU_rRNA_archaea"
- SSU_rRNA_bacteria = "SSU_rRNA_bacteria"
- SSU_rRNA_eukarya = "SSU_rRNA_eukarya"
- SSU_rRNA_microsporidia = "SSU_rRNA_microsporidia"
-
- LSU_rRNA_archaea = "LSU_rRNA_archaea"
- LSU_rRNA_bacteria = "LSU_rRNA_bacteria"
- LSU_rRNA_eukarya = "LSU_rRNA_eukarya"
-
-
- def set_model_names(prefix, name, directory):
- pattern_dict = {}
- pattern_dict[SSU] = os.path.join(directory, f"{name}_SSU.fasta")
- pattern_dict[SSU_rRNA_archaea] = os.path.join(
- directory, f"{prefix}{name}_{SSU_rRNA_archaea}.RF01959.fa"
- )
- pattern_dict[SSU_rRNA_bacteria] = os.path.join(
- directory, f"{prefix}{name}_{SSU_rRNA_bacteria}.RF00177.fa"
- )
- pattern_dict[SSU_rRNA_eukarya] = os.path.join(
- directory, f"{prefix}{name}_{SSU_rRNA_eukarya}.RF01960.fa"
- )
- pattern_dict[SSU_rRNA_microsporidia] = os.path.join(
- directory, f"{prefix}{name}_{SSU_rRNA_microsporidia}.RF02542.fa"
- )
- pattern_dict[LSU] = os.path.join(directory, f"{name}_LSU.fasta")
- pattern_dict[LSU_rRNA_archaea] = os.path.join(
- directory, f"{prefix}{name}_{LSU_rRNA_archaea}.RF02540.fa"
- )
- pattern_dict[LSU_rRNA_bacteria] = os.path.join(
- directory, f"{prefix}{name}_{LSU_rRNA_bacteria}.RF02541.fa"
- )
- pattern_dict[LSU_rRNA_eukarya] = os.path.join(
- directory, f"{prefix}{name}_{LSU_rRNA_eukarya}.RF02543.fa"
- )
- pattern_dict[Seq5S] = os.path.join(directory, f"{name}_5S.fa")
- pattern_dict[Seq5_8S] = os.path.join(directory, f"{name}_5_8S.fa")
- return pattern_dict
-
-
- def main():
- parser = argparse.ArgumentParser(
- description="Extract lsu, ssu and 5s and other models"
- )
- parser.add_argument(
- "-i", "--input", dest="input", help="Input fasta file", required=True
- )
- parser.add_argument(
- "-p", "--prefix", dest="prefix", help="prefix for models", required=False
- )
- parser.add_argument("-n", "--name", dest="name", help="Accession", required=True)
-
- args = parser.parse_args()
- prefix = args.prefix if args.prefix else ""
- name = args.name if args.name else "accession"
-
- directory = "sequence-categorisation"
- if not os.path.exists(directory):
- os.makedirs(directory)
- directory_ncrna = os.path.join("sequence-categorisation", "ncRNA")
- if not os.path.exists(directory_ncrna):
- os.makedirs(directory_ncrna)
-
- print("Start fasta mode")
- pattern_dict = set_model_names(prefix, name, directory)
- coding_rna = [
- SSU_rRNA_archaea,
- SSU_rRNA_bacteria,
- SSU_rRNA_eukarya,
- SSU_rRNA_microsporidia,
- LSU_rRNA_archaea,
- LSU_rRNA_bacteria,
- LSU_rRNA_eukarya,
- Seq5S,
- Seq5_8S,
- ]
- open_files = {}
- for record in SeqIO.parse(args.input, "fasta"):
- model = "-".join(record.id.split("/")[0].split("-")[-1:])
- if model in coding_rna:
- filename = pattern_dict[model]
- else:
- filename = os.path.join(directory_ncrna, f"{prefix}{name}_{model}.fasta")
- if model not in open_files:
- file_out = open(filename, "w")
- open_files[model] = file_out
- SeqIO.write(record, open_files[model], "fasta")
-
- if model in (
- SSU_rRNA_archaea,
- SSU_rRNA_bacteria,
- SSU_rRNA_eukarya,
- SSU_rRNA_microsporidia,
- ):
- if SSU not in open_files:
- file_out = open(pattern_dict[SSU], "w")
- open_files[SSU] = file_out
- SeqIO.write(record, open_files[SSU], "fasta")
- if model in (LSU_rRNA_archaea, LSU_rRNA_bacteria, LSU_rRNA_eukarya):
- if LSU not in open_files:
- file_out = open(pattern_dict[LSU], "w")
- open_files[LSU] = file_out
- SeqIO.write(record, open_files[LSU], "fasta")
-
- for item in open_files:
- open_files[item].close()
-
- if len(os.listdir(directory_ncrna)) == 0:
- os.rmdir(directory_ncrna)
-
-
- if __name__ == "__main__":
- main()