mgnify-pipelines-toolkit 1.0.1__tar.gz → 1.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (56) hide show
  1. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/PKG-INFO +3 -2
  2. mgnify_pipelines_toolkit-1.0.2/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +116 -0
  3. mgnify_pipelines_toolkit-1.0.2/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +141 -0
  4. mgnify_pipelines_toolkit-1.0.2/mgnify_pipelines_toolkit/constants/ncrna.py +62 -0
  5. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +3 -2
  6. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +2 -0
  7. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +1 -0
  8. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/pyproject.toml +2 -1
  9. mgnify_pipelines_toolkit-1.0.1/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -139
  10. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/LICENSE +0 -0
  11. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/README.md +0 -0
  12. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/__init__.py +0 -0
  13. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  14. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +0 -0
  15. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -0
  16. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -0
  17. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +0 -0
  18. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
  19. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -0
  20. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
  21. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
  22. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +0 -0
  23. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  24. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  25. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -0
  26. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +0 -0
  27. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +0 -0
  28. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +0 -0
  29. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +0 -0
  30. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +0 -0
  31. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +0 -0
  32. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +0 -0
  33. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +0 -0
  34. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +0 -0
  35. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  36. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +0 -0
  37. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
  38. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  39. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
  40. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  41. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +0 -0
  42. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py +0 -0
  43. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/constants/db_labels.py +0 -0
  44. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -0
  45. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
  46. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/constants/tax_ranks.py +0 -0
  47. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
  48. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  49. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/schemas/schemas.py +0 -0
  50. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  51. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
  52. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
  53. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  54. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit.egg-info/requires.txt +0 -0
  55. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  56. {mgnify_pipelines_toolkit-1.0.1 → mgnify_pipelines_toolkit-1.0.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 1.0.1
3
+ Version: 1.0.2
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -38,6 +38,7 @@ Requires-Dist: pre-commit==3.8.0; extra == "dev"
38
38
  Requires-Dist: black==24.8.0; extra == "dev"
39
39
  Requires-Dist: flake8==7.1.1; extra == "dev"
40
40
  Requires-Dist: pep8-naming==0.14.1; extra == "dev"
41
+ Dynamic: license-file
41
42
 
42
43
  # mgnify-pipelines-toolkit
43
44
 
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright 2025 EMBL - European Bioinformatics Institute
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ Script to convert cmscan-table to cmsearch-table (swap columns 1 and 2 with 3 and 4)
18
+
19
+ input example:
20
+ #target name accession query name accession mdl mdl from mdl to seq from seq to strand ..
21
+ #------------------- --------- -------------------- --------- --- -------- -------- -------- -------- ------ ..
22
+ SSU_rRNA_eukarya RF01960 SRR17062740.1 - cm 582 1025 1 452 + ..
23
+
24
+ expected output:
25
+ #target name accession query name accession mdl mdl from mdl to seq from seq to strand ..
26
+ #------------------- --------- -------------------- --------- --- -------- -------- -------- -------- ------ ..
+ SRR17062740.1 - SSU_rRNA_eukarya RF01960 cm 582 1025 1 452 + ..
28
+
29
+ """
30
+
31
+ import sys
32
+ import argparse
33
+ import fileinput
34
+ from itertools import accumulate
35
+
36
+
37
def parse_args(argv):
    """
    Parse the command-line options of the cmscan -> cmsearch converter.

    :param argv: list of argument strings (typically ``sys.argv[1:]``)
    :return: namespace exposing the ``input`` and ``output`` attributes
    """
    cli = argparse.ArgumentParser(description="Convert cmscan table to cmsearch table")
    # Both options are mandatory; argparse exits with an error when one is missing.
    cli.add_argument(
        "-i", "--input", dest="input", required=True, help="Input cmscan file"
    )
    cli.add_argument(
        "-o", "--output", dest="output", required=True, help="Output filename"
    )
    options = cli.parse_args(argv)
    return options
48
+
49
+
50
class TableModifier:
    """
    Rewrite a cmscan table so that its first two columns trade places with
    columns three and four, which is the cmsearch column order.
    """

    def __init__(
        self,
        input_file: str,
        output_file: str,
    ):
        """
        The table is column-aligned with a varying number of spaces (to keep a
        human-readable format), so column boundaries are taken from the dashed
        separator line rather than from whitespace splitting.

        :param input_file: output of cmscan-table (plain text, ``.gz`` or ``.bz2``)
        :param output_file: name of cmsearch table
        """
        self.input_file = input_file
        self.output_file = output_file

    @staticmethod
    def _open_text(path):
        """
        Open *path* for reading text, decompressing ``.gz`` / ``.bz2`` files.

        ``fileinput.hook_compressed(path, "rt")`` is not used here: since
        Python 3.10 it wraps the gzip text stream in a second text wrapper and
        hands the unsupported mode ``"rt"`` to ``BZ2File``.
        """
        name = str(path)
        if name.endswith(".gz"):
            import gzip

            return gzip.open(name, "rt")
        if name.endswith(".bz2"):
            import bz2

            return bz2.open(name, "rt")
        return open(name, "r")

    def modify_table(self) -> None:
        """
        Stream the input table to the output file with columns reordered.

        Comment lines (starting with ``#``) are not copied one by one: the most
        recent non-dashed comment line is kept as the header, the most recent
        comment line containing ``--`` defines the column layout, and both are
        written once, immediately before the first data row.

        :raises ValueError: if a data row is met before a separator line with
            at least four columns has been read
        """
        with self._open_text(self.input_file) as file_in, open(
            self.output_file, "w"
        ) as file_out:
            header_written = False
            header = ""
            separator_line = []
            lengths = []
            for line in file_in:
                if line.startswith("#"):
                    if "--" in line:
                        separator_line = line.split(" ")
                        # The leading "#" occupies the first character of column 1.
                        separator_line[0] = separator_line[0].replace("#", "-")
                        # lengths[k] is the offset at which column k starts
                        # (each column is followed by one separating space).
                        lengths = [0] + list(
                            accumulate(len(s) + 1 for s in separator_line)
                        )
                    else:
                        header = line
                    continue

                if len(separator_line) < 4:
                    raise ValueError(
                        f"{self.input_file}: data row found before a separator "
                        "line with at least four columns"
                    )

                if not header_written:
                    file_out.write(header)
                    # Separator with the widths of columns 3,4 and 1,2 swapped;
                    # the "#" marker moves to the new first column.
                    file_out.write(
                        " ".join(
                            [
                                "#" + separator_line[2][1:],
                                separator_line[3],
                                separator_line[0],
                                separator_line[1],
                            ]
                            + separator_line[4:]
                        )
                    )
                    header_written = True

                file_out.write(
                    line[lengths[2] : lengths[3]]
                    + line[lengths[3] : lengths[4]]
                    + line[lengths[0] : lengths[1]]
                    + line[lengths[1] : lengths[2]]
                    # Everything after the fourth column is copied verbatim.
                    + line[lengths[4] :]
                )
104
+
105
+
106
def main():
    """
    Console entry point: read the options from ``sys.argv`` and convert the
    cmscan table named by ``--input`` into the file named by ``--output``.
    """
    options = parse_args(sys.argv[1:])
    TableModifier(
        input_file=options.input, output_file=options.output
    ).modify_table()
113
+
114
+
115
+ if __name__ == "__main__":
116
+ main()
@@ -0,0 +1,141 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+
18
+ import argparse
19
+ import os
20
+ from Bio import SeqIO
21
+ from mgnify_pipelines_toolkit.constants.ncrna import (
22
+ DIRECTORY_SEQ_CAT,
23
+ SSU,
24
+ LSU,
25
+ Seq5S,
26
+ Seq5_8S,
27
+ SSU_rRNA_archaea,
28
+ SSU_rRNA_bacteria,
29
+ SSU_rRNA_eukarya,
30
+ SSU_rRNA_microsporidia,
31
+ LSU_rRNA_archaea,
32
+ LSU_rRNA_bacteria,
33
+ LSU_rRNA_eukarya,
34
+ NON_CODING_RNA,
35
+ SSU_MODELS,
36
+ LSU_MODELS,
37
+ RFAM_MODELS,
38
+ )
39
+
40
+
41
def set_model_names(prefix, name, directory, separate_subunits):
    """
    Build the mapping from category / model name to output FASTA path.

    :param prefix: string prepended to the per-model file names only
    :param name: accession used in every file name
    :param directory: directory the paths are joined onto
    :param separate_subunits: when true, also add one entry per SSU/LSU model
    :return: dict of key -> file path
    """
    # Combined files: always present, and named without the prefix.
    pattern_dict = {
        SSU: os.path.join(directory, f"{name}_SSU.fasta"),
        LSU: os.path.join(directory, f"{name}_LSU.fasta"),
        Seq5S: os.path.join(directory, f"{name}_5S.fasta"),
        Seq5_8S: os.path.join(directory, f"{name}_5_8S.fasta"),
    }
    if not separate_subunits:
        return pattern_dict

    # Per-model files: "<prefix><name>_<model>.<RFAM_MODELS[model]>.fasta".
    per_model = (
        SSU_rRNA_archaea,
        SSU_rRNA_bacteria,
        SSU_rRNA_eukarya,
        SSU_rRNA_microsporidia,
        LSU_rRNA_archaea,
        LSU_rRNA_bacteria,
        LSU_rRNA_eukarya,
    )
    for model_name in per_model:
        pattern_dict[model_name] = os.path.join(
            directory,
            f"{prefix}{name}_{model_name}.{RFAM_MODELS[model_name]}.fasta",
        )
    return pattern_dict
77
+
78
+
79
def main():
    """
    Split a FASTA file of ncRNA hits into per-category FASTA files.

    Records whose model is in ``SSU_MODELS`` / ``LSU_MODELS`` go to the
    combined SSU / LSU file. In addition, a record whose model is in
    ``NON_CODING_RNA`` goes to that model's own file when
    ``set_model_names`` defined one, and any other record goes to
    ``<name>_other_ncRNA.fasta``. All files are created inside
    ``DIRECTORY_SEQ_CAT`` and only when a first record is written to them.
    """
    # Function-scope import so this entry point does not depend on a
    # file-level import being added.
    from contextlib import ExitStack

    parser = argparse.ArgumentParser(
        description="Extract lsu, ssu and 5s and other models"
    )
    parser.add_argument(
        "-i", "--input", dest="input", help="Input fasta file", required=True
    )
    parser.add_argument(
        "-p", "--prefix", dest="prefix", help="prefix for models", required=False
    )
    parser.add_argument("-n", "--name", dest="name", help="Accession", required=True)
    parser.add_argument(
        "--separate-subunits-by-models",
        action="store_true",
        help="Create separate files for each kingdon example: sample_SSU_rRNA_eukarya.RF01960.fasta",
    )

    args = parser.parse_args()
    prefix = args.prefix if args.prefix else ""
    name = args.name if args.name else "accession"

    directory = DIRECTORY_SEQ_CAT
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(directory, exist_ok=True)

    print("Start fasta mode")
    pattern_dict = set_model_names(
        prefix, name, directory, args.separate_subunits_by_models
    )
    other_ncrna = os.path.join(directory, f"{name}_other_ncRNA.fasta")

    # ExitStack closes every opened handle even if parsing or writing raises.
    with ExitStack() as stack:
        # Handles are cached by *file name*, not by model: every unrecognised
        # model shares the same "other" file, and reopening it with "w" for
        # each new model would truncate what was already written.
        open_files = {}

        def write_record(record, filename):
            """Append *record* to *filename*, opening the file on first use."""
            if filename not in open_files:
                open_files[filename] = stack.enter_context(open(filename, "w"))
            SeqIO.write(record, open_files[filename], "fasta")

        for record in SeqIO.parse(args.input, "fasta"):
            # Model = text after the last "-" in the part of the id before "/".
            # NOTE(review): an id ending in "mtPerm-5S" yields "5S" here, so it
            # may never match a constant that itself contains "-" — confirm
            # against real record ids.
            model = record.id.split("/")[0].split("-")[-1]

            if model in SSU_MODELS:
                write_record(record, pattern_dict[SSU])
            elif model in LSU_MODELS:
                write_record(record, pattern_dict[LSU])

            if model in NON_CODING_RNA:
                # None when no dedicated file was configured for this model.
                filename = pattern_dict.get(model)
            else:
                filename = other_ncrna
            if filename:
                write_record(record, filename)
138
+
139
+
140
+ if __name__ == "__main__":
141
+ main()
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
# Name of the output directory.
DIRECTORY_SEQ_CAT = "sequence-categorisation"

# Keys of the combined categories.
SSU, LSU = "SSU_rRNA", "LSU_rRNA"
Seq5S, Seq5_8S = "mtPerm-5S", "5_8S_rRNA"

# Individual small-subunit model names.
SSU_rRNA_archaea = "SSU_rRNA_archaea"
SSU_rRNA_bacteria = "SSU_rRNA_bacteria"
SSU_rRNA_eukarya = "SSU_rRNA_eukarya"
SSU_rRNA_microsporidia = "SSU_rRNA_microsporidia"

# Individual large-subunit model names.
LSU_rRNA_archaea = "LSU_rRNA_archaea"
LSU_rRNA_bacteria = "LSU_rRNA_bacteria"
LSU_rRNA_eukarya = "LSU_rRNA_eukarya"

SSU_MODELS = [
    SSU_rRNA_archaea,
    SSU_rRNA_bacteria,
    SSU_rRNA_eukarya,
    SSU_rRNA_microsporidia,
]

LSU_MODELS = [
    LSU_rRNA_archaea,
    LSU_rRNA_bacteria,
    LSU_rRNA_eukarya,
]

# All SSU models, then all LSU models, then the two standalone entries
# (a new list, independent of the two group lists).
NON_CODING_RNA = SSU_MODELS + LSU_MODELS + [Seq5S, Seq5_8S]

# Model name -> "RFxxxxx" accession string.
RFAM_MODELS = dict(
    [
        (SSU_rRNA_archaea, "RF01959"),
        (SSU_rRNA_bacteria, "RF00177"),
        (SSU_rRNA_eukarya, "RF01960"),
        (SSU_rRNA_microsporidia, "RF02542"),
        (LSU_rRNA_archaea, "RF02540"),
        (LSU_rRNA_bacteria, "RF02541"),
        (LSU_rRNA_eukarya, "RF02543"),
    ]
)
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 1.0.1
3
+ Version: 1.0.2
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -38,6 +38,7 @@ Requires-Dist: pre-commit==3.8.0; extra == "dev"
38
38
  Requires-Dist: black==24.8.0; extra == "dev"
39
39
  Requires-Dist: flake8==7.1.1; extra == "dev"
40
40
  Requires-Dist: pep8-naming==0.14.1; extra == "dev"
41
+ Dynamic: license-file
41
42
 
42
43
  # mgnify-pipelines-toolkit
43
44
 
@@ -31,6 +31,7 @@ mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py
31
31
  mgnify_pipelines_toolkit/analysis/assembly/go_utils.py
32
32
  mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py
33
33
  mgnify_pipelines_toolkit/analysis/shared/__init__.py
34
+ mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py
34
35
  mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py
35
36
  mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py
36
37
  mgnify_pipelines_toolkit/analysis/shared/get_subunits.py
@@ -40,6 +41,7 @@ mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py
40
41
  mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py
41
42
  mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py
42
43
  mgnify_pipelines_toolkit/constants/db_labels.py
44
+ mgnify_pipelines_toolkit/constants/ncrna.py
43
45
  mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py
44
46
  mgnify_pipelines_toolkit/constants/regex_fasta_header.py
45
47
  mgnify_pipelines_toolkit/constants/tax_ranks.py
@@ -5,6 +5,7 @@ assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_
5
5
  assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
6
6
  classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
7
7
  combined_gene_caller_merge = mgnify_pipelines_toolkit.analysis.assembly.combined_gene_caller_merge:main
8
+ convert_cmscan_to_cmsearch_tblout = mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main
8
9
  dwc_summary_generator = mgnify_pipelines_toolkit.analysis.assembly.dwc_summary_generator:main
9
10
  fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
10
11
  fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "mgnify_pipelines_toolkit"
3
- version = "1.0.1"
3
+ version = "1.0.2"
4
4
  readme = "README.md"
5
5
  license = {text = "Apache Software License 2.0"}
6
6
  authors = [
@@ -51,6 +51,7 @@ fastq_suffix_header_check = "mgnify_pipelines_toolkit.analysis.shared.fastq_suff
51
51
  library_strategy_check = "mgnify_pipelines_toolkit.analysis.shared.library_strategy_check:main"
52
52
  study_summary_generator = "mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:cli"
53
53
  markergene_study_summary = "mgnify_pipelines_toolkit.analysis.shared.markergene_study_summary:main"
54
+ convert_cmscan_to_cmsearch_tblout = "mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main"
54
55
  # analysis.amplicon
55
56
  are_there_primers = "mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main"
56
57
  assess_inflection_point_mcp = "mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main"
@@ -1,139 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
- import argparse
18
- import os
19
- from Bio import SeqIO
20
-
21
-
22
- SSU = "SSU_rRNA"
23
- LSU = "LSU_rRNA"
24
- Seq5S = "mtPerm-5S"
25
- Seq5_8S = "5_8S_rRNA"
26
-
27
- SSU_rRNA_archaea = "SSU_rRNA_archaea"
28
- SSU_rRNA_bacteria = "SSU_rRNA_bacteria"
29
- SSU_rRNA_eukarya = "SSU_rRNA_eukarya"
30
- SSU_rRNA_microsporidia = "SSU_rRNA_microsporidia"
31
-
32
- LSU_rRNA_archaea = "LSU_rRNA_archaea"
33
- LSU_rRNA_bacteria = "LSU_rRNA_bacteria"
34
- LSU_rRNA_eukarya = "LSU_rRNA_eukarya"
35
-
36
-
37
- def set_model_names(prefix, name, directory):
38
- pattern_dict = {}
39
- pattern_dict[SSU] = os.path.join(directory, f"{name}_SSU.fasta")
40
- pattern_dict[SSU_rRNA_archaea] = os.path.join(
41
- directory, f"{prefix}{name}_{SSU_rRNA_archaea}.RF01959.fa"
42
- )
43
- pattern_dict[SSU_rRNA_bacteria] = os.path.join(
44
- directory, f"{prefix}{name}_{SSU_rRNA_bacteria}.RF00177.fa"
45
- )
46
- pattern_dict[SSU_rRNA_eukarya] = os.path.join(
47
- directory, f"{prefix}{name}_{SSU_rRNA_eukarya}.RF01960.fa"
48
- )
49
- pattern_dict[SSU_rRNA_microsporidia] = os.path.join(
50
- directory, f"{prefix}{name}_{SSU_rRNA_microsporidia}.RF02542.fa"
51
- )
52
- pattern_dict[LSU] = os.path.join(directory, f"{name}_LSU.fasta")
53
- pattern_dict[LSU_rRNA_archaea] = os.path.join(
54
- directory, f"{prefix}{name}_{LSU_rRNA_archaea}.RF02540.fa"
55
- )
56
- pattern_dict[LSU_rRNA_bacteria] = os.path.join(
57
- directory, f"{prefix}{name}_{LSU_rRNA_bacteria}.RF02541.fa"
58
- )
59
- pattern_dict[LSU_rRNA_eukarya] = os.path.join(
60
- directory, f"{prefix}{name}_{LSU_rRNA_eukarya}.RF02543.fa"
61
- )
62
- pattern_dict[Seq5S] = os.path.join(directory, f"{name}_5S.fa")
63
- pattern_dict[Seq5_8S] = os.path.join(directory, f"{name}_5_8S.fa")
64
- return pattern_dict
65
-
66
-
67
- def main():
68
- parser = argparse.ArgumentParser(
69
- description="Extract lsu, ssu and 5s and other models"
70
- )
71
- parser.add_argument(
72
- "-i", "--input", dest="input", help="Input fasta file", required=True
73
- )
74
- parser.add_argument(
75
- "-p", "--prefix", dest="prefix", help="prefix for models", required=False
76
- )
77
- parser.add_argument("-n", "--name", dest="name", help="Accession", required=True)
78
-
79
- args = parser.parse_args()
80
- prefix = args.prefix if args.prefix else ""
81
- name = args.name if args.name else "accession"
82
-
83
- directory = "sequence-categorisation"
84
- if not os.path.exists(directory):
85
- os.makedirs(directory)
86
- directory_ncrna = os.path.join("sequence-categorisation", "ncRNA")
87
- if not os.path.exists(directory_ncrna):
88
- os.makedirs(directory_ncrna)
89
-
90
- print("Start fasta mode")
91
- pattern_dict = set_model_names(prefix, name, directory)
92
- coding_rna = [
93
- SSU_rRNA_archaea,
94
- SSU_rRNA_bacteria,
95
- SSU_rRNA_eukarya,
96
- SSU_rRNA_microsporidia,
97
- LSU_rRNA_archaea,
98
- LSU_rRNA_bacteria,
99
- LSU_rRNA_eukarya,
100
- Seq5S,
101
- Seq5_8S,
102
- ]
103
- open_files = {}
104
- for record in SeqIO.parse(args.input, "fasta"):
105
- model = "-".join(record.id.split("/")[0].split("-")[-1:])
106
- if model in coding_rna:
107
- filename = pattern_dict[model]
108
- else:
109
- filename = os.path.join(directory_ncrna, f"{prefix}{name}_{model}.fasta")
110
- if model not in open_files:
111
- file_out = open(filename, "w")
112
- open_files[model] = file_out
113
- SeqIO.write(record, open_files[model], "fasta")
114
-
115
- if model in (
116
- SSU_rRNA_archaea,
117
- SSU_rRNA_bacteria,
118
- SSU_rRNA_eukarya,
119
- SSU_rRNA_microsporidia,
120
- ):
121
- if SSU not in open_files:
122
- file_out = open(pattern_dict[SSU], "w")
123
- open_files[SSU] = file_out
124
- SeqIO.write(record, open_files[SSU], "fasta")
125
- if model in (LSU_rRNA_archaea, LSU_rRNA_bacteria, LSU_rRNA_eukarya):
126
- if LSU not in open_files:
127
- file_out = open(pattern_dict[LSU], "w")
128
- open_files[LSU] = file_out
129
- SeqIO.write(record, open_files[LSU], "fasta")
130
-
131
- for item in open_files:
132
- open_files[item].close()
133
-
134
- if len(os.listdir(directory_ncrna)) == 0:
135
- os.rmdir(directory_ncrna)
136
-
137
-
138
- if __name__ == "__main__":
139
- main()