mgnify-pipelines-toolkit 1.0.3__tar.gz → 1.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (62)
  1. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/PKG-INFO +19 -27
  2. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/README.md +2 -1
  3. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +5 -1
  4. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +84 -21
  5. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +11 -0
  6. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +25 -7
  7. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py +13 -9
  8. mgnify_pipelines_toolkit-1.0.5/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +211 -0
  9. mgnify_pipelines_toolkit-1.0.5/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py +162 -0
  10. mgnify_pipelines_toolkit-1.0.5/mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +230 -0
  11. mgnify_pipelines_toolkit-1.0.5/mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +119 -0
  12. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +6 -3
  13. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +1 -1
  14. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +19 -27
  15. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +4 -0
  16. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +5 -0
  17. mgnify_pipelines_toolkit-1.0.5/mgnify_pipelines_toolkit.egg-info/requires.txt +20 -0
  18. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/pyproject.toml +22 -26
  19. mgnify_pipelines_toolkit-1.0.3/mgnify_pipelines_toolkit.egg-info/requires.txt +0 -29
  20. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/LICENSE +0 -0
  21. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/__init__.py +0 -0
  22. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  23. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +0 -0
  24. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -0
  25. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -0
  26. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +0 -0
  27. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
  28. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -0
  29. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
  30. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
  31. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +0 -0
  32. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  33. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  34. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -0
  35. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +0 -0
  36. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +0 -0
  37. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +0 -0
  38. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +0 -0
  39. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +0 -0
  40. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/genomes/__init__.py +0 -0
  41. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  42. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +0 -0
  43. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
  44. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  45. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
  46. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  47. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +0 -0
  48. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py +0 -0
  49. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/db_labels.py +0 -0
  50. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/ncrna.py +0 -0
  51. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -0
  52. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
  53. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/tax_ranks.py +0 -0
  54. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/thresholds.py +1 -1
  55. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  56. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/schemas/schemas.py +0 -0
  57. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  58. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
  59. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
  60. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  61. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  62. {mgnify_pipelines_toolkit-1.0.3 → mgnify_pipelines_toolkit-1.0.5}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -11,33 +11,24 @@ Classifier: Operating System :: OS Independent
11
11
  Requires-Python: >=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
- Requires-Dist: biopython==1.82
15
- Requires-Dist: numpy==1.26.0
16
- Requires-Dist: pandas==2.0.2
17
- Requires-Dist: regex==2023.12.25
18
- Requires-Dist: requests==2.32.3
19
- Requires-Dist: click==8.1.7
20
- Requires-Dist: pandera==0.22.1
21
- Requires-Dist: pyfastx>=2.2.0
22
- Requires-Dist: intervaltree==3.1.0
14
+ Requires-Dist: biopython>=1.85
15
+ Requires-Dist: numpy<3,>=2.2.4
16
+ Requires-Dist: pandas<3,>=2.2.3
17
+ Requires-Dist: regex>=2024.11.6
18
+ Requires-Dist: requests<3,>=2.32.3
19
+ Requires-Dist: click<9,>=8.1.8
20
+ Requires-Dist: pandera<0.24,>=0.23.1
21
+ Requires-Dist: pyfastx<3,>=2.2.0
22
+ Requires-Dist: intervaltree<4,>=3.1.0
23
23
  Provides-Extra: tests
24
- Requires-Dist: pytest==7.4.0; extra == "tests"
25
- Requires-Dist: pytest-md==0.2.0; extra == "tests"
26
- Requires-Dist: pytest-workflow==2.0.1; extra == "tests"
27
- Requires-Dist: biopython==1.82; extra == "tests"
28
- Requires-Dist: pandas==2.0.2; extra == "tests"
29
- Requires-Dist: numpy==1.26.0; extra == "tests"
30
- Requires-Dist: regex==2023.12.25; extra == "tests"
31
- Requires-Dist: requests==2.32.3; extra == "tests"
32
- Requires-Dist: click==8.1.7; extra == "tests"
33
- Requires-Dist: pandera==0.22.1; extra == "tests"
34
- Requires-Dist: pyfastx>=2.2.0; extra == "tests"
24
+ Requires-Dist: pytest<9,>=8.3.5; extra == "tests"
25
+ Requires-Dist: pytest-md>=0.2.0; extra == "tests"
26
+ Requires-Dist: pytest-workflow==2.1.0; extra == "tests"
35
27
  Provides-Extra: dev
36
- Requires-Dist: mgnify_pipelines_toolkit[tests]; extra == "dev"
37
- Requires-Dist: pre-commit==3.8.0; extra == "dev"
38
- Requires-Dist: black==24.8.0; extra == "dev"
39
- Requires-Dist: flake8==7.1.1; extra == "dev"
40
- Requires-Dist: pep8-naming==0.14.1; extra == "dev"
28
+ Requires-Dist: pre-commit>=4.2.0; extra == "dev"
29
+ Requires-Dist: black>=25.1.0; extra == "dev"
30
+ Requires-Dist: flake8>=7.1.2; extra == "dev"
31
+ Requires-Dist: pep8-naming>=0.14.1; extra == "dev"
41
32
  Dynamic: license-file
42
33
 
43
34
  # mgnify-pipelines-toolkit
@@ -74,8 +65,9 @@ Before starting any development, you should do these few steps:
74
65
  - Clone the repo if you haven't already and create a feature branch from the `dev` branch (NOT `main`).
75
66
  - Create a virtual environment with the tool of your choice (i.e. `conda create --name my_new_env`)
76
67
  - Activate your new environment (i.e. `conda activate my_new_env`)
77
- - Install dev dependencies `pip install -e '.[dev]'`
68
+ - Install dev dependencies `pip install -e '.[tests,dev]'`
78
69
  - Install pre-commit hooks `pre-commit install`
70
+ - Run unit tests `pytest`
79
71
 
80
72
  When doing these steps above, you ensure that the code you add will be linted and formatted properly.
81
73
 
@@ -32,8 +32,9 @@ Before starting any development, you should do these few steps:
32
32
  - Clone the repo if you haven't already and create a feature branch from the `dev` branch (NOT `main`).
33
33
  - Create a virtual environment with the tool of your choice (i.e. `conda create --name my_new_env`)
34
34
  - Activate your new environment (i.e. `conda activate my_new_env`)
35
- - Install dev dependencies `pip install -e '.[dev]'`
35
+ - Install dev dependencies `pip install -e '.[tests,dev]'`
36
36
  - Install pre-commit hooks `pre-commit install`
37
+ - Run unit tests `pytest`
37
38
 
38
39
  When doing these steps above, you ensure that the code you add will be linted and formatted properly.
39
40
 
@@ -78,7 +78,11 @@ def main():
78
78
  "--output",
79
79
  required=True,
80
80
  type=Path,
81
- help="Output TSV file with columns: contig_id, protein_id, UniRef90 cluster, rhea_ids, CHEBI reaction participants",
81
+ help=(
82
+ "Output TSV file with columns: contig_id, protein_id, protein hash, "
83
+ "Rhea IDs, CHEBI reaction, reaction definition, 'top hit' if it is "
84
+ "the first hit for the protein"
85
+ ),
82
86
  )
83
87
  parser.add_argument(
84
88
  "-p",
@@ -17,8 +17,19 @@
17
17
 
18
18
  import re
19
19
  import sys
20
+ import fileinput
20
21
 
21
- from mgnify_pipelines_toolkit.constants.thresholds import EVALUE_CUTOFF_IPS, EVALUE_CUTOFF_EGGNOG
22
+ from mgnify_pipelines_toolkit.constants.thresholds import (
23
+ EVALUE_CUTOFF_IPS,
24
+ EVALUE_CUTOFF_EGGNOG,
25
+ )
26
+
27
+ DBCAN_CLASSES_DICT = {
28
+ "TC": "dbcan_transporter_classification",
29
+ "TF": "dbcan_transcription_factor",
30
+ "STP": "dbcan_signal_transduction_prot",
31
+ "CAZyme": "dbcan_prot_family",
32
+ }
22
33
 
23
34
 
24
35
  def get_iprs(ipr_annot):
@@ -26,7 +37,8 @@ def get_iprs(ipr_annot):
26
37
  antifams = list()
27
38
  if not ipr_annot:
28
39
  return iprs, antifams
29
- with open(ipr_annot) as f:
40
+ with fileinput.hook_compressed(ipr_annot, "r", encoding="utf-8") as f:
41
+
30
42
  for line in f:
31
43
  cols = line.strip().split("\t")
32
44
  protein = cols[0]
@@ -55,7 +67,8 @@ def get_eggnog(eggnog_annot):
55
67
  eggnogs = {}
56
68
  if not eggnog_annot:
57
69
  return eggnogs
58
- with open(eggnog_annot, "r") as f:
70
+ with fileinput.hook_compressed(eggnog_annot, "r", encoding="utf-8") as f:
71
+
59
72
  for line in f:
60
73
  line = line.rstrip()
61
74
  cols = line.split("\t")
@@ -104,7 +117,8 @@ def get_bgcs(bgc_file, prokka_gff, tool):
104
117
  return bgc_annotations
105
118
  # save positions of each BGC cluster to dictionary cluster_positions
106
119
  # and save the annotations to dictionary bgc_result
107
- with open(bgc_file, "r") as bgc_in:
120
+ with fileinput.hook_compressed(bgc_file, "r", encoding="utf-8") as bgc_in:
121
+
108
122
  for line in bgc_in:
109
123
  if not line.startswith("#"):
110
124
  (
@@ -138,7 +152,7 @@ def get_bgcs(bgc_file, prokka_gff, tool):
138
152
  type_value = ""
139
153
  as_product = ""
140
154
  for a in annotations.split(
141
- ";"
155
+ ";"
142
156
  ): # go through all parts of the annotation field
143
157
  if a.startswith("as_type="):
144
158
  type_value = a.split("=")[1]
@@ -170,9 +184,12 @@ def get_bgcs(bgc_file, prokka_gff, tool):
170
184
  {"bgc_function": type_value},
171
185
  )
172
186
  if as_product:
173
- tool_result[contig]["_".join([start_pos, end_pos])]["bgc_product"] = as_product
187
+ tool_result[contig]["_".join([start_pos, end_pos])][
188
+ "bgc_product"
189
+ ] = as_product
174
190
  # identify CDSs that fall into each of the clusters annotated by the BGC tool
175
- with open(prokka_gff, "r") as gff_in:
191
+ with fileinput.hook_compressed(prokka_gff, "r", encoding="utf-8") as gff_in:
192
+
176
193
  for line in gff_in:
177
194
  if not line.startswith("#"):
178
195
  matching_interval = ""
@@ -228,8 +245,9 @@ def get_bgcs(bgc_file, prokka_gff, tool):
228
245
  },
229
246
  )
230
247
  if "bgc_product" in tool_result[contig][matching_interval]:
231
- bgc_annotations[cds_id]["antismash_product"] = tool_result[contig][matching_interval][
232
- "bgc_product"]
248
+ bgc_annotations[cds_id]["antismash_product"] = tool_result[
249
+ contig
250
+ ][matching_interval]["bgc_product"]
233
251
  elif line.startswith("##FASTA"):
234
252
  break
235
253
  return bgc_annotations
@@ -239,7 +257,7 @@ def get_amr(amr_file):
239
257
  amr_annotations = {}
240
258
  if not amr_file:
241
259
  return amr_annotations
242
- with open(amr_file, "r") as f:
260
+ with fileinput.hook_compressed(amr_file, "r", encoding="utf-8") as f:
243
261
  for line in f:
244
262
  if line.startswith("Protein identifier"):
245
263
  continue
@@ -286,7 +304,7 @@ def get_dbcan(dbcan_file):
286
304
  substrates = dict()
287
305
  if not dbcan_file:
288
306
  return dbcan_annotations
289
- with open(dbcan_file, "r") as f:
307
+ with fileinput.hook_compressed(dbcan_file, "r", encoding="utf-8") as f:
290
308
  for line in f:
291
309
  if "predicted PUL" in line:
292
310
  annot_fields = line.strip().split("\t")[8].split(";")
@@ -314,13 +332,45 @@ def get_dbcan(dbcan_file):
314
332
  elif a.startswith("Parent"):
315
333
  parent = a.split("=")[1]
316
334
  dbcan_annotations[acc] = (
317
- "dbcan_prot_type={};dbcan_prot_family={};substrate_dbcan-pul={};substrate_dbcan-sub={}".format(
335
+ "dbcan_prot_type={};{}={};substrate_dbcan-pul={};substrate_dbcan-sub={}".format(
318
336
  prot_type,
337
+ DBCAN_CLASSES_DICT[prot_type],
319
338
  prot_fam,
320
339
  substrates[parent]["substrate_pul"],
321
340
  substrates[parent]["substrate_ecami"],
322
341
  )
323
342
  )
343
+
344
+ return dbcan_annotations
345
+
346
+
347
+ def get_dbcan_individual_cazys(dbcan_cazys_file):
348
+ dbcan_annotations = dict()
349
+ if not dbcan_cazys_file:
350
+ return dbcan_annotations
351
+ with fileinput.hook_compressed(dbcan_cazys_file, "r", encoding="utf-8") as f:
352
+ for line in f:
353
+ if line.startswith("#"):
354
+ continue
355
+ attributes = line.strip().split("\t")[8]
356
+ attributes_dict = dict(
357
+ re.split(r"(?<!\\)=", item)
358
+ for item in re.split(r"(?<!\\);", attributes.rstrip(";"))
359
+ )
360
+ if "num_tools" in attributes_dict and int(attributes_dict["num_tools"]) < 2:
361
+ continue # don't keep annotations supported by only one tool within dbcan
362
+ cds_pattern = r"\.CDS\d+$"
363
+ protein = re.sub(
364
+ cds_pattern, "", attributes_dict["ID"]
365
+ ) # remove the CDS number
366
+ annotation_text = "dbcan_prot_type=CAZyme;"
367
+ for field in ["protein_family", "substrate_dbcan-sub", "eC_number"]:
368
+ if field in attributes_dict:
369
+ annotation_text += (
370
+ f"{'dbcan_prot_family' if field == 'protein_family' else field}"
371
+ f"={attributes_dict[field]};"
372
+ )
373
+ dbcan_annotations[protein] = annotation_text.strip(";")
324
374
  return dbcan_annotations
325
375
 
326
376
 
@@ -329,7 +379,8 @@ def get_defense_finder(df_file):
329
379
  type_info = dict()
330
380
  if not df_file:
331
381
  return defense_finder_annotations
332
- with open(df_file, "r") as f:
382
+ with fileinput.hook_compressed(df_file, "r", encoding="utf-8") as f:
383
+
333
384
  for line in f:
334
385
  if "Anti-phage system" in line:
335
386
  annot_fields = line.strip().split("\t")[8].split(";")
@@ -366,6 +417,7 @@ def load_annotations(
366
417
  antismash_file,
367
418
  gecco_file,
368
419
  dbcan_file,
420
+ dbcan_cazys_file,
369
421
  defense_finder_file,
370
422
  pseudofinder_file,
371
423
  ):
@@ -376,6 +428,7 @@ def load_annotations(
376
428
  antismash_bgcs = get_bgcs(antismash_file, in_gff, tool="antismash")
377
429
  amr_annotations = get_amr(amr_file)
378
430
  dbcan_annotations = get_dbcan(dbcan_file)
431
+ dbcan_cazys_annotations = get_dbcan_individual_cazys(dbcan_cazys_file)
379
432
  defense_finder_annotations = get_defense_finder(defense_finder_file)
380
433
  pseudogenes = get_pseudogenes(pseudofinder_file)
381
434
  pseudogene_report_dict = dict()
@@ -384,7 +437,7 @@ def load_annotations(
384
437
  header = []
385
438
  fasta = []
386
439
  fasta_flag = False
387
- with open(in_gff) as f:
440
+ with fileinput.hook_compressed(in_gff, "r", encoding="utf-8") as f:
388
441
  for line in f:
389
442
  line = line.strip()
390
443
  if line[0] != "#" and not fasta_flag:
@@ -496,6 +549,11 @@ def load_annotations(
496
549
  added_annot[protein]["dbCAN"] = dbcan_annotations[protein]
497
550
  except KeyError:
498
551
  pass
552
+ try:
553
+ dbcan_cazys_annotations[protein]
554
+ added_annot[protein]["dbCAN"] = dbcan_cazys_annotations[protein]
555
+ except KeyError:
556
+ pass
499
557
  try:
500
558
  defense_finder_annotations[protein]
501
559
  added_annot[protein]["defense_finder"] = (
@@ -530,7 +588,7 @@ def load_annotations(
530
588
  def get_ncrnas(ncrnas_file):
531
589
  ncrnas = {}
532
590
  counts = 0
533
- with open(ncrnas_file, "r") as f:
591
+ with fileinput.hook_compressed(ncrnas_file, "r", encoding="utf-8") as f:
534
592
  for line in f:
535
593
  if not line.startswith("#"):
536
594
  cols = line.strip().split()
@@ -543,7 +601,9 @@ def get_ncrnas(ncrnas_file):
543
601
  # Skip tRNAs, we add them from tRNAscan-SE
544
602
  continue
545
603
  strand = cols[11]
546
- start, end = (int(cols[9]), int(cols[10])) if strand == "+" else (int(cols[10]), int(cols[9]))
604
+ start, end = int(cols[10]), int(cols[9])
605
+ if strand == "+":
606
+ start, end = end, start
547
607
  rna_feature_name, ncrna_class = prepare_rna_gff_fields(cols)
548
608
  annot = [
549
609
  "ID=" + locus,
@@ -718,7 +778,10 @@ def prepare_rna_gff_fields(cols):
718
778
  }
719
779
 
720
780
  if rna_feature_name == "ncRNA":
721
- ncrna_class = next((rna_type for rna_type, rfams in rna_types.items() if cols[2] in rfams), None)
781
+ ncrna_class = next(
782
+ (rna_type for rna_type, rfams in rna_types.items() if cols[2] in rfams),
783
+ None,
784
+ )
722
785
  if not ncrna_class:
723
786
  if "microRNA" in cols[-1]:
724
787
  ncrna_class = "pre_miRNA"
@@ -729,7 +792,7 @@ def prepare_rna_gff_fields(cols):
729
792
 
730
793
  def get_trnas(trnas_file):
731
794
  trnas = {}
732
- with open(trnas_file, "r") as f:
795
+ with fileinput.hook_compressed(trnas_file, "r", encoding="utf-8") as f:
733
796
  for line in f:
734
797
  if not line.startswith("#"):
735
798
  cols = line.split("\t")
@@ -738,13 +801,13 @@ def get_trnas(trnas_file):
738
801
  line = line.replace("tRNAscan-SE", "tRNAscan-SE:2.0.9")
739
802
  trnas.setdefault(contig, dict()).setdefault(
740
803
  int(start), list()
741
- ).append(line.strip())
804
+ ).append(line.strip().strip(";"))
742
805
  return trnas
743
806
 
744
807
 
745
808
  def load_crispr(crispr_file):
746
809
  crispr_annotations = dict()
747
- with open(crispr_file, "r") as f:
810
+ with fileinput.hook_compressed(crispr_file, "r", encoding="utf-8") as f:
748
811
  record = list()
749
812
  left_coord = ""
750
813
  loc_contig = ""
@@ -791,7 +854,7 @@ def get_pseudogenes(pseudofinder_file):
791
854
  pseudogenes = dict()
792
855
  if not pseudofinder_file:
793
856
  return pseudogenes
794
- with open(pseudofinder_file) as file_in:
857
+ with fileinput.hook_compressed(pseudofinder_file, "r", encoding="utf-8") as file_in:
795
858
  for line in file_in:
796
859
  if not line.startswith("#"):
797
860
  col9 = line.strip().split("\t")[8]
@@ -28,6 +28,17 @@ def write_results_to_file(
28
28
  contig_list = check_for_additional_keys(
29
29
  ncrnas, trnas, crispr_annotations, contig_list
30
30
  )
31
+ # sort contigs by digit at the end of contig/genome accession
32
+ if contig_list[0].startswith(
33
+ "MGYG"
34
+ ): # e.g. 'MGYG000500002_1', 'MGYG000500002_2', 'MGYG000500002_3'
35
+ contig_list = sorted(list(contig_list), key=lambda x: int(x.split("_")[-1]))
36
+ elif contig_list[0].startswith(
37
+ "ERZ"
38
+ ): # e.g. 'ERZ1049444', 'ERZ1049445', 'ERZ1049446'
39
+ contig_list = sorted(
40
+ list(contig_list), key=lambda x: int(x.split("ERZ")[-1])
41
+ )
31
42
  for contig in contig_list:
32
43
  sorted_pos_list = sort_positions(
33
44
  contig, main_gff_extended, ncrnas, trnas, crispr_annotations
@@ -17,8 +17,16 @@
17
17
 
18
18
  import argparse
19
19
 
20
- from gff_annotation_utils import get_ncrnas, get_trnas, load_annotations, load_crispr
21
- from gff_file_utils import write_results_to_file, print_pseudogene_report
20
+ from mgnify_pipelines_toolkit.analysis.assembly.gff_annotation_utils import (
21
+ get_ncrnas,
22
+ get_trnas,
23
+ load_annotations,
24
+ load_crispr,
25
+ )
26
+ from mgnify_pipelines_toolkit.analysis.assembly.gff_file_utils import (
27
+ write_results_to_file,
28
+ print_pseudogene_report,
29
+ )
22
30
 
23
31
 
24
32
  def main(
@@ -31,6 +39,7 @@ def main(
31
39
  antismash_file,
32
40
  gecco_file,
33
41
  dbcan_file,
42
+ dbcan_cazys_file,
34
43
  defense_finder_file,
35
44
  pseudofinder_file,
36
45
  rfam_file,
@@ -53,6 +62,7 @@ def main(
53
62
  antismash_file,
54
63
  gecco_file,
55
64
  dbcan_file,
65
+ dbcan_cazys_file,
56
66
  defense_finder_file,
57
67
  pseudofinder_file,
58
68
  )
@@ -66,7 +76,9 @@ def main(
66
76
  if crispr_file:
67
77
  crispr_annotations = load_crispr(crispr_file)
68
78
 
69
- write_results_to_file(outfile, header, main_gff_extended, fasta, ncrnas, trnas, crispr_annotations)
79
+ write_results_to_file(
80
+ outfile, header, main_gff_extended, fasta, ncrnas, trnas, crispr_annotations
81
+ )
70
82
  if pseudogene_report_file:
71
83
  print_pseudogene_report(pseudogene_report_dict, pseudogene_report_file)
72
84
 
@@ -74,7 +86,7 @@ def main(
74
86
  def parse_args():
75
87
  parser = argparse.ArgumentParser(
76
88
  description="The script extends a user-provided base GFF annotation file by incorporating "
77
- "information extracted from the user-provided outputs of supplementary annotation tools.",
89
+ "information extracted from the user-provided outputs of supplementary annotation tools.",
78
90
  )
79
91
  parser.add_argument(
80
92
  "-g",
@@ -124,7 +136,12 @@ def parse_args():
124
136
  )
125
137
  parser.add_argument(
126
138
  "--dbcan",
127
- help="The GFF file produced by dbCAN post-processing script",
139
+ help="The GFF file produced by dbCAN post-processing script that uses cluster annotations",
140
+ required=False,
141
+ )
142
+ parser.add_argument(
143
+ "--dbcan-cazys",
144
+ help="The GFF file produced by dbCAN-CAZYs post-processing script",
128
145
  required=False,
129
146
  )
130
147
  parser.add_argument(
@@ -149,7 +166,7 @@ def parse_args():
149
166
  return parser.parse_args()
150
167
 
151
168
 
152
- if __name__ == '__main__':
169
+ if __name__ == "__main__":
153
170
  args = parse_args()
154
171
  main(
155
172
  args.gff_input,
@@ -161,10 +178,11 @@ if __name__ == '__main__':
161
178
  args.antismash,
162
179
  args.gecco,
163
180
  args.dbcan,
181
+ args.dbcan_cazys,
164
182
  args.defense_finder,
165
183
  args.pseudofinder,
166
184
  args.rfam,
167
185
  args.trnascan,
168
186
  args.outfile,
169
187
  args.pseudogene_report,
170
- )
188
+ )
@@ -40,10 +40,12 @@ def import_nodes(nodes_dmp):
40
40
  taxid2rank = {}
41
41
 
42
42
  with open(nodes_dmp) as f1:
43
- reader = csv.reader(f1, delimiter="\t")
44
- for line in reader:
45
- taxid = line[0]
46
- rank = line[4]
43
+ for line in f1:
44
+ fields = [part.strip() for part in line.split("|")]
45
+ if len(fields) != 14:
46
+ raise ValueError(f"Unexpected number of columns in line: {line}")
47
+ taxid = fields[0]
48
+ rank = fields[2]
47
49
  taxid2rank[taxid] = rank
48
50
 
49
51
  return taxid2rank
@@ -54,11 +56,13 @@ def import_names(names_dmp):
54
56
  taxid2name = {}
55
57
 
56
58
  with open(names_dmp, newline="") as f1:
57
- reader = csv.reader(f1, delimiter="\t")
58
- for line in reader:
59
- if line[6] == "scientific name":
60
- taxid = line[0]
61
- name = line[2]
59
+ for line in f1:
60
+ fields = [part.strip() for part in line.split("|")]
61
+ if len(fields) != 5:
62
+ raise ValueError(f"Unexpected number of columns in line: {line}")
63
+ if fields[3] == "scientific name":
64
+ taxid = fields[0]
65
+ name = fields[1]
62
66
  taxid2name[taxid] = name
63
67
 
64
68
  return taxid2name