mgnify-pipelines-toolkit 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mgnify-pipelines-toolkit might be problematic.

Files changed (18)
  1. mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +5 -1
  2. mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +84 -21
  3. mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +11 -0
  4. mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +25 -7
  5. mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py +13 -9
  6. mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +211 -0
  7. mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py +162 -0
  8. mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +230 -0
  9. mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +119 -0
  10. mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +6 -3
  11. mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +1 -1
  12. mgnify_pipelines_toolkit/constants/thresholds.py +1 -1
  13. {mgnify_pipelines_toolkit-1.0.3.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/METADATA +19 -27
  14. {mgnify_pipelines_toolkit-1.0.3.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/RECORD +18 -14
  15. {mgnify_pipelines_toolkit-1.0.3.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/WHEEL +1 -1
  16. {mgnify_pipelines_toolkit-1.0.3.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/entry_points.txt +5 -0
  17. {mgnify_pipelines_toolkit-1.0.3.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/licenses/LICENSE +0 -0
  18. {mgnify_pipelines_toolkit-1.0.3.dist-info → mgnify_pipelines_toolkit-1.0.5.dist-info}/top_level.txt +0 -0

mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py
@@ -78,7 +78,11 @@ def main():
         "--output",
         required=True,
         type=Path,
-        help="Output TSV file with columns: contig_id, protein_id, UniRef90 cluster, rhea_ids, CHEBI reaction participants",
+        help=(
+            "Output TSV file with columns: contig_id, protein_id, protein hash, "
+            "Rhea IDs, CHEBI reaction, reaction definition, 'top hit' if it is "
+            "the first hit for the protein"
+        ),
     )
     parser.add_argument(
         "-p",

mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py
@@ -17,8 +17,19 @@
 
 import re
 import sys
+import fileinput
 
-from mgnify_pipelines_toolkit.constants.thresholds import EVALUE_CUTOFF_IPS, EVALUE_CUTOFF_EGGNOG
+from mgnify_pipelines_toolkit.constants.thresholds import (
+    EVALUE_CUTOFF_IPS,
+    EVALUE_CUTOFF_EGGNOG,
+)
+
+DBCAN_CLASSES_DICT = {
+    "TC": "dbcan_transporter_classification",
+    "TF": "dbcan_transcription_factor",
+    "STP": "dbcan_signal_transduction_prot",
+    "CAZyme": "dbcan_prot_family",
+}
 
 
 def get_iprs(ipr_annot):
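
Note: the recurring change in this file swaps open() for fileinput.hook_compressed(), which dispatches on the file extension (.gz goes to gzip.open, .bz2 to bz2.open, anything else to plain open), so the same parsers accept compressed or uncompressed inputs. A minimal sketch of the pattern, assuming Python 3.10+ (where hook_compressed gained the encoding keyword) and a hypothetical annotations.tsv.gz:

import fileinput

# .gz -> gzip.open, .bz2 -> bz2.open, other extensions -> plain open();
# in text mode the binary stream is wrapped in io.TextIOWrapper with the
# requested encoding, so iteration yields str lines either way.
with fileinput.hook_compressed("annotations.tsv.gz", "r", encoding="utf-8") as f:
    for line in f:
        cols = line.strip().split("\t")
        print(cols[0])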
@@ -26,7 +37,8 @@ def get_iprs(ipr_annot):
     antifams = list()
     if not ipr_annot:
         return iprs, antifams
-    with open(ipr_annot) as f:
+    with fileinput.hook_compressed(ipr_annot, "r", encoding="utf-8") as f:
+
         for line in f:
             cols = line.strip().split("\t")
             protein = cols[0]
@@ -55,7 +67,8 @@ def get_eggnog(eggnog_annot):
     eggnogs = {}
     if not eggnog_annot:
         return eggnogs
-    with open(eggnog_annot, "r") as f:
+    with fileinput.hook_compressed(eggnog_annot, "r", encoding="utf-8") as f:
+
         for line in f:
             line = line.rstrip()
             cols = line.split("\t")
@@ -104,7 +117,8 @@ def get_bgcs(bgc_file, prokka_gff, tool):
         return bgc_annotations
     # save positions of each BGC cluster to dictionary cluster_positions
    # and save the annotations to dictionary bgc_result
-    with open(bgc_file, "r") as bgc_in:
+    with fileinput.hook_compressed(bgc_file, "r", encoding="utf-8") as bgc_in:
+
         for line in bgc_in:
             if not line.startswith("#"):
                 (
@@ -138,7 +152,7 @@ def get_bgcs(bgc_file, prokka_gff, tool):
                 type_value = ""
                 as_product = ""
                 for a in annotations.split(
-                        ";"
+                    ";"
                 ):  # go through all parts of the annotation field
                     if a.startswith("as_type="):
                         type_value = a.split("=")[1]
@@ -170,9 +184,12 @@ def get_bgcs(bgc_file, prokka_gff, tool):
                     {"bgc_function": type_value},
                 )
                 if as_product:
-                    tool_result[contig]["_".join([start_pos, end_pos])]["bgc_product"] = as_product
+                    tool_result[contig]["_".join([start_pos, end_pos])][
+                        "bgc_product"
+                    ] = as_product
     # identify CDSs that fall into each of the clusters annotated by the BGC tool
-    with open(prokka_gff, "r") as gff_in:
+    with fileinput.hook_compressed(prokka_gff, "r", encoding="utf-8") as gff_in:
+
         for line in gff_in:
             if not line.startswith("#"):
                 matching_interval = ""
@@ -228,8 +245,9 @@ def get_bgcs(bgc_file, prokka_gff, tool):
                         },
                     )
                     if "bgc_product" in tool_result[contig][matching_interval]:
-                        bgc_annotations[cds_id]["antismash_product"] = tool_result[contig][matching_interval][
-                            "bgc_product"]
+                        bgc_annotations[cds_id]["antismash_product"] = tool_result[
+                            contig
+                        ][matching_interval]["bgc_product"]
             elif line.startswith("##FASTA"):
                 break
     return bgc_annotations
@@ -239,7 +257,7 @@ def get_amr(amr_file):
     amr_annotations = {}
     if not amr_file:
         return amr_annotations
-    with open(amr_file, "r") as f:
+    with fileinput.hook_compressed(amr_file, "r", encoding="utf-8") as f:
         for line in f:
             if line.startswith("Protein identifier"):
                 continue
@@ -286,7 +304,7 @@ def get_dbcan(dbcan_file):
     substrates = dict()
     if not dbcan_file:
         return dbcan_annotations
-    with open(dbcan_file, "r") as f:
+    with fileinput.hook_compressed(dbcan_file, "r", encoding="utf-8") as f:
         for line in f:
             if "predicted PUL" in line:
                 annot_fields = line.strip().split("\t")[8].split(";")
@@ -314,13 +332,45 @@
                     elif a.startswith("Parent"):
                         parent = a.split("=")[1]
                 dbcan_annotations[acc] = (
-                    "dbcan_prot_type={};dbcan_prot_family={};substrate_dbcan-pul={};substrate_dbcan-sub={}".format(
+                    "dbcan_prot_type={};{}={};substrate_dbcan-pul={};substrate_dbcan-sub={}".format(
                         prot_type,
+                        DBCAN_CLASSES_DICT[prot_type],
                         prot_fam,
                         substrates[parent]["substrate_pul"],
                         substrates[parent]["substrate_ecami"],
                     )
                 )
+
+    return dbcan_annotations
+
+
+def get_dbcan_individual_cazys(dbcan_cazys_file):
+    dbcan_annotations = dict()
+    if not dbcan_cazys_file:
+        return dbcan_annotations
+    with fileinput.hook_compressed(dbcan_cazys_file, "r", encoding="utf-8") as f:
+        for line in f:
+            if line.startswith("#"):
+                continue
+            attributes = line.strip().split("\t")[8]
+            attributes_dict = dict(
+                re.split(r"(?<!\\)=", item)
+                for item in re.split(r"(?<!\\);", attributes.rstrip(";"))
+            )
+            if "num_tools" in attributes_dict and int(attributes_dict["num_tools"]) < 2:
+                continue  # don't keep annotations supported by only one tool within dbcan
+            cds_pattern = r"\.CDS\d+$"
+            protein = re.sub(
+                cds_pattern, "", attributes_dict["ID"]
+            )  # remove the CDS number
+            annotation_text = "dbcan_prot_type=CAZyme;"
+            for field in ["protein_family", "substrate_dbcan-sub", "eC_number"]:
+                if field in attributes_dict:
+                    annotation_text += (
+                        f"{'dbcan_prot_family' if field == 'protein_family' else field}"
+                        f"={attributes_dict[field]};"
+                    )
+            dbcan_annotations[protein] = annotation_text.strip(";")
     return dbcan_annotations
 
 
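Note: get_dbcan_individual_cazys() splits the GFF attributes column with negative-lookbehind patterns so that backslash-escaped ';' and '=' inside values are not treated as delimiters. A standalone sketch of that parsing step on a made-up attributes string:

import re

# Hypothetical 9th-column value; the escaped '\;' inside 'note' must survive.
attributes = "ID=MGYG000500002_1.CDS1;protein_family=GH5;note=a\\;b"
attributes_dict = dict(
    re.split(r"(?<!\\)=", item)  # key/value on unescaped '='
    for item in re.split(r"(?<!\\);", attributes.rstrip(";"))  # fields on unescaped ';'
)
print(attributes_dict["note"])  # -> a\;b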
@@ -329,7 +379,8 @@ def get_defense_finder(df_file):
     type_info = dict()
     if not df_file:
         return defense_finder_annotations
-    with open(df_file, "r") as f:
+    with fileinput.hook_compressed(df_file, "r", encoding="utf-8") as f:
+
         for line in f:
             if "Anti-phage system" in line:
                 annot_fields = line.strip().split("\t")[8].split(";")
@@ -366,6 +417,7 @@ def load_annotations(
     antismash_file,
     gecco_file,
     dbcan_file,
+    dbcan_cazys_file,
     defense_finder_file,
     pseudofinder_file,
 ):
@@ -376,6 +428,7 @@ def load_annotations(
     antismash_bgcs = get_bgcs(antismash_file, in_gff, tool="antismash")
     amr_annotations = get_amr(amr_file)
     dbcan_annotations = get_dbcan(dbcan_file)
+    dbcan_cazys_annotations = get_dbcan_individual_cazys(dbcan_cazys_file)
     defense_finder_annotations = get_defense_finder(defense_finder_file)
     pseudogenes = get_pseudogenes(pseudofinder_file)
     pseudogene_report_dict = dict()
@@ -384,7 +437,7 @@ def load_annotations(
     header = []
     fasta = []
     fasta_flag = False
-    with open(in_gff) as f:
+    with fileinput.hook_compressed(in_gff, "r", encoding="utf-8") as f:
         for line in f:
             line = line.strip()
             if line[0] != "#" and not fasta_flag:
@@ -496,6 +549,11 @@ def load_annotations(
                 added_annot[protein]["dbCAN"] = dbcan_annotations[protein]
             except KeyError:
                 pass
+            try:
+                dbcan_cazys_annotations[protein]
+                added_annot[protein]["dbCAN"] = dbcan_cazys_annotations[protein]
+            except KeyError:
+                pass
             try:
                 defense_finder_annotations[protein]
                 added_annot[protein]["defense_finder"] = (
@@ -530,7 +588,7 @@
 def get_ncrnas(ncrnas_file):
     ncrnas = {}
     counts = 0
-    with open(ncrnas_file, "r") as f:
+    with fileinput.hook_compressed(ncrnas_file, "r", encoding="utf-8") as f:
         for line in f:
             if not line.startswith("#"):
                 cols = line.strip().split()
@@ -543,7 +601,9 @@ def get_ncrnas(ncrnas_file):
                 # Skip tRNAs, we add them from tRNAscan-SE
                 continue
             strand = cols[11]
-            start, end = (int(cols[9]), int(cols[10])) if strand == "+" else (int(cols[10]), int(cols[9]))
+            start, end = int(cols[10]), int(cols[9])
+            if strand == "+":
+                start, end = end, start
             rna_feature_name, ncrna_class = prepare_rna_gff_fields(cols)
             annot = [
                 "ID=" + locus,
@@ -718,7 +778,10 @@ def prepare_rna_gff_fields(cols):
     }
 
     if rna_feature_name == "ncRNA":
-        ncrna_class = next((rna_type for rna_type, rfams in rna_types.items() if cols[2] in rfams), None)
+        ncrna_class = next(
+            (rna_type for rna_type, rfams in rna_types.items() if cols[2] in rfams),
+            None,
+        )
         if not ncrna_class:
             if "microRNA" in cols[-1]:
                 ncrna_class = "pre_miRNA"
@@ -729,7 +792,7 @@
 
 def get_trnas(trnas_file):
     trnas = {}
-    with open(trnas_file, "r") as f:
+    with fileinput.hook_compressed(trnas_file, "r", encoding="utf-8") as f:
         for line in f:
             if not line.startswith("#"):
                 cols = line.split("\t")
@@ -738,13 +801,13 @@ def get_trnas(trnas_file):
             line = line.replace("tRNAscan-SE", "tRNAscan-SE:2.0.9")
             trnas.setdefault(contig, dict()).setdefault(
                 int(start), list()
-            ).append(line.strip())
+            ).append(line.strip().strip(";"))
     return trnas
 
 
 def load_crispr(crispr_file):
     crispr_annotations = dict()
-    with open(crispr_file, "r") as f:
+    with fileinput.hook_compressed(crispr_file, "r", encoding="utf-8") as f:
         record = list()
         left_coord = ""
         loc_contig = ""
@@ -791,7 +854,7 @@ def get_pseudogenes(pseudofinder_file):
     pseudogenes = dict()
     if not pseudofinder_file:
         return pseudogenes
-    with open(pseudofinder_file) as file_in:
+    with fileinput.hook_compressed(pseudofinder_file, "r", encoding="utf-8") as file_in:
         for line in file_in:
             if not line.startswith("#"):
                 col9 = line.strip().split("\t")[8]

mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py
@@ -28,6 +28,17 @@ def write_results_to_file(
     contig_list = check_for_additional_keys(
         ncrnas, trnas, crispr_annotations, contig_list
     )
+    # sort contigs by digit at the end of contig/genome accession
+    if contig_list[0].startswith(
+        "MGYG"
+    ):  # e.g. 'MGYG000500002_1', 'MGYG000500002_2', 'MGYG000500002_3'
+        contig_list = sorted(list(contig_list), key=lambda x: int(x.split("_")[-1]))
+    elif contig_list[0].startswith(
+        "ERZ"
+    ):  # e.g. 'ERZ1049444', 'ERZ1049445', 'ERZ1049446'
+        contig_list = sorted(
+            list(contig_list), key=lambda x: int(x.split("ERZ")[-1])
+        )
     for contig in contig_list:
         sorted_pos_list = sort_positions(
             contig, main_gff_extended, ncrnas, trnas, crispr_annotations
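
Note: the new sorting block in write_results_to_file() orders contigs numerically on the trailing digits of the accession instead of lexicographically. A quick illustration of the two sort keys on sample accessions (lexicographic order would put _10 before _2):

# MGYG contigs sort on the suffix after the last underscore,
# ERZ assembly accessions on the digits after the 'ERZ' prefix.
mgyg = ["MGYG000500002_10", "MGYG000500002_2", "MGYG000500002_1"]
erz = ["ERZ1049446", "ERZ1049444", "ERZ1049445"]
print(sorted(mgyg, key=lambda x: int(x.split("_")[-1])))
# ['MGYG000500002_1', 'MGYG000500002_2', 'MGYG000500002_10']
print(sorted(erz, key=lambda x: int(x.split("ERZ")[-1])))
# ['ERZ1049444', 'ERZ1049445', 'ERZ1049446']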

mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py
@@ -17,8 +17,16 @@
 
 import argparse
 
-from gff_annotation_utils import get_ncrnas, get_trnas, load_annotations, load_crispr
-from gff_file_utils import write_results_to_file, print_pseudogene_report
+from mgnify_pipelines_toolkit.analysis.assembly.gff_annotation_utils import (
+    get_ncrnas,
+    get_trnas,
+    load_annotations,
+    load_crispr,
+)
+from mgnify_pipelines_toolkit.analysis.assembly.gff_file_utils import (
+    write_results_to_file,
+    print_pseudogene_report,
+)
 
 
 def main(
  def main(
@@ -31,6 +39,7 @@ def main(
31
39
  antismash_file,
32
40
  gecco_file,
33
41
  dbcan_file,
42
+ dbcan_cazys_file,
34
43
  defense_finder_file,
35
44
  pseudofinder_file,
36
45
  rfam_file,
@@ -53,6 +62,7 @@ def main(
         antismash_file,
         gecco_file,
         dbcan_file,
+        dbcan_cazys_file,
         defense_finder_file,
         pseudofinder_file,
     )
@@ -66,7 +76,9 @@ def main(
     if crispr_file:
         crispr_annotations = load_crispr(crispr_file)
 
-    write_results_to_file(outfile, header, main_gff_extended, fasta, ncrnas, trnas, crispr_annotations)
+    write_results_to_file(
+        outfile, header, main_gff_extended, fasta, ncrnas, trnas, crispr_annotations
+    )
     if pseudogene_report_file:
         print_pseudogene_report(pseudogene_report_dict, pseudogene_report_file)
 
@@ -74,7 +86,7 @@ def main(
 def parse_args():
     parser = argparse.ArgumentParser(
         description="The script extends a user-provided base GFF annotation file by incorporating "
-        "information extracted from the user-provided outputs of supplementary annotation tools.",
+                    "information extracted from the user-provided outputs of supplementary annotation tools.",
     )
     parser.add_argument(
         "-g",
@@ -124,7 +136,12 @@ def parse_args():
     )
     parser.add_argument(
         "--dbcan",
-        help="The GFF file produced by dbCAN post-processing script",
+        help="The GFF file produced by dbCAN post-processing script that uses cluster annotations",
+        required=False,
+    )
+    parser.add_argument(
+        "--dbcan-cazys",
+        help="The GFF file produced by dbCAN-CAZYs post-processing script",
         required=False,
     )
     parser.add_argument(
@@ -149,7 +166,7 @@ def parse_args():
     return parser.parse_args()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     args = parse_args()
     main(
         args.gff_input,
@@ -161,10 +178,11 @@ if __name__ == '__main__':
         args.antismash,
         args.gecco,
         args.dbcan,
+        args.dbcan_cazys,
         args.defense_finder,
         args.pseudofinder,
         args.rfam,
         args.trnascan,
         args.outfile,
         args.pseudogene_report,
-        )
+    )

mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py
@@ -40,10 +40,12 @@ def import_nodes(nodes_dmp):
     taxid2rank = {}
 
     with open(nodes_dmp) as f1:
-        reader = csv.reader(f1, delimiter="\t")
-        for line in reader:
-            taxid = line[0]
-            rank = line[4]
+        for line in f1:
+            fields = [part.strip() for part in line.split("|")]
+            if len(fields) != 14:
+                raise ValueError(f"Unexpected number of columns in line: {line}")
+            taxid = fields[0]
+            rank = fields[2]
             taxid2rank[taxid] = rank
 
     return taxid2rank
@@ -54,11 +56,13 @@ def import_names(names_dmp):
     taxid2name = {}
 
     with open(names_dmp, newline="") as f1:
-        reader = csv.reader(f1, delimiter="\t")
-        for line in reader:
-            if line[6] == "scientific name":
-                taxid = line[0]
-                name = line[2]
+        for line in f1:
+            fields = [part.strip() for part in line.split("|")]
+            if len(fields) != 5:
+                raise ValueError(f"Unexpected number of columns in line: {line}")
+            if fields[3] == "scientific name":
+                taxid = fields[0]
+                name = fields[1]
                 taxid2name[taxid] = name
 
     return taxid2name
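
Note: both import functions drop csv.reader in favour of splitting on '|', which matches the NCBI taxonomy dump format: fields are delimited by "\t|\t" and each line ends with "\t|", so splitting on tabs alone mislocates columns. A sketch on a names.dmp-style line (the taxid 2 / Bacteria record is used here for illustration):

# Splitting on '|' yields 5 parts for names.dmp (the last one empty),
# matching the len(fields) != 5 sanity check in import_names().
line = "2\t|\tBacteria\t|\tBacteria <bacteria>\t|\tscientific name\t|"
fields = [part.strip() for part in line.split("|")]
assert len(fields) == 5
if fields[3] == "scientific name":
    print(fields[0], fields[1])  # -> 2 Bacteria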

mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py (new file)
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2023-2025 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import fileinput
+import logging
+from pathlib import Path
+import re
+
+logging.basicConfig(level=logging.INFO)
+
+
+def main(hmm_file, overview_file, genome_gff, outfile, dbcan_version):
+
+    hmm_path = Path(hmm_file)
+    overview_path = Path(overview_file)
+
+    if not hmm_path.is_file():
+        raise FileNotFoundError(f"Input hmm path does not exist: {hmm_file}")
+
+    if not overview_path.is_file():
+        raise FileNotFoundError(f"Input overview path does not exist: {overview_file}")
+
+    substrates = load_substrates(hmm_path)
+    genome_gff_lines = load_gff(genome_gff)
+
+    print_gff(overview_file, outfile, dbcan_version, substrates, genome_gff_lines)
+
+
+def load_gff(gff):
+    genome_gff_lines = dict()
+    with fileinput.hook_compressed(gff, "rt") as gff:
+        for line in gff:
+            if line.startswith("##FASTA"):
+                return genome_gff_lines
+
+            fields = line.strip().split("\t")
+            if len(fields) != 9 or fields[2] != "CDS":
+                continue
+
+            if "Parent=" in line:
+                # Get transcript name from the 9th column for mettannotator
+                match = re.search(r"Parent=([^;]+)", fields[8])
+            elif "ID=" in line:
+                # Get transcript name from the 9th column for ASA
+                match = re.search(r"ID=([^;]+)", fields[8])
+            else:
+                logging.error(
+                    "Not sure what gff annotation delimiter is in use. Exiting"
+                )
+                exit(1)
+
+            transcript_name = match.group(1)
+            genome_gff_lines.setdefault(transcript_name, []).append(line)
+    return genome_gff_lines
+
+
+def print_gff(overview_file, outfile, dbcan_version, substrates, genome_gff_lines):
+    with open(outfile, "w") as file_out:
+        file_out.write("##gff-version 3\n")
+        with fileinput.hook_compressed(overview_file, "rt") as file_in:
+            for line in file_in:
+                if line.startswith("MGYG") or line.startswith("ERZ"):
+                    (
+                        transcript,
+                        ec_number_raw,
+                        dbcan_hmmer,
+                        dbcan_sub_ecami,
+                        diamond,
+                        num_of_tools,
+                    ) = line.strip().split("\t")
+                    # EC is reported as 2.4.99.-:5 with :5 meaning 5 proteins in the subfamily have EC 2.4.99.-
+
+                    ec_number = ""
+                    ec_list = ec_number_raw.split("|")
+                    for ec in ec_list:
+                        if ec != "-":
+                            ec_number += ec.split(":")[0] + "|"
+
+                    ec_number = ec_number.strip("|")
+
+                    # Dbcan recommends to use subfamily preference as dbcan_hmmer > dbcan_sub_ecami > diamond
+                    # diamond is messier, so we don't report it here
+                    if dbcan_hmmer != "-":
+                        # the field dbcan_hmmer reports match positions in parentheses, clear them out first:
+                        subfamily = dbcan_hmmer.split("(")[0]
+                    elif dbcan_sub_ecami != "-":
+                        subfamily = dbcan_sub_ecami
+                    else:
+                        continue
+                    cleaned_substrates = ",".join(
+                        sorted(
+                            {
+                                subsrate.strip()
+                                for subsrate in substrates.get(transcript, "N/A").split(
+                                    ","
+                                )
+                            }
+                        )
+                    )
+                    # Assemble information to add to the 9th column
+                    col9_parts = [
+                        f"protein_family={subfamily}",
+                        f"substrate_dbcan-sub={cleaned_substrates}",
+                    ]
+
+                    if ec_number:
+                        col9_parts.append(f"eC_number={ec_number}")
+
+                    col9_parts.append(f"num_tools={num_of_tools}")
+                    col9_text = ";".join(col9_parts)
+
+                    for gff_line in genome_gff_lines[transcript]:
+                        fields = gff_line.strip().split("\t")
+                        # Replace the tool
+                        fields[1] = f"dbCAN:{dbcan_version}"
+                        # Replace the feature
+                        fields[2] = "CAZyme"
+                        # Replace the confidence value
+                        fields[5] = "."
+                        # Keep only the ID in the 9th column
+                        attributes = fields[8].split(";")[0]
+                        # Add dbcan information to the 9th column
+                        attributes = f"{attributes};{col9_text};"
+                        fields[8] = attributes
+                        file_out.write("\t".join(fields) + "\n")
+
+
+def load_substrates(hmm_path):
+    substrates = dict()
+    with fileinput.hook_compressed(hmm_path, "rt") as file_in:
+        header = next(file_in)
+        header_fields = header.strip().split("\t")
+        substrate_idx = header_fields.index("Substrate")
+        gene_idx = header_fields.index("Gene ID")
+        evalue_idx = header_fields.index("E Value")
+        for line in file_in:
+            fields = line.strip().split("\t")
+            if float(fields[evalue_idx]) < 1e-15:  # evalue is the default from dbcan
+                substrate = fields[substrate_idx]
+                if not substrate == "-":
+                    gene_id = fields[gene_idx]
+                    substrates.setdefault(gene_id, []).append(substrate)
+    # resolve cases with multiple substrates
+    for gene_id, substrate_list in substrates.items():
+        substrate_list = list(set(substrate_list))
+        if len(substrate_list) == 1:
+            substrates[gene_id] = substrate_list[0]
+        else:
+            substrates[gene_id] = ",".join(substrate_list)
+    return substrates
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description=(
+            "The script takes dbCAN output for a eukaryotic genome and parses it to create a standalone GFF."
+        )
+    )
+    parser.add_argument(
+        "-hmm",
+        dest="hmm_file",
+        required=True,
+        help="Path to the hmm file.",
+    )
+    parser.add_argument(
+        "-ov",
+        dest="overview_file",
+        required=True,
+        help="Path to the overview file.",
+    )
+    parser.add_argument(
+        "-g",
+        dest="genome_gff",
+        required=True,
+        help="Path to the genome GFF.",
+    )
+    parser.add_argument(
+        "-o",
+        dest="outfile",
+        required=True,
+        help="Path to the output file.",
+    )
+    parser.add_argument(
+        "-v",
+        dest="dbcan_ver",
+        required=True,
+        help="dbCAN version used.",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(
+        args.hmm_file, args.overview_file, args.genome_gff, args.outfile, args.dbcan_ver
+    )
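
Note: in print_gff() the dbCAN-sub EC column packs a protein count after a colon ("2.4.99.-:5" means five proteins in the subfamily carry EC 2.4.99.-) and joins alternatives with '|'. A standalone sketch of the cleanup step on a made-up raw value:

# Hypothetical EC field from the dbCAN overview table.
ec_number_raw = "2.4.99.-:5|3.2.1.4:2|-"
ec_number = ""
for ec in ec_number_raw.split("|"):
    if ec != "-":
        ec_number += ec.split(":")[0] + "|"  # drop the :count suffix
print(ec_number.strip("|"))  # -> 2.4.99.-|3.2.1.4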