mgnify-pipelines-toolkit 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.

This version of mgnify-pipelines-toolkit has been flagged as potentially problematic.

@@ -0,0 +1,829 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2025 EMBL - European Bioinformatics Institute
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the 'License');
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an 'AS IS' BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+
18
+ import re
19
+ import sys
20
+
21
+ from mgnify_pipelines_toolkit.constants.thresholds import EVALUE_CUTOFF_IPS, EVALUE_CUTOFF_EGGNOG
22
+
23
+
24
+ def get_iprs(ipr_annot):
25
+ iprs = {}
26
+ antifams = list()
27
+ if not ipr_annot:
28
+ return iprs, antifams
29
+ with open(ipr_annot) as f:
30
+ for line in f:
31
+ cols = line.strip().split("\t")
32
+ protein = cols[0]
33
+ try:
34
+ evalue = float(cols[8])
35
+ except ValueError:
36
+ continue
37
+ if evalue > EVALUE_CUTOFF_IPS:
38
+ continue
39
+ if cols[3] == "AntiFam":
40
+ antifams.append(protein)
41
+ continue
42
+ if protein not in iprs:
43
+ iprs[protein] = [set(), set()]
44
+ if cols[3] == "Pfam":
45
+ pfam = cols[4]
46
+ iprs[protein][0].add(pfam)
47
+ if len(cols) > 12:
48
+ ipr = cols[11]
49
+ if not ipr == "-":
50
+ iprs[protein][1].add(ipr)
51
+ return iprs, antifams
52
+
53
+
54
+ def get_eggnog(eggnog_annot):
55
+ eggnogs = {}
56
+ if not eggnog_annot:
57
+ return eggnogs
58
+ with open(eggnog_annot, "r") as f:
59
+ for line in f:
60
+ line = line.rstrip()
61
+ cols = line.split("\t")
62
+ if line.startswith("#"):
63
+ eggnog_fields = get_eggnog_fields(line)
64
+ else:
65
+ try:
66
+ evalue = float(cols[2])
67
+ except ValueError:
68
+ continue
69
+ if evalue > EVALUE_CUTOFF_EGGNOG:
70
+ continue
71
+ protein = cols[0]
72
+ eggnog = [cols[1]]
73
+
74
+ cog = list(cols[eggnog_fields["cog_func"]])
75
+ if len(cog) > 1:
76
+ cog = ["R"]
77
+
78
+ kegg = cols[eggnog_fields["KEGG_ko"]].split(",")
79
+ go = cols[eggnog_fields["GOs"]].split(",")
80
+ eggnogs[protein] = [eggnog, cog, kegg, go]
81
+ return eggnogs
82
+
83
+
84
+ def get_eggnog_fields(line):
85
+ cols = line.strip().split("\t")
86
+ try:
87
+ index_of_go = cols.index("GOs")
88
+ except ValueError:
89
+ sys.exit("Cannot find the GO terms column.")
90
+ if cols[8] == "KEGG_ko" and cols[15] == "CAZy":
91
+ eggnog_fields = {"KEGG_ko": 8, "cog_func": 20, "GOs": index_of_go}
92
+ elif cols[11] == "KEGG_ko" and cols[18] == "CAZy":
93
+ eggnog_fields = {"KEGG_ko": 11, "cog_func": 6, "GOs": index_of_go}
94
+ else:
95
+ sys.exit("Cannot parse eggNOG - unexpected field order or naming")
96
+ return eggnog_fields
97
+
98
+
99
+ def get_bgcs(bgc_file, prokka_gff, tool):
100
+ cluster_positions = dict()
101
+ tool_result = dict()
102
+ bgc_annotations = dict()
103
+ if not bgc_file:
104
+ return bgc_annotations
105
+ # save positions of each BGC cluster to dictionary cluster_positions
106
+ # and save the annotations to dictionary bgc_result
107
+ with open(bgc_file, "r") as bgc_in:
108
+ for line in bgc_in:
109
+ if not line.startswith("#"):
110
+ (
111
+ contig,
112
+ _,
113
+ feature,
114
+ start_pos,
115
+ end_pos,
116
+ _,
117
+ _,
118
+ _,
119
+ annotations,
120
+ ) = line.strip().split("\t")
121
+ if tool == "sanntis":
122
+ for a in annotations.split(
123
+ ";"
124
+ ): # go through all parts of the annotation field
125
+ if a.startswith("nearest_MiBIG_class="):
126
+ class_value = a.split("=")[1]
127
+ elif a.startswith("nearest_MiBIG="):
128
+ mibig_value = a.split("=")[1]
129
+ elif tool == "gecco":
130
+ for a in annotations.split(
131
+ ";"
132
+ ): # go through all parts of the annotation field
133
+ if a.startswith("Type="):
134
+ type_value = a.split("=")[1]
135
+ elif tool == "antismash":
136
+ if feature != "gene":
137
+ continue
138
+ type_value = ""
139
+ as_product = ""
140
+ for a in annotations.split(
141
+ ";"
142
+ ): # go through all parts of the annotation field
143
+ if a.startswith("as_type="):
144
+ type_value = a.split("=")[1]
145
+ elif a.startswith("as_gene_clusters="):
146
+ as_product = a.split("=")[1]
147
+ # save cluster positions to a dictionary where key = contig name,
148
+ # value = list of position pairs (list of lists)
149
+ cluster_positions.setdefault(contig, list()).append(
150
+ [int(start_pos), int(end_pos)]
151
+ )
152
+ # save BGC annotations to dictionary where key = contig, value = dictionary, where
153
+ # key = 'start_end' of BGC, value = dictionary, where key = feature type, value = description
154
+ if tool == "sanntis":
155
+ tool_result.setdefault(contig, dict()).setdefault(
156
+ "_".join([start_pos, end_pos]),
157
+ {
158
+ "nearest_MiBIG_class": class_value,
159
+ "nearest_MiBIG": mibig_value,
160
+ },
161
+ )
162
+ elif tool == "gecco":
163
+ tool_result.setdefault(contig, dict()).setdefault(
164
+ "_".join([start_pos, end_pos]),
165
+ {"bgc_type": type_value},
166
+ )
167
+ elif tool == "antismash":
168
+ tool_result.setdefault(contig, dict()).setdefault(
169
+ "_".join([start_pos, end_pos]),
170
+ {"bgc_function": type_value},
171
+ )
172
+ if as_product:
173
+ tool_result[contig]["_".join([start_pos, end_pos])]["bgc_product"] = as_product
174
+ # identify CDSs that fall into each of the clusters annotated by the BGC tool
175
+ with open(prokka_gff, "r") as gff_in:
176
+ for line in gff_in:
177
+ if not line.startswith("#"):
178
+ matching_interval = ""
179
+ (
180
+ contig,
181
+ _,
182
+ _,
183
+ start_pos,
184
+ end_pos,
185
+ _,
186
+ _,
187
+ _,
188
+ annotations,
189
+ ) = line.strip().split("\t")
190
+ if contig in cluster_positions:
191
+ for i in cluster_positions[contig]:
192
+ if int(start_pos) in range(i[0], i[1] + 1) and int(
193
+ end_pos
194
+ ) in range(i[0], i[1] + 1):
195
+ matching_interval = "_".join([str(i[0]), str(i[1])])
196
+ break
197
+ # if the CDS is in an interval, save cluster's annotation to this CDS
198
+ if matching_interval:
199
+ cds_id = annotations.split(";")[0].split("=")[1]
200
+ if tool == "sanntis":
201
+ bgc_annotations.setdefault(
202
+ cds_id,
203
+ {
204
+ "nearest_MiBIG": tool_result[contig][matching_interval][
205
+ "nearest_MiBIG"
206
+ ],
207
+ "nearest_MiBIG_class": tool_result[contig][
208
+ matching_interval
209
+ ]["nearest_MiBIG_class"],
210
+ },
211
+ )
212
+ elif tool == "gecco":
213
+ bgc_annotations.setdefault(
214
+ cds_id,
215
+ {
216
+ "gecco_bgc_type": tool_result[contig][
217
+ matching_interval
218
+ ]["bgc_type"],
219
+ },
220
+ )
221
+ elif tool == "antismash":
222
+ bgc_annotations.setdefault(
223
+ cds_id,
224
+ {
225
+ "antismash_bgc_function": tool_result[contig][
226
+ matching_interval
227
+ ]["bgc_function"],
228
+ },
229
+ )
230
+ if "bgc_product" in tool_result[contig][matching_interval]:
231
+ bgc_annotations[cds_id]["antismash_product"] = tool_result[contig][matching_interval][
232
+ "bgc_product"]
233
+ elif line.startswith("##FASTA"):
234
+ break
235
+ return bgc_annotations
236
+
237
+
238
+ def get_amr(amr_file):
239
+ amr_annotations = {}
240
+ if not amr_file:
241
+ return amr_annotations
242
+ with open(amr_file, "r") as f:
243
+ for line in f:
244
+ if line.startswith("Protein identifier"):
245
+ continue
246
+ (
247
+ protein_id,
248
+ _,
249
+ _,
250
+ _,
251
+ _,
252
+ gene_name,
253
+ seq_name,
254
+ scope,
255
+ element_type,
256
+ element_subtype,
257
+ drug_class,
258
+ drug_subclass,
259
+ _,
260
+ ) = line.strip().split("\t", 12)
261
+ # don't add annotations for which we don't have a protein ID (these will only be
262
+ # available in the AMRFinderPlus TSV file)
263
+ if protein_id == "NA":
264
+ continue
265
+ # check for characters that could break GFF
266
+ if ";" in seq_name:
267
+ seq_name = seq_name.replace(";", ",")
268
+ if "=" in seq_name:
269
+ seq_name = seq_name.replace("=", " ")
270
+ amr_annotations[protein_id] = ";".join(
271
+ [
272
+ f"amrfinderplus_gene_symbol={gene_name}",
273
+ f"amrfinderplus_sequence_name={seq_name}",
274
+ f"amrfinderplus_scope={scope}",
275
+ f"element_type={element_type}",
276
+ f"element_subtype={element_subtype}",
277
+ f"drug_class={drug_class}",
278
+ f"drug_subclass={drug_subclass}",
279
+ ]
280
+ )
281
+ return amr_annotations
282
+
283
+
284
+ def get_dbcan(dbcan_file):
285
+ dbcan_annotations = dict()
286
+ substrates = dict()
287
+ if not dbcan_file:
288
+ return dbcan_annotations
289
+ with open(dbcan_file, "r") as f:
290
+ for line in f:
291
+ if "predicted PUL" in line:
292
+ annot_fields = line.strip().split("\t")[8].split(";")
293
+ for a in annot_fields:
294
+ if a.startswith("ID="):
295
+ cgc = a.split("=")[1]
296
+ elif a.startswith("substrate_dbcan-pul"):
297
+ substrate_pul = a.split("=")[1]
298
+ elif a.startswith("substrate_dbcan-sub"):
299
+ substrate_ecami = a.split("=")[1]
300
+ substrates.setdefault(cgc, {})["substrate_ecami"] = substrate_ecami
301
+ substrates.setdefault(cgc, {})["substrate_pul"] = substrate_pul
302
+ elif line.startswith("#"):
303
+ continue
304
+ else:
305
+ cols = line.strip().split("\t")
306
+ prot_type = cols[2]
307
+ annot_fields = cols[8].split(";")
308
+ if not prot_type == "null":
309
+ for a in annot_fields:
310
+ if a.startswith("ID"):
311
+ acc = a.split("=")[1]
312
+ elif a.startswith("protein_family"):
313
+ prot_fam = a.split("=")[1]
314
+ elif a.startswith("Parent"):
315
+ parent = a.split("=")[1]
316
+ dbcan_annotations[acc] = (
317
+ "dbcan_prot_type={};dbcan_prot_family={};substrate_dbcan-pul={};substrate_dbcan-sub={}".format(
318
+ prot_type,
319
+ prot_fam,
320
+ substrates[parent]["substrate_pul"],
321
+ substrates[parent]["substrate_ecami"],
322
+ )
323
+ )
324
+ return dbcan_annotations
325
+
326
+
327
+ def get_defense_finder(df_file):
328
+ defense_finder_annotations = dict()
329
+ type_info = dict()
330
+ if not df_file:
331
+ return defense_finder_annotations
332
+ with open(df_file, "r") as f:
333
+ for line in f:
334
+ if "Anti-phage system" in line:
335
+ annot_fields = line.strip().split("\t")[8].split(";")
336
+ for a in annot_fields:
337
+ if a.startswith("ID="):
338
+ id = a.split("=")[1]
339
+ elif a.startswith("type"):
340
+ df_type = a.split("=")[1]
341
+ elif a.startswith("subtype"):
342
+ df_subtype = a.split("=")[1]
343
+ type_info.setdefault(id, {})["df_type"] = df_type
344
+ type_info.setdefault(id, {})["df_subtype"] = df_subtype
345
+ elif "DefenseFinder" in line:
346
+ annot_fields = line.strip().split("\t")[8].split(";")
347
+ for a in annot_fields:
348
+ if a.startswith("ID="):
349
+ id = a.split("=")[1]
350
+ elif a.startswith("Parent="):
351
+ parent = a.split("=")[1]
352
+ defense_finder_annotations[id] = (
353
+ "defense_finder_type={};defense_finder_subtype={}".format(
354
+ type_info[parent]["df_type"], type_info[parent]["df_subtype"]
355
+ )
356
+ )
357
+ return defense_finder_annotations
358
+
359
+
360
+ def load_annotations(
361
+ in_gff,
362
+ eggnog_file,
363
+ ipr_file,
364
+ sanntis_file,
365
+ amr_file,
366
+ antismash_file,
367
+ gecco_file,
368
+ dbcan_file,
369
+ defense_finder_file,
370
+ pseudofinder_file,
371
+ ):
372
+ eggnogs = get_eggnog(eggnog_file)
373
+ iprs, antifams = get_iprs(ipr_file)
374
+ sanntis_bgcs = get_bgcs(sanntis_file, in_gff, tool="sanntis")
375
+ gecco_bgcs = get_bgcs(gecco_file, in_gff, tool="gecco")
376
+ antismash_bgcs = get_bgcs(antismash_file, in_gff, tool="antismash")
377
+ amr_annotations = get_amr(amr_file)
378
+ dbcan_annotations = get_dbcan(dbcan_file)
379
+ defense_finder_annotations = get_defense_finder(defense_finder_file)
380
+ pseudogenes = get_pseudogenes(pseudofinder_file)
381
+ pseudogene_report_dict = dict()
382
+ added_annot = {}
383
+ main_gff = dict()
384
+ header = []
385
+ fasta = []
386
+ fasta_flag = False
387
+ with open(in_gff) as f:
388
+ for line in f:
389
+ line = line.strip()
390
+ if line[0] != "#" and not fasta_flag:
391
+ line = line.replace("db_xref", "Dbxref")
392
+ line = line.replace(";note=", ";Note=")
393
+ line = line.replace("‘", "'").replace("’", "'")
394
+ cols = line.split("\t")
395
+ if len(cols) == 9:
396
+ contig, caller, feature, start, annot = (
397
+ cols[0],
398
+ cols[1],
399
+ cols[2],
400
+ cols[3],
401
+ cols[8],
402
+ )
403
+ if feature != "CDS":
404
+ if caller == "Bakta" and feature == "region":
405
+ main_gff.setdefault(contig, dict()).setdefault(
406
+ int(start), list()
407
+ ).append(line)
408
+ continue
409
+ else:
410
+ continue
411
+ protein = annot.split(";")[0].split("=")[-1]
412
+ if protein in antifams:
413
+ # Don't print to the final GFF proteins that are known to not be real
414
+ continue
415
+ added_annot[protein] = {}
416
+ # process pseudogenes
417
+ if "pseudo=true" in annot.lower():
418
+ # fix case
419
+ cols[8] = annot.replace("pseudo=True", "pseudo=true")
420
+ # gene is already marked as a pseudogene; log it but don't add to the annotation again
421
+ pseudogene_report_dict.setdefault(protein, dict())
422
+ pseudogene_report_dict[protein]["gene_caller"] = True
423
+ if protein in pseudogenes:
424
+ pseudogene_report_dict[protein]["pseudofinder"] = True
425
+ else:
426
+ pseudogene_report_dict[protein]["pseudofinder"] = False
427
+ else:
428
+ # gene caller did not detect this protein as a pseudogene; check if pseudofinder did
429
+ if protein in pseudogenes:
430
+ pseudogene_report_dict.setdefault(protein, dict())
431
+ pseudogene_report_dict[protein]["gene_caller"] = False
432
+ pseudogene_report_dict[protein]["pseudofinder"] = True
433
+ added_annot[protein]["pseudo"] = "true"
434
+ if pseudogenes[protein]:
435
+ cols[8] = add_pseudogene_to_note(
436
+ pseudogenes[protein], cols[8]
437
+ )
438
+ # record antifams
439
+ if protein in antifams:
440
+ pseudogene_report_dict.setdefault(protein, dict())
441
+ pseudogene_report_dict[protein]["antifams"] = True
442
+ try:
443
+ eggnogs[protein]
444
+ pos = 0
445
+ for a in eggnogs[protein]:
446
+ pos += 1
447
+ if a != [""] and a != ["NA"]:
448
+ if pos == 1:
449
+ added_annot[protein]["eggNOG"] = a
450
+ elif pos == 2:
451
+ added_annot[protein]["cog"] = a
452
+ elif pos == 3:
453
+ added_annot[protein]["kegg"] = a
454
+ elif pos == 4:
455
+ added_annot[protein]["Ontology_term"] = a
456
+ except KeyError:
457
+ pass
458
+ try:
459
+ iprs[protein]
460
+ pos = 0
461
+ for a in iprs[protein]:
462
+ pos += 1
463
+ a = list(a)
464
+ if a != [""] and a:
465
+ if pos == 1:
466
+ added_annot[protein]["pfam"] = sorted(a)
467
+ elif pos == 2:
468
+ added_annot[protein]["interpro"] = sorted(a)
469
+ except KeyError:
470
+ pass
471
+ try:
472
+ sanntis_bgcs[protein]
473
+ for key, value in sanntis_bgcs[protein].items():
474
+ added_annot[protein][key] = value
475
+ except KeyError:
476
+ pass
477
+ try:
478
+ gecco_bgcs[protein]
479
+ for key, value in gecco_bgcs[protein].items():
480
+ added_annot[protein][key] = value
481
+ except KeyError:
482
+ pass
483
+ try:
484
+ antismash_bgcs[protein]
485
+ for key, value in antismash_bgcs[protein].items():
486
+ added_annot[protein][key] = value
487
+ except KeyError:
488
+ pass
489
+ try:
490
+ amr_annotations[protein]
491
+ added_annot[protein]["AMR"] = amr_annotations[protein]
492
+ except KeyError:
493
+ pass
494
+ try:
495
+ dbcan_annotations[protein]
496
+ added_annot[protein]["dbCAN"] = dbcan_annotations[protein]
497
+ except KeyError:
498
+ pass
499
+ try:
500
+ defense_finder_annotations[protein]
501
+ added_annot[protein]["defense_finder"] = (
502
+ defense_finder_annotations[protein]
503
+ )
504
+ except KeyError:
505
+ pass
506
+ for a in added_annot[protein]:
507
+ value = added_annot[protein][a]
508
+ if type(value) is list:
509
+ value = ",".join(value)
510
+ if a in ["AMR", "dbCAN", "defense_finder"]:
511
+ cols[8] = f"{cols[8]};{value}"
512
+ else:
513
+ if not value == "-":
514
+ cols[8] = f"{cols[8]};{a}={value}"
515
+ line = "\t".join(cols)
516
+ main_gff.setdefault(contig, dict()).setdefault(
517
+ int(start), list()
518
+ ).append(line)
519
+ elif line.startswith("#"):
520
+ if line == "##FASTA":
521
+ fasta_flag = True
522
+ fasta.append(line)
523
+ else:
524
+ header.append(line)
525
+ elif fasta_flag:
526
+ fasta.append(line)
527
+ return header, main_gff, fasta, pseudogene_report_dict
528
+
529
+
530
+ def get_ncrnas(ncrnas_file):
531
+ ncrnas = {}
532
+ counts = 0
533
+ with open(ncrnas_file, "r") as f:
534
+ for line in f:
535
+ if not line.startswith("#"):
536
+ cols = line.strip().split()
537
+ counts += 1
538
+ contig = cols[3]
539
+ locus = f"{contig}_ncRNA{counts}"
540
+ product = " ".join(cols[28:])
541
+ model = cols[2]
542
+ if model == "RF00005":
543
+ # Skip tRNAs, we add them from tRNAscan-SE
544
+ continue
545
+ strand = cols[11]
546
+ start, end = (int(cols[9]), int(cols[10])) if strand == "+" else (int(cols[10]), int(cols[9]))
547
+ rna_feature_name, ncrna_class = prepare_rna_gff_fields(cols)
548
+ annot = [
549
+ "ID=" + locus,
550
+ "inference=Rfam:14.9",
551
+ "locus_tag=" + locus,
552
+ "product=" + product,
553
+ "rfam=" + model,
554
+ ]
555
+ if ncrna_class:
556
+ annot.append(f"ncRNA_class={ncrna_class}")
557
+ annot = ";".join(annot)
558
+ newline = "\t".join(
559
+ [
560
+ contig,
561
+ "INFERNAL:1.1.4",
562
+ rna_feature_name,
563
+ str(start),
564
+ str(end),
565
+ ".",
566
+ strand,
567
+ ".",
568
+ annot,
569
+ ]
570
+ )
571
+ ncrnas.setdefault(contig, dict()).setdefault(start, list()).append(
572
+ newline
573
+ )
574
+ return ncrnas
575
+
576
+
577
+ def prepare_rna_gff_fields(cols):
578
+ rna_feature_name = "ncRNA"
579
+ if cols[1] in ["LSU_rRNA_bacteria", "SSU_rRNA_bacteria", "5S_rRNA"]:
580
+ rna_feature_name = "rRNA"
581
+ ncrna_class = ""
582
+ rna_types = {
583
+ "antisense_RNA": [
584
+ "RF00039",
585
+ "RF00042",
586
+ "RF00057",
587
+ "RF00106",
588
+ "RF00107",
589
+ "RF00236",
590
+ "RF00238",
591
+ "RF00240",
592
+ "RF00242",
593
+ "RF00262",
594
+ "RF00388",
595
+ "RF00489",
596
+ "RF01695",
597
+ "RF01794",
598
+ "RF01797",
599
+ "RF01809",
600
+ "RF01813",
601
+ "RF02194",
602
+ "RF02235",
603
+ "RF02236",
604
+ "RF02237",
605
+ "RF02238",
606
+ "RF02239",
607
+ "RF02519",
608
+ "RF02550",
609
+ "RF02558",
610
+ "RF02559",
611
+ "RF02560",
612
+ "RF02563",
613
+ "RF02592",
614
+ "RF02662",
615
+ "RF02674",
616
+ "RF02735",
617
+ "RF02743",
618
+ "RF02792",
619
+ "RF02793",
620
+ "RF02812",
621
+ "RF02818",
622
+ "RF02819",
623
+ "RF02820",
624
+ "RF02839",
625
+ "RF02843",
626
+ "RF02844",
627
+ "RF02846",
628
+ "RF02850",
629
+ "RF02851",
630
+ "RF02855",
631
+ "RF02873",
632
+ "RF02874",
633
+ "RF02875",
634
+ "RF02876",
635
+ "RF02891",
636
+ "RF02892",
637
+ "RF02903",
638
+ "RF02908",
639
+ ],
640
+ "autocatalytically_spliced_intron": ["RF01807"],
641
+ "ribozyme": [
642
+ "RF00621",
643
+ "RF01787",
644
+ "RF01788",
645
+ "RF01865",
646
+ "RF02678",
647
+ "RF02679",
648
+ "RF02681",
649
+ "RF02682",
650
+ "RF02684",
651
+ "RF03154",
652
+ "RF03160",
653
+ "RF04188",
654
+ ],
655
+ "hammerhead_ribozyme": [
656
+ "RF00008",
657
+ "RF00163",
658
+ "RF02275",
659
+ "RF02276",
660
+ "RF02277",
661
+ "RF03152",
662
+ ],
663
+ "RNase_P_RNA": [
664
+ "RF00009",
665
+ "RF00010",
666
+ "RF00011",
667
+ "RF00373",
668
+ "RF01577",
669
+ "RF02357",
670
+ ],
671
+ "RNase_MRP_RNA": ["RF00030", "RF02472"],
672
+ "telomerase_RNA": ["RF00024", "RF00025", "RF01050", "RF02462"],
673
+ "scaRNA": [
674
+ "RF00231",
675
+ "RF00283",
676
+ "RF00286",
677
+ "RF00422",
678
+ "RF00423",
679
+ "RF00424",
680
+ "RF00426",
681
+ "RF00427",
682
+ "RF00478",
683
+ "RF00492",
684
+ "RF00553",
685
+ "RF00564",
686
+ "RF00565",
687
+ "RF00582",
688
+ "RF00601",
689
+ "RF00602",
690
+ "RF01268",
691
+ "RF01295",
692
+ "RF02665",
693
+ "RF02666",
694
+ "RF02667",
695
+ "RF02668",
696
+ "RF02669",
697
+ "RF02670",
698
+ "RF02718",
699
+ "RF02719",
700
+ "RF02720",
701
+ "RF02721",
702
+ "RF02722",
703
+ ],
704
+ "snRNA": ["RF01802"],
705
+ "SRP_RNA": [
706
+ "RF00017",
707
+ "RF00169",
708
+ "RF01502",
709
+ "RF01570",
710
+ "RF01854",
711
+ "RF01855",
712
+ "RF01856",
713
+ "RF01857",
714
+ "RF04183",
715
+ ],
716
+ "vault_RNA": ["RF00006"],
717
+ "Y_RNA": ["RF00019", "RF02553", "RF01053", "RF02565"],
718
+ }
719
+
720
+ if rna_feature_name == "ncRNA":
721
+ ncrna_class = next((rna_type for rna_type, rfams in rna_types.items() if cols[2] in rfams), None)
722
+ if not ncrna_class:
723
+ if "microRNA" in cols[-1]:
724
+ ncrna_class = "pre_miRNA"
725
+ else:
726
+ ncrna_class = "other"
727
+ return rna_feature_name, ncrna_class
728
+
729
+
730
+ def get_trnas(trnas_file):
731
+ trnas = {}
732
+ with open(trnas_file, "r") as f:
733
+ for line in f:
734
+ if not line.startswith("#"):
735
+ cols = line.split("\t")
736
+ contig, feature, start = cols[0], cols[2], cols[3]
737
+ if feature == "tRNA":
738
+ line = line.replace("tRNAscan-SE", "tRNAscan-SE:2.0.9")
739
+ trnas.setdefault(contig, dict()).setdefault(
740
+ int(start), list()
741
+ ).append(line.strip())
742
+ return trnas
743
+
744
+
745
+ def load_crispr(crispr_file):
746
+ crispr_annotations = dict()
747
+ with open(crispr_file, "r") as f:
748
+ record = list()
749
+ left_coord = ""
750
+ loc_contig = ""
751
+ previous_end = ""
752
+ for line in f:
753
+ if not line.startswith("#"):
754
+ cols = line.strip().split("\t")
755
+ contig, _, start, end = (
756
+ cols[0],
757
+ cols[2],
758
+ int(cols[3]),
759
+ int(cols[4]),
760
+ )
761
+ if (
762
+ len(record) > 0
763
+ and contig == loc_contig
764
+ and abs(start - previous_end) < 2
765
+ ):
766
+ # the line is a continuation of an existing record
767
+ record.append(line)
768
+ previous_end = end
769
+ elif len(record) == 0:
770
+ record.append(line)
771
+ left_coord = start
772
+ loc_contig = contig
773
+ previous_end = end
774
+ else:
775
+ # the previous record is complete, started reading a new record
776
+ crispr_annotations.setdefault(contig, dict()).setdefault(
777
+ left_coord, list()
778
+ ).append(record)
779
+ record = list()
780
+ record.append(line)
781
+ previous_end = end
782
+ left_coord = start
783
+ if len(record) > 0:
784
+ crispr_annotations.setdefault(contig, dict()).setdefault(
785
+ left_coord, list()
786
+ ).append(record)
787
+ return crispr_annotations
788
+
789
+
790
+ def get_pseudogenes(pseudofinder_file):
791
+ pseudogenes = dict()
792
+ if not pseudofinder_file:
793
+ return pseudogenes
794
+ with open(pseudofinder_file) as file_in:
795
+ for line in file_in:
796
+ if not line.startswith("#"):
797
+ col9 = line.strip().split("\t")[8]
798
+ attributes_dict = dict(
799
+ re.split(r"(?<!\\)=", item) for item in re.split(r"(?<!\\);", col9)
800
+ )
801
+ if "note" in attributes_dict:
802
+ note = attributes_dict["note"]
803
+ else:
804
+ note = ""
805
+ if "old_locus_tag" in attributes_dict:
806
+ tags = attributes_dict["old_locus_tag"].split(",")
807
+ for tag in tags:
808
+ if "_ign_" not in tag:
809
+ pseudogenes[tag] = note
810
+ return pseudogenes
811
+
812
+
813
+ def add_pseudogene_to_note(note_text, col9):
814
+ col9_dict = dict(
815
+ re.split(r"(?<!\\)=", item) for item in re.split(r"(?<!\\);", col9)
816
+ )
817
+ if "Note" in col9_dict.keys():
818
+ col9_dict["Note"] = col9_dict["Note"] + f", {note_text}"
819
+ return ";".join([f"{key}={value}" for key, value in col9_dict.items()])
820
+ else:
821
+ # insert note after locus tag
822
+ keys_list = list(col9_dict.keys())
823
+ locus_tag_index = keys_list.index("locus_tag")
824
+ new_dict = (
825
+ {k: col9_dict[k] for k in keys_list[: locus_tag_index + 1]}
826
+ | {"Note": note_text}
827
+ | {k: col9_dict[k] for k in keys_list[locus_tag_index + 1 :]}
828
+ )
829
+ return ";".join([f"{key}={value}" for key, value in new_dict.items()])