gffkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gffkit/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """gffkit: region-aware GFF annotation integration utilities."""
2
+
3
+ __version__ = "0.1.0"
gffkit/__main__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .main import main
2
+
3
+ if __name__ == "__main__":
4
+ raise SystemExit(main())
gffkit/add_utr.py ADDED
@@ -0,0 +1,571 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ add_utr.py
6
+
7
+ Only implement the UTR-padding behavior similar to AGAT _check_utrs():
8
+
9
+ For each transcript:
10
+ exon - CDS => candidate UTR intervals
11
+
12
+ Then classify:
13
+ + strand:
14
+ interval before leftmost CDS => five_prime_UTR
15
+ interval after rightmost CDS => three_prime_UTR
16
+
17
+ - strand:
18
+ interval before leftmost CDS => three_prime_UTR
19
+ interval after rightmost CDS => five_prime_UTR
20
+
21
+ Intervals located between CDS blocks are skipped, following AGAT's behavior
22
+ of not creating UTR inside CDS/ribosomal-slippage-like middle regions.
23
+
24
+ Input:
25
+ GFF3 or simple GTF-like file with exon/CDS features.
26
+
27
+ Requirements:
28
+ - exon and CDS must have Parent=transcript_id in GFF3
29
+ or transcript_id "xxx" in GTF.
30
+ - Existing UTR features can either be kept or removed.
31
+
32
+ Usage:
33
+ python add_utr.py -i input.gff3 -o output.gff3
34
+ python add_utr.py -i input.gtf -o output.gff3
35
+ python add_utr.py -i input.gff3 -o output.gff3 --replace-existing-utrs
36
+ """
37
+
38
+ import argparse
39
+ import gzip
40
+ import sys
41
+ from dataclasses import dataclass, field
42
+ from collections import defaultdict
43
+ from typing import Dict, List, Tuple, Optional
44
+
45
+
46
@dataclass
class Feature:
    """
    One annotation record: the nine tab-separated columns of a GFF3/GTF
    line plus some bookkeeping captured while reading the file.
    """

    # Columns 1-8 exactly as read; only start/end are converted to int.
    seqid: str
    source: str
    ftype: str
    start: int
    end: int
    score: str
    strand: str
    phase: str
    # Column 9 parsed into key -> list of values.
    attrs: Dict[str, List[str]] = field(default_factory=dict)
    # Column 9 as it appeared in the input ("." when not provided).
    raw_attr: str = "."
    # 1-based input line number (0 when not provided).
    line_no: int = 0

    def parent_ids(self) -> List[str]:
        """
        Return the identifiers of the parent transcript(s).

        The GFF3 "Parent" attribute wins; the GTF "transcript_id"
        attribute is the fallback. An empty list means neither is present.
        """
        for key in ("Parent", "transcript_id"):
            if key in self.attrs:
                return self.attrs[key]
        return []

    def get_id(self) -> Optional[str]:
        """
        Return the first "ID" value, or None when there is no usable ID.
        """
        ids = self.attrs.get("ID")
        return ids[0] if ids else None

    def to_gff3(self) -> str:
        """
        Render the record as one tab-separated GFF3 line (no newline).
        """
        columns = (
            self.seqid,
            self.source,
            self.ftype,
            str(self.start),
            str(self.end),
            self.score,
            self.strand,
            self.phase,
            format_gff3_attrs(self.attrs),
        )
        return "\t".join(columns)
89
+
90
+
91
def open_text(path: str):
    """
    Open an annotation file for reading as text.

    "-" returns sys.stdin as-is. Paths ending in ".gz" are decompressed
    on the fly. Both file branches decode as UTF-8 so that the result does
    not depend on the platform's locale encoding (previously only the
    uncompressed branch pinned the encoding).
    """
    if path == "-":
        return sys.stdin
    if path.endswith(".gz"):
        return gzip.open(path, "rt", encoding="utf-8")
    return open(path, "r", encoding="utf-8")
97
+
98
+
99
def parse_attrs(attr_text: str) -> Dict[str, List[str]]:
    """
    Parse the attribute column of either flavour:
    GFF3: ID=xxx;Parent=yyy
    GTF : gene_id "g1"; transcript_id "t1";

    Returns a dict mapping each key to its list of values. An empty or
    "." column yields an empty dict.

    GFF3 pairs are split on "," into several values; a repeated GFF3 key
    replaces the earlier one. GTF pairs keep their value as a single
    string, and a repeated GTF key (e.g. several `tag "..."` entries)
    accumulates values instead of keeping only the last one.
    """
    attrs: Dict[str, List[str]] = {}

    attr_text = attr_text.strip()
    if not attr_text or attr_text == ".":
        return attrs

    for part in attr_text.split(";"):
        part = part.strip()
        if not part:
            continue

        tokens = part.split(None, 1)

        # `key "value"` is GTF even when the quoted value contains "=";
        # checking this first stops e.g. `note "a=b"` from being read as a
        # GFF3 pair whose key is `note "a`.
        quoted_gtf = (
            len(tokens) == 2
            and "=" not in tokens[0]
            and tokens[1].startswith('"')
        )

        if quoted_gtf:
            attrs.setdefault(tokens[0], []).append(tokens[1].strip().strip('"'))
        elif "=" in part:
            key, value = part.split("=", 1)
            attrs[key.strip()] = [v.strip() for v in value.split(",") if v.strip()]
        elif len(tokens) == 2:
            # Unquoted GTF value such as `exon_number 3`.
            attrs.setdefault(tokens[0], []).append(tokens[1].strip().strip('"'))
        # A bare token with neither "=" nor a value is ignored.

    return attrs
127
+
128
+
129
def format_gff3_attrs(attrs: Dict[str, List[str]]) -> str:
    """
    Serialise an attribute mapping as a GFF3 column-9 string.

    ID, Parent, Name, gene_id and transcript_id come first (in that
    order), the remaining keys follow alphabetically. Keys with no values
    are left out; "." is returned when nothing remains.
    """
    if not attrs:
        return "."

    leading = ("ID", "Parent", "Name", "gene_id", "transcript_id")
    ordered = [name for name in leading if name in attrs]
    ordered += sorted(name for name in attrs if name not in leading)

    pairs = [
        f"{name}={','.join(attrs[name])}"
        for name in ordered
        if attrs[name]
    ]

    if not pairs:
        return "."
    return ";".join(pairs)
145
+
146
+
147
def read_gff(path: str) -> Tuple[List[str], List[Feature]]:
    """
    Read a GFF3/GTF file into header lines and Feature records.

    Returns (headers, features):
      - headers: every line starting with "#", in file order, up to and
        including a "##FASTA" directive if one is present.
      - features: one Feature per well-formed nine-column line, with
        line_no set to the 1-based line number in the input.

    Lines that do not have nine columns, or whose start/end are not
    integers, are skipped with a warning on stderr.

    Reading stops at the embedded FASTA section (a "##FASTA" directive,
    or a ">" line when the directive was omitted): sequence lines are not
    annotations, and parsing them would only emit one warning per line.
    """
    headers: List[str] = []
    features: List[Feature] = []

    with open_text(path) as fh:
        for i, line in enumerate(fh, start=1):
            # Strip "\r" too so CRLF files do not leave a stray carriage
            # return in the last column or turn blank lines into warnings.
            line = line.rstrip("\r\n")

            if not line:
                continue

            if line.startswith("#"):
                headers.append(line)
                if line.startswith("##FASTA"):
                    break
                continue

            if line.startswith(">"):
                # FASTA section without the explicit directive.
                break

            cols = line.split("\t")
            if len(cols) != 9:
                print(
                    f"[WARN] Skip line {i}: expected 9 columns, got {len(cols)}",
                    file=sys.stderr,
                )
                continue

            seqid, source, ftype, start, end, score, strand, phase, attr_text = cols

            try:
                start_i = int(start)
                end_i = int(end)
            except ValueError:
                print(
                    f"[WARN] Skip line {i}: start/end is not integer",
                    file=sys.stderr,
                )
                continue

            features.append(
                Feature(
                    seqid=seqid,
                    source=source,
                    ftype=ftype,
                    start=start_i,
                    end=end_i,
                    score=score,
                    strand=strand,
                    phase=phase,
                    attrs=parse_attrs(attr_text),
                    raw_attr=attr_text,
                    line_no=i,
                )
            )

    return headers, features
199
+
200
+
201
def merge_intervals(intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """
    Collapse overlapping or directly adjacent intervals.

    Intervals are 1-based and closed, so (1, 3) and (4, 6) touch and are
    merged into (1, 6). The result is sorted by start coordinate.
    """
    if not intervals:
        return []

    ordered = sorted(intervals)
    merged = ordered[:1]

    for start, end in ordered[1:]:
        prev_start, prev_end = merged[-1]
        if start > prev_end + 1:
            # A real gap: begin a new block.
            merged.append((start, end))
        else:
            merged[-1] = (prev_start, max(prev_end, end))

    return merged
220
+
221
+
222
def subtract_intervals(
    exon: Tuple[int, int],
    cds_intervals: List[Tuple[int, int]],
) -> List[Tuple[int, int]]:
    """
    Return the parts of one exon that are not covered by any CDS block.

    Coordinates are 1-based and closed. For instance, exon 100-500 minus
    CDS 200-400 leaves 100-199 and 401-500.
    """
    pieces = [exon]

    for block_start, block_end in cds_intervals:
        survivors = []

        for left, right in pieces:
            overlaps = block_start <= right and block_end >= left

            if not overlaps:
                survivors.append((left, right))
            else:
                # Keep whatever sticks out on either side of the CDS block.
                if left < block_start:
                    survivors.append((left, block_start - 1))
                if right > block_end:
                    survivors.append((block_end + 1, right))

        pieces = survivors

    return pieces
256
+
257
+
258
def is_utr_type(ftype: str) -> bool:
    """
    Tell whether a feature type names a UTR, ignoring letter case.

    Recognised spellings: UTR, five_prime_UTR, three_prime_UTR, 5UTR, 3UTR.
    """
    normalized = ftype.lower()
    return normalized in (
        "utr",
        "five_prime_utr",
        "three_prime_utr",
        "5utr",
        "3utr",
    )
268
+
269
+
270
+ def build_used_ids(features: List[Feature]) -> set:
271
+ used = set()
272
+ for f in features:
273
+ fid = f.get_id()
274
+ if fid:
275
+ used.add(fid)
276
+ return used
277
+
278
+
279
def unique_id(base: str, used_ids: set) -> str:
    """
    Reserve and return an ID that is not yet in used_ids.

    The bare base is used when free; otherwise the first free
    "<base>_<n>" with n = 2, 3, ... is chosen. The returned ID is added to
    used_ids, so the set is modified in place.
    """
    candidate = base
    suffix = 1

    while candidate in used_ids:
        suffix += 1
        candidate = f"{base}_{suffix}"

    used_ids.add(candidate)
    return candidate
291
+
292
+
293
def classify_utr(
    utr_start: int,
    utr_end: int,
    strand: str,
    leftmost_cds: int,
    rightmost_cds: int,
) -> Optional[str]:
    """
    Name a candidate UTR interval from its position relative to the CDS.

    Entirely left of the leftmost CDS coordinate:
      "-" strand => three_prime_UTR, any other strand => five_prime_UTR
    Entirely right of the rightmost CDS coordinate:
      "-" strand => five_prime_UTR, any other strand => three_prime_UTR

    Anything else (i.e. lying between CDS blocks) returns None so the
    caller can skip it.
    """
    if utr_end < leftmost_cds:
        side = "left"
    elif utr_start > rightmost_cds:
        side = "right"
    else:
        return None

    on_minus = strand == "-"
    labels = {
        ("left", False): "five_prime_UTR",
        ("left", True): "three_prime_UTR",
        ("right", False): "three_prime_UTR",
        ("right", True): "five_prime_UTR",
    }
    return labels[(side, on_minus)]
324
+
325
+
326
def make_utr_feature(
    template_exon: Feature,
    parent_id: str,
    utr_start: int,
    utr_end: int,
    utr_type: str,
    used_ids: set,
    index: int,
    id_prefix: str,
) -> Feature:
    """
    Build a UTR record modelled on the exon it was cut from.

    seqid, source, score, strand and line_no are copied from the exon;
    type, coordinates, ID and Parent are new and the phase is ".". The ID
    is "<id_prefix><parent_id>.<utr_type>.<index>", made unique through
    unique_id() (which also records it in used_ids).
    """
    base_id = f"{id_prefix}{parent_id}.{utr_type}.{index}"

    attrs = {
        "ID": [unique_id(base_id, used_ids)],
        "Parent": [parent_id],
    }

    # Carry over the GTF-style identifiers when the exon has them; copy the
    # lists so the new record does not share them with the exon.
    for key in ("gene_id", "transcript_id"):
        if key in template_exon.attrs:
            attrs[key] = list(template_exon.attrs[key])

    return Feature(
        seqid=template_exon.seqid,
        source=template_exon.source,
        ftype=utr_type,
        start=utr_start,
        end=utr_end,
        score=template_exon.score,
        strand=template_exon.strand,
        phase=".",
        attrs=attrs,
        line_no=template_exon.line_no,
    )
364
+
365
+
366
def _canonical_utr_type(ftype: str) -> str:
    """
    Map the UTR spellings accepted by is_utr_type() onto the type names
    produced by classify_utr(); any other type is returned unchanged.
    """
    aliases = {
        "five_prime_utr": "five_prime_UTR",
        "5utr": "five_prime_UTR",
        "three_prime_utr": "three_prime_UTR",
        "3utr": "three_prime_UTR",
    }
    return aliases.get(ftype.lower(), ftype)


def add_utrs_like_agat(
    features: List[Feature],
    replace_existing_utrs: bool = False,
    id_prefix: str = "agat_utrs_",
) -> List[Feature]:
    """
    Add missing UTRs derived from exon and CDS features.

    For every transcript that has both exons and CDS, each exon minus the
    merged CDS blocks gives candidate intervals; those left of the first
    or right of the last CDS coordinate become five/three prime UTRs
    according to the exon's strand, and candidates between CDS blocks are
    skipped.

    If replace_existing_utrs=True:
        remove all existing UTRs first, then recreate expected UTRs.
        The removed features' IDs are released, so re-running on the
        tool's own output reproduces the same IDs instead of "_2" ones.

    If replace_existing_utrs=False:
        keep existing UTRs and only add expected UTRs that do not already
        have identical coordinates and type under the same transcript.
        Types are compared after normalising case and the 5UTR/3UTR
        aliases, so e.g. an existing "five_prime_utr" is not duplicated.

    Returns a new feature list with the created UTRs placed by
    insert_utrs_after_matching_exons(); the input list is not modified.
    """
    exons_by_tx = defaultdict(list)
    cds_by_tx = defaultdict(list)
    utrs_by_tx = defaultdict(list)

    for f in features:
        kind = f.ftype.lower()
        if kind == "exon":
            bucket = exons_by_tx
        elif kind == "cds":
            bucket = cds_by_tx
        elif is_utr_type(f.ftype):
            bucket = utrs_by_tx
        else:
            continue

        # A feature with several parents is registered under each of them.
        for parent in f.parent_ids():
            bucket[parent].append(f)

    # Optionally remove existing UTRs, closer to AGAT's "recreate when wrong" behavior.
    if replace_existing_utrs:
        features = [f for f in features if not is_utr_type(f.ftype)]

    # Collected after the optional removal so dropped UTRs do not keep
    # their IDs reserved.
    used_ids = build_used_ids(features)
    new_utrs: List[Feature] = []

    # Sorted for deterministic ID numbering across runs.
    for tx_id in sorted(set(exons_by_tx) & set(cds_by_tx)):
        exons = sorted(exons_by_tx[tx_id], key=lambda x: (x.start, x.end))
        cds_features = cds_by_tx[tx_id]

        cds_intervals = merge_intervals([(c.start, c.end) for c in cds_features])
        leftmost_cds = min(s for s, _ in cds_intervals)
        rightmost_cds = max(e for _, e in cds_intervals)

        existing_signatures = set()
        if not replace_existing_utrs:
            for u in utrs_by_tx.get(tx_id, []):
                existing_signatures.add(
                    (_canonical_utr_type(u.ftype), u.start, u.end)
                )

        utr_index = 0

        for exon in exons:
            candidate_intervals = subtract_intervals(
                (exon.start, exon.end),
                cds_intervals,
            )

            for utr_start, utr_end in candidate_intervals:
                if utr_start > utr_end:
                    continue

                utr_type = classify_utr(
                    utr_start=utr_start,
                    utr_end=utr_end,
                    strand=exon.strand,
                    leftmost_cds=leftmost_cds,
                    rightmost_cds=rightmost_cds,
                )

                # AGAT skips UTR candidates located between CDS blocks.
                if utr_type is None:
                    continue

                signature = (utr_type, utr_start, utr_end)
                if signature in existing_signatures:
                    continue

                utr_index += 1
                new_utrs.append(
                    make_utr_feature(
                        template_exon=exon,
                        parent_id=tx_id,
                        utr_start=utr_start,
                        utr_end=utr_end,
                        utr_type=utr_type,
                        used_ids=used_ids,
                        index=utr_index,
                        id_prefix=id_prefix,
                    )
                )
                # Also guards against duplicate exon lines in one transcript.
                existing_signatures.add(signature)

    return insert_utrs_after_matching_exons(features, new_utrs)
462
+
463
+
464
+ def insert_utrs_after_matching_exons(
465
+ features: List[Feature],
466
+ new_utrs: List[Feature],
467
+ ) -> List[Feature]:
468
+ """
469
+ Keep original order as much as possible.
470
+ Insert newly created UTRs after the exon line that was used as template.
471
+ """
472
+ by_line = defaultdict(list)
473
+ for u in new_utrs:
474
+ by_line[u.line_no].append(u)
475
+
476
+ output = []
477
+
478
+ for f in features:
479
+ output.append(f)
480
+ if f.line_no in by_line:
481
+ output.extend(
482
+ sorted(
483
+ by_line[f.line_no],
484
+ key=lambda x: (x.start, x.end, x.ftype),
485
+ )
486
+ )
487
+
488
+ return output
489
+
490
+
491
+ def write_gff3(headers: List[str], features: List[Feature], output: str) -> None:
492
+ if output == "-":
493
+ out = sys.stdout
494
+ close = False
495
+ else:
496
+ out = open(output, "w", encoding="utf-8")
497
+ close = True
498
+
499
+ try:
500
+ has_gff_version = any(h.startswith("##gff-version") for h in headers)
501
+ if not has_gff_version:
502
+ print("##gff-version 3", file=out)
503
+
504
+ for h in headers:
505
+ if h.startswith("##FASTA"):
506
+ # This simple script does not preserve FASTA section safely.
507
+ # Stop before FASTA.
508
+ break
509
+ print(h, file=out)
510
+
511
+ for f in features:
512
+ print(f.to_gff3(), file=out)
513
+
514
+ finally:
515
+ if close:
516
+ out.close()
517
+
518
+
519
def build_parser() -> argparse.ArgumentParser:
    """
    Create the command-line parser: -i/--input and -o/--output are
    required, --replace-existing-utrs is a flag and --id-prefix has a
    default value.
    """
    parser = argparse.ArgumentParser(
        description="Add missing UTR features from exon and CDS, similar to AGAT _check_utrs()."
    )
    add = parser.add_argument

    add(
        "-i",
        "--input",
        required=True,
        help="Input GFF3/GTF file. Use '-' for stdin. .gz is supported.",
    )
    add(
        "-o",
        "--output",
        required=True,
        help="Output GFF3 file. Use '-' for stdout.",
    )

    replace_help = (
        "Remove existing UTR/five_prime_UTR/three_prime_UTR features "
        "and recreate UTRs from exon/CDS. This is closer to AGAT's correction mode."
    )
    add("--replace-existing-utrs", action="store_true", help=replace_help)

    add(
        "--id-prefix",
        default="agat_utrs_",
        help="Prefix used for newly created UTR IDs.",
    )

    return parser
554
+
555
+
556
def main():
    """
    Command-line entry point: parse the arguments, read the input
    annotation, add the UTRs and write the result as GFF3.
    """
    options = build_parser().parse_args()

    headers, parsed = read_gff(options.input)

    write_gff3(
        headers,
        add_utrs_like_agat(
            parsed,
            replace_existing_utrs=options.replace_existing_utrs,
            id_prefix=options.id_prefix,
        ),
        options.output,
    )


if __name__ == "__main__":
    main()