gffkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,530 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ detect_bridge_merged_genes.py
6
+
7
+ 用途:
8
+ 检测 GFF3 中这种异常情况:
9
+ “一个 gene 下的转录本分成多个离散簇,且只有少数桥接转录本跨簇连接”,
10
+ 常见于自动注释把多个独立基因错误合并成一个 gene locus。
11
+
12
+ 作者思路:
13
+ 1. 解析 gene / mRNA(transcript) / exon / CDS
14
+ 2. 对每个 transcript 提取 exon blocks
15
+ 3. 优先识别“桥接型 transcript”:
16
+ - 含超长内含子
17
+ - exon 分布成多个远距离块
18
+ 4. 用非桥接 transcript 构建“核心簇”
19
+ 5. 若 gene 中存在 >=2 个核心簇,且桥接 transcript 能跨多个簇,则输出为可疑异常
20
+
21
+ 输入:
22
+ python detect_bridge_merged_genes.py -i input.gff3 -o suspicious.tsv
23
+
24
+ 可选参数:
25
+ --min-gap 认为两个 exon 块“相距很远”的最小 gap,默认 10000
26
+ --cluster-gap 构建核心簇时,两个 transcript span 允许合并的最大 gap,默认 2000
27
+ --min-core-tx-per-cluster 每个核心簇至少包含多少个非桥接 transcript,默认 1
28
+ --min-bridge-count 至少需要多少条桥接 transcript 才输出,默认 1
29
+ --no-use-cds-if-no-exon do not fall back to CDS when a transcript has no exon (the fallback is enabled by default)
30
+
31
+ 输出列:
32
+ gene_id
33
+ chrom
34
+ start
35
+ end
36
+ strand
37
+ total_tx
38
+ core_tx
39
+ bridge_tx
40
+ cluster_count
41
+ cluster_ranges
42
+ cluster_members
43
+ bridge_members
44
+ """
45
+
46
+ import argparse
47
+ import sys
48
+ from collections import defaultdict
49
+
50
+
51
+ # ----------------------------
52
+ # 数据结构
53
+ # ----------------------------
54
+
55
class Gene:
    """A gene locus and the transcripts that were attached to it."""

    def __init__(self, gid, chrom, start, end, strand):
        # Identifier and sequence the locus lives on.
        self.id, self.chrom = gid, chrom
        # Coordinates exactly as passed in by the caller.
        self.start, self.end = start, end
        self.strand = strand
        # Starts empty; a fresh list per instance.
        self.transcripts = []
63
+
64
+
65
class Transcript:
    """A transcript with the exon and CDS intervals collected for it."""

    def __init__(self, tid, parent_gene, chrom, start, end, strand):
        self.id, self.parent_gene = tid, parent_gene
        self.chrom = chrom
        self.start, self.end = start, end
        self.strand = strand
        # Lists of (start, end) tuples; both start empty.
        self.exons = []
        self.cdss = []

    def get_blocks(self, use_cds_if_no_exon=True):
        """Return the sorted, merged intervals describing this transcript.

        Exons are used when present.  Without exons, CDS intervals are used
        if ``use_cds_if_no_exon`` is true.  If neither is available the
        transcript's own (start, end) is the single block.
        """
        if self.exons:
            chosen = list(self.exons)
        elif use_cds_if_no_exon and self.cdss:
            chosen = list(self.cdss)
        else:
            # Nothing usable: degrade to the transcript's own span.
            chosen = [(self.start, self.end)]
        return merge_intervals(sorted(chosen))

    def span(self, use_cds_if_no_exon=True):
        """Return (start of first block, end of last block)."""
        merged = self.get_blocks(use_cds_if_no_exon=use_cds_if_no_exon)
        first_start = merged[0][0]
        last_end = merged[-1][1]
        return (first_start, last_end)

    def introns(self, use_cds_if_no_exon=True):
        """Return the gap length between each pair of consecutive blocks.

        Lengths are clamped at zero.
        """
        merged = self.get_blocks(use_cds_if_no_exon=use_cds_if_no_exon)
        return [
            max(0, right[0] - left[1] - 1)
            for left, right in zip(merged, merged[1:])
        ]
105
+
106
+
107
+ # ----------------------------
108
+ # 工具函数
109
+ # ----------------------------
110
+
111
def parse_attributes(attr_str):
    """Parse a GFF3 column-9 attribute string into a dict.

    The string is split on ``;`` and each item on its first ``=``.
    Whitespace around keys and values is stripped, so files written with
    ``"; "`` as the separator still yield clean keys such as ``Parent``.
    Items without ``=`` or with an empty key are ignored.  When a key
    repeats, the last value wins.

    Args:
        attr_str: Raw text of the attribute column.

    Returns:
        dict mapping attribute name to its (unsplit, undecoded) value.
    """
    attrs = {}
    for item in attr_str.strip().split(";"):
        key, sep, value = item.partition("=")
        key = key.strip()
        if not sep or not key:
            # Empty fragment (e.g. trailing ';') or a bare flag: skip it.
            continue
        attrs[key] = value.strip()
    return attrs
123
+
124
+
125
def merge_intervals(intervals):
    """Merge overlapping or directly adjacent closed intervals.

    Two intervals are merged when the next start is at most one past the
    current end (``s <= e + 1``).  The input is sorted here, so callers may
    pass intervals in any order; already-sorted input gives the same result
    as before.

    Args:
        intervals: Iterable of (start, end) pairs.

    Returns:
        List of (start, end) tuples, sorted and non-touching.
    """
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1] + 1:
            # Extend the current run; never shrink it for nested intervals.
            if end > merged[-1][1]:
                merged[-1][1] = end
        else:
            merged.append([start, end])
    return [(s, e) for s, e in merged]
138
+
139
+
140
def interval_gap(a, b):
    """Return the number of positions strictly between two intervals.

    The result is 0 when the intervals overlap or touch.
    """
    (a_start, a_end), (b_start, b_end) = a, b
    if a_end < b_start:
        # a lies entirely to the left of b.
        return b_start - a_end - 1
    if b_end < a_start:
        # b lies entirely to the left of a.
        return a_start - b_end - 1
    return 0
152
+
153
+
154
def interval_overlap(a, b):
    """Return True when two closed intervals share at least one position."""
    a_start, a_end = a
    b_start, b_end = b
    # Neither interval ends before the other begins.
    return a_end >= b_start and b_end >= a_start
161
+
162
+
163
def interval_overlap_or_close(a, b, max_gap=0):
    """Return True when the intervals overlap or lie within ``max_gap``."""
    distance = interval_gap(a, b)
    return distance <= max_gap
168
+
169
+
170
def union_find_cluster(items, get_interval, max_gap):
    """Single-linkage clustering of items by interval proximity.

    Two items are linked when their intervals overlap or are at most
    ``max_gap`` apart; clusters are the connected components of those links.

    Args:
        items: List of arbitrary objects.
        get_interval: Callable mapping an item to its (start, end).
        max_gap: Largest gap that still links two intervals.

    Returns:
        List of clusters (lists of items).  Clusters appear in the order of
        their first member in ``items``, and members keep their input order.
    """
    n = len(items)
    # Evaluate each interval once; the pair loop below would otherwise call
    # get_interval O(n^2) times.
    intervals = [get_interval(item) for item in items]
    parent = list(range(n))

    def find(x):
        # Walk to the root, shortening the path along the way.
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(x, y):
        rx, ry = find(x), find(y)
        if rx != ry:
            parent[ry] = rx

    for i in range(n):
        for j in range(i + 1, n):
            if interval_overlap_or_close(intervals[i], intervals[j], max_gap=max_gap):
                union(i, j)

    groups = defaultdict(list)
    for i, item in enumerate(items):
        groups[find(i)].append(item)

    return list(groups.values())
202
+
203
+
204
def blocks_to_groups(blocks, min_gap):
    """Split ordered blocks into groups at every gap of at least ``min_gap``.

    Each group is collapsed to one (start, end) span: the start of its first
    block and the end of its last block.

    Example with ``min_gap=10000``:
        [(100,200), (300,350), (20000,20100), (20200,20300)]
        -> [(100,350), (20000,20300)]
    """
    if not blocks:
        return []

    spans = []
    group_start = blocks[0][0]
    last_end = blocks[0][1]
    for blk in blocks[1:]:
        if blk[0] - last_end - 1 >= min_gap:
            # Far enough from the previous block: close the running group.
            spans.append((group_start, last_end))
            group_start = blk[0]
        last_end = blk[1]
    spans.append((group_start, last_end))
    return spans
228
+
229
+
230
def transcript_is_bridge_candidate(tx, min_gap, use_cds_if_no_exon=True):
    """Return True when a transcript looks like a bridging transcript.

    Either condition is sufficient:
      1. it has an intron of at least ``min_gap``;
      2. its blocks split into two or more far-apart groups.
    """
    blocks = tx.get_blocks(use_cds_if_no_exon=use_cds_if_no_exon)
    intron_lengths = tx.introns(use_cds_if_no_exon=use_cds_if_no_exon)

    has_long_intron = any(length >= min_gap for length in intron_lengths)
    if has_long_intron:
        return True

    far_groups = blocks_to_groups(blocks, min_gap=min_gap)
    return len(far_groups) >= 2
247
+
248
+
249
def cluster_range(transcripts, use_cds_if_no_exon=True):
    """Return the (smallest start, largest end) over the transcripts' spans."""
    bounds = [tx.span(use_cds_if_no_exon=use_cds_if_no_exon) for tx in transcripts]
    lows = [bound[0] for bound in bounds]
    highs = [bound[1] for bound in bounds]
    return (min(lows), max(highs))
255
+
256
+
257
def transcript_hits_clusters(tx, cluster_ranges, use_cds_if_no_exon=True):
    """Return the indices of the clusters that a transcript touches.

    A cluster counts as hit when any single block of the transcript overlaps
    the cluster's range, so a long intron crossing a cluster does not count.

    Args:
        tx: Object exposing ``get_blocks`` and ``span``.
        cluster_ranges: Sequence of (start, end) cluster ranges.
        use_cds_if_no_exon: Passed through to ``tx.get_blocks`` / ``tx.span``.

    Returns:
        Ascending list of 0-based indices into ``cluster_ranges``.
    """
    blocks = tx.get_blocks(use_cds_if_no_exon=use_cds_if_no_exon)
    # The merged blocks are matched directly; the whole span is only a
    # fallback for an empty block list.
    spans = blocks if blocks else [tx.span(use_cds_if_no_exon=use_cds_if_no_exon)]

    return [
        idx
        for idx, cr in enumerate(cluster_ranges)
        if any(interval_overlap(sp, cr) for sp in spans)
    ]
275
+
276
+
277
+ # ----------------------------
278
+ # GFF3 解析
279
+ # ----------------------------
280
+
281
def read_gff3(gff_file):
    """Read a GFF3 file and return ``{gene_id: Gene}``.

    Blank lines, lines starting with ``#`` and lines without exactly nine
    tab-separated columns are skipped.  Only ``gene``, ``mRNA``/``transcript``
    and ``exon``/``CDS`` rows are used.  Transcripts are attached to the gene
    named by the first entry of their ``Parent`` attribute; exon/CDS rows are
    attached to every known transcript listed in their ``Parent`` attribute.
    """
    genes = {}
    transcripts = {}
    # exon/CDS rows are buffered so they can be attached once every
    # transcript has been read; Parent may list several transcripts.
    pending_children = []

    with open(gff_file, "r", encoding="utf-8") as handle:
        for raw in handle:
            if raw.startswith("#") or not raw.strip():
                continue

            fields = raw.rstrip("\n").split("\t")
            if len(fields) != 9:
                continue

            chrom, _source, ftype, start_txt, end_txt, _score, strand, _phase, attr_text = fields
            start = int(start_txt)
            end = int(end_txt)
            attrs = parse_attributes(attr_text)

            if ftype == "gene":
                gid = attrs.get("ID")
                if gid:
                    genes[gid] = Gene(gid, chrom, start, end, strand)

            elif ftype in ("mRNA", "transcript"):
                tid = attrs.get("ID")
                parent = attrs.get("Parent")
                if tid and parent:
                    # Only the first listed parent is used as the gene.
                    first_parent = parent.split(",")[0]
                    transcripts[tid] = Transcript(tid, first_parent, chrom, start, end, strand)

            elif ftype in ("exon", "CDS"):
                parents = attrs.get("Parent")
                if parents:
                    pending_children.append((ftype, start, end, parents.split(",")))

    # Attach transcripts to their genes.
    for tx in transcripts.values():
        owner = genes.get(tx.parent_gene)
        if owner is not None:
            owner.transcripts.append(tx)

    # Attach exon/CDS intervals to their transcripts.
    for ftype, start, end, parent_ids in pending_children:
        for pid in parent_ids:
            tx = transcripts.get(pid)
            if tx is None:
                continue
            target = tx.exons if ftype == "exon" else tx.cdss
            target.append((start, end))

    return genes
340
+
341
+
342
+ # ----------------------------
343
+ # 核心检测逻辑
344
+ # ----------------------------
345
+
346
def analyze_gene(
    gene,
    min_gap=10000,
    cluster_gap=2000,
    min_core_tx_per_cluster=1,
    min_bridge_count=1,
    use_cds_if_no_exon=True
):
    """Decide whether a gene looks like several loci joined by bridge transcripts.

    Args:
        gene: Object with ``id``, ``chrom``, ``start``, ``end``, ``strand``
            and a ``transcripts`` list.
        min_gap: Gap / intron length from which a transcript is treated as a
            bridge candidate.
        cluster_gap: Largest gap at which two core transcript spans are still
            put in the same cluster.
        min_core_tx_per_cluster: Clusters with fewer core transcripts are
            dropped.
        min_bridge_count: Minimum number of bridge transcripts hitting at
            least two clusters.
        use_cds_if_no_exon: Passed through to the transcript helpers.

    Returns:
        None when the gene is not suspicious, otherwise a dict with the keys
        gene_id, chrom, start, end, strand, total_tx, core_tx, bridge_tx,
        cluster_count, cluster_ranges, cluster_members, bridge_members.
    """
    if len(gene.transcripts) < 2:
        return None

    # Split transcripts into bridge candidates and core candidates.
    bridge_candidates = []
    core_candidates = []
    for tx in gene.transcripts:
        if transcript_is_bridge_candidate(tx, min_gap=min_gap, use_cds_if_no_exon=use_cds_if_no_exon):
            bridge_candidates.append(tx)
        else:
            core_candidates.append(tx)

    # Two clusters need at least two core transcripts.
    if len(core_candidates) < 2:
        return None

    # Build core clusters from the non-bridge transcripts.
    core_clusters = union_find_cluster(
        core_candidates,
        get_interval=lambda x: x.span(use_cds_if_no_exon=use_cds_if_no_exon),
        max_gap=cluster_gap
    )

    # Drop clusters that are too small.
    core_clusters = [c for c in core_clusters if len(c) >= min_core_tx_per_cluster]
    if len(core_clusters) < 2:
        return None

    core_cluster_ranges = [
        cluster_range(c, use_cds_if_no_exon=use_cds_if_no_exon)
        for c in core_clusters
    ]

    # Keep only the candidates that really touch two or more clusters.
    true_bridges = []
    bridge_hit_info = {}
    for tx in bridge_candidates:
        hits = transcript_hits_clusters(tx, core_cluster_ranges, use_cds_if_no_exon=use_cds_if_no_exon)
        if len(hits) >= 2:
            true_bridges.append(tx)
            bridge_hit_info[tx.id] = hits

    if len(true_bridges) < min_bridge_count:
        return None

    # Format the report, reusing the ranges computed above instead of
    # recomputing them per cluster.  Clusters are labelled C1, C2, ...
    cluster_range_strs = []
    cluster_member_strs = []
    for i, (c, cr) in enumerate(zip(core_clusters, core_cluster_ranges), start=1):
        cluster_range_strs.append(f"C{i}:{cr[0]}-{cr[1]}")
        cluster_member_strs.append(f"C{i}:" + ",".join(sorted(tx.id for tx in c)))

    bridge_member_strs = []
    for tx in sorted(true_bridges, key=lambda x: x.id):
        hit_clusters = ",".join(f"C{i+1}" for i in bridge_hit_info[tx.id])
        bridge_member_strs.append(f"{tx.id}[{hit_clusters}]")

    return {
        "gene_id": gene.id,
        "chrom": gene.chrom,
        "start": gene.start,
        "end": gene.end,
        "strand": gene.strand,
        "total_tx": len(gene.transcripts),
        "core_tx": len(core_candidates),
        "bridge_tx": len(true_bridges),
        "cluster_count": len(core_clusters),
        "cluster_ranges": ";".join(cluster_range_strs),
        "cluster_members": ";".join(cluster_member_strs),
        "bridge_members": ";".join(bridge_member_strs)
    }
436
+
437
+
438
+ # ----------------------------
439
+ # 主程序
440
+ # ----------------------------
441
+
442
def main():
    """Command-line entry point: scan a GFF3 file and write suspicious genes.

    Reads the input with ``read_gff3``, runs ``analyze_gene`` on every gene in
    sorted ID order, writes one TSV row per flagged gene and prints a short
    summary to stderr.
    """
    parser = argparse.ArgumentParser(
        description="检测 GFF3 中“一个 gene 下多个离散 transcript 簇被少数桥接 transcript 连接”的可疑错误合并基因。"
    )
    parser.add_argument("-i", "--input", required=True, help="输入 GFF3 文件")
    parser.add_argument("-o", "--output", required=True, help="输出 TSV 文件")
    parser.add_argument(
        "--min-gap", type=int, default=10000,
        help="判定桥接 transcript 的最小远距离 gap / 超长内含子阈值,默认 10000",
    )
    parser.add_argument(
        "--cluster-gap", type=int, default=2000,
        help="核心 transcript 聚类时允许的最大 gap,默认 2000",
    )
    parser.add_argument(
        "--min-core-tx-per-cluster", type=int, default=1,
        help="每个核心簇至少包含多少条非桥接 transcript,默认 1",
    )
    parser.add_argument(
        "--min-bridge-count", type=int, default=1,
        help="至少多少条真实桥接 transcript 才输出,默认 1",
    )
    parser.add_argument(
        "--no-use-cds-if-no-exon", action="store_true",
        help="若 transcript 没有 exon,则不要回退使用 CDS",
    )

    opts = parser.parse_args()

    # The flag is negative on the command line, positive internally.
    allow_cds_fallback = not opts.no_use_cds_if_no_exon

    genes = read_gff3(opts.input)

    columns = [
        "gene_id",
        "chrom",
        "start",
        "end",
        "strand",
        "total_tx",
        "core_tx",
        "bridge_tx",
        "cluster_count",
        "cluster_ranges",
        "cluster_members",
        "bridge_members",
    ]

    flagged = 0
    with open(opts.output, "w", encoding="utf-8") as out:
        out.write("\t".join(columns) + "\n")

        for gid in sorted(genes):
            result = analyze_gene(
                genes[gid],
                min_gap=opts.min_gap,
                cluster_gap=opts.cluster_gap,
                min_core_tx_per_cluster=opts.min_core_tx_per_cluster,
                min_bridge_count=opts.min_bridge_count,
                use_cds_if_no_exon=allow_cds_fallback,
            )
            if not result:
                continue
            flagged += 1
            out.write("\t".join(str(result[name]) for name in columns) + "\n")

    # Every gene in the dict was checked exactly once.
    checked = len(genes)
    sys.stderr.write(
        f"[INFO] Total genes checked: {checked}\n"
        f"[INFO] Suspicious merged genes: {flagged}\n"
        f"[INFO] Output written to: {opts.output}\n"
    )


if __name__ == "__main__":
    main()
gffkit/main.py ADDED
@@ -0,0 +1,178 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """Command line interface for gffkit."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import argparse
8
+ import sys
9
+ from pathlib import Path
10
+ from typing import Callable, List, Optional
11
+
12
+ from . import __version__
13
+
14
+
15
def _run_legacy_main(func: Callable[[], object], prog: str, args: List[str]) -> int:
    """Call a script-style ``main()`` with ``sys.argv`` swapped for the call.

    ``sys.argv`` is restored afterwards, even when ``func`` raises.  A return
    value of ``None`` maps to exit code 0; anything else goes through ``int``.
    """
    saved_argv = list(sys.argv)
    sys.argv = [prog, *args]
    try:
        outcome = func()
    finally:
        sys.argv = saved_argv
    if outcome is None:
        return 0
    return int(outcome)
24
+
25
+
26
def cmd_detect_bridge(args: argparse.Namespace, extra: List[str]) -> int:
    """Run the bridge detector with -i/-o plus any forwarded options."""
    from . import detect_bridge_merged_genes as legacy
    forwarded = ["-i", args.input, "-o", args.output, *extra]
    return _run_legacy_main(legacy.main, "gffkit detect-bridge", forwarded)
31
+
32
+
33
def cmd_complement(args: argparse.Namespace, extra: List[str]) -> int:
    """Run the annotation complement step, forwarding unknown options."""
    from . import complement_annotations as legacy
    forwarded = ["--ref", args.ref]
    # One --add pair per supplementary file, in the order given.
    for add_file in args.add:
        forwarded.extend(("--add", add_file))
    if args.output:
        forwarded.extend(("--output", args.output))
    forwarded.extend(extra)
    return _run_legacy_main(legacy.main, "gffkit complement", forwarded)
42
+
43
+
44
def cmd_add_utr(args: argparse.Namespace, extra: List[str]) -> int:
    """Run the UTR step with -i/-o plus any forwarded options."""
    from . import add_utr as legacy
    forwarded = ["-i", args.input, "-o", args.output, *extra]
    return _run_legacy_main(legacy.main, "gffkit add-utr", forwarded)
49
+
50
+
51
def cmd_integrate(args: argparse.Namespace, extra: List[str]) -> int:
    """Run the full three-step annotation integration workflow.

    Steps: (1) detect suspicious merged genes in annotation A, (2) merge
    annotations A and B using the suspicious regions, (3) add UTR features.
    The workflow stops at the first step that returns a non-zero code.

    Args:
        args: Parsed options of the ``integrate`` sub-command.
        extra: Options the parser did not recognise.  They cannot be
            attributed to a single step, so they are not forwarded; a
            warning is printed instead of dropping them silently.

    Returns:
        0 on success, otherwise the exit code of the failing step.
    """
    if extra:
        print(
            f"[gffkit] Warning: ignoring unrecognized arguments: {' '.join(extra)}",
            file=sys.stderr,
        )

    from . import detect_bridge_merged_genes as detect_mod
    from . import complement_annotations as complement_mod
    from . import add_utr as utr_mod

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    # Explicit paths win; otherwise files are placed in outdir under the prefix.
    suspicious_tsv = Path(args.suspicious_tsv) if args.suspicious_tsv else outdir / f"{args.prefix}.suspicious.tsv"
    merged_gff = Path(args.merged_gff) if args.merged_gff else outdir / f"{args.prefix}.merged.gff3"
    final_gff = Path(args.output) if args.output else outdir / f"{args.prefix}.final.withUTR.gff3"

    print("[gffkit] Step 1/3: detecting suspicious merged genes", file=sys.stderr)
    detect_cli = [
        "-i", args.annotation_a,
        "-o", str(suspicious_tsv),
        "--min-gap", str(args.min_gap),
        "--cluster-gap", str(args.cluster_gap),
        "--min-core-tx-per-cluster", str(args.min_core_tx_per_cluster),
        "--min-bridge-count", str(args.min_bridge_count),
    ]
    if args.no_use_cds_if_no_exon:
        detect_cli.append("--no-use-cds-if-no-exon")
    ret = _run_legacy_main(detect_mod.main, "gffkit detect-bridge", detect_cli)
    if ret != 0:
        return ret

    print("[gffkit] Step 2/3: region-aware annotation merging", file=sys.stderr)
    complement_cli = [
        "--ref", args.annotation_a,
        "--add", args.annotation_b,
        "--swap_region_tsv", str(suspicious_tsv),
        "--swap_region_flank", str(args.swap_region_flank),
        "--size_min", str(args.size_min),
        "--output", str(merged_gff),
    ]
    ret = _run_legacy_main(complement_mod.main, "gffkit complement", complement_cli)
    if ret != 0:
        return ret

    print("[gffkit] Step 3/3: adding UTR features", file=sys.stderr)
    utr_cli = ["-i", str(merged_gff), "-o", str(final_gff), "--id-prefix", args.utr_id_prefix]
    if args.replace_existing_utrs:
        utr_cli.append("--replace-existing-utrs")
    ret = _run_legacy_main(utr_mod.main, "gffkit add-utr", utr_cli)
    if ret != 0:
        return ret

    print("[gffkit] Done", file=sys.stderr)
    print(f"[gffkit] suspicious TSV: {suspicious_tsv}", file=sys.stderr)
    print(f"[gffkit] merged GFF3: {merged_gff}", file=sys.stderr)
    print(f"[gffkit] final GFF3: {final_gff}", file=sys.stderr)
    return 0
105
+
106
+
107
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level ``gffkit`` parser with its four sub-commands.

    Each sub-parser stores its handler function under ``handler``.
    """
    root = argparse.ArgumentParser(
        prog="gffkit",
        description="Utilities for detecting merged-gene artifacts, region-aware GFF merging, and UTR reconstruction.",
    )
    root.add_argument("--version", action="version", version=f"gffkit {__version__}")

    commands = root.add_subparsers(dest="command", required=True)

    # detect-bridge
    detect = commands.add_parser(
        "detect-bridge",
        help="Detect suspicious merged genes caused by bridge transcripts.",
        description="Wrapper around detect_bridge_merged_genes.py. Unknown options are forwarded.",
    )
    detect.add_argument("-i", "--input", required=True, help="Input GFF3 file, usually Annotation A.")
    detect.add_argument("-o", "--output", required=True, help="Output suspicious.tsv file.")
    detect.set_defaults(handler=cmd_detect_bridge)

    # complement
    complement = commands.add_parser(
        "complement",
        help="Complement/merge GFF annotations with optional region-swap mode.",
        description="Wrapper around complement_annotations.py. Unknown options are forwarded.",
    )
    complement.add_argument("--ref", "-r", "-i", required=True, help="Reference GFF/GTF file.")
    complement.add_argument(
        "--add", "-a", action="append", required=True,
        help="Supplementary GFF/GTF file; can be repeated.",
    )
    complement.add_argument(
        "--output", "--out", "-o", default=None,
        help="Output GFF3 path. Default: stdout.",
    )
    complement.set_defaults(handler=cmd_complement)

    # add-utr
    add_utr = commands.add_parser(
        "add-utr",
        help="Add five_prime_UTR and three_prime_UTR features from exon/CDS coordinates.",
        description="Wrapper around add_utr.py. Unknown options are forwarded.",
    )
    add_utr.add_argument("-i", "--input", required=True, help="Input GFF3/GTF file.")
    add_utr.add_argument("-o", "--output", required=True, help="Output GFF3 file.")
    add_utr.set_defaults(handler=cmd_add_utr)

    # integrate
    integrate = commands.add_parser(
        "integrate",
        help="Run the full A/B region-aware integration workflow.",
        description=(
            "Detect suspicious merged-gene regions in Annotation A, use Annotation B as the local "
            "primary reference in those regions, then add UTR features."
        ),
    )
    # Inputs and output locations.
    integrate.add_argument("--annotation-a", "--a", required=True, help="Annotation A: EviAnn/RNA-seq-supported GFF3.")
    integrate.add_argument("--annotation-b", "--b", required=True, help="Annotation B: ANNEVO/deep-learning GFF3.")
    integrate.add_argument("--outdir", default="gffkit_out", help="Output directory.")
    integrate.add_argument("--prefix", default="gffkit", help="Output file prefix.")
    integrate.add_argument(
        "-o", "--output", default=None,
        help="Final GFF3 output path. Default: OUTDIR/PREFIX.final.withUTR.gff3",
    )
    integrate.add_argument("--suspicious-tsv", default=None, help="Optional path for intermediate suspicious.tsv.")
    integrate.add_argument("--merged-gff", default=None, help="Optional path for intermediate merged.gff3.")

    # Options handed to the detection step.
    integrate.add_argument("--min-gap", type=int, default=10000, help="Bridge-candidate long gap threshold.")
    integrate.add_argument("--cluster-gap", type=int, default=2000, help="Core cluster maximum gap.")
    integrate.add_argument(
        "--min-core-tx-per-cluster", type=int, default=1,
        help="Minimum core transcripts per cluster.",
    )
    integrate.add_argument(
        "--min-bridge-count", type=int, default=1,
        help="Minimum true bridge transcripts required.",
    )
    integrate.add_argument(
        "--no-use-cds-if-no-exon", action="store_true",
        help="Do not use CDS when transcript has no exon.",
    )

    # Options handed to the merge and UTR steps.
    integrate.add_argument("--swap-region-flank", type=int, default=100, help="Flanking bp added to suspicious regions.")
    integrate.add_argument(
        "--size-min", type=int, default=0,
        help="Minimum CDS size for non-overlapping supplementary roots.",
    )
    integrate.add_argument(
        "--replace-existing-utrs", action="store_true",
        help="Remove existing UTRs and recreate them.",
    )
    integrate.add_argument("--utr-id-prefix", default="gffkit_utr_", help="Prefix for newly created UTR IDs.")
    integrate.set_defaults(handler=cmd_integrate)

    return root
173
+
174
+
175
def main(argv: Optional[List[str]] = None) -> int:
    """Parse ``argv`` and dispatch to the selected sub-command handler.

    Options the parser does not recognise are passed to the handler as a
    second argument.
    """
    namespace, passthrough = build_parser().parse_known_args(argv)
    handler = namespace.handler
    return handler(namespace, passthrough)