gffkit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gffkit/__init__.py +3 -0
- gffkit/__main__.py +4 -0
- gffkit/add_utr.py +571 -0
- gffkit/complement_annotations.py +1001 -0
- gffkit/detect_bridge_merged_genes.py +530 -0
- gffkit/main.py +178 -0
- gffkit-0.1.0.dist-info/LICENSE +21 -0
- gffkit-0.1.0.dist-info/METADATA +96 -0
- gffkit-0.1.0.dist-info/RECORD +12 -0
- gffkit-0.1.0.dist-info/WHEEL +5 -0
- gffkit-0.1.0.dist-info/entry_points.txt +5 -0
- gffkit-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,530 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
detect_bridge_merged_genes.py
|
|
6
|
+
|
|
7
|
+
用途:
|
|
8
|
+
检测 GFF3 中这种异常情况:
|
|
9
|
+
“一个 gene 下的转录本分成多个离散簇,且只有少数桥接转录本跨簇连接”,
|
|
10
|
+
常见于自动注释把多个独立基因错误合并成一个 gene locus。
|
|
11
|
+
|
|
12
|
+
作者思路:
|
|
13
|
+
1. 解析 gene / mRNA(transcript) / exon / CDS
|
|
14
|
+
2. 对每个 transcript 提取 exon blocks
|
|
15
|
+
3. 优先识别“桥接型 transcript”:
|
|
16
|
+
- 含超长内含子
|
|
17
|
+
- exon 分布成多个远距离块
|
|
18
|
+
4. 用非桥接 transcript 构建“核心簇”
|
|
19
|
+
5. 若 gene 中存在 >=2 个核心簇,且桥接 transcript 能跨多个簇,则输出为可疑异常
|
|
20
|
+
|
|
21
|
+
输入:
|
|
22
|
+
python detect_bridge_merged_genes.py -i input.gff3 -o suspicious.tsv
|
|
23
|
+
|
|
24
|
+
可选参数:
|
|
25
|
+
--min-gap 认为两个 exon 块“相距很远”的最小 gap,默认 10000
|
|
26
|
+
--cluster-gap 构建核心簇时,两个 transcript span 允许合并的最大 gap,默认 2000
|
|
27
|
+
--min-core-tx-per-cluster 每个核心簇至少包含多少个非桥接 transcript,默认 1
|
|
28
|
+
--min-bridge-count 至少需要多少条桥接 transcript 才输出,默认 1
|
|
29
|
+
--no-use-cds-if-no-exon Do not fall back to CDS when a transcript has no exon (the CDS fallback is enabled by default)
|
|
30
|
+
|
|
31
|
+
输出列:
|
|
32
|
+
gene_id
|
|
33
|
+
chrom
|
|
34
|
+
start
|
|
35
|
+
end
|
|
36
|
+
strand
|
|
37
|
+
total_tx
|
|
38
|
+
core_tx
|
|
39
|
+
bridge_tx
|
|
40
|
+
cluster_count
|
|
41
|
+
cluster_ranges
|
|
42
|
+
cluster_members
|
|
43
|
+
bridge_members
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
import argparse
|
|
47
|
+
import sys
|
|
48
|
+
from collections import defaultdict
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ----------------------------
|
|
52
|
+
# 数据结构
|
|
53
|
+
# ----------------------------
|
|
54
|
+
|
|
55
|
+
class Gene:
    """A gene record together with the transcripts attached to it.

    The constructor stores its arguments unchanged; ``transcripts`` starts
    as an empty list that callers append Transcript objects to.
    """

    def __init__(self, gid, chrom, start, end, strand):
        # Identity and location, stored exactly as supplied by the caller.
        self.id, self.chrom = gid, chrom
        self.start, self.end = start, end
        self.strand = strand
        # Child transcripts; filled in after construction.
        self.transcripts = list()
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class Transcript:
    """A transcript with its exon and CDS intervals.

    ``exons`` and ``cdss`` are lists of ``(start, end)`` tuples that callers
    append to after construction.
    """

    def __init__(self, tid, parent_gene, chrom, start, end, strand):
        self.id, self.parent_gene = tid, parent_gene
        self.chrom = chrom
        self.start, self.end = start, end
        self.strand = strand
        self.exons, self.cdss = [], []

    def get_blocks(self, use_cds_if_no_exon=True):
        """Return the sorted, merged blocks ``[(s1, e1), (s2, e2), ...]``.

        Exons are used when present. Without exons, CDS intervals are used
        if ``use_cds_if_no_exon`` is true. When neither yields an interval,
        the transcript's own ``(start, end)`` is the single block.
        """
        if self.exons:
            chosen = list(self.exons)
        elif use_cds_if_no_exon and self.cdss:
            chosen = list(self.cdss)
        else:
            # Neither exon nor usable CDS: degrade to the transcript span.
            chosen = [(self.start, self.end)]
        chosen.sort()
        return merge_intervals(chosen)

    def span(self, use_cds_if_no_exon=True):
        """Return ``(start of first block, end of last block)``."""
        merged = self.get_blocks(use_cds_if_no_exon=use_cds_if_no_exon)
        first, last = merged[0], merged[-1]
        return (first[0], last[1])

    def introns(self, use_cds_if_no_exon=True):
        """Return the gap lengths between consecutive blocks, clamped at 0."""
        merged = self.get_blocks(use_cds_if_no_exon=use_cds_if_no_exon)
        return [
            max(0, following[0] - current[1] - 1)
            for current, following in zip(merged, merged[1:])
        ]
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# ----------------------------
|
|
108
|
+
# 工具函数
|
|
109
|
+
# ----------------------------
|
|
110
|
+
|
|
111
|
+
def parse_attributes(attr_str):
    """Parse a GFF3 column-9 attribute string into a ``{key: value}`` dict.

    Items are separated by ``;`` and split on the first ``=``. Items without
    ``=`` (or with an empty key) are ignored. Whitespace around keys and
    values is trimmed so that loosely formatted input such as
    ``ID=t1; Parent=g1`` still yields the key ``Parent`` rather than
    ``" Parent"``. When a key repeats, the last value wins.
    """
    attrs = {}
    for item in attr_str.strip().split(";"):
        key, sep, value = item.partition("=")
        key = key.strip()
        if not sep or not key:
            continue
        attrs[key] = value.strip()
    return attrs
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def merge_intervals(intervals):
    """Merge overlapping or directly adjacent intervals.

    ``intervals`` must already be sorted by start. An interval is folded into
    the previous one when its start is at most one past the previous end.
    Returns a list of ``(start, end)`` tuples; empty input gives ``[]``.
    """
    if not intervals:
        return []

    head, *tail = intervals
    out = [[head[0], head[1]]]
    for start, end in tail:
        current = out[-1]
        if start > current[1] + 1:
            # A real gap: open a new merged interval.
            out.append([start, end])
        elif end > current[1]:
            current[1] = end
    return [(lo, hi) for lo, hi in out]
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def interval_gap(a, b):
    """Return the number of positions strictly between two intervals.

    Overlapping or touching intervals give 0. Argument order is irrelevant.
    """
    (a_start, a_end), (b_start, b_end) = a, b
    if a_end < b_start:
        return b_start - a_end - 1
    if b_end < a_start:
        return a_start - b_end - 1
    return 0
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def interval_overlap(a, b):
    """Return True when the two closed intervals share at least one position."""
    a_start, a_end = a
    b_start, b_end = b
    return a_end >= b_start and b_end >= a_start
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def interval_overlap_or_close(a, b, max_gap=0):
    """Return True when the intervals overlap or lie at most ``max_gap`` apart."""
    distance = interval_gap(a, b)
    return distance <= max_gap
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def union_find_cluster(items, get_interval, max_gap):
    """Single-linkage clustering of items by interval proximity.

    Two items are linked when their intervals overlap or are at most
    ``max_gap`` apart; clusters are the connected components of those links.

    Args:
        items: list of arbitrary objects.
        get_interval: callable mapping an item to ``(start, end)``.
        max_gap: largest gap at which two intervals are still linked.

    Returns:
        A list of clusters, each a list of items. Clusters are ordered by
        their first member's position in ``items``, and members keep their
        input order.
    """
    n = len(items)
    # Evaluate each interval once; the pair loop below would otherwise call
    # get_interval O(n^2) times.
    intervals = [get_interval(item) for item in items]
    parent = list(range(n))

    def find(x):
        # Path halving keeps the trees shallow.
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(x, y):
        rx, ry = find(x), find(y)
        if rx != ry:
            parent[ry] = rx

    for i in range(n):
        a = intervals[i]
        for j in range(i + 1, n):
            if interval_overlap_or_close(a, intervals[j], max_gap=max_gap):
                union(i, j)

    groups = defaultdict(list)
    for i in range(n):
        groups[find(i)].append(items[i])

    return list(groups.values())
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def blocks_to_groups(blocks, min_gap):
    """Split sorted blocks into groups at every gap of at least ``min_gap``.

    Each group is returned as one span: the start of its first block and the
    end of its last block. For example, with ``min_gap=10000`` the blocks
    ``[(100,200), (300,350), (20000,20100), (20200,20300)]`` give
    ``[(100, 350), (20000, 20300)]``. Empty input gives ``[]``.
    """
    if not blocks:
        return []

    spans = []
    group_start, last_end = blocks[0][0], blocks[0][1]
    for block in blocks[1:]:
        if block[0] - last_end - 1 >= min_gap:
            # Distant block: close the running group and start a new one.
            spans.append((group_start, last_end))
            group_start = block[0]
        last_end = block[1]
    spans.append((group_start, last_end))
    return spans
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def transcript_is_bridge_candidate(tx, min_gap, use_cds_if_no_exon=True):
    """Return True when a transcript looks like a "bridge" transcript.

    Either condition is sufficient:
      1. it has an intron of length >= ``min_gap``;
      2. its blocks split into two or more groups separated by >= ``min_gap``.
    """
    for intron_len in tx.introns(use_cds_if_no_exon=use_cds_if_no_exon):
        if intron_len >= min_gap:
            return True

    blocks = tx.get_blocks(use_cds_if_no_exon=use_cds_if_no_exon)
    distant_groups = blocks_to_groups(blocks, min_gap=min_gap)
    return len(distant_groups) >= 2
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def cluster_range(transcripts, use_cds_if_no_exon=True):
    """Return ``(min start, max end)`` over the spans of the given transcripts.

    Raises ValueError when ``transcripts`` is empty.
    """
    starts, ends = [], []
    for tx in transcripts:
        span = tx.span(use_cds_if_no_exon=use_cds_if_no_exon)
        starts.append(span[0])
        ends.append(span[1])
    return (min(starts), max(ends))
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def transcript_hits_clusters(tx, cluster_ranges, use_cds_if_no_exon=True):
    """Return the sorted indices of the clusters a transcript touches.

    A cluster counts as hit when at least one of the transcript's merged
    blocks overlaps the cluster's ``(start, end)`` range. Indices refer to
    positions in ``cluster_ranges``.
    """
    blocks = tx.get_blocks(use_cds_if_no_exon=use_cds_if_no_exon)
    # Defensive fallback kept from the original: use the overall span if
    # get_blocks ever returns nothing.
    spans = blocks if blocks else [tx.span(use_cds_if_no_exon=use_cds_if_no_exon)]

    # enumerate yields ascending, unique indices, so the result is already
    # sorted and duplicate-free.
    return [
        idx
        for idx, cr in enumerate(cluster_ranges)
        if any(interval_overlap(sp, cr) for sp in spans)
    ]
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# ----------------------------
|
|
278
|
+
# GFF3 解析
|
|
279
|
+
# ----------------------------
|
|
280
|
+
|
|
281
|
+
def read_gff3(gff_file):
    """Read a GFF3 file and return ``{gene_id: Gene}``.

    Only ``gene``, ``mRNA``/``transcript``, ``exon`` and ``CDS`` features are
    used. Transcripts are attached to the gene named by the first entry of
    their ``Parent`` attribute. Exon/CDS features are attached to every
    transcript listed in their (comma-separated) ``Parent``. Children are
    linked after the whole file is read, so feature order does not matter.

    Blank lines, comment lines and lines without exactly nine tab-separated
    columns are skipped. Reading stops at a ``##FASTA`` directive, which ends
    the annotation part of a GFF3 file; this avoids scanning embedded
    sequence lines.

    Raises:
        ValueError: if a start/end column is not an integer.
    """
    genes = {}
    transcripts = {}

    # Exon/CDS records are buffered because their Parent may list several
    # transcripts and may appear before those transcripts in the file.
    child_features = []

    with open(gff_file, "r", encoding="utf-8") as fh:
        for line in fh:
            if line.startswith("##FASTA"):
                break
            if not line.strip() or line.startswith("#"):
                continue

            parts = line.rstrip("\n").split("\t")
            if len(parts) != 9:
                continue

            chrom, _source, feature_type, start, end, _score, strand, _phase, attrs_str = parts
            start = int(start)
            end = int(end)
            attrs = parse_attributes(attrs_str)

            if feature_type == "gene":
                gid = attrs.get("ID")
                if not gid:
                    continue
                genes[gid] = Gene(gid, chrom, start, end, strand)

            elif feature_type in ("mRNA", "transcript"):
                tid = attrs.get("ID")
                parent = attrs.get("Parent")
                if not tid or not parent:
                    continue
                # Parent is expected to point at a gene; only the first one is used.
                parent_gene = parent.split(",")[0]
                transcripts[tid] = Transcript(tid, parent_gene, chrom, start, end, strand)

            elif feature_type in ("exon", "CDS"):
                parents = attrs.get("Parent")
                if not parents:
                    continue
                child_features.append(
                    (feature_type, chrom, start, end, strand, parents.split(","))
                )

    # Attach transcripts to their genes.
    for tx in transcripts.values():
        if tx.parent_gene in genes:
            genes[tx.parent_gene].transcripts.append(tx)

    # Attach exon/CDS intervals to their transcripts.
    for feature_type, _chrom, start, end, _strand, parent_list in child_features:
        for pid in parent_list:
            tx = transcripts.get(pid)
            if tx is None:
                continue
            if feature_type == "exon":
                tx.exons.append((start, end))
            elif feature_type == "CDS":
                tx.cdss.append((start, end))

    return genes
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
# ----------------------------
|
|
343
|
+
# 核心检测逻辑
|
|
344
|
+
# ----------------------------
|
|
345
|
+
|
|
346
|
+
def analyze_gene(
    gene,
    min_gap=10000,
    cluster_gap=2000,
    min_core_tx_per_cluster=1,
    min_bridge_count=1,
    use_cds_if_no_exon=True
):
    """Decide whether a gene looks like several genes merged by bridge transcripts.

    Transcripts are split into bridge candidates (long intron / distant block
    groups, threshold ``min_gap``) and core transcripts. Core transcripts are
    clustered by span with ``cluster_gap``; clusters smaller than
    ``min_core_tx_per_cluster`` are dropped. A bridge candidate is a true
    bridge when its blocks overlap at least two core clusters.

    Returns:
        None when the gene is not suspicious (fewer than two transcripts,
        fewer than two core transcripts or core clusters, or fewer than
        ``min_bridge_count`` true bridges); otherwise a dict with the keys
        gene_id, chrom, start, end, strand, total_tx, core_tx, bridge_tx,
        cluster_count, cluster_ranges, cluster_members, bridge_members.
    """
    if len(gene.transcripts) < 2:
        return None

    # Separate bridge candidates from core transcripts.
    bridge_candidates = []
    core_candidates = []
    for tx in gene.transcripts:
        if transcript_is_bridge_candidate(tx, min_gap=min_gap, use_cds_if_no_exon=use_cds_if_no_exon):
            bridge_candidates.append(tx)
        else:
            core_candidates.append(tx)

    # Two clusters need at least two core transcripts.
    if len(core_candidates) < 2:
        return None

    # Build core clusters from the non-bridge transcripts only.
    core_clusters = union_find_cluster(
        core_candidates,
        get_interval=lambda x: x.span(use_cds_if_no_exon=use_cds_if_no_exon),
        max_gap=cluster_gap
    )
    core_clusters = [c for c in core_clusters if len(c) >= min_core_tx_per_cluster]

    if len(core_clusters) < 2:
        return None

    core_cluster_ranges = [
        cluster_range(c, use_cds_if_no_exon=use_cds_if_no_exon)
        for c in core_clusters
    ]

    # Keep only candidates that really touch two or more clusters.
    true_bridges = []
    bridge_hit_info = {}
    for tx in bridge_candidates:
        hits = transcript_hits_clusters(tx, core_cluster_ranges, use_cds_if_no_exon=use_cds_if_no_exon)
        if len(hits) >= 2:
            true_bridges.append(tx)
            bridge_hit_info[tx.id] = hits

    if len(true_bridges) < min_bridge_count:
        return None

    # Format the report; the ranges computed above are reused rather than
    # recomputed for every cluster.
    cluster_member_strs = []
    cluster_range_strs = []
    for i, (c, cr) in enumerate(zip(core_clusters, core_cluster_ranges), start=1):
        cluster_range_strs.append(f"C{i}:{cr[0]}-{cr[1]}")
        cluster_member_strs.append(f"C{i}:" + ",".join(sorted(tx.id for tx in c)))

    bridge_member_strs = []
    for tx in sorted(true_bridges, key=lambda x: x.id):
        hit_clusters = ",".join(f"C{i+1}" for i in bridge_hit_info[tx.id])
        bridge_member_strs.append(f"{tx.id}[{hit_clusters}]")

    return {
        "gene_id": gene.id,
        "chrom": gene.chrom,
        "start": gene.start,
        "end": gene.end,
        "strand": gene.strand,
        "total_tx": len(gene.transcripts),
        "core_tx": len(core_candidates),
        "bridge_tx": len(true_bridges),
        "cluster_count": len(core_clusters),
        "cluster_ranges": ";".join(cluster_range_strs),
        "cluster_members": ";".join(cluster_member_strs),
        "bridge_members": ";".join(bridge_member_strs)
    }
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
# ----------------------------
|
|
439
|
+
# 主程序
|
|
440
|
+
# ----------------------------
|
|
441
|
+
|
|
442
|
+
def main():
    """Command-line entry point: scan a GFF3 file and write suspicious genes to a TSV.

    Parses the command line, analyses every gene in sorted ID order, writes
    one TSV row per flagged gene (after a header row), and prints a short
    summary to stderr.
    """
    parser = argparse.ArgumentParser(
        description="检测 GFF3 中“一个 gene 下多个离散 transcript 簇被少数桥接 transcript 连接”的可疑错误合并基因。"
    )
    parser.add_argument("-i", "--input", required=True, help="输入 GFF3 文件")
    parser.add_argument("-o", "--output", required=True, help="输出 TSV 文件")

    # Integer thresholds, registered in the same order as before.
    int_options = (
        ("--min-gap", 10000, "判定桥接 transcript 的最小远距离 gap / 超长内含子阈值,默认 10000"),
        ("--cluster-gap", 2000, "核心 transcript 聚类时允许的最大 gap,默认 2000"),
        ("--min-core-tx-per-cluster", 1, "每个核心簇至少包含多少条非桥接 transcript,默认 1"),
        ("--min-bridge-count", 1, "至少多少条真实桥接 transcript 才输出,默认 1"),
    )
    for flag, default, help_text in int_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)

    parser.add_argument(
        "--no-use-cds-if-no-exon",
        action="store_true",
        help="若 transcript 没有 exon,则不要回退使用 CDS"
    )

    args = parser.parse_args()
    cds_fallback = not args.no_use_cds_if_no_exon

    genes = read_gff3(args.input)

    out_fields = (
        "gene_id",
        "chrom",
        "start",
        "end",
        "strand",
        "total_tx",
        "core_tx",
        "bridge_tx",
        "cluster_count",
        "cluster_ranges",
        "cluster_members",
        "bridge_members",
    )

    n_total = 0
    n_flagged = 0

    with open(args.output, "w", encoding="utf-8") as out:
        out.write("\t".join(out_fields) + "\n")

        for gid in sorted(genes):
            n_total += 1
            result = analyze_gene(
                genes[gid],
                min_gap=args.min_gap,
                cluster_gap=args.cluster_gap,
                min_core_tx_per_cluster=args.min_core_tx_per_cluster,
                min_bridge_count=args.min_bridge_count,
                use_cds_if_no_exon=cds_fallback,
            )
            if not result:
                continue
            n_flagged += 1
            row = [str(result[field]) for field in out_fields]
            out.write("\t".join(row) + "\n")

    summary = (
        f"[INFO] Total genes checked: {n_total}\n"
        f"[INFO] Suspicious merged genes: {n_flagged}\n"
        f"[INFO] Output written to: {args.output}\n"
    )
    sys.stderr.write(summary)


if __name__ == "__main__":
    main()
|
gffkit/main.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""Command line interface for gffkit."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import argparse
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Callable, List, Optional
|
|
11
|
+
|
|
12
|
+
from . import __version__
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _run_legacy_main(func: Callable[[], object], prog: str, args: List[str]) -> int:
    """Call a script-style ``main()`` with ``sys.argv`` temporarily replaced.

    ``sys.argv`` is set to ``[prog, *args]`` for the duration of the call and
    restored afterwards, even if ``func`` raises. A ``None`` result maps to
    exit code 0; any other result is converted with ``int()``.
    """
    saved_argv = list(sys.argv)
    sys.argv = [prog, *args]
    try:
        result = func()
    finally:
        sys.argv = saved_argv

    if result is None:
        return 0
    return int(result)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def cmd_detect_bridge(args: argparse.Namespace, extra: List[str]) -> int:
    """Run the ``detect-bridge`` subcommand, forwarding unknown options."""
    from . import detect_bridge_merged_genes as legacy

    forwarded = ["-i", args.input, "-o", args.output, *extra]
    return _run_legacy_main(legacy.main, "gffkit detect-bridge", forwarded)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def cmd_complement(args: argparse.Namespace, extra: List[str]) -> int:
    """Run the ``complement`` subcommand, forwarding unknown options.

    Every ``--add`` file is passed through as its own ``--add`` pair, and
    ``--output`` is forwarded only when it was given.
    """
    from . import complement_annotations as legacy

    forwarded = ["--ref", args.ref]
    for supplementary in args.add:
        forwarded.extend(("--add", supplementary))
    if args.output:
        forwarded.extend(("--output", args.output))
    forwarded.extend(extra)
    return _run_legacy_main(legacy.main, "gffkit complement", forwarded)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def cmd_add_utr(args: argparse.Namespace, extra: List[str]) -> int:
    """Run the ``add-utr`` subcommand, forwarding unknown options."""
    from . import add_utr as legacy

    forwarded = ["-i", args.input, "-o", args.output, *extra]
    return _run_legacy_main(legacy.main, "gffkit add-utr", forwarded)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def cmd_integrate(args: argparse.Namespace, extra: List[str]) -> int:
    """Run the full three-step annotation integration workflow.

    Steps: (1) detect suspicious merged genes in Annotation A, (2) merge
    annotations A and B using the suspicious regions, (3) add UTR features.
    Returns the first non-zero step exit code, or 0 when all steps succeed.

    Unlike the single-step wrappers, this command does not forward unknown
    options. They are reported on stderr instead of being dropped silently,
    so a mistyped option is noticed.
    """
    from . import detect_bridge_merged_genes as detect_mod
    from . import complement_annotations as complement_mod
    from . import add_utr as utr_mod

    if extra:
        print(
            f"[gffkit] Warning: ignoring unrecognized arguments for integrate: {' '.join(extra)}",
            file=sys.stderr,
        )

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    suspicious_tsv = Path(args.suspicious_tsv) if args.suspicious_tsv else outdir / f"{args.prefix}.suspicious.tsv"
    merged_gff = Path(args.merged_gff) if args.merged_gff else outdir / f"{args.prefix}.merged.gff3"
    final_gff = Path(args.output) if args.output else outdir / f"{args.prefix}.final.withUTR.gff3"

    # Explicit output paths may live outside outdir; make sure their
    # directories exist before any step tries to write there.
    for target in (suspicious_tsv, merged_gff, final_gff):
        target.parent.mkdir(parents=True, exist_ok=True)

    print("[gffkit] Step 1/3: detecting suspicious merged genes", file=sys.stderr)
    detect_cli = [
        "-i", args.annotation_a,
        "-o", str(suspicious_tsv),
        "--min-gap", str(args.min_gap),
        "--cluster-gap", str(args.cluster_gap),
        "--min-core-tx-per-cluster", str(args.min_core_tx_per_cluster),
        "--min-bridge-count", str(args.min_bridge_count),
    ]
    if args.no_use_cds_if_no_exon:
        detect_cli.append("--no-use-cds-if-no-exon")
    ret = _run_legacy_main(detect_mod.main, "gffkit detect-bridge", detect_cli)
    if ret != 0:
        return ret

    print("[gffkit] Step 2/3: region-aware annotation merging", file=sys.stderr)
    complement_cli = [
        "--ref", args.annotation_a,
        "--add", args.annotation_b,
        "--swap_region_tsv", str(suspicious_tsv),
        "--swap_region_flank", str(args.swap_region_flank),
        "--size_min", str(args.size_min),
        "--output", str(merged_gff),
    ]
    ret = _run_legacy_main(complement_mod.main, "gffkit complement", complement_cli)
    if ret != 0:
        return ret

    print("[gffkit] Step 3/3: adding UTR features", file=sys.stderr)
    utr_cli = ["-i", str(merged_gff), "-o", str(final_gff), "--id-prefix", args.utr_id_prefix]
    if args.replace_existing_utrs:
        utr_cli.append("--replace-existing-utrs")
    ret = _run_legacy_main(utr_mod.main, "gffkit add-utr", utr_cli)
    if ret != 0:
        return ret

    print("[gffkit] Done", file=sys.stderr)
    print(f"[gffkit] suspicious TSV: {suspicious_tsv}", file=sys.stderr)
    print(f"[gffkit] merged GFF3: {merged_gff}", file=sys.stderr)
    print(f"[gffkit] final GFF3: {final_gff}", file=sys.stderr)
    return 0
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _add_detect_bridge_parser(subparsers) -> None:
    """Register the ``detect-bridge`` subcommand."""
    sub = subparsers.add_parser(
        "detect-bridge",
        help="Detect suspicious merged genes caused by bridge transcripts.",
        description="Wrapper around detect_bridge_merged_genes.py. Unknown options are forwarded.",
    )
    sub.add_argument("-i", "--input", required=True, help="Input GFF3 file, usually Annotation A.")
    sub.add_argument("-o", "--output", required=True, help="Output suspicious.tsv file.")
    sub.set_defaults(handler=cmd_detect_bridge)


def _add_complement_parser(subparsers) -> None:
    """Register the ``complement`` subcommand."""
    sub = subparsers.add_parser(
        "complement",
        help="Complement/merge GFF annotations with optional region-swap mode.",
        description="Wrapper around complement_annotations.py. Unknown options are forwarded.",
    )
    sub.add_argument("--ref", "-r", "-i", required=True, help="Reference GFF/GTF file.")
    sub.add_argument("--add", "-a", action="append", required=True, help="Supplementary GFF/GTF file; can be repeated.")
    sub.add_argument("--output", "--out", "-o", default=None, help="Output GFF3 path. Default: stdout.")
    sub.set_defaults(handler=cmd_complement)


def _add_add_utr_parser(subparsers) -> None:
    """Register the ``add-utr`` subcommand."""
    sub = subparsers.add_parser(
        "add-utr",
        help="Add five_prime_UTR and three_prime_UTR features from exon/CDS coordinates.",
        description="Wrapper around add_utr.py. Unknown options are forwarded.",
    )
    sub.add_argument("-i", "--input", required=True, help="Input GFF3/GTF file.")
    sub.add_argument("-o", "--output", required=True, help="Output GFF3 file.")
    sub.set_defaults(handler=cmd_add_utr)


def _add_integrate_parser(subparsers) -> None:
    """Register the ``integrate`` subcommand and all of its workflow options."""
    sub = subparsers.add_parser(
        "integrate",
        help="Run the full A/B region-aware integration workflow.",
        description=(
            "Detect suspicious merged-gene regions in Annotation A, use Annotation B as the local "
            "primary reference in those regions, then add UTR features."
        ),
    )
    # Inputs and output locations.
    sub.add_argument("--annotation-a", "--a", required=True, help="Annotation A: EviAnn/RNA-seq-supported GFF3.")
    sub.add_argument("--annotation-b", "--b", required=True, help="Annotation B: ANNEVO/deep-learning GFF3.")
    sub.add_argument("--outdir", default="gffkit_out", help="Output directory.")
    sub.add_argument("--prefix", default="gffkit", help="Output file prefix.")
    sub.add_argument("-o", "--output", default=None, help="Final GFF3 output path. Default: OUTDIR/PREFIX.final.withUTR.gff3")
    sub.add_argument("--suspicious-tsv", default=None, help="Optional path for intermediate suspicious.tsv.")
    sub.add_argument("--merged-gff", default=None, help="Optional path for intermediate merged.gff3.")

    # Step 1: bridge detection thresholds.
    sub.add_argument("--min-gap", type=int, default=10000, help="Bridge-candidate long gap threshold.")
    sub.add_argument("--cluster-gap", type=int, default=2000, help="Core cluster maximum gap.")
    sub.add_argument("--min-core-tx-per-cluster", type=int, default=1, help="Minimum core transcripts per cluster.")
    sub.add_argument("--min-bridge-count", type=int, default=1, help="Minimum true bridge transcripts required.")
    sub.add_argument("--no-use-cds-if-no-exon", action="store_true", help="Do not use CDS when transcript has no exon.")

    # Steps 2 and 3: merging and UTR options.
    sub.add_argument("--swap-region-flank", type=int, default=100, help="Flanking bp added to suspicious regions.")
    sub.add_argument("--size-min", type=int, default=0, help="Minimum CDS size for non-overlapping supplementary roots.")
    sub.add_argument("--replace-existing-utrs", action="store_true", help="Remove existing UTRs and recreate them.")
    sub.add_argument("--utr-id-prefix", default="gffkit_utr_", help="Prefix for newly created UTR IDs.")
    sub.set_defaults(handler=cmd_integrate)


def build_parser() -> argparse.ArgumentParser:
    """Build the top-level ``gffkit`` parser with its four subcommands.

    Each subcommand stores its handler function under ``handler`` via
    ``set_defaults``. A subcommand is required.
    """
    parser = argparse.ArgumentParser(
        prog="gffkit",
        description="Utilities for detecting merged-gene artifacts, region-aware GFF merging, and UTR reconstruction.",
    )
    parser.add_argument("--version", action="version", version=f"gffkit {__version__}")

    subparsers = parser.add_subparsers(dest="command", required=True)
    # Registration order determines the order shown in --help.
    for register in (
        _add_detect_bridge_parser,
        _add_complement_parser,
        _add_add_utr_parser,
        _add_integrate_parser,
    ):
        register(subparsers)

    return parser
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Parse the command line and dispatch to the selected subcommand handler.

    Options the parser does not recognise are collected and passed to the
    handler as its second argument. ``argv=None`` makes argparse read
    ``sys.argv``.
    """
    known, passthrough = build_parser().parse_known_args(argv)
    handler = known.handler
    return handler(known, passthrough)
|