gffkit 0.1.0__tar.gz → 0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gffkit-0.1.0/src/gffkit.egg-info → gffkit-0.3}/PKG-INFO +1 -1
- {gffkit-0.1.0 → gffkit-0.3}/pyproject.toml +1 -1
- {gffkit-0.1.0 → gffkit-0.3}/src/gffkit/__init__.py +1 -1
- {gffkit-0.1.0 → gffkit-0.3}/src/gffkit/complement_annotations.py +129 -44
- {gffkit-0.1.0 → gffkit-0.3}/src/gffkit/detect_bridge_merged_genes.py +55 -34
- {gffkit-0.1.0 → gffkit-0.3}/src/gffkit/main.py +7 -1
- {gffkit-0.1.0 → gffkit-0.3/src/gffkit.egg-info}/PKG-INFO +1 -1
- {gffkit-0.1.0 → gffkit-0.3}/src/gffkit.egg-info/SOURCES.txt +2 -1
- gffkit-0.3/tests/test_complement_sources.py +62 -0
- {gffkit-0.1.0 → gffkit-0.3}/LICENSE +0 -0
- {gffkit-0.1.0 → gffkit-0.3}/MANIFEST.in +0 -0
- {gffkit-0.1.0 → gffkit-0.3}/README.md +0 -0
- {gffkit-0.1.0 → gffkit-0.3}/setup.cfg +0 -0
- {gffkit-0.1.0 → gffkit-0.3}/src/gffkit/__main__.py +0 -0
- {gffkit-0.1.0 → gffkit-0.3}/src/gffkit/add_utr.py +0 -0
- {gffkit-0.1.0 → gffkit-0.3}/src/gffkit.egg-info/dependency_links.txt +0 -0
- {gffkit-0.1.0 → gffkit-0.3}/src/gffkit.egg-info/entry_points.txt +0 -0
- {gffkit-0.1.0 → gffkit-0.3}/src/gffkit.egg-info/top_level.txt +0 -0
|
@@ -20,13 +20,14 @@ agat_sp_complement_annotations.pl 的 Python 改写版(纯 Python,不调用
|
|
|
20
20
|
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
23
|
-
import argparse
|
|
24
|
-
import copy
|
|
25
|
-
import re
|
|
26
|
-
import sys
|
|
27
|
-
from collections import defaultdict
|
|
28
|
-
from dataclasses import dataclass, field
|
|
29
|
-
from
|
|
23
|
+
import argparse
|
|
24
|
+
import copy
|
|
25
|
+
import re
|
|
26
|
+
import sys
|
|
27
|
+
from collections import defaultdict
|
|
28
|
+
from dataclasses import dataclass, field
|
|
29
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
30
|
+
from typing import Dict, Iterable, List, Optional, Tuple
|
|
30
31
|
|
|
31
32
|
|
|
32
33
|
GENE_LIKE_TYPES = {
|
|
@@ -570,16 +571,81 @@ def cds_size_nt(root: Feature) -> int:
|
|
|
570
571
|
return total
|
|
571
572
|
|
|
572
573
|
|
|
573
|
-
def any_feature_overlap(features_a: List[Feature], features_b: List[Feature]) -> bool:
|
|
574
|
-
"""判断两组特征中是否存在任意一对重叠。"""
|
|
575
|
-
for fa in features_a:
|
|
576
|
-
for fb in features_b:
|
|
577
|
-
if fa.overlaps(fb):
|
|
578
|
-
return True
|
|
579
|
-
return False
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
def
|
|
574
|
+
def any_feature_overlap(features_a: List[Feature], features_b: List[Feature]) -> bool:
    """Return True if any feature of ``features_a`` overlaps any of ``features_b``.

    Overlap is decided by calling ``fa.overlaps(fb)`` on each pair.  Pairs are
    evaluated lazily, so the scan stops at the first overlapping pair.  If
    either list is empty the result is False.
    """
    return any(fa.overlaps(fb) for fa in features_a for fb in features_b)
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
def split_source_names(source: str) -> List[str]:
    """Split a GFF column-2 ``source`` value into ordered, unique tool names.

    The value is split on ``,``, ``|`` and ``+``; each piece is stripped of
    surrounding whitespace.  Empty pieces and the ``.`` placeholder are
    dropped, and only the first occurrence of each name is kept.

    An empty or ``None`` source yields an empty list instead of raising.
    """
    if not source:
        return []
    pieces = (item.strip() for item in re.split(r"[,|+]", source))
    # dict preserves insertion order, giving an ordered de-dup in one pass.
    return list(dict.fromkeys(name for name in pieces if name and name != "."))
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
def collect_source_names(root: Feature) -> List[str]:
    """Collect the source names that occur anywhere in a feature tree.

    Every feature yielded by ``root.iter_all()`` contributes the names that
    ``split_source_names`` extracts from its ``source``.  Names are returned
    in order of first appearance, without duplicates.
    """
    seen: Dict[str, None] = {}
    for node in root.iter_all():
        for source_name in split_source_names(node.source):
            # setdefault keeps the position of the first occurrence.
            seen.setdefault(source_name, None)
    return list(seen)
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
def merge_source_names(*source_groups: Iterable[str]) -> str:
    """Merge groups of source values into one string for GFF column 2.

    Each group is an iterable of source values; every value is broken into
    individual names with ``split_source_names``.  Names keep the order of
    their first appearance across all groups and are joined with ``|``.
    When no name remains, the GFF placeholder ``.`` is returned.

    A bare string passed as a group is treated as a single source value
    rather than being iterated character by character.
    """
    merged: Dict[str, None] = {}
    for group in source_groups:
        # A str is itself an iterable of 1-character strings; wrap it so the
        # whole value is split once instead of one character at a time.
        sources = (group,) if isinstance(group, str) else group
        for source in sources:
            for name in split_source_names(source):
                merged.setdefault(name, None)
    return "|".join(merged) if merged else "."
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
def set_tree_source(root: Feature, source: str) -> None:
    """Set the column-2 ``source`` of a whole feature tree to one value.

    Every feature yielded by ``root.iter_all()`` has its ``source`` attribute
    overwritten in place with ``source``.
    """
    for node in root.iter_all():
        node.source = source
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
def update_overlapping_sources(add_root: Feature, ref_roots: List[Feature]) -> List[Feature]:
    """Merge tool sources between ``add_root`` and the reference roots it overlaps.

    Reference roots for which ``add_root.overlaps(ref)`` is true are selected.
    Their source names (in reference order) followed by those of ``add_root``
    are merged into one column-2 string, which is then written to every
    feature of ``add_root`` and of each overlapping reference root.

    Returns the overlapping reference roots so later checks can reuse them;
    an empty list means nothing overlapped and nothing was modified.
    """
    hits = [ref for ref in ref_roots if add_root.overlaps(ref)]
    if not hits:
        return []

    # merge_source_names de-duplicates while keeping first-seen order, so the
    # groups only need to be handed over in the right order: refs, then add.
    merged_source = merge_source_names(
        *(collect_source_names(ref) for ref in hits),
        collect_source_names(add_root),
    )

    for tree in (add_root, *hits):
        set_tree_source(tree, merged_source)
    return hits
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
def should_add_root(add_root: Feature, ref_roots: List[Feature], size_min: int) -> bool:
|
|
583
649
|
"""
|
|
584
650
|
按原脚本说明中的规则,判断 add_root 是否应该加入参考注释。
|
|
585
651
|
|
|
@@ -596,7 +662,7 @@ def should_add_root(add_root: Feature, ref_roots: List[Feature], size_min: int)
|
|
|
596
662
|
- 只要有一个重叠关系不满足条件,就不加入整个一级特征。
|
|
597
663
|
这与原说明中“只要有一个 isoform 重叠,就阻止整个 gene 加入”的精神一致。
|
|
598
664
|
"""
|
|
599
|
-
overlapping_refs = [r for r in ref_roots if add_root.overlaps(r)]
|
|
665
|
+
overlapping_refs = [r for r in ref_roots if add_root.overlaps(r)]
|
|
600
666
|
|
|
601
667
|
# 情况 1:完全不与参考一级特征重叠
|
|
602
668
|
if not overlapping_refs:
|
|
@@ -724,7 +790,7 @@ def uniquify_feature_tree(root: Feature, existing_ids: set, existing_names: set)
|
|
|
724
790
|
existing_names.add(feat.name)
|
|
725
791
|
|
|
726
792
|
|
|
727
|
-
def complement_annotations(ref_set: AnnotationSet, add_set: AnnotationSet, size_min: int) -> int:
|
|
793
|
+
def complement_annotations(ref_set: AnnotationSet, add_set: AnnotationSet, size_min: int) -> int:
|
|
728
794
|
"""
|
|
729
795
|
把 add_set 中满足条件的一级特征补充到 ref_set 中。
|
|
730
796
|
返回值:实际添加的一级特征数量。
|
|
@@ -733,15 +799,16 @@ def complement_annotations(ref_set: AnnotationSet, add_set: AnnotationSet, size_
|
|
|
733
799
|
existing_ids = ref_set.all_ids()
|
|
734
800
|
existing_names = ref_set.all_names()
|
|
735
801
|
|
|
736
|
-
for add_root in sorted(
|
|
737
|
-
add_set.roots,
|
|
738
|
-
key=lambda x: (x.seqid, x.start, x.end, x.feature_type, x.id or "")
|
|
739
|
-
):
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
802
|
+
for add_root in sorted(
|
|
803
|
+
add_set.roots,
|
|
804
|
+
key=lambda x: (x.seqid, x.start, x.end, x.feature_type, x.id or "")
|
|
805
|
+
):
|
|
806
|
+
update_overlapping_sources(add_root, ref_set.roots)
|
|
807
|
+
if should_add_root(add_root, ref_set.roots, size_min):
|
|
808
|
+
cloned = add_root.deep_clone()
|
|
809
|
+
uniquify_feature_tree(cloned, existing_ids, existing_names)
|
|
810
|
+
ref_set.roots.append(cloned)
|
|
811
|
+
added_count += 1
|
|
745
812
|
|
|
746
813
|
return added_count
|
|
747
814
|
|
|
@@ -777,7 +844,7 @@ def print_complement_resume(before_counts: Dict[str, Dict[str, int]],
|
|
|
777
844
|
eprint("\nNow the data contains:")
|
|
778
845
|
|
|
779
846
|
|
|
780
|
-
def build_arg_parser() -> argparse.ArgumentParser:
|
|
847
|
+
def build_arg_parser() -> argparse.ArgumentParser:
|
|
781
848
|
"""构建命令行参数解析器。"""
|
|
782
849
|
parser = argparse.ArgumentParser(
|
|
783
850
|
description="用一个或多个注释文件去补充参考注释(Python 版,纯 Python,不调用 Perl)。"
|
|
@@ -814,12 +881,16 @@ def build_arg_parser() -> argparse.ArgumentParser:
|
|
|
814
881
|
default=100,
|
|
815
882
|
help="从 suspicious.tsv 读取区间时,start/end 两端各扩展的 bp 数,默认 100"
|
|
816
883
|
)
|
|
817
|
-
parser.add_argument("--output", "--out", "-o", default=None, help="输出文件路径;默认输出到 STDOUT")
|
|
818
|
-
parser.add_argument(
|
|
819
|
-
"-
|
|
820
|
-
help="
|
|
821
|
-
)
|
|
822
|
-
|
|
884
|
+
parser.add_argument("--output", "--out", "-o", default=None, help="输出文件路径;默认输出到 STDOUT")
|
|
885
|
+
parser.add_argument(
|
|
886
|
+
"-t", "--threads", type=int, default=1,
|
|
887
|
+
help="并行线程数;多个 --add 文件会并行预解析,但仍按输入顺序合并,默认 1"
|
|
888
|
+
)
|
|
889
|
+
parser.add_argument(
|
|
890
|
+
"-v", "--verbose", type=int, default=1,
|
|
891
|
+
help="日志详细程度(0~4),这里只简单保留该参数接口,默认 1"
|
|
892
|
+
)
|
|
893
|
+
return parser
|
|
823
894
|
|
|
824
895
|
|
|
825
896
|
def parse_swap_regions(raw_regions: Optional[List[List[str]]]) -> List[SwapRegion]:
|
|
@@ -852,7 +923,7 @@ def parse_swap_regions(raw_regions: Optional[List[List[str]]]) -> List[SwapRegio
|
|
|
852
923
|
return parsed
|
|
853
924
|
|
|
854
925
|
|
|
855
|
-
def parse_swap_regions_from_tsv(tsv_path: str, flank_bp: int = 100) -> List[SwapRegion]:
|
|
926
|
+
def parse_swap_regions_from_tsv(tsv_path: str, flank_bp: int = 100) -> List[SwapRegion]:
|
|
856
927
|
"""
|
|
857
928
|
从 detect_bridge_merged_genes.py 产生的 suspicious.tsv 读取区间。
|
|
858
929
|
|
|
@@ -928,12 +999,26 @@ def parse_swap_regions_from_tsv(tsv_path: str, flank_bp: int = 100) -> List[Swap
|
|
|
928
999
|
f" 当前行内容为:{line}"
|
|
929
1000
|
) from exc
|
|
930
1001
|
|
|
931
|
-
return regions
|
|
1002
|
+
return regions
|
|
1003
|
+
|
|
1004
|
+
|
|
1005
|
+
def parse_add_files(add_files: List[str], threads: int) -> List[Tuple[str, AnnotationSet]]:
    """Parse the supplementary annotation files, optionally with a thread pool.

    Returns ``(path, parsed_set)`` pairs in the same order as ``add_files``
    (the command-line ``--add`` order), whether or not a pool is used.
    A ``threads`` value below 1 is treated as 1.  The pool is only created
    when more than one thread is requested and there is more than one file,
    and it never has more workers than files.
    """
    worker_count = min(max(1, threads), len(add_files))

    if worker_count <= 1:
        parsed_sets = [parse_annotation_file(path) for path in add_files]
    else:
        with ThreadPoolExecutor(max_workers=worker_count) as pool:
            # Executor.map yields results in input order, not completion order.
            parsed_sets = list(pool.map(parse_annotation_file, add_files))

    return list(zip(add_files, parsed_sets))
|
|
932
1014
|
|
|
933
1015
|
|
|
934
1016
|
def main() -> int:
|
|
935
1017
|
parser = build_arg_parser()
|
|
936
|
-
args = parser.parse_args()
|
|
1018
|
+
args = parser.parse_args()
|
|
1019
|
+
|
|
1020
|
+
if args.threads < 1:
|
|
1021
|
+
parser.error("--threads/-t 必须是正整数")
|
|
937
1022
|
|
|
938
1023
|
try:
|
|
939
1024
|
swap_regions = parse_swap_regions(args.swap_region)
|
|
@@ -959,11 +1044,11 @@ def main() -> int:
|
|
|
959
1044
|
for region in swap_regions:
|
|
960
1045
|
eprint(f" - {region.seqid}:{region.start}-{region.end}")
|
|
961
1046
|
|
|
962
|
-
# 2)
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
eprint(f"{next_file} parsed")
|
|
966
|
-
add_set.info()
|
|
1047
|
+
# 2) 按用户给定顺序,逐个补充;多个输入文件可并行预解析
|
|
1048
|
+
parsed_add_sets = parse_add_files(args.add, args.threads)
|
|
1049
|
+
for next_file, add_set in parsed_add_sets:
|
|
1050
|
+
eprint(f"{next_file} parsed")
|
|
1051
|
+
add_set.info()
|
|
967
1052
|
|
|
968
1053
|
before_counts = ref_set.level_counts()
|
|
969
1054
|
|
|
@@ -998,4 +1083,4 @@ def main() -> int:
|
|
|
998
1083
|
|
|
999
1084
|
|
|
1000
1085
|
if __name__ == "__main__":
|
|
1001
|
-
sys.exit(main())
|
|
1086
|
+
sys.exit(main())
|
|
@@ -43,9 +43,10 @@ detect_bridge_merged_genes.py
|
|
|
43
43
|
bridge_members
|
|
44
44
|
"""
|
|
45
45
|
|
|
46
|
-
import argparse
|
|
47
|
-
import sys
|
|
48
|
-
from collections import defaultdict
|
|
46
|
+
import argparse
|
|
47
|
+
import sys
|
|
48
|
+
from collections import defaultdict
|
|
49
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
49
50
|
|
|
50
51
|
|
|
51
52
|
# ----------------------------
|
|
@@ -343,7 +344,7 @@ def read_gff3(gff_file):
|
|
|
343
344
|
# 核心检测逻辑
|
|
344
345
|
# ----------------------------
|
|
345
346
|
|
|
346
|
-
def analyze_gene(
|
|
347
|
+
def analyze_gene(
|
|
347
348
|
gene,
|
|
348
349
|
min_gap=10000,
|
|
349
350
|
cluster_gap=2000,
|
|
@@ -432,7 +433,13 @@ def analyze_gene(
|
|
|
432
433
|
"cluster_members": ";".join(cluster_member_strs),
|
|
433
434
|
"bridge_members": ";".join(bridge_member_strs)
|
|
434
435
|
}
|
|
435
|
-
return result
|
|
436
|
+
return result
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def analyze_gene_task(task):
    """Run ``analyze_gene`` for one packed task and return ``(gene_id, result)``.

    ``task`` is a ``(gene_id, gene, params)`` tuple, where ``params`` is a dict
    of keyword arguments forwarded to ``analyze_gene``.  Taking one tuple lets
    the function be mapped directly over a list of tasks.

    NOTE: a thread pool accepts any callable; a module-level function is only
    required by process-based executors, which must pickle the callable.
    """
    gene_id, gene, keyword_args = task
    outcome = analyze_gene(gene, **keyword_args)
    return gene_id, outcome
|
|
436
443
|
|
|
437
444
|
|
|
438
445
|
# ----------------------------
|
|
@@ -469,15 +476,24 @@ def main():
|
|
|
469
476
|
default=1,
|
|
470
477
|
help="至少多少条真实桥接 transcript 才输出,默认 1"
|
|
471
478
|
)
|
|
472
|
-
parser.add_argument(
|
|
473
|
-
"--no-use-cds-if-no-exon",
|
|
474
|
-
action="store_true",
|
|
475
|
-
help="若 transcript 没有 exon,则不要回退使用 CDS"
|
|
476
|
-
)
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
479
|
+
parser.add_argument(
|
|
480
|
+
"--no-use-cds-if-no-exon",
|
|
481
|
+
action="store_true",
|
|
482
|
+
help="若 transcript 没有 exon,则不要回退使用 CDS"
|
|
483
|
+
)
|
|
484
|
+
parser.add_argument(
|
|
485
|
+
"-t", "--threads",
|
|
486
|
+
type=int,
|
|
487
|
+
default=1,
|
|
488
|
+
help="并行分析 gene 的线程数,默认 1"
|
|
489
|
+
)
|
|
490
|
+
|
|
491
|
+
args = parser.parse_args()
|
|
492
|
+
|
|
493
|
+
if args.threads < 1:
|
|
494
|
+
parser.error("--threads/-t 必须是正整数")
|
|
495
|
+
|
|
496
|
+
use_cds_if_no_exon = not args.no_use_cds_if_no_exon
|
|
481
497
|
|
|
482
498
|
genes = read_gff3(args.input)
|
|
483
499
|
|
|
@@ -499,25 +515,30 @@ def main():
|
|
|
499
515
|
n_total = 0
|
|
500
516
|
n_flagged = 0
|
|
501
517
|
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
518
|
+
analyze_params = {
|
|
519
|
+
"min_gap": args.min_gap,
|
|
520
|
+
"cluster_gap": args.cluster_gap,
|
|
521
|
+
"min_core_tx_per_cluster": args.min_core_tx_per_cluster,
|
|
522
|
+
"min_bridge_count": args.min_bridge_count,
|
|
523
|
+
"use_cds_if_no_exon": use_cds_if_no_exon,
|
|
524
|
+
}
|
|
525
|
+
sorted_genes = [(gid, genes[gid], analyze_params) for gid in sorted(genes.keys())]
|
|
526
|
+
|
|
527
|
+
if args.threads == 1:
|
|
528
|
+
analyzed = [analyze_gene_task(task) for task in sorted_genes]
|
|
529
|
+
else:
|
|
530
|
+
with ThreadPoolExecutor(max_workers=args.threads) as executor:
|
|
531
|
+
analyzed = list(executor.map(analyze_gene_task, sorted_genes))
|
|
532
|
+
|
|
533
|
+
with open(args.output, "w", encoding="utf-8") as out:
|
|
534
|
+
out.write("\t".join(out_fields) + "\n")
|
|
535
|
+
|
|
536
|
+
for gid, result in analyzed:
|
|
537
|
+
n_total += 1
|
|
538
|
+
|
|
539
|
+
if result:
|
|
540
|
+
n_flagged += 1
|
|
541
|
+
out.write("\t".join(str(result[f]) for f in out_fields) + "\n")
|
|
521
542
|
|
|
522
543
|
sys.stderr.write(
|
|
523
544
|
f"[INFO] Total genes checked: {n_total}\n"
|
|
@@ -527,4 +548,4 @@ def main():
|
|
|
527
548
|
|
|
528
549
|
|
|
529
550
|
if __name__ == "__main__":
|
|
530
|
-
main()
|
|
551
|
+
main()
|
|
@@ -25,7 +25,7 @@ def _run_legacy_main(func: Callable[[], object], prog: str, args: List[str]) ->
|
|
|
25
25
|
|
|
26
26
|
def cmd_detect_bridge(args: argparse.Namespace, extra: List[str]) -> int:
    """Handle the ``detect-bridge`` subcommand.

    Builds the argument list for the detect_bridge_merged_genes module from
    ``args.input``, ``args.output`` and ``args.threads``, appends the
    pass-through arguments in ``extra``, and returns whatever
    ``_run_legacy_main`` returns for that module's ``main``.
    """
    from . import detect_bridge_merged_genes as mod

    cli = ["-i", args.input, "-o", args.output, "-t", str(args.threads), *extra]
    return _run_legacy_main(mod.main, "gffkit detect-bridge", cli)
|
|
31
31
|
|
|
@@ -37,6 +37,7 @@ def cmd_complement(args: argparse.Namespace, extra: List[str]) -> int:
|
|
|
37
37
|
cli += ["--add", add_file]
|
|
38
38
|
if args.output:
|
|
39
39
|
cli += ["--output", args.output]
|
|
40
|
+
cli += ["-t", str(args.threads)]
|
|
40
41
|
cli += extra
|
|
41
42
|
return _run_legacy_main(mod.main, "gffkit complement", cli)
|
|
42
43
|
|
|
@@ -69,6 +70,7 @@ def cmd_integrate(args: argparse.Namespace, extra: List[str]) -> int:
|
|
|
69
70
|
"--cluster-gap", str(args.cluster_gap),
|
|
70
71
|
"--min-core-tx-per-cluster", str(args.min_core_tx_per_cluster),
|
|
71
72
|
"--min-bridge-count", str(args.min_bridge_count),
|
|
73
|
+
"-t", str(args.threads),
|
|
72
74
|
]
|
|
73
75
|
if args.no_use_cds_if_no_exon:
|
|
74
76
|
detect_cli.append("--no-use-cds-if-no-exon")
|
|
@@ -84,6 +86,7 @@ def cmd_integrate(args: argparse.Namespace, extra: List[str]) -> int:
|
|
|
84
86
|
"--swap_region_flank", str(args.swap_region_flank),
|
|
85
87
|
"--size_min", str(args.size_min),
|
|
86
88
|
"--output", str(merged_gff),
|
|
89
|
+
"-t", str(args.threads),
|
|
87
90
|
]
|
|
88
91
|
ret = _run_legacy_main(complement_mod.main, "gffkit complement", complement_cli)
|
|
89
92
|
if ret != 0:
|
|
@@ -120,6 +123,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
120
123
|
)
|
|
121
124
|
p.add_argument("-i", "--input", required=True, help="Input GFF3 file, usually Annotation A.")
|
|
122
125
|
p.add_argument("-o", "--output", required=True, help="Output suspicious.tsv file.")
|
|
126
|
+
p.add_argument("-t", "--threads", type=int, default=1, help="Number of worker threads.")
|
|
123
127
|
p.set_defaults(handler=cmd_detect_bridge)
|
|
124
128
|
|
|
125
129
|
p = subparsers.add_parser(
|
|
@@ -130,6 +134,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
130
134
|
p.add_argument("--ref", "-r", "-i", required=True, help="Reference GFF/GTF file.")
|
|
131
135
|
p.add_argument("--add", "-a", action="append", required=True, help="Supplementary GFF/GTF file; can be repeated.")
|
|
132
136
|
p.add_argument("--output", "--out", "-o", default=None, help="Output GFF3 path. Default: stdout.")
|
|
137
|
+
p.add_argument("-t", "--threads", type=int, default=1, help="Number of worker threads.")
|
|
133
138
|
p.set_defaults(handler=cmd_complement)
|
|
134
139
|
|
|
135
140
|
p = subparsers.add_parser(
|
|
@@ -162,6 +167,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
162
167
|
p.add_argument("--min-core-tx-per-cluster", type=int, default=1, help="Minimum core transcripts per cluster.")
|
|
163
168
|
p.add_argument("--min-bridge-count", type=int, default=1, help="Minimum true bridge transcripts required.")
|
|
164
169
|
p.add_argument("--no-use-cds-if-no-exon", action="store_true", help="Do not use CDS when transcript has no exon.")
|
|
170
|
+
p.add_argument("-t", "--threads", type=int, default=1, help="Number of worker threads used by detect and complement steps.")
|
|
165
171
|
|
|
166
172
|
p.add_argument("--swap-region-flank", type=int, default=100, help="Flanking bp added to suspicious regions.")
|
|
167
173
|
p.add_argument("--size-min", type=int, default=0, help="Minimum CDS size for non-overlapping supplementary roots.")
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from gffkit.complement_annotations import complement_annotations, parse_annotation_file
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_overlapping_gene_sources_are_merged(tmp_path):
    """An overlapping supplementary gene is not added, but its source is merged.

    The reference gene (EviAnn, 100-500) and the supplementary gene (ANNEVO,
    120-480) overlap on chr1, so no root is added and every feature of the
    reference tree ends up with the combined source ``EviAnn|ANNEVO``.
    """
    ref_lines = [
        "##gff-version 3",
        "chr1\tEviAnn\tgene\t100\t500\t.\t+\t.\tID=gene1",
        "chr1\tEviAnn\tmRNA\t100\t500\t.\t+\t.\tID=tx1;Parent=gene1",
        "chr1\tEviAnn\texon\t100\t500\t.\t+\t.\tID=exon1;Parent=tx1",
        "",
    ]
    add_lines = [
        "##gff-version 3",
        "chr1\tANNEVO\tgene\t120\t480\t.\t+\t.\tID=gene2",
        "chr1\tANNEVO\tmRNA\t120\t480\t.\t+\t.\tID=tx2;Parent=gene2",
        "chr1\tANNEVO\texon\t120\t480\t.\t+\t.\tID=exon2;Parent=tx2",
        "",
    ]

    ref_path = tmp_path / "ref.gff3"
    add_path = tmp_path / "add.gff3"
    ref_path.write_text("\n".join(ref_lines), encoding="utf-8")
    add_path.write_text("\n".join(add_lines), encoding="utf-8")

    ref_set = parse_annotation_file(str(ref_path))
    add_set = parse_annotation_file(str(add_path))

    added = complement_annotations(ref_set, add_set, size_min=0)

    assert added == 0
    assert len(ref_set.roots) == 1
    merged_sources = {feature.source for feature in ref_set.roots[0].iter_all()}
    assert merged_sources == {"EviAnn|ANNEVO"}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_non_overlapping_gene_keeps_single_source(tmp_path):
    """A non-overlapping supplementary gene is added with its own source intact.

    The reference gene (100-200) and the supplementary gene (400-500) do not
    overlap, so one root is added and each root keeps its original source.
    """
    ref_path = tmp_path / "ref.gff3"
    add_path = tmp_path / "add.gff3"
    ref_path.write_text("chr1\tEviAnn\tgene\t100\t200\t.\t+\t.\tID=gene1\n", encoding="utf-8")
    add_path.write_text("chr1\tANNEVO\tgene\t400\t500\t.\t+\t.\tID=gene2\n", encoding="utf-8")

    ref_set = parse_annotation_file(str(ref_path))
    add_set = parse_annotation_file(str(add_path))

    added = complement_annotations(ref_set, add_set, size_min=0)

    assert added == 1
    root_sources = [root.source for root in ref_set.roots]
    assert root_sources == ["EviAnn", "ANNEVO"]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|