gffkit 0.1.0__tar.gz → 0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gffkit-0.1.0/src/gffkit.egg-info → gffkit-0.2}/PKG-INFO +1 -1
- {gffkit-0.1.0 → gffkit-0.2}/pyproject.toml +1 -1
- {gffkit-0.1.0 → gffkit-0.2}/src/gffkit/__init__.py +1 -1
- {gffkit-0.1.0 → gffkit-0.2}/src/gffkit/complement_annotations.py +88 -22
- {gffkit-0.1.0 → gffkit-0.2/src/gffkit.egg-info}/PKG-INFO +1 -1
- {gffkit-0.1.0 → gffkit-0.2}/src/gffkit.egg-info/SOURCES.txt +2 -1
- gffkit-0.2/tests/test_complement_sources.py +62 -0
- {gffkit-0.1.0 → gffkit-0.2}/LICENSE +0 -0
- {gffkit-0.1.0 → gffkit-0.2}/MANIFEST.in +0 -0
- {gffkit-0.1.0 → gffkit-0.2}/README.md +0 -0
- {gffkit-0.1.0 → gffkit-0.2}/setup.cfg +0 -0
- {gffkit-0.1.0 → gffkit-0.2}/src/gffkit/__main__.py +0 -0
- {gffkit-0.1.0 → gffkit-0.2}/src/gffkit/add_utr.py +0 -0
- {gffkit-0.1.0 → gffkit-0.2}/src/gffkit/detect_bridge_merged_genes.py +0 -0
- {gffkit-0.1.0 → gffkit-0.2}/src/gffkit/main.py +0 -0
- {gffkit-0.1.0 → gffkit-0.2}/src/gffkit.egg-info/dependency_links.txt +0 -0
- {gffkit-0.1.0 → gffkit-0.2}/src/gffkit.egg-info/entry_points.txt +0 -0
- {gffkit-0.1.0 → gffkit-0.2}/src/gffkit.egg-info/top_level.txt +0 -0
|
@@ -570,16 +570,81 @@ def cds_size_nt(root: Feature) -> int:
|
|
|
570
570
|
return total
|
|
571
571
|
|
|
572
572
|
|
|
573
|
-
def any_feature_overlap(features_a: List[Feature], features_b: List[Feature]) -> bool:
|
|
574
|
-
"""判断两组特征中是否存在任意一对重叠。"""
|
|
575
|
-
for fa in features_a:
|
|
576
|
-
for fb in features_b:
|
|
577
|
-
if fa.overlaps(fb):
|
|
578
|
-
return True
|
|
579
|
-
return False
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
def
|
|
573
|
+
def any_feature_overlap(features_a: List[Feature], features_b: List[Feature]) -> bool:
|
|
574
|
+
"""判断两组特征中是否存在任意一对重叠。"""
|
|
575
|
+
for fa in features_a:
|
|
576
|
+
for fb in features_b:
|
|
577
|
+
if fa.overlaps(fb):
|
|
578
|
+
return True
|
|
579
|
+
return False
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
def split_source_names(source: str) -> List[str]:
|
|
583
|
+
"""把第二列 source 拆成有序、去重的软件名列表。"""
|
|
584
|
+
names: List[str] = []
|
|
585
|
+
for item in re.split(r"[,|+]", source):
|
|
586
|
+
name = item.strip()
|
|
587
|
+
if name and name != "." and name not in names:
|
|
588
|
+
names.append(name)
|
|
589
|
+
return names
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
def collect_source_names(root: Feature) -> List[str]:
|
|
593
|
+
"""收集一棵特征树中出现过的 source 名称,保持首次出现顺序。"""
|
|
594
|
+
names: List[str] = []
|
|
595
|
+
for feat in root.iter_all():
|
|
596
|
+
for name in split_source_names(feat.source):
|
|
597
|
+
if name not in names:
|
|
598
|
+
names.append(name)
|
|
599
|
+
return names
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
def merge_source_names(*source_groups: Iterable[str]) -> str:
|
|
603
|
+
"""合并多组 source 名称并返回可写入 GFF 第二列的字符串。"""
|
|
604
|
+
merged: List[str] = []
|
|
605
|
+
for group in source_groups:
|
|
606
|
+
for source in group:
|
|
607
|
+
for name in split_source_names(source):
|
|
608
|
+
if name not in merged:
|
|
609
|
+
merged.append(name)
|
|
610
|
+
return ",".join(merged) if merged else "."
|
|
611
|
+
|
|
612
|
+
|
|
613
|
+
def set_tree_source(root: Feature, source: str) -> None:
|
|
614
|
+
"""把整棵特征树的第二列 source 统一为指定值。"""
|
|
615
|
+
for feat in root.iter_all():
|
|
616
|
+
feat.source = source
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
def update_overlapping_sources(add_root: Feature, ref_roots: List[Feature]) -> List[Feature]:
|
|
620
|
+
"""
|
|
621
|
+
对与 add_root 重叠的已有一级特征,合并双方软件来源并写回第二列。
|
|
622
|
+
|
|
623
|
+
返回发生重叠的参考一级特征列表,供后续补充判断复用。
|
|
624
|
+
"""
|
|
625
|
+
overlapping_refs = [r for r in ref_roots if add_root.overlaps(r)]
|
|
626
|
+
if not overlapping_refs:
|
|
627
|
+
return []
|
|
628
|
+
|
|
629
|
+
add_sources = collect_source_names(add_root)
|
|
630
|
+
combined_sources: List[str] = []
|
|
631
|
+
for ref_root in overlapping_refs:
|
|
632
|
+
for source in collect_source_names(ref_root):
|
|
633
|
+
if source not in combined_sources:
|
|
634
|
+
combined_sources.append(source)
|
|
635
|
+
for source in add_sources:
|
|
636
|
+
if source not in combined_sources:
|
|
637
|
+
combined_sources.append(source)
|
|
638
|
+
|
|
639
|
+
merged_source = merge_source_names(combined_sources)
|
|
640
|
+
set_tree_source(add_root, merged_source)
|
|
641
|
+
for ref_root in overlapping_refs:
|
|
642
|
+
set_tree_source(ref_root, merged_source)
|
|
643
|
+
|
|
644
|
+
return overlapping_refs
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
def should_add_root(add_root: Feature, ref_roots: List[Feature], size_min: int) -> bool:
|
|
583
648
|
"""
|
|
584
649
|
按原脚本说明中的规则,判断 add_root 是否应该加入参考注释。
|
|
585
650
|
|
|
@@ -596,7 +661,7 @@ def should_add_root(add_root: Feature, ref_roots: List[Feature], size_min: int)
|
|
|
596
661
|
- 只要有一个重叠关系不满足条件,就不加入整个一级特征。
|
|
597
662
|
这与原说明中“只要有一个 isoform 重叠,就阻止整个 gene 加入”的精神一致。
|
|
598
663
|
"""
|
|
599
|
-
overlapping_refs = [r for r in ref_roots if add_root.overlaps(r)]
|
|
664
|
+
overlapping_refs = [r for r in ref_roots if add_root.overlaps(r)]
|
|
600
665
|
|
|
601
666
|
# 情况 1:完全不与参考一级特征重叠
|
|
602
667
|
if not overlapping_refs:
|
|
@@ -724,7 +789,7 @@ def uniquify_feature_tree(root: Feature, existing_ids: set, existing_names: set)
|
|
|
724
789
|
existing_names.add(feat.name)
|
|
725
790
|
|
|
726
791
|
|
|
727
|
-
def complement_annotations(ref_set: AnnotationSet, add_set: AnnotationSet, size_min: int) -> int:
|
|
792
|
+
def complement_annotations(ref_set: AnnotationSet, add_set: AnnotationSet, size_min: int) -> int:
|
|
728
793
|
"""
|
|
729
794
|
把 add_set 中满足条件的一级特征补充到 ref_set 中。
|
|
730
795
|
返回值:实际添加的一级特征数量。
|
|
@@ -733,15 +798,16 @@ def complement_annotations(ref_set: AnnotationSet, add_set: AnnotationSet, size_
|
|
|
733
798
|
existing_ids = ref_set.all_ids()
|
|
734
799
|
existing_names = ref_set.all_names()
|
|
735
800
|
|
|
736
|
-
for add_root in sorted(
|
|
737
|
-
add_set.roots,
|
|
738
|
-
key=lambda x: (x.seqid, x.start, x.end, x.feature_type, x.id or "")
|
|
739
|
-
):
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
801
|
+
for add_root in sorted(
|
|
802
|
+
add_set.roots,
|
|
803
|
+
key=lambda x: (x.seqid, x.start, x.end, x.feature_type, x.id or "")
|
|
804
|
+
):
|
|
805
|
+
update_overlapping_sources(add_root, ref_set.roots)
|
|
806
|
+
if should_add_root(add_root, ref_set.roots, size_min):
|
|
807
|
+
cloned = add_root.deep_clone()
|
|
808
|
+
uniquify_feature_tree(cloned, existing_ids, existing_names)
|
|
809
|
+
ref_set.roots.append(cloned)
|
|
810
|
+
added_count += 1
|
|
745
811
|
|
|
746
812
|
return added_count
|
|
747
813
|
|
|
@@ -998,4 +1064,4 @@ def main() -> int:
|
|
|
998
1064
|
|
|
999
1065
|
|
|
1000
1066
|
if __name__ == "__main__":
|
|
1001
|
-
sys.exit(main())
|
|
1067
|
+
sys.exit(main())
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from gffkit.complement_annotations import complement_annotations, parse_annotation_file
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_overlapping_gene_sources_are_merged(tmp_path):
|
|
5
|
+
ref_gff = tmp_path / "ref.gff3"
|
|
6
|
+
add_gff = tmp_path / "add.gff3"
|
|
7
|
+
|
|
8
|
+
ref_gff.write_text(
|
|
9
|
+
"\n".join(
|
|
10
|
+
[
|
|
11
|
+
"##gff-version 3",
|
|
12
|
+
"chr1\tEviAnn\tgene\t100\t500\t.\t+\t.\tID=gene1",
|
|
13
|
+
"chr1\tEviAnn\tmRNA\t100\t500\t.\t+\t.\tID=tx1;Parent=gene1",
|
|
14
|
+
"chr1\tEviAnn\texon\t100\t500\t.\t+\t.\tID=exon1;Parent=tx1",
|
|
15
|
+
"",
|
|
16
|
+
]
|
|
17
|
+
),
|
|
18
|
+
encoding="utf-8",
|
|
19
|
+
)
|
|
20
|
+
add_gff.write_text(
|
|
21
|
+
"\n".join(
|
|
22
|
+
[
|
|
23
|
+
"##gff-version 3",
|
|
24
|
+
"chr1\tANNEVO\tgene\t120\t480\t.\t+\t.\tID=gene2",
|
|
25
|
+
"chr1\tANNEVO\tmRNA\t120\t480\t.\t+\t.\tID=tx2;Parent=gene2",
|
|
26
|
+
"chr1\tANNEVO\texon\t120\t480\t.\t+\t.\tID=exon2;Parent=tx2",
|
|
27
|
+
"",
|
|
28
|
+
]
|
|
29
|
+
),
|
|
30
|
+
encoding="utf-8",
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
ref_set = parse_annotation_file(str(ref_gff))
|
|
34
|
+
add_set = parse_annotation_file(str(add_gff))
|
|
35
|
+
|
|
36
|
+
added = complement_annotations(ref_set, add_set, size_min=0)
|
|
37
|
+
|
|
38
|
+
assert added == 0
|
|
39
|
+
assert len(ref_set.roots) == 1
|
|
40
|
+
assert {feature.source for feature in ref_set.roots[0].iter_all()} == {"EviAnn,ANNEVO"}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_non_overlapping_gene_keeps_single_source(tmp_path):
|
|
44
|
+
ref_gff = tmp_path / "ref.gff3"
|
|
45
|
+
add_gff = tmp_path / "add.gff3"
|
|
46
|
+
|
|
47
|
+
ref_gff.write_text(
|
|
48
|
+
"chr1\tEviAnn\tgene\t100\t200\t.\t+\t.\tID=gene1\n",
|
|
49
|
+
encoding="utf-8",
|
|
50
|
+
)
|
|
51
|
+
add_gff.write_text(
|
|
52
|
+
"chr1\tANNEVO\tgene\t400\t500\t.\t+\t.\tID=gene2\n",
|
|
53
|
+
encoding="utf-8",
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
ref_set = parse_annotation_file(str(ref_gff))
|
|
57
|
+
add_set = parse_annotation_file(str(add_gff))
|
|
58
|
+
|
|
59
|
+
added = complement_annotations(ref_set, add_set, size_min=0)
|
|
60
|
+
|
|
61
|
+
assert added == 1
|
|
62
|
+
assert [root.source for root in ref_set.roots] == ["EviAnn", "ANNEVO"]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|