biofiles 0.0.5__tar.gz → 0.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biofiles-0.0.5 → biofiles-0.0.7}/PKG-INFO +1 -1
- biofiles-0.0.7/biofiles/feature.py +198 -0
- biofiles-0.0.7/biofiles/gff.py +155 -0
- biofiles-0.0.7/biofiles/gtf.py +38 -0
- {biofiles-0.0.5 → biofiles-0.0.7}/biofiles/types/feature.py +8 -2
- {biofiles-0.0.5 → biofiles-0.0.7}/biofiles.egg-info/PKG-INFO +1 -1
- {biofiles-0.0.5 → biofiles-0.0.7}/biofiles.egg-info/SOURCES.txt +2 -0
- {biofiles-0.0.5 → biofiles-0.0.7}/pyproject.toml +1 -1
- biofiles-0.0.5/biofiles/gff.py +0 -319
- {biofiles-0.0.5 → biofiles-0.0.7}/LICENSE +0 -0
- {biofiles-0.0.5 → biofiles-0.0.7}/README.md +0 -0
- {biofiles-0.0.5 → biofiles-0.0.7}/biofiles/__init__.py +0 -0
- {biofiles-0.0.5 → biofiles-0.0.7}/biofiles/common.py +0 -0
- {biofiles-0.0.5 → biofiles-0.0.7}/biofiles/fasta.py +0 -0
- {biofiles-0.0.5 → biofiles-0.0.7}/biofiles/repeatmasker.py +0 -0
- {biofiles-0.0.5 → biofiles-0.0.7}/biofiles/types/__init__.py +0 -0
- {biofiles-0.0.5 → biofiles-0.0.7}/biofiles/types/repeat.py +0 -0
- {biofiles-0.0.5 → biofiles-0.0.7}/biofiles/types/sequence.py +0 -0
- {biofiles-0.0.5 → biofiles-0.0.7}/biofiles.egg-info/dependency_links.txt +0 -0
- {biofiles-0.0.5 → biofiles-0.0.7}/biofiles.egg-info/top_level.txt +0 -0
- {biofiles-0.0.5 → biofiles-0.0.7}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: biofiles
|
3
|
-
Version: 0.0.5
|
3
|
+
Version: 0.0.7
|
4
4
|
Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
|
5
5
|
Author-email: Tigran Saluev <tigran@saluev.com>
|
6
6
|
Maintainer-email: Tigran Saluev <tigran@saluev.com>
|
@@ -0,0 +1,198 @@
|
|
1
|
+
from collections import deque
|
2
|
+
from dataclasses import dataclass, field
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Iterator, TextIO, Type
|
5
|
+
|
6
|
+
from biofiles.common import Reader, Strand
|
7
|
+
from biofiles.types.feature import Feature, Gene, ThreePrimeUTR, Exon, UTR
|
8
|
+
|
9
|
+
|
10
|
+
@dataclass
class FeatureDraft:
    """Unlinked record holding the parsed column values of one feature row.

    ``idx`` is the running position assigned by the reader; ``attributes``
    holds the parsed key/value pairs of the attribute column.
    """

    idx: int
    sequence_id: str
    source: str
    type_: str
    start_original: int
    end_original: int
    score: float | None
    strand: Strand | None
    phase: int | None
    attributes: dict[str, str]

    def pick_attribute(self, *keys: str) -> str | None:
        """Return the value of the first of *keys* that has a non-None value.

        Keys are tried in the order given; ``None`` is returned when none of
        them yields a value (including when no keys are passed).
        """
        candidates = (self.attributes.get(key) for key in keys)
        return next((value for value in candidates if value is not None), None)
|
28
|
+
|
29
|
+
|
30
|
+
@dataclass
class FeatureDrafts:
    """Ordered collection of drafts plus a lookup table keyed by ``ID``."""

    drafts: deque[FeatureDraft] = field(default_factory=deque)
    by_id: dict[str, FeatureDraft] = field(default_factory=dict)
    # deps: dict[int, deque[int]] = field(default_factory=lambda: defaultdict(deque))

    def add(self, draft: FeatureDraft) -> None:
        """Append *draft* and index it under its ``ID`` attribute, if non-empty."""
        self.drafts.append(draft)
        draft_id = draft.attributes.get("ID")
        if draft_id:
            self.by_id[draft_id] = draft
|
40
|
+
# if parent_id := draft.attributes.get("Parent", None):
|
41
|
+
# parent = self.by_id[parent_id]
|
42
|
+
# self.deps[parent.idx].append(draft.idx)
|
43
|
+
|
44
|
+
# def remove_first_n(self, n: int) -> None:
|
45
|
+
# for _ in range(n):
|
46
|
+
# draft = self.drafts.popleft()
|
47
|
+
# if id_ := draft.attributes.get("ID", None):
|
48
|
+
# del self.by_id[id_]
|
49
|
+
# self.deps.pop(draft.idx, None)
|
50
|
+
|
51
|
+
|
52
|
+
@dataclass
class Features:
    """Finalized features in insertion order, with a lookup table by ``id``."""

    features: list[Feature] = field(default_factory=list)
    by_id: dict[str, Feature] = field(default_factory=dict)

    def add(self, feature: Feature):
        """Append *feature* and index it under ``feature.id`` when that is non-empty."""
        self.features.append(feature)
        feature_id = feature.id
        if feature_id:
            self.by_id[feature_id] = feature
|
61
|
+
|
62
|
+
|
63
|
+
class FeatureReader(Reader):
|
64
|
+
def __init__(
    self, input_: TextIO | Path | str, /, streaming_window: int | None = 1000
) -> None:
    """Initialise the base reader with *input_* and remember the window size.

    ``streaming_window`` is only stored on the instance (as
    ``_streaming_window``) here; how it is consumed is up to the
    finalization code.
    """
    super().__init__(input_)
    self._streaming_window = streaming_window
|
69
|
+
|
70
|
+
def __iter__(self) -> Iterator[Feature]:
|
71
|
+
raise NotImplementedError
|
72
|
+
|
73
|
+
def _finalize_drafts(
    self, drafts: FeatureDrafts, w: int | None
) -> Iterator[Feature]:
    """Turn every collected draft into a feature and yield them in input order.

    All drafts are finalized before the first feature is yielded, so parent
    features already carry their children when the caller sees them.

    ``w`` (the streaming window) is accepted but currently ignored.
    """
    # TODO streaming version!
    # An earlier windowed sketch finalized only a prefix of the drafts; it
    # relied on a per-draft dependency map (``drafts.deps``) and on
    # ``drafts.remove_first_n``, neither of which exists on FeatureDrafts.
    finalized = Features()
    for draft in drafts.drafts:
        finalized.add(self._finalize_draft(draft, finalized))
    yield from finalized.features
|
107
|
+
|
108
|
+
def _finalize_draft(self, draft: FeatureDraft, result: Features) -> Feature:
|
109
|
+
match draft.type_.lower():
|
110
|
+
case "gene":
|
111
|
+
feature = self._finalize_gene(draft, result)
|
112
|
+
case "exon":
|
113
|
+
feature = self._finalize_exon(draft, result)
|
114
|
+
case "three_prime_utr":
|
115
|
+
feature = self._finalize_utr(draft, result, ThreePrimeUTR)
|
116
|
+
case "utr":
|
117
|
+
feature = self._finalize_utr(draft, result, UTR)
|
118
|
+
case _:
|
119
|
+
feature = self._finalize_other(draft, result)
|
120
|
+
if feature.parent:
|
121
|
+
new_children = feature.parent.children + (feature,)
|
122
|
+
object.__setattr__(feature.parent, "children", new_children)
|
123
|
+
return feature
|
124
|
+
|
125
|
+
def _finalize_gene(self, draft: FeatureDraft, result: Features) -> Feature:
|
126
|
+
feature = self._finalize_other(draft, result)
|
127
|
+
name = draft.pick_attribute("gene_name", "Name")
|
128
|
+
biotype = draft.pick_attribute("gene_biotype", "biotype", "gene_type")
|
129
|
+
if name is None or biotype is None:
|
130
|
+
return feature
|
131
|
+
return Gene(**feature.__dict__, name=name, biotype=biotype, exons=())
|
132
|
+
|
133
|
+
def _finalize_exon(self, draft: FeatureDraft, result: Features) -> Feature:
|
134
|
+
feature = self._finalize_other(draft, result)
|
135
|
+
|
136
|
+
gene = feature.parent
|
137
|
+
while gene and not isinstance(gene, Gene):
|
138
|
+
gene = gene.parent
|
139
|
+
|
140
|
+
if gene is None:
|
141
|
+
return feature
|
142
|
+
exon = Exon(**feature.__dict__, gene=gene)
|
143
|
+
object.__setattr__(gene, "exons", gene.exons + (exon,))
|
144
|
+
return exon
|
145
|
+
|
146
|
+
def _finalize_utr(
|
147
|
+
self, draft: FeatureDraft, result: Features, type_: Type[UTR]
|
148
|
+
) -> Feature:
|
149
|
+
feature = self._finalize_other(draft, result)
|
150
|
+
|
151
|
+
gene = feature.parent
|
152
|
+
while gene and not isinstance(gene, Gene):
|
153
|
+
gene = gene.parent
|
154
|
+
|
155
|
+
if gene is None:
|
156
|
+
return feature
|
157
|
+
return type_(**feature.__dict__, gene=gene)
|
158
|
+
|
159
|
+
def _finalize_other(self, draft: FeatureDraft, result: Features) -> Feature:
    """Build a plain ``Feature`` from *draft*, linked to its parent if known.

    The parent is looked up among the features finalized so far. A parent
    id that cannot be resolved (for example one inferred from ``gene_id`` /
    ``transcript_id`` in a file that has no row for that gene or
    transcript) yields a parentless feature instead of a bare ``KeyError``.

    ``start_c`` / ``end_c`` are derived from the original coordinates as
    ``start_original - 1`` and ``end_original``.
    """
    parent_id = self._extract_parent_id(draft)
    parent = result.by_id.get(parent_id) if parent_id is not None else None

    return Feature(
        sequence_id=draft.sequence_id,
        source=draft.source,
        type_=draft.type_,
        start_original=draft.start_original,
        end_original=draft.end_original,
        start_c=draft.start_original - 1,
        end_c=draft.end_original,
        score=draft.score,
        strand=draft.strand,
        phase=draft.phase,
        attributes=draft.attributes,
        id=self._extract_id(draft),
        parent=parent,
        children=(),
    )
|
179
|
+
|
180
|
+
def _extract_id(self, draft: FeatureDraft) -> str | None:
|
181
|
+
if (id_ := draft.attributes.get("ID")) is not None:
|
182
|
+
return id_
|
183
|
+
if draft.type_ == "gene" and (id_ := draft.attributes.get("gene_id")):
|
184
|
+
return id_
|
185
|
+
if draft.type_ == "transcript" and (
|
186
|
+
id_ := draft.attributes.get("transcript_id")
|
187
|
+
):
|
188
|
+
return id_
|
189
|
+
return None
|
190
|
+
|
191
|
+
def _extract_parent_id(self, draft: FeatureDraft) -> str | None:
|
192
|
+
if (id_ := draft.attributes.get("Parent")) is not None:
|
193
|
+
return id_
|
194
|
+
if draft.type_ == "transcript" and (id_ := draft.attributes.get("gene_id")):
|
195
|
+
return id_
|
196
|
+
if draft.type_ == "exon" and (id_ := draft.attributes.get("transcript_id")):
|
197
|
+
return id_
|
198
|
+
return None
|
@@ -0,0 +1,155 @@
|
|
1
|
+
import sys
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Iterator, cast, TextIO
|
4
|
+
|
5
|
+
from biofiles.common import Strand, Writer
|
6
|
+
from biofiles.feature import FeatureReader, FeatureDraft, FeatureDrafts
|
7
|
+
from biofiles.types.feature import Feature, Gene, Exon
|
8
|
+
|
9
|
+
__all__ = ["GFFReader", "GFF3Writer"]
|
10
|
+
|
11
|
+
|
12
|
+
class GFFReader(FeatureReader):
|
13
|
+
def __iter__(self) -> Iterator[Feature]:
    """Locate the ``##gff-version`` pragma, then yield the file's features.

    Comment lines before the pragma are skipped. Any version whose major
    component is ``3`` (``3``, ``3.1.26``, ...) is accepted.

    Raises:
        ValueError: on an unsupported version, or when a non-comment line
            appears before the version pragma.
    """
    for line in self._input:
        # Drop the line terminator, tolerating CRLF input.
        line = line.rstrip("\r\n")
        if line.startswith(_VERSION_PREFIX):
            version = line.removeprefix(_VERSION_PREFIX).strip()
            # The pragma may carry a full spec version such as "3.1.26".
            if version.split(".", 1)[0] == "3":
                yield from self._read_gff3()
                return
            raise ValueError(f"unsupported version {version!r}")
        if line.startswith("#"):
            continue
        raise ValueError(f"unexpected line {line!r}, expected version")
|
25
|
+
|
26
|
+
def _read_gff3(self) -> Iterator[Feature]:
    """Parse the remaining input as nine-column feature rows.

    Comment/directive lines and blank lines are skipped. Parsing stops at
    the ``##FASTA`` directive (or at a bare ``>`` header line), since what
    follows is sequence data rather than feature rows. All rows are
    collected as drafts first and finalized once the input is exhausted.

    Raises:
        ValueError: when a row does not have nine tab-separated columns,
            a column cannot be parsed, or a ``Parent`` attribute names an
            ID that has not been seen on an earlier row.
    """
    drafts = FeatureDrafts()
    idx = 0
    for line in self._input:
        if line.startswith("##FASTA") or line.startswith(">"):
            # Everything from here on is embedded sequence data.
            break
        if line.startswith("#"):
            continue
        line = line.rstrip("\r\n")
        if not line.strip():
            continue
        parts = line.split("\t", maxsplit=8)
        if len(parts) != 9:
            raise ValueError(f"unexpected line {line!r}, expected 9 columns")
        (
            sequence_id,
            source,
            type_,
            start_str,
            end_str,
            score_str,
            strand_str,
            phase_str,
            attributes_str,
        ) = parts
        score = self._parse_score(line, score_str)
        strand = self._parse_strand(line, strand_str)
        phase = self._parse_phase(line, phase_str)
        attributes = self._parse_attributes(line, attributes_str)

        # NOTE: the Parent value is looked up as a single ID; a
        # comma-separated list of parents is not split here.
        parent_id = attributes.get("Parent", None)
        if parent_id is not None and parent_id not in drafts.by_id:
            raise ValueError(
                f"unexpected line {line!r}, parent ID not among recent feature IDs"
            )

        try:
            start_original = int(start_str)
            end_original = int(end_str)
        except ValueError as exc:
            raise ValueError(
                f"unexpected line {line!r}, start and end should be integers"
            ) from exc

        draft = FeatureDraft(
            idx=idx,
            sequence_id=sequence_id,
            source=source,
            type_=type_,
            start_original=start_original,
            end_original=end_original,
            score=score,
            strand=strand,
            phase=phase,
            attributes=attributes,
        )
        drafts.add(draft)
        idx += 1

    # TODO: pass self._streaming_window once streaming finalization exists.
    yield from self._finalize_drafts(drafts, None)
|
79
|
+
|
80
|
+
def _parse_score(self, line: str, score_str: str) -> float | None:
|
81
|
+
if score_str == ".":
|
82
|
+
return None
|
83
|
+
try:
|
84
|
+
return float(score_str)
|
85
|
+
except ValueError as exc:
|
86
|
+
raise ValueError(
|
87
|
+
f"unexpected line {line!r}, score should be a number or '.'"
|
88
|
+
) from exc
|
89
|
+
|
90
|
+
def _parse_strand(self, line: str, strand_str: str) -> Strand | None:
|
91
|
+
if strand_str in ("-", "+"):
|
92
|
+
return cast(Strand, strand_str)
|
93
|
+
if strand_str == ".":
|
94
|
+
return None
|
95
|
+
raise ValueError(f"unexpected line {line!r}, strand should be '-', '+' or '.'")
|
96
|
+
|
97
|
+
def _parse_phase(self, line: str, phase_str: str) -> int | None:
|
98
|
+
if phase_str == ".":
|
99
|
+
return None
|
100
|
+
try:
|
101
|
+
return int(phase_str)
|
102
|
+
except ValueError as exc:
|
103
|
+
raise ValueError(
|
104
|
+
f"unexpected line {line!r}, phase should be an integer or '.'"
|
105
|
+
) from exc
|
106
|
+
|
107
|
+
def _parse_attributes(self, line: str, attributes_str: str) -> dict[str, str]:
|
108
|
+
return {
|
109
|
+
k: v
|
110
|
+
for part in attributes_str.strip(";").split(";")
|
111
|
+
for k, v in (part.split("=", 1),)
|
112
|
+
}
|
113
|
+
|
114
|
+
|
115
|
+
class GFF3Writer(Writer):
|
116
|
+
def __init__(self, output: TextIO | Path | str) -> None:
    """Initialise the base writer with *output* and emit the version pragma."""
    super().__init__(output)
    header = f"{_VERSION_PREFIX}3\n"
    self._output.write(header)
|
119
|
+
|
120
|
+
def write(self, feature: Feature) -> None:
    """Write *feature* as one tab-separated nine-column row.

    The start/end columns are derived from ``start_c + 1`` and ``end_c``.
    Missing score, strand and phase are written as ``"."``; so is the
    attribute column when the feature has no attributes, so that the row
    never ends in an empty field.
    """
    attributes = ";".join(f"{k}={v}" for k, v in feature.attributes.items())
    columns = (
        feature.sequence_id,
        feature.source,
        feature.type_,
        str(feature.start_c + 1),
        str(feature.end_c),
        "." if feature.score is None else str(feature.score),
        "." if feature.strand is None else str(feature.strand),
        "." if feature.phase is None else str(feature.phase),
        attributes or ".",
    )
    self._output.write("\t".join(columns) + "\n")
|
134
|
+
|
135
|
+
|
136
|
+
_VERSION_PREFIX = "##gff-version "
|
137
|
+
|
138
|
+
|
139
|
+
if __name__ == "__main__":
    # Ad-hoc check: print, for every file named on the command line, how many
    # gene/exon rows were annotated and how many became Gene/Exon objects.
    for path in sys.argv[1:]:
        with GFFReader(path) as r:
            parsed = list(r)
            total_features = len(parsed)
            annotated_genes = sum(item.type_ == "gene" for item in parsed)
            annotated_exons = sum(item.type_ == "exon" for item in parsed)
            parsed_genes = sum(isinstance(item, Gene) for item in parsed)
            parsed_exons = sum(isinstance(item, Exon) for item in parsed)
            print(
                f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
            )
|
@@ -0,0 +1,38 @@
|
|
1
|
+
__all__ = ["GTFReader"]
|
2
|
+
|
3
|
+
import sys
|
4
|
+
from typing import Iterator
|
5
|
+
|
6
|
+
from biofiles.gff import GFFReader
|
7
|
+
from biofiles.types.feature import Gene, Exon, Feature
|
8
|
+
|
9
|
+
|
10
|
+
class GTFReader(GFFReader):
|
11
|
+
def __iter__(self) -> Iterator[Feature]:
|
12
|
+
yield from self._read_gff3()
|
13
|
+
|
14
|
+
def _parse_attributes(self, line: str, attributes_str: str) -> dict[str, str]:
|
15
|
+
return {
|
16
|
+
k: v.strip('"')
|
17
|
+
for part in attributes_str.strip(";").split(";")
|
18
|
+
for k, v in (part.strip().split(None, 1),)
|
19
|
+
}
|
20
|
+
|
21
|
+
|
22
|
+
if __name__ == "__main__":
    # Ad-hoc check: print, for every file named on the command line, how many
    # gene/exon rows were annotated and how many became Gene/Exon objects.
    for path in sys.argv[1:]:
        with GTFReader(path) as r:
            parsed = list(r)
            total_features = len(parsed)
            annotated_genes = sum(item.type_ == "gene" for item in parsed)
            annotated_exons = sum(item.type_ == "exon" for item in parsed)
            parsed_genes = sum(isinstance(item, Gene) for item in parsed)
            parsed_exons = sum(isinstance(item, Exon) for item in parsed)
            print(
                f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
            )
|
@@ -14,7 +14,7 @@ class Feature:
|
|
14
14
|
|
15
15
|
start_original: int
|
16
16
|
end_original: int
|
17
|
-
# Original
|
17
|
+
# Original values as they were present in the file (1-based inclusive for .gff and .gtf).
|
18
18
|
|
19
19
|
start_c: int
|
20
20
|
end_c: int
|
@@ -25,6 +25,7 @@ class Feature:
|
|
25
25
|
phase: int | None
|
26
26
|
attributes: dict[str, str]
|
27
27
|
|
28
|
+
id: str | None
|
28
29
|
parent: "GFFFeature | None"
|
29
30
|
children: tuple["Feature", ...]
|
30
31
|
|
@@ -46,6 +47,11 @@ class Exon(Feature):
|
|
46
47
|
|
47
48
|
|
48
49
|
@dataclass(frozen=True)
|
49
|
-
class ThreePrimeUTR(Feature):
|
50
|
+
class UTR(Feature):
|
50
51
|
gene: Gene
|
51
52
|
# TODO transcript
|
53
|
+
|
54
|
+
|
55
|
+
@dataclass(frozen=True)
class ThreePrimeUTR(UTR):
    """A three-prime UTR; declares no fields beyond those of ``UTR``."""
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: biofiles
|
3
|
-
Version: 0.0.5
|
3
|
+
Version: 0.0.7
|
4
4
|
Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
|
5
5
|
Author-email: Tigran Saluev <tigran@saluev.com>
|
6
6
|
Maintainer-email: Tigran Saluev <tigran@saluev.com>
|
biofiles-0.0.5/biofiles/gff.py
DELETED
@@ -1,319 +0,0 @@
|
|
1
|
-
import sys
|
2
|
-
from collections import deque
|
3
|
-
from dataclasses import dataclass, field
|
4
|
-
from pathlib import Path
|
5
|
-
from typing import Iterator, cast, TextIO
|
6
|
-
|
7
|
-
from biofiles.common import Strand, Reader, Writer
|
8
|
-
from biofiles.types.feature import Feature, Gene, Exon, ThreePrimeUTR
|
9
|
-
|
10
|
-
__all__ = ["GFFReader", "GFF3Writer"]
|
11
|
-
|
12
|
-
|
13
|
-
@dataclass
|
14
|
-
class _FeatureDraft:
|
15
|
-
idx: int
|
16
|
-
sequence_id: str
|
17
|
-
source: str
|
18
|
-
type_: str
|
19
|
-
start_original: int
|
20
|
-
end_original: int
|
21
|
-
score: float | None
|
22
|
-
strand: Strand | None
|
23
|
-
phase: int | None
|
24
|
-
attributes: dict[str, str]
|
25
|
-
|
26
|
-
def pick_attribute(self, *keys: str) -> str | None:
|
27
|
-
for key in keys:
|
28
|
-
if (value := self.attributes.get(key, None)) is not None:
|
29
|
-
return value
|
30
|
-
return None
|
31
|
-
|
32
|
-
|
33
|
-
@dataclass
|
34
|
-
class _FeatureDrafts:
|
35
|
-
drafts: deque[_FeatureDraft] = field(default_factory=deque)
|
36
|
-
by_id: dict[str, _FeatureDraft] = field(default_factory=dict)
|
37
|
-
# deps: dict[int, deque[int]] = field(default_factory=lambda: defaultdict(deque))
|
38
|
-
|
39
|
-
def add(self, draft: _FeatureDraft) -> None:
|
40
|
-
self.drafts.append(draft)
|
41
|
-
if id_ := draft.attributes.get("ID", None):
|
42
|
-
self.by_id[id_] = draft
|
43
|
-
# if parent_id := draft.attributes.get("Parent", None):
|
44
|
-
# parent = self.by_id[parent_id]
|
45
|
-
# self.deps[parent.idx].append(draft.idx)
|
46
|
-
|
47
|
-
# def remove_first_n(self, n: int) -> None:
|
48
|
-
# for _ in range(n):
|
49
|
-
# draft = self.drafts.popleft()
|
50
|
-
# if id_ := draft.attributes.get("ID", None):
|
51
|
-
# del self.by_id[id_]
|
52
|
-
# self.deps.pop(draft.idx, None)
|
53
|
-
|
54
|
-
|
55
|
-
@dataclass
|
56
|
-
class _Features:
|
57
|
-
features: list[Feature] = field(default_factory=list)
|
58
|
-
by_id: dict[str, Feature] = field(default_factory=dict)
|
59
|
-
|
60
|
-
def add(self, feature: Feature):
|
61
|
-
self.features.append(feature)
|
62
|
-
if id_ := feature.attributes.get("ID", None):
|
63
|
-
self.by_id[id_] = feature
|
64
|
-
|
65
|
-
|
66
|
-
class GFFReader(Reader):
|
67
|
-
def __init__(
|
68
|
-
self, input_: TextIO | Path | str, /, streaming_window: int | None = 1000
|
69
|
-
):
|
70
|
-
super().__init__(input_)
|
71
|
-
self._streaming_window = streaming_window
|
72
|
-
|
73
|
-
def __iter__(self) -> Iterator[Feature]:
|
74
|
-
for line in self._input:
|
75
|
-
line = line.rstrip("\n")
|
76
|
-
if line.startswith(_VERSION_PREFIX):
|
77
|
-
version = line.removeprefix(_VERSION_PREFIX)
|
78
|
-
if version == "3":
|
79
|
-
yield from self._read_gff3()
|
80
|
-
return
|
81
|
-
raise ValueError(f"unsupported version {version!r}")
|
82
|
-
if line.startswith("#"):
|
83
|
-
continue
|
84
|
-
raise ValueError(f"unexpected line {line!r}, expected version")
|
85
|
-
|
86
|
-
def _read_gff3(self) -> Iterator[Feature]:
|
87
|
-
drafts = _FeatureDrafts()
|
88
|
-
idx = 0
|
89
|
-
for line in self._input:
|
90
|
-
if line.startswith("#"):
|
91
|
-
continue
|
92
|
-
line = line.rstrip("\n")
|
93
|
-
parts = line.split("\t", maxsplit=8)
|
94
|
-
if len(parts) != 9:
|
95
|
-
raise ValueError(f"unexpected line {line!r}, expected 9 columns")
|
96
|
-
(
|
97
|
-
sequence_id,
|
98
|
-
source,
|
99
|
-
type_,
|
100
|
-
start_str,
|
101
|
-
end_str,
|
102
|
-
score_str,
|
103
|
-
strand_str,
|
104
|
-
phase_str,
|
105
|
-
attributes_str,
|
106
|
-
) = parts
|
107
|
-
score = self._parse_score(line, score_str)
|
108
|
-
strand = self._parse_strand(line, strand_str)
|
109
|
-
phase = self._parse_phase(line, phase_str)
|
110
|
-
attributes = self._parse_attributes(line, attributes_str)
|
111
|
-
|
112
|
-
parent_id = attributes.get("Parent", None)
|
113
|
-
# if parent_id is None:
|
114
|
-
# yield from self._finalize_drafts(drafts)
|
115
|
-
# drafts = _FeatureDrafts()
|
116
|
-
if parent_id is not None and parent_id not in drafts.by_id:
|
117
|
-
raise ValueError(
|
118
|
-
f"unexpected line {line!r}, parent ID not among recent feature IDs"
|
119
|
-
)
|
120
|
-
|
121
|
-
draft = _FeatureDraft(
|
122
|
-
idx=idx,
|
123
|
-
sequence_id=sequence_id,
|
124
|
-
source=source,
|
125
|
-
type_=type_,
|
126
|
-
start_original=int(start_str),
|
127
|
-
end_original=int(end_str),
|
128
|
-
score=score,
|
129
|
-
strand=strand,
|
130
|
-
phase=phase,
|
131
|
-
attributes=attributes,
|
132
|
-
)
|
133
|
-
drafts.add(draft)
|
134
|
-
idx += 1
|
135
|
-
|
136
|
-
# yield from self._finalize_drafts(drafts, self._streaming_window)
|
137
|
-
|
138
|
-
yield from self._finalize_drafts(drafts, None)
|
139
|
-
|
140
|
-
def _finalize_drafts(
|
141
|
-
self, drafts: _FeatureDrafts, w: int | None
|
142
|
-
) -> Iterator[Feature]:
|
143
|
-
# TODO streaming version!
|
144
|
-
# code below is already tracking
|
145
|
-
# if not drafts.drafts:
|
146
|
-
# return
|
147
|
-
# if w is not None and len(drafts.drafts) <= w:
|
148
|
-
# return
|
149
|
-
#
|
150
|
-
# end_idx = drafts.drafts[-w].idx if w is not None else drafts.drafts[-1].idx
|
151
|
-
#
|
152
|
-
# i = 0
|
153
|
-
# while i < len(drafts.drafts) and (
|
154
|
-
# not drafts.deps[drafts.drafts[i].idx]
|
155
|
-
# or drafts.deps[drafts.drafts[i].idx][-1] <= end_idx
|
156
|
-
# ):
|
157
|
-
# i += 1
|
158
|
-
#
|
159
|
-
# print(f"FINALIZING {i} DRAFTS OUT OF {len(drafts.drafts)}")
|
160
|
-
#
|
161
|
-
# result = _Features()
|
162
|
-
# for j in range(i):
|
163
|
-
# draft = drafts.drafts[j]
|
164
|
-
# feature = self._finalize_draft(draft, result)
|
165
|
-
# result.add(feature)
|
166
|
-
# drafts.remove_first_n(i)
|
167
|
-
# yield from result.features
|
168
|
-
|
169
|
-
result = _Features()
|
170
|
-
for draft in drafts.drafts:
|
171
|
-
feature = self._finalize_draft(draft, result)
|
172
|
-
result.add(feature)
|
173
|
-
yield from result.features
|
174
|
-
|
175
|
-
def _finalize_draft(self, draft: _FeatureDraft, result: _Features) -> Feature:
|
176
|
-
match draft.type_.lower():
|
177
|
-
case "gene":
|
178
|
-
feature = self._finalize_gene(draft, result)
|
179
|
-
case "exon":
|
180
|
-
feature = self._finalize_exon(draft, result)
|
181
|
-
case "three_prime_utr":
|
182
|
-
feature = self._finalize_three_prime_utr(draft, result)
|
183
|
-
case _:
|
184
|
-
feature = self._finalize_other(draft, result)
|
185
|
-
if feature.parent:
|
186
|
-
new_children = feature.parent.children + (feature,)
|
187
|
-
object.__setattr__(feature.parent, "children", new_children)
|
188
|
-
return feature
|
189
|
-
|
190
|
-
def _finalize_gene(self, draft: _FeatureDraft, result: _Features) -> Feature:
|
191
|
-
feature = self._finalize_other(draft, result)
|
192
|
-
name = draft.pick_attribute("gene_name", "Name")
|
193
|
-
biotype = draft.pick_attribute("gene_biotype", "biotype")
|
194
|
-
if name is None or biotype is None:
|
195
|
-
return feature
|
196
|
-
return Gene(**feature.__dict__, name=name, biotype=biotype, exons=())
|
197
|
-
|
198
|
-
def _finalize_exon(self, draft: _FeatureDraft, result: _Features) -> Feature:
|
199
|
-
feature = self._finalize_other(draft, result)
|
200
|
-
|
201
|
-
gene = feature.parent
|
202
|
-
while gene and not isinstance(gene, Gene):
|
203
|
-
gene = gene.parent
|
204
|
-
|
205
|
-
if gene is None:
|
206
|
-
return feature
|
207
|
-
exon = Exon(**feature.__dict__, gene=gene)
|
208
|
-
object.__setattr__(gene, "exons", gene.exons + (exon,))
|
209
|
-
return exon
|
210
|
-
|
211
|
-
def _finalize_three_prime_utr(
|
212
|
-
self, draft: _FeatureDraft, result: _Features
|
213
|
-
) -> Feature:
|
214
|
-
feature = self._finalize_other(draft, result)
|
215
|
-
|
216
|
-
gene = feature.parent
|
217
|
-
while gene and not isinstance(gene, Gene):
|
218
|
-
gene = gene.parent
|
219
|
-
|
220
|
-
if gene is None:
|
221
|
-
return feature
|
222
|
-
return ThreePrimeUTR(**feature.__dict__, gene=gene)
|
223
|
-
|
224
|
-
def _finalize_other(self, draft: _FeatureDraft, result: _Features) -> Feature:
|
225
|
-
parent_id = draft.attributes.get("Parent", None)
|
226
|
-
parent = result.by_id[parent_id] if parent_id is not None else None
|
227
|
-
|
228
|
-
return Feature(
|
229
|
-
sequence_id=draft.sequence_id,
|
230
|
-
source=draft.source,
|
231
|
-
type_=draft.type_,
|
232
|
-
start_original=draft.start_original,
|
233
|
-
end_original=draft.end_original,
|
234
|
-
start_c=draft.start_original - 1,
|
235
|
-
end_c=draft.end_original,
|
236
|
-
score=draft.score,
|
237
|
-
strand=draft.strand,
|
238
|
-
phase=draft.phase,
|
239
|
-
attributes=draft.attributes,
|
240
|
-
parent=parent,
|
241
|
-
children=(),
|
242
|
-
)
|
243
|
-
|
244
|
-
def _parse_score(self, line: str, score_str: str) -> float | None:
|
245
|
-
if score_str == ".":
|
246
|
-
return None
|
247
|
-
try:
|
248
|
-
return float(score_str)
|
249
|
-
except ValueError as exc:
|
250
|
-
raise ValueError(
|
251
|
-
f"unexpected line {line!r}, score should be a number or '.'"
|
252
|
-
) from exc
|
253
|
-
|
254
|
-
def _parse_strand(self, line: str, strand_str: str) -> Strand | None:
|
255
|
-
if strand_str in ("-", "+"):
|
256
|
-
return cast(Strand, strand_str)
|
257
|
-
if strand_str == ".":
|
258
|
-
return None
|
259
|
-
raise ValueError(f"unexpected line {line!r}, strand should be '-', '+' or '.'")
|
260
|
-
|
261
|
-
def _parse_phase(self, line: str, phase_str: str) -> int | None:
|
262
|
-
if phase_str == ".":
|
263
|
-
return None
|
264
|
-
try:
|
265
|
-
return int(phase_str)
|
266
|
-
except ValueError as exc:
|
267
|
-
raise ValueError(
|
268
|
-
f"unexpected line {line!r}, phase should be an integer or '.'"
|
269
|
-
) from exc
|
270
|
-
|
271
|
-
def _parse_attributes(self, line: str, attributes_str: str) -> dict[str, str]:
|
272
|
-
return {
|
273
|
-
k: v
|
274
|
-
for part in attributes_str.strip(";").split(";")
|
275
|
-
for k, v in (part.split("=", 1),)
|
276
|
-
}
|
277
|
-
|
278
|
-
|
279
|
-
class GFF3Writer(Writer):
|
280
|
-
def __init__(self, output: TextIO | Path | str) -> None:
|
281
|
-
super().__init__(output)
|
282
|
-
self._output.write(f"{_VERSION_PREFIX}3\n")
|
283
|
-
|
284
|
-
def write(self, feature: Feature) -> None:
|
285
|
-
fields = (
|
286
|
-
feature.sequence_id,
|
287
|
-
feature.source,
|
288
|
-
feature.type_,
|
289
|
-
str(feature.start_original),
|
290
|
-
str(feature.end_original),
|
291
|
-
str(feature.score) if feature.score is not None else ".",
|
292
|
-
str(feature.strand) if feature.strand is not None else ".",
|
293
|
-
str(feature.phase) if feature.phase is not None else ".",
|
294
|
-
";".join(f"{k}={v}" for k, v in feature.attributes.items()),
|
295
|
-
)
|
296
|
-
self._output.write("\t".join(fields))
|
297
|
-
self._output.write("\n")
|
298
|
-
|
299
|
-
|
300
|
-
_VERSION_PREFIX = "##gff-version "
|
301
|
-
|
302
|
-
|
303
|
-
if __name__ == "__main__":
|
304
|
-
for path in sys.argv[1:]:
|
305
|
-
with GFFReader(path) as r:
|
306
|
-
total_features = 0
|
307
|
-
annotated_genes = 0
|
308
|
-
annotated_exons = 0
|
309
|
-
parsed_genes = 0
|
310
|
-
parsed_exons = 0
|
311
|
-
for feature in r:
|
312
|
-
total_features += 1
|
313
|
-
annotated_genes += feature.type_ == "gene"
|
314
|
-
annotated_exons += feature.type_ == "exon"
|
315
|
-
parsed_genes += isinstance(feature, Gene)
|
316
|
-
parsed_exons += isinstance(feature, Exon)
|
317
|
-
print(
|
318
|
-
f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
|
319
|
-
)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|