biofiles 0.0.5__tar.gz → 0.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: biofiles
3
- Version: 0.0.5
3
+ Version: 0.0.7
4
4
  Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
5
  Author-email: Tigran Saluev <tigran@saluev.com>
6
6
  Maintainer-email: Tigran Saluev <tigran@saluev.com>
@@ -0,0 +1,198 @@
1
+ from collections import deque
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from typing import Iterator, TextIO, Type
5
+
6
+ from biofiles.common import Reader, Strand
7
+ from biofiles.types.feature import Feature, Gene, ThreePrimeUTR, Exon, UTR
8
+
9
+
10
@dataclass
class FeatureDraft:
    """One parsed feature record that has not yet been linked to its parent.

    Drafts hold the raw column values of a single line; they are turned into
    ``Feature`` objects later, once parent/child relations can be resolved.
    """

    # Position of the draft in reading order.
    idx: int
    sequence_id: str
    source: str
    # Feature type exactly as written in the file (e.g. "gene", "exon").
    type_: str
    # Start and end coordinates exactly as they appeared in the file.
    start_original: int
    end_original: int
    # None stands for a value that was absent in the file.
    score: float | None
    strand: Strand | None
    phase: int | None
    # Parsed attribute column, key -> value.
    attributes: dict[str, str]

    def pick_attribute(self, *keys: str) -> str | None:
        """Return the value of the first key in ``keys`` present in the attributes.

        Keys are tried in the order given. ``None`` is returned when none of
        them is present (or when no keys are passed at all).
        """
        candidates = (self.attributes.get(key) for key in keys)
        return next((value for value in candidates if value is not None), None)
28
+
29
+
30
@dataclass
class FeatureDrafts:
    """Ordered collection of drafts with a lookup by their ``ID`` attribute."""

    # Drafts in the order they were added.
    drafts: deque[FeatureDraft] = field(default_factory=deque)
    # Drafts that carry a non-empty "ID" attribute, keyed by that ID.
    by_id: dict[str, FeatureDraft] = field(default_factory=dict)

    # TODO: the streaming mode was sketched with a ``deps`` mapping
    # (parent draft idx -> deque of dependent draft idx, filled in ``add``)
    # and a ``remove_first_n`` method that pops finalized drafts and drops
    # their ``by_id`` / ``deps`` entries. Neither is implemented yet.

    def add(self, draft: FeatureDraft) -> None:
        """Append ``draft`` and index it by its ``ID`` attribute, if it has one."""
        self.drafts.append(draft)
        feature_id = draft.attributes.get("ID", None)
        if not feature_id:
            # Missing or empty ID: the draft cannot be referenced as a parent.
            return
        self.by_id[feature_id] = draft
50
+
51
+
52
@dataclass
class Features:
    """Finalized features in insertion order, plus a lookup by feature id."""

    # Every feature that was added, in order.
    features: list[Feature] = field(default_factory=list)
    # Features with a non-empty ``id``, keyed by that id.
    by_id: dict[str, Feature] = field(default_factory=dict)

    def add(self, feature: Feature) -> None:
        """Record ``feature`` and make it retrievable by ``feature.id`` when set."""
        self.features.append(feature)
        feature_id = feature.id
        if not feature_id:
            # None or empty id: keep the feature but do not index it.
            return
        self.by_id[feature_id] = feature
61
+
62
+
63
+ class FeatureReader(Reader):
64
+ def __init__(
65
+ self, input_: TextIO | Path | str, /, streaming_window: int | None = 1000
66
+ ):
67
+ super().__init__(input_)
68
+ self._streaming_window = streaming_window
69
+
70
+ def __iter__(self) -> Iterator[Feature]:
71
+ raise NotImplementedError
72
+
73
+ def _finalize_drafts(
74
+ self, drafts: FeatureDrafts, w: int | None
75
+ ) -> Iterator[Feature]:
76
+ # TODO streaming version!
77
+ # code below is already tracking
78
+ # if not drafts.drafts:
79
+ # return
80
+ # if w is not None and len(drafts.drafts) <= w:
81
+ # return
82
+ #
83
+ # end_idx = drafts.drafts[-w].idx if w is not None else drafts.drafts[-1].idx
84
+ #
85
+ # i = 0
86
+ # while i < len(drafts.drafts) and (
87
+ # not drafts.deps[drafts.drafts[i].idx]
88
+ # or drafts.deps[drafts.drafts[i].idx][-1] <= end_idx
89
+ # ):
90
+ # i += 1
91
+ #
92
+ # print(f"FINALIZING {i} DRAFTS OUT OF {len(drafts.drafts)}")
93
+ #
94
+ # result = _Features()
95
+ # for j in range(i):
96
+ # draft = drafts.drafts[j]
97
+ # feature = self._finalize_draft(draft, result)
98
+ # result.add(feature)
99
+ # drafts.remove_first_n(i)
100
+ # yield from result.features
101
+
102
+ result = Features()
103
+ for draft in drafts.drafts:
104
+ feature = self._finalize_draft(draft, result)
105
+ result.add(feature)
106
+ yield from result.features
107
+
108
+ def _finalize_draft(self, draft: FeatureDraft, result: Features) -> Feature:
109
+ match draft.type_.lower():
110
+ case "gene":
111
+ feature = self._finalize_gene(draft, result)
112
+ case "exon":
113
+ feature = self._finalize_exon(draft, result)
114
+ case "three_prime_utr":
115
+ feature = self._finalize_utr(draft, result, ThreePrimeUTR)
116
+ case "utr":
117
+ feature = self._finalize_utr(draft, result, UTR)
118
+ case _:
119
+ feature = self._finalize_other(draft, result)
120
+ if feature.parent:
121
+ new_children = feature.parent.children + (feature,)
122
+ object.__setattr__(feature.parent, "children", new_children)
123
+ return feature
124
+
125
+ def _finalize_gene(self, draft: FeatureDraft, result: Features) -> Feature:
126
+ feature = self._finalize_other(draft, result)
127
+ name = draft.pick_attribute("gene_name", "Name")
128
+ biotype = draft.pick_attribute("gene_biotype", "biotype", "gene_type")
129
+ if name is None or biotype is None:
130
+ return feature
131
+ return Gene(**feature.__dict__, name=name, biotype=biotype, exons=())
132
+
133
+ def _finalize_exon(self, draft: FeatureDraft, result: Features) -> Feature:
134
+ feature = self._finalize_other(draft, result)
135
+
136
+ gene = feature.parent
137
+ while gene and not isinstance(gene, Gene):
138
+ gene = gene.parent
139
+
140
+ if gene is None:
141
+ return feature
142
+ exon = Exon(**feature.__dict__, gene=gene)
143
+ object.__setattr__(gene, "exons", gene.exons + (exon,))
144
+ return exon
145
+
146
+ def _finalize_utr(
147
+ self, draft: FeatureDraft, result: Features, type_: Type[UTR]
148
+ ) -> Feature:
149
+ feature = self._finalize_other(draft, result)
150
+
151
+ gene = feature.parent
152
+ while gene and not isinstance(gene, Gene):
153
+ gene = gene.parent
154
+
155
+ if gene is None:
156
+ return feature
157
+ return type_(**feature.__dict__, gene=gene)
158
+
159
+ def _finalize_other(self, draft: FeatureDraft, result: Features) -> Feature:
160
+ parent_id = self._extract_parent_id(draft)
161
+ parent = result.by_id[parent_id] if parent_id is not None else None
162
+
163
+ return Feature(
164
+ sequence_id=draft.sequence_id,
165
+ source=draft.source,
166
+ type_=draft.type_,
167
+ start_original=draft.start_original,
168
+ end_original=draft.end_original,
169
+ start_c=draft.start_original - 1,
170
+ end_c=draft.end_original,
171
+ score=draft.score,
172
+ strand=draft.strand,
173
+ phase=draft.phase,
174
+ attributes=draft.attributes,
175
+ id=self._extract_id(draft),
176
+ parent=parent,
177
+ children=(),
178
+ )
179
+
180
+ def _extract_id(self, draft: FeatureDraft) -> str | None:
181
+ if (id_ := draft.attributes.get("ID")) is not None:
182
+ return id_
183
+ if draft.type_ == "gene" and (id_ := draft.attributes.get("gene_id")):
184
+ return id_
185
+ if draft.type_ == "transcript" and (
186
+ id_ := draft.attributes.get("transcript_id")
187
+ ):
188
+ return id_
189
+ return None
190
+
191
+ def _extract_parent_id(self, draft: FeatureDraft) -> str | None:
192
+ if (id_ := draft.attributes.get("Parent")) is not None:
193
+ return id_
194
+ if draft.type_ == "transcript" and (id_ := draft.attributes.get("gene_id")):
195
+ return id_
196
+ if draft.type_ == "exon" and (id_ := draft.attributes.get("transcript_id")):
197
+ return id_
198
+ return None
@@ -0,0 +1,155 @@
1
+ import sys
2
+ from pathlib import Path
3
+ from typing import Iterator, cast, TextIO
4
+
5
+ from biofiles.common import Strand, Writer
6
+ from biofiles.feature import FeatureReader, FeatureDraft, FeatureDrafts
7
+ from biofiles.types.feature import Feature, Gene, Exon
8
+
9
+ __all__ = ["GFFReader", "GFF3Writer"]
10
+
11
+
12
class GFFReader(FeatureReader):
    """Reader for GFF files; only major version 3 is supported.

    Iteration looks for the ``##gff-version`` pragma, then parses the
    nine-column feature lines that follow it.
    """

    def __iter__(self) -> Iterator[Feature]:
        """Yield all features of the file.

        Raises:
            ValueError: if the version is not 3 or 3.x, or if a feature line
                appears before the version pragma.
        """
        for line in self._input:
            line = line.rstrip("\r\n")
            if line.startswith(_VERSION_PREFIX):
                version = line.removeprefix(_VERSION_PREFIX).strip()
                # The pragma may carry a full "3.#.#" version, not just "3".
                if version.split(".", 1)[0] == "3":
                    yield from self._read_gff3()
                    return
                raise ValueError(f"unsupported version {version!r}")
            if not line.strip() or line.startswith("#"):
                continue
            raise ValueError(f"unexpected line {line!r}, expected version")

    def _read_gff3(self) -> Iterator[Feature]:
        """Parse the remaining lines as GFF3 records and yield the features.

        Comment lines, directives and blank lines are skipped. Reading stops
        at a ``##FASTA`` directive or at the first line starting with ``>``,
        because what follows is sequence data rather than features.

        Raises:
            ValueError: on a line without nine tab-separated columns, on a
                malformed column, or when ``Parent`` names an unseen ID.
        """
        drafts = FeatureDrafts()
        idx = 0
        for raw_line in self._input:
            line = raw_line.rstrip("\r\n")
            if line.startswith(("##FASTA", ">")):
                break
            if not line.strip() or line.startswith("#"):
                continue
            parts = line.split("\t", maxsplit=8)
            if len(parts) != 9:
                raise ValueError(f"unexpected line {line!r}, expected 9 columns")
            (
                sequence_id,
                source,
                type_,
                start_str,
                end_str,
                score_str,
                strand_str,
                phase_str,
                attributes_str,
            ) = parts
            score = self._parse_score(line, score_str)
            strand = self._parse_strand(line, strand_str)
            phase = self._parse_phase(line, phase_str)
            attributes = self._parse_attributes(line, attributes_str)

            # NOTE(review): a comma-separated multi-parent value is looked up
            # as a single ID here and therefore rejected — confirm whether
            # multiple parents need to be supported.
            parent_id = attributes.get("Parent", None)
            if parent_id is not None and parent_id not in drafts.by_id:
                raise ValueError(
                    f"unexpected line {line!r}, parent ID not among recent feature IDs"
                )

            draft = FeatureDraft(
                idx=idx,
                sequence_id=sequence_id,
                source=source,
                type_=type_,
                start_original=self._parse_coordinate(line, start_str, "start"),
                end_original=self._parse_coordinate(line, end_str, "end"),
                score=score,
                strand=strand,
                phase=phase,
                attributes=attributes,
            )
            drafts.add(draft)
            idx += 1

        # TODO: finalize incrementally using self._streaming_window once the
        # streaming version of _finalize_drafts exists.
        yield from self._finalize_drafts(drafts, None)

    def _parse_coordinate(self, line: str, value_str: str, name: str) -> int:
        """Parse the ``name`` ("start" or "end") column as an integer."""
        try:
            return int(value_str)
        except ValueError as exc:
            raise ValueError(
                f"unexpected line {line!r}, {name} should be an integer"
            ) from exc

    def _parse_score(self, line: str, score_str: str) -> float | None:
        """Parse the score column; ``'.'`` means no score."""
        if score_str == ".":
            return None
        try:
            return float(score_str)
        except ValueError as exc:
            raise ValueError(
                f"unexpected line {line!r}, score should be a number or '.'"
            ) from exc

    def _parse_strand(self, line: str, strand_str: str) -> Strand | None:
        """Parse the strand column; ``'.'`` means no strand."""
        if strand_str in ("-", "+"):
            return cast(Strand, strand_str)
        if strand_str == ".":
            return None
        raise ValueError(f"unexpected line {line!r}, strand should be '-', '+' or '.'")

    def _parse_phase(self, line: str, phase_str: str) -> int | None:
        """Parse the phase column; ``'.'`` means no phase."""
        if phase_str == ".":
            return None
        try:
            return int(phase_str)
        except ValueError as exc:
            raise ValueError(
                f"unexpected line {line!r}, phase should be an integer or '.'"
            ) from exc

    def _parse_attributes(self, line: str, attributes_str: str) -> dict[str, str]:
        """Parse the ``key=value;key=value`` attribute column into a dict.

        Empty parts (from ``;;`` or a trailing ``;``) and a bare ``'.'`` are
        ignored, so a column without attributes yields an empty dict.
        Whitespace around each part is dropped. Values are kept verbatim;
        percent-escapes are not decoded.

        Raises:
            ValueError: if a non-empty part contains no ``=``.
        """
        attributes: dict[str, str] = {}
        for part in attributes_str.split(";"):
            part = part.strip()
            if not part or part == ".":
                continue
            key, separator, value = part.partition("=")
            if not separator:
                raise ValueError(
                    f"unexpected line {line!r}, attributes should be 'key=value' pairs"
                )
            attributes[key] = value
        return attributes
113
+
114
+
115
class GFF3Writer(Writer):
    """Writer that emits features as tab-separated GFF3 lines."""

    def __init__(self, output: TextIO | Path | str) -> None:
        """Forward ``output`` to ``Writer`` and write the version pragma at once."""
        super().__init__(output)
        self._output.write(f"{_VERSION_PREFIX}3\n")

    def write(self, feature: Feature) -> None:
        """Write ``feature`` as one nine-column line.

        Missing score, strand and phase are written as ``'.'``, and so is an
        empty attribute dict, so the line never has an empty column there.
        Attribute values are written verbatim, without escaping.
        """
        attributes = ";".join(f"{k}={v}" for k, v in feature.attributes.items())
        fields = (
            feature.sequence_id,
            feature.source,
            feature.type_,
            # The start column is written as start_c + 1, the end column as
            # end_c unchanged.
            str(feature.start_c + 1),
            str(feature.end_c),
            str(feature.score) if feature.score is not None else ".",
            str(feature.strand) if feature.strand is not None else ".",
            str(feature.phase) if feature.phase is not None else ".",
            attributes or ".",
        )
        self._output.write("\t".join(fields))
        self._output.write("\n")
134
+
135
+
136
# Prefix of the pragma line that announces the GFF version of a file.
_VERSION_PREFIX = "##gff-version "


if __name__ == "__main__":
    # Ad-hoc check: for each file given on the command line, report how many
    # features annotated as gene/exon were actually parsed into Gene/Exon.
    for path in sys.argv[1:]:
        with GFFReader(path) as r:
            total_features = annotated_genes = annotated_exons = 0
            parsed_genes = parsed_exons = 0
            for feature in r:
                total_features += 1
                if feature.type_ == "gene":
                    annotated_genes += 1
                if feature.type_ == "exon":
                    annotated_exons += 1
                if isinstance(feature, Gene):
                    parsed_genes += 1
                if isinstance(feature, Exon):
                    parsed_exons += 1
            print(
                f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
            )
@@ -0,0 +1,38 @@
1
+ __all__ = ["GTFReader"]
2
+
3
+ import sys
4
+ from typing import Iterator
5
+
6
+ from biofiles.gff import GFFReader
7
+ from biofiles.types.feature import Gene, Exon, Feature
8
+
9
+
10
class GTFReader(GFFReader):
    """Reader for GTF files.

    Reuses the column parsing of ``GFFReader`` but does not look for a
    version pragma, and parses the attribute column as ``key "value";`` pairs.
    """

    def __iter__(self) -> Iterator[Feature]:
        """Yield all features, parsing from the first line on."""
        yield from self._read_gff3()

    def _parse_attributes(self, line: str, attributes_str: str) -> dict[str, str]:
        """Parse ``key "value"; key "value";`` into a dict.

        Parts are separated by ``;``; empty parts (for example whitespace
        after the final ``;``) are ignored. Key and value are separated by
        the first run of whitespace and surrounding double quotes are removed
        from the value. A repeated key keeps its last value.

        Raises:
            ValueError: if a non-empty part has a key but no value.
        """
        # NOTE(review): a ';' inside a quoted value would still split the
        # value in two — confirm whether such files need to be supported.
        attributes: dict[str, str] = {}
        for part in attributes_str.split(";"):
            pieces = part.strip().split(None, 1)
            if not pieces:
                continue
            if len(pieces) != 2:
                raise ValueError(
                    f"unexpected line {line!r}, attributes should be 'key \"value\"' pairs"
                )
            key, value = pieces
            attributes[key] = value.strip('"')
        return attributes
20
+
21
+
22
if __name__ == "__main__":
    # Ad-hoc check: for each file given on the command line, report how many
    # features annotated as gene/exon were actually parsed into Gene/Exon.
    for path in sys.argv[1:]:
        with GTFReader(path) as r:
            total_features = annotated_genes = annotated_exons = 0
            parsed_genes = parsed_exons = 0
            for feature in r:
                total_features += 1
                if feature.type_ == "gene":
                    annotated_genes += 1
                if feature.type_ == "exon":
                    annotated_exons += 1
                if isinstance(feature, Gene):
                    parsed_genes += 1
                if isinstance(feature, Exon):
                    parsed_exons += 1
            print(
                f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
            )
@@ -14,7 +14,7 @@ class Feature:
14
14
 
15
15
  start_original: int
16
16
  end_original: int
17
- # Original, 1-based inclusive values.
17
+ # Original values as they were present in the file (1-based inclusive for .gff and .gtf).
18
18
 
19
19
  start_c: int
20
20
  end_c: int
@@ -25,6 +25,7 @@ class Feature:
25
25
  phase: int | None
26
26
  attributes: dict[str, str]
27
27
 
28
+ id: str | None
28
29
  parent: "GFFFeature | None"
29
30
  children: tuple["Feature", ...]
30
31
 
@@ -46,6 +47,11 @@ class Exon(Feature):
46
47
 
47
48
 
48
49
  @dataclass(frozen=True)
49
- class ThreePrimeUTR(Feature):
50
+ class UTR(Feature):
50
51
  gene: Gene
51
52
  # TODO transcript
53
+
54
+
55
+ @dataclass(frozen=True)
56
+ class ThreePrimeUTR(UTR):
57
+ pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: biofiles
3
- Version: 0.0.5
3
+ Version: 0.0.7
4
4
  Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
5
  Author-email: Tigran Saluev <tigran@saluev.com>
6
6
  Maintainer-email: Tigran Saluev <tigran@saluev.com>
@@ -4,7 +4,9 @@ pyproject.toml
4
4
  biofiles/__init__.py
5
5
  biofiles/common.py
6
6
  biofiles/fasta.py
7
+ biofiles/feature.py
7
8
  biofiles/gff.py
9
+ biofiles/gtf.py
8
10
  biofiles/repeatmasker.py
9
11
  biofiles.egg-info/PKG-INFO
10
12
  biofiles.egg-info/SOURCES.txt
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "biofiles"
7
- version = "0.0.5"
7
+ version = "0.0.7"
8
8
  authors = [
9
9
  { name="Tigran Saluev", email="tigran@saluev.com" },
10
10
  ]
@@ -1,319 +0,0 @@
1
- import sys
2
- from collections import deque
3
- from dataclasses import dataclass, field
4
- from pathlib import Path
5
- from typing import Iterator, cast, TextIO
6
-
7
- from biofiles.common import Strand, Reader, Writer
8
- from biofiles.types.feature import Feature, Gene, Exon, ThreePrimeUTR
9
-
10
- __all__ = ["GFFReader", "GFF3Writer"]
11
-
12
-
13
- @dataclass
14
- class _FeatureDraft:
15
- idx: int
16
- sequence_id: str
17
- source: str
18
- type_: str
19
- start_original: int
20
- end_original: int
21
- score: float | None
22
- strand: Strand | None
23
- phase: int | None
24
- attributes: dict[str, str]
25
-
26
- def pick_attribute(self, *keys: str) -> str | None:
27
- for key in keys:
28
- if (value := self.attributes.get(key, None)) is not None:
29
- return value
30
- return None
31
-
32
-
33
- @dataclass
34
- class _FeatureDrafts:
35
- drafts: deque[_FeatureDraft] = field(default_factory=deque)
36
- by_id: dict[str, _FeatureDraft] = field(default_factory=dict)
37
- # deps: dict[int, deque[int]] = field(default_factory=lambda: defaultdict(deque))
38
-
39
- def add(self, draft: _FeatureDraft) -> None:
40
- self.drafts.append(draft)
41
- if id_ := draft.attributes.get("ID", None):
42
- self.by_id[id_] = draft
43
- # if parent_id := draft.attributes.get("Parent", None):
44
- # parent = self.by_id[parent_id]
45
- # self.deps[parent.idx].append(draft.idx)
46
-
47
- # def remove_first_n(self, n: int) -> None:
48
- # for _ in range(n):
49
- # draft = self.drafts.popleft()
50
- # if id_ := draft.attributes.get("ID", None):
51
- # del self.by_id[id_]
52
- # self.deps.pop(draft.idx, None)
53
-
54
-
55
- @dataclass
56
- class _Features:
57
- features: list[Feature] = field(default_factory=list)
58
- by_id: dict[str, Feature] = field(default_factory=dict)
59
-
60
- def add(self, feature: Feature):
61
- self.features.append(feature)
62
- if id_ := feature.attributes.get("ID", None):
63
- self.by_id[id_] = feature
64
-
65
-
66
- class GFFReader(Reader):
67
- def __init__(
68
- self, input_: TextIO | Path | str, /, streaming_window: int | None = 1000
69
- ):
70
- super().__init__(input_)
71
- self._streaming_window = streaming_window
72
-
73
- def __iter__(self) -> Iterator[Feature]:
74
- for line in self._input:
75
- line = line.rstrip("\n")
76
- if line.startswith(_VERSION_PREFIX):
77
- version = line.removeprefix(_VERSION_PREFIX)
78
- if version == "3":
79
- yield from self._read_gff3()
80
- return
81
- raise ValueError(f"unsupported version {version!r}")
82
- if line.startswith("#"):
83
- continue
84
- raise ValueError(f"unexpected line {line!r}, expected version")
85
-
86
- def _read_gff3(self) -> Iterator[Feature]:
87
- drafts = _FeatureDrafts()
88
- idx = 0
89
- for line in self._input:
90
- if line.startswith("#"):
91
- continue
92
- line = line.rstrip("\n")
93
- parts = line.split("\t", maxsplit=8)
94
- if len(parts) != 9:
95
- raise ValueError(f"unexpected line {line!r}, expected 9 columns")
96
- (
97
- sequence_id,
98
- source,
99
- type_,
100
- start_str,
101
- end_str,
102
- score_str,
103
- strand_str,
104
- phase_str,
105
- attributes_str,
106
- ) = parts
107
- score = self._parse_score(line, score_str)
108
- strand = self._parse_strand(line, strand_str)
109
- phase = self._parse_phase(line, phase_str)
110
- attributes = self._parse_attributes(line, attributes_str)
111
-
112
- parent_id = attributes.get("Parent", None)
113
- # if parent_id is None:
114
- # yield from self._finalize_drafts(drafts)
115
- # drafts = _FeatureDrafts()
116
- if parent_id is not None and parent_id not in drafts.by_id:
117
- raise ValueError(
118
- f"unexpected line {line!r}, parent ID not among recent feature IDs"
119
- )
120
-
121
- draft = _FeatureDraft(
122
- idx=idx,
123
- sequence_id=sequence_id,
124
- source=source,
125
- type_=type_,
126
- start_original=int(start_str),
127
- end_original=int(end_str),
128
- score=score,
129
- strand=strand,
130
- phase=phase,
131
- attributes=attributes,
132
- )
133
- drafts.add(draft)
134
- idx += 1
135
-
136
- # yield from self._finalize_drafts(drafts, self._streaming_window)
137
-
138
- yield from self._finalize_drafts(drafts, None)
139
-
140
- def _finalize_drafts(
141
- self, drafts: _FeatureDrafts, w: int | None
142
- ) -> Iterator[Feature]:
143
- # TODO streaming version!
144
- # code below is already tracking
145
- # if not drafts.drafts:
146
- # return
147
- # if w is not None and len(drafts.drafts) <= w:
148
- # return
149
- #
150
- # end_idx = drafts.drafts[-w].idx if w is not None else drafts.drafts[-1].idx
151
- #
152
- # i = 0
153
- # while i < len(drafts.drafts) and (
154
- # not drafts.deps[drafts.drafts[i].idx]
155
- # or drafts.deps[drafts.drafts[i].idx][-1] <= end_idx
156
- # ):
157
- # i += 1
158
- #
159
- # print(f"FINALIZING {i} DRAFTS OUT OF {len(drafts.drafts)}")
160
- #
161
- # result = _Features()
162
- # for j in range(i):
163
- # draft = drafts.drafts[j]
164
- # feature = self._finalize_draft(draft, result)
165
- # result.add(feature)
166
- # drafts.remove_first_n(i)
167
- # yield from result.features
168
-
169
- result = _Features()
170
- for draft in drafts.drafts:
171
- feature = self._finalize_draft(draft, result)
172
- result.add(feature)
173
- yield from result.features
174
-
175
- def _finalize_draft(self, draft: _FeatureDraft, result: _Features) -> Feature:
176
- match draft.type_.lower():
177
- case "gene":
178
- feature = self._finalize_gene(draft, result)
179
- case "exon":
180
- feature = self._finalize_exon(draft, result)
181
- case "three_prime_utr":
182
- feature = self._finalize_three_prime_utr(draft, result)
183
- case _:
184
- feature = self._finalize_other(draft, result)
185
- if feature.parent:
186
- new_children = feature.parent.children + (feature,)
187
- object.__setattr__(feature.parent, "children", new_children)
188
- return feature
189
-
190
- def _finalize_gene(self, draft: _FeatureDraft, result: _Features) -> Feature:
191
- feature = self._finalize_other(draft, result)
192
- name = draft.pick_attribute("gene_name", "Name")
193
- biotype = draft.pick_attribute("gene_biotype", "biotype")
194
- if name is None or biotype is None:
195
- return feature
196
- return Gene(**feature.__dict__, name=name, biotype=biotype, exons=())
197
-
198
- def _finalize_exon(self, draft: _FeatureDraft, result: _Features) -> Feature:
199
- feature = self._finalize_other(draft, result)
200
-
201
- gene = feature.parent
202
- while gene and not isinstance(gene, Gene):
203
- gene = gene.parent
204
-
205
- if gene is None:
206
- return feature
207
- exon = Exon(**feature.__dict__, gene=gene)
208
- object.__setattr__(gene, "exons", gene.exons + (exon,))
209
- return exon
210
-
211
- def _finalize_three_prime_utr(
212
- self, draft: _FeatureDraft, result: _Features
213
- ) -> Feature:
214
- feature = self._finalize_other(draft, result)
215
-
216
- gene = feature.parent
217
- while gene and not isinstance(gene, Gene):
218
- gene = gene.parent
219
-
220
- if gene is None:
221
- return feature
222
- return ThreePrimeUTR(**feature.__dict__, gene=gene)
223
-
224
- def _finalize_other(self, draft: _FeatureDraft, result: _Features) -> Feature:
225
- parent_id = draft.attributes.get("Parent", None)
226
- parent = result.by_id[parent_id] if parent_id is not None else None
227
-
228
- return Feature(
229
- sequence_id=draft.sequence_id,
230
- source=draft.source,
231
- type_=draft.type_,
232
- start_original=draft.start_original,
233
- end_original=draft.end_original,
234
- start_c=draft.start_original - 1,
235
- end_c=draft.end_original,
236
- score=draft.score,
237
- strand=draft.strand,
238
- phase=draft.phase,
239
- attributes=draft.attributes,
240
- parent=parent,
241
- children=(),
242
- )
243
-
244
- def _parse_score(self, line: str, score_str: str) -> float | None:
245
- if score_str == ".":
246
- return None
247
- try:
248
- return float(score_str)
249
- except ValueError as exc:
250
- raise ValueError(
251
- f"unexpected line {line!r}, score should be a number or '.'"
252
- ) from exc
253
-
254
- def _parse_strand(self, line: str, strand_str: str) -> Strand | None:
255
- if strand_str in ("-", "+"):
256
- return cast(Strand, strand_str)
257
- if strand_str == ".":
258
- return None
259
- raise ValueError(f"unexpected line {line!r}, strand should be '-', '+' or '.'")
260
-
261
- def _parse_phase(self, line: str, phase_str: str) -> int | None:
262
- if phase_str == ".":
263
- return None
264
- try:
265
- return int(phase_str)
266
- except ValueError as exc:
267
- raise ValueError(
268
- f"unexpected line {line!r}, phase should be an integer or '.'"
269
- ) from exc
270
-
271
- def _parse_attributes(self, line: str, attributes_str: str) -> dict[str, str]:
272
- return {
273
- k: v
274
- for part in attributes_str.strip(";").split(";")
275
- for k, v in (part.split("=", 1),)
276
- }
277
-
278
-
279
- class GFF3Writer(Writer):
280
- def __init__(self, output: TextIO | Path | str) -> None:
281
- super().__init__(output)
282
- self._output.write(f"{_VERSION_PREFIX}3\n")
283
-
284
- def write(self, feature: Feature) -> None:
285
- fields = (
286
- feature.sequence_id,
287
- feature.source,
288
- feature.type_,
289
- str(feature.start_original),
290
- str(feature.end_original),
291
- str(feature.score) if feature.score is not None else ".",
292
- str(feature.strand) if feature.strand is not None else ".",
293
- str(feature.phase) if feature.phase is not None else ".",
294
- ";".join(f"{k}={v}" for k, v in feature.attributes.items()),
295
- )
296
- self._output.write("\t".join(fields))
297
- self._output.write("\n")
298
-
299
-
300
- _VERSION_PREFIX = "##gff-version "
301
-
302
-
303
- if __name__ == "__main__":
304
- for path in sys.argv[1:]:
305
- with GFFReader(path) as r:
306
- total_features = 0
307
- annotated_genes = 0
308
- annotated_exons = 0
309
- parsed_genes = 0
310
- parsed_exons = 0
311
- for feature in r:
312
- total_features += 1
313
- annotated_genes += feature.type_ == "gene"
314
- annotated_exons += feature.type_ == "exon"
315
- parsed_genes += isinstance(feature, Gene)
316
- parsed_exons += isinstance(feature, Exon)
317
- print(
318
- f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
319
- )
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes