biofiles 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biofiles/feature.py ADDED
@@ -0,0 +1,193 @@
1
+ from collections import deque
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from typing import Iterator, TextIO
5
+
6
+ from biofiles.common import Reader, Strand
7
+ from biofiles.types.feature import Feature, Gene, ThreePrimeUTR, Exon
8
+
9
+
10
@dataclass
class FeatureDraft:
    """One parsed feature record, before parent/child links are resolved.

    A draft carries the column values of a single feature line plus its
    attribute mapping. ``FeatureReader`` later turns drafts into ``Feature``
    objects (or one of its subclasses).
    """

    # Position of the record in the input, as assigned by the reader.
    idx: int
    sequence_id: str
    source: str
    type_: str
    # Coordinates exactly as they were given in the input.
    start_original: int
    end_original: int
    score: float | None
    strand: Strand | None
    phase: int | None
    # Attribute key -> value mapping for the record.
    attributes: dict[str, str]

    def pick_attribute(self, *keys: str) -> str | None:
        """Return the value of the first of ``keys`` present in ``attributes``.

        Keys are tried in the order given; a key whose stored value is
        ``None`` counts as absent. Returns ``None`` when no key matches
        (including when no keys are passed).
        """
        matches = (
            found
            for candidate in keys
            if (found := self.attributes.get(candidate)) is not None
        )
        return next(matches, None)
28
+
29
+
30
@dataclass
class FeatureDrafts:
    """Ordered collection of drafts with a lookup table keyed by ``ID``.

    ``drafts`` keeps every draft in insertion order; ``by_id`` only contains
    the drafts whose ``ID`` attribute is present and non-empty.
    """

    drafts: deque[FeatureDraft] = field(default_factory=deque)
    by_id: dict[str, FeatureDraft] = field(default_factory=dict)
    # Disabled dependency tracking, kept for the planned streaming mode:
    # deps: dict[int, deque[int]] = field(default_factory=lambda: defaultdict(deque))

    def add(self, draft: FeatureDraft) -> None:
        """Append ``draft`` and index it by its ``ID`` attribute, if it has one."""
        self.drafts.append(draft)
        draft_id = draft.attributes.get("ID")
        if draft_id:
            # A later draft with the same ID replaces the earlier index entry.
            self.by_id[draft_id] = draft
        # if parent_id := draft.attributes.get("Parent", None):
        #     parent = self.by_id[parent_id]
        #     self.deps[parent.idx].append(draft.idx)

    # def remove_first_n(self, n: int) -> None:
    #     for _ in range(n):
    #         draft = self.drafts.popleft()
    #         if id_ := draft.attributes.get("ID", None):
    #             del self.by_id[id_]
    #         self.deps.pop(draft.idx, None)
50
+
51
+
52
@dataclass
class Features:
    """Finalized features in insertion order, plus an index by feature id.

    Only features with a truthy ``id`` are placed in ``by_id``; every feature
    is appended to ``features``.
    """

    features: list[Feature] = field(default_factory=list)
    by_id: dict[str, Feature] = field(default_factory=dict)

    def add(self, feature: Feature) -> None:
        """Record ``feature`` and make it reachable through its ``id``."""
        self.features.append(feature)
        feature_id = feature.id
        if not feature_id:
            return
        # A later feature with the same id replaces the earlier index entry.
        self.by_id[feature_id] = feature
61
+
62
+
63
+ class FeatureReader(Reader):
64
+ def __init__(
65
+ self, input_: TextIO | Path | str, /, streaming_window: int | None = 1000
66
+ ):
67
+ super().__init__(input_)
68
+ self._streaming_window = streaming_window
69
+
70
+ def _finalize_drafts(
71
+ self, drafts: FeatureDrafts, w: int | None
72
+ ) -> Iterator[Feature]:
73
+ # TODO streaming version!
74
+ # code below is already tracking
75
+ # if not drafts.drafts:
76
+ # return
77
+ # if w is not None and len(drafts.drafts) <= w:
78
+ # return
79
+ #
80
+ # end_idx = drafts.drafts[-w].idx if w is not None else drafts.drafts[-1].idx
81
+ #
82
+ # i = 0
83
+ # while i < len(drafts.drafts) and (
84
+ # not drafts.deps[drafts.drafts[i].idx]
85
+ # or drafts.deps[drafts.drafts[i].idx][-1] <= end_idx
86
+ # ):
87
+ # i += 1
88
+ #
89
+ # print(f"FINALIZING {i} DRAFTS OUT OF {len(drafts.drafts)}")
90
+ #
91
+ # result = _Features()
92
+ # for j in range(i):
93
+ # draft = drafts.drafts[j]
94
+ # feature = self._finalize_draft(draft, result)
95
+ # result.add(feature)
96
+ # drafts.remove_first_n(i)
97
+ # yield from result.features
98
+
99
+ result = Features()
100
+ for draft in drafts.drafts:
101
+ feature = self._finalize_draft(draft, result)
102
+ result.add(feature)
103
+ yield from result.features
104
+
105
+ def _finalize_draft(self, draft: FeatureDraft, result: Features) -> Feature:
106
+ match draft.type_.lower():
107
+ case "gene":
108
+ feature = self._finalize_gene(draft, result)
109
+ case "exon":
110
+ feature = self._finalize_exon(draft, result)
111
+ case "three_prime_utr":
112
+ feature = self._finalize_three_prime_utr(draft, result)
113
+ case _:
114
+ feature = self._finalize_other(draft, result)
115
+ if feature.parent:
116
+ new_children = feature.parent.children + (feature,)
117
+ object.__setattr__(feature.parent, "children", new_children)
118
+ return feature
119
+
120
+ def _finalize_gene(self, draft: FeatureDraft, result: Features) -> Feature:
121
+ feature = self._finalize_other(draft, result)
122
+ name = draft.pick_attribute("gene_name", "Name")
123
+ biotype = draft.pick_attribute("gene_biotype", "biotype", "gene_type")
124
+ if name is None or biotype is None:
125
+ return feature
126
+ return Gene(**feature.__dict__, name=name, biotype=biotype, exons=())
127
+
128
+ def _finalize_exon(self, draft: FeatureDraft, result: Features) -> Feature:
129
+ feature = self._finalize_other(draft, result)
130
+
131
+ gene = feature.parent
132
+ while gene and not isinstance(gene, Gene):
133
+ gene = gene.parent
134
+
135
+ if gene is None:
136
+ return feature
137
+ exon = Exon(**feature.__dict__, gene=gene)
138
+ object.__setattr__(gene, "exons", gene.exons + (exon,))
139
+ return exon
140
+
141
+ def _finalize_three_prime_utr(
142
+ self, draft: FeatureDraft, result: Features
143
+ ) -> Feature:
144
+ feature = self._finalize_other(draft, result)
145
+
146
+ gene = feature.parent
147
+ while gene and not isinstance(gene, Gene):
148
+ gene = gene.parent
149
+
150
+ if gene is None:
151
+ return feature
152
+ return ThreePrimeUTR(**feature.__dict__, gene=gene)
153
+
154
+ def _finalize_other(self, draft: FeatureDraft, result: Features) -> Feature:
155
+ parent_id = self._extract_parent_id(draft)
156
+ parent = result.by_id[parent_id] if parent_id is not None else None
157
+
158
+ return Feature(
159
+ sequence_id=draft.sequence_id,
160
+ source=draft.source,
161
+ type_=draft.type_,
162
+ start_original=draft.start_original,
163
+ end_original=draft.end_original,
164
+ start_c=draft.start_original - 1,
165
+ end_c=draft.end_original,
166
+ score=draft.score,
167
+ strand=draft.strand,
168
+ phase=draft.phase,
169
+ attributes=draft.attributes,
170
+ id=self._extract_id(draft),
171
+ parent=parent,
172
+ children=(),
173
+ )
174
+
175
+ def _extract_id(self, draft: FeatureDraft) -> str | None:
176
+ if (id_ := draft.attributes.get("ID")) is not None:
177
+ return id_
178
+ if draft.type_ == "gene" and (id_ := draft.attributes.get("gene_id")):
179
+ return id_
180
+ if draft.type_ == "transcript" and (
181
+ id_ := draft.attributes.get("transcript_id")
182
+ ):
183
+ return id_
184
+ return None
185
+
186
+ def _extract_parent_id(self, draft: FeatureDraft) -> str | None:
187
+ if (id_ := draft.attributes.get("Parent")) is not None:
188
+ return id_
189
+ if draft.type_ == "transcript" and (id_ := draft.attributes.get("gene_id")):
190
+ return id_
191
+ if draft.type_ == "exon" and (id_ := draft.attributes.get("transcript_id")):
192
+ return id_
193
+ return None
biofiles/gff.py CHANGED
@@ -1,75 +1,15 @@
1
1
  import sys
2
- from collections import deque
3
- from dataclasses import dataclass, field
4
2
  from pathlib import Path
5
3
  from typing import Iterator, cast, TextIO
6
4
 
7
- from biofiles.common import Strand, Reader, Writer
5
+ from biofiles.common import Strand, Writer
6
+ from biofiles.feature import FeatureReader, FeatureDraft, FeatureDrafts
8
7
  from biofiles.types.feature import Feature, Gene, Exon
9
8
 
10
9
  __all__ = ["GFFReader", "GFF3Writer"]
11
10
 
12
11
 
13
- @dataclass
14
- class _FeatureDraft:
15
- idx: int
16
- sequence_id: str
17
- source: str
18
- type_: str
19
- start_original: int
20
- end_original: int
21
- score: float | None
22
- strand: Strand | None
23
- phase: int | None
24
- attributes: dict[str, str]
25
-
26
- def pick_attribute(self, *keys: str) -> str | None:
27
- for key in keys:
28
- if (value := self.attributes.get(key, None)) is not None:
29
- return value
30
- return None
31
-
32
-
33
- @dataclass
34
- class _FeatureDrafts:
35
- drafts: deque[_FeatureDraft] = field(default_factory=deque)
36
- by_id: dict[str, _FeatureDraft] = field(default_factory=dict)
37
- # deps: dict[int, deque[int]] = field(default_factory=lambda: defaultdict(deque))
38
-
39
- def add(self, draft: _FeatureDraft) -> None:
40
- self.drafts.append(draft)
41
- if id_ := draft.attributes.get("ID", None):
42
- self.by_id[id_] = draft
43
- # if parent_id := draft.attributes.get("Parent", None):
44
- # parent = self.by_id[parent_id]
45
- # self.deps[parent.idx].append(draft.idx)
46
-
47
- # def remove_first_n(self, n: int) -> None:
48
- # for _ in range(n):
49
- # draft = self.drafts.popleft()
50
- # if id_ := draft.attributes.get("ID", None):
51
- # del self.by_id[id_]
52
- # self.deps.pop(draft.idx, None)
53
-
54
-
55
- @dataclass
56
- class _Features:
57
- features: list[Feature] = field(default_factory=list)
58
- by_id: dict[str, Feature] = field(default_factory=dict)
59
-
60
- def add(self, feature: Feature):
61
- self.features.append(feature)
62
- if id_ := feature.attributes.get("ID", None):
63
- self.by_id[id_] = feature
64
-
65
-
66
- class GFFReader(Reader):
67
- def __init__(
68
- self, input_: TextIO | Path | str, /, streaming_window: int | None = 1000
69
- ):
70
- super().__init__(input_)
71
- self._streaming_window = streaming_window
72
-
12
+ class GFFReader(FeatureReader):
73
13
  def __iter__(self) -> Iterator[Feature]:
74
14
  for line in self._input:
75
15
  line = line.rstrip("\n")
@@ -84,7 +24,7 @@ class GFFReader(Reader):
84
24
  raise ValueError(f"unexpected line {line!r}, expected version")
85
25
 
86
26
  def _read_gff3(self) -> Iterator[Feature]:
87
- drafts = _FeatureDrafts()
27
+ drafts = FeatureDrafts()
88
28
  idx = 0
89
29
  for line in self._input:
90
30
  if line.startswith("#"):
@@ -118,7 +58,7 @@ class GFFReader(Reader):
118
58
  f"unexpected line {line!r}, parent ID not among recent feature IDs"
119
59
  )
120
60
 
121
- draft = _FeatureDraft(
61
+ draft = FeatureDraft(
122
62
  idx=idx,
123
63
  sequence_id=sequence_id,
124
64
  source=source,
@@ -137,95 +77,6 @@ class GFFReader(Reader):
137
77
 
138
78
  yield from self._finalize_drafts(drafts, None)
139
79
 
140
- def _finalize_drafts(
141
- self, drafts: _FeatureDrafts, w: int | None
142
- ) -> Iterator[Feature]:
143
- # TODO streaming version!
144
- # code below is already tracking
145
- # if not drafts.drafts:
146
- # return
147
- # if w is not None and len(drafts.drafts) <= w:
148
- # return
149
- #
150
- # end_idx = drafts.drafts[-w].idx if w is not None else drafts.drafts[-1].idx
151
- #
152
- # i = 0
153
- # while i < len(drafts.drafts) and (
154
- # not drafts.deps[drafts.drafts[i].idx]
155
- # or drafts.deps[drafts.drafts[i].idx][-1] <= end_idx
156
- # ):
157
- # i += 1
158
- #
159
- # print(f"FINALIZING {i} DRAFTS OUT OF {len(drafts.drafts)}")
160
- #
161
- # result = _Features()
162
- # for j in range(i):
163
- # draft = drafts.drafts[j]
164
- # feature = self._finalize_draft(draft, result)
165
- # result.add(feature)
166
- # drafts.remove_first_n(i)
167
- # yield from result.features
168
-
169
- result = _Features()
170
- for draft in drafts.drafts:
171
- feature = self._finalize_draft(draft, result)
172
- result.add(feature)
173
- yield from result.features
174
-
175
- def _finalize_draft(self, draft: _FeatureDraft, result: _Features) -> Feature:
176
- match draft.type_:
177
- case "gene":
178
- feature = self._finalize_gene(draft, result)
179
- case "exon":
180
- feature = self._finalize_exon(draft, result)
181
- case _:
182
- feature = self._finalize_other(draft, result)
183
- if feature.parent:
184
- new_children = feature.parent.children + (feature,)
185
- object.__setattr__(feature.parent, "children", new_children)
186
- return feature
187
-
188
- def _finalize_gene(self, draft: _FeatureDraft, result: _Features) -> Feature:
189
- feature = self._finalize_other(draft, result)
190
- name = draft.pick_attribute("gene_name", "Name")
191
- biotype = draft.pick_attribute("gene_biotype", "biotype")
192
- if name is None or biotype is None:
193
- return feature
194
- return Gene(**feature.__dict__, name=name, biotype=biotype, exons=())
195
-
196
- def _finalize_exon(self, draft: _FeatureDraft, result: _Features) -> Feature:
197
- feature = self._finalize_other(draft, result)
198
-
199
- gene = feature.parent
200
- while gene and not isinstance(gene, Gene):
201
- gene = gene.parent
202
-
203
- if gene is None:
204
- return feature
205
- exon = Exon(**feature.__dict__, gene=gene)
206
- object.__setattr__(gene, "exons", gene.exons + (exon,))
207
- return exon
208
-
209
- def _finalize_other(self, draft: _FeatureDraft, result: _Features) -> Feature:
210
- parent_id = draft.attributes.get("Parent", None)
211
- parent = result.by_id[parent_id] if parent_id is not None else None
212
-
213
- return Feature(
214
- sequence_id=draft.sequence_id,
215
- source=draft.source,
216
- type_=draft.type_,
217
- start_original=draft.start_original,
218
- end_original=draft.end_original,
219
- start_c=draft.start_original - 1,
220
- end_c=draft.end_original,
221
- score=draft.score,
222
- strand=draft.strand,
223
- phase=draft.phase,
224
- attributes=draft.attributes,
225
- parent=parent,
226
- children=(),
227
- )
228
-
229
80
  def _parse_score(self, line: str, score_str: str) -> float | None:
230
81
  if score_str == ".":
231
82
  return None
biofiles/gtf.py ADDED
@@ -0,0 +1,38 @@
1
+ __all__ = ["GTFReader"]
2
+
3
+ import sys
4
+ from typing import Iterator
5
+
6
+ from biofiles.gff import GFFReader
7
+ from biofiles.types.feature import Gene, Exon, Feature
8
+
9
+
10
class GTFReader(GFFReader):
    """Reader for GTF files, reusing the record parsing of ``GFFReader``.

    Only the attribute column is parsed differently: GTF uses
    ``key "value";`` pairs.
    """

    def __iter__(self) -> Iterator[Feature]:
        """Yield features, going straight to the record parser.

        Unlike ``GFFReader.__iter__``, no leading version line is expected.
        """
        yield from self._read_gff3()

    def _parse_attributes(self, line: str, attributes_str: str) -> dict[str, str]:
        """Parse a GTF attribute column into a ``key -> value`` mapping.

        Surrounding double quotes are stripped from values. Empty segments
        (for example after a trailing ``;`` followed by whitespace, or an
        empty column) are skipped, and ``;`` inside a quoted value does not
        end the attribute. When a key occurs more than once, the last value
        wins.

        Args:
            line: The full input line, used for error messages only.
            attributes_str: The raw attribute column.

        Raises:
            ValueError: If an attribute consists of a key without a value.
        """
        attributes: dict[str, str] = {}
        for segment in self._split_attributes(attributes_str):
            segment = segment.strip()
            if not segment:
                continue
            pieces = segment.split(None, 1)
            if len(pieces) != 2:
                raise ValueError(
                    f"unexpected line {line!r}, attribute {segment!r} has no value"
                )
            key, value = pieces
            attributes[key] = value.strip('"')
        return attributes

    @staticmethod
    def _split_attributes(attributes_str: str) -> Iterator[str]:
        """Split ``attributes_str`` on ``;`` characters outside double quotes."""
        segment_start = 0
        inside_quotes = False
        for position, char in enumerate(attributes_str):
            if char == '"':
                inside_quotes = not inside_quotes
            elif char == ";" and not inside_quotes:
                yield attributes_str[segment_start:position]
                segment_start = position + 1
        yield attributes_str[segment_start:]
20
+
21
+
22
if __name__ == "__main__":
    # Command-line self-check: for every path given as an argument, read the
    # file and report how many gene/exon records were upgraded to the
    # ``Gene``/``Exon`` classes versus how many carried that feature type.
    for path in sys.argv[1:]:
        with GTFReader(path) as r:
            total_features = annotated_genes = annotated_exons = 0
            parsed_genes = parsed_exons = 0
            for feature in r:
                total_features += 1
                if feature.type_ == "gene":
                    annotated_genes += 1
                if feature.type_ == "exon":
                    annotated_exons += 1
                if isinstance(feature, Gene):
                    parsed_genes += 1
                if isinstance(feature, Exon):
                    parsed_exons += 1
            # One summary line per input file.
            print(
                f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
            )
biofiles/types/feature.py CHANGED
@@ -25,6 +25,7 @@ class Feature:
25
25
  phase: int | None
26
26
  attributes: dict[str, str]
27
27
 
28
+ id: str | None
28
29
  parent: "GFFFeature | None"
29
30
  children: tuple["Feature", ...]
30
31
 
@@ -43,3 +44,9 @@ class Gene(Feature):
43
44
  class Exon(Feature):
44
45
  gene: Gene
45
46
  # TODO transcript, mRNA
47
+
48
+
49
@dataclass(frozen=True)
class ThreePrimeUTR(Feature):
    """A ``three_prime_utr`` feature together with the gene it belongs to."""

    # The ``Gene`` this UTR is attached to.
    gene: Gene
    # TODO transcript
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: biofiles
3
- Version: 0.0.4
3
+ Version: 0.0.6
4
4
  Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
5
  Author-email: Tigran Saluev <tigran@saluev.com>
6
6
  Maintainer-email: Tigran Saluev <tigran@saluev.com>
@@ -1,14 +1,16 @@
1
1
  biofiles/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  biofiles/common.py,sha256=Yi0i85FpD2wR3vqL645LTUAE6TybGDxxZQsUmEGHqu4,1126
3
3
  biofiles/fasta.py,sha256=ctIt5I_fcZx-xQN921zpmlZS7e9_ICf-3_i6mTs5qbs,2135
4
- biofiles/gff.py,sha256=Bag0Z1xNR8l4IDEGct6OU2gCjmRldKRrhKcw8gNjbMY,10629
4
+ biofiles/feature.py,sha256=4eRXmusxTbrDqkwPkMKaa1nYRSSYvt3Y3_-7jC7rkII,6693
5
+ biofiles/gff.py,sha256=TwBasmakNmeS5yikP-E5iMEvB6BXgczGYQKiGa2_LAw,5516
6
+ biofiles/gtf.py,sha256=hWfjQjzwsrXLjCGr9ia6GdHNdYtlwkBrG1ldJYhRD-4,1251
5
7
  biofiles/repeatmasker.py,sha256=DqD1z1hUfCP4-qnfjF-oMF-ZpW_6XhOf_nzA8VHhQbw,3079
6
8
  biofiles/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- biofiles/types/feature.py,sha256=c9m_DU_EaUIWG6-xsiPxQIiGkfDs96nm5ls2T5SMeuA,826
9
+ biofiles/types/feature.py,sha256=PyxIkbpWLcboY8cGEcJBn6sWv5wIFgOyD9Ey5D2G3JQ,938
8
10
  biofiles/types/repeat.py,sha256=63SqzAwEGIDIGP9pxC85RUdwXbbSm0S5WNL3lSiWlmc,641
9
11
  biofiles/types/sequence.py,sha256=EOw_oKuMR0THpCYJqVE__27z7qrRqcdIPrRWTL4OFMw,152
10
- biofiles-0.0.4.dist-info/LICENSE,sha256=CbR8ssdFyViKj25JAlMjIt1_FbiZ1tAC5t-uwUbxqak,1070
11
- biofiles-0.0.4.dist-info/METADATA,sha256=X4kN2G9g266dSlVtZ7RPuh2PUXtdu_0W41rPL_-hPzE,3033
12
- biofiles-0.0.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
13
- biofiles-0.0.4.dist-info/top_level.txt,sha256=laFaFv8hpkI4U-Pgs0yBaAJXN2_CJKl7jb-m3-tGfSc,9
14
- biofiles-0.0.4.dist-info/RECORD,,
12
+ biofiles-0.0.6.dist-info/LICENSE,sha256=CbR8ssdFyViKj25JAlMjIt1_FbiZ1tAC5t-uwUbxqak,1070
13
+ biofiles-0.0.6.dist-info/METADATA,sha256=vbIWtDF_yHm-qV8EKlCiKQzjajnoIZdV0RWZJoS0zh0,3033
14
+ biofiles-0.0.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
15
+ biofiles-0.0.6.dist-info/top_level.txt,sha256=laFaFv8hpkI4U-Pgs0yBaAJXN2_CJKl7jb-m3-tGfSc,9
16
+ biofiles-0.0.6.dist-info/RECORD,,