biofiles 0.0.6__tar.gz → 0.0.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: biofiles
3
- Version: 0.0.6
3
+ Version: 0.0.8
4
4
  Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
5
  Author-email: Tigran Saluev <tigran@saluev.com>
6
6
  Maintainer-email: Tigran Saluev <tigran@saluev.com>
@@ -1,10 +1,10 @@
1
1
  from collections import deque
2
2
  from dataclasses import dataclass, field
3
3
  from pathlib import Path
4
- from typing import Iterator, TextIO
4
+ from typing import Iterator, TextIO, Type, TypeVar
5
5
 
6
6
  from biofiles.common import Reader, Strand
7
- from biofiles.types.feature import Feature, Gene, ThreePrimeUTR, Exon
7
+ from biofiles.types.feature import Feature, Gene, ThreePrimeUTR, Exon, UTR, Transcript
8
8
 
9
9
 
10
10
  @dataclass
@@ -67,6 +67,9 @@ class FeatureReader(Reader):
67
67
  super().__init__(input_)
68
68
  self._streaming_window = streaming_window
69
69
 
70
+ def __iter__(self) -> Iterator[Feature]:
71
+ raise NotImplementedError
72
+
70
73
  def _finalize_drafts(
71
74
  self, drafts: FeatureDrafts, w: int | None
72
75
  ) -> Iterator[Feature]:
@@ -106,10 +109,14 @@ class FeatureReader(Reader):
106
109
  match draft.type_.lower():
107
110
  case "gene":
108
111
  feature = self._finalize_gene(draft, result)
112
+ case "transcript":
113
+ feature = self._finalize_transcript(draft, result)
109
114
  case "exon":
110
115
  feature = self._finalize_exon(draft, result)
111
116
  case "three_prime_utr":
112
- feature = self._finalize_three_prime_utr(draft, result)
117
+ feature = self._finalize_utr(draft, result, ThreePrimeUTR)
118
+ case "utr":
119
+ feature = self._finalize_utr(draft, result, UTR)
113
120
  case _:
114
121
  feature = self._finalize_other(draft, result)
115
122
  if feature.parent:
@@ -123,33 +130,43 @@ class FeatureReader(Reader):
123
130
  biotype = draft.pick_attribute("gene_biotype", "biotype", "gene_type")
124
131
  if name is None or biotype is None:
125
132
  return feature
126
- return Gene(**feature.__dict__, name=name, biotype=biotype, exons=())
133
+ return Gene(**feature.__dict__, name=name, biotype=biotype, transcripts=())
127
134
 
128
- def _finalize_exon(self, draft: FeatureDraft, result: Features) -> Feature:
135
+ def _finalize_transcript(self, draft: FeatureDraft, result: Features) -> Feature:
129
136
  feature = self._finalize_other(draft, result)
137
+ if not (gene := self._find_ancestor_of_type(feature, Gene)):
138
+ return feature
139
+ transcript = Transcript(**feature.__dict__, gene=gene, exons=())
140
+ object.__setattr__(gene, "transcripts", gene.transcripts + (transcript,))
141
+ return transcript
130
142
 
131
- gene = feature.parent
132
- while gene and not isinstance(gene, Gene):
133
- gene = gene.parent
134
-
135
- if gene is None:
143
+ def _finalize_exon(self, draft: FeatureDraft, result: Features) -> Feature:
144
+ feature = self._finalize_other(draft, result)
145
+ if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
136
146
  return feature
137
- exon = Exon(**feature.__dict__, gene=gene)
138
- object.__setattr__(gene, "exons", gene.exons + (exon,))
147
+ exon = Exon(**feature.__dict__, gene=transcript.gene, transcript=transcript)
148
+ object.__setattr__(transcript, "exons", transcript.exons + (exon,))
139
149
  return exon
140
150
 
141
- def _finalize_three_prime_utr(
142
- self, draft: FeatureDraft, result: Features
143
- ) -> Feature:
151
+ UTRT = TypeVar("UTRT", bound=UTR)
152
+
153
+ def _finalize_utr(
154
+ self, draft: FeatureDraft, result: Features, type_: Type[UTRT]
155
+ ) -> Feature | UTRT:
144
156
  feature = self._finalize_other(draft, result)
157
+ if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
158
+ return feature
159
+ return type_(**feature.__dict__, gene=transcript.gene, transcript=transcript)
145
160
 
146
- gene = feature.parent
147
- while gene and not isinstance(gene, Gene):
148
- gene = gene.parent
161
+ FeatureT = TypeVar("FeatureT", bound=Feature)
149
162
 
150
- if gene is None:
151
- return feature
152
- return ThreePrimeUTR(**feature.__dict__, gene=gene)
163
+ def _find_ancestor_of_type(
164
+ self, feature: Feature, t: Type[FeatureT]
165
+ ) -> FeatureT | None:
166
+ ancestor = feature.parent
167
+ while ancestor and not isinstance(ancestor, t):
168
+ ancestor = ancestor.parent
169
+ return ancestor
153
170
 
154
171
  def _finalize_other(self, draft: FeatureDraft, result: Features) -> Feature:
155
172
  parent_id = self._extract_parent_id(draft)
@@ -188,6 +205,8 @@ class FeatureReader(Reader):
188
205
  return id_
189
206
  if draft.type_ == "transcript" and (id_ := draft.attributes.get("gene_id")):
190
207
  return id_
191
- if draft.type_ == "exon" and (id_ := draft.attributes.get("transcript_id")):
208
+ if draft.type_ in ("exon", "UTR", "three_prime_UTR", "five_prime_UTR") and (
209
+ id_ := draft.attributes.get("transcript_id")
210
+ ):
192
211
  return id_
193
212
  return None
@@ -4,7 +4,7 @@ from typing import Iterator, cast, TextIO
4
4
 
5
5
  from biofiles.common import Strand, Writer
6
6
  from biofiles.feature import FeatureReader, FeatureDraft, FeatureDrafts
7
- from biofiles.types.feature import Feature, Gene, Exon
7
+ from biofiles.types.feature import Feature, Gene, Exon, UTR
8
8
 
9
9
  __all__ = ["GFFReader", "GFF3Writer"]
10
10
 
@@ -122,8 +122,8 @@ class GFF3Writer(Writer):
122
122
  feature.sequence_id,
123
123
  feature.source,
124
124
  feature.type_,
125
- str(feature.start_original),
126
- str(feature.end_original),
125
+ str(feature.start_c + 1),
126
+ str(feature.end_c),
127
127
  str(feature.score) if feature.score is not None else ".",
128
128
  str(feature.strand) if feature.strand is not None else ".",
129
129
  str(feature.phase) if feature.phase is not None else ".",
@@ -142,14 +142,21 @@ if __name__ == "__main__":
142
142
  total_features = 0
143
143
  annotated_genes = 0
144
144
  annotated_exons = 0
145
+ annotated_utrs = 0
145
146
  parsed_genes = 0
146
147
  parsed_exons = 0
148
+ parsed_utrs = 0
147
149
  for feature in r:
148
150
  total_features += 1
149
151
  annotated_genes += feature.type_ == "gene"
150
152
  annotated_exons += feature.type_ == "exon"
153
+ annotated_utrs += "utr" in feature.type_.lower()
151
154
  parsed_genes += isinstance(feature, Gene)
152
155
  parsed_exons += isinstance(feature, Exon)
156
+ parsed_utrs += isinstance(feature, UTR)
153
157
  print(
154
- f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
158
+ f"{path}: {total_features} features, "
159
+ f"{parsed_genes} genes parsed out of {annotated_genes}, "
160
+ f"{parsed_exons} exons parsed out of {annotated_exons}, "
161
+ f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
155
162
  )
@@ -4,7 +4,7 @@ import sys
4
4
  from typing import Iterator
5
5
 
6
6
  from biofiles.gff import GFFReader
7
- from biofiles.types.feature import Gene, Exon, Feature
7
+ from biofiles.types.feature import Gene, Exon, Feature, UTR
8
8
 
9
9
 
10
10
  class GTFReader(GFFReader):
@@ -25,14 +25,21 @@ if __name__ == "__main__":
25
25
  total_features = 0
26
26
  annotated_genes = 0
27
27
  annotated_exons = 0
28
+ annotated_utrs = 0
28
29
  parsed_genes = 0
29
30
  parsed_exons = 0
31
+ parsed_utrs = 0
30
32
  for feature in r:
31
33
  total_features += 1
32
34
  annotated_genes += feature.type_ == "gene"
33
35
  annotated_exons += feature.type_ == "exon"
36
+ annotated_utrs += "utr" in feature.type_.lower()
34
37
  parsed_genes += isinstance(feature, Gene)
35
38
  parsed_exons += isinstance(feature, Exon)
39
+ parsed_utrs += isinstance(feature, UTR)
36
40
  print(
37
- f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
41
+ f"{path}: {total_features} features, "
42
+ f"{parsed_genes} genes parsed out of {annotated_genes}, "
43
+ f"{parsed_exons} exons parsed out of {annotated_exons}, "
44
+ f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
38
45
  )
@@ -3,7 +3,7 @@ from dataclasses import dataclass
3
3
  from biofiles.common import Strand
4
4
 
5
5
 
6
- __all__ = ["Feature", "Gene", "Exon"]
6
+ __all__ = ["Feature", "Gene", "Transcript", "Exon", "UTR", "ThreePrimeUTR"]
7
7
 
8
8
 
9
9
  @dataclass(frozen=True)
@@ -14,7 +14,7 @@ class Feature:
14
14
 
15
15
  start_original: int
16
16
  end_original: int
17
- # Original, 1-based inclusive values.
17
+ # Original values as they were present in the file (1-based inclusive for .gff and .gtf).
18
18
 
19
19
  start_c: int
20
20
  end_c: int
@@ -37,16 +37,28 @@ class Feature:
37
37
  class Gene(Feature):
38
38
  name: str
39
39
  biotype: str
40
+ transcripts: tuple["Transcript", ...]
41
+
42
+
43
+ @dataclass(frozen=True)
44
+ class Transcript(Feature):
45
+ gene: Gene
40
46
  exons: tuple["Exon", ...]
41
47
 
42
48
 
43
49
  @dataclass(frozen=True)
44
50
  class Exon(Feature):
45
51
  gene: Gene
46
- # TODO transcript, mRNA
52
+ transcript: Transcript
53
+ # TODO mRNA
47
54
 
48
55
 
49
56
  @dataclass(frozen=True)
50
- class ThreePrimeUTR(Feature):
57
+ class UTR(Feature):
51
58
  gene: Gene
52
- # TODO transcript
59
+ transcript: Transcript
60
+
61
+
62
+ @dataclass(frozen=True)
63
+ class ThreePrimeUTR(UTR):
64
+ pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: biofiles
3
- Version: 0.0.6
3
+ Version: 0.0.8
4
4
  Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
5
  Author-email: Tigran Saluev <tigran@saluev.com>
6
6
  Maintainer-email: Tigran Saluev <tigran@saluev.com>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "biofiles"
7
- version = "0.0.6"
7
+ version = "0.0.8"
8
8
  authors = [
9
9
  { name="Tigran Saluev", email="tigran@saluev.com" },
10
10
  ]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes