biofiles 0.0.7__tar.gz → 0.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: biofiles
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
4
  Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
5
  Author-email: Tigran Saluev <tigran@saluev.com>
6
6
  Maintainer-email: Tigran Saluev <tigran@saluev.com>
@@ -1,10 +1,17 @@
1
1
  from collections import deque
2
2
  from dataclasses import dataclass, field
3
3
  from pathlib import Path
4
- from typing import Iterator, TextIO, Type
4
+ from typing import Iterator, TextIO, Type, TypeVar, cast
5
5
 
6
6
  from biofiles.common import Reader, Strand
7
- from biofiles.types.feature import Feature, Gene, ThreePrimeUTR, Exon, UTR
7
+ from biofiles.types.feature import (
8
+ Feature,
9
+ Gene,
10
+ ThreePrimeUTR,
11
+ Exon,
12
+ UTR,
13
+ Transcript,
14
+ )
8
15
 
9
16
 
10
17
  @dataclass
@@ -60,6 +67,12 @@ class Features:
60
67
  self.by_id[id_] = feature
61
68
 
62
69
 
70
+ FeatureT = TypeVar("FeatureT", bound=Feature)
71
+ GeneT = TypeVar("GeneT", bound=Gene)
72
+ TranscriptT = TypeVar("TranscriptT", bound=Transcript)
73
+ UTRT = TypeVar("UTRT", bound=UTR)
74
+
75
+
63
76
  class FeatureReader(Reader):
64
77
  def __init__(
65
78
  self, input_: TextIO | Path | str, /, streaming_window: int | None = 1000
@@ -107,8 +120,10 @@ class FeatureReader(Reader):
107
120
 
108
121
  def _finalize_draft(self, draft: FeatureDraft, result: Features) -> Feature:
109
122
  match draft.type_.lower():
110
- case "gene":
111
- feature = self._finalize_gene(draft, result)
123
+ case "gene" | "ncrna_gene":
124
+ feature = self._finalize_gene(draft, result, Gene)
125
+ case "transcript" | "mrna" | "lnc_rna":
126
+ feature = self._finalize_transcript(draft, result, Transcript)
112
127
  case "exon":
113
128
  feature = self._finalize_exon(draft, result)
114
129
  case "three_prime_utr":
@@ -122,39 +137,49 @@ class FeatureReader(Reader):
122
137
  object.__setattr__(feature.parent, "children", new_children)
123
138
  return feature
124
139
 
125
- def _finalize_gene(self, draft: FeatureDraft, result: Features) -> Feature:
140
+ def _finalize_gene(
141
+ self, draft: FeatureDraft, result: Features, type_: Type[GeneT]
142
+ ) -> Feature:
126
143
  feature = self._finalize_other(draft, result)
127
144
  name = draft.pick_attribute("gene_name", "Name")
128
145
  biotype = draft.pick_attribute("gene_biotype", "biotype", "gene_type")
129
146
  if name is None or biotype is None:
130
147
  return feature
131
- return Gene(**feature.__dict__, name=name, biotype=biotype, exons=())
148
+ return type_(**feature.__dict__, name=name, biotype=biotype, transcripts=())
132
149
 
133
- def _finalize_exon(self, draft: FeatureDraft, result: Features) -> Feature:
150
+ def _finalize_transcript(
151
+ self, draft: FeatureDraft, result: Features, type_: Type[TranscriptT]
152
+ ) -> Feature:
134
153
  feature = self._finalize_other(draft, result)
154
+ if not (gene := self._find_ancestor_of_type(feature, Gene)):
155
+ return feature
156
+ transcript = type_(**feature.__dict__, gene=gene, exons=())
157
+ object.__setattr__(gene, "transcripts", gene.transcripts + (transcript,))
158
+ return transcript
135
159
 
136
- gene = feature.parent
137
- while gene and not isinstance(gene, Gene):
138
- gene = gene.parent
139
-
140
- if gene is None:
160
+ def _finalize_exon(self, draft: FeatureDraft, result: Features) -> Feature:
161
+ feature = self._finalize_other(draft, result)
162
+ if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
141
163
  return feature
142
- exon = Exon(**feature.__dict__, gene=gene)
143
- object.__setattr__(gene, "exons", gene.exons + (exon,))
164
+ exon = Exon(**feature.__dict__, gene=transcript.gene, transcript=transcript)
165
+ object.__setattr__(transcript, "exons", transcript.exons + (exon,))
144
166
  return exon
145
167
 
146
168
  def _finalize_utr(
147
- self, draft: FeatureDraft, result: Features, type_: Type[UTR]
169
+ self, draft: FeatureDraft, result: Features, type_: Type[UTRT]
148
170
  ) -> Feature:
149
171
  feature = self._finalize_other(draft, result)
150
-
151
- gene = feature.parent
152
- while gene and not isinstance(gene, Gene):
153
- gene = gene.parent
154
-
155
- if gene is None:
172
+ if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
156
173
  return feature
157
- return type_(**feature.__dict__, gene=gene)
174
+ return type_(**feature.__dict__, gene=transcript.gene, transcript=transcript)
175
+
176
+ def _find_ancestor_of_type(
177
+ self, feature: Feature, t: Type[FeatureT]
178
+ ) -> FeatureT | None:
179
+ ancestor = feature.parent
180
+ while ancestor and not isinstance(ancestor, t):
181
+ ancestor = ancestor.parent
182
+ return cast(FeatureT | None, ancestor)
158
183
 
159
184
  def _finalize_other(self, draft: FeatureDraft, result: Features) -> Feature:
160
185
  parent_id = self._extract_parent_id(draft)
@@ -193,6 +218,8 @@ class FeatureReader(Reader):
193
218
  return id_
194
219
  if draft.type_ == "transcript" and (id_ := draft.attributes.get("gene_id")):
195
220
  return id_
196
- if draft.type_ == "exon" and (id_ := draft.attributes.get("transcript_id")):
221
+ if draft.type_ in ("exon", "UTR", "three_prime_UTR", "five_prime_UTR") and (
222
+ id_ := draft.attributes.get("transcript_id")
223
+ ):
197
224
  return id_
198
225
  return None
@@ -4,7 +4,7 @@ from typing import Iterator, cast, TextIO
4
4
 
5
5
  from biofiles.common import Strand, Writer
6
6
  from biofiles.feature import FeatureReader, FeatureDraft, FeatureDrafts
7
- from biofiles.types.feature import Feature, Gene, Exon
7
+ from biofiles.types.feature import Feature, Gene, Exon, UTR
8
8
 
9
9
  __all__ = ["GFFReader", "GFF3Writer"]
10
10
 
@@ -142,14 +142,21 @@ if __name__ == "__main__":
142
142
  total_features = 0
143
143
  annotated_genes = 0
144
144
  annotated_exons = 0
145
+ annotated_utrs = 0
145
146
  parsed_genes = 0
146
147
  parsed_exons = 0
148
+ parsed_utrs = 0
147
149
  for feature in r:
148
150
  total_features += 1
149
- annotated_genes += feature.type_ == "gene"
151
+ annotated_genes += "gene" in feature.type_.lower()
150
152
  annotated_exons += feature.type_ == "exon"
153
+ annotated_utrs += "utr" in feature.type_.lower()
151
154
  parsed_genes += isinstance(feature, Gene)
152
155
  parsed_exons += isinstance(feature, Exon)
156
+ parsed_utrs += isinstance(feature, UTR)
153
157
  print(
154
- f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
158
+ f"{path}: {total_features} features, "
159
+ f"{parsed_genes} genes parsed out of {annotated_genes}, "
160
+ f"{parsed_exons} exons parsed out of {annotated_exons}, "
161
+ f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
155
162
  )
@@ -4,7 +4,7 @@ import sys
4
4
  from typing import Iterator
5
5
 
6
6
  from biofiles.gff import GFFReader
7
- from biofiles.types.feature import Gene, Exon, Feature
7
+ from biofiles.types.feature import Gene, Exon, Feature, UTR
8
8
 
9
9
 
10
10
  class GTFReader(GFFReader):
@@ -25,14 +25,21 @@ if __name__ == "__main__":
25
25
  total_features = 0
26
26
  annotated_genes = 0
27
27
  annotated_exons = 0
28
+ annotated_utrs = 0
28
29
  parsed_genes = 0
29
30
  parsed_exons = 0
31
+ parsed_utrs = 0
30
32
  for feature in r:
31
33
  total_features += 1
32
- annotated_genes += feature.type_ == "gene"
34
+ annotated_genes += "gene" in feature.type_.lower()
33
35
  annotated_exons += feature.type_ == "exon"
36
+ annotated_utrs += "utr" in feature.type_.lower()
34
37
  parsed_genes += isinstance(feature, Gene)
35
38
  parsed_exons += isinstance(feature, Exon)
39
+ parsed_utrs += isinstance(feature, UTR)
36
40
  print(
37
- f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
41
+ f"{path}: {total_features} features, "
42
+ f"{parsed_genes} genes parsed out of {annotated_genes}, "
43
+ f"{parsed_exons} exons parsed out of {annotated_exons}, "
44
+ f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
38
45
  )
@@ -1,6 +1,6 @@
1
1
  import sys
2
2
  from collections import Counter
3
- from typing import Iterator
3
+ from typing import Iterator, cast, Literal
4
4
 
5
5
  from biofiles.common import Reader
6
6
  from biofiles.types.repeat import Repeat
@@ -42,7 +42,7 @@ class RepeatMaskerReader(Reader):
42
42
  seq_start = int(seq_start_str)
43
43
  seq_end = int(seq_end_str)
44
44
  seq_left = int(seq_left_str[1:-1])
45
- strand = {"+": "+", "C": "-"}[strand_str]
45
+ strand = cast(Literal["+", "-"], {"+": "+", "C": "-"}[strand_str])
46
46
 
47
47
  if "/" in repeat_class_family:
48
48
  repeat_class, repeat_family = repeat_class_family.split("/", 1)
@@ -3,7 +3,7 @@ from dataclasses import dataclass
3
3
  from biofiles.common import Strand
4
4
 
5
5
 
6
- __all__ = ["Feature", "Gene", "Exon"]
6
+ __all__ = ["Feature", "Gene", "Transcript", "Exon", "UTR", "ThreePrimeUTR"]
7
7
 
8
8
 
9
9
  @dataclass(frozen=True)
@@ -26,7 +26,7 @@ class Feature:
26
26
  attributes: dict[str, str]
27
27
 
28
28
  id: str | None
29
- parent: "GFFFeature | None"
29
+ parent: "Feature | None"
30
30
  children: tuple["Feature", ...]
31
31
 
32
32
 
@@ -37,19 +37,25 @@ class Feature:
37
37
  class Gene(Feature):
38
38
  name: str
39
39
  biotype: str
40
+ transcripts: tuple["Transcript", ...]
41
+
42
+
43
+ @dataclass(frozen=True)
44
+ class Transcript(Feature):
45
+ gene: Gene
40
46
  exons: tuple["Exon", ...]
41
47
 
42
48
 
43
49
  @dataclass(frozen=True)
44
50
  class Exon(Feature):
45
51
  gene: Gene
46
- # TODO transcript, mRNA
52
+ transcript: Transcript
47
53
 
48
54
 
49
55
  @dataclass(frozen=True)
50
56
  class UTR(Feature):
51
57
  gene: Gene
52
- # TODO transcript
58
+ transcript: Transcript
53
59
 
54
60
 
55
61
  @dataclass(frozen=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: biofiles
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
4
  Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
5
  Author-email: Tigran Saluev <tigran@saluev.com>
6
6
  Maintainer-email: Tigran Saluev <tigran@saluev.com>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "biofiles"
7
- version = "0.0.7"
7
+ version = "0.0.9"
8
8
  authors = [
9
9
  { name="Tigran Saluev", email="tigran@saluev.com" },
10
10
  ]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes