biofiles 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
biofiles/feature.py CHANGED
@@ -1,10 +1,10 @@
1
1
  from collections import deque
2
2
  from dataclasses import dataclass, field
3
3
  from pathlib import Path
4
- from typing import Iterator, TextIO, Type
4
+ from typing import Iterator, TextIO, Type, TypeVar
5
5
 
6
6
  from biofiles.common import Reader, Strand
7
- from biofiles.types.feature import Feature, Gene, ThreePrimeUTR, Exon, UTR
7
+ from biofiles.types.feature import Feature, Gene, ThreePrimeUTR, Exon, UTR, Transcript
8
8
 
9
9
 
10
10
  @dataclass
@@ -109,6 +109,8 @@ class FeatureReader(Reader):
109
109
  match draft.type_.lower():
110
110
  case "gene":
111
111
  feature = self._finalize_gene(draft, result)
112
+ case "transcript":
113
+ feature = self._finalize_transcript(draft, result)
112
114
  case "exon":
113
115
  feature = self._finalize_exon(draft, result)
114
116
  case "three_prime_utr":
@@ -128,33 +130,43 @@ class FeatureReader(Reader):
128
130
  biotype = draft.pick_attribute("gene_biotype", "biotype", "gene_type")
129
131
  if name is None or biotype is None:
130
132
  return feature
131
- return Gene(**feature.__dict__, name=name, biotype=biotype, exons=())
133
+ return Gene(**feature.__dict__, name=name, biotype=biotype, transcripts=())
132
134
 
133
- def _finalize_exon(self, draft: FeatureDraft, result: Features) -> Feature:
135
+ def _finalize_transcript(self, draft: FeatureDraft, result: Features) -> Feature:
134
136
  feature = self._finalize_other(draft, result)
137
+ if not (gene := self._find_ancestor_of_type(feature, Gene)):
138
+ return feature
139
+ transcript = Transcript(**feature.__dict__, gene=gene, exons=())
140
+ object.__setattr__(gene, "transcripts", gene.transcripts + (transcript,))
141
+ return transcript
135
142
 
136
- gene = feature.parent
137
- while gene and not isinstance(gene, Gene):
138
- gene = gene.parent
139
-
140
- if gene is None:
143
+ def _finalize_exon(self, draft: FeatureDraft, result: Features) -> Feature:
144
+ feature = self._finalize_other(draft, result)
145
+ if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
141
146
  return feature
142
- exon = Exon(**feature.__dict__, gene=gene)
143
- object.__setattr__(gene, "exons", gene.exons + (exon,))
147
+ exon = Exon(**feature.__dict__, gene=transcript.gene, transcript=transcript)
148
+ object.__setattr__(transcript, "exons", transcript.exons + (exon,))
144
149
  return exon
145
150
 
151
+ UTRT = TypeVar("UTRT", bound=UTR)
152
+
146
153
  def _finalize_utr(
147
- self, draft: FeatureDraft, result: Features, type_: Type[UTR]
148
- ) -> Feature:
154
+ self, draft: FeatureDraft, result: Features, type_: Type[UTRT]
155
+ ) -> Feature | UTRT:
149
156
  feature = self._finalize_other(draft, result)
157
+ if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
158
+ return feature
159
+ return type_(**feature.__dict__, gene=transcript.gene, transcript=transcript)
150
160
 
151
- gene = feature.parent
152
- while gene and not isinstance(gene, Gene):
153
- gene = gene.parent
161
+ FeatureT = TypeVar("FeatureT", bound=Feature)
154
162
 
155
- if gene is None:
156
- return feature
157
- return type_(**feature.__dict__, gene=gene)
163
+ def _find_ancestor_of_type(
164
+ self, feature: Feature, t: Type[FeatureT]
165
+ ) -> FeatureT | None:
166
+ ancestor = feature.parent
167
+ while ancestor and not isinstance(ancestor, t):
168
+ ancestor = ancestor.parent
169
+ return ancestor
158
170
 
159
171
  def _finalize_other(self, draft: FeatureDraft, result: Features) -> Feature:
160
172
  parent_id = self._extract_parent_id(draft)
@@ -193,6 +205,8 @@ class FeatureReader(Reader):
193
205
  return id_
194
206
  if draft.type_ == "transcript" and (id_ := draft.attributes.get("gene_id")):
195
207
  return id_
196
- if draft.type_ == "exon" and (id_ := draft.attributes.get("transcript_id")):
208
+ if draft.type_ in ("exon", "UTR", "three_prime_UTR", "five_prime_UTR") and (
209
+ id_ := draft.attributes.get("transcript_id")
210
+ ):
197
211
  return id_
198
212
  return None
biofiles/gff.py CHANGED
@@ -4,7 +4,7 @@ from typing import Iterator, cast, TextIO
4
4
 
5
5
  from biofiles.common import Strand, Writer
6
6
  from biofiles.feature import FeatureReader, FeatureDraft, FeatureDrafts
7
- from biofiles.types.feature import Feature, Gene, Exon
7
+ from biofiles.types.feature import Feature, Gene, Exon, UTR
8
8
 
9
9
  __all__ = ["GFFReader", "GFF3Writer"]
10
10
 
@@ -142,14 +142,21 @@ if __name__ == "__main__":
142
142
  total_features = 0
143
143
  annotated_genes = 0
144
144
  annotated_exons = 0
145
+ annotated_utrs = 0
145
146
  parsed_genes = 0
146
147
  parsed_exons = 0
148
+ parsed_utrs = 0
147
149
  for feature in r:
148
150
  total_features += 1
149
151
  annotated_genes += feature.type_ == "gene"
150
152
  annotated_exons += feature.type_ == "exon"
153
+ annotated_utrs += "utr" in feature.type_.lower()
151
154
  parsed_genes += isinstance(feature, Gene)
152
155
  parsed_exons += isinstance(feature, Exon)
156
+ parsed_utrs += isinstance(feature, UTR)
153
157
  print(
154
- f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
158
+ f"{path}: {total_features} features, "
159
+ f"{parsed_genes} genes parsed out of {annotated_genes}, "
160
+ f"{parsed_exons} exons parsed out of {annotated_exons}, "
161
+ f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
155
162
  )
biofiles/gtf.py CHANGED
@@ -4,7 +4,7 @@ import sys
4
4
  from typing import Iterator
5
5
 
6
6
  from biofiles.gff import GFFReader
7
- from biofiles.types.feature import Gene, Exon, Feature
7
+ from biofiles.types.feature import Gene, Exon, Feature, UTR
8
8
 
9
9
 
10
10
  class GTFReader(GFFReader):
@@ -25,14 +25,21 @@ if __name__ == "__main__":
25
25
  total_features = 0
26
26
  annotated_genes = 0
27
27
  annotated_exons = 0
28
+ annotated_utrs = 0
28
29
  parsed_genes = 0
29
30
  parsed_exons = 0
31
+ parsed_utrs = 0
30
32
  for feature in r:
31
33
  total_features += 1
32
34
  annotated_genes += feature.type_ == "gene"
33
35
  annotated_exons += feature.type_ == "exon"
36
+ annotated_utrs += "utr" in feature.type_.lower()
34
37
  parsed_genes += isinstance(feature, Gene)
35
38
  parsed_exons += isinstance(feature, Exon)
39
+ parsed_utrs += isinstance(feature, UTR)
36
40
  print(
37
- f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
41
+ f"{path}: {total_features} features, "
42
+ f"{parsed_genes} genes parsed out of {annotated_genes}, "
43
+ f"{parsed_exons} exons parsed out of {annotated_exons}, "
44
+ f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
38
45
  )
biofiles/types/feature.py CHANGED
@@ -3,7 +3,7 @@ from dataclasses import dataclass
3
3
  from biofiles.common import Strand
4
4
 
5
5
 
6
- __all__ = ["Feature", "Gene", "Exon"]
6
+ __all__ = ["Feature", "Gene", "Transcript", "Exon", "UTR", "ThreePrimeUTR"]
7
7
 
8
8
 
9
9
  @dataclass(frozen=True)
@@ -37,19 +37,26 @@ class Feature:
37
37
  class Gene(Feature):
38
38
  name: str
39
39
  biotype: str
40
+ transcripts: tuple["Transcript", ...]
41
+
42
+
43
+ @dataclass(frozen=True)
44
+ class Transcript(Feature):
45
+ gene: Gene
40
46
  exons: tuple["Exon", ...]
41
47
 
42
48
 
43
49
  @dataclass(frozen=True)
44
50
  class Exon(Feature):
45
51
  gene: Gene
46
- # TODO transcript, mRNA
52
+ transcript: Transcript
53
+ # TODO mRNA
47
54
 
48
55
 
49
56
  @dataclass(frozen=True)
50
57
  class UTR(Feature):
51
58
  gene: Gene
52
- # TODO transcript
59
+ transcript: Transcript
53
60
 
54
61
 
55
62
  @dataclass(frozen=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: biofiles
3
- Version: 0.0.7
3
+ Version: 0.0.8
4
4
  Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
5
  Author-email: Tigran Saluev <tigran@saluev.com>
6
6
  Maintainer-email: Tigran Saluev <tigran@saluev.com>
@@ -1,16 +1,16 @@
1
1
  biofiles/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  biofiles/common.py,sha256=Yi0i85FpD2wR3vqL645LTUAE6TybGDxxZQsUmEGHqu4,1126
3
3
  biofiles/fasta.py,sha256=ctIt5I_fcZx-xQN921zpmlZS7e9_ICf-3_i6mTs5qbs,2135
4
- biofiles/feature.py,sha256=wga91SYc2cMh5r5nH2m9IRM5hnVObesrxYr31kBU4WA,6874
5
- biofiles/gff.py,sha256=W5AjaQL_iYk4OF-H7C2pOjtpeLDEKfVg5uTOFxPDJ5I,5506
6
- biofiles/gtf.py,sha256=hWfjQjzwsrXLjCGr9ia6GdHNdYtlwkBrG1ldJYhRD-4,1251
4
+ biofiles/feature.py,sha256=oZKNkZrCJjg4-AutGy3rri0gq-FRyo7vLwUzYG1EY7g,7809
5
+ biofiles/gff.py,sha256=LIbHGkpSTo-iMeatt2opPFlpNs8tHyv9XHPIVwzh3m8,5790
6
+ biofiles/gtf.py,sha256=eQsnpTjDaxrBeQ8uHzXy6C6sj8OvenFv9zwkFlytQYM,1535
7
7
  biofiles/repeatmasker.py,sha256=DqD1z1hUfCP4-qnfjF-oMF-ZpW_6XhOf_nzA8VHhQbw,3079
8
8
  biofiles/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- biofiles/types/feature.py,sha256=gdywZKp7_TXiiZcXfbVs6oSHIWtju2-bqfQOmjH71Dg,1041
9
+ biofiles/types/feature.py,sha256=N6IIip7YqtSib5w_VLX1cBVwja8iWfa5AJncsKBs1PU,1209
10
10
  biofiles/types/repeat.py,sha256=63SqzAwEGIDIGP9pxC85RUdwXbbSm0S5WNL3lSiWlmc,641
11
11
  biofiles/types/sequence.py,sha256=EOw_oKuMR0THpCYJqVE__27z7qrRqcdIPrRWTL4OFMw,152
12
- biofiles-0.0.7.dist-info/LICENSE,sha256=CbR8ssdFyViKj25JAlMjIt1_FbiZ1tAC5t-uwUbxqak,1070
13
- biofiles-0.0.7.dist-info/METADATA,sha256=jd6-bM7tz1laNRX_wUCnDP4FRvQwGO0sxqqWYAvowiQ,3033
14
- biofiles-0.0.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
15
- biofiles-0.0.7.dist-info/top_level.txt,sha256=laFaFv8hpkI4U-Pgs0yBaAJXN2_CJKl7jb-m3-tGfSc,9
16
- biofiles-0.0.7.dist-info/RECORD,,
12
+ biofiles-0.0.8.dist-info/LICENSE,sha256=CbR8ssdFyViKj25JAlMjIt1_FbiZ1tAC5t-uwUbxqak,1070
13
+ biofiles-0.0.8.dist-info/METADATA,sha256=B0rgF4FGa2lgMehk6LdOEhHB2jddaoc76fteG3p4dp0,3033
14
+ biofiles-0.0.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
15
+ biofiles-0.0.8.dist-info/top_level.txt,sha256=laFaFv8hpkI4U-Pgs0yBaAJXN2_CJKl7jb-m3-tGfSc,9
16
+ biofiles-0.0.8.dist-info/RECORD,,