biofiles 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biofiles/feature.py +41 -22
- biofiles/gff.py +11 -4
- biofiles/gtf.py +9 -2
- biofiles/types/feature.py +17 -5
- {biofiles-0.0.6.dist-info → biofiles-0.0.8.dist-info}/METADATA +1 -1
- {biofiles-0.0.6.dist-info → biofiles-0.0.8.dist-info}/RECORD +9 -9
- {biofiles-0.0.6.dist-info → biofiles-0.0.8.dist-info}/LICENSE +0 -0
- {biofiles-0.0.6.dist-info → biofiles-0.0.8.dist-info}/WHEEL +0 -0
- {biofiles-0.0.6.dist-info → biofiles-0.0.8.dist-info}/top_level.txt +0 -0
biofiles/feature.py
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
from collections import deque
|
2
2
|
from dataclasses import dataclass, field
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Iterator, TextIO
|
4
|
+
from typing import Iterator, TextIO, Type, TypeVar
|
5
5
|
|
6
6
|
from biofiles.common import Reader, Strand
|
7
|
-
from biofiles.types.feature import Feature, Gene, ThreePrimeUTR, Exon
|
7
|
+
from biofiles.types.feature import Feature, Gene, ThreePrimeUTR, Exon, UTR, Transcript
|
8
8
|
|
9
9
|
|
10
10
|
@dataclass
|
@@ -67,6 +67,9 @@ class FeatureReader(Reader):
|
|
67
67
|
super().__init__(input_)
|
68
68
|
self._streaming_window = streaming_window
|
69
69
|
|
70
|
+
def __iter__(self) -> Iterator[Feature]:
|
71
|
+
raise NotImplementedError
|
72
|
+
|
70
73
|
def _finalize_drafts(
|
71
74
|
self, drafts: FeatureDrafts, w: int | None
|
72
75
|
) -> Iterator[Feature]:
|
@@ -106,10 +109,14 @@ class FeatureReader(Reader):
|
|
106
109
|
match draft.type_.lower():
|
107
110
|
case "gene":
|
108
111
|
feature = self._finalize_gene(draft, result)
|
112
|
+
case "transcript":
|
113
|
+
feature = self._finalize_transcript(draft, result)
|
109
114
|
case "exon":
|
110
115
|
feature = self._finalize_exon(draft, result)
|
111
116
|
case "three_prime_utr":
|
112
|
-
feature = self.
|
117
|
+
feature = self._finalize_utr(draft, result, ThreePrimeUTR)
|
118
|
+
case "utr":
|
119
|
+
feature = self._finalize_utr(draft, result, UTR)
|
113
120
|
case _:
|
114
121
|
feature = self._finalize_other(draft, result)
|
115
122
|
if feature.parent:
|
@@ -123,33 +130,43 @@ class FeatureReader(Reader):
|
|
123
130
|
biotype = draft.pick_attribute("gene_biotype", "biotype", "gene_type")
|
124
131
|
if name is None or biotype is None:
|
125
132
|
return feature
|
126
|
-
return Gene(**feature.__dict__, name=name, biotype=biotype,
|
133
|
+
return Gene(**feature.__dict__, name=name, biotype=biotype, transcripts=())
|
127
134
|
|
128
|
-
def
|
135
|
+
def _finalize_transcript(self, draft: FeatureDraft, result: Features) -> Feature:
|
129
136
|
feature = self._finalize_other(draft, result)
|
137
|
+
if not (gene := self._find_ancestor_of_type(feature, Gene)):
|
138
|
+
return feature
|
139
|
+
transcript = Transcript(**feature.__dict__, gene=gene, exons=())
|
140
|
+
object.__setattr__(gene, "transcripts", gene.transcripts + (transcript,))
|
141
|
+
return transcript
|
130
142
|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
if gene is None:
|
143
|
+
def _finalize_exon(self, draft: FeatureDraft, result: Features) -> Feature:
|
144
|
+
feature = self._finalize_other(draft, result)
|
145
|
+
if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
|
136
146
|
return feature
|
137
|
-
exon = Exon(**feature.__dict__, gene=gene)
|
138
|
-
object.__setattr__(
|
147
|
+
exon = Exon(**feature.__dict__, gene=transcript.gene, transcript=transcript)
|
148
|
+
object.__setattr__(transcript, "exons", transcript.exons + (exon,))
|
139
149
|
return exon
|
140
150
|
|
141
|
-
|
142
|
-
|
143
|
-
|
151
|
+
UTRT = TypeVar("UTRT", bound=UTR)
|
152
|
+
|
153
|
+
def _finalize_utr(
|
154
|
+
self, draft: FeatureDraft, result: Features, type_: Type[UTRT]
|
155
|
+
) -> Feature | UTRT:
|
144
156
|
feature = self._finalize_other(draft, result)
|
157
|
+
if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
|
158
|
+
return feature
|
159
|
+
return type_(**feature.__dict__, gene=transcript.gene, transcript=transcript)
|
145
160
|
|
146
|
-
|
147
|
-
while gene and not isinstance(gene, Gene):
|
148
|
-
gene = gene.parent
|
161
|
+
FeatureT = TypeVar("FeatureT", bound=Feature)
|
149
162
|
|
150
|
-
|
151
|
-
|
152
|
-
|
163
|
+
def _find_ancestor_of_type(
|
164
|
+
self, feature: Feature, t: Type[FeatureT]
|
165
|
+
) -> FeatureT | None:
|
166
|
+
ancestor = feature.parent
|
167
|
+
while ancestor and not isinstance(ancestor, t):
|
168
|
+
ancestor = ancestor.parent
|
169
|
+
return ancestor
|
153
170
|
|
154
171
|
def _finalize_other(self, draft: FeatureDraft, result: Features) -> Feature:
|
155
172
|
parent_id = self._extract_parent_id(draft)
|
@@ -188,6 +205,8 @@ class FeatureReader(Reader):
|
|
188
205
|
return id_
|
189
206
|
if draft.type_ == "transcript" and (id_ := draft.attributes.get("gene_id")):
|
190
207
|
return id_
|
191
|
-
if draft.type_
|
208
|
+
if draft.type_ in ("exon", "UTR", "three_prime_UTR", "five_prime_UTR") and (
|
209
|
+
id_ := draft.attributes.get("transcript_id")
|
210
|
+
):
|
192
211
|
return id_
|
193
212
|
return None
|
biofiles/gff.py
CHANGED
@@ -4,7 +4,7 @@ from typing import Iterator, cast, TextIO
|
|
4
4
|
|
5
5
|
from biofiles.common import Strand, Writer
|
6
6
|
from biofiles.feature import FeatureReader, FeatureDraft, FeatureDrafts
|
7
|
-
from biofiles.types.feature import Feature, Gene, Exon
|
7
|
+
from biofiles.types.feature import Feature, Gene, Exon, UTR
|
8
8
|
|
9
9
|
__all__ = ["GFFReader", "GFF3Writer"]
|
10
10
|
|
@@ -122,8 +122,8 @@ class GFF3Writer(Writer):
|
|
122
122
|
feature.sequence_id,
|
123
123
|
feature.source,
|
124
124
|
feature.type_,
|
125
|
-
str(feature.
|
126
|
-
str(feature.
|
125
|
+
str(feature.start_c + 1),
|
126
|
+
str(feature.end_c),
|
127
127
|
str(feature.score) if feature.score is not None else ".",
|
128
128
|
str(feature.strand) if feature.strand is not None else ".",
|
129
129
|
str(feature.phase) if feature.phase is not None else ".",
|
@@ -142,14 +142,21 @@ if __name__ == "__main__":
|
|
142
142
|
total_features = 0
|
143
143
|
annotated_genes = 0
|
144
144
|
annotated_exons = 0
|
145
|
+
annotated_utrs = 0
|
145
146
|
parsed_genes = 0
|
146
147
|
parsed_exons = 0
|
148
|
+
parsed_utrs = 0
|
147
149
|
for feature in r:
|
148
150
|
total_features += 1
|
149
151
|
annotated_genes += feature.type_ == "gene"
|
150
152
|
annotated_exons += feature.type_ == "exon"
|
153
|
+
annotated_utrs += "utr" in feature.type_.lower()
|
151
154
|
parsed_genes += isinstance(feature, Gene)
|
152
155
|
parsed_exons += isinstance(feature, Exon)
|
156
|
+
parsed_utrs += isinstance(feature, UTR)
|
153
157
|
print(
|
154
|
-
f"{path}: {total_features} features,
|
158
|
+
f"{path}: {total_features} features, "
|
159
|
+
f"{parsed_genes} genes parsed out of {annotated_genes}, "
|
160
|
+
f"{parsed_exons} exons parsed out of {annotated_exons}, "
|
161
|
+
f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
|
155
162
|
)
|
biofiles/gtf.py
CHANGED
@@ -4,7 +4,7 @@ import sys
|
|
4
4
|
from typing import Iterator
|
5
5
|
|
6
6
|
from biofiles.gff import GFFReader
|
7
|
-
from biofiles.types.feature import Gene, Exon, Feature
|
7
|
+
from biofiles.types.feature import Gene, Exon, Feature, UTR
|
8
8
|
|
9
9
|
|
10
10
|
class GTFReader(GFFReader):
|
@@ -25,14 +25,21 @@ if __name__ == "__main__":
|
|
25
25
|
total_features = 0
|
26
26
|
annotated_genes = 0
|
27
27
|
annotated_exons = 0
|
28
|
+
annotated_utrs = 0
|
28
29
|
parsed_genes = 0
|
29
30
|
parsed_exons = 0
|
31
|
+
parsed_utrs = 0
|
30
32
|
for feature in r:
|
31
33
|
total_features += 1
|
32
34
|
annotated_genes += feature.type_ == "gene"
|
33
35
|
annotated_exons += feature.type_ == "exon"
|
36
|
+
annotated_utrs += "utr" in feature.type_.lower()
|
34
37
|
parsed_genes += isinstance(feature, Gene)
|
35
38
|
parsed_exons += isinstance(feature, Exon)
|
39
|
+
parsed_utrs += isinstance(feature, UTR)
|
36
40
|
print(
|
37
|
-
f"{path}: {total_features} features,
|
41
|
+
f"{path}: {total_features} features, "
|
42
|
+
f"{parsed_genes} genes parsed out of {annotated_genes}, "
|
43
|
+
f"{parsed_exons} exons parsed out of {annotated_exons}, "
|
44
|
+
f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
|
38
45
|
)
|
biofiles/types/feature.py
CHANGED
@@ -3,7 +3,7 @@ from dataclasses import dataclass
|
|
3
3
|
from biofiles.common import Strand
|
4
4
|
|
5
5
|
|
6
|
-
__all__ = ["Feature", "Gene", "Exon"]
|
6
|
+
__all__ = ["Feature", "Gene", "Transcript", "Exon", "UTR", "ThreePrimeUTR"]
|
7
7
|
|
8
8
|
|
9
9
|
@dataclass(frozen=True)
|
@@ -14,7 +14,7 @@ class Feature:
|
|
14
14
|
|
15
15
|
start_original: int
|
16
16
|
end_original: int
|
17
|
-
# Original
|
17
|
+
# Original values as they were present in the file (1-based inclusive for .gff and .gtf).
|
18
18
|
|
19
19
|
start_c: int
|
20
20
|
end_c: int
|
@@ -37,16 +37,28 @@ class Feature:
|
|
37
37
|
class Gene(Feature):
|
38
38
|
name: str
|
39
39
|
biotype: str
|
40
|
+
transcripts: tuple["Transcript", ...]
|
41
|
+
|
42
|
+
|
43
|
+
@dataclass(frozen=True)
|
44
|
+
class Transcript(Feature):
|
45
|
+
gene: Gene
|
40
46
|
exons: tuple["Exon", ...]
|
41
47
|
|
42
48
|
|
43
49
|
@dataclass(frozen=True)
|
44
50
|
class Exon(Feature):
|
45
51
|
gene: Gene
|
46
|
-
|
52
|
+
transcript: Transcript
|
53
|
+
# TODO mRNA
|
47
54
|
|
48
55
|
|
49
56
|
@dataclass(frozen=True)
|
50
|
-
class
|
57
|
+
class UTR(Feature):
|
51
58
|
gene: Gene
|
52
|
-
|
59
|
+
transcript: Transcript
|
60
|
+
|
61
|
+
|
62
|
+
@dataclass(frozen=True)
|
63
|
+
class ThreePrimeUTR(UTR):
|
64
|
+
pass
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: biofiles
|
3
|
-
Version: 0.0.6
|
3
|
+
Version: 0.0.8
|
4
4
|
Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
|
5
5
|
Author-email: Tigran Saluev <tigran@saluev.com>
|
6
6
|
Maintainer-email: Tigran Saluev <tigran@saluev.com>
|
@@ -1,16 +1,16 @@
|
|
1
1
|
biofiles/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
biofiles/common.py,sha256=Yi0i85FpD2wR3vqL645LTUAE6TybGDxxZQsUmEGHqu4,1126
|
3
3
|
biofiles/fasta.py,sha256=ctIt5I_fcZx-xQN921zpmlZS7e9_ICf-3_i6mTs5qbs,2135
|
4
|
-
biofiles/feature.py,sha256=
|
5
|
-
biofiles/gff.py,sha256=
|
6
|
-
biofiles/gtf.py,sha256=
|
4
|
+
biofiles/feature.py,sha256=oZKNkZrCJjg4-AutGy3rri0gq-FRyo7vLwUzYG1EY7g,7809
|
5
|
+
biofiles/gff.py,sha256=LIbHGkpSTo-iMeatt2opPFlpNs8tHyv9XHPIVwzh3m8,5790
|
6
|
+
biofiles/gtf.py,sha256=eQsnpTjDaxrBeQ8uHzXy6C6sj8OvenFv9zwkFlytQYM,1535
|
7
7
|
biofiles/repeatmasker.py,sha256=DqD1z1hUfCP4-qnfjF-oMF-ZpW_6XhOf_nzA8VHhQbw,3079
|
8
8
|
biofiles/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
-
biofiles/types/feature.py,sha256=
|
9
|
+
biofiles/types/feature.py,sha256=N6IIip7YqtSib5w_VLX1cBVwja8iWfa5AJncsKBs1PU,1209
|
10
10
|
biofiles/types/repeat.py,sha256=63SqzAwEGIDIGP9pxC85RUdwXbbSm0S5WNL3lSiWlmc,641
|
11
11
|
biofiles/types/sequence.py,sha256=EOw_oKuMR0THpCYJqVE__27z7qrRqcdIPrRWTL4OFMw,152
|
12
|
-
biofiles-0.0.
|
13
|
-
biofiles-0.0.
|
14
|
-
biofiles-0.0.
|
15
|
-
biofiles-0.0.
|
16
|
-
biofiles-0.0.
|
12
|
+
biofiles-0.0.8.dist-info/LICENSE,sha256=CbR8ssdFyViKj25JAlMjIt1_FbiZ1tAC5t-uwUbxqak,1070
|
13
|
+
biofiles-0.0.8.dist-info/METADATA,sha256=B0rgF4FGa2lgMehk6LdOEhHB2jddaoc76fteG3p4dp0,3033
|
14
|
+
biofiles-0.0.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
15
|
+
biofiles-0.0.8.dist-info/top_level.txt,sha256=laFaFv8hpkI4U-Pgs0yBaAJXN2_CJKl7jb-m3-tGfSc,9
|
16
|
+
biofiles-0.0.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|