biofiles 0.0.7__tar.gz → 0.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biofiles-0.0.7 → biofiles-0.0.9}/PKG-INFO +1 -1
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles/feature.py +50 -23
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles/gff.py +10 -3
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles/gtf.py +10 -3
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles/repeatmasker.py +2 -2
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles/types/feature.py +10 -4
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles.egg-info/PKG-INFO +1 -1
- {biofiles-0.0.7 → biofiles-0.0.9}/pyproject.toml +1 -1
- {biofiles-0.0.7 → biofiles-0.0.9}/LICENSE +0 -0
- {biofiles-0.0.7 → biofiles-0.0.9}/README.md +0 -0
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles/__init__.py +0 -0
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles/common.py +0 -0
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles/fasta.py +0 -0
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles/types/__init__.py +0 -0
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles/types/repeat.py +0 -0
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles/types/sequence.py +0 -0
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles.egg-info/SOURCES.txt +0 -0
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles.egg-info/dependency_links.txt +0 -0
- {biofiles-0.0.7 → biofiles-0.0.9}/biofiles.egg-info/top_level.txt +0 -0
- {biofiles-0.0.7 → biofiles-0.0.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: biofiles
|
3
|
-
Version: 0.0.7
|
3
|
+
Version: 0.0.9
|
4
4
|
Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
|
5
5
|
Author-email: Tigran Saluev <tigran@saluev.com>
|
6
6
|
Maintainer-email: Tigran Saluev <tigran@saluev.com>
|
@@ -1,10 +1,17 @@
|
|
1
1
|
from collections import deque
|
2
2
|
from dataclasses import dataclass, field
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Iterator, TextIO, Type
|
4
|
+
from typing import Iterator, TextIO, Type, TypeVar, cast
|
5
5
|
|
6
6
|
from biofiles.common import Reader, Strand
|
7
|
-
from biofiles.types.feature import
|
7
|
+
from biofiles.types.feature import (
|
8
|
+
Feature,
|
9
|
+
Gene,
|
10
|
+
ThreePrimeUTR,
|
11
|
+
Exon,
|
12
|
+
UTR,
|
13
|
+
Transcript,
|
14
|
+
)
|
8
15
|
|
9
16
|
|
10
17
|
@dataclass
|
@@ -60,6 +67,12 @@ class Features:
|
|
60
67
|
self.by_id[id_] = feature
|
61
68
|
|
62
69
|
|
70
|
+
FeatureT = TypeVar("FeatureT", bound=Feature)
|
71
|
+
GeneT = TypeVar("GeneT", bound=Gene)
|
72
|
+
TranscriptT = TypeVar("TranscriptT", bound=Transcript)
|
73
|
+
UTRT = TypeVar("UTRT", bound=UTR)
|
74
|
+
|
75
|
+
|
63
76
|
class FeatureReader(Reader):
|
64
77
|
def __init__(
|
65
78
|
self, input_: TextIO | Path | str, /, streaming_window: int | None = 1000
|
@@ -107,8 +120,10 @@ class FeatureReader(Reader):
|
|
107
120
|
|
108
121
|
def _finalize_draft(self, draft: FeatureDraft, result: Features) -> Feature:
|
109
122
|
match draft.type_.lower():
|
110
|
-
case "gene":
|
111
|
-
feature = self._finalize_gene(draft, result)
|
123
|
+
case "gene" | "ncrna_gene":
|
124
|
+
feature = self._finalize_gene(draft, result, Gene)
|
125
|
+
case "transcript" | "mrna" | "lnc_rna":
|
126
|
+
feature = self._finalize_transcript(draft, result, Transcript)
|
112
127
|
case "exon":
|
113
128
|
feature = self._finalize_exon(draft, result)
|
114
129
|
case "three_prime_utr":
|
@@ -122,39 +137,49 @@ class FeatureReader(Reader):
|
|
122
137
|
object.__setattr__(feature.parent, "children", new_children)
|
123
138
|
return feature
|
124
139
|
|
125
|
-
def _finalize_gene(
|
140
|
+
def _finalize_gene(
|
141
|
+
self, draft: FeatureDraft, result: Features, type_: Type[GeneT]
|
142
|
+
) -> Feature:
|
126
143
|
feature = self._finalize_other(draft, result)
|
127
144
|
name = draft.pick_attribute("gene_name", "Name")
|
128
145
|
biotype = draft.pick_attribute("gene_biotype", "biotype", "gene_type")
|
129
146
|
if name is None or biotype is None:
|
130
147
|
return feature
|
131
|
-
return
|
148
|
+
return type_(**feature.__dict__, name=name, biotype=biotype, transcripts=())
|
132
149
|
|
133
|
-
def
|
150
|
+
def _finalize_transcript(
|
151
|
+
self, draft: FeatureDraft, result: Features, type_: Type[TranscriptT]
|
152
|
+
) -> Feature:
|
134
153
|
feature = self._finalize_other(draft, result)
|
154
|
+
if not (gene := self._find_ancestor_of_type(feature, Gene)):
|
155
|
+
return feature
|
156
|
+
transcript = type_(**feature.__dict__, gene=gene, exons=())
|
157
|
+
object.__setattr__(gene, "transcripts", gene.transcripts + (transcript,))
|
158
|
+
return transcript
|
135
159
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
if gene is None:
|
160
|
+
def _finalize_exon(self, draft: FeatureDraft, result: Features) -> Feature:
|
161
|
+
feature = self._finalize_other(draft, result)
|
162
|
+
if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
|
141
163
|
return feature
|
142
|
-
exon = Exon(**feature.__dict__, gene=gene)
|
143
|
-
object.__setattr__(
|
164
|
+
exon = Exon(**feature.__dict__, gene=transcript.gene, transcript=transcript)
|
165
|
+
object.__setattr__(transcript, "exons", transcript.exons + (exon,))
|
144
166
|
return exon
|
145
167
|
|
146
168
|
def _finalize_utr(
|
147
|
-
self, draft: FeatureDraft, result: Features, type_: Type[
|
169
|
+
self, draft: FeatureDraft, result: Features, type_: Type[UTRT]
|
148
170
|
) -> Feature:
|
149
171
|
feature = self._finalize_other(draft, result)
|
150
|
-
|
151
|
-
gene = feature.parent
|
152
|
-
while gene and not isinstance(gene, Gene):
|
153
|
-
gene = gene.parent
|
154
|
-
|
155
|
-
if gene is None:
|
172
|
+
if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
|
156
173
|
return feature
|
157
|
-
return type_(**feature.__dict__, gene=gene)
|
174
|
+
return type_(**feature.__dict__, gene=transcript.gene, transcript=transcript)
|
175
|
+
|
176
|
+
def _find_ancestor_of_type(
|
177
|
+
self, feature: Feature, t: Type[FeatureT]
|
178
|
+
) -> FeatureT | None:
|
179
|
+
ancestor = feature.parent
|
180
|
+
while ancestor and not isinstance(ancestor, t):
|
181
|
+
ancestor = ancestor.parent
|
182
|
+
return cast(FeatureT | None, ancestor)
|
158
183
|
|
159
184
|
def _finalize_other(self, draft: FeatureDraft, result: Features) -> Feature:
|
160
185
|
parent_id = self._extract_parent_id(draft)
|
@@ -193,6 +218,8 @@ class FeatureReader(Reader):
|
|
193
218
|
return id_
|
194
219
|
if draft.type_ == "transcript" and (id_ := draft.attributes.get("gene_id")):
|
195
220
|
return id_
|
196
|
-
if draft.type_
|
221
|
+
if draft.type_ in ("exon", "UTR", "three_prime_UTR", "five_prime_UTR") and (
|
222
|
+
id_ := draft.attributes.get("transcript_id")
|
223
|
+
):
|
197
224
|
return id_
|
198
225
|
return None
|
@@ -4,7 +4,7 @@ from typing import Iterator, cast, TextIO
|
|
4
4
|
|
5
5
|
from biofiles.common import Strand, Writer
|
6
6
|
from biofiles.feature import FeatureReader, FeatureDraft, FeatureDrafts
|
7
|
-
from biofiles.types.feature import Feature, Gene, Exon
|
7
|
+
from biofiles.types.feature import Feature, Gene, Exon, UTR
|
8
8
|
|
9
9
|
__all__ = ["GFFReader", "GFF3Writer"]
|
10
10
|
|
@@ -142,14 +142,21 @@ if __name__ == "__main__":
|
|
142
142
|
total_features = 0
|
143
143
|
annotated_genes = 0
|
144
144
|
annotated_exons = 0
|
145
|
+
annotated_utrs = 0
|
145
146
|
parsed_genes = 0
|
146
147
|
parsed_exons = 0
|
148
|
+
parsed_utrs = 0
|
147
149
|
for feature in r:
|
148
150
|
total_features += 1
|
149
|
-
annotated_genes += feature.type_ == "gene"
|
151
|
+
annotated_genes += "gene" in feature.type_.lower()
|
150
152
|
annotated_exons += feature.type_ == "exon"
|
153
|
+
annotated_utrs += "utr" in feature.type_.lower()
|
151
154
|
parsed_genes += isinstance(feature, Gene)
|
152
155
|
parsed_exons += isinstance(feature, Exon)
|
156
|
+
parsed_utrs += isinstance(feature, UTR)
|
153
157
|
print(
|
154
|
-
f"{path}: {total_features} features,
|
158
|
+
f"{path}: {total_features} features, "
|
159
|
+
f"{parsed_genes} genes parsed out of {annotated_genes}, "
|
160
|
+
f"{parsed_exons} exons parsed out of {annotated_exons}, "
|
161
|
+
f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
|
155
162
|
)
|
@@ -4,7 +4,7 @@ import sys
|
|
4
4
|
from typing import Iterator
|
5
5
|
|
6
6
|
from biofiles.gff import GFFReader
|
7
|
-
from biofiles.types.feature import Gene, Exon, Feature
|
7
|
+
from biofiles.types.feature import Gene, Exon, Feature, UTR
|
8
8
|
|
9
9
|
|
10
10
|
class GTFReader(GFFReader):
|
@@ -25,14 +25,21 @@ if __name__ == "__main__":
|
|
25
25
|
total_features = 0
|
26
26
|
annotated_genes = 0
|
27
27
|
annotated_exons = 0
|
28
|
+
annotated_utrs = 0
|
28
29
|
parsed_genes = 0
|
29
30
|
parsed_exons = 0
|
31
|
+
parsed_utrs = 0
|
30
32
|
for feature in r:
|
31
33
|
total_features += 1
|
32
|
-
annotated_genes += feature.type_ == "gene"
|
34
|
+
annotated_genes += "gene" in feature.type_.lower()
|
33
35
|
annotated_exons += feature.type_ == "exon"
|
36
|
+
annotated_utrs += "utr" in feature.type_.lower()
|
34
37
|
parsed_genes += isinstance(feature, Gene)
|
35
38
|
parsed_exons += isinstance(feature, Exon)
|
39
|
+
parsed_utrs += isinstance(feature, UTR)
|
36
40
|
print(
|
37
|
-
f"{path}: {total_features} features,
|
41
|
+
f"{path}: {total_features} features, "
|
42
|
+
f"{parsed_genes} genes parsed out of {annotated_genes}, "
|
43
|
+
f"{parsed_exons} exons parsed out of {annotated_exons}, "
|
44
|
+
f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
|
38
45
|
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import sys
|
2
2
|
from collections import Counter
|
3
|
-
from typing import Iterator
|
3
|
+
from typing import Iterator, cast, Literal
|
4
4
|
|
5
5
|
from biofiles.common import Reader
|
6
6
|
from biofiles.types.repeat import Repeat
|
@@ -42,7 +42,7 @@ class RepeatMaskerReader(Reader):
|
|
42
42
|
seq_start = int(seq_start_str)
|
43
43
|
seq_end = int(seq_end_str)
|
44
44
|
seq_left = int(seq_left_str[1:-1])
|
45
|
-
strand = {"+": "+", "C": "-"}[strand_str]
|
45
|
+
strand = cast(Literal["+", "-"], {"+": "+", "C": "-"}[strand_str])
|
46
46
|
|
47
47
|
if "/" in repeat_class_family:
|
48
48
|
repeat_class, repeat_family = repeat_class_family.split("/", 1)
|
@@ -3,7 +3,7 @@ from dataclasses import dataclass
|
|
3
3
|
from biofiles.common import Strand
|
4
4
|
|
5
5
|
|
6
|
-
__all__ = ["Feature", "Gene", "Exon"]
|
6
|
+
__all__ = ["Feature", "Gene", "Transcript", "Exon", "UTR", "ThreePrimeUTR"]
|
7
7
|
|
8
8
|
|
9
9
|
@dataclass(frozen=True)
|
@@ -26,7 +26,7 @@ class Feature:
|
|
26
26
|
attributes: dict[str, str]
|
27
27
|
|
28
28
|
id: str | None
|
29
|
-
parent: "
|
29
|
+
parent: "Feature | None"
|
30
30
|
children: tuple["Feature", ...]
|
31
31
|
|
32
32
|
|
@@ -37,19 +37,25 @@ class Feature:
|
|
37
37
|
class Gene(Feature):
|
38
38
|
name: str
|
39
39
|
biotype: str
|
40
|
+
transcripts: tuple["Transcript", ...]
|
41
|
+
|
42
|
+
|
43
|
+
@dataclass(frozen=True)
|
44
|
+
class Transcript(Feature):
|
45
|
+
gene: Gene
|
40
46
|
exons: tuple["Exon", ...]
|
41
47
|
|
42
48
|
|
43
49
|
@dataclass(frozen=True)
|
44
50
|
class Exon(Feature):
|
45
51
|
gene: Gene
|
46
|
-
|
52
|
+
transcript: Transcript
|
47
53
|
|
48
54
|
|
49
55
|
@dataclass(frozen=True)
|
50
56
|
class UTR(Feature):
|
51
57
|
gene: Gene
|
52
|
-
|
58
|
+
transcript: Transcript
|
53
59
|
|
54
60
|
|
55
61
|
@dataclass(frozen=True)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: biofiles
|
3
|
-
Version: 0.0.7
|
3
|
+
Version: 0.0.9
|
4
4
|
Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
|
5
5
|
Author-email: Tigran Saluev <tigran@saluev.com>
|
6
6
|
Maintainer-email: Tigran Saluev <tigran@saluev.com>
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|