biofiles 0.0.14__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biofiles/dialects/detector.py +74 -0
- biofiles/dialects/gencode.py +235 -0
- biofiles/dialects/genomic_base.py +25 -0
- biofiles/dialects/refseq.py +142 -0
- biofiles/dialects/stringtie.py +24 -0
- biofiles/gff.py +49 -44
- biofiles/gtf.py +34 -11
- biofiles/types/feature.py +268 -34
- biofiles/utility/cli.py +2 -1
- biofiles/utility/feature.py +180 -211
- {biofiles-0.0.14.dist-info → biofiles-0.1.1.dist-info}/METADATA +9 -4
- biofiles-0.1.1.dist-info/RECORD +27 -0
- biofiles/dialects/havana_ensembl.py +0 -101
- biofiles/types/feature_v2.py +0 -105
- biofiles/utility/feature_v2.py +0 -148
- biofiles-0.0.14.dist-info/RECORD +0 -25
- {biofiles-0.0.14.dist-info → biofiles-0.1.1.dist-info}/WHEEL +0 -0
- {biofiles-0.0.14.dist-info → biofiles-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {biofiles-0.0.14.dist-info → biofiles-0.1.1.dist-info}/top_level.txt +0 -0
biofiles/utility/feature.py
CHANGED
@@ -1,17 +1,16 @@
|
|
1
|
-
from collections import deque
|
1
|
+
from collections import deque, defaultdict
|
2
2
|
from dataclasses import dataclass, field
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Iterator, TextIO, Type
|
4
|
+
from typing import Any, Iterator, TextIO, Type
|
5
5
|
|
6
|
-
from biofiles.common import
|
6
|
+
from biofiles.common import Strand, Reader
|
7
7
|
from biofiles.types.feature import (
|
8
8
|
Feature,
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
CDS,
|
9
|
+
FeatureMetaclass,
|
10
|
+
Relation,
|
11
|
+
Source,
|
12
|
+
get_composite_field,
|
13
|
+
Dialect,
|
15
14
|
)
|
16
15
|
|
17
16
|
|
@@ -23,225 +22,195 @@ class FeatureDraft:
|
|
23
22
|
type_: str
|
24
23
|
start_original: int
|
25
24
|
end_original: int
|
25
|
+
start_c: int
|
26
|
+
end_c: int
|
26
27
|
score: float | None
|
27
28
|
strand: Strand | None
|
28
29
|
phase: int | None
|
29
|
-
attributes: dict[str, str]
|
30
|
+
attributes: dict[str, str | list[str]]
|
30
31
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
return value
|
35
|
-
return None
|
32
|
+
class_: Type[Feature] | None = None
|
33
|
+
id: Any = None
|
34
|
+
finalized: Feature | None = None
|
36
35
|
|
37
36
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
by_id: dict[str, FeatureDraft] = field(default_factory=dict)
|
42
|
-
# deps: dict[int, deque[int]] = field(default_factory=lambda: defaultdict(deque))
|
37
|
+
class FeatureTypes:
|
38
|
+
ambiguous_type_mapping: dict[str, list[FeatureMetaclass]]
|
39
|
+
unique_type_mapping: dict[str, FeatureMetaclass]
|
43
40
|
|
44
|
-
def
|
45
|
-
self.
|
46
|
-
|
47
|
-
self.by_id[id_] = draft
|
48
|
-
# if parent_id := draft.attributes.get("Parent", None):
|
49
|
-
# parent = self.by_id[parent_id]
|
50
|
-
# self.deps[parent.idx].append(draft.idx)
|
41
|
+
def __init__(self, feature_types: list[FeatureMetaclass]) -> None:
|
42
|
+
self.ambiguous_type_mapping = defaultdict(list)
|
43
|
+
self.unique_type_mapping = {}
|
51
44
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
# if id_ := draft.attributes.get("ID", None):
|
56
|
-
# del self.by_id[id_]
|
57
|
-
# self.deps.pop(draft.idx, None)
|
45
|
+
for ft in feature_types:
|
46
|
+
for type in ft.__filter_type__:
|
47
|
+
self.ambiguous_type_mapping[type].append(ft)
|
58
48
|
|
49
|
+
for key, fts in [*self.ambiguous_type_mapping.items()]:
|
50
|
+
if len(fts) == 1:
|
51
|
+
self.unique_type_mapping[key] = fts[0]
|
52
|
+
del self.ambiguous_type_mapping[key]
|
53
|
+
continue
|
54
|
+
self.ambiguous_type_mapping[key] = _sort_by_filter_specificity(fts)
|
59
55
|
|
60
|
-
@dataclass
|
61
|
-
class Features:
|
62
|
-
features: list[Feature] = field(default_factory=list)
|
63
|
-
by_id: dict[str, Feature] = field(default_factory=dict)
|
64
56
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
57
|
+
def _sort_by_filter_specificity(fts: list[FeatureMetaclass]) -> list[FeatureMetaclass]:
|
58
|
+
"""Sort feature classes by their filter specificity, most specific -> least specific."""
|
59
|
+
key = lambda ft: bool(ft.__filter_starts__) + bool(ft.__filter_ends__)
|
60
|
+
return sorted(fts, key=key, reverse=True)
|
61
|
+
|
62
|
+
|
63
|
+
@dataclass
|
64
|
+
class FeatureDrafts:
|
65
|
+
feature_types: FeatureTypes
|
66
|
+
drafts: list[FeatureDraft] = field(default_factory=deque)
|
67
|
+
by_class_and_id: dict[tuple[type, Any], FeatureDraft] = field(default_factory=dict)
|
69
68
|
|
69
|
+
def add(self, draft: FeatureDraft) -> None:
|
70
|
+
self.drafts.append(draft)
|
71
|
+
if class_ := self.feature_types.unique_type_mapping.get(draft.type_.lower()):
|
72
|
+
draft.class_ = class_
|
73
|
+
draft.id = get_composite_field(
|
74
|
+
draft.attributes, class_.__id_attribute_source__
|
75
|
+
)
|
76
|
+
self.register(draft)
|
77
|
+
|
78
|
+
def register(self, draft: FeatureDraft) -> None:
|
79
|
+
if draft.id is None:
|
80
|
+
return
|
81
|
+
if (key := (draft.class_, draft.id)) in self.by_class_and_id:
|
82
|
+
raise ValueError(
|
83
|
+
f"duplicate feature ID {draft.id} for class {draft.class_.__name__}"
|
84
|
+
)
|
85
|
+
self.by_class_and_id[key] = draft
|
86
|
+
|
87
|
+
|
88
|
+
class RawFeatureReader(Reader):
|
89
|
+
def __init__(self, input_: TextIO | Path) -> None:
|
90
|
+
super().__init__(input_)
|
70
91
|
|
71
|
-
|
72
|
-
|
73
|
-
TranscriptT = TypeVar("TranscriptT", bound=Transcript)
|
74
|
-
UTRT = TypeVar("UTRT", bound=UTR)
|
92
|
+
def __iter__(self) -> Iterator[FeatureDraft]:
|
93
|
+
raise NotImplementedError
|
75
94
|
|
76
95
|
|
77
96
|
class FeatureReader(Reader):
|
78
|
-
|
79
|
-
|
80
|
-
):
|
97
|
+
|
98
|
+
def __init__(self, input_: TextIO | Path | str, dialect: Dialect) -> None:
|
81
99
|
super().__init__(input_)
|
82
|
-
self.
|
100
|
+
self._feature_types = FeatureTypes(dialect.feature_types)
|
101
|
+
self._raw_reader = self._make_raw_feature_reader()
|
83
102
|
|
84
|
-
def
|
103
|
+
def _make_raw_feature_reader(self) -> RawFeatureReader:
|
85
104
|
raise NotImplementedError
|
86
105
|
|
87
|
-
def
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
def
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
return
|
185
|
-
|
186
|
-
def
|
187
|
-
self,
|
188
|
-
) ->
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
while ancestor and not isinstance(ancestor, t):
|
199
|
-
ancestor = ancestor.parent
|
200
|
-
return cast(FeatureT | None, ancestor)
|
201
|
-
|
202
|
-
def _finalize_other(self, draft: FeatureDraft, result: Features) -> Feature:
|
203
|
-
parent_id = self._extract_parent_id(draft)
|
204
|
-
parent = result.by_id.get(parent_id) if parent_id is not None else None
|
205
|
-
|
206
|
-
return Feature(
|
207
|
-
sequence_id=draft.sequence_id,
|
208
|
-
source=draft.source,
|
209
|
-
type_=draft.type_,
|
210
|
-
start_original=draft.start_original,
|
211
|
-
end_original=draft.end_original,
|
212
|
-
start_c=draft.start_original - 1,
|
213
|
-
end_c=draft.end_original,
|
214
|
-
score=draft.score,
|
215
|
-
strand=draft.strand,
|
216
|
-
phase=draft.phase,
|
217
|
-
attributes=draft.attributes,
|
218
|
-
id=self._extract_id(draft),
|
219
|
-
parent=parent,
|
220
|
-
children=(),
|
221
|
-
)
|
222
|
-
|
223
|
-
def _extract_id(self, draft: FeatureDraft) -> str | None:
|
224
|
-
if (id_ := draft.attributes.get("ID")) is not None:
|
225
|
-
return id_
|
226
|
-
if draft.type_ == "gene" and (id_ := draft.attributes.get("gene_id")):
|
227
|
-
return id_
|
228
|
-
if draft.type_ == "transcript" and (
|
229
|
-
id_ := draft.attributes.get("transcript_id")
|
230
|
-
):
|
231
|
-
return id_
|
232
|
-
if draft.type_ == "exon" and (id_ := draft.attributes.get("exon_id")):
|
233
|
-
return id_
|
234
|
-
return None
|
235
|
-
|
236
|
-
def _extract_parent_id(self, draft: FeatureDraft) -> str | None:
|
237
|
-
if (id_ := draft.attributes.get("Parent")) is not None:
|
238
|
-
return id_
|
239
|
-
if draft.type_ == "transcript" and (id_ := draft.attributes.get("gene_id")):
|
240
|
-
return id_
|
241
|
-
if draft.type_ in ("exon", "UTR", "three_prime_UTR", "five_prime_UTR") and (
|
242
|
-
id_ := draft.attributes.get("transcript_id")
|
243
|
-
):
|
244
|
-
return id_
|
245
|
-
if draft.type_.lower() == "cds" and (id_ := draft.attributes.get("exon_id")):
|
246
|
-
return id_
|
247
|
-
return None
|
106
|
+
def __iter__(self) -> Iterator[Feature]:
|
107
|
+
fds = FeatureDrafts(self._feature_types)
|
108
|
+
for draft in self._raw_reader:
|
109
|
+
fds.add(draft)
|
110
|
+
yield from self._finalize_drafts(fds)
|
111
|
+
|
112
|
+
def _finalize_drafts(self, fds: FeatureDrafts) -> Iterator[Feature]:
|
113
|
+
self._choose_classes(fds)
|
114
|
+
self._instantiate_objects(fds)
|
115
|
+
self._fill_relations(fds)
|
116
|
+
for fd in fds.drafts:
|
117
|
+
yield fd.finalized
|
118
|
+
|
119
|
+
def _choose_classes(self, fds: FeatureDrafts) -> None:
|
120
|
+
for fd in fds.drafts:
|
121
|
+
if fd.class_:
|
122
|
+
continue
|
123
|
+
|
124
|
+
fts = self._feature_types.ambiguous_type_mapping[fd.type_]
|
125
|
+
matching_fts = [ft for ft in fts if self._check_filters(fds, fd, ft)]
|
126
|
+
if not matching_fts:
|
127
|
+
raise ValueError(
|
128
|
+
f"no matching classes (out of {len(fts)}) for "
|
129
|
+
f"feature with type {fd.type_!r}, attributes {fd.attributes!r}"
|
130
|
+
)
|
131
|
+
if len(matching_fts) > 1:
|
132
|
+
raise ValueError(
|
133
|
+
f"too many matching classes ({len(matching_fts)}) for "
|
134
|
+
f"feature with type {fd.type_!r}, attributes {fd.attributes!r}"
|
135
|
+
)
|
136
|
+
ft = matching_fts[0]
|
137
|
+
fd.class_ = ft
|
138
|
+
fd.id = get_composite_field(fd.attributes, ft.__id_attribute_source__)
|
139
|
+
fds.register(fd)
|
140
|
+
|
141
|
+
def _instantiate_objects(self, fds: FeatureDrafts) -> None:
|
142
|
+
for fd in fds.drafts:
|
143
|
+
fd.finalized = fd.class_(
|
144
|
+
sequence_id=fd.sequence_id,
|
145
|
+
source=fd.source,
|
146
|
+
type_=fd.type_,
|
147
|
+
start_original=fd.start_original,
|
148
|
+
end_original=fd.end_original,
|
149
|
+
start_c=fd.start_c,
|
150
|
+
end_c=fd.end_c,
|
151
|
+
score=fd.score,
|
152
|
+
strand=fd.strand,
|
153
|
+
phase=fd.phase,
|
154
|
+
attributes=fd.attributes,
|
155
|
+
)
|
156
|
+
|
157
|
+
def _fill_relations(self, fds: FeatureDrafts) -> None:
|
158
|
+
for fd in fds.drafts:
|
159
|
+
for relation in fd.class_.__relations__:
|
160
|
+
related_id = get_composite_field(
|
161
|
+
fd.attributes, relation.id_attribute_source
|
162
|
+
)
|
163
|
+
related_class = relation.inverse.class_
|
164
|
+
try:
|
165
|
+
related_fd = fds.by_class_and_id[related_class, related_id]
|
166
|
+
except KeyError as exc:
|
167
|
+
raise ValueError(
|
168
|
+
f"can't find related {related_class.__name__} {related_id} for {fd.finalized}"
|
169
|
+
) from exc
|
170
|
+
setattr(fd.finalized, relation.attribute_name, related_fd.finalized)
|
171
|
+
if relation.inverse.attribute_name is None:
|
172
|
+
pass
|
173
|
+
elif relation.inverse.one_to_one:
|
174
|
+
setattr(
|
175
|
+
related_fd.finalized,
|
176
|
+
relation.inverse.attribute_name,
|
177
|
+
fd.finalized,
|
178
|
+
)
|
179
|
+
else:
|
180
|
+
getattr(
|
181
|
+
related_fd.finalized, relation.inverse.attribute_name
|
182
|
+
).append(fd.finalized)
|
183
|
+
|
184
|
+
def _check_filters(
|
185
|
+
self, fds: FeatureDrafts, fd: FeatureDraft, ft: FeatureMetaclass
|
186
|
+
) -> bool:
|
187
|
+
if r := ft.__filter_starts__:
|
188
|
+
related_fd = self._get_related_feature_draft(fds, fd, r)
|
189
|
+
if fd.strand != related_fd.strand:
|
190
|
+
return False
|
191
|
+
if fd.strand == "+" and fd.start_original != related_fd.start_original:
|
192
|
+
return False
|
193
|
+
if fd.strand == "-" and fd.end_original != related_fd.end_original:
|
194
|
+
return False
|
195
|
+
if r := ft.__filter_ends__:
|
196
|
+
related_fd = self._get_related_feature_draft(fds, fd, r)
|
197
|
+
if fd.strand != related_fd.strand:
|
198
|
+
return False
|
199
|
+
if fd.strand == "+" and fd.end_original != related_fd.end_original:
|
200
|
+
return False
|
201
|
+
if fd.strand == "-" and fd.start_original != related_fd.start_original:
|
202
|
+
return False
|
203
|
+
return True
|
204
|
+
|
205
|
+
def _get_related_feature_draft(
|
206
|
+
self, fds: FeatureDrafts, fd: FeatureDraft, r: Relation
|
207
|
+
) -> FeatureDraft:
|
208
|
+
related_class = r.inverse.class_
|
209
|
+
related_id = fd.attributes[r.id_attribute_source]
|
210
|
+
try:
|
211
|
+
return fds.by_class_and_id[related_class, related_id]
|
212
|
+
except KeyError as exc:
|
213
|
+
raise ValueError(
|
214
|
+
f"can't find related {related_class.__name__} for "
|
215
|
+
f"{fd.class_.__name__} with attributes {fd.attributes!r}"
|
216
|
+
) from exc
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: biofiles
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.1.1
|
4
4
|
Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
|
5
5
|
Author-email: Tigran Saluev <tigran@saluev.com>
|
6
6
|
Maintainer-email: Tigran Saluev <tigran@saluev.com>
|
@@ -30,7 +30,6 @@ Project-URL: Homepage, https://github.com/Saluev/biofiles
|
|
30
30
|
Classifier: Programming Language :: Python :: 3
|
31
31
|
Classifier: License :: OSI Approved :: MIT License
|
32
32
|
Classifier: Operating System :: OS Independent
|
33
|
-
Classifier: Programming Language :: Python :: 3.10
|
34
33
|
Classifier: Programming Language :: Python :: 3.11
|
35
34
|
Classifier: Programming Language :: Python :: 3.12
|
36
35
|
Requires-Python: >=3.10
|
@@ -84,14 +83,20 @@ Reading GFF genome annotations:
|
|
84
83
|
|
85
84
|
```python
|
86
85
|
from biofiles.gff import GFFReader
|
87
|
-
from biofiles.
|
86
|
+
from biofiles.dialects.gencode import GENCODE_DIALECT
|
87
|
+
from biofiles.dialects.genomic_base import Gene
|
88
88
|
|
89
|
-
with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff") as r:
|
89
|
+
with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff", dialect=GENCODE_DIALECT) as r:
|
90
90
|
for feature in r:
|
91
91
|
if isinstance(feature, Gene):
|
92
92
|
print(feature.name, len(feature.exons))
|
93
93
|
```
|
94
94
|
|
95
|
+
Currently three dialects are supported:
|
96
|
+
* `biofiles.dialects.gencode.GENCODE_DIALECT` for GENCODE genome annotation;
|
97
|
+
* `biofiles.dialects.refseq.REFSEQ_DIALECT` for RefSeq genome annotation;
|
98
|
+
* `biofiles.dialects.stringtie.STRINGTIE_DIALECT` for StringTie output files.
|
99
|
+
|
95
100
|
## License
|
96
101
|
|
97
102
|
MIT license, see [License](LICENSE).
|
@@ -0,0 +1,27 @@
|
|
1
|
+
biofiles/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
biofiles/bam.py,sha256=w32LLOAuKWdGF7joTSrB4HYXCdfvvijZW44jizG36R8,6771
|
3
|
+
biofiles/common.py,sha256=Yi0i85FpD2wR3vqL645LTUAE6TybGDxxZQsUmEGHqu4,1126
|
4
|
+
biofiles/fai.py,sha256=gG2oDmaU7PIIYYBc8LiudaeKdo-6WIdwsMDIM7qi098,678
|
5
|
+
biofiles/fasta.py,sha256=ctIt5I_fcZx-xQN921zpmlZS7e9_ICf-3_i6mTs5qbs,2135
|
6
|
+
biofiles/gff.py,sha256=X1VK6QTPq0_w5jR2zwyjHcs9SE339yh3QGpwF5hg-T8,6357
|
7
|
+
biofiles/gtf.py,sha256=wCtHzLTCqR-oAGPtiRDJ4GvyS_Z5FzmLLzw8lgTj5Ig,3490
|
8
|
+
biofiles/repeatmasker.py,sha256=7KObXELCHQ6oBkO8yK6Znrs6MX8sfVuxNSmOMe0Ogfk,3289
|
9
|
+
biofiles/dialects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
+
biofiles/dialects/detector.py,sha256=OP42NhQf3s609fmDaRRFEMawgf0VMZdeAfkbK6IL4yM,2461
|
11
|
+
biofiles/dialects/gencode.py,sha256=4XyHZfrvxDm64bWnEYyatmkKtFgdgZynCXpiCGQ8OnY,7724
|
12
|
+
biofiles/dialects/genomic_base.py,sha256=JOVmVHj3DNboZK79v4Ofj5qNRWpABZrUDdMntFZkx4U,356
|
13
|
+
biofiles/dialects/refseq.py,sha256=UYjDb5S98hjhfD1tPAIY796E0ditsZAvc-u5qSKfvOk,4211
|
14
|
+
biofiles/dialects/stringtie.py,sha256=kuQ6IWRo7c4xkNjN-4jjYy3_dRUPorMaffYvDYvQDD8,834
|
15
|
+
biofiles/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
+
biofiles/types/alignment.py,sha256=Kc0XteLyfj1gNJNLsUgzSKzAAoMobhkJyPFsovaU7dM,1696
|
17
|
+
biofiles/types/feature.py,sha256=r4zJlwKcO8LSy9EfRsLfrDcLyVLMH1uGGX1-ZNnQLOc,11580
|
18
|
+
biofiles/types/repeat.py,sha256=63SqzAwEGIDIGP9pxC85RUdwXbbSm0S5WNL3lSiWlmc,641
|
19
|
+
biofiles/types/sequence.py,sha256=XeJ3wgi8AwRaVYVKmf41y5mOmWQfdsS8ysaRLZWbNoQ,254
|
20
|
+
biofiles/utility/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
|
+
biofiles/utility/cli.py,sha256=i2kYpPFnpTYdbd_6T-U9-egitKhDGQVDbw5gme4Kelg,4074
|
22
|
+
biofiles/utility/feature.py,sha256=bHUOfYir_Dfk_DEBrlO1GXvkJKDRkpj2YJfSl4w698s,7931
|
23
|
+
biofiles-0.1.1.dist-info/licenses/LICENSE,sha256=CbR8ssdFyViKj25JAlMjIt1_FbiZ1tAC5t-uwUbxqak,1070
|
24
|
+
biofiles-0.1.1.dist-info/METADATA,sha256=8maXLM85bmqcLfjZ-f3H_LolE62pd1J7YWERqc__Nt0,3361
|
25
|
+
biofiles-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
26
|
+
biofiles-0.1.1.dist-info/top_level.txt,sha256=laFaFv8hpkI4U-Pgs0yBaAJXN2_CJKl7jb-m3-tGfSc,9
|
27
|
+
biofiles-0.1.1.dist-info/RECORD,,
|
@@ -1,101 +0,0 @@
|
|
1
|
-
"""Feature dialect for HAVANA+Ensembl .gtf files (e.g. T2T annotation)."""
|
2
|
-
|
3
|
-
from enum import StrEnum
|
4
|
-
|
5
|
-
from biofiles.types.feature_v2 import Feature, id_field, field, relation
|
6
|
-
|
7
|
-
|
8
|
-
class GeneType(StrEnum):
|
9
|
-
LNC_RNA = "lncRNA"
|
10
|
-
PROTEIN_CODING = "protein_coding"
|
11
|
-
|
12
|
-
|
13
|
-
class TranscriptType(StrEnum):
|
14
|
-
LNC_RNA = "lncRNA"
|
15
|
-
PROTEIN_CODING = "protein_coding"
|
16
|
-
|
17
|
-
|
18
|
-
transcript_gene, gene_transcripts = relation(source="gene_id")
|
19
|
-
exon_transcript, transcript_exons = relation(source="transcript_id")
|
20
|
-
exon_gene, _ = relation(source="gene_id")
|
21
|
-
cds_exon, exon_cds = relation(source="exon_id", one_to_one=True)
|
22
|
-
utr_transcript, transcript_utrs = relation(source="transcript_id")
|
23
|
-
utr_gene, _ = relation(source="gene_id")
|
24
|
-
five_prime_utr_transcript, transcript_five_prime_utr = relation(
|
25
|
-
source="transcript_id", one_to_one=True
|
26
|
-
)
|
27
|
-
five_prime_utr_gene, _ = relation(source="gene_id")
|
28
|
-
three_prime_utr_transcript, transcript_three_prime_utr = relation(
|
29
|
-
source="transcript_id", one_to_one=True
|
30
|
-
)
|
31
|
-
three_prime_utr_gene, _ = relation(source="gene_id")
|
32
|
-
start_codon_transcript, transcript_start_codon = relation(
|
33
|
-
source="transcript_id", one_to_one=True
|
34
|
-
)
|
35
|
-
start_codon_exon, _ = relation(source="exon_id", one_to_one=True)
|
36
|
-
stop_codon_transcript, transcript_stop_codon = relation(
|
37
|
-
source="transcript_id", one_to_one=True
|
38
|
-
)
|
39
|
-
stop_codon_exon, _ = relation(source="exon_id", one_to_one=True)
|
40
|
-
|
41
|
-
|
42
|
-
class Gene(Feature, type="gene"):
|
43
|
-
id: str = id_field(source="gene_id")
|
44
|
-
type: GeneType = field(source="gene_type")
|
45
|
-
name: str = field(source="gene_name")
|
46
|
-
transcripts: list["Transcript"] = gene_transcripts
|
47
|
-
|
48
|
-
|
49
|
-
class Transcript(Feature, type="transcript"):
|
50
|
-
id: str = id_field(source="transcript_id")
|
51
|
-
type: TranscriptType = field(source="transcript_type")
|
52
|
-
name: str = field(source="transcript_name")
|
53
|
-
gene: Gene = transcript_gene
|
54
|
-
exons: list["Exon"] = transcript_exons
|
55
|
-
five_prime_utr: "FivePrimeUTR | None" = transcript_five_prime_utr
|
56
|
-
three_prime_utr: "ThreePrimeUTR | None" = transcript_three_prime_utr
|
57
|
-
start_codon: "StartCodon | None" = transcript_start_codon
|
58
|
-
stop_codon: "StopCodon | None" = transcript_stop_codon
|
59
|
-
|
60
|
-
|
61
|
-
class Exon(Feature, type="exon"):
|
62
|
-
id: str = id_field(source="exon_id")
|
63
|
-
number: int = field(source="exon_number")
|
64
|
-
transcript: Transcript = exon_transcript
|
65
|
-
gene: Gene = exon_gene
|
66
|
-
cds: "CDS | None" = exon_cds
|
67
|
-
|
68
|
-
|
69
|
-
class CDS(Feature, type="cds"):
|
70
|
-
id: str = id_field(source="exon_id")
|
71
|
-
exon: Exon = cds_exon
|
72
|
-
|
73
|
-
|
74
|
-
class UTR(Feature, type="utr"):
|
75
|
-
id: str = id_field(source="transcript_id")
|
76
|
-
transcript: Transcript = utr_transcript
|
77
|
-
gene: Gene = utr_gene
|
78
|
-
|
79
|
-
|
80
|
-
class FivePrimeUTR(UTR, starts=five_prime_utr_transcript):
|
81
|
-
id: str = id_field(source="transcript_id")
|
82
|
-
transcript: Transcript = five_prime_utr_transcript
|
83
|
-
gene: Gene = five_prime_utr_gene
|
84
|
-
|
85
|
-
|
86
|
-
class ThreePrimeUTR(UTR, ends=three_prime_utr_transcript):
|
87
|
-
id: str = id_field(source="transcript_id")
|
88
|
-
transcript: Transcript = three_prime_utr_transcript
|
89
|
-
gene: Gene = three_prime_utr_gene
|
90
|
-
|
91
|
-
|
92
|
-
class StartCodon(Feature, type="start_codon"):
|
93
|
-
id: str = id_field(source="transcript_id")
|
94
|
-
transcript: Transcript = start_codon_transcript
|
95
|
-
exon: Exon = start_codon_exon
|
96
|
-
|
97
|
-
|
98
|
-
class StopCodon(Feature, type="stop_codon"):
|
99
|
-
id: str = id_field(source="transcript_id")
|
100
|
-
transcript: Transcript = stop_codon_transcript
|
101
|
-
exon: Exon = stop_codon_exon
|