biofiles 0.0.14__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,16 @@
1
- from collections import deque
1
+ from collections import deque, defaultdict
2
2
  from dataclasses import dataclass, field
3
3
  from pathlib import Path
4
- from typing import Iterator, TextIO, Type, TypeVar, cast
4
+ from typing import Any, Iterator, TextIO, Type
5
5
 
6
- from biofiles.common import Reader, Strand
6
+ from biofiles.common import Strand, Reader
7
7
  from biofiles.types.feature import (
8
8
  Feature,
9
- Gene,
10
- ThreePrimeUTR,
11
- Exon,
12
- UTR,
13
- Transcript,
14
- CDS,
9
+ FeatureMetaclass,
10
+ Relation,
11
+ Source,
12
+ get_composite_field,
13
+ Dialect,
15
14
  )
16
15
 
17
16
 
@@ -23,225 +22,195 @@ class FeatureDraft:
23
22
  type_: str
24
23
  start_original: int
25
24
  end_original: int
25
+ start_c: int
26
+ end_c: int
26
27
  score: float | None
27
28
  strand: Strand | None
28
29
  phase: int | None
29
- attributes: dict[str, str]
30
+ attributes: dict[str, str | list[str]]
30
31
 
31
- def pick_attribute(self, *keys: str) -> str | None:
32
- for key in keys:
33
- if (value := self.attributes.get(key, None)) is not None:
34
- return value
35
- return None
32
+ class_: Type[Feature] | None = None
33
+ id: Any = None
34
+ finalized: Feature | None = None
36
35
 
37
36
 
38
- @dataclass
39
- class FeatureDrafts:
40
- drafts: deque[FeatureDraft] = field(default_factory=deque)
41
- by_id: dict[str, FeatureDraft] = field(default_factory=dict)
42
- # deps: dict[int, deque[int]] = field(default_factory=lambda: defaultdict(deque))
37
+ class FeatureTypes:
38
+ ambiguous_type_mapping: dict[str, list[FeatureMetaclass]]
39
+ unique_type_mapping: dict[str, FeatureMetaclass]
43
40
 
44
- def add(self, draft: FeatureDraft) -> None:
45
- self.drafts.append(draft)
46
- if id_ := draft.attributes.get("ID", None):
47
- self.by_id[id_] = draft
48
- # if parent_id := draft.attributes.get("Parent", None):
49
- # parent = self.by_id[parent_id]
50
- # self.deps[parent.idx].append(draft.idx)
41
+ def __init__(self, feature_types: list[FeatureMetaclass]) -> None:
42
+ self.ambiguous_type_mapping = defaultdict(list)
43
+ self.unique_type_mapping = {}
51
44
 
52
- # def remove_first_n(self, n: int) -> None:
53
- # for _ in range(n):
54
- # draft = self.drafts.popleft()
55
- # if id_ := draft.attributes.get("ID", None):
56
- # del self.by_id[id_]
57
- # self.deps.pop(draft.idx, None)
45
+ for ft in feature_types:
46
+ for type in ft.__filter_type__:
47
+ self.ambiguous_type_mapping[type].append(ft)
58
48
 
49
+ for key, fts in [*self.ambiguous_type_mapping.items()]:
50
+ if len(fts) == 1:
51
+ self.unique_type_mapping[key] = fts[0]
52
+ del self.ambiguous_type_mapping[key]
53
+ continue
54
+ self.ambiguous_type_mapping[key] = _sort_by_filter_specificity(fts)
59
55
 
60
- @dataclass
61
- class Features:
62
- features: list[Feature] = field(default_factory=list)
63
- by_id: dict[str, Feature] = field(default_factory=dict)
64
56
 
65
- def add(self, feature: Feature):
66
- self.features.append(feature)
67
- if id_ := feature.id:
68
- self.by_id[id_] = feature
57
+ def _sort_by_filter_specificity(fts: list[FeatureMetaclass]) -> list[FeatureMetaclass]:
58
+ """Sort feature classes by their filter specificity, most specific -> least specific."""
59
+ key = lambda ft: bool(ft.__filter_starts__) + bool(ft.__filter_ends__)
60
+ return sorted(fts, key=key, reverse=True)
61
+
62
+
63
+ @dataclass
64
+ class FeatureDrafts:
65
+ feature_types: FeatureTypes
66
+ drafts: list[FeatureDraft] = field(default_factory=deque)
67
+ by_class_and_id: dict[tuple[type, Any], FeatureDraft] = field(default_factory=dict)
69
68
 
69
+ def add(self, draft: FeatureDraft) -> None:
70
+ self.drafts.append(draft)
71
+ if class_ := self.feature_types.unique_type_mapping.get(draft.type_.lower()):
72
+ draft.class_ = class_
73
+ draft.id = get_composite_field(
74
+ draft.attributes, class_.__id_attribute_source__
75
+ )
76
+ self.register(draft)
77
+
78
+ def register(self, draft: FeatureDraft) -> None:
79
+ if draft.id is None:
80
+ return
81
+ if (key := (draft.class_, draft.id)) in self.by_class_and_id:
82
+ raise ValueError(
83
+ f"duplicate feature ID {draft.id} for class {draft.class_.__name__}"
84
+ )
85
+ self.by_class_and_id[key] = draft
86
+
87
+
88
+ class RawFeatureReader(Reader):
89
+ def __init__(self, input_: TextIO | Path) -> None:
90
+ super().__init__(input_)
70
91
 
71
- FeatureT = TypeVar("FeatureT", bound=Feature)
72
- GeneT = TypeVar("GeneT", bound=Gene)
73
- TranscriptT = TypeVar("TranscriptT", bound=Transcript)
74
- UTRT = TypeVar("UTRT", bound=UTR)
92
+ def __iter__(self) -> Iterator[FeatureDraft]:
93
+ raise NotImplementedError
75
94
 
76
95
 
77
96
  class FeatureReader(Reader):
78
- def __init__(
79
- self, input_: TextIO | Path | str, /, streaming_window: int | None = 1000
80
- ):
97
+
98
+ def __init__(self, input_: TextIO | Path | str, dialect: Dialect) -> None:
81
99
  super().__init__(input_)
82
- self._streaming_window = streaming_window
100
+ self._feature_types = FeatureTypes(dialect.feature_types)
101
+ self._raw_reader = self._make_raw_feature_reader()
83
102
 
84
- def __iter__(self) -> Iterator[Feature]:
103
+ def _make_raw_feature_reader(self) -> RawFeatureReader:
85
104
  raise NotImplementedError
86
105
 
87
- def _finalize_drafts(
88
- self, drafts: FeatureDrafts, w: int | None
89
- ) -> Iterator[Feature]:
90
- # TODO streaming version!
91
- # code below is already tracking
92
- # if not drafts.drafts:
93
- # return
94
- # if w is not None and len(drafts.drafts) <= w:
95
- # return
96
- #
97
- # end_idx = drafts.drafts[-w].idx if w is not None else drafts.drafts[-1].idx
98
- #
99
- # i = 0
100
- # while i < len(drafts.drafts) and (
101
- # not drafts.deps[drafts.drafts[i].idx]
102
- # or drafts.deps[drafts.drafts[i].idx][-1] <= end_idx
103
- # ):
104
- # i += 1
105
- #
106
- # print(f"FINALIZING {i} DRAFTS OUT OF {len(drafts.drafts)}")
107
- #
108
- # result = _Features()
109
- # for j in range(i):
110
- # draft = drafts.drafts[j]
111
- # feature = self._finalize_draft(draft, result)
112
- # result.add(feature)
113
- # drafts.remove_first_n(i)
114
- # yield from result.features
115
-
116
- result = Features()
117
- for draft in drafts.drafts:
118
- feature = self._finalize_draft(draft, result)
119
- result.add(feature)
120
- yield from result.features
121
-
122
- def _finalize_draft(self, draft: FeatureDraft, result: Features) -> Feature:
123
- match draft.type_.lower():
124
- case "gene" | "ncrna_gene":
125
- feature = self._finalize_gene(draft, result, Gene)
126
- case "transcript" | "mrna" | "lnc_rna":
127
- feature = self._finalize_transcript(draft, result, Transcript)
128
- case "exon":
129
- feature = self._finalize_exon(draft, result)
130
- case "cds":
131
- feature = self._finalize_cds(draft, result)
132
- case "three_prime_utr":
133
- feature = self._finalize_utr(draft, result, ThreePrimeUTR)
134
- case "utr":
135
- feature = self._finalize_utr(draft, result, UTR)
136
- case _:
137
- feature = self._finalize_other(draft, result)
138
- if feature.parent:
139
- new_children = feature.parent.children + (feature,)
140
- object.__setattr__(feature.parent, "children", new_children)
141
- return feature
142
-
143
- def _finalize_gene(
144
- self, draft: FeatureDraft, result: Features, type_: Type[GeneT]
145
- ) -> Feature:
146
- feature = self._finalize_other(draft, result)
147
- name = draft.pick_attribute("gene_name", "Name")
148
- biotype = draft.pick_attribute("gene_biotype", "biotype", "gene_type")
149
- if name is None or biotype is None:
150
- return feature
151
- return type_(**feature.__dict__, name=name, biotype=biotype, transcripts=())
152
-
153
- def _finalize_transcript(
154
- self, draft: FeatureDraft, result: Features, type_: Type[TranscriptT]
155
- ) -> Feature:
156
- feature = self._finalize_other(draft, result)
157
- if not (gene := self._find_ancestor_of_type(feature, Gene)):
158
- return feature
159
- transcript = type_(**feature.__dict__, gene=gene, exons=())
160
- object.__setattr__(gene, "transcripts", gene.transcripts + (transcript,))
161
- return transcript
162
-
163
- def _finalize_exon(self, draft: FeatureDraft, result: Features) -> Feature:
164
- feature = self._finalize_other(draft, result)
165
- if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
166
- return feature
167
- exon = Exon(
168
- **feature.__dict__, gene=transcript.gene, transcript=transcript, cds=None
169
- )
170
- object.__setattr__(transcript, "exons", transcript.exons + (exon,))
171
- return exon
172
-
173
- def _finalize_cds(self, draft: FeatureDraft, result: Features) -> Feature:
174
- feature = self._finalize_other(draft, result)
175
- if not (exon := self._find_ancestor_of_type(feature, Exon)):
176
- return feature
177
- cds = CDS(
178
- **feature.__dict__,
179
- exon=exon,
180
- transcript=exon.transcript,
181
- gene=exon.transcript.gene,
182
- )
183
- object.__setattr__(exon, "cds", cds)
184
- return cds
185
-
186
- def _finalize_utr(
187
- self, draft: FeatureDraft, result: Features, type_: Type[UTRT]
188
- ) -> Feature:
189
- feature = self._finalize_other(draft, result)
190
- if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
191
- return feature
192
- return type_(**feature.__dict__, gene=transcript.gene, transcript=transcript)
193
-
194
- def _find_ancestor_of_type(
195
- self, feature: Feature, t: Type[FeatureT]
196
- ) -> FeatureT | None:
197
- ancestor = feature.parent
198
- while ancestor and not isinstance(ancestor, t):
199
- ancestor = ancestor.parent
200
- return cast(FeatureT | None, ancestor)
201
-
202
- def _finalize_other(self, draft: FeatureDraft, result: Features) -> Feature:
203
- parent_id = self._extract_parent_id(draft)
204
- parent = result.by_id.get(parent_id) if parent_id is not None else None
205
-
206
- return Feature(
207
- sequence_id=draft.sequence_id,
208
- source=draft.source,
209
- type_=draft.type_,
210
- start_original=draft.start_original,
211
- end_original=draft.end_original,
212
- start_c=draft.start_original - 1,
213
- end_c=draft.end_original,
214
- score=draft.score,
215
- strand=draft.strand,
216
- phase=draft.phase,
217
- attributes=draft.attributes,
218
- id=self._extract_id(draft),
219
- parent=parent,
220
- children=(),
221
- )
222
-
223
- def _extract_id(self, draft: FeatureDraft) -> str | None:
224
- if (id_ := draft.attributes.get("ID")) is not None:
225
- return id_
226
- if draft.type_ == "gene" and (id_ := draft.attributes.get("gene_id")):
227
- return id_
228
- if draft.type_ == "transcript" and (
229
- id_ := draft.attributes.get("transcript_id")
230
- ):
231
- return id_
232
- if draft.type_ == "exon" and (id_ := draft.attributes.get("exon_id")):
233
- return id_
234
- return None
235
-
236
- def _extract_parent_id(self, draft: FeatureDraft) -> str | None:
237
- if (id_ := draft.attributes.get("Parent")) is not None:
238
- return id_
239
- if draft.type_ == "transcript" and (id_ := draft.attributes.get("gene_id")):
240
- return id_
241
- if draft.type_ in ("exon", "UTR", "three_prime_UTR", "five_prime_UTR") and (
242
- id_ := draft.attributes.get("transcript_id")
243
- ):
244
- return id_
245
- if draft.type_.lower() == "cds" and (id_ := draft.attributes.get("exon_id")):
246
- return id_
247
- return None
106
+ def __iter__(self) -> Iterator[Feature]:
107
+ fds = FeatureDrafts(self._feature_types)
108
+ for draft in self._raw_reader:
109
+ fds.add(draft)
110
+ yield from self._finalize_drafts(fds)
111
+
112
+ def _finalize_drafts(self, fds: FeatureDrafts) -> Iterator[Feature]:
113
+ self._choose_classes(fds)
114
+ self._instantiate_objects(fds)
115
+ self._fill_relations(fds)
116
+ for fd in fds.drafts:
117
+ yield fd.finalized
118
+
119
+ def _choose_classes(self, fds: FeatureDrafts) -> None:
120
+ for fd in fds.drafts:
121
+ if fd.class_:
122
+ continue
123
+
124
+ fts = self._feature_types.ambiguous_type_mapping[fd.type_]
125
+ matching_fts = [ft for ft in fts if self._check_filters(fds, fd, ft)]
126
+ if not matching_fts:
127
+ raise ValueError(
128
+ f"no matching classes (out of {len(fts)}) for "
129
+ f"feature with type {fd.type_!r}, attributes {fd.attributes!r}"
130
+ )
131
+ if len(matching_fts) > 1:
132
+ raise ValueError(
133
+ f"too many matching classes ({len(matching_fts)}) for "
134
+ f"feature with type {fd.type_!r}, attributes {fd.attributes!r}"
135
+ )
136
+ ft = matching_fts[0]
137
+ fd.class_ = ft
138
+ fd.id = get_composite_field(fd.attributes, ft.__id_attribute_source__)
139
+ fds.register(fd)
140
+
141
+ def _instantiate_objects(self, fds: FeatureDrafts) -> None:
142
+ for fd in fds.drafts:
143
+ fd.finalized = fd.class_(
144
+ sequence_id=fd.sequence_id,
145
+ source=fd.source,
146
+ type_=fd.type_,
147
+ start_original=fd.start_original,
148
+ end_original=fd.end_original,
149
+ start_c=fd.start_c,
150
+ end_c=fd.end_c,
151
+ score=fd.score,
152
+ strand=fd.strand,
153
+ phase=fd.phase,
154
+ attributes=fd.attributes,
155
+ )
156
+
157
+ def _fill_relations(self, fds: FeatureDrafts) -> None:
158
+ for fd in fds.drafts:
159
+ for relation in fd.class_.__relations__:
160
+ related_id = get_composite_field(
161
+ fd.attributes, relation.id_attribute_source
162
+ )
163
+ related_class = relation.inverse.class_
164
+ try:
165
+ related_fd = fds.by_class_and_id[related_class, related_id]
166
+ except KeyError as exc:
167
+ raise ValueError(
168
+ f"can't find related {related_class.__name__} {related_id} for {fd.finalized}"
169
+ ) from exc
170
+ setattr(fd.finalized, relation.attribute_name, related_fd.finalized)
171
+ if relation.inverse.attribute_name is None:
172
+ pass
173
+ elif relation.inverse.one_to_one:
174
+ setattr(
175
+ related_fd.finalized,
176
+ relation.inverse.attribute_name,
177
+ fd.finalized,
178
+ )
179
+ else:
180
+ getattr(
181
+ related_fd.finalized, relation.inverse.attribute_name
182
+ ).append(fd.finalized)
183
+
184
+ def _check_filters(
185
+ self, fds: FeatureDrafts, fd: FeatureDraft, ft: FeatureMetaclass
186
+ ) -> bool:
187
+ if r := ft.__filter_starts__:
188
+ related_fd = self._get_related_feature_draft(fds, fd, r)
189
+ if fd.strand != related_fd.strand:
190
+ return False
191
+ if fd.strand == "+" and fd.start_original != related_fd.start_original:
192
+ return False
193
+ if fd.strand == "-" and fd.end_original != related_fd.end_original:
194
+ return False
195
+ if r := ft.__filter_ends__:
196
+ related_fd = self._get_related_feature_draft(fds, fd, r)
197
+ if fd.strand != related_fd.strand:
198
+ return False
199
+ if fd.strand == "+" and fd.end_original != related_fd.end_original:
200
+ return False
201
+ if fd.strand == "-" and fd.start_original != related_fd.start_original:
202
+ return False
203
+ return True
204
+
205
+ def _get_related_feature_draft(
206
+ self, fds: FeatureDrafts, fd: FeatureDraft, r: Relation
207
+ ) -> FeatureDraft:
208
+ related_class = r.inverse.class_
209
+ related_id = fd.attributes[r.id_attribute_source]
210
+ try:
211
+ return fds.by_class_and_id[related_class, related_id]
212
+ except KeyError as exc:
213
+ raise ValueError(
214
+ f"can't find related {related_class.__name__} for "
215
+ f"{fd.class_.__name__} with attributes {fd.attributes!r}"
216
+ ) from exc
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biofiles
3
- Version: 0.0.14
3
+ Version: 0.1.2
4
4
  Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
5
  Author-email: Tigran Saluev <tigran@saluev.com>
6
6
  Maintainer-email: Tigran Saluev <tigran@saluev.com>
@@ -30,7 +30,6 @@ Project-URL: Homepage, https://github.com/Saluev/biofiles
30
30
  Classifier: Programming Language :: Python :: 3
31
31
  Classifier: License :: OSI Approved :: MIT License
32
32
  Classifier: Operating System :: OS Independent
33
- Classifier: Programming Language :: Python :: 3.10
34
33
  Classifier: Programming Language :: Python :: 3.11
35
34
  Classifier: Programming Language :: Python :: 3.12
36
35
  Requires-Python: >=3.10
@@ -84,14 +83,20 @@ Reading GFF genome annotations:
84
83
 
85
84
  ```python
86
85
  from biofiles.gff import GFFReader
87
- from biofiles.types.feature import Gene
86
+ from biofiles.dialects.gencode import GENCODE_DIALECT
87
+ from biofiles.dialects.genomic_base import Gene
88
88
 
89
- with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff") as r:
89
+ with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff", dialect=GENCODE_DIALECT) as r:
90
90
  for feature in r:
91
91
  if isinstance(feature, Gene):
92
92
  print(feature.name, len(feature.exons))
93
93
  ```
94
94
 
95
+ Currently three dialects are supported:
96
+ * `biofiles.dialects.gencode.GENCODE_DIALECT` for GENCODE genome annotation;
97
+ * `biofiles.dialects.refseq.REFSEQ_DIALECT` for RefSeq genome annotation;
98
+ * `biofiles.dialects.stringtie.STRINGTIE_DIALECT` for StringTie output files.
99
+
95
100
  ## License
96
101
 
97
102
  MIT license, see [License](LICENSE).
@@ -0,0 +1,27 @@
1
+ biofiles/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ biofiles/bam.py,sha256=w32LLOAuKWdGF7joTSrB4HYXCdfvvijZW44jizG36R8,6771
3
+ biofiles/common.py,sha256=Yi0i85FpD2wR3vqL645LTUAE6TybGDxxZQsUmEGHqu4,1126
4
+ biofiles/fai.py,sha256=gG2oDmaU7PIIYYBc8LiudaeKdo-6WIdwsMDIM7qi098,678
5
+ biofiles/fasta.py,sha256=ctIt5I_fcZx-xQN921zpmlZS7e9_ICf-3_i6mTs5qbs,2135
6
+ biofiles/gff.py,sha256=X1VK6QTPq0_w5jR2zwyjHcs9SE339yh3QGpwF5hg-T8,6357
7
+ biofiles/gtf.py,sha256=wCtHzLTCqR-oAGPtiRDJ4GvyS_Z5FzmLLzw8lgTj5Ig,3490
8
+ biofiles/repeatmasker.py,sha256=7KObXELCHQ6oBkO8yK6Znrs6MX8sfVuxNSmOMe0Ogfk,3289
9
+ biofiles/dialects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ biofiles/dialects/detector.py,sha256=OP42NhQf3s609fmDaRRFEMawgf0VMZdeAfkbK6IL4yM,2461
11
+ biofiles/dialects/gencode.py,sha256=oxTKmcipalgdhy_eNhCqRd5Rdchz8Pn1SXlQHlg6YuM,7730
12
+ biofiles/dialects/genomic_base.py,sha256=mw46OgVW1TjsZ-RnH_Nnfs2bJIqJiaPV8d-MaAbjPSQ,362
13
+ biofiles/dialects/refseq.py,sha256=ZjTpm-AOLs2jQRU9G4V11uiAAtrJop1OF-HyDIr-IrI,4217
14
+ biofiles/dialects/stringtie.py,sha256=kuQ6IWRo7c4xkNjN-4jjYy3_dRUPorMaffYvDYvQDD8,834
15
+ biofiles/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ biofiles/types/alignment.py,sha256=Kc0XteLyfj1gNJNLsUgzSKzAAoMobhkJyPFsovaU7dM,1696
17
+ biofiles/types/feature.py,sha256=r4zJlwKcO8LSy9EfRsLfrDcLyVLMH1uGGX1-ZNnQLOc,11580
18
+ biofiles/types/repeat.py,sha256=63SqzAwEGIDIGP9pxC85RUdwXbbSm0S5WNL3lSiWlmc,641
19
+ biofiles/types/sequence.py,sha256=XeJ3wgi8AwRaVYVKmf41y5mOmWQfdsS8ysaRLZWbNoQ,254
20
+ biofiles/utility/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
+ biofiles/utility/cli.py,sha256=i2kYpPFnpTYdbd_6T-U9-egitKhDGQVDbw5gme4Kelg,4074
22
+ biofiles/utility/feature.py,sha256=bHUOfYir_Dfk_DEBrlO1GXvkJKDRkpj2YJfSl4w698s,7931
23
+ biofiles-0.1.2.dist-info/licenses/LICENSE,sha256=CbR8ssdFyViKj25JAlMjIt1_FbiZ1tAC5t-uwUbxqak,1070
24
+ biofiles-0.1.2.dist-info/METADATA,sha256=FecyQR9RV1AgLstbkbARgEbSqzsaP2LwMY03f7gZqiA,3361
25
+ biofiles-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
26
+ biofiles-0.1.2.dist-info/top_level.txt,sha256=laFaFv8hpkI4U-Pgs0yBaAJXN2_CJKl7jb-m3-tGfSc,9
27
+ biofiles-0.1.2.dist-info/RECORD,,
@@ -1,101 +0,0 @@
1
- """Feature dialect for HAVANA+Ensembl .gtf files (e.g. T2T annotation)."""
2
-
3
- from enum import StrEnum
4
-
5
- from biofiles.types.feature_v2 import Feature, id_field, field, relation
6
-
7
-
8
- class GeneType(StrEnum):
9
- LNC_RNA = "lncRNA"
10
- PROTEIN_CODING = "protein_coding"
11
-
12
-
13
- class TranscriptType(StrEnum):
14
- LNC_RNA = "lncRNA"
15
- PROTEIN_CODING = "protein_coding"
16
-
17
-
18
- transcript_gene, gene_transcripts = relation(source="gene_id")
19
- exon_transcript, transcript_exons = relation(source="transcript_id")
20
- exon_gene, _ = relation(source="gene_id")
21
- cds_exon, exon_cds = relation(source="exon_id", one_to_one=True)
22
- utr_transcript, transcript_utrs = relation(source="transcript_id")
23
- utr_gene, _ = relation(source="gene_id")
24
- five_prime_utr_transcript, transcript_five_prime_utr = relation(
25
- source="transcript_id", one_to_one=True
26
- )
27
- five_prime_utr_gene, _ = relation(source="gene_id")
28
- three_prime_utr_transcript, transcript_three_prime_utr = relation(
29
- source="transcript_id", one_to_one=True
30
- )
31
- three_prime_utr_gene, _ = relation(source="gene_id")
32
- start_codon_transcript, transcript_start_codon = relation(
33
- source="transcript_id", one_to_one=True
34
- )
35
- start_codon_exon, _ = relation(source="exon_id", one_to_one=True)
36
- stop_codon_transcript, transcript_stop_codon = relation(
37
- source="transcript_id", one_to_one=True
38
- )
39
- stop_codon_exon, _ = relation(source="exon_id", one_to_one=True)
40
-
41
-
42
- class Gene(Feature, type="gene"):
43
- id: str = id_field(source="gene_id")
44
- type: GeneType = field(source="gene_type")
45
- name: str = field(source="gene_name")
46
- transcripts: list["Transcript"] = gene_transcripts
47
-
48
-
49
- class Transcript(Feature, type="transcript"):
50
- id: str = id_field(source="transcript_id")
51
- type: TranscriptType = field(source="transcript_type")
52
- name: str = field(source="transcript_name")
53
- gene: Gene = transcript_gene
54
- exons: list["Exon"] = transcript_exons
55
- five_prime_utr: "FivePrimeUTR | None" = transcript_five_prime_utr
56
- three_prime_utr: "ThreePrimeUTR | None" = transcript_three_prime_utr
57
- start_codon: "StartCodon | None" = transcript_start_codon
58
- stop_codon: "StopCodon | None" = transcript_stop_codon
59
-
60
-
61
- class Exon(Feature, type="exon"):
62
- id: str = id_field(source="exon_id")
63
- number: int = field(source="exon_number")
64
- transcript: Transcript = exon_transcript
65
- gene: Gene = exon_gene
66
- cds: "CDS | None" = exon_cds
67
-
68
-
69
- class CDS(Feature, type="cds"):
70
- id: str = id_field(source="exon_id")
71
- exon: Exon = cds_exon
72
-
73
-
74
- class UTR(Feature, type="utr"):
75
- id: str = id_field(source="transcript_id")
76
- transcript: Transcript = utr_transcript
77
- gene: Gene = utr_gene
78
-
79
-
80
- class FivePrimeUTR(UTR, starts=five_prime_utr_transcript):
81
- id: str = id_field(source="transcript_id")
82
- transcript: Transcript = five_prime_utr_transcript
83
- gene: Gene = five_prime_utr_gene
84
-
85
-
86
- class ThreePrimeUTR(UTR, ends=three_prime_utr_transcript):
87
- id: str = id_field(source="transcript_id")
88
- transcript: Transcript = three_prime_utr_transcript
89
- gene: Gene = three_prime_utr_gene
90
-
91
-
92
- class StartCodon(Feature, type="start_codon"):
93
- id: str = id_field(source="transcript_id")
94
- transcript: Transcript = start_codon_transcript
95
- exon: Exon = start_codon_exon
96
-
97
-
98
- class StopCodon(Feature, type="stop_codon"):
99
- id: str = id_field(source="transcript_id")
100
- transcript: Transcript = stop_codon_transcript
101
- exon: Exon = stop_codon_exon