biofiles 0.0.14__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {biofiles-0.0.14 → biofiles-0.1.1}/PKG-INFO +9 -4
  2. {biofiles-0.0.14 → biofiles-0.1.1}/README.md +8 -2
  3. biofiles-0.1.1/biofiles/dialects/detector.py +74 -0
  4. biofiles-0.1.1/biofiles/dialects/gencode.py +235 -0
  5. biofiles-0.1.1/biofiles/dialects/genomic_base.py +25 -0
  6. biofiles-0.1.1/biofiles/dialects/refseq.py +142 -0
  7. biofiles-0.1.1/biofiles/dialects/stringtie.py +24 -0
  8. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/gff.py +49 -44
  9. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/gtf.py +34 -11
  10. biofiles-0.1.1/biofiles/types/feature.py +305 -0
  11. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/utility/cli.py +2 -1
  12. biofiles-0.0.14/biofiles/utility/feature_v2.py → biofiles-0.1.1/biofiles/utility/feature.py +96 -28
  13. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles.egg-info/PKG-INFO +9 -4
  14. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles.egg-info/SOURCES.txt +6 -4
  15. {biofiles-0.0.14 → biofiles-0.1.1}/pyproject.toml +1 -2
  16. biofiles-0.0.14/biofiles/dialects/havana_ensembl.py +0 -101
  17. biofiles-0.0.14/biofiles/types/feature.py +0 -71
  18. biofiles-0.0.14/biofiles/types/feature_v2.py +0 -105
  19. biofiles-0.0.14/biofiles/utility/feature.py +0 -247
  20. {biofiles-0.0.14 → biofiles-0.1.1}/LICENSE +0 -0
  21. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/__init__.py +0 -0
  22. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/bam.py +0 -0
  23. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/common.py +0 -0
  24. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/dialects/__init__.py +0 -0
  25. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/fai.py +0 -0
  26. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/fasta.py +0 -0
  27. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/repeatmasker.py +0 -0
  28. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/types/__init__.py +0 -0
  29. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/types/alignment.py +0 -0
  30. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/types/repeat.py +0 -0
  31. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/types/sequence.py +0 -0
  32. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles/utility/__init__.py +0 -0
  33. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles.egg-info/dependency_links.txt +0 -0
  34. {biofiles-0.0.14 → biofiles-0.1.1}/biofiles.egg-info/top_level.txt +0 -0
  35. {biofiles-0.0.14 → biofiles-0.1.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biofiles
3
- Version: 0.0.14
3
+ Version: 0.1.1
4
4
  Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
5
  Author-email: Tigran Saluev <tigran@saluev.com>
6
6
  Maintainer-email: Tigran Saluev <tigran@saluev.com>
@@ -30,7 +30,6 @@ Project-URL: Homepage, https://github.com/Saluev/biofiles
30
30
  Classifier: Programming Language :: Python :: 3
31
31
  Classifier: License :: OSI Approved :: MIT License
32
32
  Classifier: Operating System :: OS Independent
33
- Classifier: Programming Language :: Python :: 3.10
34
33
  Classifier: Programming Language :: Python :: 3.11
35
34
  Classifier: Programming Language :: Python :: 3.12
36
35
  Requires-Python: >=3.10
@@ -84,14 +83,20 @@ Reading GFF genome annotations:
84
83
 
85
84
  ```python
86
85
  from biofiles.gff import GFFReader
87
- from biofiles.types.feature import Gene
86
+ from biofiles.dialects.gencode import GENCODE_DIALECT
87
+ from biofiles.dialects.genomic_base import Gene
88
88
 
89
- with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff") as r:
89
+ with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff", dialect=GENCODE_DIALECT) as r:
90
90
  for feature in r:
91
91
  if isinstance(feature, Gene):
92
92
  print(feature.name, len(feature.exons))
93
93
  ```
94
94
 
95
+ Currently three dialects are supported:
96
+ * `biofiles.dialects.gencode.GENCODE_DIALECT` for GENCODE genome annotation;
97
+ * `biofiles.dialects.refseq.REFSEQ_DIALECT` for RefSeq genome annotation;
98
+ * `biofiles.dialects.stringtie.STRINGTIE_DIALECT` for StringTie output files.
99
+
95
100
  ## License
96
101
 
97
102
  MIT license, see [License](LICENSE).
@@ -44,14 +44,20 @@ Reading GFF genome annotations:
44
44
 
45
45
  ```python
46
46
  from biofiles.gff import GFFReader
47
- from biofiles.types.feature import Gene
47
+ from biofiles.dialects.gencode import GENCODE_DIALECT
48
+ from biofiles.dialects.genomic_base import Gene
48
49
 
49
- with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff") as r:
50
+ with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff", dialect=GENCODE_DIALECT) as r:
50
51
  for feature in r:
51
52
  if isinstance(feature, Gene):
52
53
  print(feature.name, len(feature.exons))
53
54
  ```
54
55
 
56
+ Currently three dialects are supported:
57
+ * `biofiles.dialects.gencode.GENCODE_DIALECT` for GENCODE genome annotation;
58
+ * `biofiles.dialects.refseq.REFSEQ_DIALECT` for RefSeq genome annotation;
59
+ * `biofiles.dialects.stringtie.STRINGTIE_DIALECT` for StringTie output files.
60
+
55
61
  ## License
56
62
 
57
63
  MIT license, see [License](LICENSE).
@@ -0,0 +1,74 @@
1
+ import sys
2
+ from itertools import islice
3
+ from pathlib import Path
4
+
5
+ from biofiles.dialects.gencode import GENCODE_DIALECT
6
+ from biofiles.dialects.refseq import REFSEQ_DIALECT
7
+ from biofiles.dialects.stringtie import STRINGTIE_DIALECT
8
+ from biofiles.types.feature import Dialect
9
+ from biofiles.utility.feature import RawFeatureReader
10
+
11
+
12
class CantDetectDialect(Exception):
    """Raised when the annotation dialect of a feature file cannot be determined."""
14
+
15
+
16
class DialectDetector:
    """Guess the annotation dialect of a feature file from its ``source`` values.

    Up to ``num_samples`` rows are read from ``raw_reader``. Each row's
    lower-cased ``source`` is compared with the source names known for
    GENCODE, RefSeq and StringTie, and a dialect is chosen when at least
    ``min_fraction`` of the inspected rows match it.
    """

    # Lower-cased `source` values attributed to each dialect.
    _GENCODE_SOURCES = ("havana", "ensembl")
    # "bestrefseq%2cgnomon" is "BestRefSeq,Gnomon" with a percent-encoded comma.
    _REFSEQ_SOURCES = ("bestrefseq", "bestrefseq%2cgnomon", "gnomon", "refseq")
    _STRINGTIE_SOURCES = ("stringtie",)

    def __init__(
        self,
        raw_reader: RawFeatureReader,
        num_samples: int = 1000,
        min_fraction: float = 0.9,
    ) -> None:
        """Store the reader and sampling settings.

        Args:
            raw_reader: Iterable of raw feature rows; each row must expose a
                string ``source`` attribute.
            num_samples: Maximum number of rows to inspect.
            min_fraction: Share of the inspected rows (``0 < value <= 1``)
                that must match a dialect for it to be selected. The default
                of 0.9 is the threshold that used to be hard-coded.

        Raises:
            ValueError: If ``min_fraction`` is outside ``(0, 1]``.
        """
        if not 0 < min_fraction <= 1:
            raise ValueError(f"min_fraction must be in (0, 1], got {min_fraction}")
        self._raw_reader = raw_reader
        self._num_samples = num_samples
        self._min_fraction = min_fraction

    def detect(self) -> Dialect:
        """Return the dialect matching the sampled rows.

        Raises:
            CantDetectDialect: If no dialect reaches the required share of
                rows; this includes the case where no rows were read at all.
        """
        gencode_rows = 0
        refseq_rows = 0
        stringtie_rows = 0
        total_rows = 0
        for fd in islice(self._raw_reader, self._num_samples):
            total_rows += 1
            source = fd.source.lower()
            if source in self._GENCODE_SOURCES:
                gencode_rows += 1
            elif source in self._REFSEQ_SOURCES:
                refseq_rows += 1
            elif source in self._STRINGTIE_SOURCES:
                stringtie_rows += 1

        # Checked in a fixed order so the result stays deterministic even when
        # a low `min_fraction` lets more than one dialect qualify.
        candidates = (
            (gencode_rows, GENCODE_DIALECT),
            (refseq_rows, REFSEQ_DIALECT),
            (stringtie_rows, STRINGTIE_DIALECT),
        )
        required_rows = self._min_fraction * total_rows
        for matched_rows, dialect in candidates:
            # `matched_rows > 0` keeps an empty sample from matching anything.
            if matched_rows > 0 and matched_rows >= required_rows:
                return dialect

        raise CantDetectDialect(
            f"of {total_rows} read rows {gencode_rows} look like GENCODE, "
            f"{refseq_rows} look like RefSeq, {stringtie_rows} look like StringTie"
        )
47
+
48
+
49
def detect_dialect(path: Path) -> Dialect:
    """Detect the annotation dialect of the GTF/GFF file at ``path``.

    The raw reader is chosen from the file extension (``.gtf``, ``.gff`` or
    ``.gff3``, compared case-insensitively) and handed to ``DialectDetector``.

    Args:
        path: Location of the annotation file.

    Returns:
        The detected dialect.

    Raises:
        CantDetectDialect: If the extension is not recognised, or if the
            sampled rows do not match any known dialect.
    """
    # Lower-case the suffix so that e.g. "ANNOTATION.GTF" is accepted too.
    suffix = path.suffix.lower()
    # Readers are imported lazily — presumably to avoid an import cycle
    # between the reader modules and the dialects package; TODO confirm.
    if suffix == ".gtf":
        from biofiles.gtf import RawGTFReader

        raw_reader = RawGTFReader(path)
    elif suffix in (".gff", ".gff3"):
        from biofiles.gff import RawGFFReader

        raw_reader = RawGFFReader(path)
    else:
        raise CantDetectDialect(f"unknown file extension {path.suffix}")
    # NOTE(review): the reader is never explicitly closed here; if raw readers
    # own a file handle, consider closing it once detection is done.
    detector = DialectDetector(raw_reader=raw_reader)
    return detector.detect()
62
+
63
+
64
if __name__ == "__main__":
    # Command-line use: every argument is a file path. A detected dialect is
    # printed to stdout as "<path>\t<dialect name>"; failures are reported on
    # stderr and make the process exit with status 1.
    failures = 0
    for argument in sys.argv[1:]:
        candidate = Path(argument)
        try:
            detected = detect_dialect(candidate)
        except CantDetectDialect as exc:
            print(f"Failed to detect dialect for {candidate}: {exc}", file=sys.stderr)
            failures += 1
        else:
            print(f"{candidate}\t{detected.name}")
    sys.exit(1 if failures else 0)
@@ -0,0 +1,235 @@
1
+ """Feature dialect for GENCODE .gtf/.gff3 files."""
2
+
3
+ from enum import StrEnum
4
+ from types import NoneType
5
+
6
+ from biofiles.dialects.genomic_base import (
7
+ Gene as BaseGene,
8
+ Transcript as BaseTranscript,
9
+ Exon as BaseExon,
10
+ CDS as BaseCDS,
11
+ UTR as BaseUTR,
12
+ )
13
+ from biofiles.types.feature import (
14
+ Feature,
15
+ id_field,
16
+ field,
17
+ relation,
18
+ no_id_field,
19
+ Dialect,
20
+ )
21
+
22
+
23
class GeneType(StrEnum):
    """Gene biotypes used for the ``gene_type`` attribute of GENCODE genes.

    Member values keep the mixed-case spelling (e.g. ``"lncRNA"``,
    ``"Mt_rRNA"``), so lookups by value are case-sensitive.
    """

    ARTIFACT = "artifact"
    IG_C_GENE = "IG_C_gene"
    IG_C_PSEUDOGENE = "IG_C_pseudogene"
    IG_D_GENE = "IG_D_gene"
    IG_D_PSEUDOGENE = "IG_D_pseudogene"
    IG_J_GENE = "IG_J_gene"
    IG_J_PSEUDOGENE = "IG_J_pseudogene"
    IG_PSEUDOGENE = "IG_pseudogene"
    IG_V_GENE = "IG_V_gene"
    IG_V_PSEUDOGENE = "IG_V_pseudogene"
    LNCRNA = "lncRNA"
    MIRNA = "miRNA"
    MISC_RNA = "misc_RNA"
    MT_RRNA = "Mt_rRNA"
    MT_TRNA = "Mt_tRNA"
    PROCESSED_PSEUDOGENE = "processed_pseudogene"
    PROTEIN_CODING = "protein_coding"
    RIBOZYME = "ribozyme"
    RRNA = "rRNA"
    RRNA_PSEUDOGENE = "rRNA_pseudogene"
    SRNA = "sRNA"
    SCRNA = "scRNA"
    SCARNA = "scaRNA"
    SNRNA = "snRNA"
    SNORNA = "snoRNA"
    TEC = "TEC"
    TR_C_GENE = "TR_C_gene"
    TR_C_PSEUDOGENE = "TR_C_pseudogene"
    TR_D_GENE = "TR_D_gene"
    TR_D_PSEUDOGENE = "TR_D_pseudogene"
    TR_J_GENE = "TR_J_gene"
    TR_J_PSEUDOGENE = "TR_J_pseudogene"
    TR_V_GENE = "TR_V_gene"
    TR_V_PSEUDOGENE = "TR_V_pseudogene"
    TRANSCRIBED_PROCESSED_PSEUDOGENE = "transcribed_processed_pseudogene"
    TRANSCRIBED_UNITARY_PSEUDOGENE = "transcribed_unitary_pseudogene"
    TRANSCRIBED_UNPROCESSED_PSEUDOGENE = "transcribed_unprocessed_pseudogene"
    TRANSLATED_PROCESSED_PSEUDOGENE = "translated_processed_pseudogene"
    UNITARY_PSEUDOGENE = "unitary_pseudogene"
    UNPROCESSED_PSEUDOGENE = "unprocessed_pseudogene"
    VAULT_RNA = "vault_RNA"
65
+
66
+
67
class TranscriptType(StrEnum):
    """Transcript biotypes used for the ``transcript_type`` attribute of GENCODE transcripts.

    The first group repeats every ``GeneType`` value; the second group adds
    biotypes that are only declared for transcripts.
    """

    # Shared with GeneType:
    ARTIFACT = "artifact"
    IG_C_GENE = "IG_C_gene"
    IG_C_PSEUDOGENE = "IG_C_pseudogene"
    IG_D_GENE = "IG_D_gene"
    IG_D_PSEUDOGENE = "IG_D_pseudogene"
    IG_J_GENE = "IG_J_gene"
    IG_J_PSEUDOGENE = "IG_J_pseudogene"
    IG_PSEUDOGENE = "IG_pseudogene"
    IG_V_GENE = "IG_V_gene"
    IG_V_PSEUDOGENE = "IG_V_pseudogene"
    LNCRNA = "lncRNA"
    MIRNA = "miRNA"
    MISC_RNA = "misc_RNA"
    MT_RRNA = "Mt_rRNA"
    MT_TRNA = "Mt_tRNA"
    PROCESSED_PSEUDOGENE = "processed_pseudogene"
    PROTEIN_CODING = "protein_coding"
    RIBOZYME = "ribozyme"
    RRNA = "rRNA"
    RRNA_PSEUDOGENE = "rRNA_pseudogene"
    SRNA = "sRNA"
    SCRNA = "scRNA"
    SCARNA = "scaRNA"
    SNRNA = "snRNA"
    SNORNA = "snoRNA"
    TEC = "TEC"
    TR_C_GENE = "TR_C_gene"
    TR_C_PSEUDOGENE = "TR_C_pseudogene"
    TR_D_GENE = "TR_D_gene"
    TR_D_PSEUDOGENE = "TR_D_pseudogene"
    TR_J_GENE = "TR_J_gene"
    TR_J_PSEUDOGENE = "TR_J_pseudogene"
    TR_V_GENE = "TR_V_gene"
    TR_V_PSEUDOGENE = "TR_V_pseudogene"
    TRANSCRIBED_PROCESSED_PSEUDOGENE = "transcribed_processed_pseudogene"
    TRANSCRIBED_UNITARY_PSEUDOGENE = "transcribed_unitary_pseudogene"
    TRANSCRIBED_UNPROCESSED_PSEUDOGENE = "transcribed_unprocessed_pseudogene"
    TRANSLATED_PROCESSED_PSEUDOGENE = "translated_processed_pseudogene"
    UNITARY_PSEUDOGENE = "unitary_pseudogene"
    UNPROCESSED_PSEUDOGENE = "unprocessed_pseudogene"
    VAULT_RNA = "vault_RNA"

    # Transcript-specific:
    NON_STOP_DECAY = "non_stop_decay"
    NONSENSE_MEDIATED_DECAY = "nonsense_mediated_decay"
    PROCESSED_TRANSCRIPT = "processed_transcript"
    PROTEIN_CODING_CDS_NOT_DEFINED = "protein_coding_CDS_not_defined"
    PROTEIN_CODING_LOF = "protein_coding_LoF"
    RETAINED_INTRON = "retained_intron"
117
+
118
+
119
# Relations between GENCODE feature classes.
#
# Each relation() call yields a pair of objects: the first is assigned to an
# attribute of the referring feature class (e.g. Transcript.gene), the second
# to the matching attribute of the referred-to class (e.g. Gene.transcripts).
# The second one is bound to `_` when no reverse attribute is declared.
# `source` names the attribute(s) that hold the referred-to feature's id —
# presumably read from the GTF/GFF attributes column; confirm in
# biofiles.types.feature.
# NOTE(review): in this module the reverse side of a one_to_one=True relation
# is annotated `X | None`, and the reverse side of the others `list[X]`.
transcript_gene, gene_transcripts = relation(source="gene_id")
selenocysteine_gene, _ = relation(source="gene_id")
selenocysteine_transcript, _ = relation(source="transcript_id")
exon_transcript, transcript_exons = relation(source="transcript_id")
exon_gene, _ = relation(source="gene_id")
cds_exon, exon_cds = relation(source=("transcript_id", "exon_number"), one_to_one=True)
utr_transcript, transcript_utrs = relation(source="transcript_id")
utr_gene, _ = relation(source="gene_id")
five_prime_utr_transcript, transcript_five_prime_utr = relation(
    source="transcript_id", one_to_one=True
)
five_prime_utr_gene, _ = relation(source="gene_id")
three_prime_utr_transcript, transcript_three_prime_utr = relation(
    source="transcript_id", one_to_one=True
)
three_prime_utr_gene, _ = relation(source="gene_id")
start_codon_transcript, transcript_start_codon = relation(
    source="transcript_id", one_to_one=True
)
start_codon_exon, _ = relation(source=("transcript_id", "exon_number"), one_to_one=True)
stop_codon_transcript, transcript_stop_codon = relation(
    source="transcript_id", one_to_one=True
)
stop_codon_exon, _ = relation(source=("transcript_id", "exon_number"), one_to_one=True)
143
+
144
+
145
class Gene(BaseGene, type="gene"):
    """GENCODE gene feature.

    The ``type="gene"`` class keyword presumably selects which feature-type
    string in the file maps to this class — confirm in biofiles.types.feature.
    """

    # Identifier, biotype and display name come from the `gene_id`,
    # `gene_type` and `gene_name` attributes respectively.
    id: str = id_field(source="gene_id")
    type: GeneType = field(source="gene_type")
    name: str = field(source="gene_name")
    # Reverse side of the transcript -> gene relation.
    transcripts: list["Transcript"] = gene_transcripts
    # Values of the `tag` attribute; an empty list when the attribute is absent.
    tags: list[str] = field(source="tag", default_factory=list)
151
+
152
+
153
class Transcript(BaseTranscript, type="transcript"):
    """GENCODE transcript feature, linked to its gene, exons, UTRs and codons."""

    # Identifier, biotype and display name come from the `transcript_id`,
    # `transcript_type` and `transcript_name` attributes respectively.
    id: str = id_field(source="transcript_id")
    type: TranscriptType = field(source="transcript_type")
    name: str = field(source="transcript_name")
    # Parent gene, resolved through the `gene_id` relation.
    gene: Gene = transcript_gene
    # Reverse sides of the exon -> transcript and UTR -> transcript relations.
    exons: list["Exon"] = transcript_exons
    utrs: list["UTR"] = transcript_utrs
    # Reverse sides of one-to-one relations; None when the transcript has no
    # such codon feature.
    start_codon: "StartCodon | None" = transcript_start_codon
    stop_codon: "StopCodon | None" = transcript_stop_codon
    # Values of the `tag` attribute; an empty list when the attribute is absent.
    tags: list[str] = field(source="tag", default_factory=list)
163
+
164
+
165
class Selenocysteine(
    Feature, type=("selenocysteine", "stop_codon_redefined_as_selenocysteine")
):
    """GENCODE selenocysteine feature, registered under two feature-type names."""

    # NOTE(review): annotated `str` although it uses no_id_field(); the UTR
    # classes in this module annotate the same construct as NoneType — confirm
    # which annotation is intended.
    id: str = no_id_field()
    # Owning gene and transcript, resolved through `gene_id` / `transcript_id`.
    gene: Gene = selenocysteine_gene
    transcript: Transcript = selenocysteine_transcript
171
+
172
+
173
class Exon(BaseExon, type="exon"):
    """GENCODE exon feature, identified by its transcript id and exon number."""

    # Composite identifier built from `transcript_id` and `exon_number`.
    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    number: int = field(source="exon_number")
    # Owning transcript and gene, resolved through their id attributes.
    transcript: Transcript = exon_transcript
    gene: Gene = exon_gene
    # Reverse side of the one-to-one CDS -> exon relation; None when the exon
    # has no CDS.
    cds: "CDS | None" = exon_cds
    # Values of the `tag` attribute; an empty list when the attribute is absent.
    tags: list[str] = field(source="tag", default_factory=list)

    @property
    def cdss(self) -> list["CDS"]:
        """Return the exon's CDS wrapped in a list (empty when there is none)."""
        # In RefSeq, exon can have multiple CDS.
        # This property is for compatibility with a more general case.
        return [self.cds] if self.cds is not None else []
186
+
187
+
188
class CDS(BaseCDS, type="cds"):
    """GENCODE coding-sequence feature, tied one-to-one to an exon."""

    # Shares the (`transcript_id`, `exon_number`) key with its exon.
    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    exon: Exon = cds_exon
191
+
192
+
193
class UTR(BaseUTR, type="utr"):
    """GENCODE untranslated-region feature; declared without an identifier."""

    id: NoneType = no_id_field()
    # Owning transcript and gene, resolved through their id attributes.
    transcript: Transcript = utr_transcript
    gene: Gene = utr_gene
197
+
198
+
199
class FivePrimeUTR(UTR, type="five_prime_utr"):
    """GENCODE 5' UTR feature; redeclares the UTR fields with its own relations."""

    id: NoneType = no_id_field()
    # One-to-one with the transcript, unlike the generic UTR relation.
    transcript: Transcript = five_prime_utr_transcript
    gene: Gene = five_prime_utr_gene
203
+
204
+
205
class ThreePrimeUTR(UTR, type="three_prime_utr"):
    """GENCODE 3' UTR feature; redeclares the UTR fields with its own relations."""

    id: NoneType = no_id_field()
    # One-to-one with the transcript, unlike the generic UTR relation.
    transcript: Transcript = three_prime_utr_transcript
    gene: Gene = three_prime_utr_gene
209
+
210
+
211
class StartCodon(Feature, type="start_codon"):
    """GENCODE start-codon feature, linked to its transcript and exon."""

    # Composite identifier built from `transcript_id` and `exon_number`.
    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    transcript: Transcript = start_codon_transcript
    exon: Exon = start_codon_exon
215
+
216
+
217
class StopCodon(Feature, type="stop_codon"):
    """GENCODE stop-codon feature, linked to its transcript and exon."""

    # Composite identifier built from `transcript_id` and `exon_number`.
    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    transcript: Transcript = stop_codon_transcript
    exon: Exon = stop_codon_exon
221
+
222
+
223
# Every feature class that makes up the GENCODE dialect.
# NOTE(review): whether the order of this list matters to Dialect is not
# visible here — keep it unchanged unless confirmed.
GENCODE_FEATURE_TYPES = [
    Gene,
    Transcript,
    Selenocysteine,
    Exon,
    CDS,
    UTR,
    FivePrimeUTR,
    ThreePrimeUTR,
    StartCodon,
    StopCodon,
]
# Dialect object to pass as `dialect=` when reading GENCODE annotation files.
GENCODE_DIALECT = Dialect(name="GENCODE", feature_types=GENCODE_FEATURE_TYPES)
@@ -0,0 +1,25 @@
1
+ from biofiles.types.feature import Feature
2
+
3
+
4
class Gene(Feature):
    """Dialect-independent gene base class.

    Declares the attributes that dialect-specific gene classes are expected
    to provide.
    """

    type: str
    transcripts: list["Transcript"]
7
+
8
+
9
class Transcript(Feature):
    """Dialect-independent transcript base class.

    Declares the attributes that dialect-specific transcript classes are
    expected to provide.
    """

    type: str
    gene: Gene
    exons: list["Exon"]
13
+
14
+
15
class Exon(Feature):
    """Dialect-independent exon base class.

    ``cdss`` is declared as a list so that dialects allowing several CDS per
    exon and dialects allowing at most one can share the same interface.
    """

    transcript: Transcript
    cdss: list["CDS"]
18
+
19
+
20
class CDS(Feature):
    """Dialect-independent coding-sequence base class, attached to an exon."""

    exon: Exon
22
+
23
+
24
class UTR(Feature):
    """Dialect-independent untranslated-region base class, attached to a transcript."""

    transcript: Transcript
@@ -0,0 +1,142 @@
1
+ """Feature dialect for RefSeq .gtf/.gff3 files."""
2
+
3
+ from enum import StrEnum
4
+
5
+ from biofiles.dialects.genomic_base import (
6
+ Gene as BaseGene,
7
+ Transcript as BaseTranscript,
8
+ Exon as BaseExon,
9
+ CDS as BaseCDS,
10
+ )
11
+ from biofiles.types.feature import (
12
+ Feature,
13
+ id_field,
14
+ field,
15
+ relation,
16
+ no_id_field,
17
+ Dialect,
18
+ )
19
+
20
+
21
class GeneType(StrEnum):
    """Gene biotypes used for the ``gene_biotype`` attribute of RefSeq genes.

    Member values keep the mixed-case spelling (e.g. ``"lncRNA"``,
    ``"RNase_P_RNA"``), so lookups by value are case-sensitive.
    """

    ANTISENSE_RNA = "antisense_RNA"
    C_REGION = "C_region"
    C_REGION_PSEUDOGENE = "C_region_pseudogene"
    D_SEGMENT = "D_segment"
    D_SEGMENT_PSEUDOGENE = "D_segment_pseudogene"
    J_SEGMENT = "J_segment"
    J_SEGMENT_PSEUDOGENE = "J_segment_pseudogene"
    LNCRNA = "lncRNA"
    MIRNA = "miRNA"
    MISC_RNA = "misc_RNA"
    NCRNA = "ncRNA"
    NCRNA_PSEUDOGENE = "ncRNA_pseudogene"
    OTHER = "other"
    PROTEIN_CODING = "protein_coding"
    PSEUDOGENE = "pseudogene"
    RNASE_MRP_RNA = "RNase_MRP_RNA"
    RNASE_P_RNA = "RNase_P_RNA"
    RRNA = "rRNA"
    SCARNA = "scaRNA"
    SCRNA = "scRNA"
    SNORNA = "snoRNA"
    SNRNA = "snRNA"
    TELOMERASE_RNA = "telomerase_RNA"
    TRANSCRIBED_PSEUDOGENE = "transcribed_pseudogene"
    TRNA = "tRNA"
    VAULT_RNA = "vault_RNA"
    V_SEGMENT = "V_segment"
    V_SEGMENT_PSEUDOGENE = "V_segment_pseudogene"
    Y_RNA = "Y_RNA"
51
+
52
+
53
class TranscriptType(StrEnum):
    """Transcript biotypes used for the ``transcript_biotype`` attribute of RefSeq transcripts.

    Member values keep the mixed-case spelling (e.g. ``"lnc_RNA"``,
    ``"mRNA"``), so lookups by value are case-sensitive.
    """

    ANTISENSE_RNA = "antisense_RNA"
    C_GENE_SEGMENT = "C_gene_segment"
    D_GENE_SEGMENT = "D_gene_segment"
    J_GENE_SEGMENT = "J_gene_segment"
    LNC_RNA = "lnc_RNA"
    MIRNA = "miRNA"
    MRNA = "mRNA"
    PRIMARY_TRANSCRIPT = "primary_transcript"
    RNASE_MRP_RNA = "RNase_MRP_RNA"
    RNASE_P_RNA = "RNase_P_RNA"
    RRNA = "rRNA"
    SCARNA = "scaRNA"
    SCRNA = "scRNA"
    SNORNA = "snoRNA"
    SNRNA = "snRNA"
    TELOMERASE_RNA = "telomerase_RNA"
    TRANSCRIPT = "transcript"
    TRNA = "tRNA"
    VAULT_RNA = "vault_RNA"
    V_GENE_SEGMENT = "V_gene_segment"
    Y_RNA = "Y_RNA"
75
+
76
+
77
# Relations between RefSeq feature classes.
#
# Each relation() call yields a pair of objects: the first is assigned to an
# attribute of the referring feature class (e.g. Transcript.gene), the second
# to the matching attribute of the referred-to class (e.g. Gene.transcripts).
# The second one is bound to `_` when no reverse attribute is declared.
# `source` names the attribute(s) that hold the referred-to feature's id —
# presumably read from the GTF/GFF attributes column; confirm in
# biofiles.types.feature.
transcript_gene, gene_transcripts = relation(source="gene_id")
exon_transcript, transcript_exons = relation(source="transcript_id")
exon_gene, _ = relation(source="gene_id")
# Not one_to_one: the reverse side backs Exon.cdss, a list of CDS features.
cds_exon, exon_cds = relation(source=("transcript_id", "exon_number"))
start_codon_transcript, transcript_start_codon = relation(
    source="transcript_id", one_to_one=True
)
start_codon_exon, _ = relation(source=("transcript_id", "exon_number"), one_to_one=True)
stop_codon_transcript, transcript_stop_codon = relation(
    source="transcript_id", one_to_one=True
)
stop_codon_exon, _ = relation(source=("transcript_id", "exon_number"), one_to_one=True)
89
+
90
+
91
class Gene(BaseGene, type="gene"):
    """RefSeq gene feature.

    The ``type="gene"`` class keyword presumably selects which feature-type
    string in the file maps to this class — confirm in biofiles.types.feature.
    """

    # Identifier, biotype and display name come from the `gene_id`,
    # `gene_biotype` and `gene` attributes respectively.
    id: str = id_field(source="gene_id")
    type: GeneType = field(source="gene_biotype")
    name: str = field(source="gene")
    # Values of the `gene_synonym` attribute; an empty list when absent.
    synonyms: list[str] = field(source="gene_synonym", default_factory=list)
    # Reverse side of the transcript -> gene relation.
    transcripts: list["Transcript"] = gene_transcripts
97
+
98
+
99
class Transcript(BaseTranscript, type="transcript"):
    """RefSeq transcript feature, linked to its gene, exons and codons."""

    # Identifier and biotype come from the `transcript_id` and
    # `transcript_biotype` attributes.
    id: str = id_field(source="transcript_id")
    type: TranscriptType = field(source="transcript_biotype")
    # Value of the `product` attribute; None when the attribute is absent.
    product: str | None = field(source="product", default=None)
    # Parent gene, resolved through the `gene_id` relation.
    gene: Gene = transcript_gene
    # Reverse side of the exon -> transcript relation.
    exons: list["Exon"] = transcript_exons
    # Reverse sides of one-to-one relations; None when the transcript has no
    # such codon feature.
    start_codon: "StartCodon | None" = transcript_start_codon
    stop_codon: "StopCodon | None" = transcript_stop_codon
107
+
108
+
109
class Exon(BaseExon, type="exon"):
    """RefSeq exon feature, identified by its transcript id and exon number."""

    # Composite identifier built from `transcript_id` and `exon_number`.
    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    number: int = field(source="exon_number")
    # Owning transcript and gene, resolved through their id attributes.
    transcript: Transcript = exon_transcript
    gene: Gene = exon_gene
    # Reverse side of the CDS -> exon relation; a list because that relation
    # is not declared one-to-one in this dialect.
    cdss: list["CDS"] = exon_cds
115
+
116
+
117
class CDS(BaseCDS, type="cds"):
    """RefSeq coding-sequence feature, attached to an exon."""

    # NOTE(review): annotated `tuple[str, int]` although it uses no_id_field();
    # confirm whether the annotation should describe an absent id instead.
    id: tuple[str, int] = no_id_field()
    exon: Exon = cds_exon
120
+
121
+
122
class StartCodon(Feature, type="start_codon"):
    """RefSeq start-codon feature, linked to its transcript and exon."""

    # Composite identifier built from `transcript_id` and `exon_number`.
    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    transcript: Transcript = start_codon_transcript
    exon: Exon = start_codon_exon
126
+
127
+
128
class StopCodon(Feature, type="stop_codon"):
    """RefSeq stop-codon feature, linked to its transcript and exon."""

    # Composite identifier built from `transcript_id` and `exon_number`.
    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    transcript: Transcript = stop_codon_transcript
    exon: Exon = stop_codon_exon
132
+
133
+
134
# Every feature class that makes up the RefSeq dialect.
# NOTE(review): whether the order of this list matters to Dialect is not
# visible here — keep it unchanged unless confirmed.
REFSEQ_FEATURE_TYPES = [
    Gene,
    Transcript,
    Exon,
    CDS,
    StartCodon,
    StopCodon,
]
# Dialect object to pass as `dialect=` when reading RefSeq annotation files.
REFSEQ_DIALECT = Dialect(name="RefSeq", feature_types=REFSEQ_FEATURE_TYPES)
@@ -0,0 +1,24 @@
1
+ """Feature dialect for StringTie .gtf/.gff output."""
2
+
3
+ from biofiles.types.feature import Feature, relation, id_field, field, Dialect
4
+
5
# Exon -> transcript relation keyed by `transcript_id`: the first object backs
# Exon.transcript, the second backs the reverse attribute Transcript.exons.
exon_transcript, transcript_exons = relation(source="transcript_id")
6
+
7
+
8
class Transcript(Feature, type="transcript"):
    """StringTie transcript feature with its expression estimates."""

    id: str = id_field(source="transcript_id")
    # Kept as a plain string: this dialect declares no gene feature class.
    gene_id: str = field(source="gene_id")
    # Reverse side of the exon -> transcript relation.
    exons: list["Exon"] = transcript_exons
    # Read from the `cov`, `FPKM` and `TPM` attributes respectively.
    coverage: float = field(source="cov")
    fpkm: float = field(source="FPKM")
    tpm: float = field(source="TPM")
15
+
16
+
17
class Exon(Feature, type="exon"):
    """StringTie exon feature, identified by its transcript id and exon number."""

    # Composite identifier built from `transcript_id` and `exon_number`.
    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    transcript: Transcript = exon_transcript
    # Read from the `cov` attribute.
    coverage: float = field(source="cov")
21
+
22
+
23
# Every feature class that makes up the StringTie dialect.
STRINGTIE_FEATURE_TYPES = [Transcript, Exon]
# Dialect object to pass as `dialect=` when reading StringTie output files.
STRINGTIE_DIALECT = Dialect(name="StringTie", feature_types=STRINGTIE_FEATURE_TYPES)