biofiles 0.0.14__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biofiles/dialects/detector.py +74 -0
- biofiles/dialects/gencode.py +235 -0
- biofiles/dialects/genomic_base.py +25 -0
- biofiles/dialects/refseq.py +142 -0
- biofiles/dialects/stringtie.py +24 -0
- biofiles/gff.py +49 -44
- biofiles/gtf.py +34 -11
- biofiles/types/feature.py +268 -34
- biofiles/utility/cli.py +2 -1
- biofiles/utility/feature.py +180 -211
- {biofiles-0.0.14.dist-info → biofiles-0.1.1.dist-info}/METADATA +9 -4
- biofiles-0.1.1.dist-info/RECORD +27 -0
- biofiles/dialects/havana_ensembl.py +0 -101
- biofiles/types/feature_v2.py +0 -105
- biofiles/utility/feature_v2.py +0 -148
- biofiles-0.0.14.dist-info/RECORD +0 -25
- {biofiles-0.0.14.dist-info → biofiles-0.1.1.dist-info}/WHEEL +0 -0
- {biofiles-0.0.14.dist-info → biofiles-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {biofiles-0.0.14.dist-info → biofiles-0.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,74 @@
|
|
1
|
+
import sys
|
2
|
+
from itertools import islice
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
from biofiles.dialects.gencode import GENCODE_DIALECT
|
6
|
+
from biofiles.dialects.refseq import REFSEQ_DIALECT
|
7
|
+
from biofiles.dialects.stringtie import STRINGTIE_DIALECT
|
8
|
+
from biofiles.types.feature import Dialect
|
9
|
+
from biofiles.utility.feature import RawFeatureReader
|
10
|
+
|
11
|
+
|
12
|
+
class CantDetectDialect(Exception):
    """Raised when the dialect of a feature file cannot be determined."""
|
14
|
+
|
15
|
+
|
16
|
+
class DialectDetector:
    """Guess which annotation dialect a feature file uses.

    The detector looks at the first ``num_samples`` rows yielded by
    ``raw_reader`` and classifies each row by its lower-cased ``source``
    value. A dialect is reported when at least ``min_fraction`` of the sampled
    rows carry one of that dialect's characteristic sources.
    """

    # Lower-cased ``source`` values that count a row towards a dialect.
    _GENCODE_SOURCES = ("havana", "ensembl")
    _REFSEQ_SOURCES = ("bestrefseq", "bestrefseq%2cgnomon", "gnomon", "refseq")
    _STRINGTIE_SOURCES = ("stringtie",)

    def __init__(
        self,
        raw_reader: RawFeatureReader,
        num_samples: int = 1000,
        min_fraction: float = 0.9,
    ) -> None:
        """Store the sampling configuration.

        Args:
            raw_reader: Iterable of raw rows; each row must expose ``source``.
            num_samples: Maximum number of rows to inspect.
            min_fraction: Share of sampled rows (in ``(0, 1]``) that must match
                a dialect for it to be reported. Defaults to the previously
                hard-coded 0.9.

        Raises:
            ValueError: If ``min_fraction`` is outside ``(0, 1]``.
        """
        if not 0.0 < min_fraction <= 1.0:
            raise ValueError(f"min_fraction must be in (0, 1], got {min_fraction!r}")
        self._raw_reader = raw_reader
        self._num_samples = num_samples
        self._min_fraction = min_fraction

    def detect(self) -> Dialect:
        """Return the dialect matching the sampled rows.

        Raises:
            CantDetectDialect: If no dialect reaches the required share of
                rows; this includes the case where no rows were read at all.
        """
        gencode_rows = 0
        refseq_rows = 0
        stringtie_rows = 0
        total_rows = 0
        for fd in islice(self._raw_reader, self._num_samples):
            total_rows += 1
            source = fd.source.lower()
            if source in self._GENCODE_SOURCES:
                gencode_rows += 1
            elif source in self._REFSEQ_SOURCES:
                refseq_rows += 1
            elif source in self._STRINGTIE_SOURCES:
                stringtie_rows += 1

        # Checked in a fixed priority order; the "rows > 0" guard keeps an
        # empty sample from matching anything.
        candidates = (
            (gencode_rows, GENCODE_DIALECT),
            (refseq_rows, REFSEQ_DIALECT),
            (stringtie_rows, STRINGTIE_DIALECT),
        )
        for rows, dialect in candidates:
            if rows > 0 and rows >= self._min_fraction * total_rows:
                return dialect

        raise CantDetectDialect(
            f"of {total_rows} read rows {gencode_rows} look like GENCODE, "
            f"{refseq_rows} look like RefSeq, {stringtie_rows} look like StringTie"
        )
|
47
|
+
|
48
|
+
|
49
|
+
def detect_dialect(path: Path) -> Dialect:
    """Detect the dialect of the feature file at ``path``.

    The reader is chosen from the file extension (``.gtf``, ``.gff`` or
    ``.gff3``), compared case-insensitively so that e.g. ``ANNOTATION.GTF``
    is accepted as well.

    Raises:
        CantDetectDialect: If the extension is not recognised, or if the
            sampled rows do not match any known dialect.
    """
    suffix = path.suffix.lower()
    # Readers are imported lazily: biofiles.gff imports this module, so a
    # top-level import here would be circular (presumably the same holds for
    # biofiles.gtf).
    if suffix == ".gtf":
        from biofiles.gtf import RawGTFReader

        raw_reader = RawGTFReader(path)
    elif suffix in (".gff", ".gff3"):
        from biofiles.gff import RawGFFReader

        raw_reader = RawGFFReader(path)
    else:
        raise CantDetectDialect(f"unknown file extension {path.suffix}")
    # NOTE(review): the reader is never explicitly closed here; if it owns a
    # file handle, consider closing it after detection — confirm against
    # RawFeatureReader.
    detector = DialectDetector(raw_reader=raw_reader)
    return detector.detect()
|
62
|
+
|
63
|
+
|
64
|
+
if __name__ == "__main__":
    # Command-line entry point: print "<path>\t<dialect name>" for every
    # argument; exit with status 1 if any file could not be classified.
    any_failed = False
    for arg in sys.argv[1:]:
        target = Path(arg)
        try:
            detected = detect_dialect(target)
        except CantDetectDialect as exc:
            print(f"Failed to detect dialect for {target}: {exc}", file=sys.stderr)
            any_failed = True
        else:
            print(f"{target}\t{detected.name}")
    sys.exit(1 if any_failed else 0)
|
@@ -0,0 +1,235 @@
|
|
1
|
+
"""Feature dialect for GENCODE .gtf/.gff3 files."""
|
2
|
+
|
3
|
+
from enum import StrEnum
|
4
|
+
from types import NoneType
|
5
|
+
|
6
|
+
from biofiles.dialects.genomic_base import (
|
7
|
+
Gene as BaseGene,
|
8
|
+
Transcript as BaseTranscript,
|
9
|
+
Exon as BaseExon,
|
10
|
+
CDS as BaseCDS,
|
11
|
+
UTR as BaseUTR,
|
12
|
+
)
|
13
|
+
from biofiles.types.feature import (
|
14
|
+
Feature,
|
15
|
+
id_field,
|
16
|
+
field,
|
17
|
+
relation,
|
18
|
+
no_id_field,
|
19
|
+
Dialect,
|
20
|
+
)
|
21
|
+
|
22
|
+
|
23
|
+
class GeneType(StrEnum):
    """Gene biotypes used by the GENCODE dialect.

    Used as the type of ``Gene.type``, which is read from the ``gene_type``
    attribute.
    """

    ARTIFACT = "artifact"
    IG_C_GENE = "IG_C_gene"
    IG_C_PSEUDOGENE = "IG_C_pseudogene"
    IG_D_GENE = "IG_D_gene"
    IG_D_PSEUDOGENE = "IG_D_pseudogene"
    IG_J_GENE = "IG_J_gene"
    IG_J_PSEUDOGENE = "IG_J_pseudogene"
    IG_PSEUDOGENE = "IG_pseudogene"
    IG_V_GENE = "IG_V_gene"
    IG_V_PSEUDOGENE = "IG_V_pseudogene"
    LNCRNA = "lncRNA"
    MIRNA = "miRNA"
    MISC_RNA = "misc_RNA"
    MT_RRNA = "Mt_rRNA"
    MT_TRNA = "Mt_tRNA"
    PROCESSED_PSEUDOGENE = "processed_pseudogene"
    PROTEIN_CODING = "protein_coding"
    RIBOZYME = "ribozyme"
    RRNA = "rRNA"
    RRNA_PSEUDOGENE = "rRNA_pseudogene"
    SRNA = "sRNA"
    SCRNA = "scRNA"
    SCARNA = "scaRNA"
    SNRNA = "snRNA"
    SNORNA = "snoRNA"
    TEC = "TEC"
    TR_C_GENE = "TR_C_gene"
    TR_C_PSEUDOGENE = "TR_C_pseudogene"
    TR_D_GENE = "TR_D_gene"
    TR_D_PSEUDOGENE = "TR_D_pseudogene"
    TR_J_GENE = "TR_J_gene"
    TR_J_PSEUDOGENE = "TR_J_pseudogene"
    TR_V_GENE = "TR_V_gene"
    TR_V_PSEUDOGENE = "TR_V_pseudogene"
    TRANSCRIBED_PROCESSED_PSEUDOGENE = "transcribed_processed_pseudogene"
    TRANSCRIBED_UNITARY_PSEUDOGENE = "transcribed_unitary_pseudogene"
    TRANSCRIBED_UNPROCESSED_PSEUDOGENE = "transcribed_unprocessed_pseudogene"
    TRANSLATED_PROCESSED_PSEUDOGENE = "translated_processed_pseudogene"
    UNITARY_PSEUDOGENE = "unitary_pseudogene"
    UNPROCESSED_PSEUDOGENE = "unprocessed_pseudogene"
    VAULT_RNA = "vault_RNA"
|
65
|
+
|
66
|
+
|
67
|
+
class TranscriptType(StrEnum):
    """Transcript biotypes used by the GENCODE dialect.

    Used as the type of ``Transcript.type``, which is read from the
    ``transcript_type`` attribute. The first group of members repeats the
    gene biotypes; the second group lists values that occur only on
    transcripts.
    """

    ARTIFACT = "artifact"
    IG_C_GENE = "IG_C_gene"
    IG_C_PSEUDOGENE = "IG_C_pseudogene"
    IG_D_GENE = "IG_D_gene"
    IG_D_PSEUDOGENE = "IG_D_pseudogene"
    IG_J_GENE = "IG_J_gene"
    IG_J_PSEUDOGENE = "IG_J_pseudogene"
    IG_PSEUDOGENE = "IG_pseudogene"
    IG_V_GENE = "IG_V_gene"
    IG_V_PSEUDOGENE = "IG_V_pseudogene"
    LNCRNA = "lncRNA"
    MIRNA = "miRNA"
    MISC_RNA = "misc_RNA"
    MT_RRNA = "Mt_rRNA"
    MT_TRNA = "Mt_tRNA"
    PROCESSED_PSEUDOGENE = "processed_pseudogene"
    PROTEIN_CODING = "protein_coding"
    RIBOZYME = "ribozyme"
    RRNA = "rRNA"
    RRNA_PSEUDOGENE = "rRNA_pseudogene"
    SRNA = "sRNA"
    SCRNA = "scRNA"
    SCARNA = "scaRNA"
    SNRNA = "snRNA"
    SNORNA = "snoRNA"
    TEC = "TEC"
    TR_C_GENE = "TR_C_gene"
    TR_C_PSEUDOGENE = "TR_C_pseudogene"
    TR_D_GENE = "TR_D_gene"
    TR_D_PSEUDOGENE = "TR_D_pseudogene"
    TR_J_GENE = "TR_J_gene"
    TR_J_PSEUDOGENE = "TR_J_pseudogene"
    TR_V_GENE = "TR_V_gene"
    TR_V_PSEUDOGENE = "TR_V_pseudogene"
    TRANSCRIBED_PROCESSED_PSEUDOGENE = "transcribed_processed_pseudogene"
    TRANSCRIBED_UNITARY_PSEUDOGENE = "transcribed_unitary_pseudogene"
    TRANSCRIBED_UNPROCESSED_PSEUDOGENE = "transcribed_unprocessed_pseudogene"
    TRANSLATED_PROCESSED_PSEUDOGENE = "translated_processed_pseudogene"
    UNITARY_PSEUDOGENE = "unitary_pseudogene"
    UNPROCESSED_PSEUDOGENE = "unprocessed_pseudogene"
    VAULT_RNA = "vault_RNA"

    # Transcript-specific:
    NON_STOP_DECAY = "non_stop_decay"
    NONSENSE_MEDIATED_DECAY = "nonsense_mediated_decay"
    PROCESSED_TRANSCRIPT = "processed_transcript"
    PROTEIN_CODING_CDS_NOT_DEFINED = "protein_coding_CDS_not_defined"
    PROTEIN_CODING_LOF = "protein_coding_LoF"
    RETAINED_INTRON = "retained_intron"
|
117
|
+
|
118
|
+
|
119
|
+
# Relations between GENCODE feature types.
#
# Each relation() call is unpacked into two values. The first is assigned on
# the referring feature class (e.g. Transcript.gene) and the second, when it
# is kept, on the referred-to class (e.g. Gene.transcripts); "_" discards the
# second value where no reverse field is declared.
# NOTE(review): ``source`` presumably names the attribute(s) used to look up
# the related feature, and ``one_to_one=True`` presumably makes the reverse
# side a single optional feature instead of a list — confirm in
# biofiles.types.feature.
transcript_gene, gene_transcripts = relation(source="gene_id")
selenocysteine_gene, _ = relation(source="gene_id")
selenocysteine_transcript, _ = relation(source="transcript_id")
exon_transcript, transcript_exons = relation(source="transcript_id")
exon_gene, _ = relation(source="gene_id")
cds_exon, exon_cds = relation(source=("transcript_id", "exon_number"), one_to_one=True)
utr_transcript, transcript_utrs = relation(source="transcript_id")
utr_gene, _ = relation(source="gene_id")
five_prime_utr_transcript, transcript_five_prime_utr = relation(
    source="transcript_id", one_to_one=True
)
five_prime_utr_gene, _ = relation(source="gene_id")
three_prime_utr_transcript, transcript_three_prime_utr = relation(
    source="transcript_id", one_to_one=True
)
three_prime_utr_gene, _ = relation(source="gene_id")
start_codon_transcript, transcript_start_codon = relation(
    source="transcript_id", one_to_one=True
)
start_codon_exon, _ = relation(source=("transcript_id", "exon_number"), one_to_one=True)
stop_codon_transcript, transcript_stop_codon = relation(
    source="transcript_id", one_to_one=True
)
stop_codon_exon, _ = relation(source=("transcript_id", "exon_number"), one_to_one=True)
|
143
|
+
|
144
|
+
|
145
|
+
class Gene(BaseGene, type="gene"):
    """Gene feature of the GENCODE dialect (declared with ``type="gene"``)."""

    id: str = id_field(source="gene_id")
    type: GeneType = field(source="gene_type")
    name: str = field(source="gene_name")
    # Second value of the relation whose first value is Transcript.gene.
    transcripts: list["Transcript"] = gene_transcripts
    # Read from the "tag" attribute; default_factory=list is the fallback.
    tags: list[str] = field(source="tag", default_factory=list)
|
151
|
+
|
152
|
+
|
153
|
+
class Transcript(BaseTranscript, type="transcript"):
    """Transcript feature of the GENCODE dialect (``type="transcript"``)."""

    id: str = id_field(source="transcript_id")
    type: TranscriptType = field(source="transcript_type")
    name: str = field(source="transcript_name")
    gene: Gene = transcript_gene
    exons: list["Exon"] = transcript_exons
    utrs: list["UTR"] = transcript_utrs
    # Both codon relations are declared one_to_one and annotated as optional.
    start_codon: "StartCodon | None" = transcript_start_codon
    stop_codon: "StopCodon | None" = transcript_stop_codon
    # Read from the "tag" attribute; default_factory=list is the fallback.
    tags: list[str] = field(source="tag", default_factory=list)
|
163
|
+
|
164
|
+
|
165
|
+
class Selenocysteine(
    Feature, type=("selenocysteine", "stop_codon_redefined_as_selenocysteine")
):
    """Selenocysteine feature of the GENCODE dialect.

    Declared for two feature type names: ``selenocysteine`` and
    ``stop_codon_redefined_as_selenocysteine``.
    """

    # NOTE(review): annotated as ``str`` although no_id_field() is used; the
    # UTR classes in this module annotate the same construct as NoneType —
    # confirm which is intended.
    id: str = no_id_field()
    gene: Gene = selenocysteine_gene
    transcript: Transcript = selenocysteine_transcript
|
171
|
+
|
172
|
+
|
173
|
+
class Exon(BaseExon, type="exon"):
    """Exon feature of the GENCODE dialect (``type="exon"``).

    Identified by the pair of attributes (``transcript_id``, ``exon_number``).
    """

    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    number: int = field(source="exon_number")
    transcript: Transcript = exon_transcript
    gene: Gene = exon_gene
    cds: "CDS | None" = exon_cds
    tags: list[str] = field(source="tag", default_factory=list)

    @property
    def cdss(self) -> list["CDS"]:
        """Return this exon's CDS as a list with zero or one element.

        RefSeq exons may carry several CDS features, so the list form gives
        callers one shape for both dialects.
        """
        single = self.cds
        if single is None:
            return []
        return [single]
|
186
|
+
|
187
|
+
|
188
|
+
class CDS(BaseCDS, type="cds"):
    """CDS feature of the GENCODE dialect (``type="cds"``).

    Shares its identifier attributes (``transcript_id``, ``exon_number``)
    with the exon it is related to.
    """

    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    exon: Exon = cds_exon
|
191
|
+
|
192
|
+
|
193
|
+
class UTR(BaseUTR, type="utr"):
    """UTR feature of the GENCODE dialect (``type="utr"``); has no identifier."""

    id: NoneType = no_id_field()
    transcript: Transcript = utr_transcript
    gene: Gene = utr_gene
|
197
|
+
|
198
|
+
|
199
|
+
class FivePrimeUTR(UTR, type="five_prime_utr"):
    """5' UTR feature of the GENCODE dialect (``type="five_prime_utr"``).

    Redeclares the ``transcript`` and ``gene`` fields with its own relations.
    """

    id: NoneType = no_id_field()
    transcript: Transcript = five_prime_utr_transcript
    gene: Gene = five_prime_utr_gene
|
203
|
+
|
204
|
+
|
205
|
+
class ThreePrimeUTR(UTR, type="three_prime_utr"):
    """3' UTR feature of the GENCODE dialect (``type="three_prime_utr"``).

    Redeclares the ``transcript`` and ``gene`` fields with its own relations.
    """

    id: NoneType = no_id_field()
    transcript: Transcript = three_prime_utr_transcript
    gene: Gene = three_prime_utr_gene
|
209
|
+
|
210
|
+
|
211
|
+
class StartCodon(Feature, type="start_codon"):
    """Start codon feature of the GENCODE dialect (``type="start_codon"``)."""

    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    transcript: Transcript = start_codon_transcript
    exon: Exon = start_codon_exon
|
215
|
+
|
216
|
+
|
217
|
+
class StopCodon(Feature, type="stop_codon"):
    """Stop codon feature of the GENCODE dialect (``type="stop_codon"``)."""

    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    transcript: Transcript = stop_codon_transcript
    exon: Exon = stop_codon_exon
|
221
|
+
|
222
|
+
|
223
|
+
# All feature classes that make up the GENCODE dialect.
GENCODE_FEATURE_TYPES = [
    Gene,
    Transcript,
    Selenocysteine,
    Exon,
    CDS,
    UTR,
    FivePrimeUTR,
    ThreePrimeUTR,
    StartCodon,
    StopCodon,
]
# Dialect object returned by the dialect detector for GENCODE files.
GENCODE_DIALECT = Dialect(name="GENCODE", feature_types=GENCODE_FEATURE_TYPES)
|
@@ -0,0 +1,25 @@
|
|
1
|
+
from biofiles.types.feature import Feature
|
2
|
+
|
3
|
+
|
4
|
+
class Gene(Feature):
    """Dialect-independent gene interface: a type and a list of transcripts."""

    type: str
    transcripts: list["Transcript"]
|
7
|
+
|
8
|
+
|
9
|
+
class Transcript(Feature):
    """Dialect-independent transcript interface: type, owning gene and exons."""

    type: str
    gene: Gene
    exons: list["Exon"]
|
13
|
+
|
14
|
+
|
15
|
+
class Exon(Feature):
    """Dialect-independent exon interface: owning transcript and CDS list."""

    transcript: Transcript
    cdss: list["CDS"]
|
18
|
+
|
19
|
+
|
20
|
+
class CDS(Feature):
    """Dialect-independent CDS interface: the exon the CDS belongs to."""

    exon: Exon
|
22
|
+
|
23
|
+
|
24
|
+
class UTR(Feature):
    """Dialect-independent UTR interface: the transcript the UTR belongs to."""

    transcript: Transcript
|
@@ -0,0 +1,142 @@
|
|
1
|
+
"""Feature dialect for RefSeq .gtf/.gff3 files."""
|
2
|
+
|
3
|
+
from enum import StrEnum
|
4
|
+
|
5
|
+
from biofiles.dialects.genomic_base import (
|
6
|
+
Gene as BaseGene,
|
7
|
+
Transcript as BaseTranscript,
|
8
|
+
Exon as BaseExon,
|
9
|
+
CDS as BaseCDS,
|
10
|
+
)
|
11
|
+
from biofiles.types.feature import (
|
12
|
+
Feature,
|
13
|
+
id_field,
|
14
|
+
field,
|
15
|
+
relation,
|
16
|
+
no_id_field,
|
17
|
+
Dialect,
|
18
|
+
)
|
19
|
+
|
20
|
+
|
21
|
+
class GeneType(StrEnum):
    """Gene biotypes used by the RefSeq dialect.

    Used as the type of ``Gene.type``, which is read from the
    ``gene_biotype`` attribute.
    """

    ANTISENSE_RNA = "antisense_RNA"
    C_REGION = "C_region"
    C_REGION_PSEUDOGENE = "C_region_pseudogene"
    D_SEGMENT = "D_segment"
    D_SEGMENT_PSEUDOGENE = "D_segment_pseudogene"
    J_SEGMENT = "J_segment"
    J_SEGMENT_PSEUDOGENE = "J_segment_pseudogene"
    LNCRNA = "lncRNA"
    MIRNA = "miRNA"
    MISC_RNA = "misc_RNA"
    NCRNA = "ncRNA"
    NCRNA_PSEUDOGENE = "ncRNA_pseudogene"
    OTHER = "other"
    PROTEIN_CODING = "protein_coding"
    PSEUDOGENE = "pseudogene"
    RNASE_MRP_RNA = "RNase_MRP_RNA"
    RNASE_P_RNA = "RNase_P_RNA"
    RRNA = "rRNA"
    SCARNA = "scaRNA"
    SCRNA = "scRNA"
    SNORNA = "snoRNA"
    SNRNA = "snRNA"
    TELOMERASE_RNA = "telomerase_RNA"
    TRANSCRIBED_PSEUDOGENE = "transcribed_pseudogene"
    TRNA = "tRNA"
    VAULT_RNA = "vault_RNA"
    V_SEGMENT = "V_segment"
    V_SEGMENT_PSEUDOGENE = "V_segment_pseudogene"
    Y_RNA = "Y_RNA"
|
51
|
+
|
52
|
+
|
53
|
+
class TranscriptType(StrEnum):
    """Transcript biotypes used by the RefSeq dialect.

    Used as the type of ``Transcript.type``, which is read from the
    ``transcript_biotype`` attribute.
    """

    ANTISENSE_RNA = "antisense_RNA"
    C_GENE_SEGMENT = "C_gene_segment"
    D_GENE_SEGMENT = "D_gene_segment"
    J_GENE_SEGMENT = "J_gene_segment"
    LNC_RNA = "lnc_RNA"
    MIRNA = "miRNA"
    MRNA = "mRNA"
    PRIMARY_TRANSCRIPT = "primary_transcript"
    RNASE_MRP_RNA = "RNase_MRP_RNA"
    RNASE_P_RNA = "RNase_P_RNA"
    RRNA = "rRNA"
    SCARNA = "scaRNA"
    SCRNA = "scRNA"
    SNORNA = "snoRNA"
    SNRNA = "snRNA"
    TELOMERASE_RNA = "telomerase_RNA"
    TRANSCRIPT = "transcript"
    TRNA = "tRNA"
    VAULT_RNA = "vault_RNA"
    V_GENE_SEGMENT = "V_gene_segment"
    Y_RNA = "Y_RNA"
|
75
|
+
|
76
|
+
|
77
|
+
# Relations between RefSeq feature types.
#
# Each relation() call is unpacked into two values: the first is assigned on
# the referring feature class (e.g. Transcript.gene), the second on the
# referred-to class (e.g. Gene.transcripts); "_" discards an unused second
# value.
# The CDS -> exon relation is not declared one_to_one here, and Exon.cdss is
# annotated as a list accordingly.
transcript_gene, gene_transcripts = relation(source="gene_id")
exon_transcript, transcript_exons = relation(source="transcript_id")
exon_gene, _ = relation(source="gene_id")
cds_exon, exon_cds = relation(source=("transcript_id", "exon_number"))
start_codon_transcript, transcript_start_codon = relation(
    source="transcript_id", one_to_one=True
)
start_codon_exon, _ = relation(source=("transcript_id", "exon_number"), one_to_one=True)
stop_codon_transcript, transcript_stop_codon = relation(
    source="transcript_id", one_to_one=True
)
stop_codon_exon, _ = relation(source=("transcript_id", "exon_number"), one_to_one=True)
|
89
|
+
|
90
|
+
|
91
|
+
class Gene(BaseGene, type="gene"):
    """Gene feature of the RefSeq dialect (declared with ``type="gene"``)."""

    id: str = id_field(source="gene_id")
    type: GeneType = field(source="gene_biotype")
    # The gene name is read from the attribute called "gene".
    name: str = field(source="gene")
    # Read from "gene_synonym"; default_factory=list is the fallback.
    synonyms: list[str] = field(source="gene_synonym", default_factory=list)
    transcripts: list["Transcript"] = gene_transcripts
|
97
|
+
|
98
|
+
|
99
|
+
class Transcript(BaseTranscript, type="transcript"):
    """Transcript feature of the RefSeq dialect (``type="transcript"``)."""

    id: str = id_field(source="transcript_id")
    type: TranscriptType = field(source="transcript_biotype")
    # Optional: defaults to None when declared without a "product" attribute.
    product: str | None = field(source="product", default=None)
    gene: Gene = transcript_gene
    exons: list["Exon"] = transcript_exons
    # Both codon relations are declared one_to_one and annotated as optional.
    start_codon: "StartCodon | None" = transcript_start_codon
    stop_codon: "StopCodon | None" = transcript_stop_codon
|
107
|
+
|
108
|
+
|
109
|
+
class Exon(BaseExon, type="exon"):
    """Exon feature of the RefSeq dialect (``type="exon"``).

    Identified by the pair of attributes (``transcript_id``, ``exon_number``).
    """

    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    number: int = field(source="exon_number")
    transcript: Transcript = exon_transcript
    gene: Gene = exon_gene
    # A list: the CDS -> exon relation is not one_to_one in this dialect.
    cdss: list["CDS"] = exon_cds
|
115
|
+
|
116
|
+
|
117
|
+
class CDS(BaseCDS, type="cds"):
    """CDS feature of the RefSeq dialect (``type="cds"``)."""

    # NOTE(review): annotated as tuple[str, int] although no_id_field() is
    # used — confirm the intended annotation.
    id: tuple[str, int] = no_id_field()
    exon: Exon = cds_exon
|
120
|
+
|
121
|
+
|
122
|
+
class StartCodon(Feature, type="start_codon"):
    """Start codon feature of the RefSeq dialect (``type="start_codon"``)."""

    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    transcript: Transcript = start_codon_transcript
    exon: Exon = start_codon_exon
|
126
|
+
|
127
|
+
|
128
|
+
class StopCodon(Feature, type="stop_codon"):
    """Stop codon feature of the RefSeq dialect (``type="stop_codon"``)."""

    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    transcript: Transcript = stop_codon_transcript
    exon: Exon = stop_codon_exon
|
132
|
+
|
133
|
+
|
134
|
+
# All feature classes that make up the RefSeq dialect.
REFSEQ_FEATURE_TYPES = [
    Gene,
    Transcript,
    Exon,
    CDS,
    StartCodon,
    StopCodon,
]
# Dialect object returned by the dialect detector for RefSeq files.
REFSEQ_DIALECT = Dialect(name="RefSeq", feature_types=REFSEQ_FEATURE_TYPES)
|
@@ -0,0 +1,24 @@
|
|
1
|
+
"""Feature dialect for StringTie .gtf/.gff output."""
|
2
|
+
|
3
|
+
from biofiles.types.feature import Feature, relation, id_field, field, Dialect
|
4
|
+
|
5
|
+
# Exon -> transcript relation keyed by "transcript_id": the first value is
# assigned to Exon.transcript, the second to Transcript.exons.
exon_transcript, transcript_exons = relation(source="transcript_id")
|
6
|
+
|
7
|
+
|
8
|
+
class Transcript(Feature, type="transcript"):
    """Transcript feature of the StringTie dialect (``type="transcript"``)."""

    id: str = id_field(source="transcript_id")
    # Kept as a plain string; this dialect declares no gene feature class.
    gene_id: str = field(source="gene_id")
    exons: list["Exon"] = transcript_exons
    # Numeric values read from the "cov", "FPKM" and "TPM" attributes.
    coverage: float = field(source="cov")
    fpkm: float = field(source="FPKM")
    tpm: float = field(source="TPM")
|
15
|
+
|
16
|
+
|
17
|
+
class Exon(Feature, type="exon"):
    """Exon feature of the StringTie dialect (``type="exon"``).

    Identified by the pair of attributes (``transcript_id``, ``exon_number``).
    """

    id: tuple[str, int] = id_field(source=("transcript_id", "exon_number"))
    transcript: Transcript = exon_transcript
    # Read from the "cov" attribute.
    coverage: float = field(source="cov")
|
21
|
+
|
22
|
+
|
23
|
+
# All feature classes that make up the StringTie dialect.
STRINGTIE_FEATURE_TYPES = [Transcript, Exon]
# Dialect object returned by the dialect detector for StringTie files.
STRINGTIE_DIALECT = Dialect(name="StringTie", feature_types=STRINGTIE_FEATURE_TYPES)
|
biofiles/gff.py
CHANGED
@@ -3,15 +3,20 @@ from pathlib import Path
|
|
3
3
|
from typing import Iterator, cast, TextIO
|
4
4
|
|
5
5
|
from biofiles.common import Strand, Writer
|
6
|
+
from biofiles.dialects.detector import detect_dialect
|
7
|
+
from biofiles.dialects.genomic_base import Feature, Gene, Exon, UTR
|
6
8
|
from biofiles.utility.cli import parse_pipeline_args
|
7
|
-
from biofiles.utility.feature import
|
8
|
-
|
9
|
+
from biofiles.utility.feature import (
|
10
|
+
FeatureReader,
|
11
|
+
FeatureDraft,
|
12
|
+
RawFeatureReader,
|
13
|
+
)
|
9
14
|
|
10
|
-
__all__ = ["GFFReader", "GFF3Writer"]
|
15
|
+
__all__ = ["RawGFFReader", "GFFReader", "GFF3Writer"]
|
11
16
|
|
12
17
|
|
13
|
-
class
|
14
|
-
def __iter__(self) -> Iterator[
|
18
|
+
class RawGFFReader(RawFeatureReader):
|
19
|
+
def __iter__(self) -> Iterator[FeatureDraft]:
|
15
20
|
for line in self._input:
|
16
21
|
line = line.rstrip("\n")
|
17
22
|
if line.startswith(_VERSION_PREFIX):
|
@@ -24,60 +29,51 @@ class GFFReader(FeatureReader):
|
|
24
29
|
continue
|
25
30
|
raise ValueError(f"unexpected line {line!r}, expected version")
|
26
31
|
|
27
|
-
def _read_gff3(self) -> Iterator[
|
28
|
-
drafts = FeatureDrafts()
|
32
|
+
def _read_gff3(self) -> Iterator[FeatureDraft]:
|
29
33
|
idx = 0
|
30
|
-
for line in self._input:
|
34
|
+
for i, line in enumerate(self._input):
|
31
35
|
if line.startswith("#"):
|
32
36
|
continue
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
37
|
+
try:
|
38
|
+
line = line.rstrip("\n")
|
39
|
+
parts = line.split("\t", maxsplit=8)
|
40
|
+
if len(parts) != 9:
|
41
|
+
raise ValueError(f"unexpected line {line!r}, expected 9 columns")
|
42
|
+
(
|
43
|
+
sequence_id,
|
44
|
+
source,
|
45
|
+
type_,
|
46
|
+
start_str,
|
47
|
+
end_str,
|
48
|
+
score_str,
|
49
|
+
strand_str,
|
50
|
+
phase_str,
|
51
|
+
attributes_str,
|
52
|
+
) = parts
|
53
|
+
score = self._parse_score(line, score_str)
|
54
|
+
strand = self._parse_strand(line, strand_str)
|
55
|
+
phase = self._parse_phase(line, phase_str)
|
56
|
+
attributes = self._parse_attributes(line, attributes_str)
|
57
|
+
except Exception as exc:
|
58
|
+
raise ValueError(f"failed to parse line {i}: {exc}") from exc
|
52
59
|
|
53
60
|
parent_id = attributes.get("Parent", None)
|
54
|
-
|
55
|
-
# yield from self._finalize_drafts(drafts)
|
56
|
-
# drafts = _FeatureDrafts()
|
57
|
-
if parent_id is not None and parent_id not in drafts.by_id:
|
58
|
-
raise ValueError(
|
59
|
-
f"unexpected line {line!r}, parent ID not among recent feature IDs"
|
60
|
-
)
|
61
|
-
|
62
|
-
draft = FeatureDraft(
|
61
|
+
yield FeatureDraft(
|
63
62
|
idx=idx,
|
64
63
|
sequence_id=sequence_id,
|
65
64
|
source=source,
|
66
65
|
type_=type_,
|
67
66
|
start_original=int(start_str),
|
68
67
|
end_original=int(end_str),
|
68
|
+
start_c=int(start_str) - 1,
|
69
|
+
end_c=int(end_str),
|
69
70
|
score=score,
|
70
71
|
strand=strand,
|
71
72
|
phase=phase,
|
72
73
|
attributes=attributes,
|
73
74
|
)
|
74
|
-
drafts.add(draft)
|
75
75
|
idx += 1
|
76
76
|
|
77
|
-
# yield from self._finalize_drafts(drafts, self._streaming_window)
|
78
|
-
|
79
|
-
yield from self._finalize_drafts(drafts, None)
|
80
|
-
|
81
77
|
def _parse_score(self, line: str, score_str: str) -> float | None:
|
82
78
|
if score_str == ".":
|
83
79
|
return None
|
@@ -105,14 +101,22 @@ class GFFReader(FeatureReader):
|
|
105
101
|
f"unexpected line {line!r}, phase should be an integer or '.'"
|
106
102
|
) from exc
|
107
103
|
|
108
|
-
def _parse_attributes(
|
104
|
+
def _parse_attributes(
|
105
|
+
self, line: str, attributes_str: str
|
106
|
+
) -> dict[str, str | list[str]]:
|
109
107
|
return {
|
110
|
-
k: v
|
108
|
+
k: v.split(",") if "," in v else v
|
111
109
|
for part in attributes_str.strip(";").split(";")
|
112
110
|
for k, v in (part.split("=", 1),)
|
113
111
|
}
|
114
112
|
|
115
113
|
|
114
|
+
class GFFReader(FeatureReader):
|
115
|
+
|
116
|
+
def _make_raw_feature_reader(self) -> RawFeatureReader:
|
117
|
+
return RawGFFReader(self._input)
|
118
|
+
|
119
|
+
|
116
120
|
class GFF3Writer(Writer):
|
117
121
|
def __init__(self, output: TextIO | Path | str) -> None:
|
118
122
|
super().__init__(output)
|
@@ -147,7 +151,8 @@ if __name__ == "__main__":
|
|
147
151
|
pipeline.mapper = lambda f: print(old_mapper(f))
|
148
152
|
|
149
153
|
for path in pipeline.inputs:
|
150
|
-
|
154
|
+
dialect = detect_dialect(path)
|
155
|
+
with GFFReader(path, dialect=dialect) as r:
|
151
156
|
total_features = 0
|
152
157
|
annotated_genes = 0
|
153
158
|
annotated_exons = 0
|