biofiles 0.0.11__tar.gz → 0.0.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biofiles-0.0.11 → biofiles-0.0.13}/PKG-INFO +3 -2
- biofiles-0.0.13/biofiles/dialects/havana_ensembl.py +101 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/gtf.py +6 -1
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/types/alignment.py +9 -1
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/types/feature.py +8 -0
- biofiles-0.0.13/biofiles/types/feature_v2.py +105 -0
- biofiles-0.0.13/biofiles/utility/__init__.py +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/utility/feature.py +24 -2
- biofiles-0.0.13/biofiles/utility/feature_v2.py +148 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles.egg-info/PKG-INFO +3 -2
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles.egg-info/SOURCES.txt +5 -1
- {biofiles-0.0.11 → biofiles-0.0.13}/pyproject.toml +1 -1
- {biofiles-0.0.11 → biofiles-0.0.13}/LICENSE +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/README.md +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/__init__.py +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/bam.py +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/common.py +0 -0
- {biofiles-0.0.11/biofiles/types → biofiles-0.0.13/biofiles/dialects}/__init__.py +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/fai.py +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/fasta.py +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/gff.py +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/repeatmasker.py +0 -0
- {biofiles-0.0.11/biofiles/utility → biofiles-0.0.13/biofiles/types}/__init__.py +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/types/repeat.py +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/types/sequence.py +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles/utility/cli.py +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles.egg-info/dependency_links.txt +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/biofiles.egg-info/top_level.txt +0 -0
- {biofiles-0.0.11 → biofiles-0.0.13}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: biofiles
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.13
|
4
4
|
Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
|
5
5
|
Author-email: Tigran Saluev <tigran@saluev.com>
|
6
6
|
Maintainer-email: Tigran Saluev <tigran@saluev.com>
|
@@ -36,6 +36,7 @@ Classifier: Programming Language :: Python :: 3.12
|
|
36
36
|
Requires-Python: >=3.10
|
37
37
|
Description-Content-Type: text/markdown
|
38
38
|
License-File: LICENSE
|
39
|
+
Dynamic: license-file
|
39
40
|
|
40
41
|
# biofiles
|
41
42
|
|
@@ -0,0 +1,101 @@
|
|
1
|
+
"""Feature dialect for HAVANA+Ensembl .gtf files (e.g. T2T annotation)."""
|
2
|
+
|
3
|
+
from enum import StrEnum
|
4
|
+
|
5
|
+
from biofiles.types.feature_v2 import Feature, id_field, field, relation
|
6
|
+
|
7
|
+
|
8
|
+
class GeneType(StrEnum):
|
9
|
+
LNC_RNA = "lncRNA"
|
10
|
+
PROTEIN_CODING = "protein_coding"
|
11
|
+
|
12
|
+
|
13
|
+
class TranscriptType(StrEnum):
|
14
|
+
LNC_RNA = "lncRNA"
|
15
|
+
PROTEIN_CODING = "protein_coding"
|
16
|
+
|
17
|
+
|
18
|
+
# Relation declarations shared by the feature classes below.
#
# Every relation() call returns a (forward, inverse) pair of field objects:
# the forward field goes on the class whose attributes carry the related ID
# (named by `source`), the inverse field goes on the class being pointed at.
# Pairs whose inverse side is not exposed on any class bind it to `_`.

# transcript -> gene
transcript_gene, gene_transcripts = relation(source="gene_id")

# exon -> transcript / gene
exon_transcript, transcript_exons = relation(source="transcript_id")
exon_gene, _ = relation(source="gene_id")

# CDS -> exon
cds_exon, exon_cds = relation(source="exon_id", one_to_one=True)

# generic UTR -> transcript / gene
utr_transcript, transcript_utrs = relation(source="transcript_id")
utr_gene, _ = relation(source="gene_id")

# 5' UTR -> transcript / gene
five_prime_utr_transcript, transcript_five_prime_utr = relation(
    source="transcript_id", one_to_one=True
)
five_prime_utr_gene, _ = relation(source="gene_id")

# 3' UTR -> transcript / gene
three_prime_utr_transcript, transcript_three_prime_utr = relation(
    source="transcript_id", one_to_one=True
)
three_prime_utr_gene, _ = relation(source="gene_id")

# start codon -> transcript / exon
start_codon_transcript, transcript_start_codon = relation(
    source="transcript_id", one_to_one=True
)
start_codon_exon, _ = relation(source="exon_id", one_to_one=True)

# stop codon -> transcript / exon
stop_codon_transcript, transcript_stop_codon = relation(
    source="transcript_id", one_to_one=True
)
stop_codon_exon, _ = relation(source="exon_id", one_to_one=True)
|
40
|
+
|
41
|
+
|
42
|
+
class Gene(Feature, type="gene"):
    """Feature class selected for records whose type is ``gene``."""

    # Identified by the `gene_id` attribute.
    id: str = id_field(source="gene_id")
    type: GeneType = field(source="gene_type")
    name: str = field(source="gene_name")

    # Inverse side of the transcript -> gene relation.
    transcripts: list["Transcript"] = gene_transcripts
|
47
|
+
|
48
|
+
|
49
|
+
class Transcript(Feature, type="transcript"):
    """Feature class selected for records whose type is ``transcript``."""

    # Identified by the `transcript_id` attribute.
    id: str = id_field(source="transcript_id")
    type: TranscriptType = field(source="transcript_type")
    name: str = field(source="transcript_name")

    # Forward relation: the owning gene is found through `gene_id`.
    gene: Gene = transcript_gene

    # Inverse sides of relations declared on the child feature classes.
    exons: list["Exon"] = transcript_exons
    five_prime_utr: "FivePrimeUTR | None" = transcript_five_prime_utr
    three_prime_utr: "ThreePrimeUTR | None" = transcript_three_prime_utr
    start_codon: "StartCodon | None" = transcript_start_codon
    stop_codon: "StopCodon | None" = transcript_stop_codon
|
59
|
+
|
60
|
+
|
61
|
+
class Exon(Feature, type="exon"):
    """Feature class selected for records whose type is ``exon``."""

    # Identified by the `exon_id` attribute.
    id: str = id_field(source="exon_id")
    number: int = field(source="exon_number")

    # Forward relations, resolved through `transcript_id` and `gene_id`.
    transcript: Transcript = exon_transcript
    gene: Gene = exon_gene

    # Inverse side of the one-to-one CDS -> exon relation.
    cds: "CDS | None" = exon_cds
|
67
|
+
|
68
|
+
|
69
|
+
class CDS(Feature, type="cds"):
    """Feature class selected for records whose type is ``cds``.

    NOTE(review): the filter string is lower-case; confirm that draft types
    are normalised before lookup, since annotation files may spell it "CDS".
    """

    # A CDS has no identifier of its own here; it reuses its exon's `exon_id`.
    id: str = id_field(source="exon_id")

    # Forward side of the one-to-one CDS -> exon relation.
    exon: Exon = cds_exon
|
72
|
+
|
73
|
+
|
74
|
+
class UTR(Feature, type="utr"):
    """Feature class selected for records whose type is ``utr``.

    NOTE(review): the ID is taken from `transcript_id`, so several UTR records
    of one transcript would share the same (class, id) key - verify that this
    is intended.
    """

    id: str = id_field(source="transcript_id")

    # Forward relations, resolved through `transcript_id` and `gene_id`.
    transcript: Transcript = utr_transcript
    gene: Gene = utr_gene
|
78
|
+
|
79
|
+
|
80
|
+
class FivePrimeUTR(UTR, starts=five_prime_utr_transcript):
    """UTR specialisation filtered by ``starts=five_prime_utr_transcript``.

    No ``type=`` is given, so the record-type filter is inherited from UTR.
    The fields are redeclared so that this class gets its own ID field and
    its own one-to-one relation to the transcript.
    """

    id: str = id_field(source="transcript_id")
    transcript: Transcript = five_prime_utr_transcript
    gene: Gene = five_prime_utr_gene
|
84
|
+
|
85
|
+
|
86
|
+
class ThreePrimeUTR(UTR, ends=three_prime_utr_transcript):
    """UTR specialisation filtered by ``ends=three_prime_utr_transcript``.

    No ``type=`` is given, so the record-type filter is inherited from UTR.
    The fields are redeclared so that this class gets its own ID field and
    its own one-to-one relation to the transcript.
    """

    id: str = id_field(source="transcript_id")
    transcript: Transcript = three_prime_utr_transcript
    gene: Gene = three_prime_utr_gene
|
90
|
+
|
91
|
+
|
92
|
+
class StartCodon(Feature, type="start_codon"):
    """Feature class selected for records whose type is ``start_codon``."""

    # Reuses the transcript's identifier as its own ID.
    id: str = id_field(source="transcript_id")

    # One-to-one forward relations, resolved through `transcript_id` / `exon_id`.
    transcript: Transcript = start_codon_transcript
    exon: Exon = start_codon_exon
|
96
|
+
|
97
|
+
|
98
|
+
class StopCodon(Feature, type="stop_codon"):
    """Feature class selected for records whose type is ``stop_codon``."""

    # Reuses the transcript's identifier as its own ID.
    id: str = id_field(source="transcript_id")

    # One-to-one forward relations, resolved through `transcript_id` / `exon_id`.
    transcript: Transcript = stop_codon_transcript
    exon: Exon = stop_codon_exon
|
@@ -5,7 +5,7 @@ from typing import Iterator
|
|
5
5
|
|
6
6
|
from biofiles.common import Writer
|
7
7
|
from biofiles.gff import GFFReader
|
8
|
-
from biofiles.types.feature import Gene, Exon, Feature, UTR
|
8
|
+
from biofiles.types.feature import Gene, Exon, Feature, UTR, CDS
|
9
9
|
|
10
10
|
|
11
11
|
class GTFReader(GFFReader):
|
@@ -46,22 +46,27 @@ if __name__ == "__main__":
|
|
46
46
|
total_features = 0
|
47
47
|
annotated_genes = 0
|
48
48
|
annotated_exons = 0
|
49
|
+
annotated_cds = 0
|
49
50
|
annotated_utrs = 0
|
50
51
|
parsed_genes = 0
|
51
52
|
parsed_exons = 0
|
53
|
+
parsed_cds = 0
|
52
54
|
parsed_utrs = 0
|
53
55
|
for feature in r:
|
54
56
|
total_features += 1
|
55
57
|
annotated_genes += "gene" in feature.type_.lower()
|
56
58
|
annotated_exons += feature.type_ == "exon"
|
59
|
+
annotated_cds += feature.type_.lower() == "cds"
|
57
60
|
annotated_utrs += "utr" in feature.type_.lower()
|
58
61
|
parsed_genes += isinstance(feature, Gene)
|
59
62
|
parsed_exons += isinstance(feature, Exon)
|
63
|
+
parsed_cds += isinstance(feature, CDS)
|
60
64
|
parsed_utrs += isinstance(feature, UTR)
|
61
65
|
print(
|
62
66
|
f"{path}: {total_features} features, "
|
63
67
|
f"{parsed_genes} genes parsed out of {annotated_genes}, "
|
64
68
|
f"{parsed_exons} exons parsed out of {annotated_exons}, "
|
69
|
+
f"{parsed_cds} CDS parsed out of {annotated_cds}, "
|
65
70
|
f"{parsed_utrs} UTRs parsed out of {annotated_utrs}",
|
66
71
|
file=sys.stderr,
|
67
72
|
)
|
@@ -1,7 +1,15 @@
|
|
1
1
|
from dataclasses import dataclass
|
2
2
|
|
3
3
|
|
4
|
-
__all__ = [
|
4
|
+
__all__ = [
|
5
|
+
"Alignment",
|
6
|
+
"BAMFlag",
|
7
|
+
"BAMTag",
|
8
|
+
"CIGAR",
|
9
|
+
"CIGAROpKind",
|
10
|
+
"CIGAROperation",
|
11
|
+
"ReferenceSequence",
|
12
|
+
]
|
5
13
|
|
6
14
|
from enum import IntFlag
|
7
15
|
|
@@ -50,6 +50,7 @@ class Transcript(Feature):
|
|
50
50
|
class Exon(Feature):
|
51
51
|
gene: Gene
|
52
52
|
transcript: Transcript
|
53
|
+
cds: "CDS | None"
|
53
54
|
|
54
55
|
|
55
56
|
@dataclass(frozen=True)
|
@@ -61,3 +62,10 @@ class UTR(Feature):
|
|
61
62
|
@dataclass(frozen=True)
|
62
63
|
class ThreePrimeUTR(UTR):
|
63
64
|
pass
|
65
|
+
|
66
|
+
|
67
|
+
@dataclass(frozen=True)
class CDS(Feature):
    """Immutable coding-sequence feature linked to its gene, transcript and exon."""

    gene: Gene
    transcript: Transcript
    exon: Exon
|
@@ -0,0 +1,105 @@
|
|
1
|
+
from dataclasses import dataclass, Field, field as dataclass_field
|
2
|
+
from typing import dataclass_transform
|
3
|
+
|
4
|
+
from biofiles.common import Strand
|
5
|
+
|
6
|
+
|
7
|
+
@dataclass
|
8
|
+
class Relation:
|
9
|
+
id_field_name: str
|
10
|
+
inverse: "InverseRelation | None" = None
|
11
|
+
class_: type | None = None
|
12
|
+
|
13
|
+
|
14
|
+
@dataclass
class InverseRelation:
    """Reverse side of a link between two feature classes."""

    # The forward relation this object mirrors.
    inverse: Relation

    # True when at most one feature is expected on this side of the link.
    one_to_one: bool

    # Class declaring this side of the relation; filled in by the metaclass.
    class_: type | None = None
|
19
|
+
|
20
|
+
|
21
|
+
@dataclass_transform()
class FeatureMetaclass(type):
    """Metaclass recording ID-field, relation and filter metadata of feature classes.

    Class keyword arguments:
        type: record type the class accepts; stored as ``__filter_type__``
            only when given, so subclasses otherwise inherit it.
        starts: relation field stored as the ``__filter_starts__`` filter.
        ends: relation field stored as the ``__filter_ends__`` filter.

    Raises:
        TypeError: if a class body declares more than one ``id_field()``.
    """

    __id_field_name__: str
    __filter_type__: str
    __filter_starts__: Relation | None
    __filter_ends__: Relation | None

    def __new__(
        cls,
        name,
        bases,
        namespace,
        type: str | None = None,
        starts: Field | None = None,
        ends: Field | None = None,
    ):
        new_class = super().__new__(cls, name, bases, namespace)

        # The ID field is looked up in this class body only; it is reset here
        # and therefore not inherited from base classes.
        new_class.__id_field_name__ = ""

        for attr_name, attr_value in namespace.items():
            if not isinstance(attr_value, Field):
                continue
            metadata = attr_value.metadata

            if "id_field_name" in metadata:
                if new_class.__id_field_name__:
                    raise TypeError(
                        f"should specify exactly one id_field() in class {new_class.__name__}"
                    )
                new_class.__id_field_name__ = metadata["id_field_name"]
                continue

            link = metadata.get("relation")
            if isinstance(link, Relation):
                link.class_ = new_class
                if attr_name in new_class.__annotations__:
                    # TODO handle optionality and forward refs
                    link.inverse.class_ = new_class.__annotations__[attr_name]
            elif isinstance(link, InverseRelation):
                link.class_ = new_class
                # TODO calculating link.inverse.class_ based on type annotation

        if type is not None:
            new_class.__filter_type__ = type
        new_class.__filter_starts__ = (
            None if starts is None else starts.metadata["relation"]
        )
        new_class.__filter_ends__ = None if ends is None else ends.metadata["relation"]

        # TODO generate dataclass-like __init__ method,
        #      keep all relations optional

        return new_class
|
69
|
+
|
70
|
+
|
71
|
+
class Feature(metaclass=FeatureMetaclass):
    """Base class of dialect feature types; declares the common record fields."""

    sequence_id: str
    source: str
    type_: str

    # Original values as they were present in the file
    # (1-based inclusive for .gff and .gtf).
    start_original: int
    end_original: int

    # Standardized ("C-style") 0-based values, start inclusive, end exclusive.
    start_c: int
    end_c: int

    score: float | None
    strand: Strand | None
    phase: int | None
    attributes: dict[str, str]
|
88
|
+
|
89
|
+
|
90
|
+
def id_field(source: str) -> Field:
    """Return a field marked as the feature's ID, read from attribute *source*."""
    marker = {"id_field_name": source}
    return dataclass_field(metadata=marker)
|
92
|
+
|
93
|
+
|
94
|
+
def field(source: str) -> Field:
    """Return a field whose value is read from attribute *source*."""
    marker = {"field_name": source}
    return dataclass_field(metadata=marker)
|
96
|
+
|
97
|
+
|
98
|
+
def relation(source: str, *, one_to_one: bool = False) -> tuple[Field, Field]:
    """Create the two linked fields describing a relation between feature classes.

    Args:
        source: name of the attribute holding the related feature's ID.
        one_to_one: recorded on the inverse side of the relation.

    Returns:
        ``(forward_field, inverse_field)``; each carries its relation object
        under the ``"relation"`` metadata key, and the two objects refer to
        each other.
    """
    forward_side = Relation(id_field_name=source)
    inverse_side = InverseRelation(inverse=forward_side, one_to_one=one_to_one)
    forward_side.inverse = inverse_side

    forward_field = dataclass_field(metadata={"relation": forward_side})
    inverse_field = dataclass_field(metadata={"relation": inverse_side})
    return forward_field, inverse_field
|
File without changes
|
@@ -11,6 +11,7 @@ from biofiles.types.feature import (
|
|
11
11
|
Exon,
|
12
12
|
UTR,
|
13
13
|
Transcript,
|
14
|
+
CDS,
|
14
15
|
)
|
15
16
|
|
16
17
|
|
@@ -126,6 +127,8 @@ class FeatureReader(Reader):
|
|
126
127
|
feature = self._finalize_transcript(draft, result, Transcript)
|
127
128
|
case "exon":
|
128
129
|
feature = self._finalize_exon(draft, result)
|
130
|
+
case "cds":
|
131
|
+
feature = self._finalize_cds(draft, result)
|
129
132
|
case "three_prime_utr":
|
130
133
|
feature = self._finalize_utr(draft, result, ThreePrimeUTR)
|
131
134
|
case "utr":
|
@@ -161,10 +164,25 @@ class FeatureReader(Reader):
|
|
161
164
|
feature = self._finalize_other(draft, result)
|
162
165
|
if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
|
163
166
|
return feature
|
164
|
-
exon = Exon(
|
167
|
+
exon = Exon(
|
168
|
+
**feature.__dict__, gene=transcript.gene, transcript=transcript, cds=None
|
169
|
+
)
|
165
170
|
object.__setattr__(transcript, "exons", transcript.exons + (exon,))
|
166
171
|
return exon
|
167
172
|
|
173
|
+
def _finalize_cds(self, draft: FeatureDraft, result: Features) -> Feature:
    """Finalize a CDS draft and attach it to its ancestor exon.

    When no ancestor exon is found, the plain feature is returned unchanged.
    """
    plain = self._finalize_other(draft, result)
    exon = self._find_ancestor_of_type(plain, Exon)
    if not exon:
        return plain

    transcript = exon.transcript
    cds = CDS(
        **plain.__dict__,
        exon=exon,
        transcript=transcript,
        gene=transcript.gene,
    )
    # Exon is a frozen dataclass, so the back-reference is set by bypassing
    # its __setattr__.
    object.__setattr__(exon, "cds", cds)
    return cds
|
185
|
+
|
168
186
|
def _finalize_utr(
|
169
187
|
self, draft: FeatureDraft, result: Features, type_: Type[UTRT]
|
170
188
|
) -> Feature:
|
@@ -183,7 +201,7 @@ class FeatureReader(Reader):
|
|
183
201
|
|
184
202
|
def _finalize_other(self, draft: FeatureDraft, result: Features) -> Feature:
|
185
203
|
parent_id = self._extract_parent_id(draft)
|
186
|
-
parent = result.by_id
|
204
|
+
parent = result.by_id.get(parent_id) if parent_id is not None else None
|
187
205
|
|
188
206
|
return Feature(
|
189
207
|
sequence_id=draft.sequence_id,
|
@@ -211,6 +229,8 @@ class FeatureReader(Reader):
|
|
211
229
|
id_ := draft.attributes.get("transcript_id")
|
212
230
|
):
|
213
231
|
return id_
|
232
|
+
if draft.type_ == "exon" and (id_ := draft.attributes.get("exon_id")):
|
233
|
+
return id_
|
214
234
|
return None
|
215
235
|
|
216
236
|
def _extract_parent_id(self, draft: FeatureDraft) -> str | None:
|
@@ -222,4 +242,6 @@ class FeatureReader(Reader):
|
|
222
242
|
id_ := draft.attributes.get("transcript_id")
|
223
243
|
):
|
224
244
|
return id_
|
245
|
+
if draft.type_.lower() == "cds" and (id_ := draft.attributes.get("exon_id")):
|
246
|
+
return id_
|
225
247
|
return None
|
@@ -0,0 +1,148 @@
|
|
1
|
+
from collections import deque, defaultdict
|
2
|
+
from dataclasses import dataclass, field
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Any, Iterator, TextIO
|
5
|
+
|
6
|
+
from biofiles.common import Strand, Reader
|
7
|
+
from biofiles.types.feature_v2 import Feature, FeatureMetaclass, Relation
|
8
|
+
|
9
|
+
|
10
|
+
@dataclass
class FeatureDraft:
    """Mutable record of one parsed feature before it is finalized."""

    # Values taken from the parsed record.
    idx: int
    sequence_id: str
    source: str
    type_: str
    start_original: int
    end_original: int
    score: float | None
    strand: Strand | None
    phase: int | None
    attributes: dict[str, str]

    # Filled in later: the chosen feature class, the ID read from the
    # attribute that class names as its ID field, and the finalized object.
    class_: type | None = None
    id: Any = None
    finalized: Feature | None = None
|
26
|
+
|
27
|
+
|
28
|
+
class FeatureTypes:
    """Index of feature classes keyed by the record type they accept.

    Attributes:
        unique_type_mapping: record type -> the only class accepting it.
        ambiguous_type_mapping: record type -> every class accepting it when
            there is more than one, most specific filters first.

    Raises:
        ValueError: if a class declares no ``id_field()`` or has no record
            type filter (neither its own ``type=...`` nor an inherited one).
    """

    def __init__(self, feature_types: "list[FeatureMetaclass]") -> None:
        for ft in feature_types:
            if not ft.__id_field_name__:
                raise ValueError(
                    f"{ft.__name__} is not proper feature type - has no id_field()"
                )
            # __filter_type__ is only set when a class (or one of its bases)
            # was declared with type=...; report that clearly instead of
            # failing with a bare AttributeError while grouping.
            if getattr(ft, "__filter_type__", None) is None:
                raise ValueError(
                    f"{ft.__name__} is not proper feature type - has no type filter"
                )

        grouped: dict[str, list] = defaultdict(list)
        for ft in feature_types:
            grouped[ft.__filter_type__].append(ft)

        # Kept as a defaultdict so lookups of unknown record types still
        # yield an empty list.
        self.ambiguous_type_mapping: dict[str, list[FeatureMetaclass]] = defaultdict(
            list
        )
        self.unique_type_mapping: dict[str, FeatureMetaclass] = {}

        for key, fts in grouped.items():
            if len(fts) == 1:
                self.unique_type_mapping[key] = fts[0]
            else:
                self.ambiguous_type_mapping[key] = _sort_by_filter_specificity(fts)
|
50
|
+
|
51
|
+
|
52
|
+
def _sort_by_filter_specificity(fts: list[FeatureMetaclass]) -> list[FeatureMetaclass]:
    """Return a new list of feature classes ordered from most to least specific.

    Specificity is the number of positional filters (starts / ends) a class
    declares; classes with equal specificity keep their relative order.
    """

    def specificity(ft) -> int:
        return bool(ft.__filter_starts__) + bool(ft.__filter_ends__)

    ranked = list(fts)
    ranked.sort(key=specificity, reverse=True)
    return ranked
|
56
|
+
|
57
|
+
|
58
|
+
@dataclass
class FeatureDrafts:
    """Collection of feature drafts with a lookup index by (class, id)."""

    # Annotations are quoted because they are documentation only; the sibling
    # types do not have to be resolvable when this class is created.
    feature_types: "FeatureTypes"
    # The container is a deque (see default_factory), so it is annotated as one.
    drafts: "deque[FeatureDraft]" = field(default_factory=deque)
    by_class_and_id: "dict[tuple[type, Any], FeatureDraft]" = field(
        default_factory=dict
    )

    def add(self, draft: "FeatureDraft") -> None:
        """Store *draft*; assign its class and ID right away when unambiguous.

        Drafts whose record type maps to several candidate classes (or to
        none) are stored without a class and are not indexed here.

        Raises:
            KeyError: if the draft lacks the attribute named as the ID field.
            ValueError: if a draft with the same class and ID is already
                registered (the draft has been appended to `drafts` by then).
        """
        self.drafts.append(draft)
        # NOTE(review): the lookup uses the exact type string; confirm whether
        # record types need case normalisation before this point.
        if class_ := self.feature_types.unique_type_mapping.get(draft.type_):
            draft.class_ = class_
            draft.id = draft.attributes[class_.__id_field_name__]
            self.register(draft)

    def register(self, draft: "FeatureDraft") -> None:
        """Index *draft* under its ``(class_, id)`` pair.

        Raises:
            ValueError: if that pair is already registered.
        """
        if (key := (draft.class_, draft.id)) in self.by_class_and_id:
            # The class name comes from the draft itself; the previous code
            # referenced an undefined local and failed with NameError.
            raise ValueError(
                f"duplicate feature ID {draft.id} for class {draft.class_.__name__}"
            )
        self.by_class_and_id[key] = draft
|
77
|
+
|
78
|
+
|
79
|
+
class FeatureReader(Reader):
    """Base reader that maps parsed feature drafts onto dialect feature classes.

    Subclasses are expected to implement ``__iter__``; this class provides the
    class-selection machinery shared by them.
    """

    def __init__(
        self, input_: TextIO | Path | str, feature_types: list[FeatureMetaclass]
    ) -> None:
        """Open *input_* via the base reader and index *feature_types*."""
        super().__init__(input_)
        self._feature_types = FeatureTypes(feature_types)

    def __iter__(self) -> Iterator[Feature]:
        raise NotImplementedError

    def _finalize_drafts(self, fds: FeatureDrafts) -> Iterator[Feature]:
        """Resolve the class of every draft.

        TODO: building and yielding the finalized features is not implemented
        yet; at the moment this only chooses classes and returns None.
        """
        self._choose_classes(fds)

    def _choose_classes(self, fds: FeatureDrafts) -> None:
        """Assign a feature class and ID to every draft that has none yet.

        Raises:
            ValueError: if zero or more than one candidate class matches a draft.
        """
        for fd in fds.drafts:
            if fd.class_:
                continue

            # .get() avoids inserting an empty list into the defaultdict for
            # record types that no class declares.
            fts = self._feature_types.ambiguous_type_mapping.get(fd.type_, [])
            # _check_filters needs the drafts collection to look up related
            # features; it was previously called without it.
            matching_fts = [ft for ft in fts if self._check_filters(fds, fd, ft)]
            if not matching_fts:
                raise ValueError(
                    f"no matching classes (out of {len(fts)}) for "
                    f"feature with type {fd.type_!r}, attributes {fd.attributes!r}"
                )
            if len(matching_fts) > 1:
                raise ValueError(
                    f"too many matching classes ({len(matching_fts)}) for "
                    f"feature with type {fd.type_!r}, attributes {fd.attributes!r}"
                )
            ft = matching_fts[0]
            fd.class_ = ft
            fd.id = fd.attributes[ft.__id_field_name__]
            fds.register(fd)

    def _check_filters(
        self, fds: FeatureDrafts, fd: FeatureDraft, ft: FeatureMetaclass
    ) -> bool:
        """Return True when *fd* satisfies the starts / ends filters of *ft*."""
        if r := ft.__filter_starts__:
            related_fd = self._get_related_feature_draft(fds, fd, r)
            if not self._shares_boundary(fd, related_fd, at_start=True):
                return False
        if r := ft.__filter_ends__:
            related_fd = self._get_related_feature_draft(fds, fd, r)
            if not self._shares_boundary(fd, related_fd, at_start=False):
                return False
        return True

    @staticmethod
    def _shares_boundary(
        fd: FeatureDraft, related_fd: FeatureDraft, *, at_start: bool
    ) -> bool:
        """Check that *fd* begins (or ends) exactly where *related_fd* does.

        "Start" and "end" follow the strand: on "+" the start is
        `start_original`, on "-" it is `end_original`. For any other strand
        value only strand equality is required.
        """
        if fd.strand != related_fd.strand:
            return False
        if fd.strand == "+":
            compare_starts = at_start
        elif fd.strand == "-":
            compare_starts = not at_start
        else:
            return True
        if compare_starts:
            return fd.start_original == related_fd.start_original
        return fd.end_original == related_fd.end_original

    def _get_related_feature_draft(
        self, fds: FeatureDrafts, fd: FeatureDraft, r: Relation
    ) -> FeatureDraft:
        """Return the draft that *fd* points to through relation *r*.

        Raises:
            ValueError: if *fd* lacks the relation's ID attribute or no draft
                is registered under the related class and ID.
        """
        related_class = r.inverse.class_
        try:
            related_id = fd.attributes[r.id_field_name]
            return fds.by_class_and_id[related_class, related_id]
        except KeyError as exc:
            # related_class may be None or an unresolved string annotation,
            # and fd.class_ is still None while classes are being chosen, so
            # neither can be assumed to have __name__.
            related_name = getattr(related_class, "__name__", related_class)
            if fd.class_ is not None:
                subject = fd.class_.__name__
            else:
                subject = f"feature of type {fd.type_!r}"
            raise ValueError(
                f"can't find related {related_name} for "
                f"{subject} with attributes {fd.attributes!r}"
            ) from exc
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: biofiles
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.13
|
4
4
|
Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
|
5
5
|
Author-email: Tigran Saluev <tigran@saluev.com>
|
6
6
|
Maintainer-email: Tigran Saluev <tigran@saluev.com>
|
@@ -36,6 +36,7 @@ Classifier: Programming Language :: Python :: 3.12
|
|
36
36
|
Requires-Python: >=3.10
|
37
37
|
Description-Content-Type: text/markdown
|
38
38
|
License-File: LICENSE
|
39
|
+
Dynamic: license-file
|
39
40
|
|
40
41
|
# biofiles
|
41
42
|
|
@@ -13,11 +13,15 @@ biofiles.egg-info/PKG-INFO
|
|
13
13
|
biofiles.egg-info/SOURCES.txt
|
14
14
|
biofiles.egg-info/dependency_links.txt
|
15
15
|
biofiles.egg-info/top_level.txt
|
16
|
+
biofiles/dialects/__init__.py
|
17
|
+
biofiles/dialects/havana_ensembl.py
|
16
18
|
biofiles/types/__init__.py
|
17
19
|
biofiles/types/alignment.py
|
18
20
|
biofiles/types/feature.py
|
21
|
+
biofiles/types/feature_v2.py
|
19
22
|
biofiles/types/repeat.py
|
20
23
|
biofiles/types/sequence.py
|
21
24
|
biofiles/utility/__init__.py
|
22
25
|
biofiles/utility/cli.py
|
23
|
-
biofiles/utility/feature.py
|
26
|
+
biofiles/utility/feature.py
|
27
|
+
biofiles/utility/feature_v2.py
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|