biofiles 0.0.13__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biofiles/dialects/detector.py +74 -0
- biofiles/dialects/gencode.py +235 -0
- biofiles/dialects/genomic_base.py +25 -0
- biofiles/dialects/refseq.py +142 -0
- biofiles/dialects/stringtie.py +24 -0
- biofiles/gff.py +49 -44
- biofiles/gtf.py +34 -11
- biofiles/repeatmasker.py +5 -1
- biofiles/types/feature.py +268 -34
- biofiles/utility/cli.py +2 -1
- biofiles/utility/feature.py +180 -211
- {biofiles-0.0.13.dist-info → biofiles-0.1.1.dist-info}/METADATA +9 -4
- biofiles-0.1.1.dist-info/RECORD +27 -0
- biofiles/dialects/havana_ensembl.py +0 -101
- biofiles/types/feature_v2.py +0 -105
- biofiles/utility/feature_v2.py +0 -148
- biofiles-0.0.13.dist-info/RECORD +0 -25
- {biofiles-0.0.13.dist-info → biofiles-0.1.1.dist-info}/WHEEL +0 -0
- {biofiles-0.0.13.dist-info → biofiles-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {biofiles-0.0.13.dist-info → biofiles-0.1.1.dist-info}/top_level.txt +0 -0
biofiles/gtf.py
CHANGED
@@ -1,23 +1,45 @@
|
|
1
1
|
__all__ = ["GTFReader", "GTFWriter"]
|
2
2
|
|
3
3
|
import sys
|
4
|
+
from pathlib import Path
|
4
5
|
from typing import Iterator
|
5
6
|
|
6
7
|
from biofiles.common import Writer
|
7
|
-
from biofiles.
|
8
|
-
from biofiles.
|
8
|
+
from biofiles.dialects.detector import detect_dialect
|
9
|
+
from biofiles.dialects.genomic_base import Gene, Exon, Feature, CDS, UTR
|
10
|
+
from biofiles.gff import RawGFFReader
|
11
|
+
from biofiles.utility.feature import FeatureReader, RawFeatureReader, FeatureDraft
|
9
12
|
|
10
13
|
|
11
|
-
class
|
12
|
-
def __iter__(self) -> Iterator[
|
14
|
+
class RawGTFReader(RawGFFReader):
    """Raw feature reader for GTF input.

    Iteration is delegated to the inherited ``_read_gff3``; this class supplies
    the GTF flavour of attribute parsing (``key "value";`` pairs).

    NOTE(review): presumably ``_read_gff3`` calls ``_parse_attributes`` for
    every record -- confirm against ``RawGFFReader``.
    """

    def __iter__(self) -> Iterator[FeatureDraft]:
        """Yield everything produced by the inherited ``_read_gff3`` loop."""
        yield from self._read_gff3()

    @staticmethod
    def _split_attribute_parts(attributes_str: str) -> Iterator[str]:
        """Yield the non-empty ``key value`` segments of an attribute column.

        Segments are separated by ``;``, but a ``;`` that sits inside a
        double-quoted value must not split it.  The string is split on every
        ``;`` first (fast), and neighbouring chunks are glued back together
        while the number of unescaped quotes seen so far is odd.
        """
        pending: str | None = None
        for chunk in attributes_str.split(";"):
            if pending is not None:
                chunk = f"{pending};{chunk}"
            # An odd count of unescaped quotes means the `;` was inside a value.
            if (chunk.count('"') - chunk.count('\\"')) % 2:
                pending = chunk
                continue
            pending = None
            chunk = chunk.strip()
            if chunk:
                yield chunk
        # Unbalanced quotes at the end: hand over what is left rather than
        # silently dropping it.
        if pending is not None and pending.strip():
            yield pending.strip()

    def _parse_attributes(
        self, line: str, attributes_str: str
    ) -> dict[str, str | list[str]]:
        """Parse a GTF attribute column into a mapping.

        :param line: the full input line (not used by this implementation).
        :param attributes_str: raw text of the attribute column.
        :returns: attribute name -> value; a name that occurs more than once
            maps to the list of its values in order of appearance.  Empty
            segments (``;;``, trailing ``;``, blank column) are ignored.
        :raises ValueError: if a segment has a key but no value.
        """
        result: dict[str, str | list[str]] = {}
        try:
            for part in self._split_attribute_parts(attributes_str):
                key, value = part.split(None, 1)
                # Drop the surrounding quotes first, then unescape inner ones.
                value = (
                    value.removeprefix('"').removesuffix('"').replace(r"\"", '"')
                )
                existing = result.get(key)
                if existing is None:
                    result[key] = value
                elif isinstance(existing, list):
                    existing.append(value)
                else:
                    # Second occurrence: promote the scalar to a list.
                    result[key] = [existing, value]
        except ValueError as exc:
            raise ValueError(
                f"failed to parse attribute string {attributes_str!r}: {exc}"
            ) from exc
        return result
|
37
|
+
|
38
|
+
|
39
|
+
class GTFReader(FeatureReader):
    """Feature reader that takes its raw records from a ``RawGTFReader``."""

    def _make_raw_feature_reader(self) -> RawFeatureReader:
        """Create the raw reader over this reader's ``_input``."""
        raw_reader = RawGTFReader(self._input)
        return raw_reader
|
21
43
|
|
22
44
|
|
23
45
|
class GTFWriter(Writer):
|
@@ -42,7 +64,8 @@ class GTFWriter(Writer):
|
|
42
64
|
|
43
65
|
if __name__ == "__main__":
|
44
66
|
for path in sys.argv[1:]:
|
45
|
-
|
67
|
+
dialect = detect_dialect(Path(path))
|
68
|
+
with GTFReader(path, dialect=dialect) as r:
|
46
69
|
total_features = 0
|
47
70
|
annotated_genes = 0
|
48
71
|
annotated_exons = 0
|
biofiles/repeatmasker.py
CHANGED
@@ -11,11 +11,15 @@ __all__ = ["RepeatMaskerReader"]
|
|
11
11
|
|
12
12
|
class RepeatMaskerReader(Reader):
|
13
13
|
def __iter__(self) -> Iterator[Repeat]:
|
14
|
+
has_passed_header = False
|
14
15
|
for line in self._input:
|
15
|
-
parts = line.split(
|
16
|
+
parts = line.split()
|
16
17
|
if not (14 <= len(parts) <= 15):
|
17
18
|
# Probably some metainfo. No way to tell.
|
18
19
|
continue
|
20
|
+
if not has_passed_header and ("SW" in parts or "score" in parts):
|
21
|
+
continue
|
22
|
+
has_passed_header = True
|
19
23
|
|
20
24
|
(
|
21
25
|
sw_score_str,
|
biofiles/types/feature.py
CHANGED
@@ -1,13 +1,251 @@
|
|
1
|
-
from dataclasses import dataclass
|
1
|
+
from dataclasses import dataclass, Field, field as dataclass_field
|
2
|
+
from enum import Enum
|
3
|
+
from typing import dataclass_transform, Type, Any, TypeAlias
|
4
|
+
from uuid import uuid4
|
2
5
|
|
3
6
|
from biofiles.common import Strand
|
4
7
|
|
8
|
+
# Where a value is read from in a record's attributes: either a single
# GTF/GFF attribute name, or a tuple of names whose values are looked up
# together (see get_composite_field).
Source: TypeAlias = str | tuple[str, ...]
|
5
9
|
|
6
|
-
__all__ = ["Feature", "Gene", "Transcript", "Exon", "UTR", "ThreePrimeUTR"]
|
7
10
|
|
11
|
+
@dataclass
class Relation:
    """Declarative link from one feature type to another, comparable to
    an SQL foreign key."""

    # Name(s) of the GTF/GFF attribute(s) holding the related feature's ID.
    id_attribute_source: Source

    # Counterpart describing the same link seen from the related feature.
    inverse: "InverseRelation | None" = None

    # Class whose field declares this relation; assigned by
    # FeatureMetaclass._fill_relation_classes.
    class_: type | None = None

    # Name of that field on the declaring class; assigned together with class_.
    attribute_name: str | None = None
|
24
|
+
|
25
|
+
|
26
|
+
# Back-reference side of a Relation: placed on the related feature so it can
# point at the feature(s) that refer to it.
@dataclass
class InverseRelation:
    # The forward Relation this object mirrors.
    inverse: Relation

    # False means "one to many": the generated __init__ then defaults the
    # attribute to an empty list instead of None.
    one_to_one: bool

    # Class whose field declares this inverse relation; assigned by
    # FeatureMetaclass._fill_relation_classes.
    class_: type | None = None

    # Name of that field on the declaring class; assigned together with class_.
    attribute_name: str | None = None
|
32
|
+
|
33
|
+
|
34
|
+
def get_composite_field(
|
35
|
+
attributes: dict[str, str], source: Source
|
36
|
+
) -> str | tuple[str, ...] | None:
|
37
|
+
if source is None:
|
38
|
+
return None
|
39
|
+
if isinstance(source, str):
|
40
|
+
return attributes[source]
|
41
|
+
return tuple(attributes[attribute_name] for attribute_name in source)
|
42
|
+
|
43
|
+
|
44
|
+
@dataclass_transform()
|
45
|
+
class FeatureMetaclass(type):
|
46
|
+
__id_attribute_source__: Source | None
|
47
|
+
""" Name of GTF/GFF attribute(s) which contains the type-unique ID. """
|
48
|
+
|
49
|
+
__filter_type__: tuple[str, ...]
|
50
|
+
""" Filter by feature type ("gene", "transcript", etc.). """
|
51
|
+
|
52
|
+
__filter_starts__: Relation | None
|
53
|
+
""" Filter by start position — feature starts at the same point as related feature. """
|
54
|
+
|
55
|
+
__filter_ends__: Relation | None
|
56
|
+
""" Filter by end position — feature ends at the same point as related feature. """
|
57
|
+
|
58
|
+
__relations__: list[Relation]
|
59
|
+
""" All direct relations for this type, for faster parsing. """
|
60
|
+
|
61
|
+
def __new__(
|
62
|
+
cls,
|
63
|
+
name,
|
64
|
+
bases,
|
65
|
+
namespace,
|
66
|
+
type: str | tuple[str, ...] | None = None,
|
67
|
+
starts: Field | None = None,
|
68
|
+
ends: Field | None = None,
|
69
|
+
):
|
70
|
+
result = super().__new__(cls, name, bases, namespace)
|
71
|
+
result.__id_attribute_source__ = cls._find_id_attribute_source(namespace)
|
72
|
+
result._fill_relation_classes(namespace)
|
73
|
+
result._fill_filters(type=type, starts=starts, ends=ends)
|
74
|
+
result._fill_slots()
|
75
|
+
result._fill_init_method(namespace)
|
76
|
+
|
77
|
+
# TODO generate dataclass-like __init__ method,
|
78
|
+
# keep all relations optional
|
79
|
+
|
80
|
+
return result
|
81
|
+
|
82
|
+
@staticmethod
|
83
|
+
def _find_id_attribute_source(namespace) -> str:
|
84
|
+
result: str | None = None
|
85
|
+
for key, value in namespace.items():
|
86
|
+
match value:
|
87
|
+
case Field(metadata={"id_attribute_name": id_attribute_source}):
|
88
|
+
if result:
|
89
|
+
raise TypeError(
|
90
|
+
f"should specify exactly one id_field() in class {result.__name__}"
|
91
|
+
)
|
92
|
+
result = id_attribute_source
|
93
|
+
return result
|
94
|
+
|
95
|
+
def _fill_relation_classes(cls, namespace) -> None:
|
96
|
+
cls.__relations__ = []
|
97
|
+
for key, value in namespace.items():
|
98
|
+
match value:
|
99
|
+
case Field(metadata={"relation": Relation() as r}):
|
100
|
+
r.class_ = cls
|
101
|
+
r.attribute_name = key
|
102
|
+
if key in cls.__annotations__:
|
103
|
+
# TODO handle optionality and forward refs
|
104
|
+
r.inverse.class_ = cls.__annotations__[key]
|
105
|
+
cls.__relations__.append(r)
|
106
|
+
case Field(metadata={"relation": InverseRelation() as r}):
|
107
|
+
r.class_ = cls
|
108
|
+
r.attribute_name = key
|
109
|
+
# TODO calculating r.inverse.class_ based on type annotation
|
110
|
+
|
111
|
+
def _fill_filters(
|
112
|
+
cls,
|
113
|
+
*,
|
114
|
+
type: str | tuple[str, ...] | None = None,
|
115
|
+
starts: Field | None = None,
|
116
|
+
ends: Field | None = None,
|
117
|
+
) -> None:
|
118
|
+
if type is not None:
|
119
|
+
cls.__filter_type__ = (type,) if isinstance(type, str) else type
|
120
|
+
|
121
|
+
cls.__filter_starts__ = None
|
122
|
+
if starts is not None:
|
123
|
+
cls.__filter_starts__ = starts.metadata["relation"]
|
124
|
+
|
125
|
+
cls.__filter_ends__ = None
|
126
|
+
if ends is not None:
|
127
|
+
cls.__filter_ends__ = ends.metadata["relation"]
|
128
|
+
|
129
|
+
def _fill_slots(cls) -> None:
|
130
|
+
cls.__slots__ = [
|
131
|
+
key
|
132
|
+
for ancestor in cls.__mro__[::-1][1:]
|
133
|
+
for key in ancestor.__annotations__
|
134
|
+
]
|
135
|
+
|
136
|
+
def _fill_init_method(cls, namespace) -> None:
|
137
|
+
default_arguments: list[str] = []
|
138
|
+
non_default_arguments: list[str] = []
|
139
|
+
assignments: list[str] = []
|
140
|
+
globals: dict[str, Any] = {}
|
141
|
+
|
142
|
+
key_to_ancestor: dict[str, Type] = {}
|
143
|
+
for ancestor in cls.__mro__[:-1]:
|
144
|
+
for key, value in ancestor.__annotations__.items():
|
145
|
+
key_to_ancestor.setdefault(key, ancestor)
|
146
|
+
|
147
|
+
for ancestor in cls.__mro__[::-1][1:]:
|
148
|
+
for key, value in ancestor.__annotations__.items():
|
149
|
+
if key_to_ancestor[key] is not ancestor:
|
150
|
+
# Overridden in a descendant class.
|
151
|
+
continue
|
152
|
+
|
153
|
+
field_value = getattr(cls, key, None)
|
154
|
+
argument, assignment = cls._compose_field(
|
155
|
+
key, value, field_value, globals
|
156
|
+
)
|
157
|
+
|
158
|
+
if argument and argument.endswith(" = None"):
|
159
|
+
default_arguments.append(argument)
|
160
|
+
elif argument:
|
161
|
+
non_default_arguments.append(argument)
|
162
|
+
assignments.append(assignment)
|
163
|
+
|
164
|
+
body = "\n ".join(assignments)
|
165
|
+
all_arguments = [*non_default_arguments, *default_arguments]
|
166
|
+
source_code = f"def __init__(self, {', '.join(all_arguments)}):\n {body}"
|
167
|
+
locals = {}
|
168
|
+
exec(source_code, globals, locals)
|
169
|
+
cls.__init__ = locals["__init__"]
|
170
|
+
|
171
|
+
def _compose_field(
|
172
|
+
cls,
|
173
|
+
field_name: str,
|
174
|
+
field_annotation: Any,
|
175
|
+
field_value: Field | None,
|
176
|
+
globals: dict[str, Any],
|
177
|
+
) -> tuple[str | None, str]:
|
178
|
+
argument: str | None
|
179
|
+
assignment: str
|
180
|
+
match field_value:
|
181
|
+
case Field(metadata={"relation": r}):
|
182
|
+
argument = f"{field_name}: {cls._format_type_arg(field_annotation, optional=True)} = None"
|
183
|
+
if isinstance(r, InverseRelation) and not r.one_to_one:
|
184
|
+
assignment = f"self.{field_name} = {field_name} if {field_name} is not None else []"
|
185
|
+
else:
|
186
|
+
assignment = f"self.{field_name} = {field_name}"
|
187
|
+
case Field(metadata={"id_attribute_name": None}):
|
188
|
+
argument = None
|
189
|
+
assignment = f"self.{field_name} = None"
|
190
|
+
case Field(metadata={"attribute_name": attribute_name}) | Field(
|
191
|
+
metadata={"id_attribute_name": attribute_name}
|
192
|
+
):
|
193
|
+
default = field_value.metadata.get("attribute_default", _no_default)
|
194
|
+
default_factory = field_value.metadata.get(
|
195
|
+
"attribute_default_factory", _no_default
|
196
|
+
)
|
197
|
+
default_variable_name = f"default_{uuid4().hex}"
|
198
|
+
argument = None
|
199
|
+
if isinstance(attribute_name, str):
|
200
|
+
if default is not _no_default:
|
201
|
+
globals[default_variable_name] = default
|
202
|
+
getter = f"attributes.get({repr(attribute_name)}, {default_variable_name})"
|
203
|
+
elif default_factory is not _no_default:
|
204
|
+
globals[default_variable_name] = default_factory
|
205
|
+
getter = f"attributes.get({repr(attribute_name)}, {default_variable_name}())"
|
206
|
+
else:
|
207
|
+
getter = f"attributes[{repr(attribute_name)}]"
|
208
|
+
else:
|
209
|
+
if default is not _no_default or default_factory is not _no_default:
|
210
|
+
raise NotImplementedError()
|
211
|
+
globals["get_composite_field"] = get_composite_field
|
212
|
+
getter = f"get_composite_field(attributes, {repr(attribute_name)})"
|
213
|
+
if isinstance(field_annotation, type) and issubclass(
|
214
|
+
field_annotation, (int, float)
|
215
|
+
):
|
216
|
+
getter = f"{field_annotation.__name__}({getter})"
|
217
|
+
elif isinstance(field_annotation, type) and issubclass(
|
218
|
+
field_annotation, Enum
|
219
|
+
):
|
220
|
+
globals[field_annotation.__name__] = field_annotation
|
221
|
+
getter = f"{field_annotation.__name__}({getter})"
|
222
|
+
# TODO int | None, list[Enum], etc.
|
223
|
+
# TODO ensure it's a list if annotated as list
|
224
|
+
assignment = f"self.{field_name} = {getter}"
|
225
|
+
# TODO necessary conversions, proper exceptions
|
226
|
+
case None:
|
227
|
+
argument = f"{field_name}: {cls._format_type_arg(field_annotation, optional=False)}"
|
228
|
+
assignment = f"self.{field_name} = {field_name}"
|
229
|
+
case property():
|
230
|
+
argument = None
|
231
|
+
assignment = ""
|
232
|
+
case other:
|
233
|
+
raise TypeError(f"unsupported field: {field_value}")
|
234
|
+
return argument, assignment
|
235
|
+
|
236
|
+
def _format_type_arg(cls, type: str | Type, optional: bool) -> str:
|
237
|
+
if isinstance(type, str):
|
238
|
+
return f'"{type} | None"' if optional else type
|
239
|
+
try:
|
240
|
+
if type.__module__ == "builtins":
|
241
|
+
return f"{type.__name__} | None" if optional else type.__name__
|
242
|
+
return f'"{type.__module__}.{type.__name__}"'
|
243
|
+
except AttributeError:
|
244
|
+
# TODO Properly support Optional, Union, etc., especially with built-in types
|
245
|
+
return f'"{str(type)} | None"' if optional else repr(str(type))
|
246
|
+
|
247
|
+
|
248
|
+
class Feature(metaclass=FeatureMetaclass):
|
11
249
|
sequence_id: str
|
12
250
|
source: str
|
13
251
|
type_: str
|
@@ -25,47 +263,43 @@ class Feature:
|
|
25
263
|
phase: int | None
|
26
264
|
attributes: dict[str, str]
|
27
265
|
|
28
|
-
|
29
|
-
|
30
|
-
children: tuple["Feature", ...]
|
31
|
-
|
266
|
+
def __repr__(self) -> str:
|
267
|
+
return f"{type(self).__name__}({self.sequence_id}:{self.start_c}-{self.end_c})"
|
32
268
|
|
33
|
-
# Custom types for particular kinds of features:
|
34
269
|
|
270
|
+
def id_field(source: Source) -> Field:
    """Declare the field holding a feature's ID, read from *source*."""
    id_metadata = {"id_attribute_name": source}
    return dataclass_field(metadata=id_metadata)
|
35
272
|
|
36
|
-
@dataclass(frozen=True)
|
37
|
-
class Gene(Feature):
|
38
|
-
name: str
|
39
|
-
biotype: str
|
40
|
-
transcripts: tuple["Transcript", ...]
|
41
273
|
|
274
|
+
def no_id_field() -> Field:
    """Declare an ID field for a feature type that has no ID attribute."""
    id_metadata = {"id_attribute_name": None}
    return dataclass_field(metadata=id_metadata)
|
42
276
|
|
43
|
-
@dataclass(frozen=True)
|
44
|
-
class Transcript(Feature):
|
45
|
-
gene: Gene
|
46
|
-
exons: tuple["Exon", ...]
|
47
277
|
|
278
|
+
# Sentinel meaning "no default was supplied". Compared by identity, so that
# None itself can be passed as a real default value.
_no_default = object()
|
48
279
|
|
49
|
-
@dataclass(frozen=True)
|
50
|
-
class Exon(Feature):
|
51
|
-
gene: Gene
|
52
|
-
transcript: Transcript
|
53
|
-
cds: "CDS | None"
|
54
280
|
|
281
|
+
def field(
    source: Source, *, default: Any = _no_default, default_factory: Any = _no_default
) -> Field:
    """Declare a field whose value is read from attribute(s) *source*.

    *default* and *default_factory* are recorded in the field metadata only
    when they are actually supplied.
    """
    optional_entries = (
        ("attribute_default", default),
        ("attribute_default_factory", default_factory),
    )
    spec = {"attribute_name": source}
    spec.update(
        (entry_name, entry_value)
        for entry_name, entry_value in optional_entries
        if entry_value is not _no_default
    )
    return dataclass_field(metadata=spec)
|
55
290
|
|
56
|
-
@dataclass(frozen=True)
|
57
|
-
class UTR(Feature):
|
58
|
-
gene: Gene
|
59
|
-
transcript: Transcript
|
60
291
|
|
292
|
+
def relation(source: Source, *, one_to_one: bool = False) -> tuple[Field, Field]:
    """Create the two fields describing a relation between feature types.

    Returns ``(forward_field, inverse_field)``.  The forward field carries a
    ``Relation`` keyed by *source*; the inverse field carries the matching
    ``InverseRelation``.  The two objects reference each other.
    """
    forward = Relation(id_attribute_source=source)
    backward = InverseRelation(inverse=forward, one_to_one=one_to_one)
    forward.inverse = backward

    return (
        dataclass_field(metadata={"relation": forward}),
        dataclass_field(metadata={"relation": backward}),
    )
|
65
300
|
|
66
301
|
|
67
302
|
# A named group of feature classes. Frozen: the attributes cannot be
# reassigned, although the list object itself stays mutable.
@dataclass(frozen=True)
class Dialect:
    # Name identifying the dialect.
    name: str

    # Feature classes that belong to this dialect.
    feature_types: list[Type[Feature]]
|
biofiles/utility/cli.py
CHANGED
@@ -2,7 +2,8 @@ from dataclasses import dataclass
|
|
2
2
|
from pathlib import Path
|
3
3
|
from typing import TypeAlias, Callable, Any, Literal, Type
|
4
4
|
|
5
|
-
from biofiles.types.feature import Feature
|
5
|
+
from biofiles.types.feature import Feature
|
6
|
+
from biofiles.dialects.genomic_base import Gene, Transcript, UTR, Exon
|
6
7
|
|
7
8
|
FeatureFilter: TypeAlias = Callable[[Feature], bool]
|
8
9
|
FeatureMapper: TypeAlias = Callable[[Feature], Any]
|