biofiles 0.0.13__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biofiles/gtf.py CHANGED
@@ -1,23 +1,45 @@
1
1
  __all__ = ["GTFReader", "GTFWriter"]
2
2
 
3
3
  import sys
4
+ from pathlib import Path
4
5
  from typing import Iterator
5
6
 
6
7
  from biofiles.common import Writer
7
- from biofiles.gff import GFFReader
8
- from biofiles.types.feature import Gene, Exon, Feature, UTR, CDS
8
+ from biofiles.dialects.detector import detect_dialect
9
+ from biofiles.dialects.genomic_base import Gene, Exon, Feature, CDS, UTR
10
+ from biofiles.gff import RawGFFReader
11
+ from biofiles.utility.feature import FeatureReader, RawFeatureReader, FeatureDraft
9
12
 
10
13
 
11
- class GTFReader(GFFReader):
12
- def __iter__(self) -> Iterator[Feature]:
14
+ class RawGTFReader(RawGFFReader):
15
def __iter__(self) -> Iterator[FeatureDraft]:
    """Iterate over the feature drafts read from the GTF input.

    Simply delegates to ``self._read_gff3()``. That method is not defined
    in this class -- presumably it is inherited from ``RawGFFReader`` and
    calls back into ``_parse_attributes`` for the attribute column
    (TODO confirm against the base class).
    """
    yield from self._read_gff3()
14
17
 
15
- def _parse_attributes(self, line: str, attributes_str: str) -> dict[str, str]:
16
- return {
17
- k: v.removeprefix('"').removesuffix('"').replace(r"\"", '"')
18
- for part in attributes_str.strip(";").split(";")
19
- for k, v in (part.strip().split(None, 1),)
20
- }
18
+ def _parse_attributes(
19
+ self, line: str, attributes_str: str
20
+ ) -> dict[str, str | list[str]]:
21
+ try:
22
+ result: dict[str, str | list[str]] = {}
23
+ for part in attributes_str.strip().strip(";").split(";"):
24
+ k, v = part.strip().split(None, 1)
25
+ v = v.removeprefix('"').removesuffix('"').replace(r"\"", '"')
26
+ if k in result:
27
+ if not isinstance(result[k], list):
28
+ result[k] = [result[k]]
29
+ result[k].append(v)
30
+ else:
31
+ result[k] = v
32
+ return result
33
+ except ValueError as exc:
34
+ raise ValueError(
35
+ f"failed to parse attribute string {attributes_str!r}: {exc}"
36
+ ) from exc
37
+
38
+
39
class GTFReader(FeatureReader):
    """Feature reader for GTF input.

    The only GTF-specific part is the choice of raw reader: a
    ``RawGTFReader`` built over ``self._input``. How the base
    ``FeatureReader`` turns the raw drafts into features is not visible
    here.
    """

    def _make_raw_feature_reader(self) -> RawFeatureReader:
        # Wrap this reader's input in the GTF-flavoured raw reader.
        return RawGTFReader(self._input)
21
43
 
22
44
 
23
45
  class GTFWriter(Writer):
@@ -42,7 +64,8 @@ class GTFWriter(Writer):
42
64
 
43
65
  if __name__ == "__main__":
44
66
  for path in sys.argv[1:]:
45
- with GTFReader(path) as r:
67
+ dialect = detect_dialect(Path(path))
68
+ with GTFReader(path, dialect=dialect) as r:
46
69
  total_features = 0
47
70
  annotated_genes = 0
48
71
  annotated_exons = 0
biofiles/repeatmasker.py CHANGED
@@ -11,11 +11,15 @@ __all__ = ["RepeatMaskerReader"]
11
11
 
12
12
  class RepeatMaskerReader(Reader):
13
13
  def __iter__(self) -> Iterator[Repeat]:
14
+ has_passed_header = False
14
15
  for line in self._input:
15
- parts = line.split("\t")
16
+ parts = line.split()
16
17
  if not (14 <= len(parts) <= 15):
17
18
  # Probably some metainfo. No way to tell.
18
19
  continue
20
+ if not has_passed_header and ("SW" in parts or "score" in parts):
21
+ continue
22
+ has_passed_header = True
19
23
 
20
24
  (
21
25
  sw_score_str,
biofiles/types/feature.py CHANGED
@@ -1,13 +1,251 @@
1
- from dataclasses import dataclass
1
+ from dataclasses import dataclass, Field, field as dataclass_field
2
+ from enum import Enum
3
+ from typing import dataclass_transform, Type, Any, TypeAlias
4
+ from uuid import uuid4
2
5
 
3
6
  from biofiles.common import Strand
4
7
 
8
+ Source: TypeAlias = str | tuple[str, ...]
5
9
 
6
- __all__ = ["Feature", "Gene", "Transcript", "Exon", "UTR", "ThreePrimeUTR"]
7
10
 
11
+ @dataclass
12
+ class Relation:
13
+ """Equivalent of SQL foreign key — declarative description
14
+ of a relation between two types of features."""
8
15
 
9
- @dataclass(frozen=True)
10
- class Feature:
16
+ id_attribute_source: Source
17
+ """ Name of GTF/GFF attribute(s) which contains related feature ID. """
18
+
19
+ inverse: "InverseRelation | None" = None
20
+
21
+ class_: type | None = None
22
+ """ Python class for the related feature. """
23
+ attribute_name: str | None = None
24
+
25
+
26
@dataclass
class InverseRelation:
    """Reverse side of a ``Relation``.

    ``relation()`` creates a ``Relation`` and its ``InverseRelation``
    together and links each to the other.
    """

    # The forward relation that this object mirrors.
    inverse: Relation
    # When false, the generated __init__ defaults the attribute to an
    # empty list rather than leaving it as None.
    one_to_one: bool
    # Class that declares the inverse field; filled in by the metaclass.
    class_: type | None = None
    # Name of the field on `class_`; filled in by the metaclass.
    attribute_name: str | None = None
32
+
33
+
34
+ def get_composite_field(
35
+ attributes: dict[str, str], source: Source
36
+ ) -> str | tuple[str, ...] | None:
37
+ if source is None:
38
+ return None
39
+ if isinstance(source, str):
40
+ return attributes[source]
41
+ return tuple(attributes[attribute_name] for attribute_name in source)
42
+
43
+
44
+ @dataclass_transform()
45
+ class FeatureMetaclass(type):
46
+ __id_attribute_source__: Source | None
47
+ """ Name of GTF/GFF attribute(s) which contains the type-unique ID. """
48
+
49
+ __filter_type__: tuple[str, ...]
50
+ """ Filter by feature type ("gene", "transcript", etc.). """
51
+
52
+ __filter_starts__: Relation | None
53
+ """ Filter by start position — feature starts at the same point as related feature. """
54
+
55
+ __filter_ends__: Relation | None
56
+ """ Filter by end position — feature ends at the same point as related feature. """
57
+
58
+ __relations__: list[Relation]
59
+ """ All direct relations for this type, for faster parsing. """
60
+
61
def __new__(
    cls,
    name,
    bases,
    namespace,
    type: str | tuple[str, ...] | None = None,
    starts: Field | None = None,
    ends: Field | None = None,
):
    """Create a feature class and attach its parsing metadata.

    Beyond the regular ``name``/``bases``/``namespace`` triple, accepts
    three optional keyword arguments which are forwarded to
    ``_fill_filters``:

    * ``type``   -- feature type name(s) the class should match;
    * ``starts`` -- relation field whose ``metadata["relation"]`` becomes
      the start-position filter;
    * ``ends``   -- same, for the end-position filter.

    The keyword arguments are not passed on to ``type.__new__``.
    """
    result = super().__new__(cls, name, bases, namespace)
    # Order matters: the ID source and the relations are derived from the
    # raw namespace first; __init__ is generated last.
    result.__id_attribute_source__ = cls._find_id_attribute_source(namespace)
    result._fill_relation_classes(namespace)
    result._fill_filters(type=type, starts=starts, ends=ends)
    result._fill_slots()
    result._fill_init_method(namespace)

    # TODO generate dataclass-like __init__ method,
    #  keep all relations optional
    # NOTE(review): `_fill_init_method` above already generates an
    #  __init__ -- confirm whether this TODO is stale.

    return result
81
+
82
+ @staticmethod
83
+ def _find_id_attribute_source(namespace) -> str:
84
+ result: str | None = None
85
+ for key, value in namespace.items():
86
+ match value:
87
+ case Field(metadata={"id_attribute_name": id_attribute_source}):
88
+ if result:
89
+ raise TypeError(
90
+ f"should specify exactly one id_field() in class {result.__name__}"
91
+ )
92
+ result = id_attribute_source
93
+ return result
94
+
95
def _fill_relation_classes(cls, namespace) -> None:
    """Bind the relation objects declared in ``namespace`` to ``cls``.

    Resets ``cls.__relations__`` and then, for every dataclass ``Field``
    in the namespace whose metadata carries a ``"relation"`` entry:

    * forward ``Relation``: records the owning class and field name on
      it, copies the field's annotation (if any) onto the inverse
      relation as its ``class_``, and appends it to ``__relations__``;
    * ``InverseRelation``: records the owning class and field name only.
    """
    cls.__relations__ = []
    for key, value in namespace.items():
        match value:
            case Field(metadata={"relation": Relation() as r}):
                r.class_ = cls
                r.attribute_name = key
                if key in cls.__annotations__:
                    # TODO handle optionality and forward refs
                    # NOTE(review): assumes `r.inverse` is set, which
                    #  holds for relations built by `relation()`.
                    r.inverse.class_ = cls.__annotations__[key]
                cls.__relations__.append(r)
            case Field(metadata={"relation": InverseRelation() as r}):
                r.class_ = cls
                r.attribute_name = key
                # TODO calculating r.inverse.class_ based on type annotation
110
+
111
+ def _fill_filters(
112
+ cls,
113
+ *,
114
+ type: str | tuple[str, ...] | None = None,
115
+ starts: Field | None = None,
116
+ ends: Field | None = None,
117
+ ) -> None:
118
+ if type is not None:
119
+ cls.__filter_type__ = (type,) if isinstance(type, str) else type
120
+
121
+ cls.__filter_starts__ = None
122
+ if starts is not None:
123
+ cls.__filter_starts__ = starts.metadata["relation"]
124
+
125
+ cls.__filter_ends__ = None
126
+ if ends is not None:
127
+ cls.__filter_ends__ = ends.metadata["relation"]
128
+
129
+ def _fill_slots(cls) -> None:
130
+ cls.__slots__ = [
131
+ key
132
+ for ancestor in cls.__mro__[::-1][1:]
133
+ for key in ancestor.__annotations__
134
+ ]
135
+
136
def _fill_init_method(cls, namespace) -> None:
    """Generate ``__init__`` for ``cls`` from its annotated fields.

    Every annotated name along the MRO (root excluded) is passed to
    ``_compose_field``, which returns an optional parameter declaration
    and an assignment statement. The pieces are assembled into the source
    of an ``__init__`` function, which is compiled with ``exec`` and
    installed on the class. ``namespace`` is currently unused.
    """
    default_arguments: list[str] = []
    non_default_arguments: list[str] = []
    assignments: list[str] = []
    # Globals for the generated function; `_compose_field` adds default
    # values and helper callables to it.
    globals: dict[str, Any] = {}

    # Map each field name to the most derived class that annotates it
    # (the MRO is walked from `cls` upwards and the first hit is kept).
    key_to_ancestor: dict[str, Type] = {}
    for ancestor in cls.__mro__[:-1]:
        for key, value in ancestor.__annotations__.items():
            key_to_ancestor.setdefault(key, ancestor)

    # Walk from the most basic ancestor down to `cls`, so inherited
    # fields come first in the generated signature and body.
    for ancestor in cls.__mro__[::-1][1:]:
        for key, value in ancestor.__annotations__.items():
            if key_to_ancestor[key] is not ancestor:
                # Overridden in a descendant class.
                continue

            field_value = getattr(cls, key, None)
            argument, assignment = cls._compose_field(
                key, value, field_value, globals
            )

            # Parameters with a default are recognised by their
            # " = None" suffix and must follow the ones without.
            if argument and argument.endswith(" = None"):
                default_arguments.append(argument)
            elif argument:
                non_default_arguments.append(argument)
            assignments.append(assignment)

    body = "\n ".join(assignments)
    all_arguments = [*non_default_arguments, *default_arguments]
    source_code = f"def __init__(self, {', '.join(all_arguments)}):\n {body}"
    locals = {}
    # The source is built only from class definitions (field names,
    # annotations, metadata), not from file contents.
    exec(source_code, globals, locals)
    cls.__init__ = locals["__init__"]
170
+
171
def _compose_field(
    cls,
    field_name: str,
    field_annotation: Any,
    field_value: Field | None,
    globals: dict[str, Any],
) -> tuple[str | None, str]:
    """Produce the ``__init__`` source fragments for one field.

    Args:
        field_name: Name of the annotated attribute.
        field_annotation: Its annotation (a type object or a string).
        field_value: The class-level value bound to the name, or ``None``
            if there is none.
        globals: Globals dict of the generated ``__init__``; default
            values, default factories, enum classes and
            ``get_composite_field`` are registered here as needed.

    Returns:
        ``(argument, assignment)``: the parameter declaration, or
        ``None`` when the field is not a constructor parameter, and the
        statement that sets the attribute (empty for properties).

    Raises:
        NotImplementedError: For a composite (non-string) attribute
            source combined with a default or default factory.
        TypeError: If ``field_value`` is none of the supported kinds.
    """
    argument: str | None
    assignment: str
    match field_value:
        case Field(metadata={"relation": r}):
            # Relations are optional keyword parameters.
            argument = f"{field_name}: {cls._format_type_arg(field_annotation, optional=True)} = None"
            if isinstance(r, InverseRelation) and not r.one_to_one:
                # "Many" side of a relation: start with an empty list.
                assignment = f"self.{field_name} = {field_name} if {field_name} is not None else []"
            else:
                assignment = f"self.{field_name} = {field_name}"
        case Field(metadata={"id_attribute_name": None}):
            # Declared via no_id_field(): the attribute is always None.
            argument = None
            assignment = f"self.{field_name} = None"
        case Field(metadata={"attribute_name": attribute_name}) | Field(
            metadata={"id_attribute_name": attribute_name}
        ):
            # Value is read from the `attributes` name inside the
            # generated __init__ rather than passed as a parameter.
            default = field_value.metadata.get("attribute_default", _no_default)
            default_factory = field_value.metadata.get(
                "attribute_default_factory", _no_default
            )
            # Unique name under which the default is stored in `globals`.
            default_variable_name = f"default_{uuid4().hex}"
            argument = None
            if isinstance(attribute_name, str):
                if default is not _no_default:
                    globals[default_variable_name] = default
                    getter = f"attributes.get({repr(attribute_name)}, {default_variable_name})"
                elif default_factory is not _no_default:
                    # The factory call is an argument of .get(), so it is
                    # evaluated even when the attribute is present.
                    globals[default_variable_name] = default_factory
                    getter = f"attributes.get({repr(attribute_name)}, {default_variable_name}())"
                else:
                    getter = f"attributes[{repr(attribute_name)}]"
            else:
                if default is not _no_default or default_factory is not _no_default:
                    raise NotImplementedError()
                globals["get_composite_field"] = get_composite_field
                getter = f"get_composite_field(attributes, {repr(attribute_name)})"
            # Wrap the getter in a conversion for numeric and enum
            # annotations given as plain classes.
            if isinstance(field_annotation, type) and issubclass(
                field_annotation, (int, float)
            ):
                getter = f"{field_annotation.__name__}({getter})"
            elif isinstance(field_annotation, type) and issubclass(
                field_annotation, Enum
            ):
                globals[field_annotation.__name__] = field_annotation
                getter = f"{field_annotation.__name__}({getter})"
            # TODO int | None, list[Enum], etc.
            # TODO ensure it's a list if annotated as list
            assignment = f"self.{field_name} = {getter}"
            # TODO necessary conversions, proper exceptions
        case None:
            # Plain annotated field: a required constructor parameter.
            argument = f"{field_name}: {cls._format_type_arg(field_annotation, optional=False)}"
            assignment = f"self.{field_name} = {field_name}"
        case property():
            # Properties compute their value; nothing to assign.
            argument = None
            assignment = ""
        case other:
            raise TypeError(f"unsupported field: {field_value}")
    return argument, assignment
235
+
236
+ def _format_type_arg(cls, type: str | Type, optional: bool) -> str:
237
+ if isinstance(type, str):
238
+ return f'"{type} | None"' if optional else type
239
+ try:
240
+ if type.__module__ == "builtins":
241
+ return f"{type.__name__} | None" if optional else type.__name__
242
+ return f'"{type.__module__}.{type.__name__}"'
243
+ except AttributeError:
244
+ # TODO Properly support Optional, Union, etc., especially with built-in types
245
+ return f'"{str(type)} | None"' if optional else repr(str(type))
246
+
247
+
248
+ class Feature(metaclass=FeatureMetaclass):
11
249
  sequence_id: str
12
250
  source: str
13
251
  type_: str
@@ -25,47 +263,43 @@ class Feature:
25
263
  phase: int | None
26
264
  attributes: dict[str, str]
27
265
 
28
- id: str | None
29
- parent: "Feature | None"
30
- children: tuple["Feature", ...]
31
-
266
+ def __repr__(self) -> str:
267
+ return f"{type(self).__name__}({self.sequence_id}:{self.start_c}-{self.end_c})"
32
268
 
33
- # Custom types for particular kinds of features:
34
269
 
270
def id_field(source: Source) -> Field:
    """Declare the field holding a feature's ID.

    ``source`` names the attribute (or attributes) the ID is read from;
    it is stored under the ``"id_attribute_name"`` metadata key.
    """
    id_metadata = {"id_attribute_name": source}
    return dataclass_field(metadata=id_metadata)
35
272
 
36
- @dataclass(frozen=True)
37
- class Gene(Feature):
38
- name: str
39
- biotype: str
40
- transcripts: tuple["Transcript", ...]
41
273
 
274
def no_id_field() -> Field:
    """Declare an ID field for a feature type that has no ID.

    The ``"id_attribute_name"`` metadata key is present but set to
    ``None``.
    """
    id_metadata = {"id_attribute_name": None}
    return dataclass_field(metadata=id_metadata)
42
276
 
43
- @dataclass(frozen=True)
44
- class Transcript(Feature):
45
- gene: Gene
46
- exons: tuple["Exon", ...]
47
277
 
278
+ _no_default = object()
48
279
 
49
- @dataclass(frozen=True)
50
- class Exon(Feature):
51
- gene: Gene
52
- transcript: Transcript
53
- cds: "CDS | None"
54
280
 
281
def field(
    source: Source, *, default: Any = _no_default, default_factory: Any = _no_default
) -> Field:
    """Declare a field populated from record attributes.

    ``source`` is stored under ``"attribute_name"``. ``default`` and
    ``default_factory`` are stored under ``"attribute_default"`` and
    ``"attribute_default_factory"`` only when they were actually passed.
    """
    optional_entries = (
        ("attribute_default", default),
        ("attribute_default_factory", default_factory),
    )
    provided = {
        key: value for key, value in optional_entries if value is not _no_default
    }
    return dataclass_field(metadata={"attribute_name": source, **provided})
55
290
 
56
- @dataclass(frozen=True)
57
- class UTR(Feature):
58
- gene: Gene
59
- transcript: Transcript
60
291
 
292
+ def relation(source: Source, *, one_to_one: bool = False) -> tuple[Field, Field]:
293
+ forward_relation = Relation(id_attribute_source=source)
294
+ inverse_relation = InverseRelation(inverse=forward_relation, one_to_one=one_to_one)
295
+ forward_relation.inverse = inverse_relation
61
296
 
62
- @dataclass(frozen=True)
63
- class ThreePrimeUTR(UTR):
64
- pass
297
+ forward_field = dataclass_field(metadata={"relation": forward_relation})
298
+ inverse_field = dataclass_field(metadata={"relation": inverse_relation})
299
+ return forward_field, inverse_field
65
300
 
66
301
 
67
302
  @dataclass(frozen=True)
68
- class CDS(Feature):
69
- gene: Gene
70
- transcript: Transcript
71
- exon: Exon
303
+ class Dialect:
304
+ name: str
305
+ feature_types: list[Type[Feature]]
biofiles/utility/cli.py CHANGED
@@ -2,7 +2,8 @@ from dataclasses import dataclass
2
2
  from pathlib import Path
3
3
  from typing import TypeAlias, Callable, Any, Literal, Type
4
4
 
5
- from biofiles.types.feature import Feature, Gene, Transcript, UTR, Exon
5
+ from biofiles.types.feature import Feature
6
+ from biofiles.dialects.genomic_base import Gene, Transcript, UTR, Exon
6
7
 
7
8
  FeatureFilter: TypeAlias = Callable[[Feature], bool]
8
9
  FeatureMapper: TypeAlias = Callable[[Feature], Any]