biofiles 0.0.8__tar.gz → 0.0.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {biofiles-0.0.8 → biofiles-0.0.10}/PKG-INFO +1 -1
  2. biofiles-0.0.10/biofiles/bam.py +199 -0
  3. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles/gff.py +18 -4
  4. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles/gtf.py +26 -4
  5. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles/repeatmasker.py +2 -2
  6. biofiles-0.0.10/biofiles/types/alignment.py +76 -0
  7. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles/types/feature.py +1 -2
  8. biofiles-0.0.10/biofiles/utility/__init__.py +0 -0
  9. biofiles-0.0.10/biofiles/utility/cli.py +126 -0
  10. {biofiles-0.0.8/biofiles → biofiles-0.0.10/biofiles/utility}/feature.py +29 -16
  11. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles.egg-info/PKG-INFO +1 -1
  12. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles.egg-info/SOURCES.txt +6 -2
  13. {biofiles-0.0.8 → biofiles-0.0.10}/pyproject.toml +1 -1
  14. {biofiles-0.0.8 → biofiles-0.0.10}/LICENSE +0 -0
  15. {biofiles-0.0.8 → biofiles-0.0.10}/README.md +0 -0
  16. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles/__init__.py +0 -0
  17. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles/common.py +0 -0
  18. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles/fasta.py +0 -0
  19. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles/types/__init__.py +0 -0
  20. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles/types/repeat.py +0 -0
  21. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles/types/sequence.py +0 -0
  22. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles.egg-info/dependency_links.txt +0 -0
  23. {biofiles-0.0.8 → biofiles-0.0.10}/biofiles.egg-info/top_level.txt +0 -0
  24. {biofiles-0.0.8 → biofiles-0.0.10}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: biofiles
3
- Version: 0.0.8
3
+ Version: 0.0.10
4
4
  Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
5
  Author-email: Tigran Saluev <tigran@saluev.com>
6
6
  Maintainer-email: Tigran Saluev <tigran@saluev.com>
@@ -0,0 +1,199 @@
1
+ import gzip
2
+ import struct
3
+ import sys
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+ from types import TracebackType
7
+ from typing import Iterator, Any
8
+
9
+ from biofiles.types.alignment import (
10
+ ReferenceSequence,
11
+ Alignment,
12
+ BAMTag,
13
+ CIGAR,
14
+ CIGAROpKind,
15
+ CIGAROperation,
16
+ )
17
+
18
+
19
+ class BAMReader:
20
+ def __init__(self, input_: BytesIO | Path | str) -> None:
21
+ if isinstance(input_, Path | str):
22
+ input_ = open(input_, "rb")
23
+ self._input = input_
24
+ self._ungzipped_input = gzip.open(input_)
25
+
26
+ self._header_text: str | None = None
27
+ self._ref_seqs: list[ReferenceSequence] = []
28
+
29
+ self._read_header()
30
+
31
+ def _read_header(self) -> None:
32
+ magic_bytes = self._ungzipped_input.read(8)
33
+ magic_data = struct.unpack("<ccccI", magic_bytes)
34
+ if b"".join(magic_data[:4]) != b"BAM\1":
35
+ raise ValueError("not a BAM file, invalid magic bytes")
36
+
37
+ header_text_length = magic_data[-1]
38
+ self._header_text = self._ungzipped_input.read(header_text_length)
39
+ (num_ref_seqs,) = struct.unpack("<I", self._ungzipped_input.read(4))
40
+
41
+ for _ in range(num_ref_seqs):
42
+ (ref_seq_name_length,) = struct.unpack("<I", self._ungzipped_input.read(4))
43
+ ref_seq_name = self._ungzipped_input.read(ref_seq_name_length)
44
+ (ref_seq_length,) = struct.unpack("<I", self._ungzipped_input.read(4))
45
+ ref_seq = ReferenceSequence(
46
+ id=ref_seq_name.rstrip(b"\0").decode("ascii"), length=ref_seq_length
47
+ )
48
+ self._ref_seqs.append(ref_seq)
49
+
50
+ def __iter__(self) -> Iterator[Alignment]:
51
+ return self
52
+
53
+ def __next__(self) -> Alignment:
54
+ block_size_bytes = self._ungzipped_input.read(4)
55
+ if not block_size_bytes:
56
+ raise StopIteration
57
+
58
+ (block_length,) = struct.unpack("<I", block_size_bytes)
59
+
60
+ body_format = "<iiBBHHHIiii"
61
+ body_bytes = self._ungzipped_input.read(struct.calcsize(body_format))
62
+ (
63
+ ref_seq_idx,
64
+ pos,
65
+ read_name_length,
66
+ mapping_quality,
67
+ bai_index_bin,
68
+ num_cigar_ops,
69
+ flags,
70
+ seq_length,
71
+ next_ref_seq_idx,
72
+ next_pos,
73
+ template_length,
74
+ ) = struct.unpack(body_format, body_bytes)
75
+ read_name_bytes = self._ungzipped_input.read(read_name_length)
76
+
77
+ cigar_format = "<" + "I" * num_cigar_ops
78
+ cigar_bytes = self._ungzipped_input.read(struct.calcsize(cigar_format))
79
+ encoded_cigar = struct.unpack(cigar_format, cigar_bytes)
80
+
81
+ seq_bytes = self._ungzipped_input.read((seq_length + 1) // 2)
82
+ encoded_seq = struct.unpack("<" + "B" * len(seq_bytes), seq_bytes)
83
+
84
+ quality = self._ungzipped_input.read(seq_length).decode("ascii")
85
+
86
+ remaining_length = (
87
+ block_length
88
+ - len(body_bytes)
89
+ - len(read_name_bytes)
90
+ - len(cigar_bytes)
91
+ - len(seq_bytes)
92
+ - len(quality)
93
+ )
94
+
95
+ tags: list[BAMTag] = []
96
+ while remaining_length > 0:
97
+ tag, used_length = self._read_tag()
98
+ tags.append(tag)
99
+ remaining_length -= used_length
100
+ if remaining_length < 0:
101
+ raise ValueError("invalid BAM file, wrong tag length")
102
+
103
+ ref_seq = self._ref_seqs[ref_seq_idx] if ref_seq_idx >= 0 else None
104
+ next_ref_seq = (
105
+ self._ref_seqs[next_ref_seq_idx] if next_ref_seq_idx >= 0 else None
106
+ )
107
+ return Alignment(
108
+ reference_sequence=ref_seq,
109
+ start_c=pos,
110
+ read_name=read_name_bytes.rstrip(b"\0").decode("utf-8"),
111
+ mapping_quality=mapping_quality,
112
+ bai_index_bin=bai_index_bin,
113
+ next_reference_sequence=next_ref_seq,
114
+ next_start_c=next_pos,
115
+ template_length=template_length,
116
+ cigar=self._decode_cigar(encoded_cigar),
117
+ read_sequence=self._decode_seq(encoded_seq),
118
+ quality=quality,
119
+ bam_flags=flags,
120
+ bam_tags=tuple(tags),
121
+ )
122
+
123
+ def _decode_cigar(self, encoded_cigar: tuple[int, ...]) -> CIGAR:
124
+ return CIGAR(
125
+ operations=tuple(
126
+ CIGAROperation(kind=_BAM_CIGAR_OP_KINDS[item & 0b1111], count=item >> 4)
127
+ for item in encoded_cigar
128
+ )
129
+ )
130
+
131
+ def _decode_seq(self, encoded_seq: tuple[int, ...]) -> str:
132
+ return "".join(
133
+ f"{_BAM_SEQUENCE_LETTERS[b >> 4]}{_BAM_SEQUENCE_LETTERS[b & 15]}"
134
+ for b in encoded_seq
135
+ )
136
+
137
+ def _read_tag(self) -> tuple[BAMTag, int]:
138
+ tag = self._ungzipped_input.read(2).decode("ascii")
139
+ value_type = self._ungzipped_input.read(1)
140
+ value, value_length = self._read_tag_value(value_type)
141
+ return BAMTag(tag=tag, value=value), 3 + value_length
142
+
143
+ def _read_tag_value(self, value_type: bytes) -> tuple[Any, int]:
144
+ if value_type in (b"Z", b"H"):
145
+ characters: list[bytes] = []
146
+ last_character = b""
147
+ while last_character != b"\0":
148
+ characters.append(last_character)
149
+ last_character = self._ungzipped_input.read(1)
150
+ value = b"".join(characters).decode("utf-8")
151
+ return value, len(characters)
152
+
153
+ elif value_type == b"B":
154
+ subtype, count = struct.unpack("<cI", self._ungzipped_input.read(5))
155
+ format_ = "<" + _BAM_FORMAT_TO_STRUCT_FORMAT[subtype] * count
156
+ length = struct.calcsize(format_)
157
+ value = struct.unpack(format_, self._ungzipped_input.read(length))
158
+ return value, 5 + length
159
+
160
+ else:
161
+ format_ = "<" + _BAM_FORMAT_TO_STRUCT_FORMAT[value_type]
162
+ length = struct.calcsize(format_)
163
+ (value,) = struct.unpack(format_, self._ungzipped_input.read(length))
164
+ return value, length
165
+
166
+ def __enter__(self):
167
+ self._input.__enter__()
168
+ return self
169
+
170
+ def __exit__(
171
+ self,
172
+ exc_type: type[BaseException] | None,
173
+ exc_val: BaseException | None,
174
+ exc_tb: TracebackType | None,
175
+ ) -> None:
176
+ self._input.__exit__(exc_type, exc_val, exc_tb)
177
+
178
+
179
+ _BAM_FORMAT_TO_STRUCT_FORMAT = {
180
+ b"A": "c",
181
+ b"c": "b",
182
+ b"C": "B",
183
+ b"s": "h",
184
+ b"S": "H",
185
+ b"i": "i",
186
+ b"I": "I",
187
+ b"f": "f",
188
+ }
189
+
190
+ _BAM_CIGAR_OP_KINDS: list[CIGAROpKind] = ["M", "I", "D", "N", "S", "H", "P", "=", "X"]
191
+ _BAM_SEQUENCE_LETTERS = "=ACMGRSVTWYHKDBN"
192
+
193
if __name__ == "__main__":
    # Command-line smoke test: count the records of every file named in argv.
    for path in sys.argv[1:]:
        with BAMReader(path) as reader:
            num_alignments = sum(1 for _ in reader)
        print(f"Parsed {num_alignments} alignments from {path}")
@@ -3,7 +3,8 @@ from pathlib import Path
3
3
  from typing import Iterator, cast, TextIO
4
4
 
5
5
  from biofiles.common import Strand, Writer
6
- from biofiles.feature import FeatureReader, FeatureDraft, FeatureDrafts
6
+ from biofiles.utility.cli import parse_pipeline_args
7
+ from biofiles.utility.feature import FeatureReader, FeatureDraft, FeatureDrafts
7
8
  from biofiles.types.feature import Feature, Gene, Exon, UTR
8
9
 
9
10
  __all__ = ["GFFReader", "GFF3Writer"]
@@ -137,7 +138,15 @@ _VERSION_PREFIX = "##gff-version "
137
138
 
138
139
 
139
140
  if __name__ == "__main__":
140
- for path in sys.argv[1:]:
141
+ pipeline = parse_pipeline_args(sys.argv[1:])
142
+ if pipeline.mapper is None:
143
+ writer = GFF3Writer(sys.stdout)
144
+ pipeline.mapper = writer.write
145
+ else:
146
+ old_mapper = pipeline.mapper
147
+ pipeline.mapper = lambda f: print(old_mapper(f))
148
+
149
+ for path in pipeline.inputs:
141
150
  with GFFReader(path) as r:
142
151
  total_features = 0
143
152
  annotated_genes = 0
@@ -148,15 +157,20 @@ if __name__ == "__main__":
148
157
  parsed_utrs = 0
149
158
  for feature in r:
150
159
  total_features += 1
151
- annotated_genes += feature.type_ == "gene"
160
+ annotated_genes += "gene" in feature.type_.lower()
152
161
  annotated_exons += feature.type_ == "exon"
153
162
  annotated_utrs += "utr" in feature.type_.lower()
154
163
  parsed_genes += isinstance(feature, Gene)
155
164
  parsed_exons += isinstance(feature, Exon)
156
165
  parsed_utrs += isinstance(feature, UTR)
166
+
167
+ if pipeline.filter(feature):
168
+ pipeline.map(feature)
169
+
157
170
  print(
158
171
  f"{path}: {total_features} features, "
159
172
  f"{parsed_genes} genes parsed out of {annotated_genes}, "
160
173
  f"{parsed_exons} exons parsed out of {annotated_exons}, "
161
- f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
174
+ f"{parsed_utrs} UTRs parsed out of {annotated_utrs}",
175
+ file=sys.stderr,
162
176
  )
@@ -1,8 +1,9 @@
1
- __all__ = ["GTFReader"]
1
+ __all__ = ["GTFReader", "GTFWriter"]
2
2
 
3
3
  import sys
4
4
  from typing import Iterator
5
5
 
6
+ from biofiles.common import Writer
6
7
  from biofiles.gff import GFFReader
7
8
  from biofiles.types.feature import Gene, Exon, Feature, UTR
8
9
 
@@ -13,12 +14,32 @@ class GTFReader(GFFReader):
13
14
 
14
15
  def _parse_attributes(self, line: str, attributes_str: str) -> dict[str, str]:
15
16
  return {
16
- k: v.strip('"')
17
+ k: v.removeprefix('"').removesuffix('"').replace(r"\"", '"')
17
18
  for part in attributes_str.strip(";").split(";")
18
19
  for k, v in (part.strip().split(None, 1),)
19
20
  }
20
21
 
21
22
 
23
class GTFWriter(Writer):
    """Writer that serializes features as tab-separated GTF lines."""

    def write(self, feature: Feature) -> None:
        """Write *feature* as one GTF line to the writer's output.

        The start coordinate is emitted as ``start_c + 1`` and the end as
        ``end_c``. Missing score, strand or phase are written as ``"."``.
        Every attribute is written as ``key "value";`` (terminated by a
        semicolon, as GTF requires) with double quotes inside the value
        escaped as ``\\"``.
        """
        attributes = " ".join(
            f'{key} "' + value.replace('"', r"\"") + '";'
            for key, value in feature.attributes.items()
        )
        fields = (
            feature.sequence_id,
            feature.source,
            feature.type_,
            str(feature.start_c + 1),
            str(feature.end_c),
            str(feature.score) if feature.score is not None else ".",
            str(feature.strand) if feature.strand is not None else ".",
            str(feature.phase) if feature.phase is not None else ".",
            attributes,
        )
        self._output.write("\t".join(fields) + "\n")
41
+
42
+
22
43
  if __name__ == "__main__":
23
44
  for path in sys.argv[1:]:
24
45
  with GTFReader(path) as r:
@@ -31,7 +52,7 @@ if __name__ == "__main__":
31
52
  parsed_utrs = 0
32
53
  for feature in r:
33
54
  total_features += 1
34
- annotated_genes += feature.type_ == "gene"
55
+ annotated_genes += "gene" in feature.type_.lower()
35
56
  annotated_exons += feature.type_ == "exon"
36
57
  annotated_utrs += "utr" in feature.type_.lower()
37
58
  parsed_genes += isinstance(feature, Gene)
@@ -41,5 +62,6 @@ if __name__ == "__main__":
41
62
  f"{path}: {total_features} features, "
42
63
  f"{parsed_genes} genes parsed out of {annotated_genes}, "
43
64
  f"{parsed_exons} exons parsed out of {annotated_exons}, "
44
- f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
65
+ f"{parsed_utrs} UTRs parsed out of {annotated_utrs}",
66
+ file=sys.stderr,
45
67
  )
@@ -1,6 +1,6 @@
1
1
  import sys
2
2
  from collections import Counter
3
- from typing import Iterator
3
+ from typing import Iterator, cast, Literal
4
4
 
5
5
  from biofiles.common import Reader
6
6
  from biofiles.types.repeat import Repeat
@@ -42,7 +42,7 @@ class RepeatMaskerReader(Reader):
42
42
  seq_start = int(seq_start_str)
43
43
  seq_end = int(seq_end_str)
44
44
  seq_left = int(seq_left_str[1:-1])
45
- strand = {"+": "+", "C": "-"}[strand_str]
45
+ strand = cast(Literal["+", "-"], {"+": "+", "C": "-"}[strand_str])
46
46
 
47
47
  if "/" in repeat_class_family:
48
48
  repeat_class, repeat_family = repeat_class_family.split("/", 1)
@@ -0,0 +1,76 @@
1
+ from dataclasses import dataclass
2
+
3
+
4
# Public names of this module, including the CIGAR types and the flag enum
# that the BAM reader imports from here.
__all__ = [
    "ReferenceSequence",
    "Alignment",
    "BAMTag",
    "BAMFlag",
    "CIGAR",
    "CIGAROpKind",
    "CIGAROperation",
]
5
+
6
+ from enum import IntFlag
7
+
8
+ from typing import Any, Literal
9
+
10
+
11
@dataclass(frozen=True)
class ReferenceSequence:
    """Immutable description of one reference sequence: its name and length."""

    # Reference sequence name.
    id: str
    # Reference sequence length.
    length: int
15
+
16
+
17
+ @dataclass(frozen=True, slots=True)
18
+ class BAMTag:
19
+ tag: str
20
+ value: Any
21
+
22
+
23
+ CIGAROpKind = Literal["M", "I", "D", "N", "S", "H", "P", "=", "X"]
24
+
25
+
26
+ @dataclass(frozen=True, slots=True)
27
+ class CIGAROperation:
28
+ kind: CIGAROpKind
29
+ count: int
30
+
31
+
32
@dataclass(frozen=True)
class CIGAR:
    """Immutable sequence of CIGAR operations.

    ``str()`` renders the operations back to back as ``<count><kind>``;
    ``repr()`` wraps that text as ``CIGAR("...")``.
    """

    operations: tuple[CIGAROperation, ...]

    def __str__(self) -> str:
        rendered = ("{0.count}{0.kind}".format(op) for op in self.operations)
        return "".join(rendered)

    def __repr__(self) -> str:
        text = str(self)
        return f'CIGAR("{text}")'
41
+
42
+
43
class BAMFlag(IntFlag):
    """Named bits of the alignment flag field; members combine with ``|``."""

    MULTIPLE_SEGMENTS = 0x001
    EACH_SEGMENT_PROPERLY_ALIGNED = 0x002
    SEGMENT_UNMAPPED = 0x004
    NEXT_SEGMENT_UNMAPPED = 0x008
    READ_SEQUENCE_REVERSE_COMPLEMENTED = 0x010
    NEXT_SEGMENT_READ_SEQUENCE_REVERSE_COMPLEMENTED = 0x020
    FIRST_SEGMENT = 0x040
    LAST_SEGMENT = 0x080
    SECONDARY_SEGMENT = 0x100
    NOT_PASSING_QUALITY_CONTROL = 0x200
    DUPLICATE = 0x400
    SUPPLEMENTARY_ALIGNMENT = 0x800
56
+
57
+
58
@dataclass(frozen=True)
class Alignment:
    """Immutable record describing one alignment of a read."""

    # Reference sequence the read is aligned to, or None when there is none.
    reference_sequence: ReferenceSequence | None

    # 0-based leftmost coordinate.
    start_c: int
    read_name: str
    mapping_quality: int
    bai_index_bin: int

    # Reference sequence and start coordinate recorded for the next segment.
    next_reference_sequence: ReferenceSequence | None
    next_start_c: int
    template_length: int
    cigar: CIGAR
    read_sequence: str
    quality: str

    # Raw integer flag field and the record's optional tags.
    bam_flags: int
    bam_tags: tuple[BAMTag, ...]
@@ -26,7 +26,7 @@ class Feature:
26
26
  attributes: dict[str, str]
27
27
 
28
28
  id: str | None
29
- parent: "GFFFeature | None"
29
+ parent: "Feature | None"
30
30
  children: tuple["Feature", ...]
31
31
 
32
32
 
@@ -50,7 +50,6 @@ class Transcript(Feature):
50
50
  class Exon(Feature):
51
51
  gene: Gene
52
52
  transcript: Transcript
53
- # TODO mRNA
54
53
 
55
54
 
56
55
  @dataclass(frozen=True)
File without changes
@@ -0,0 +1,126 @@
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+ from typing import TypeAlias, Callable, Any, Literal, Type
4
+
5
+ from biofiles.types.feature import Feature, Gene, Transcript, UTR, Exon
6
+
7
+ FeatureFilter: TypeAlias = Callable[[Feature], bool]
8
+ FeatureMapper: TypeAlias = Callable[[Feature], Any]
9
+
10
+
11
+ @dataclass
12
+ class Pipeline:
13
+ inputs: list[Path]
14
+ filters: list[FeatureFilter]
15
+ mapper: FeatureMapper | None
16
+
17
+ def filter(self, feature: Feature) -> bool:
18
+ for f in self.filters:
19
+ if not f(feature):
20
+ return False
21
+ return True
22
+
23
+ def map(self, feature: Feature) -> Any:
24
+ if not self.mapper:
25
+ return feature
26
+ return self.mapper(feature)
27
+
28
+
29
+ Mode: TypeAlias = Literal["inputs", "filters", "done"]
30
+
31
+
32
def parse_pipeline_args(argv: list[str]) -> Pipeline:
    """Build a ``Pipeline`` from command-line arguments.

    Accepted shape: zero or more existing file paths, optionally ``--filter``
    followed by filter expressions, optionally ending with ``--attr KEY`` as
    the last two arguments.

    Raises:
        ValueError: If an argument does not fit the current parsing mode.
    """
    pipeline = Pipeline(inputs=[], filters=[], mapper=None)

    mode: Mode = "inputs"
    position = 0
    while position < len(argv):
        rest = argv[position:]
        head = rest[0]
        if mode == "inputs" and (input_path := Path(head)).is_file():
            pipeline.inputs.append(input_path)
            position += 1
        elif mode == "inputs" and head == "--filter":
            mode = "filters"
            position += 1
        elif mode in ("inputs", "filters") and len(rest) == 2 and head == "--attr":
            # Only recognised as the final two arguments.
            pipeline.mapper = _produce_attr_mapper(rest[1].split("."))
            mode = "done"
            position += 2
        elif mode == "filters":
            pipeline.filters.append(_parse_filter(head))
            position += 1
        else:
            raise ValueError(f"can't parse command line arguments {rest}")

    return pipeline
58
+
59
+
60
+ def _parse_filter(filter_str: str) -> FeatureFilter:
61
+ if "=" not in filter_str:
62
+ # --filter gene,transcript
63
+ type_strs = filter_str.split(",")
64
+ types = tuple(_parse_feature_type(t) for t in type_strs)
65
+ return lambda f: isinstance(f, types)
66
+
67
+ # --filter attr=value1,value2
68
+ key, value = filter_str.split("=", maxsplit=1)
69
+ values = value.split(",")
70
+ match key:
71
+ case "chromosome":
72
+ return lambda f: f.sequence_id in values
73
+ case "type":
74
+ return lambda f: f.type_ in values
75
+ case "strand":
76
+ return lambda f: f.strand in values
77
+ case _:
78
+ path = key.split(".")
79
+ return _produce_attr_filter(path, values)
80
+
81
+ raise ValueError(f"can't parse filter {filter_str!r}")
82
+
83
+
84
def _parse_feature_type(t: str) -> Type[Feature]:
    """Look up the feature class registered under the name *t*.

    Raises:
        ValueError: If *t* is not a key of ``_FEATURE_TYPES``.
    """
    try:
        return _FEATURE_TYPES[t]
    except KeyError:
        raise ValueError(f"unknown feature type {t!r}") from None
88
+
89
+
90
+ def _produce_attr_filter(path: list[str], values: list[str]) -> FeatureFilter:
91
+ assert path
92
+ if len(path) == 1:
93
+ (key,) = path
94
+ match key:
95
+ case "chromosome" | "type" | "strand" | "id":
96
+ return lambda f: getattr(f, key) in values
97
+ # TODO other attributes
98
+ case _:
99
+ return lambda f: f.attributes.get(key) in values
100
+
101
+ if path[0] not in ("gene", "transcript", "parent"):
102
+ raise ValueError(f"unknown attribute {path[-2]!r}")
103
+
104
+ nested = _produce_attr_filter(path[1:], values)
105
+ return lambda f: (nested(nf) if (nf := getattr(f, path[0], None)) else False)
106
+
107
+
108
+ def _produce_attr_mapper(path: list[str]) -> FeatureMapper:
109
+ assert path
110
+ if len(path) == 1:
111
+ (key,) = path
112
+ match key:
113
+ case "chromosome" | "type" | "strand" | "id":
114
+ return lambda f: getattr(f, key)
115
+ # TODO other attributes
116
+ case _:
117
+ return lambda f: f.attributes.get(key, "")
118
+
119
+ if path[0] not in ("gene", "transcript", "parent"):
120
+ raise ValueError(f"unknown attribute {path[-2]!r}")
121
+
122
+ nested = _produce_attr_mapper(path[1:])
123
+ return lambda f: (nested(nf) if (nf := getattr(f, path[0], None)) else None)
124
+
125
+
126
+ _FEATURE_TYPES = {"gene": Gene, "transcript": Transcript, "exon": Exon, "utr": UTR}
@@ -1,10 +1,17 @@
1
1
  from collections import deque
2
2
  from dataclasses import dataclass, field
3
3
  from pathlib import Path
4
- from typing import Iterator, TextIO, Type, TypeVar
4
+ from typing import Iterator, TextIO, Type, TypeVar, cast
5
5
 
6
6
  from biofiles.common import Reader, Strand
7
- from biofiles.types.feature import Feature, Gene, ThreePrimeUTR, Exon, UTR, Transcript
7
+ from biofiles.types.feature import (
8
+ Feature,
9
+ Gene,
10
+ ThreePrimeUTR,
11
+ Exon,
12
+ UTR,
13
+ Transcript,
14
+ )
8
15
 
9
16
 
10
17
  @dataclass
@@ -60,6 +67,12 @@ class Features:
60
67
  self.by_id[id_] = feature
61
68
 
62
69
 
70
+ FeatureT = TypeVar("FeatureT", bound=Feature)
71
+ GeneT = TypeVar("GeneT", bound=Gene)
72
+ TranscriptT = TypeVar("TranscriptT", bound=Transcript)
73
+ UTRT = TypeVar("UTRT", bound=UTR)
74
+
75
+
63
76
  class FeatureReader(Reader):
64
77
  def __init__(
65
78
  self, input_: TextIO | Path | str, /, streaming_window: int | None = 1000
@@ -107,10 +120,10 @@ class FeatureReader(Reader):
107
120
 
108
121
  def _finalize_draft(self, draft: FeatureDraft, result: Features) -> Feature:
109
122
  match draft.type_.lower():
110
- case "gene":
111
- feature = self._finalize_gene(draft, result)
112
- case "transcript":
113
- feature = self._finalize_transcript(draft, result)
123
+ case "gene" | "ncrna_gene":
124
+ feature = self._finalize_gene(draft, result, Gene)
125
+ case "transcript" | "mrna" | "lnc_rna":
126
+ feature = self._finalize_transcript(draft, result, Transcript)
114
127
  case "exon":
115
128
  feature = self._finalize_exon(draft, result)
116
129
  case "three_prime_utr":
@@ -124,19 +137,23 @@ class FeatureReader(Reader):
124
137
  object.__setattr__(feature.parent, "children", new_children)
125
138
  return feature
126
139
 
127
- def _finalize_gene(self, draft: FeatureDraft, result: Features) -> Feature:
140
+ def _finalize_gene(
141
+ self, draft: FeatureDraft, result: Features, type_: Type[GeneT]
142
+ ) -> Feature:
128
143
  feature = self._finalize_other(draft, result)
129
144
  name = draft.pick_attribute("gene_name", "Name")
130
145
  biotype = draft.pick_attribute("gene_biotype", "biotype", "gene_type")
131
146
  if name is None or biotype is None:
132
147
  return feature
133
- return Gene(**feature.__dict__, name=name, biotype=biotype, transcripts=())
148
+ return type_(**feature.__dict__, name=name, biotype=biotype, transcripts=())
134
149
 
135
- def _finalize_transcript(self, draft: FeatureDraft, result: Features) -> Feature:
150
+ def _finalize_transcript(
151
+ self, draft: FeatureDraft, result: Features, type_: Type[TranscriptT]
152
+ ) -> Feature:
136
153
  feature = self._finalize_other(draft, result)
137
154
  if not (gene := self._find_ancestor_of_type(feature, Gene)):
138
155
  return feature
139
- transcript = Transcript(**feature.__dict__, gene=gene, exons=())
156
+ transcript = type_(**feature.__dict__, gene=gene, exons=())
140
157
  object.__setattr__(gene, "transcripts", gene.transcripts + (transcript,))
141
158
  return transcript
142
159
 
@@ -148,25 +165,21 @@ class FeatureReader(Reader):
148
165
  object.__setattr__(transcript, "exons", transcript.exons + (exon,))
149
166
  return exon
150
167
 
151
- UTRT = TypeVar("UTRT", bound=UTR)
152
-
153
168
  def _finalize_utr(
154
169
  self, draft: FeatureDraft, result: Features, type_: Type[UTRT]
155
- ) -> Feature | UTRT:
170
+ ) -> Feature:
156
171
  feature = self._finalize_other(draft, result)
157
172
  if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
158
173
  return feature
159
174
  return type_(**feature.__dict__, gene=transcript.gene, transcript=transcript)
160
175
 
161
- FeatureT = TypeVar("FeatureT", bound=Feature)
162
-
163
176
  def _find_ancestor_of_type(
164
177
  self, feature: Feature, t: Type[FeatureT]
165
178
  ) -> FeatureT | None:
166
179
  ancestor = feature.parent
167
180
  while ancestor and not isinstance(ancestor, t):
168
181
  ancestor = ancestor.parent
169
- return ancestor
182
+ return cast(FeatureT | None, ancestor)
170
183
 
171
184
  def _finalize_other(self, draft: FeatureDraft, result: Features) -> Feature:
172
185
  parent_id = self._extract_parent_id(draft)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: biofiles
3
- Version: 0.0.8
3
+ Version: 0.0.10
4
4
  Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
5
  Author-email: Tigran Saluev <tigran@saluev.com>
6
6
  Maintainer-email: Tigran Saluev <tigran@saluev.com>
@@ -2,9 +2,9 @@ LICENSE
2
2
  README.md
3
3
  pyproject.toml
4
4
  biofiles/__init__.py
5
+ biofiles/bam.py
5
6
  biofiles/common.py
6
7
  biofiles/fasta.py
7
- biofiles/feature.py
8
8
  biofiles/gff.py
9
9
  biofiles/gtf.py
10
10
  biofiles/repeatmasker.py
@@ -13,6 +13,10 @@ biofiles.egg-info/SOURCES.txt
13
13
  biofiles.egg-info/dependency_links.txt
14
14
  biofiles.egg-info/top_level.txt
15
15
  biofiles/types/__init__.py
16
+ biofiles/types/alignment.py
16
17
  biofiles/types/feature.py
17
18
  biofiles/types/repeat.py
18
- biofiles/types/sequence.py
19
+ biofiles/types/sequence.py
20
+ biofiles/utility/__init__.py
21
+ biofiles/utility/cli.py
22
+ biofiles/utility/feature.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "biofiles"
7
- version = "0.0.8"
7
+ version = "0.0.10"
8
8
  authors = [
9
9
  { name="Tigran Saluev", email="tigran@saluev.com" },
10
10
  ]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes