biofiles 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biofiles/bam.py +199 -0
- biofiles/gff.py +18 -4
- biofiles/gtf.py +26 -4
- biofiles/repeatmasker.py +2 -2
- biofiles/types/alignment.py +76 -0
- biofiles/types/feature.py +1 -2
- biofiles/utility/__init__.py +0 -0
- biofiles/utility/cli.py +126 -0
- biofiles/{feature.py → utility/feature.py} +29 -16
- {biofiles-0.0.8.dist-info → biofiles-0.0.10.dist-info}/METADATA +1 -1
- biofiles-0.0.10.dist-info/RECORD +20 -0
- {biofiles-0.0.8.dist-info → biofiles-0.0.10.dist-info}/WHEEL +1 -1
- biofiles-0.0.8.dist-info/RECORD +0 -16
- {biofiles-0.0.8.dist-info → biofiles-0.0.10.dist-info}/LICENSE +0 -0
- {biofiles-0.0.8.dist-info → biofiles-0.0.10.dist-info}/top_level.txt +0 -0
biofiles/bam.py
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
import gzip
|
2
|
+
import struct
|
3
|
+
import sys
|
4
|
+
from io import BytesIO
|
5
|
+
from pathlib import Path
|
6
|
+
from types import TracebackType
|
7
|
+
from typing import Iterator, Any
|
8
|
+
|
9
|
+
from biofiles.types.alignment import (
|
10
|
+
ReferenceSequence,
|
11
|
+
Alignment,
|
12
|
+
BAMTag,
|
13
|
+
CIGAR,
|
14
|
+
CIGAROpKind,
|
15
|
+
CIGAROperation,
|
16
|
+
)
|
17
|
+
|
18
|
+
|
19
|
+
class BAMReader:
    """Sequential reader that yields ``Alignment`` records from a BAM file.

    The input is decompressed with :mod:`gzip`.  The BAM header (header text
    and the reference sequence list) is parsed eagerly by the constructor;
    iterating the reader then yields one ``Alignment`` per record until the
    decompressed stream is exhausted.

    The reader is a context manager: leaving the ``with`` block closes the
    decompressor and the underlying input.

    Raises:
        ValueError: if the data does not start with the BAM magic bytes, ends
            in the middle of a header or record, or contains a malformed tag.
    """

    def __init__(self, input_: BytesIO | Path | str) -> None:
        """Open *input_* and parse the BAM header.

        Args:
            input_: a binary stream holding gzip-compressed BAM data, or the
                path of such a file.
        """
        owns_input = isinstance(input_, Path | str)
        if owns_input:
            input_ = open(input_, "rb")
        self._input = input_
        self._ungzipped_input = gzip.open(input_)

        self._header_text: str | None = None
        self._ref_seqs: list[ReferenceSequence] = []

        try:
            self._read_header()
        except BaseException:
            # The caller never receives the reader when the header is bad, so
            # release what was acquired here instead of waiting for the GC.
            self._ungzipped_input.close()
            if owns_input:
                self._input.close()
            raise

    def _read_exact(self, size: int) -> bytes:
        """Return exactly *size* decompressed bytes.

        Raises:
            ValueError: if the stream ends before *size* bytes are available.
        """
        data = self._ungzipped_input.read(size)
        if len(data) != size:
            raise ValueError("invalid BAM file, unexpected end of data")
        return data

    def _read_header(self) -> None:
        """Parse the magic bytes, the header text and the reference sequences."""
        magic = self._ungzipped_input.read(4)
        if magic != b"BAM\1":
            raise ValueError("not a BAM file, invalid magic bytes")

        (header_text_length,) = struct.unpack("<I", self._read_exact(4))
        raw_header_text = self._read_exact(header_text_length)
        # Stored as text to match the attribute's ``str | None`` annotation.
        self._header_text = raw_header_text.rstrip(b"\0").decode(
            "utf-8", errors="replace"
        )

        (num_ref_seqs,) = struct.unpack("<I", self._read_exact(4))
        for _ in range(num_ref_seqs):
            (ref_seq_name_length,) = struct.unpack("<I", self._read_exact(4))
            ref_seq_name = self._read_exact(ref_seq_name_length)
            (ref_seq_length,) = struct.unpack("<I", self._read_exact(4))
            ref_seq = ReferenceSequence(
                id=ref_seq_name.rstrip(b"\0").decode("ascii"), length=ref_seq_length
            )
            self._ref_seqs.append(ref_seq)

    def __iter__(self) -> Iterator[Alignment]:
        """Return the reader itself; records are consumed as they are read."""
        return self

    def __next__(self) -> Alignment:
        """Read and decode the next alignment record.

        Raises:
            StopIteration: when the stream ends cleanly at a record boundary.
            ValueError: when the stream ends inside a record or the tags do
                not fit the declared record length.
        """
        block_size_bytes = self._ungzipped_input.read(4)
        if not block_size_bytes:
            raise StopIteration
        if len(block_size_bytes) != 4:
            raise ValueError("invalid BAM file, unexpected end of data")
        (block_length,) = struct.unpack("<I", block_size_bytes)

        body_format = "<iiBBHHHIiii"
        body_bytes = self._read_exact(struct.calcsize(body_format))
        (
            ref_seq_idx,
            pos,
            read_name_length,
            mapping_quality,
            bai_index_bin,
            num_cigar_ops,
            flags,
            seq_length,
            next_ref_seq_idx,
            next_pos,
            template_length,
        ) = struct.unpack(body_format, body_bytes)
        read_name_bytes = self._read_exact(read_name_length)

        cigar_format = f"<{num_cigar_ops}I"
        cigar_bytes = self._read_exact(struct.calcsize(cigar_format))
        encoded_cigar = struct.unpack(cigar_format, cigar_bytes)

        # Two bases are packed per byte, so odd-length reads end in a padding
        # nibble that must not be decoded as a base.
        seq_bytes = self._read_exact((seq_length + 1) // 2)

        quality_bytes = self._read_exact(seq_length)
        if quality_bytes == b"\xff" * seq_length:
            # A run of 0xFF bytes marks absent qualities; it is not valid
            # ASCII, so it is reported as an empty string instead of decoded.
            quality = ""
        else:
            quality = quality_bytes.decode("ascii")

        remaining_length = (
            block_length
            - len(body_bytes)
            - len(read_name_bytes)
            - len(cigar_bytes)
            - len(seq_bytes)
            - len(quality_bytes)
        )

        # Whatever is left of the record is a sequence of optional tags.
        tags: list[BAMTag] = []
        while remaining_length > 0:
            tag, used_length = self._read_tag()
            tags.append(tag)
            remaining_length -= used_length
        if remaining_length < 0:
            raise ValueError("invalid BAM file, wrong tag length")

        # A negative index means "no reference sequence".
        ref_seq = self._ref_seqs[ref_seq_idx] if ref_seq_idx >= 0 else None
        next_ref_seq = (
            self._ref_seqs[next_ref_seq_idx] if next_ref_seq_idx >= 0 else None
        )
        return Alignment(
            reference_sequence=ref_seq,
            start_c=pos,
            read_name=read_name_bytes.rstrip(b"\0").decode("utf-8"),
            mapping_quality=mapping_quality,
            bai_index_bin=bai_index_bin,
            next_reference_sequence=next_ref_seq,
            next_start_c=next_pos,
            template_length=template_length,
            cigar=self._decode_cigar(encoded_cigar),
            read_sequence=self._decode_seq(tuple(seq_bytes), seq_length),
            quality=quality,
            bam_flags=flags,
            bam_tags=tuple(tags),
        )

    def _decode_cigar(self, encoded_cigar: tuple[int, ...]) -> CIGAR:
        """Decode packed CIGAR words: low 4 bits are the kind, the rest the count."""
        return CIGAR(
            operations=tuple(
                CIGAROperation(kind=_BAM_CIGAR_OP_KINDS[item & 0b1111], count=item >> 4)
                for item in encoded_cigar
            )
        )

    def _decode_seq(
        self, encoded_seq: tuple[int, ...], length: int | None = None
    ) -> str:
        """Decode nibble-packed bases into a string of base letters.

        Args:
            encoded_seq: byte values, each holding two 4-bit base codes with
                the first base in the high nibble.
            length: number of bases actually stored.  When given, the result
                is cut to this length so the padding nibble of an odd-length
                read is dropped; ``None`` decodes every nibble.
        """
        letters = "".join(
            f"{_BAM_SEQUENCE_LETTERS[b >> 4]}{_BAM_SEQUENCE_LETTERS[b & 15]}"
            for b in encoded_seq
        )
        return letters if length is None else letters[:length]

    def _read_tag(self) -> tuple[BAMTag, int]:
        """Read one optional tag; return it with the number of bytes consumed."""
        tag = self._read_exact(2).decode("ascii")
        value_type = self._read_exact(1)
        value, value_length = self._read_tag_value(value_type)
        return BAMTag(tag=tag, value=value), 3 + value_length

    def _tag_struct_code(self, type_code: bytes) -> str:
        """Translate a BAM value type code into a ``struct`` format character."""
        try:
            return _BAM_FORMAT_TO_STRUCT_FORMAT[type_code]
        except KeyError:
            raise ValueError(
                f"invalid BAM file, unknown tag value type {type_code!r}"
            ) from None

    def _read_tag_value(self, value_type: bytes) -> tuple[Any, int]:
        """Read a tag value of type *value_type*.

        Returns:
            The decoded value and the number of bytes it occupied in the stream.

        Raises:
            ValueError: on an unknown type code or if the stream ends before
                the value is complete.
        """
        if value_type in (b"Z", b"H"):
            # NUL-terminated string.  ``_read_exact`` raises at end of stream,
            # so a missing terminator cannot spin this loop forever.
            characters = bytearray()
            while (character := self._read_exact(1)) != b"\0":
                characters += character
            value = characters.decode("utf-8")
            # +1 accounts for the terminating NUL byte.
            return value, len(characters) + 1

        elif value_type == b"B":
            # Typed array: one subtype byte, a 32-bit count, then the items.
            subtype, count = struct.unpack("<cI", self._read_exact(5))
            format_ = f"<{count}{self._tag_struct_code(subtype)}"
            length = struct.calcsize(format_)
            value = struct.unpack(format_, self._read_exact(length))
            return value, 5 + length

        else:
            format_ = "<" + self._tag_struct_code(value_type)
            length = struct.calcsize(format_)
            (value,) = struct.unpack(format_, self._read_exact(length))
            return value, length

    def __enter__(self):
        """Enter the underlying input's context and return the reader."""
        self._input.__enter__()
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Close the decompressor, then leave the underlying input's context."""
        try:
            self._ungzipped_input.close()
        finally:
            self._input.__exit__(exc_type, exc_val, exc_tb)
|
177
|
+
|
178
|
+
|
179
|
+
# Maps a BAM tag value type code (also used as the element subtype of "B"
# array tags) to the `struct` format character used to unpack that value.
_BAM_FORMAT_TO_STRUCT_FORMAT = {
    b"A": "c",
    b"c": "b",
    b"C": "B",
    b"s": "h",
    b"S": "H",
    b"i": "i",
    b"I": "I",
    b"f": "f",
}

# CIGAR operation letters, indexed by the 4-bit operation code taken from the
# low bits of each packed CIGAR word.
_BAM_CIGAR_OP_KINDS: list[CIGAROpKind] = ["M", "I", "D", "N", "S", "H", "P", "=", "X"]
# Base letters, indexed by the 4-bit code of each nibble of the packed sequence.
_BAM_SEQUENCE_LETTERS = "=ACMGRSVTWYHKDBN"
|
192
|
+
|
193
|
+
if __name__ == "__main__":
    # Smoke test: count the alignment records of every BAM file named on the
    # command line and report one total per file.
    for path in sys.argv[1:]:
        with BAMReader(path) as reader:
            num_alignments = sum(1 for _ in reader)
        print(f"Parsed {num_alignments} alignments from {path}")
|
biofiles/gff.py
CHANGED
@@ -3,7 +3,8 @@ from pathlib import Path
|
|
3
3
|
from typing import Iterator, cast, TextIO
|
4
4
|
|
5
5
|
from biofiles.common import Strand, Writer
|
6
|
-
from biofiles.
|
6
|
+
from biofiles.utility.cli import parse_pipeline_args
|
7
|
+
from biofiles.utility.feature import FeatureReader, FeatureDraft, FeatureDrafts
|
7
8
|
from biofiles.types.feature import Feature, Gene, Exon, UTR
|
8
9
|
|
9
10
|
__all__ = ["GFFReader", "GFF3Writer"]
|
@@ -137,7 +138,15 @@ _VERSION_PREFIX = "##gff-version "
|
|
137
138
|
|
138
139
|
|
139
140
|
if __name__ == "__main__":
|
140
|
-
|
141
|
+
pipeline = parse_pipeline_args(sys.argv[1:])
|
142
|
+
if pipeline.mapper is None:
|
143
|
+
writer = GFF3Writer(sys.stdout)
|
144
|
+
pipeline.mapper = writer.write
|
145
|
+
else:
|
146
|
+
old_mapper = pipeline.mapper
|
147
|
+
pipeline.mapper = lambda f: print(old_mapper(f))
|
148
|
+
|
149
|
+
for path in pipeline.inputs:
|
141
150
|
with GFFReader(path) as r:
|
142
151
|
total_features = 0
|
143
152
|
annotated_genes = 0
|
@@ -148,15 +157,20 @@ if __name__ == "__main__":
|
|
148
157
|
parsed_utrs = 0
|
149
158
|
for feature in r:
|
150
159
|
total_features += 1
|
151
|
-
annotated_genes += feature.type_
|
160
|
+
annotated_genes += "gene" in feature.type_.lower()
|
152
161
|
annotated_exons += feature.type_ == "exon"
|
153
162
|
annotated_utrs += "utr" in feature.type_.lower()
|
154
163
|
parsed_genes += isinstance(feature, Gene)
|
155
164
|
parsed_exons += isinstance(feature, Exon)
|
156
165
|
parsed_utrs += isinstance(feature, UTR)
|
166
|
+
|
167
|
+
if pipeline.filter(feature):
|
168
|
+
pipeline.map(feature)
|
169
|
+
|
157
170
|
print(
|
158
171
|
f"{path}: {total_features} features, "
|
159
172
|
f"{parsed_genes} genes parsed out of {annotated_genes}, "
|
160
173
|
f"{parsed_exons} exons parsed out of {annotated_exons}, "
|
161
|
-
f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
|
174
|
+
f"{parsed_utrs} UTRs parsed out of {annotated_utrs}",
|
175
|
+
file=sys.stderr,
|
162
176
|
)
|
biofiles/gtf.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
|
-
__all__ = ["GTFReader"]
|
1
|
+
__all__ = ["GTFReader", "GTFWriter"]
|
2
2
|
|
3
3
|
import sys
|
4
4
|
from typing import Iterator
|
5
5
|
|
6
|
+
from biofiles.common import Writer
|
6
7
|
from biofiles.gff import GFFReader
|
7
8
|
from biofiles.types.feature import Gene, Exon, Feature, UTR
|
8
9
|
|
@@ -13,12 +14,32 @@ class GTFReader(GFFReader):
|
|
13
14
|
|
14
15
|
def _parse_attributes(self, line: str, attributes_str: str) -> dict[str, str]:
|
15
16
|
return {
|
16
|
-
k: v.
|
17
|
+
k: v.removeprefix('"').removesuffix('"').replace(r"\"", '"')
|
17
18
|
for part in attributes_str.strip(";").split(";")
|
18
19
|
for k, v in (part.strip().split(None, 1),)
|
19
20
|
}
|
20
21
|
|
21
22
|
|
23
|
+
class GTFWriter(Writer):
    """Writer that serializes features as tab-separated GTF lines."""

    def write(self, feature: Feature) -> None:
        """Write *feature* to the output as one GTF line.

        The start coordinate is converted from 0-based to 1-based; missing
        score, strand and phase are written as ``.``.  Every attribute is
        emitted as ``key "value";`` with double quotes inside the value
        backslash-escaped.
        """
        # Each attribute, including the last one, carries its own terminating
        # semicolon; attributes are separated by a single space.
        attributes = " ".join(
            f'{k} "' + v.replace('"', r"\"") + '";'
            for k, v in feature.attributes.items()
        )
        fields = (
            feature.sequence_id,
            feature.source,
            feature.type_,
            str(feature.start_c + 1),
            # NOTE(review): end_c is written unchanged, which presumably means
            # it is an exclusive 0-based end — confirm against Feature.
            str(feature.end_c),
            str(feature.score) if feature.score is not None else ".",
            str(feature.strand) if feature.strand is not None else ".",
            str(feature.phase) if feature.phase is not None else ".",
            attributes,
        )
        self._output.write("\t".join(fields))
        self._output.write("\n")
|
41
|
+
|
42
|
+
|
22
43
|
if __name__ == "__main__":
|
23
44
|
for path in sys.argv[1:]:
|
24
45
|
with GTFReader(path) as r:
|
@@ -31,7 +52,7 @@ if __name__ == "__main__":
|
|
31
52
|
parsed_utrs = 0
|
32
53
|
for feature in r:
|
33
54
|
total_features += 1
|
34
|
-
annotated_genes += feature.type_
|
55
|
+
annotated_genes += "gene" in feature.type_.lower()
|
35
56
|
annotated_exons += feature.type_ == "exon"
|
36
57
|
annotated_utrs += "utr" in feature.type_.lower()
|
37
58
|
parsed_genes += isinstance(feature, Gene)
|
@@ -41,5 +62,6 @@ if __name__ == "__main__":
|
|
41
62
|
f"{path}: {total_features} features, "
|
42
63
|
f"{parsed_genes} genes parsed out of {annotated_genes}, "
|
43
64
|
f"{parsed_exons} exons parsed out of {annotated_exons}, "
|
44
|
-
f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
|
65
|
+
f"{parsed_utrs} UTRs parsed out of {annotated_utrs}",
|
66
|
+
file=sys.stderr,
|
45
67
|
)
|
biofiles/repeatmasker.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
import sys
|
2
2
|
from collections import Counter
|
3
|
-
from typing import Iterator
|
3
|
+
from typing import Iterator, cast, Literal
|
4
4
|
|
5
5
|
from biofiles.common import Reader
|
6
6
|
from biofiles.types.repeat import Repeat
|
@@ -42,7 +42,7 @@ class RepeatMaskerReader(Reader):
|
|
42
42
|
seq_start = int(seq_start_str)
|
43
43
|
seq_end = int(seq_end_str)
|
44
44
|
seq_left = int(seq_left_str[1:-1])
|
45
|
-
strand = {"+": "+", "C": "-"}[strand_str]
|
45
|
+
strand = cast(Literal["+", "-"], {"+": "+", "C": "-"}[strand_str])
|
46
46
|
|
47
47
|
if "/" in repeat_class_family:
|
48
48
|
repeat_class, repeat_family = repeat_class_family.split("/", 1)
|
@@ -0,0 +1,76 @@
|
|
1
|
+
from dataclasses import dataclass
|
2
|
+
|
3
|
+
|
4
|
+
# Every public name defined in this module, so that star-imports expose the
# CIGAR and flag types alongside the record types.
__all__ = [
    "ReferenceSequence",
    "Alignment",
    "BAMTag",
    "BAMFlag",
    "CIGAR",
    "CIGAROperation",
    "CIGAROpKind",
]
|
5
|
+
|
6
|
+
from enum import IntFlag
|
7
|
+
|
8
|
+
from typing import Any, Literal
|
9
|
+
|
10
|
+
|
11
|
+
@dataclass(frozen=True)
class ReferenceSequence:
    """Immutable description of a reference sequence: its identifier and length."""

    # Name of the reference sequence.
    id: str
    # Length of the reference sequence.
    length: int
|
15
|
+
|
16
|
+
|
17
|
+
@dataclass(frozen=True, slots=True)
class BAMTag:
    """Immutable tag/value pair attached to an alignment record."""

    # Tag name.
    tag: str
    # Decoded tag value; its Python type depends on the tag's value type.
    value: Any
|
21
|
+
|
22
|
+
|
23
|
+
CIGAROpKind = Literal["M", "I", "D", "N", "S", "H", "P", "=", "X"]
|
24
|
+
|
25
|
+
|
26
|
+
@dataclass(frozen=True, slots=True)
|
27
|
+
class CIGAROperation:
|
28
|
+
kind: CIGAROpKind
|
29
|
+
count: int
|
30
|
+
|
31
|
+
|
32
|
+
@dataclass(frozen=True)
|
33
|
+
class CIGAR:
|
34
|
+
operations: tuple[CIGAROperation, ...]
|
35
|
+
|
36
|
+
def __repr__(self) -> str:
|
37
|
+
return f'CIGAR("{self}")'
|
38
|
+
|
39
|
+
def __str__(self) -> str:
|
40
|
+
return "".join(f"{op.count}{op.kind}" for op in self.operations)
|
41
|
+
|
42
|
+
|
43
|
+
class BAMFlag(IntFlag):
    """Bit flags describing an alignment record; members combine with ``|``."""

    MULTIPLE_SEGMENTS = 0x001
    EACH_SEGMENT_PROPERLY_ALIGNED = 0x002
    SEGMENT_UNMAPPED = 0x004
    NEXT_SEGMENT_UNMAPPED = 0x008
    READ_SEQUENCE_REVERSE_COMPLEMENTED = 0x010
    NEXT_SEGMENT_READ_SEQUENCE_REVERSE_COMPLEMENTED = 0x020
    FIRST_SEGMENT = 0x040
    LAST_SEGMENT = 0x080
    SECONDARY_SEGMENT = 0x100
    NOT_PASSING_QUALITY_CONTROL = 0x200
    DUPLICATE = 0x400
    SUPPLEMENTARY_ALIGNMENT = 0x800
|
56
|
+
|
57
|
+
|
58
|
+
@dataclass(frozen=True)
class Alignment:
    """Immutable record describing one alignment of a read."""

    # Reference sequence the read is aligned to, or None when there is none.
    reference_sequence: ReferenceSequence | None

    # 0-based leftmost coordinate.
    start_c: int
    read_name: str
    mapping_quality: int
    # Bin number stored with the record (presumably the BAI indexing bin —
    # TODO confirm).
    bai_index_bin: int

    # Reference sequence and start coordinate of the next segment, if any.
    next_reference_sequence: ReferenceSequence | None
    next_start_c: int
    template_length: int
    cigar: CIGAR
    read_sequence: str
    quality: str

    # Raw flag bits; presumably a combination of BAMFlag values — TODO confirm.
    bam_flags: int
    bam_tags: tuple[BAMTag, ...]
|
biofiles/types/feature.py
CHANGED
@@ -26,7 +26,7 @@ class Feature:
|
|
26
26
|
attributes: dict[str, str]
|
27
27
|
|
28
28
|
id: str | None
|
29
|
-
parent: "
|
29
|
+
parent: "Feature | None"
|
30
30
|
children: tuple["Feature", ...]
|
31
31
|
|
32
32
|
|
@@ -50,7 +50,6 @@ class Transcript(Feature):
|
|
50
50
|
class Exon(Feature):
|
51
51
|
gene: Gene
|
52
52
|
transcript: Transcript
|
53
|
-
# TODO mRNA
|
54
53
|
|
55
54
|
|
56
55
|
@dataclass(frozen=True)
|
File without changes
|
biofiles/utility/cli.py
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
from dataclasses import dataclass
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import TypeAlias, Callable, Any, Literal, Type
|
4
|
+
|
5
|
+
from biofiles.types.feature import Feature, Gene, Transcript, UTR, Exon
|
6
|
+
|
7
|
+
# Predicate deciding whether a feature passes a filter.
FeatureFilter: TypeAlias = Callable[[Feature], bool]
# Transformation turning a feature into an arbitrary output value.
FeatureMapper: TypeAlias = Callable[[Feature], Any]
|
9
|
+
|
10
|
+
|
11
|
+
@dataclass
class Pipeline:
    """Parsed command line: input files, feature filters and an optional mapper."""

    inputs: list[Path]
    filters: list[FeatureFilter]
    mapper: FeatureMapper | None

    def filter(self, feature: Feature) -> bool:
        """Return True when *feature* is accepted by every configured filter."""
        return all(accepts(feature) for accepts in self.filters)

    def map(self, feature: Feature) -> Any:
        """Apply the mapper to *feature*; without a mapper, return it unchanged."""
        transform = self.mapper
        return transform(feature) if transform else feature
|
27
|
+
|
28
|
+
|
29
|
+
# Parser state of parse_pipeline_args: which kind of argument is expected next.
Mode: TypeAlias = Literal["inputs", "filters", "done"]
|
30
|
+
|
31
|
+
|
32
|
+
def parse_pipeline_args(argv: list[str]) -> Pipeline:
    """Parse command line arguments into a ``Pipeline``.

    The accepted shape is ``INPUT... [--filter FILTER...] [--attr KEY]``:

    * leading arguments naming existing files become inputs;
    * ``--filter`` switches to filter mode, where every argument is parsed as
      a filter expression;
    * ``--attr KEY`` is recognized only as the final two arguments and
      installs a mapper extracting the dotted attribute ``KEY``.

    Raises:
        ValueError: if the arguments do not follow this shape or a filter
            expression is invalid.
    """
    pipeline = Pipeline(inputs=[], filters=[], mapper=None)

    mode: Mode = "inputs"
    i = 0
    while i < len(argv):
        remaining = argv[i:]
        head = remaining[0]
        attr_is_last = len(remaining) == 2 and head == "--attr"

        if mode == "inputs" and (input_path := Path(head)).is_file():
            pipeline.inputs.append(input_path)
            i += 1
        elif mode == "inputs" and head == "--filter":
            mode = "filters"
            i += 1
        elif mode in ("inputs", "filters") and attr_is_last:
            attr_path = remaining[1].split(".")
            pipeline.mapper = _produce_attr_mapper(attr_path)
            mode = "done"
            i += 2
        elif mode == "filters":
            pipeline.filters.append(_parse_filter(head))
            i += 1
        else:
            raise ValueError(f"can't parse command line arguments {argv[i:]}")

    return pipeline
|
58
|
+
|
59
|
+
|
60
|
+
def _parse_filter(filter_str: str) -> FeatureFilter:
|
61
|
+
if "=" not in filter_str:
|
62
|
+
# --filter gene,transcript
|
63
|
+
type_strs = filter_str.split(",")
|
64
|
+
types = tuple(_parse_feature_type(t) for t in type_strs)
|
65
|
+
return lambda f: isinstance(f, types)
|
66
|
+
|
67
|
+
# --filter attr=value1,value2
|
68
|
+
key, value = filter_str.split("=", maxsplit=1)
|
69
|
+
values = value.split(",")
|
70
|
+
match key:
|
71
|
+
case "chromosome":
|
72
|
+
return lambda f: f.sequence_id in values
|
73
|
+
case "type":
|
74
|
+
return lambda f: f.type_ in values
|
75
|
+
case "strand":
|
76
|
+
return lambda f: f.strand in values
|
77
|
+
case _:
|
78
|
+
path = key.split(".")
|
79
|
+
return _produce_attr_filter(path, values)
|
80
|
+
|
81
|
+
raise ValueError(f"can't parse filter {filter_str!r}")
|
82
|
+
|
83
|
+
|
84
|
+
def _parse_feature_type(t: str) -> Type[Feature]:
    """Look up the feature class registered under the name *t*.

    Raises:
        ValueError: if *t* is not a known feature type name.
    """
    feature_type = _FEATURE_TYPES.get(t)
    if feature_type is None:
        raise ValueError(f"unknown feature type {t!r}")
    return feature_type
|
88
|
+
|
89
|
+
|
90
|
+
def _produce_attr_filter(path: list[str], values: list[str]) -> FeatureFilter:
|
91
|
+
assert path
|
92
|
+
if len(path) == 1:
|
93
|
+
(key,) = path
|
94
|
+
match key:
|
95
|
+
case "chromosome" | "type" | "strand" | "id":
|
96
|
+
return lambda f: getattr(f, key) in values
|
97
|
+
# TODO other attributes
|
98
|
+
case _:
|
99
|
+
return lambda f: f.attributes.get(key) in values
|
100
|
+
|
101
|
+
if path[0] not in ("gene", "transcript", "parent"):
|
102
|
+
raise ValueError(f"unknown attribute {path[-2]!r}")
|
103
|
+
|
104
|
+
nested = _produce_attr_filter(path[1:], values)
|
105
|
+
return lambda f: (nested(nf) if (nf := getattr(f, path[0], None)) else False)
|
106
|
+
|
107
|
+
|
108
|
+
def _produce_attr_mapper(path: list[str]) -> FeatureMapper:
|
109
|
+
assert path
|
110
|
+
if len(path) == 1:
|
111
|
+
(key,) = path
|
112
|
+
match key:
|
113
|
+
case "chromosome" | "type" | "strand" | "id":
|
114
|
+
return lambda f: getattr(f, key)
|
115
|
+
# TODO other attributes
|
116
|
+
case _:
|
117
|
+
return lambda f: f.attributes.get(key, "")
|
118
|
+
|
119
|
+
if path[0] not in ("gene", "transcript", "parent"):
|
120
|
+
raise ValueError(f"unknown attribute {path[-2]!r}")
|
121
|
+
|
122
|
+
nested = _produce_attr_mapper(path[1:])
|
123
|
+
return lambda f: (nested(nf) if (nf := getattr(f, path[0], None)) else None)
|
124
|
+
|
125
|
+
|
126
|
+
# Feature classes addressable by name in type filters (e.g. "--filter gene,exon").
_FEATURE_TYPES = {"gene": Gene, "transcript": Transcript, "exon": Exon, "utr": UTR}
|
@@ -1,10 +1,17 @@
|
|
1
1
|
from collections import deque
|
2
2
|
from dataclasses import dataclass, field
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Iterator, TextIO, Type, TypeVar
|
4
|
+
from typing import Iterator, TextIO, Type, TypeVar, cast
|
5
5
|
|
6
6
|
from biofiles.common import Reader, Strand
|
7
|
-
from biofiles.types.feature import
|
7
|
+
from biofiles.types.feature import (
|
8
|
+
Feature,
|
9
|
+
Gene,
|
10
|
+
ThreePrimeUTR,
|
11
|
+
Exon,
|
12
|
+
UTR,
|
13
|
+
Transcript,
|
14
|
+
)
|
8
15
|
|
9
16
|
|
10
17
|
@dataclass
|
@@ -60,6 +67,12 @@ class Features:
|
|
60
67
|
self.by_id[id_] = feature
|
61
68
|
|
62
69
|
|
70
|
+
FeatureT = TypeVar("FeatureT", bound=Feature)
|
71
|
+
GeneT = TypeVar("GeneT", bound=Gene)
|
72
|
+
TranscriptT = TypeVar("TranscriptT", bound=Transcript)
|
73
|
+
UTRT = TypeVar("UTRT", bound=UTR)
|
74
|
+
|
75
|
+
|
63
76
|
class FeatureReader(Reader):
|
64
77
|
def __init__(
|
65
78
|
self, input_: TextIO | Path | str, /, streaming_window: int | None = 1000
|
@@ -107,10 +120,10 @@ class FeatureReader(Reader):
|
|
107
120
|
|
108
121
|
def _finalize_draft(self, draft: FeatureDraft, result: Features) -> Feature:
|
109
122
|
match draft.type_.lower():
|
110
|
-
case "gene":
|
111
|
-
feature = self._finalize_gene(draft, result)
|
112
|
-
case "transcript":
|
113
|
-
feature = self._finalize_transcript(draft, result)
|
123
|
+
case "gene" | "ncrna_gene":
|
124
|
+
feature = self._finalize_gene(draft, result, Gene)
|
125
|
+
case "transcript" | "mrna" | "lnc_rna":
|
126
|
+
feature = self._finalize_transcript(draft, result, Transcript)
|
114
127
|
case "exon":
|
115
128
|
feature = self._finalize_exon(draft, result)
|
116
129
|
case "three_prime_utr":
|
@@ -124,19 +137,23 @@ class FeatureReader(Reader):
|
|
124
137
|
object.__setattr__(feature.parent, "children", new_children)
|
125
138
|
return feature
|
126
139
|
|
127
|
-
def _finalize_gene(
|
140
|
+
def _finalize_gene(
|
141
|
+
self, draft: FeatureDraft, result: Features, type_: Type[GeneT]
|
142
|
+
) -> Feature:
|
128
143
|
feature = self._finalize_other(draft, result)
|
129
144
|
name = draft.pick_attribute("gene_name", "Name")
|
130
145
|
biotype = draft.pick_attribute("gene_biotype", "biotype", "gene_type")
|
131
146
|
if name is None or biotype is None:
|
132
147
|
return feature
|
133
|
-
return
|
148
|
+
return type_(**feature.__dict__, name=name, biotype=biotype, transcripts=())
|
134
149
|
|
135
|
-
def _finalize_transcript(
|
150
|
+
def _finalize_transcript(
|
151
|
+
self, draft: FeatureDraft, result: Features, type_: Type[TranscriptT]
|
152
|
+
) -> Feature:
|
136
153
|
feature = self._finalize_other(draft, result)
|
137
154
|
if not (gene := self._find_ancestor_of_type(feature, Gene)):
|
138
155
|
return feature
|
139
|
-
transcript =
|
156
|
+
transcript = type_(**feature.__dict__, gene=gene, exons=())
|
140
157
|
object.__setattr__(gene, "transcripts", gene.transcripts + (transcript,))
|
141
158
|
return transcript
|
142
159
|
|
@@ -148,25 +165,21 @@ class FeatureReader(Reader):
|
|
148
165
|
object.__setattr__(transcript, "exons", transcript.exons + (exon,))
|
149
166
|
return exon
|
150
167
|
|
151
|
-
UTRT = TypeVar("UTRT", bound=UTR)
|
152
|
-
|
153
168
|
def _finalize_utr(
|
154
169
|
self, draft: FeatureDraft, result: Features, type_: Type[UTRT]
|
155
|
-
) -> Feature
|
170
|
+
) -> Feature:
|
156
171
|
feature = self._finalize_other(draft, result)
|
157
172
|
if not (transcript := self._find_ancestor_of_type(feature, Transcript)):
|
158
173
|
return feature
|
159
174
|
return type_(**feature.__dict__, gene=transcript.gene, transcript=transcript)
|
160
175
|
|
161
|
-
FeatureT = TypeVar("FeatureT", bound=Feature)
|
162
|
-
|
163
176
|
def _find_ancestor_of_type(
|
164
177
|
self, feature: Feature, t: Type[FeatureT]
|
165
178
|
) -> FeatureT | None:
|
166
179
|
ancestor = feature.parent
|
167
180
|
while ancestor and not isinstance(ancestor, t):
|
168
181
|
ancestor = ancestor.parent
|
169
|
-
return ancestor
|
182
|
+
return cast(FeatureT | None, ancestor)
|
170
183
|
|
171
184
|
def _finalize_other(self, draft: FeatureDraft, result: Features) -> Feature:
|
172
185
|
parent_id = self._extract_parent_id(draft)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: biofiles
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.10
|
4
4
|
Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
|
5
5
|
Author-email: Tigran Saluev <tigran@saluev.com>
|
6
6
|
Maintainer-email: Tigran Saluev <tigran@saluev.com>
|
@@ -0,0 +1,20 @@
|
|
1
|
+
biofiles/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
biofiles/bam.py,sha256=w32LLOAuKWdGF7joTSrB4HYXCdfvvijZW44jizG36R8,6771
|
3
|
+
biofiles/common.py,sha256=Yi0i85FpD2wR3vqL645LTUAE6TybGDxxZQsUmEGHqu4,1126
|
4
|
+
biofiles/fasta.py,sha256=ctIt5I_fcZx-xQN921zpmlZS7e9_ICf-3_i6mTs5qbs,2135
|
5
|
+
biofiles/gff.py,sha256=b3apOmJNoiy_qQHtyUSnNh0s999B6gyAODyjI7fN15g,6246
|
6
|
+
biofiles/gtf.py,sha256=h_eFKnYWb8GQp-CX9EPZRodUba-bzQLGidGHOPUo4iM,2366
|
7
|
+
biofiles/repeatmasker.py,sha256=txOYdw15ru88pUczsk0pDFzgGpplLu23CB8Ppz-MczY,3119
|
8
|
+
biofiles/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
+
biofiles/types/alignment.py,sha256=5UvwKJ2psIpkkU5efGRHe8gYhMIoW35-RZ_Zoe5YDrY,1612
|
10
|
+
biofiles/types/feature.py,sha256=3Ar45WRgiaDSh5iQt24Emtk6_57G01q5nHJ1GNIJ19Y,1190
|
11
|
+
biofiles/types/repeat.py,sha256=63SqzAwEGIDIGP9pxC85RUdwXbbSm0S5WNL3lSiWlmc,641
|
12
|
+
biofiles/types/sequence.py,sha256=EOw_oKuMR0THpCYJqVE__27z7qrRqcdIPrRWTL4OFMw,152
|
13
|
+
biofiles/utility/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
biofiles/utility/cli.py,sha256=bkUzmT5R4qdJ0YtA4LNU5JYpimD1HmZlHtoSaKzDsUc,4032
|
15
|
+
biofiles/utility/feature.py,sha256=tUTn16xV1e0qpgkZ1ZwQ4LJJGil5mgQJBJ9s1yFDgiI,8068
|
16
|
+
biofiles-0.0.10.dist-info/LICENSE,sha256=CbR8ssdFyViKj25JAlMjIt1_FbiZ1tAC5t-uwUbxqak,1070
|
17
|
+
biofiles-0.0.10.dist-info/METADATA,sha256=jthXyKbpyvig9dgiQmUaIyDH8hWKX2zMyojyIxr5mjM,3034
|
18
|
+
biofiles-0.0.10.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
|
19
|
+
biofiles-0.0.10.dist-info/top_level.txt,sha256=laFaFv8hpkI4U-Pgs0yBaAJXN2_CJKl7jb-m3-tGfSc,9
|
20
|
+
biofiles-0.0.10.dist-info/RECORD,,
|
biofiles-0.0.8.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
biofiles/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
biofiles/common.py,sha256=Yi0i85FpD2wR3vqL645LTUAE6TybGDxxZQsUmEGHqu4,1126
|
3
|
-
biofiles/fasta.py,sha256=ctIt5I_fcZx-xQN921zpmlZS7e9_ICf-3_i6mTs5qbs,2135
|
4
|
-
biofiles/feature.py,sha256=oZKNkZrCJjg4-AutGy3rri0gq-FRyo7vLwUzYG1EY7g,7809
|
5
|
-
biofiles/gff.py,sha256=LIbHGkpSTo-iMeatt2opPFlpNs8tHyv9XHPIVwzh3m8,5790
|
6
|
-
biofiles/gtf.py,sha256=eQsnpTjDaxrBeQ8uHzXy6C6sj8OvenFv9zwkFlytQYM,1535
|
7
|
-
biofiles/repeatmasker.py,sha256=DqD1z1hUfCP4-qnfjF-oMF-ZpW_6XhOf_nzA8VHhQbw,3079
|
8
|
-
biofiles/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
-
biofiles/types/feature.py,sha256=N6IIip7YqtSib5w_VLX1cBVwja8iWfa5AJncsKBs1PU,1209
|
10
|
-
biofiles/types/repeat.py,sha256=63SqzAwEGIDIGP9pxC85RUdwXbbSm0S5WNL3lSiWlmc,641
|
11
|
-
biofiles/types/sequence.py,sha256=EOw_oKuMR0THpCYJqVE__27z7qrRqcdIPrRWTL4OFMw,152
|
12
|
-
biofiles-0.0.8.dist-info/LICENSE,sha256=CbR8ssdFyViKj25JAlMjIt1_FbiZ1tAC5t-uwUbxqak,1070
|
13
|
-
biofiles-0.0.8.dist-info/METADATA,sha256=B0rgF4FGa2lgMehk6LdOEhHB2jddaoc76fteG3p4dp0,3033
|
14
|
-
biofiles-0.0.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
15
|
-
biofiles-0.0.8.dist-info/top_level.txt,sha256=laFaFv8hpkI4U-Pgs0yBaAJXN2_CJKl7jb-m3-tGfSc,9
|
16
|
-
biofiles-0.0.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|