biofiles 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biofiles/bam.py +199 -0
- biofiles/gff.py +17 -3
- biofiles/gtf.py +25 -3
- biofiles/types/alignment.py +76 -0
- biofiles/utility/__init__.py +0 -0
- biofiles/utility/cli.py +126 -0
- {biofiles-0.0.9.dist-info → biofiles-0.0.10.dist-info}/METADATA +1 -1
- biofiles-0.0.10.dist-info/RECORD +20 -0
- {biofiles-0.0.9.dist-info → biofiles-0.0.10.dist-info}/WHEEL +1 -1
- biofiles-0.0.9.dist-info/RECORD +0 -16
- /biofiles/{feature.py → utility/feature.py} +0 -0
- {biofiles-0.0.9.dist-info → biofiles-0.0.10.dist-info}/LICENSE +0 -0
- {biofiles-0.0.9.dist-info → biofiles-0.0.10.dist-info}/top_level.txt +0 -0
biofiles/bam.py
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
import gzip
|
2
|
+
import struct
|
3
|
+
import sys
|
4
|
+
from io import BytesIO
|
5
|
+
from pathlib import Path
|
6
|
+
from types import TracebackType
|
7
|
+
from typing import Iterator, Any
|
8
|
+
|
9
|
+
from biofiles.types.alignment import (
|
10
|
+
ReferenceSequence,
|
11
|
+
Alignment,
|
12
|
+
BAMTag,
|
13
|
+
CIGAR,
|
14
|
+
CIGAROpKind,
|
15
|
+
CIGAROperation,
|
16
|
+
)
|
17
|
+
|
18
|
+
|
19
|
+
class BAMReader:
    """Sequentially read alignment records from a BAM file.

    The input is wrapped in a gzip decompressor; the BAM header (magic bytes,
    header text and reference sequence list) is parsed on construction, and
    iterating the reader yields one ``Alignment`` per record.  The reader can
    be used as a context manager to close the underlying streams.
    """

    def __init__(self, input_: BytesIO | Path | str) -> None:
        """Open *input_* and parse the BAM header.

        Args:
            input_: Path to a BAM file, or an already opened binary stream.

        Raises:
            ValueError: If the data does not start with the BAM magic bytes
                or ends before the header is complete.
        """
        opened_here = isinstance(input_, Path | str)
        if opened_here:
            input_ = open(input_, "rb")
        self._input = input_
        self._ungzipped_input = gzip.open(input_)

        # Kept as the raw bytes read from the header (not decoded).
        self._header_text: bytes | None = None
        self._ref_seqs: list[ReferenceSequence] = []

        try:
            self._read_header()
        except BaseException:
            # Do not leak the decompressor (or a file we opened ourselves)
            # when the header turns out to be unreadable.
            self._ungzipped_input.close()
            if opened_here:
                input_.close()
            raise

    def _read_exact(self, size: int) -> bytes:
        """Read exactly *size* decompressed bytes or raise ``ValueError``."""
        data = self._ungzipped_input.read(size)
        if len(data) != size:
            raise ValueError("invalid BAM file, unexpected end of data")
        return data

    def _read_header(self) -> None:
        """Parse magic bytes, header text and the reference sequence list."""
        magic, header_text_length = struct.unpack("<4sI", self._read_exact(8))
        if magic != b"BAM\1":
            raise ValueError("not a BAM file, invalid magic bytes")

        self._header_text = self._read_exact(header_text_length)
        (num_ref_seqs,) = struct.unpack("<I", self._read_exact(4))

        for _ in range(num_ref_seqs):
            (ref_seq_name_length,) = struct.unpack("<I", self._read_exact(4))
            ref_seq_name = self._read_exact(ref_seq_name_length)
            (ref_seq_length,) = struct.unpack("<I", self._read_exact(4))
            ref_seq = ReferenceSequence(
                # Names are stored NUL-terminated.
                id=ref_seq_name.rstrip(b"\0").decode("ascii"),
                length=ref_seq_length,
            )
            self._ref_seqs.append(ref_seq)

    def __iter__(self) -> Iterator[Alignment]:
        """Return the reader itself; records are consumed as they are read."""
        return self

    def __next__(self) -> Alignment:
        """Read and decode the next alignment record.

        Raises:
            StopIteration: When the stream is exhausted at a record boundary.
            ValueError: If the record is truncated or its tag section does not
                match the declared block length.
        """
        block_size_bytes = self._ungzipped_input.read(4)
        if not block_size_bytes:
            raise StopIteration
        if len(block_size_bytes) != 4:
            raise ValueError("invalid BAM file, unexpected end of data")

        (block_length,) = struct.unpack("<I", block_size_bytes)

        body_format = "<iiBBHHHIiii"
        body_bytes = self._read_exact(struct.calcsize(body_format))
        (
            ref_seq_idx,
            pos,
            read_name_length,
            mapping_quality,
            bai_index_bin,
            num_cigar_ops,
            flags,
            seq_length,
            next_ref_seq_idx,
            next_pos,
            template_length,
        ) = struct.unpack(body_format, body_bytes)
        read_name_bytes = self._read_exact(read_name_length)

        cigar_format = f"<{num_cigar_ops}I"
        cigar_bytes = self._read_exact(struct.calcsize(cigar_format))
        encoded_cigar = struct.unpack(cigar_format, cigar_bytes)

        # Two bases are packed per byte, so odd lengths round up.
        seq_bytes = self._read_exact((seq_length + 1) // 2)
        encoded_seq = tuple(seq_bytes)

        # One raw quality byte per base.  latin-1 maps every byte to the code
        # point of the same value, so it agrees with ASCII for values < 128
        # and does not fail on 0xFF bytes (used when qualities are absent).
        quality_bytes = self._read_exact(seq_length)
        quality = quality_bytes.decode("latin-1")

        # Whatever is left of the block after the fixed fields is tag data.
        remaining_length = (
            block_length
            - len(body_bytes)
            - len(read_name_bytes)
            - len(cigar_bytes)
            - len(seq_bytes)
            - len(quality_bytes)
        )

        tags: list[BAMTag] = []
        while remaining_length > 0:
            tag, used_length = self._read_tag()
            tags.append(tag)
            remaining_length -= used_length
        if remaining_length < 0:
            raise ValueError("invalid BAM file, wrong tag length")

        # Negative indices mean "no reference sequence".
        ref_seq = self._ref_seqs[ref_seq_idx] if ref_seq_idx >= 0 else None
        next_ref_seq = (
            self._ref_seqs[next_ref_seq_idx] if next_ref_seq_idx >= 0 else None
        )
        return Alignment(
            reference_sequence=ref_seq,
            start_c=pos,
            read_name=read_name_bytes.rstrip(b"\0").decode("utf-8"),
            mapping_quality=mapping_quality,
            bai_index_bin=bai_index_bin,
            next_reference_sequence=next_ref_seq,
            next_start_c=next_pos,
            template_length=template_length,
            cigar=self._decode_cigar(encoded_cigar),
            read_sequence=self._decode_seq(encoded_seq, seq_length),
            quality=quality,
            bam_flags=flags,
            bam_tags=tuple(tags),
        )

    def _decode_cigar(self, encoded_cigar: tuple[int, ...]) -> CIGAR:
        """Decode packed CIGAR items: low 4 bits are the op, the rest the count."""
        return CIGAR(
            operations=tuple(
                CIGAROperation(kind=_BAM_CIGAR_OP_KINDS[item & 0b1111], count=item >> 4)
                for item in encoded_cigar
            )
        )

    def _decode_seq(
        self, encoded_seq: tuple[int, ...], length: int | None = None
    ) -> str:
        """Decode 4-bit packed bases into a string of sequence letters.

        Args:
            encoded_seq: Byte values, each holding two bases (high nibble first).
            length: Number of bases actually stored.  When given, the result is
                trimmed to it so that the padding nibble of an odd-length
                sequence is not returned as an extra letter.
        """
        letters = "".join(
            f"{_BAM_SEQUENCE_LETTERS[b >> 4]}{_BAM_SEQUENCE_LETTERS[b & 15]}"
            for b in encoded_seq
        )
        return letters if length is None else letters[:length]

    def _read_tag(self) -> tuple[BAMTag, int]:
        """Read one tag and return it with the number of bytes consumed."""
        tag = self._read_exact(2).decode("ascii")
        value_type = self._read_exact(1)
        value, value_length = self._read_tag_value(value_type)
        # 2 bytes of tag name + 1 byte of value type precede the value.
        return BAMTag(tag=tag, value=value), 3 + value_length

    def _read_tag_value(self, value_type: bytes) -> tuple[Any, int]:
        """Read a tag value of the given type.

        Returns:
            The decoded value and the number of bytes it occupied.

        Raises:
            ValueError: If the stream ends inside the value.
        """
        if value_type in (b"Z", b"H"):
            # NUL-terminated string; the terminator counts towards the length.
            characters = bytearray()
            while (character := self._ungzipped_input.read(1)) != b"\0":
                if not character:
                    raise ValueError("invalid BAM file, unexpected end of data")
                characters += character
            return characters.decode("utf-8"), len(characters) + 1

        elif value_type == b"B":
            # Typed array: element subtype, element count, then the elements.
            subtype, count = struct.unpack("<cI", self._read_exact(5))
            format_ = f"<{count}{_BAM_FORMAT_TO_STRUCT_FORMAT[subtype]}"
            length = struct.calcsize(format_)
            value = struct.unpack(format_, self._read_exact(length))
            return value, 5 + length

        else:
            format_ = "<" + _BAM_FORMAT_TO_STRUCT_FORMAT[value_type]
            length = struct.calcsize(format_)
            (value,) = struct.unpack(format_, self._read_exact(length))
            return value, length

    def __enter__(self):
        """Enter the context of the underlying stream and return the reader."""
        self._input.__enter__()
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Close the decompressor, then leave the underlying stream's context."""
        try:
            # gzip does not close a file object it did not open itself, so the
            # wrapper and the raw stream have to be released separately.
            self._ungzipped_input.close()
        finally:
            self._input.__exit__(exc_type, exc_val, exc_tb)
|
177
|
+
|
178
|
+
|
179
|
+
# Maps a BAM tag value type code (also used for the element subtype of "B"
# arrays) to the `struct` format character used to unpack it.
_BAM_FORMAT_TO_STRUCT_FORMAT = {
    b"A": "c",
    b"c": "b",
    b"C": "B",
    b"s": "h",
    b"S": "H",
    b"i": "i",
    b"I": "I",
    b"f": "f",
}

# CIGAR operation letters, indexed by the low 4 bits of an encoded CIGAR item.
_BAM_CIGAR_OP_KINDS: list[CIGAROpKind] = ["M", "I", "D", "N", "S", "H", "P", "=", "X"]
# Sequence letters, indexed by the 4-bit code of a packed base.
_BAM_SEQUENCE_LETTERS = "=ACMGRSVTWYHKDBN"
|
192
|
+
|
193
|
+
if __name__ == "__main__":
    # Smoke test: count the alignment records of every BAM file named on the
    # command line.
    for path in sys.argv[1:]:
        with BAMReader(path) as reader:
            num_alignments = sum(1 for _ in reader)
        print(f"Parsed {num_alignments} alignments from {path}")
|
biofiles/gff.py
CHANGED
@@ -3,7 +3,8 @@ from pathlib import Path
|
|
3
3
|
from typing import Iterator, cast, TextIO
|
4
4
|
|
5
5
|
from biofiles.common import Strand, Writer
|
6
|
-
from biofiles.
|
6
|
+
from biofiles.utility.cli import parse_pipeline_args
|
7
|
+
from biofiles.utility.feature import FeatureReader, FeatureDraft, FeatureDrafts
|
7
8
|
from biofiles.types.feature import Feature, Gene, Exon, UTR
|
8
9
|
|
9
10
|
__all__ = ["GFFReader", "GFF3Writer"]
|
@@ -137,7 +138,15 @@ _VERSION_PREFIX = "##gff-version "
|
|
137
138
|
|
138
139
|
|
139
140
|
if __name__ == "__main__":
|
140
|
-
|
141
|
+
pipeline = parse_pipeline_args(sys.argv[1:])
|
142
|
+
if pipeline.mapper is None:
|
143
|
+
writer = GFF3Writer(sys.stdout)
|
144
|
+
pipeline.mapper = writer.write
|
145
|
+
else:
|
146
|
+
old_mapper = pipeline.mapper
|
147
|
+
pipeline.mapper = lambda f: print(old_mapper(f))
|
148
|
+
|
149
|
+
for path in pipeline.inputs:
|
141
150
|
with GFFReader(path) as r:
|
142
151
|
total_features = 0
|
143
152
|
annotated_genes = 0
|
@@ -154,9 +163,14 @@ if __name__ == "__main__":
|
|
154
163
|
parsed_genes += isinstance(feature, Gene)
|
155
164
|
parsed_exons += isinstance(feature, Exon)
|
156
165
|
parsed_utrs += isinstance(feature, UTR)
|
166
|
+
|
167
|
+
if pipeline.filter(feature):
|
168
|
+
pipeline.map(feature)
|
169
|
+
|
157
170
|
print(
|
158
171
|
f"{path}: {total_features} features, "
|
159
172
|
f"{parsed_genes} genes parsed out of {annotated_genes}, "
|
160
173
|
f"{parsed_exons} exons parsed out of {annotated_exons}, "
|
161
|
-
f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
|
174
|
+
f"{parsed_utrs} UTRs parsed out of {annotated_utrs}",
|
175
|
+
file=sys.stderr,
|
162
176
|
)
|
biofiles/gtf.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
|
-
__all__ = ["GTFReader"]
|
1
|
+
__all__ = ["GTFReader", "GTFWriter"]
|
2
2
|
|
3
3
|
import sys
|
4
4
|
from typing import Iterator
|
5
5
|
|
6
|
+
from biofiles.common import Writer
|
6
7
|
from biofiles.gff import GFFReader
|
7
8
|
from biofiles.types.feature import Gene, Exon, Feature, UTR
|
8
9
|
|
@@ -13,12 +14,32 @@ class GTFReader(GFFReader):
|
|
13
14
|
|
14
15
|
def _parse_attributes(self, line: str, attributes_str: str) -> dict[str, str]:
|
15
16
|
return {
|
16
|
-
k: v.
|
17
|
+
k: v.removeprefix('"').removesuffix('"').replace(r"\"", '"')
|
17
18
|
for part in attributes_str.strip(";").split(";")
|
18
19
|
for k, v in (part.strip().split(None, 1),)
|
19
20
|
}
|
20
21
|
|
21
22
|
|
23
|
+
class GTFWriter(Writer):
    """Write features as tab-separated GTF lines."""

    def write(self, feature: Feature) -> None:
        """Serialize *feature* as one GTF line and write it to the output.

        Missing score, strand and phase are written as ``"."``.  Attribute
        values are double-quoted with embedded quotes escaped as ``\\"``, and
        every attribute, including the last one, is terminated by a semicolon
        as the GTF format requires.
        """
        attributes = " ".join(
            "{} \"{}\";".format(key, value.replace('"', r"\""))
            for key, value in feature.attributes.items()
        )
        fields = (
            feature.sequence_id,
            feature.source,
            feature.type_,
            # GTF start is 1-based; start_c is presumably 0-based, hence +1.
            str(feature.start_c + 1),
            str(feature.end_c),
            str(feature.score) if feature.score is not None else ".",
            str(feature.strand) if feature.strand is not None else ".",
            str(feature.phase) if feature.phase is not None else ".",
            attributes,
        )
        self._output.write("\t".join(fields) + "\n")
|
41
|
+
|
42
|
+
|
22
43
|
if __name__ == "__main__":
|
23
44
|
for path in sys.argv[1:]:
|
24
45
|
with GTFReader(path) as r:
|
@@ -41,5 +62,6 @@ if __name__ == "__main__":
|
|
41
62
|
f"{path}: {total_features} features, "
|
42
63
|
f"{parsed_genes} genes parsed out of {annotated_genes}, "
|
43
64
|
f"{parsed_exons} exons parsed out of {annotated_exons}, "
|
44
|
-
f"{parsed_utrs} UTRs parsed out of {annotated_utrs}"
|
65
|
+
f"{parsed_utrs} UTRs parsed out of {annotated_utrs}",
|
66
|
+
file=sys.stderr,
|
45
67
|
)
|
@@ -0,0 +1,76 @@
|
|
1
|
+
from dataclasses import dataclass
|
2
|
+
|
3
|
+
|
4
|
+
# Public names of this module, including the CIGAR types and the flag enum
# that other modules import by name.
__all__ = [
    "Alignment",
    "BAMFlag",
    "BAMTag",
    "CIGAR",
    "CIGAROpKind",
    "CIGAROperation",
    "ReferenceSequence",
]
|
5
|
+
|
6
|
+
from enum import IntFlag
|
7
|
+
|
8
|
+
from typing import Any, Literal
|
9
|
+
|
10
|
+
|
11
|
+
@dataclass(frozen=True)
class ReferenceSequence:
    """Immutable description of a reference sequence: its name and length."""

    # Name of the reference sequence.
    id: str
    # Length of the reference sequence (presumably in bases; confirm).
    length: int
|
15
|
+
|
16
|
+
|
17
|
+
@dataclass(frozen=True, slots=True)
class BAMTag:
    """Immutable pair of a tag name and its decoded value."""

    # Tag name.
    tag: str
    # Decoded value; its Python type depends on how the tag was encoded.
    value: Any
|
21
|
+
|
22
|
+
|
23
|
+
CIGAROpKind = Literal["M", "I", "D", "N", "S", "H", "P", "=", "X"]
|
24
|
+
|
25
|
+
|
26
|
+
@dataclass(frozen=True, slots=True)
class CIGAROperation:
    """A single CIGAR operation: an operation letter and a repeat count."""

    # One of the letters allowed by CIGAROpKind.
    kind: CIGAROpKind
    # Number associated with the operation; rendered before the letter.
    count: int
|
30
|
+
|
31
|
+
|
32
|
+
@dataclass(frozen=True)
class CIGAR:
    """Immutable sequence of CIGAR operations.

    ``str()`` renders the operations as ``<count><kind>`` pairs joined
    together; ``repr()`` wraps that text as ``CIGAR("...")``.
    """

    operations: tuple[CIGAROperation, ...]

    def __str__(self) -> str:
        rendered = ["{}{}".format(op.count, op.kind) for op in self.operations]
        return "".join(rendered)

    def __repr__(self) -> str:
        return 'CIGAR("{}")'.format(self)
|
41
|
+
|
42
|
+
|
43
|
+
class BAMFlag(IntFlag):
    """Bit flags of an alignment record; each member occupies one bit."""

    MULTIPLE_SEGMENTS = 0x1
    EACH_SEGMENT_PROPERLY_ALIGNED = 0x2
    SEGMENT_UNMAPPED = 0x4
    NEXT_SEGMENT_UNMAPPED = 0x8
    READ_SEQUENCE_REVERSE_COMPLEMENTED = 0x10
    NEXT_SEGMENT_READ_SEQUENCE_REVERSE_COMPLEMENTED = 0x20
    FIRST_SEGMENT = 0x40
    LAST_SEGMENT = 0x80
    SECONDARY_SEGMENT = 0x100
    NOT_PASSING_QUALITY_CONTROL = 0x200
    DUPLICATE = 0x400
    SUPPLEMENTARY_ALIGNMENT = 0x800
|
56
|
+
|
57
|
+
|
58
|
+
@dataclass(frozen=True)
class Alignment:
    """Immutable record holding the fields of a single alignment."""

    # Reference sequence the read is aligned to, or None when there is none.
    reference_sequence: ReferenceSequence | None

    start_c: int
    # 0-based leftmost coordinate.
    read_name: str
    mapping_quality: int
    bai_index_bin: int

    # Reference sequence and start of the next segment, or None when there
    # is no such reference (presumably the mate of a paired read; confirm).
    next_reference_sequence: ReferenceSequence | None
    next_start_c: int
    template_length: int
    cigar: CIGAR
    read_sequence: str
    quality: str

    # Raw integer flags (presumably a combination of BAMFlag bits; confirm).
    bam_flags: int
    bam_tags: tuple[BAMTag, ...]
|
File without changes
|
biofiles/utility/cli.py
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
from dataclasses import dataclass
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import TypeAlias, Callable, Any, Literal, Type
|
4
|
+
|
5
|
+
from biofiles.types.feature import Feature, Gene, Transcript, UTR, Exon
|
6
|
+
|
7
|
+
FeatureFilter: TypeAlias = Callable[[Feature], bool]
|
8
|
+
FeatureMapper: TypeAlias = Callable[[Feature], Any]
|
9
|
+
|
10
|
+
|
11
|
+
@dataclass
|
12
|
+
class Pipeline:
|
13
|
+
inputs: list[Path]
|
14
|
+
filters: list[FeatureFilter]
|
15
|
+
mapper: FeatureMapper | None
|
16
|
+
|
17
|
+
def filter(self, feature: Feature) -> bool:
|
18
|
+
for f in self.filters:
|
19
|
+
if not f(feature):
|
20
|
+
return False
|
21
|
+
return True
|
22
|
+
|
23
|
+
def map(self, feature: Feature) -> Any:
|
24
|
+
if not self.mapper:
|
25
|
+
return feature
|
26
|
+
return self.mapper(feature)
|
27
|
+
|
28
|
+
|
29
|
+
Mode: TypeAlias = Literal["inputs", "filters", "done"]
|
30
|
+
|
31
|
+
|
32
|
+
def parse_pipeline_args(argv: list[str]) -> Pipeline:
|
33
|
+
pipeline = Pipeline(inputs=[], filters=[], mapper=None)
|
34
|
+
|
35
|
+
mode: Mode = "inputs"
|
36
|
+
i = 0
|
37
|
+
while i < len(argv):
|
38
|
+
match mode, argv[i:]:
|
39
|
+
case "inputs", [str_path, *_] if (path := Path(str_path)).is_file():
|
40
|
+
pipeline.inputs.append(path)
|
41
|
+
i += 1
|
42
|
+
case "inputs", ["--filter", *_]:
|
43
|
+
mode = "filters"
|
44
|
+
i += 1
|
45
|
+
case "inputs" | "filters", ["--attr", key]:
|
46
|
+
path = key.split(".")
|
47
|
+
pipeline.mapper = _produce_attr_mapper(path)
|
48
|
+
mode = "done"
|
49
|
+
i += 2
|
50
|
+
case "filters", [filter_str, *_]:
|
51
|
+
filter_ = _parse_filter(filter_str)
|
52
|
+
pipeline.filters.append(filter_)
|
53
|
+
i += 1
|
54
|
+
case other:
|
55
|
+
raise ValueError(f"can't parse command line arguments {argv[i:]}")
|
56
|
+
|
57
|
+
return pipeline
|
58
|
+
|
59
|
+
|
60
|
+
def _parse_filter(filter_str: str) -> FeatureFilter:
|
61
|
+
if "=" not in filter_str:
|
62
|
+
# --filter gene,transcript
|
63
|
+
type_strs = filter_str.split(",")
|
64
|
+
types = tuple(_parse_feature_type(t) for t in type_strs)
|
65
|
+
return lambda f: isinstance(f, types)
|
66
|
+
|
67
|
+
# --filter attr=value1,value2
|
68
|
+
key, value = filter_str.split("=", maxsplit=1)
|
69
|
+
values = value.split(",")
|
70
|
+
match key:
|
71
|
+
case "chromosome":
|
72
|
+
return lambda f: f.sequence_id in values
|
73
|
+
case "type":
|
74
|
+
return lambda f: f.type_ in values
|
75
|
+
case "strand":
|
76
|
+
return lambda f: f.strand in values
|
77
|
+
case _:
|
78
|
+
path = key.split(".")
|
79
|
+
return _produce_attr_filter(path, values)
|
80
|
+
|
81
|
+
raise ValueError(f"can't parse filter {filter_str!r}")
|
82
|
+
|
83
|
+
|
84
|
+
def _parse_feature_type(t: str) -> Type[Feature]:
    """Look up the feature class registered under the name *t*.

    Raises:
        ValueError: If *t* is not a key of ``_FEATURE_TYPES``.
    """
    feature_type = _FEATURE_TYPES.get(t)
    if feature_type is None:
        raise ValueError(f"unknown feature type {t!r}")
    return feature_type
|
88
|
+
|
89
|
+
|
90
|
+
def _produce_attr_filter(path: list[str], values: list[str]) -> FeatureFilter:
|
91
|
+
assert path
|
92
|
+
if len(path) == 1:
|
93
|
+
(key,) = path
|
94
|
+
match key:
|
95
|
+
case "chromosome" | "type" | "strand" | "id":
|
96
|
+
return lambda f: getattr(f, key) in values
|
97
|
+
# TODO other attributes
|
98
|
+
case _:
|
99
|
+
return lambda f: f.attributes.get(key) in values
|
100
|
+
|
101
|
+
if path[0] not in ("gene", "transcript", "parent"):
|
102
|
+
raise ValueError(f"unknown attribute {path[-2]!r}")
|
103
|
+
|
104
|
+
nested = _produce_attr_filter(path[1:], values)
|
105
|
+
return lambda f: (nested(nf) if (nf := getattr(f, path[0], None)) else False)
|
106
|
+
|
107
|
+
|
108
|
+
def _produce_attr_mapper(path: list[str]) -> FeatureMapper:
|
109
|
+
assert path
|
110
|
+
if len(path) == 1:
|
111
|
+
(key,) = path
|
112
|
+
match key:
|
113
|
+
case "chromosome" | "type" | "strand" | "id":
|
114
|
+
return lambda f: getattr(f, key)
|
115
|
+
# TODO other attributes
|
116
|
+
case _:
|
117
|
+
return lambda f: f.attributes.get(key, "")
|
118
|
+
|
119
|
+
if path[0] not in ("gene", "transcript", "parent"):
|
120
|
+
raise ValueError(f"unknown attribute {path[-2]!r}")
|
121
|
+
|
122
|
+
nested = _produce_attr_mapper(path[1:])
|
123
|
+
return lambda f: (nested(nf) if (nf := getattr(f, path[0], None)) else None)
|
124
|
+
|
125
|
+
|
126
|
+
_FEATURE_TYPES = {"gene": Gene, "transcript": Transcript, "exon": Exon, "utr": UTR}
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: biofiles
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.10
|
4
4
|
Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
|
5
5
|
Author-email: Tigran Saluev <tigran@saluev.com>
|
6
6
|
Maintainer-email: Tigran Saluev <tigran@saluev.com>
|
@@ -0,0 +1,20 @@
|
|
1
|
+
biofiles/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
biofiles/bam.py,sha256=w32LLOAuKWdGF7joTSrB4HYXCdfvvijZW44jizG36R8,6771
|
3
|
+
biofiles/common.py,sha256=Yi0i85FpD2wR3vqL645LTUAE6TybGDxxZQsUmEGHqu4,1126
|
4
|
+
biofiles/fasta.py,sha256=ctIt5I_fcZx-xQN921zpmlZS7e9_ICf-3_i6mTs5qbs,2135
|
5
|
+
biofiles/gff.py,sha256=b3apOmJNoiy_qQHtyUSnNh0s999B6gyAODyjI7fN15g,6246
|
6
|
+
biofiles/gtf.py,sha256=h_eFKnYWb8GQp-CX9EPZRodUba-bzQLGidGHOPUo4iM,2366
|
7
|
+
biofiles/repeatmasker.py,sha256=txOYdw15ru88pUczsk0pDFzgGpplLu23CB8Ppz-MczY,3119
|
8
|
+
biofiles/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
+
biofiles/types/alignment.py,sha256=5UvwKJ2psIpkkU5efGRHe8gYhMIoW35-RZ_Zoe5YDrY,1612
|
10
|
+
biofiles/types/feature.py,sha256=3Ar45WRgiaDSh5iQt24Emtk6_57G01q5nHJ1GNIJ19Y,1190
|
11
|
+
biofiles/types/repeat.py,sha256=63SqzAwEGIDIGP9pxC85RUdwXbbSm0S5WNL3lSiWlmc,641
|
12
|
+
biofiles/types/sequence.py,sha256=EOw_oKuMR0THpCYJqVE__27z7qrRqcdIPrRWTL4OFMw,152
|
13
|
+
biofiles/utility/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
biofiles/utility/cli.py,sha256=bkUzmT5R4qdJ0YtA4LNU5JYpimD1HmZlHtoSaKzDsUc,4032
|
15
|
+
biofiles/utility/feature.py,sha256=tUTn16xV1e0qpgkZ1ZwQ4LJJGil5mgQJBJ9s1yFDgiI,8068
|
16
|
+
biofiles-0.0.10.dist-info/LICENSE,sha256=CbR8ssdFyViKj25JAlMjIt1_FbiZ1tAC5t-uwUbxqak,1070
|
17
|
+
biofiles-0.0.10.dist-info/METADATA,sha256=jthXyKbpyvig9dgiQmUaIyDH8hWKX2zMyojyIxr5mjM,3034
|
18
|
+
biofiles-0.0.10.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
|
19
|
+
biofiles-0.0.10.dist-info/top_level.txt,sha256=laFaFv8hpkI4U-Pgs0yBaAJXN2_CJKl7jb-m3-tGfSc,9
|
20
|
+
biofiles-0.0.10.dist-info/RECORD,,
|
biofiles-0.0.9.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
biofiles/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
biofiles/common.py,sha256=Yi0i85FpD2wR3vqL645LTUAE6TybGDxxZQsUmEGHqu4,1126
|
3
|
-
biofiles/fasta.py,sha256=ctIt5I_fcZx-xQN921zpmlZS7e9_ICf-3_i6mTs5qbs,2135
|
4
|
-
biofiles/feature.py,sha256=tUTn16xV1e0qpgkZ1ZwQ4LJJGil5mgQJBJ9s1yFDgiI,8068
|
5
|
-
biofiles/gff.py,sha256=6xmwnuU1CsFibIHzbggYJajzQC4KGsGAfWMxyYFFChw,5798
|
6
|
-
biofiles/gtf.py,sha256=kAt_5ifb0f8cCR-kycnQhkyo78xOynqTUUGqgOP8tjA,1543
|
7
|
-
biofiles/repeatmasker.py,sha256=txOYdw15ru88pUczsk0pDFzgGpplLu23CB8Ppz-MczY,3119
|
8
|
-
biofiles/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
-
biofiles/types/feature.py,sha256=3Ar45WRgiaDSh5iQt24Emtk6_57G01q5nHJ1GNIJ19Y,1190
|
10
|
-
biofiles/types/repeat.py,sha256=63SqzAwEGIDIGP9pxC85RUdwXbbSm0S5WNL3lSiWlmc,641
|
11
|
-
biofiles/types/sequence.py,sha256=EOw_oKuMR0THpCYJqVE__27z7qrRqcdIPrRWTL4OFMw,152
|
12
|
-
biofiles-0.0.9.dist-info/LICENSE,sha256=CbR8ssdFyViKj25JAlMjIt1_FbiZ1tAC5t-uwUbxqak,1070
|
13
|
-
biofiles-0.0.9.dist-info/METADATA,sha256=gLu3ufoag4tZllgq9xCDZe_kA24RXuI4TqQdAI_QIKw,3033
|
14
|
-
biofiles-0.0.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
15
|
-
biofiles-0.0.9.dist-info/top_level.txt,sha256=laFaFv8hpkI4U-Pgs0yBaAJXN2_CJKl7jb-m3-tGfSc,9
|
16
|
-
biofiles-0.0.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|