biofiles 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biofiles-0.0.1/LICENSE +21 -0
- biofiles-0.0.1/PKG-INFO +95 -0
- biofiles-0.0.1/README.md +57 -0
- biofiles-0.0.1/biofiles/__init__.py +0 -0
- biofiles-0.0.1/biofiles/common.py +43 -0
- biofiles-0.0.1/biofiles/fasta.py +65 -0
- biofiles-0.0.1/biofiles/gff.py +283 -0
- biofiles-0.0.1/biofiles/repeatmasker.py +87 -0
- biofiles-0.0.1/biofiles/types/__init__.py +0 -0
- biofiles-0.0.1/biofiles/types/feature.py +45 -0
- biofiles-0.0.1/biofiles/types/repeat.py +30 -0
- biofiles-0.0.1/biofiles/types/sequence.py +11 -0
- biofiles-0.0.1/biofiles.egg-info/PKG-INFO +95 -0
- biofiles-0.0.1/biofiles.egg-info/SOURCES.txt +16 -0
- biofiles-0.0.1/biofiles.egg-info/dependency_links.txt +1 -0
- biofiles-0.0.1/biofiles.egg-info/top_level.txt +1 -0
- biofiles-0.0.1/pyproject.toml +25 -0
- biofiles-0.0.1/setup.cfg +4 -0
biofiles-0.0.1/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2023 Tigran Saluev
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
biofiles-0.0.1/PKG-INFO
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: biofiles
|
3
|
+
Version: 0.0.1
|
4
|
+
Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
|
5
|
+
Author-email: Tigran Saluev <tigran@saluev.com>
|
6
|
+
Maintainer-email: Tigran Saluev <tigran@saluev.com>
|
7
|
+
License: MIT License
|
8
|
+
|
9
|
+
Copyright (c) 2023 Tigran Saluev
|
10
|
+
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
13
|
+
in the Software without restriction, including without limitation the rights
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
16
|
+
furnished to do so, subject to the following conditions:
|
17
|
+
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
19
|
+
copies or substantial portions of the Software.
|
20
|
+
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
27
|
+
SOFTWARE.
|
28
|
+
|
29
|
+
Classifier: Programming Language :: Python :: 3
|
30
|
+
Classifier: License :: OSI Approved :: MIT License
|
31
|
+
Classifier: Operating System :: OS Independent
|
32
|
+
Classifier: Programming Language :: Python :: 3.10
|
33
|
+
Classifier: Programming Language :: Python :: 3.11
|
34
|
+
Classifier: Programming Language :: Python :: 3.12
|
35
|
+
Requires-Python: >=3.10
|
36
|
+
Description-Content-Type: text/markdown
|
37
|
+
License-File: LICENSE
|
38
|
+
|
39
|
+
# biofiles
|
40
|
+
|
41
|
+
Pure-Python, zero-dependency collection of bioinformatics-related
|
42
|
+
file readers and writers.
|
43
|
+
|
44
|
+
## Installation
|
45
|
+
|
46
|
+
```shell
|
47
|
+
python -m pip install biofiles
|
48
|
+
```
|
49
|
+
|
50
|
+
## Usage
|
51
|
+
|
52
|
+
Reading FASTA files:
|
53
|
+
|
54
|
+
```python
|
55
|
+
from biofiles.fasta import FASTAReader
|
56
|
+
|
57
|
+
with FASTAReader("sequences.fasta") as r:
|
58
|
+
for seq in r:
|
59
|
+
print(seq.id, len(seq.sequence))
|
60
|
+
|
61
|
+
# or
|
62
|
+
|
63
|
+
with open("sequences.fasta") as f:
|
64
|
+
r = FASTAReader(f)
|
65
|
+
for seq in r:
|
66
|
+
print(seq.id, len(seq.sequence))
|
67
|
+
```
|
68
|
+
|
69
|
+
Writing FASTA files:
|
70
|
+
|
71
|
+
```python
|
72
|
+
from biofiles.fasta import FASTAWriter
|
73
|
+
from biofiles.types.sequence import Sequence
|
74
|
+
|
75
|
+
seq = Sequence(id="SEQ", description="Important sequence", sequence="GAGAGA")
|
76
|
+
|
77
|
+
with FASTAWriter("output.fasta") as w:
|
78
|
+
w.write(seq)
|
79
|
+
```
|
80
|
+
|
81
|
+
Reading GFF genome annotations:
|
82
|
+
|
83
|
+
```python
|
84
|
+
from biofiles.gff import GFFReader
|
85
|
+
from biofiles.types.feature import Gene
|
86
|
+
|
87
|
+
with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff") as r:
|
88
|
+
for feature in r:
|
89
|
+
if isinstance(feature, Gene):
|
90
|
+
print(feature.name, len(feature.exons))
|
91
|
+
```
|
92
|
+
|
93
|
+
## License
|
94
|
+
|
95
|
+
MIT license, see [License](LICENSE).
|
biofiles-0.0.1/README.md
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# biofiles
|
2
|
+
|
3
|
+
Pure-Python, zero-dependency collection of bioinformatics-related
|
4
|
+
file readers and writers.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
```shell
|
9
|
+
python -m pip install biofiles
|
10
|
+
```
|
11
|
+
|
12
|
+
## Usage
|
13
|
+
|
14
|
+
Reading FASTA files:
|
15
|
+
|
16
|
+
```python
|
17
|
+
from biofiles.fasta import FASTAReader
|
18
|
+
|
19
|
+
with FASTAReader("sequences.fasta") as r:
|
20
|
+
for seq in r:
|
21
|
+
print(seq.id, len(seq.sequence))
|
22
|
+
|
23
|
+
# or
|
24
|
+
|
25
|
+
with open("sequences.fasta") as f:
|
26
|
+
r = FASTAReader(f)
|
27
|
+
for seq in r:
|
28
|
+
print(seq.id, len(seq.sequence))
|
29
|
+
```
|
30
|
+
|
31
|
+
Writing FASTA files:
|
32
|
+
|
33
|
+
```python
|
34
|
+
from biofiles.fasta import FASTAWriter
|
35
|
+
from biofiles.types.sequence import Sequence
|
36
|
+
|
37
|
+
seq = Sequence(id="SEQ", description="Important sequence", sequence="GAGAGA")
|
38
|
+
|
39
|
+
with FASTAWriter("output.fasta") as w:
|
40
|
+
w.write(seq)
|
41
|
+
```
|
42
|
+
|
43
|
+
Reading GFF genome annotations:
|
44
|
+
|
45
|
+
```python
|
46
|
+
from biofiles.gff import GFFReader
|
47
|
+
from biofiles.types.feature import Gene
|
48
|
+
|
49
|
+
with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff") as r:
|
50
|
+
for feature in r:
|
51
|
+
if isinstance(feature, Gene):
|
52
|
+
print(feature.name, len(feature.exons))
|
53
|
+
```
|
54
|
+
|
55
|
+
## License
|
56
|
+
|
57
|
+
MIT license, see [License](LICENSE).
|
File without changes
|
@@ -0,0 +1,43 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from types import TracebackType
|
3
|
+
from typing import TypeAlias, Literal, TextIO
|
4
|
+
|
5
|
+
Strand: TypeAlias = Literal["+", "-"]
|
6
|
+
|
7
|
+
|
8
|
+
class Reader:
|
9
|
+
def __init__(self, input_: TextIO | Path | str) -> None:
|
10
|
+
if isinstance(input_, Path | str):
|
11
|
+
input_ = open(input_)
|
12
|
+
self._input = input_
|
13
|
+
|
14
|
+
def __enter__(self):
|
15
|
+
self._input.__enter__()
|
16
|
+
return self
|
17
|
+
|
18
|
+
def __exit__(
|
19
|
+
self,
|
20
|
+
exc_type: type[BaseException] | None,
|
21
|
+
exc_val: BaseException | None,
|
22
|
+
exc_tb: TracebackType | None,
|
23
|
+
) -> None:
|
24
|
+
self._input.__exit__(exc_type, exc_val, exc_tb)
|
25
|
+
|
26
|
+
|
27
|
+
class Writer:
|
28
|
+
def __init__(self, output: TextIO | Path | str) -> None:
|
29
|
+
if isinstance(output, Path | str):
|
30
|
+
output = open(output, "w")
|
31
|
+
self._output = output
|
32
|
+
|
33
|
+
def __enter__(self):
|
34
|
+
self._output.__enter__()
|
35
|
+
return self
|
36
|
+
|
37
|
+
def __exit__(
|
38
|
+
self,
|
39
|
+
exc_type: type[BaseException] | None,
|
40
|
+
exc_val: BaseException | None,
|
41
|
+
exc_tb: TracebackType | None,
|
42
|
+
) -> None:
|
43
|
+
self._output.__exit__(exc_type, exc_val, exc_tb)
|
@@ -0,0 +1,65 @@
|
|
1
|
+
from dataclasses import dataclass, field
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import TextIO, Iterator
|
4
|
+
|
5
|
+
from biofiles.common import Reader, Writer
|
6
|
+
from biofiles.types.sequence import Sequence
|
7
|
+
|
8
|
+
|
9
|
+
__all__ = ["FASTAReader", "FASTAWriter"]
|
10
|
+
|
11
|
+
|
12
|
+
@dataclass
class _SequenceDraft:
    """Mutable accumulator for one FASTA record while its lines are read."""

    id: str
    description: str
    # Sequence lines collected so far, in the order they were appended.
    sequence_parts: list[str] = field(default_factory=list)

    def finalize(self) -> Sequence:
        """Return a ``Sequence`` whose sequence is the concatenated parts."""
        joined = "".join(self.sequence_parts)
        return Sequence(id=self.id, description=self.description, sequence=joined)
|
24
|
+
|
25
|
+
|
26
|
+
class FASTAReader(Reader):
    """Reader that yields ``Sequence`` records from FASTA-formatted text."""

    def __iter__(self) -> Iterator[Sequence]:
        """Iterate over the records of the underlying stream.

        A line starting with ``>`` opens a new record: its first
        whitespace-separated token is the identifier and the rest, if any, is
        the description. Following non-empty lines are appended to the
        record's sequence; empty lines are ignored.

        Raises:
            ValueError: if a header line has no identifier, or if sequence
                data appears before the first header line.
        """
        current: _SequenceDraft | None = None
        for raw in self._input:
            line = raw.rstrip("\n")

            if not line.startswith(">"):
                if not line:
                    continue
                if not current:
                    raise ValueError(f"unexpected line {line!r}, expected >")
                current.sequence_parts.append(line)
                continue

            # A header line closes the record collected so far, if any.
            if current:
                yield current.finalize()
            line = line.removeprefix(">").lstrip()
            tokens = line.split(maxsplit=1)
            if not tokens:
                raise ValueError(
                    f"unexpected line {line!r}, expected a non-empty sequence identifier"
                )
            id_ = tokens[0]
            desc = tokens[1] if len(tokens) == 2 else ""
            current = _SequenceDraft(id=id_, description=desc)

        # The last record has no following header to flush it.
        if current:
            yield current.finalize()
|
51
|
+
|
52
|
+
|
53
|
+
class FASTAWriter(Writer):
    """Writer that emits ``Sequence`` records as FASTA-formatted text."""

    def __init__(self, output: TextIO | Path | str, width: int = 80) -> None:
        """Create a writer.

        Args:
            output: an open text stream or a path to the file to write.
            width: maximum number of sequence characters per output line.

        Raises:
            ValueError: if *width* is not positive.
        """
        # Validate before the base class opens (and truncates) the output.
        # A negative width would make ``range()`` empty and silently drop the
        # sequence; a zero width would only fail later, inside ``write``.
        if width <= 0:
            raise ValueError(f"width should be a positive integer, got {width!r}")
        super().__init__(output)
        self._width = width

    def write(self, sequence: Sequence) -> None:
        """Write one record: a ``>`` header line, then the wrapped sequence."""
        header = f">{sequence.id}"
        if sequence.description:
            # Only add the separator when there is a description, so that
            # description-less records do not get a trailing space.
            header = f"{header} {sequence.description}"
        self._output.write(f"{header}\n")

        residues = sequence.sequence
        for offset in range(0, len(residues), self._width):
            # Slicing clamps at the end of the string, so the last chunk may
            # be shorter than the width.
            self._output.write(residues[offset : offset + self._width])
            self._output.write("\n")
|
@@ -0,0 +1,283 @@
|
|
1
|
+
import sys
|
2
|
+
from collections import deque
|
3
|
+
from dataclasses import dataclass, field
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Iterator, cast, TextIO
|
6
|
+
|
7
|
+
from biofiles.common import Strand, Reader
|
8
|
+
from biofiles.types.feature import Feature, Gene, Exon
|
9
|
+
|
10
|
+
__all__ = ["GFFReader"]
|
11
|
+
|
12
|
+
|
13
|
+
@dataclass
class _FeatureDraft:
    """Parsed columns of one GFF feature line, before links are resolved."""

    # Position of the feature among the parsed feature lines (0-based counter
    # assigned by the reader).
    idx: int
    sequence_id: str
    source: str
    type_: str
    start_original: int
    end_original: int
    score: float | None
    strand: Strand | None
    phase: int | None
    attributes: dict[str, str]

    def pick_attribute(self, *keys: str) -> str | None:
        """Return the value of the first of *keys* present in the attributes.

        Returns ``None`` when none of the keys is present.
        """
        for key in keys:
            value = self.attributes.get(key)
            if value is None:
                continue
            return value
        return None
|
31
|
+
|
32
|
+
|
33
|
+
@dataclass
class _FeatureDrafts:
    """Drafts in insertion order, plus a lookup by their ``ID`` attribute."""

    drafts: deque[_FeatureDraft] = field(default_factory=deque)
    by_id: dict[str, _FeatureDraft] = field(default_factory=dict)
    # deps: dict[int, deque[int]] = field(default_factory=lambda: defaultdict(deque))

    def add(self, draft: _FeatureDraft) -> None:
        """Append *draft* and index it by ``ID`` when that attribute is non-empty."""
        self.drafts.append(draft)
        id_ = draft.attributes.get("ID")
        if id_:
            self.by_id[id_] = draft
        # if parent_id := draft.attributes.get("Parent", None):
        #     parent = self.by_id[parent_id]
        #     self.deps[parent.idx].append(draft.idx)

    # def remove_first_n(self, n: int) -> None:
    #     for _ in range(n):
    #         draft = self.drafts.popleft()
    #         if id_ := draft.attributes.get("ID", None):
    #             del self.by_id[id_]
    #         self.deps.pop(draft.idx, None)
|
53
|
+
|
54
|
+
|
55
|
+
@dataclass
class _Features:
    """Finalized features in insertion order, plus a lookup by ``ID`` attribute."""

    features: list[Feature] = field(default_factory=list)
    by_id: dict[str, Feature] = field(default_factory=dict)

    def add(self, feature: Feature):
        """Append *feature* and index it by ``ID`` when that attribute is non-empty."""
        self.features.append(feature)
        id_ = feature.attributes.get("ID")
        if id_:
            self.by_id[id_] = feature
|
64
|
+
|
65
|
+
|
66
|
+
class GFFReader(Reader):
|
67
|
+
def __init__(
|
68
|
+
self, input_: TextIO | Path | str, /, streaming_window: int | None = 1000
|
69
|
+
):
|
70
|
+
super().__init__(input_)
|
71
|
+
self._streaming_window = streaming_window
|
72
|
+
|
73
|
+
def __iter__(self) -> Iterator[Feature]:
|
74
|
+
for line in self._input:
|
75
|
+
line = line.rstrip("\n")
|
76
|
+
if line.startswith(_VERSION_PREFIX):
|
77
|
+
version = line.removeprefix(_VERSION_PREFIX)
|
78
|
+
if version == "3":
|
79
|
+
yield from self._read_gff3()
|
80
|
+
return
|
81
|
+
raise ValueError(f"unsupported version {version!r}")
|
82
|
+
if line.startswith("#"):
|
83
|
+
continue
|
84
|
+
raise ValueError(f"unexpected line {line!r}, expected version")
|
85
|
+
|
86
|
+
def _read_gff3(self) -> Iterator[Feature]:
|
87
|
+
drafts = _FeatureDrafts()
|
88
|
+
idx = 0
|
89
|
+
for line in self._input:
|
90
|
+
if line.startswith("#"):
|
91
|
+
continue
|
92
|
+
line = line.rstrip("\n")
|
93
|
+
parts = line.split("\t", maxsplit=8)
|
94
|
+
if len(parts) != 9:
|
95
|
+
raise ValueError(f"unexpected line {line!r}, expected 9 columns")
|
96
|
+
(
|
97
|
+
sequence_id,
|
98
|
+
source,
|
99
|
+
type_,
|
100
|
+
start_str,
|
101
|
+
end_str,
|
102
|
+
score_str,
|
103
|
+
strand_str,
|
104
|
+
phase_str,
|
105
|
+
attributes_str,
|
106
|
+
) = parts
|
107
|
+
score = self._parse_score(line, score_str)
|
108
|
+
strand = self._parse_strand(line, strand_str)
|
109
|
+
phase = self._parse_phase(line, phase_str)
|
110
|
+
attributes = self._parse_attributes(line, attributes_str)
|
111
|
+
|
112
|
+
parent_id = attributes.get("Parent", None)
|
113
|
+
# if parent_id is None:
|
114
|
+
# yield from self._finalize_drafts(drafts)
|
115
|
+
# drafts = _FeatureDrafts()
|
116
|
+
if parent_id is not None and parent_id not in drafts.by_id:
|
117
|
+
raise ValueError(
|
118
|
+
f"unexpected line {line!r}, parent ID not among recent feature IDs"
|
119
|
+
)
|
120
|
+
|
121
|
+
draft = _FeatureDraft(
|
122
|
+
idx=idx,
|
123
|
+
sequence_id=sequence_id,
|
124
|
+
source=source,
|
125
|
+
type_=type_,
|
126
|
+
start_original=int(start_str),
|
127
|
+
end_original=int(end_str),
|
128
|
+
score=score,
|
129
|
+
strand=strand,
|
130
|
+
phase=phase,
|
131
|
+
attributes=attributes,
|
132
|
+
)
|
133
|
+
drafts.add(draft)
|
134
|
+
idx += 1
|
135
|
+
|
136
|
+
# yield from self._finalize_drafts(drafts, self._streaming_window)
|
137
|
+
|
138
|
+
yield from self._finalize_drafts(drafts, None)
|
139
|
+
|
140
|
+
def _finalize_drafts(
|
141
|
+
self, drafts: _FeatureDrafts, w: int | None
|
142
|
+
) -> Iterator[Feature]:
|
143
|
+
# TODO streaming version!
|
144
|
+
# code below is already tracking
|
145
|
+
# if not drafts.drafts:
|
146
|
+
# return
|
147
|
+
# if w is not None and len(drafts.drafts) <= w:
|
148
|
+
# return
|
149
|
+
#
|
150
|
+
# end_idx = drafts.drafts[-w].idx if w is not None else drafts.drafts[-1].idx
|
151
|
+
#
|
152
|
+
# i = 0
|
153
|
+
# while i < len(drafts.drafts) and (
|
154
|
+
# not drafts.deps[drafts.drafts[i].idx]
|
155
|
+
# or drafts.deps[drafts.drafts[i].idx][-1] <= end_idx
|
156
|
+
# ):
|
157
|
+
# i += 1
|
158
|
+
#
|
159
|
+
# print(f"FINALIZING {i} DRAFTS OUT OF {len(drafts.drafts)}")
|
160
|
+
#
|
161
|
+
# result = _Features()
|
162
|
+
# for j in range(i):
|
163
|
+
# draft = drafts.drafts[j]
|
164
|
+
# feature = self._finalize_draft(draft, result)
|
165
|
+
# result.add(feature)
|
166
|
+
# drafts.remove_first_n(i)
|
167
|
+
# yield from result.features
|
168
|
+
|
169
|
+
result = _Features()
|
170
|
+
for draft in drafts.drafts:
|
171
|
+
feature = self._finalize_draft(draft, result)
|
172
|
+
result.add(feature)
|
173
|
+
yield from result.features
|
174
|
+
|
175
|
+
def _finalize_draft(self, draft: _FeatureDraft, result: _Features) -> Feature:
|
176
|
+
match draft.type_:
|
177
|
+
case "gene":
|
178
|
+
feature = self._finalize_gene(draft, result)
|
179
|
+
case "exon":
|
180
|
+
feature = self._finalize_exon(draft, result)
|
181
|
+
case _:
|
182
|
+
feature = self._finalize_other(draft, result)
|
183
|
+
if feature.parent:
|
184
|
+
new_children = feature.parent.children + (feature,)
|
185
|
+
object.__setattr__(feature.parent, "children", new_children)
|
186
|
+
return feature
|
187
|
+
|
188
|
+
def _finalize_gene(self, draft: _FeatureDraft, result: _Features) -> Feature:
|
189
|
+
feature = self._finalize_other(draft, result)
|
190
|
+
name = draft.pick_attribute("gene_name", "Name")
|
191
|
+
biotype = draft.pick_attribute("gene_biotype", "biotype")
|
192
|
+
if name is None or biotype is None:
|
193
|
+
return feature
|
194
|
+
return Gene(**feature.__dict__, name=name, biotype=biotype, exons=())
|
195
|
+
|
196
|
+
def _finalize_exon(self, draft: _FeatureDraft, result: _Features) -> Feature:
|
197
|
+
feature = self._finalize_other(draft, result)
|
198
|
+
|
199
|
+
gene = feature.parent
|
200
|
+
while gene and not isinstance(gene, Gene):
|
201
|
+
gene = gene.parent
|
202
|
+
|
203
|
+
if gene is None:
|
204
|
+
return feature
|
205
|
+
exon = Exon(**feature.__dict__, gene=gene)
|
206
|
+
object.__setattr__(gene, "exons", gene.exons + (exon,))
|
207
|
+
return exon
|
208
|
+
|
209
|
+
def _finalize_other(self, draft: _FeatureDraft, result: _Features) -> Feature:
|
210
|
+
parent_id = draft.attributes.get("Parent", None)
|
211
|
+
parent = result.by_id[parent_id] if parent_id is not None else None
|
212
|
+
|
213
|
+
return Feature(
|
214
|
+
sequence_id=draft.sequence_id,
|
215
|
+
source=draft.source,
|
216
|
+
type_=draft.type_,
|
217
|
+
start_original=draft.start_original,
|
218
|
+
end_original=draft.end_original,
|
219
|
+
start_c=draft.start_original - 1,
|
220
|
+
end_c=draft.end_original,
|
221
|
+
score=draft.score,
|
222
|
+
strand=draft.strand,
|
223
|
+
phase=draft.phase,
|
224
|
+
attributes=draft.attributes,
|
225
|
+
parent=parent,
|
226
|
+
children=(),
|
227
|
+
)
|
228
|
+
|
229
|
+
def _parse_score(self, line: str, score_str: str) -> float | None:
|
230
|
+
if score_str == ".":
|
231
|
+
return None
|
232
|
+
try:
|
233
|
+
return float(score_str)
|
234
|
+
except ValueError as exc:
|
235
|
+
raise ValueError(
|
236
|
+
f"unexpected line {line!r}, score should be a number or '.'"
|
237
|
+
) from exc
|
238
|
+
|
239
|
+
def _parse_strand(self, line: str, strand_str: str) -> Strand | None:
|
240
|
+
if strand_str in ("-", "+"):
|
241
|
+
return cast(Strand, strand_str)
|
242
|
+
if strand_str == ".":
|
243
|
+
return None
|
244
|
+
raise ValueError(f"unexpected line {line!r}, strand should be '-', '+' or '.'")
|
245
|
+
|
246
|
+
def _parse_phase(self, line: str, phase_str: str) -> int | None:
|
247
|
+
if phase_str == ".":
|
248
|
+
return None
|
249
|
+
try:
|
250
|
+
return int(phase_str)
|
251
|
+
except ValueError as exc:
|
252
|
+
raise ValueError(
|
253
|
+
f"unexpected line {line!r}, phase should be an integer or '.'"
|
254
|
+
) from exc
|
255
|
+
|
256
|
+
def _parse_attributes(self, line: str, attributes_str: str) -> dict[str, str]:
|
257
|
+
return {
|
258
|
+
k: v
|
259
|
+
for part in attributes_str.strip(";").split(";")
|
260
|
+
for k, v in (part.split("=", 1),)
|
261
|
+
}
|
262
|
+
|
263
|
+
|
264
|
+
# Prefix of the directive line that declares the GFF version; the reader
# takes the text following it as the version.
_VERSION_PREFIX = "##gff-version "
|
265
|
+
|
266
|
+
|
267
|
+
if __name__ == "__main__":
    # Ad-hoc check: for every path given on the command line, count how many
    # features annotated as gene/exon were parsed into Gene/Exon objects.
    for path in sys.argv[1:]:
        with GFFReader(path) as r:
            total_features = annotated_genes = annotated_exons = 0
            parsed_genes = parsed_exons = 0
            for feature in r:
                total_features += 1
                if feature.type_ == "gene":
                    annotated_genes += 1
                if feature.type_ == "exon":
                    annotated_exons += 1
                if isinstance(feature, Gene):
                    parsed_genes += 1
                if isinstance(feature, Exon):
                    parsed_exons += 1
            summary = f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
            print(summary)
|
@@ -0,0 +1,87 @@
|
|
1
|
+
import sys
|
2
|
+
from collections import Counter
|
3
|
+
from typing import Iterator
|
4
|
+
|
5
|
+
from biofiles.common import Reader
|
6
|
+
from biofiles.types.repeat import Repeat
|
7
|
+
|
8
|
+
|
9
|
+
__all__ = ["RepeatMaskerReader"]
|
10
|
+
|
11
|
+
|
12
|
+
class RepeatMaskerReader(Reader):
    """Reader that yields ``Repeat`` records from tab-separated RepeatMasker output."""

    def __iter__(self) -> Iterator[Repeat]:
        """Iterate over the repeat records of the underlying stream.

        Lines that do not have 14 or 15 tab-separated columns are skipped.

        Raises:
            ValueError: if a numeric column cannot be parsed.
            KeyError: if the strand column is neither ``"+"`` nor ``"C"``.
        """
        for line in self._input:
            # Drop the line terminator first: otherwise it stays attached to
            # the last column, which breaks the parenthesis stripping of a
            # 14-column line and leaks "\n" into the ID of a 15-column line.
            parts = line.rstrip("\n").split("\t")
            if not (14 <= len(parts) <= 15):
                # Probably some metainfo. No way to tell.
                continue

            (
                sw_score_str,
                div_str,
                del_str,
                ins_str,
                seq_id,
                seq_start_str,
                seq_end_str,
                seq_left_str,
                strand_str,
                repeat_name,
                repeat_class_family,
                repeat_start_str,
                repeat_end_str,
                repeat_left_str,
                *repeat_id_or_none,
            ) = parts

            sw_score = int(sw_score_str)
            div_percent = float(div_str)
            del_percent = float(del_str)
            ins_percent = float(ins_str)
            seq_start = int(seq_start_str)
            seq_end = int(seq_end_str)
            # The first and last characters are stripped before parsing;
            # presumably they are the enclosing parentheses, e.g. "(123)".
            seq_left = int(seq_left_str[1:-1])
            strand = {"+": "+", "C": "-"}[strand_str]

            if "/" in repeat_class_family:
                repeat_class, repeat_family = repeat_class_family.split("/", 1)
            else:
                repeat_class, repeat_family = repeat_class_family, None
            if strand_str == "C":
                # On "C" lines the start and left columns are exchanged, so
                # swap them back before parsing.
                repeat_start_str, repeat_left_str = (repeat_left_str, repeat_start_str)
            repeat_start = int(repeat_start_str)
            repeat_end = int(repeat_end_str)
            repeat_left = int(repeat_left_str[1:-1])
            repeat_id = repeat_id_or_none[0] if repeat_id_or_none else None
            yield Repeat(
                sw_score=sw_score,
                divergence_percent=div_percent,
                insertion_percent=ins_percent,
                deletion_percent=del_percent,
                sequence_id=seq_id,
                sequence_start_original=seq_start,
                sequence_end_original=seq_end,
                sequence_start_c=seq_start - 1,
                sequence_end_c=seq_end,
                sequence_left=seq_left,
                strand=strand,
                repeat_name=repeat_name,
                repeat_class=repeat_class,
                repeat_family=repeat_family,
                repeat_start_original=repeat_start,
                repeat_end_original=repeat_end,
                repeat_start_c=repeat_start - 1,
                repeat_end_c=repeat_end,
                repeat_left=repeat_left,
                repeat_id=repeat_id,
            )
|
79
|
+
|
80
|
+
|
81
|
+
if __name__ == "__main__":
    # Ad-hoc check: for every path given on the command line, print how many
    # repeats were read per repeat class, most frequent first.
    for path in sys.argv[1:]:
        repeats_per_class = Counter()
        with RepeatMaskerReader(path) as r:
            for repeat in r:
                repeats_per_class[repeat.repeat_class] += 1
        print(f"Repeat classes in {path}:")
        for class_name, count in repeats_per_class.most_common():
            print(f" {class_name}: {count} repeats")
|
File without changes
|
@@ -0,0 +1,45 @@
|
|
1
|
+
from dataclasses import dataclass
|
2
|
+
|
3
|
+
from biofiles.common import Strand
|
4
|
+
|
5
|
+
|
6
|
+
__all__ = ["Feature", "Gene", "Exon"]
|
7
|
+
|
8
|
+
|
9
|
+
@dataclass(frozen=True)
class Feature:
    """One annotated feature, with links to its parent and child features."""

    sequence_id: str
    source: str
    type_: str

    start_original: int
    end_original: int
    # Original, 1-based inclusive values.

    start_c: int
    end_c: int
    # Standardized ("C-style") 0-based values, start inclusive, end exclusive.

    score: float | None
    strand: Strand | None
    phase: int | None
    attributes: dict[str, str]

    # The forward reference must name this class; the previous "GFFFeature"
    # does not exist and cannot be resolved by tools that evaluate annotations.
    parent: "Feature | None"
    children: tuple["Feature", ...]
|
30
|
+
|
31
|
+
|
32
|
+
# Custom types for particular kinds of features:
|
33
|
+
|
34
|
+
|
35
|
+
@dataclass(frozen=True)
class Gene(Feature):
    """A feature of type gene, with a name, a biotype and its exons."""

    name: str
    biotype: str
    # Exons attributed to this gene; starts empty and is extended by the GFF
    # reader as exons are finalized.
    exons: tuple["Exon", ...]
|
40
|
+
|
41
|
+
|
42
|
+
@dataclass(frozen=True)
class Exon(Feature):
    """A feature of type exon, linked to the gene it belongs to."""

    # The gene this exon is attributed to (the GFF reader uses the closest
    # ancestor that is a Gene).
    gene: Gene
    # TODO transcript, mRNA
|
@@ -0,0 +1,30 @@
|
|
1
|
+
from dataclasses import dataclass
|
2
|
+
|
3
|
+
from biofiles.common import Strand
|
4
|
+
|
5
|
+
|
6
|
+
__all__ = ["Repeat"]
|
7
|
+
|
8
|
+
|
9
|
+
@dataclass(frozen=True)
class Repeat:
    """One repeat record, as produced by the RepeatMasker reader.

    For both the sequence and the repeat coordinates, the reader fills the
    ``*_original`` fields with the values found in the input, sets
    ``*_start_c`` to the original start minus one, and sets ``*_end_c`` to the
    original end.
    """

    sw_score: int
    divergence_percent: float
    deletion_percent: float
    insertion_percent: float
    # Location of the match in the sequence.
    sequence_id: str
    sequence_start_original: int
    sequence_end_original: int
    sequence_start_c: int
    sequence_end_c: int
    sequence_left: int
    # "+" or "-"; the reader maps the input value "C" to "-".
    strand: Strand
    repeat_name: str
    # Part of the class/family column before the first "/", or the whole
    # column when there is no "/".
    repeat_class: str
    # Part of the class/family column after the first "/", if any.
    repeat_family: str | None
    # Location of the match in the repeat.
    repeat_start_original: int
    repeat_end_original: int
    repeat_start_c: int
    repeat_end_c: int
    repeat_left: int
    # Value of the optional 15th column; None when the line has 14 columns.
    repeat_id: str | None
|
@@ -0,0 +1,95 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: biofiles
|
3
|
+
Version: 0.0.1
|
4
|
+
Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
|
5
|
+
Author-email: Tigran Saluev <tigran@saluev.com>
|
6
|
+
Maintainer-email: Tigran Saluev <tigran@saluev.com>
|
7
|
+
License: MIT License
|
8
|
+
|
9
|
+
Copyright (c) 2023 Tigran Saluev
|
10
|
+
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
13
|
+
in the Software without restriction, including without limitation the rights
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
16
|
+
furnished to do so, subject to the following conditions:
|
17
|
+
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
19
|
+
copies or substantial portions of the Software.
|
20
|
+
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
27
|
+
SOFTWARE.
|
28
|
+
|
29
|
+
Classifier: Programming Language :: Python :: 3
|
30
|
+
Classifier: License :: OSI Approved :: MIT License
|
31
|
+
Classifier: Operating System :: OS Independent
|
32
|
+
Classifier: Programming Language :: Python :: 3.10
|
33
|
+
Classifier: Programming Language :: Python :: 3.11
|
34
|
+
Classifier: Programming Language :: Python :: 3.12
|
35
|
+
Requires-Python: >=3.10
|
36
|
+
Description-Content-Type: text/markdown
|
37
|
+
License-File: LICENSE
|
38
|
+
|
39
|
+
# biofiles
|
40
|
+
|
41
|
+
Pure-Python, zero-dependency collection of bioinformatics-related
|
42
|
+
file readers and writers.
|
43
|
+
|
44
|
+
## Installation
|
45
|
+
|
46
|
+
```shell
|
47
|
+
python -m pip install biofiles
|
48
|
+
```
|
49
|
+
|
50
|
+
## Usage
|
51
|
+
|
52
|
+
Reading FASTA files:
|
53
|
+
|
54
|
+
```python
|
55
|
+
from biofiles.fasta import FASTAReader
|
56
|
+
|
57
|
+
with FASTAReader("sequences.fasta") as r:
|
58
|
+
for seq in r:
|
59
|
+
print(seq.id, len(seq.sequence))
|
60
|
+
|
61
|
+
# or
|
62
|
+
|
63
|
+
with open("sequences.fasta") as f:
|
64
|
+
r = FASTAReader(f)
|
65
|
+
for seq in r:
|
66
|
+
print(seq.id, len(seq.sequence))
|
67
|
+
```
|
68
|
+
|
69
|
+
Writing FASTA files:
|
70
|
+
|
71
|
+
```python
|
72
|
+
from biofiles.fasta import FASTAWriter
|
73
|
+
from biofiles.types.sequence import Sequence
|
74
|
+
|
75
|
+
seq = Sequence(id="SEQ", description="Important sequence", sequence="GAGAGA")
|
76
|
+
|
77
|
+
with FASTAWriter("output.fasta") as w:
|
78
|
+
w.write(seq)
|
79
|
+
```
|
80
|
+
|
81
|
+
Reading GFF genome annotations:
|
82
|
+
|
83
|
+
```python
|
84
|
+
from biofiles.gff import GFFReader
|
85
|
+
from biofiles.types.feature import Gene
|
86
|
+
|
87
|
+
with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff") as r:
|
88
|
+
for feature in r:
|
89
|
+
if isinstance(feature, Gene):
|
90
|
+
print(feature.name, len(feature.exons))
|
91
|
+
```
|
92
|
+
|
93
|
+
## License
|
94
|
+
|
95
|
+
MIT license, see [License](LICENSE).
|
@@ -0,0 +1,16 @@
|
|
1
|
+
LICENSE
|
2
|
+
README.md
|
3
|
+
pyproject.toml
|
4
|
+
biofiles/__init__.py
|
5
|
+
biofiles/common.py
|
6
|
+
biofiles/fasta.py
|
7
|
+
biofiles/gff.py
|
8
|
+
biofiles/repeatmasker.py
|
9
|
+
biofiles.egg-info/PKG-INFO
|
10
|
+
biofiles.egg-info/SOURCES.txt
|
11
|
+
biofiles.egg-info/dependency_links.txt
|
12
|
+
biofiles.egg-info/top_level.txt
|
13
|
+
biofiles/types/__init__.py
|
14
|
+
biofiles/types/feature.py
|
15
|
+
biofiles/types/repeat.py
|
16
|
+
biofiles/types/sequence.py
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
biofiles
|
@@ -0,0 +1,25 @@
|
|
1
|
+
[build-system]
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
3
|
+
build-backend = "setuptools.build_meta"
|
4
|
+
|
5
|
+
[project]
|
6
|
+
name = "biofiles"
|
7
|
+
version = "0.0.1"
|
8
|
+
authors = [
|
9
|
+
{ name="Tigran Saluev", email="tigran@saluev.com" },
|
10
|
+
]
|
11
|
+
maintainers = [
|
12
|
+
{ name="Tigran Saluev", email="tigran@saluev.com" },
|
13
|
+
]
|
14
|
+
description = "Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers"
|
15
|
+
readme = "README.md"
|
16
|
+
license = {file = "LICENSE"}
|
17
|
+
requires-python = ">=3.10"
|
18
|
+
classifiers = [
|
19
|
+
"Programming Language :: Python :: 3",
|
20
|
+
"License :: OSI Approved :: MIT License",
|
21
|
+
"Operating System :: OS Independent",
|
22
|
+
"Programming Language :: Python :: 3.10",
|
23
|
+
"Programming Language :: Python :: 3.11",
|
24
|
+
"Programming Language :: Python :: 3.12",
|
25
|
+
]
|
biofiles-0.0.1/setup.cfg
ADDED