biofiles 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biofiles-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Tigran Saluev
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,95 @@
1
+ Metadata-Version: 2.1
2
+ Name: biofiles
3
+ Version: 0.0.1
4
+ Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
+ Author-email: Tigran Saluev <tigran@saluev.com>
6
+ Maintainer-email: Tigran Saluev <tigran@saluev.com>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2023 Tigran Saluev
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+
29
+ Classifier: Programming Language :: Python :: 3
30
+ Classifier: License :: OSI Approved :: MIT License
31
+ Classifier: Operating System :: OS Independent
32
+ Classifier: Programming Language :: Python :: 3.10
33
+ Classifier: Programming Language :: Python :: 3.11
34
+ Classifier: Programming Language :: Python :: 3.12
35
+ Requires-Python: >=3.10
36
+ Description-Content-Type: text/markdown
37
+ License-File: LICENSE
38
+
39
+ # biofiles
40
+
41
+ Pure-Python, zero-dependency collection of bioinformatics-related
42
+ file readers and writers.
43
+
44
+ ## Installation
45
+
46
+ ```shell
47
+ python -m pip install biofiles
48
+ ```
49
+
50
+ ## Usage
51
+
52
+ Reading FASTA files:
53
+
54
+ ```python
55
+ from biofiles.fasta import FASTAReader
56
+
57
+ with FASTAReader("sequences.fasta") as r:
58
+ for seq in r:
59
+ print(seq.id, len(seq.sequence))
60
+
61
+ # or
62
+
63
+ with open("sequences.fasta") as f:
64
+ r = FASTAReader(f)
65
+ for seq in r:
66
+ print(seq.id, len(seq.sequence))
67
+ ```
68
+
69
+ Writing FASTA files:
70
+
71
+ ```python
72
+ from biofiles.fasta import FASTAWriter
73
+ from biofiles.types.sequence import Sequence
74
+
75
+ seq = Sequence(id="SEQ", description="Important sequence", sequence="GAGAGA")
76
+
77
+ with FASTAWriter("output.fasta") as w:
78
+ w.write(seq)
79
+ ```
80
+
81
+ Reading GFF genome annotations:
82
+
83
+ ```python
84
+ from biofiles.gff import GFFReader
85
+ from biofiles.types.feature import Gene
86
+
87
+ with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff") as r:
88
+ for feature in r:
89
+ if isinstance(feature, Gene):
90
+ print(feature.name, len(feature.exons))
91
+ ```
92
+
93
+ ## License
94
+
95
+ MIT license, see [License](LICENSE).
@@ -0,0 +1,57 @@
1
+ # biofiles
2
+
3
+ Pure-Python, zero-dependency collection of bioinformatics-related
4
+ file readers and writers.
5
+
6
+ ## Installation
7
+
8
+ ```shell
9
+ python -m pip install biofiles
10
+ ```
11
+
12
+ ## Usage
13
+
14
+ Reading FASTA files:
15
+
16
+ ```python
17
+ from biofiles.fasta import FASTAReader
18
+
19
+ with FASTAReader("sequences.fasta") as r:
20
+ for seq in r:
21
+ print(seq.id, len(seq.sequence))
22
+
23
+ # or
24
+
25
+ with open("sequences.fasta") as f:
26
+ r = FASTAReader(f)
27
+ for seq in r:
28
+ print(seq.id, len(seq.sequence))
29
+ ```
30
+
31
+ Writing FASTA files:
32
+
33
+ ```python
34
+ from biofiles.fasta import FASTAWriter
35
+ from biofiles.types.sequence import Sequence
36
+
37
+ seq = Sequence(id="SEQ", description="Important sequence", sequence="GAGAGA")
38
+
39
+ with FASTAWriter("output.fasta") as w:
40
+ w.write(seq)
41
+ ```
42
+
43
+ Reading GFF genome annotations:
44
+
45
+ ```python
46
+ from biofiles.gff import GFFReader
47
+ from biofiles.types.feature import Gene
48
+
49
+ with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff") as r:
50
+ for feature in r:
51
+ if isinstance(feature, Gene):
52
+ print(feature.name, len(feature.exons))
53
+ ```
54
+
55
+ ## License
56
+
57
+ MIT license, see [License](LICENSE).
File without changes
@@ -0,0 +1,43 @@
1
+ from pathlib import Path
2
+ from types import TracebackType
3
+ from typing import TypeAlias, Literal, TextIO
4
+
5
+ Strand: TypeAlias = Literal["+", "-"]
6
+
7
+
8
+ class Reader:
9
+ def __init__(self, input_: TextIO | Path | str) -> None:
10
+ if isinstance(input_, Path | str):
11
+ input_ = open(input_)
12
+ self._input = input_
13
+
14
+ def __enter__(self):
15
+ self._input.__enter__()
16
+ return self
17
+
18
+ def __exit__(
19
+ self,
20
+ exc_type: type[BaseException] | None,
21
+ exc_val: BaseException | None,
22
+ exc_tb: TracebackType | None,
23
+ ) -> None:
24
+ self._input.__exit__(exc_type, exc_val, exc_tb)
25
+
26
+
27
+ class Writer:
28
+ def __init__(self, output: TextIO | Path | str) -> None:
29
+ if isinstance(output, Path | str):
30
+ output = open(output, "w")
31
+ self._output = output
32
+
33
+ def __enter__(self):
34
+ self._output.__enter__()
35
+ return self
36
+
37
+ def __exit__(
38
+ self,
39
+ exc_type: type[BaseException] | None,
40
+ exc_val: BaseException | None,
41
+ exc_tb: TracebackType | None,
42
+ ) -> None:
43
+ self._output.__exit__(exc_type, exc_val, exc_tb)
@@ -0,0 +1,65 @@
1
+ from dataclasses import dataclass, field
2
+ from pathlib import Path
3
+ from typing import TextIO, Iterator
4
+
5
+ from biofiles.common import Reader, Writer
6
+ from biofiles.types.sequence import Sequence
7
+
8
+
9
+ __all__ = ["FASTAReader", "FASTAWriter"]
10
+
11
+
12
@dataclass
class _SequenceDraft:
    """Mutable accumulator for one FASTA record while its lines are read."""

    id: str
    description: str
    # Sequence chunks in the order they were appended; joined on finalize.
    sequence_parts: list[str] = field(default_factory=list)

    def finalize(self) -> Sequence:
        """Concatenate the collected chunks and build the ``Sequence``."""
        joined = "".join(self.sequence_parts)
        return Sequence(id=self.id, description=self.description, sequence=joined)
24
+
25
+
26
+ class FASTAReader(Reader):
27
+ def __iter__(self) -> Iterator[Sequence]:
28
+ draft: _SequenceDraft | None = None
29
+ for line in self._input:
30
+ line = line.rstrip("\n")
31
+ if line.startswith(">"):
32
+ if draft:
33
+ yield draft.finalize()
34
+ line = line.removeprefix(">").lstrip()
35
+ match line.split(maxsplit=1):
36
+ case [id_, desc]:
37
+ pass
38
+ case [id_]:
39
+ desc = ""
40
+ case []:
41
+ raise ValueError(
42
+ f"unexpected line {line!r}, expected a non-empty sequence identifier"
43
+ )
44
+ draft = _SequenceDraft(id=id_, description=desc)
45
+ elif line:
46
+ if not draft:
47
+ raise ValueError(f"unexpected line {line!r}, expected >")
48
+ draft.sequence_parts.append(line)
49
+ if draft:
50
+ yield draft.finalize()
51
+
52
+
53
class FASTAWriter(Writer):
    """Write ``Sequence`` objects as FASTA records with wrapped sequence lines."""

    def __init__(self, output: TextIO | Path | str, width: int = 80) -> None:
        """Create a writer.

        Args:
            output: an open text stream, or a path to open for writing.
            width: maximum number of sequence characters per line.

        Raises:
            ValueError: if ``width`` is less than 1.
        """
        # Validate before opening anything: a zero width would only fail at
        # write time and a negative one would silently drop the sequence.
        if width < 1:
            raise ValueError(f"width should be a positive integer, got {width!r}")
        super().__init__(output)
        self._width = width

    def write(self, sequence: Sequence) -> None:
        """Write one record: a ``>`` header line, then the wrapped sequence."""
        header = f">{sequence.id}"
        if sequence.description:
            # Only add the separator when there is a description, so records
            # without one do not get a trailing space in the header.
            header += f" {sequence.description}"
        self._output.write(header + "\n")

        residues = sequence.sequence
        # Slicing clamps at the end of the string, so the last chunk may be
        # shorter than the configured width.
        for offset in range(0, len(residues), self._width):
            self._output.write(residues[offset : offset + self._width])
            self._output.write("\n")
@@ -0,0 +1,283 @@
1
+ import sys
2
+ from collections import deque
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import Iterator, cast, TextIO
6
+
7
+ from biofiles.common import Strand, Reader
8
+ from biofiles.types.feature import Feature, Gene, Exon
9
+
10
+ __all__ = ["GFFReader"]
11
+
12
+
13
@dataclass
class _FeatureDraft:
    """Parsed columns of one GFF line, before features are linked together."""

    idx: int  # running index assigned by the reader, in file order
    sequence_id: str
    source: str
    type_: str
    start_original: int
    end_original: int
    score: float | None
    strand: Strand | None
    phase: int | None
    attributes: dict[str, str]

    def pick_attribute(self, *keys: str) -> str | None:
        """Return the value of the first of *keys* present, or ``None``."""
        found = (
            self.attributes[key]
            for key in keys
            if self.attributes.get(key, None) is not None
        )
        return next(found, None)
31
+
32
+
33
@dataclass
class _FeatureDrafts:
    """Drafts in file order plus a lookup of drafts by their ``ID`` attribute."""

    drafts: deque[_FeatureDraft] = field(default_factory=deque)
    by_id: dict[str, _FeatureDraft] = field(default_factory=dict)
    # TODO: the streaming version is expected to also track, per draft index,
    # the indices of the drafts naming it as "Parent" (a ``deps`` mapping),
    # and to offer a ``remove_first_n`` that pops drafts from the left and
    # drops their ``by_id`` / ``deps`` entries.

    def add(self, draft: _FeatureDraft) -> None:
        """Append *draft* and index it under its non-empty ``ID``, if any."""
        self.drafts.append(draft)
        draft_id = draft.attributes.get("ID", None)
        if draft_id:
            # A later draft with the same ID replaces the earlier entry.
            self.by_id[draft_id] = draft
53
+
54
+
55
@dataclass
class _Features:
    """Finalized features in order plus a lookup by their ``ID`` attribute."""

    features: list[Feature] = field(default_factory=list)
    by_id: dict[str, Feature] = field(default_factory=dict)

    def add(self, feature: Feature):
        """Append *feature* and index it under its non-empty ``ID``, if any."""
        self.features.append(feature)
        feature_id = feature.attributes.get("ID", None)
        if feature_id:
            # A later feature with the same ID replaces the earlier entry.
            self.by_id[feature_id] = feature
64
+
65
+
66
+ class GFFReader(Reader):
67
+ def __init__(
68
+ self, input_: TextIO | Path | str, /, streaming_window: int | None = 1000
69
+ ):
70
+ super().__init__(input_)
71
+ self._streaming_window = streaming_window
72
+
73
+ def __iter__(self) -> Iterator[Feature]:
74
+ for line in self._input:
75
+ line = line.rstrip("\n")
76
+ if line.startswith(_VERSION_PREFIX):
77
+ version = line.removeprefix(_VERSION_PREFIX)
78
+ if version == "3":
79
+ yield from self._read_gff3()
80
+ return
81
+ raise ValueError(f"unsupported version {version!r}")
82
+ if line.startswith("#"):
83
+ continue
84
+ raise ValueError(f"unexpected line {line!r}, expected version")
85
+
86
+ def _read_gff3(self) -> Iterator[Feature]:
87
+ drafts = _FeatureDrafts()
88
+ idx = 0
89
+ for line in self._input:
90
+ if line.startswith("#"):
91
+ continue
92
+ line = line.rstrip("\n")
93
+ parts = line.split("\t", maxsplit=8)
94
+ if len(parts) != 9:
95
+ raise ValueError(f"unexpected line {line!r}, expected 9 columns")
96
+ (
97
+ sequence_id,
98
+ source,
99
+ type_,
100
+ start_str,
101
+ end_str,
102
+ score_str,
103
+ strand_str,
104
+ phase_str,
105
+ attributes_str,
106
+ ) = parts
107
+ score = self._parse_score(line, score_str)
108
+ strand = self._parse_strand(line, strand_str)
109
+ phase = self._parse_phase(line, phase_str)
110
+ attributes = self._parse_attributes(line, attributes_str)
111
+
112
+ parent_id = attributes.get("Parent", None)
113
+ # if parent_id is None:
114
+ # yield from self._finalize_drafts(drafts)
115
+ # drafts = _FeatureDrafts()
116
+ if parent_id is not None and parent_id not in drafts.by_id:
117
+ raise ValueError(
118
+ f"unexpected line {line!r}, parent ID not among recent feature IDs"
119
+ )
120
+
121
+ draft = _FeatureDraft(
122
+ idx=idx,
123
+ sequence_id=sequence_id,
124
+ source=source,
125
+ type_=type_,
126
+ start_original=int(start_str),
127
+ end_original=int(end_str),
128
+ score=score,
129
+ strand=strand,
130
+ phase=phase,
131
+ attributes=attributes,
132
+ )
133
+ drafts.add(draft)
134
+ idx += 1
135
+
136
+ # yield from self._finalize_drafts(drafts, self._streaming_window)
137
+
138
+ yield from self._finalize_drafts(drafts, None)
139
+
140
+ def _finalize_drafts(
141
+ self, drafts: _FeatureDrafts, w: int | None
142
+ ) -> Iterator[Feature]:
143
+ # TODO streaming version!
144
+ # code below is already tracking
145
+ # if not drafts.drafts:
146
+ # return
147
+ # if w is not None and len(drafts.drafts) <= w:
148
+ # return
149
+ #
150
+ # end_idx = drafts.drafts[-w].idx if w is not None else drafts.drafts[-1].idx
151
+ #
152
+ # i = 0
153
+ # while i < len(drafts.drafts) and (
154
+ # not drafts.deps[drafts.drafts[i].idx]
155
+ # or drafts.deps[drafts.drafts[i].idx][-1] <= end_idx
156
+ # ):
157
+ # i += 1
158
+ #
159
+ # print(f"FINALIZING {i} DRAFTS OUT OF {len(drafts.drafts)}")
160
+ #
161
+ # result = _Features()
162
+ # for j in range(i):
163
+ # draft = drafts.drafts[j]
164
+ # feature = self._finalize_draft(draft, result)
165
+ # result.add(feature)
166
+ # drafts.remove_first_n(i)
167
+ # yield from result.features
168
+
169
+ result = _Features()
170
+ for draft in drafts.drafts:
171
+ feature = self._finalize_draft(draft, result)
172
+ result.add(feature)
173
+ yield from result.features
174
+
175
+ def _finalize_draft(self, draft: _FeatureDraft, result: _Features) -> Feature:
176
+ match draft.type_:
177
+ case "gene":
178
+ feature = self._finalize_gene(draft, result)
179
+ case "exon":
180
+ feature = self._finalize_exon(draft, result)
181
+ case _:
182
+ feature = self._finalize_other(draft, result)
183
+ if feature.parent:
184
+ new_children = feature.parent.children + (feature,)
185
+ object.__setattr__(feature.parent, "children", new_children)
186
+ return feature
187
+
188
+ def _finalize_gene(self, draft: _FeatureDraft, result: _Features) -> Feature:
189
+ feature = self._finalize_other(draft, result)
190
+ name = draft.pick_attribute("gene_name", "Name")
191
+ biotype = draft.pick_attribute("gene_biotype", "biotype")
192
+ if name is None or biotype is None:
193
+ return feature
194
+ return Gene(**feature.__dict__, name=name, biotype=biotype, exons=())
195
+
196
+ def _finalize_exon(self, draft: _FeatureDraft, result: _Features) -> Feature:
197
+ feature = self._finalize_other(draft, result)
198
+
199
+ gene = feature.parent
200
+ while gene and not isinstance(gene, Gene):
201
+ gene = gene.parent
202
+
203
+ if gene is None:
204
+ return feature
205
+ exon = Exon(**feature.__dict__, gene=gene)
206
+ object.__setattr__(gene, "exons", gene.exons + (exon,))
207
+ return exon
208
+
209
+ def _finalize_other(self, draft: _FeatureDraft, result: _Features) -> Feature:
210
+ parent_id = draft.attributes.get("Parent", None)
211
+ parent = result.by_id[parent_id] if parent_id is not None else None
212
+
213
+ return Feature(
214
+ sequence_id=draft.sequence_id,
215
+ source=draft.source,
216
+ type_=draft.type_,
217
+ start_original=draft.start_original,
218
+ end_original=draft.end_original,
219
+ start_c=draft.start_original - 1,
220
+ end_c=draft.end_original,
221
+ score=draft.score,
222
+ strand=draft.strand,
223
+ phase=draft.phase,
224
+ attributes=draft.attributes,
225
+ parent=parent,
226
+ children=(),
227
+ )
228
+
229
+ def _parse_score(self, line: str, score_str: str) -> float | None:
230
+ if score_str == ".":
231
+ return None
232
+ try:
233
+ return float(score_str)
234
+ except ValueError as exc:
235
+ raise ValueError(
236
+ f"unexpected line {line!r}, score should be a number or '.'"
237
+ ) from exc
238
+
239
+ def _parse_strand(self, line: str, strand_str: str) -> Strand | None:
240
+ if strand_str in ("-", "+"):
241
+ return cast(Strand, strand_str)
242
+ if strand_str == ".":
243
+ return None
244
+ raise ValueError(f"unexpected line {line!r}, strand should be '-', '+' or '.'")
245
+
246
+ def _parse_phase(self, line: str, phase_str: str) -> int | None:
247
+ if phase_str == ".":
248
+ return None
249
+ try:
250
+ return int(phase_str)
251
+ except ValueError as exc:
252
+ raise ValueError(
253
+ f"unexpected line {line!r}, phase should be an integer or '.'"
254
+ ) from exc
255
+
256
+ def _parse_attributes(self, line: str, attributes_str: str) -> dict[str, str]:
257
+ return {
258
+ k: v
259
+ for part in attributes_str.strip(";").split(";")
260
+ for k, v in (part.split("=", 1),)
261
+ }
262
+
263
+
264
# Prefix of the version line that GFFReader looks for before reading
# features; the text after it is the version string.
_VERSION_PREFIX = "##gff-version "
265
+
266
+
267
if __name__ == "__main__":
    # Print, for every file given on the command line, how many gene/exon
    # lines were turned into Gene/Exon objects.
    for path in sys.argv[1:]:
        with GFFReader(path) as r:
            total_features = annotated_genes = annotated_exons = 0
            parsed_genes = parsed_exons = 0
            for feature in r:
                total_features += 1
                if feature.type_ == "gene":
                    annotated_genes += 1
                if feature.type_ == "exon":
                    annotated_exons += 1
                if isinstance(feature, Gene):
                    parsed_genes += 1
                if isinstance(feature, Exon):
                    parsed_exons += 1
        print(
            f"{path}: {total_features} features, {parsed_genes} genes parsed out of {annotated_genes}, {parsed_exons} exons parsed out of {annotated_exons}"
        )
@@ -0,0 +1,87 @@
1
+ import sys
2
+ from collections import Counter
3
+ from typing import Iterator
4
+
5
+ from biofiles.common import Reader
6
+ from biofiles.types.repeat import Repeat
7
+
8
+
9
+ __all__ = ["RepeatMaskerReader"]
10
+
11
+
12
class RepeatMaskerReader(Reader):
    """Iterate over tab-separated RepeatMasker annotations as ``Repeat`` records."""

    def __iter__(self) -> Iterator[Repeat]:
        """Yield one ``Repeat`` per line that has 14 or 15 tab-separated columns.

        Lines with any other number of columns are skipped.

        Raises:
            ValueError: if a numeric column cannot be parsed.
            KeyError: if the strand column is neither ``+`` nor ``C``.
        """
        for line in self._input:
            # Drop the line terminator before splitting; otherwise it stays
            # glued to the last column and breaks the "(N)" parsing on
            # 14-column lines and pollutes the repeat ID on 15-column ones.
            parts = line.rstrip("\r\n").split("\t")
            if not (14 <= len(parts) <= 15):
                # Probably some metainfo. No way to tell.
                continue

            (
                sw_score_str,
                div_str,
                del_str,
                ins_str,
                seq_id,
                seq_start_str,
                seq_end_str,
                seq_left_str,
                strand_str,
                repeat_name,
                repeat_class_family,
                repeat_start_str,
                repeat_end_str,
                repeat_left_str,
                *repeat_id_or_none,
            ) = parts

            sw_score = int(sw_score_str)
            div_percent = float(div_str)
            del_percent = float(del_str)
            ins_percent = float(ins_str)
            seq_start = int(seq_start_str)
            seq_end = int(seq_end_str)
            # The "left" columns are wrapped in one character on each side
            # (parentheses, presumably), which is stripped before parsing.
            seq_left = int(seq_left_str[1:-1])
            strand = {"+": "+", "C": "-"}[strand_str]

            if "/" in repeat_class_family:
                repeat_class, repeat_family = repeat_class_family.split("/", 1)
            else:
                repeat_class, repeat_family = repeat_class_family, None
            if strand_str == "C":
                # On the complement strand the start and "left" columns
                # trade places in the input.
                repeat_start_str, repeat_left_str = (repeat_left_str, repeat_start_str)
            repeat_start = int(repeat_start_str)
            repeat_end = int(repeat_end_str)
            repeat_left = int(repeat_left_str[1:-1])
            repeat_id = repeat_id_or_none[0] if repeat_id_or_none else None
            yield Repeat(
                sw_score=sw_score,
                divergence_percent=div_percent,
                insertion_percent=ins_percent,
                deletion_percent=del_percent,
                sequence_id=seq_id,
                sequence_start_original=seq_start,
                sequence_end_original=seq_end,
                sequence_start_c=seq_start - 1,
                sequence_end_c=seq_end,
                sequence_left=seq_left,
                strand=strand,
                repeat_name=repeat_name,
                repeat_class=repeat_class,
                repeat_family=repeat_family,
                repeat_start_original=repeat_start,
                repeat_end_original=repeat_end,
                repeat_start_c=repeat_start - 1,
                repeat_end_c=repeat_end,
                repeat_left=repeat_left,
                repeat_id=repeat_id,
            )
79
+
80
+
81
if __name__ == "__main__":
    # Print, for every file given on the command line, the number of
    # repeats per repeat class, most frequent first.
    for path in sys.argv[1:]:
        repeats_per_class = Counter()
        with RepeatMaskerReader(path) as r:
            for repeat in r:
                repeats_per_class[repeat.repeat_class] += 1
        print(f"Repeat classes in {path}:")
        for k, v in repeats_per_class.most_common():
            print(f" {k}: {v} repeats")
File without changes
@@ -0,0 +1,45 @@
1
+ from dataclasses import dataclass
2
+
3
+ from biofiles.common import Strand
4
+
5
+
6
+ __all__ = ["Feature", "Gene", "Exon"]
7
+
8
+
9
@dataclass(frozen=True)
class Feature:
    """One annotated feature, linked to its parent and child features."""

    sequence_id: str
    source: str
    type_: str

    # Original, 1-based inclusive values.
    start_original: int
    end_original: int

    # Standardized ("C-style") 0-based values, start inclusive, end exclusive.
    start_c: int
    end_c: int

    score: float | None
    strand: Strand | None
    phase: int | None
    attributes: dict[str, str]

    # The forward reference must name this class so that type checkers and
    # typing.get_type_hints() can resolve it.
    parent: "Feature | None"
    children: tuple["Feature", ...]
30
+
31
+
32
+ # Custom types for particular kinds of features:
33
+
34
+
35
@dataclass(frozen=True)
class Gene(Feature):
    """A ``Feature`` that additionally carries a name, a biotype and its exons."""

    name: str
    biotype: str
    # Exons attached to this gene, as a tuple.
    exons: tuple["Exon", ...]
40
+
41
+
42
@dataclass(frozen=True)
class Exon(Feature):
    """A ``Feature`` that additionally references the ``Gene`` it belongs to."""

    gene: Gene
    # TODO transcript, mRNA
@@ -0,0 +1,30 @@
1
+ from dataclasses import dataclass
2
+
3
+ from biofiles.common import Strand
4
+
5
+
6
+ __all__ = ["Repeat"]
7
+
8
+
9
@dataclass(frozen=True)
class Repeat:
    """One repeat annotation record.

    Coordinates come in two flavours: ``*_original`` fields and ``*_c``
    fields.  NOTE(review): the ``*_c`` fields look like the 0-based,
    end-exclusive counterparts of the original values - confirm against
    the code that constructs this class.
    """

    # Alignment statistics.
    sw_score: int
    divergence_percent: float
    deletion_percent: float
    insertion_percent: float

    # Position within the annotated sequence.
    sequence_id: str
    sequence_start_original: int
    sequence_end_original: int
    sequence_start_c: int
    sequence_end_c: int
    sequence_left: int
    strand: Strand

    # Identity of the repeat; the family is optional.
    repeat_name: str
    repeat_class: str
    repeat_family: str | None

    # Position within the repeat itself.
    repeat_start_original: int
    repeat_end_original: int
    repeat_start_c: int
    repeat_end_c: int
    repeat_left: int

    # Optional identifier of the record.
    repeat_id: str | None
@@ -0,0 +1,11 @@
1
+ from dataclasses import dataclass
2
+
3
+
4
+ __all__ = ["Sequence"]
5
+
6
+
7
@dataclass(frozen=True)
class Sequence:
    """An immutable sequence record: identifier, description and sequence text."""

    id: str
    description: str
    sequence: str
@@ -0,0 +1,95 @@
1
+ Metadata-Version: 2.1
2
+ Name: biofiles
3
+ Version: 0.0.1
4
+ Summary: Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers
5
+ Author-email: Tigran Saluev <tigran@saluev.com>
6
+ Maintainer-email: Tigran Saluev <tigran@saluev.com>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2023 Tigran Saluev
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+
29
+ Classifier: Programming Language :: Python :: 3
30
+ Classifier: License :: OSI Approved :: MIT License
31
+ Classifier: Operating System :: OS Independent
32
+ Classifier: Programming Language :: Python :: 3.10
33
+ Classifier: Programming Language :: Python :: 3.11
34
+ Classifier: Programming Language :: Python :: 3.12
35
+ Requires-Python: >=3.10
36
+ Description-Content-Type: text/markdown
37
+ License-File: LICENSE
38
+
39
+ # biofiles
40
+
41
+ Pure-Python, zero-dependency collection of bioinformatics-related
42
+ file readers and writers.
43
+
44
+ ## Installation
45
+
46
+ ```shell
47
+ python -m pip install biofiles
48
+ ```
49
+
50
+ ## Usage
51
+
52
+ Reading FASTA files:
53
+
54
+ ```python
55
+ from biofiles.fasta import FASTAReader
56
+
57
+ with FASTAReader("sequences.fasta") as r:
58
+ for seq in r:
59
+ print(seq.id, len(seq.sequence))
60
+
61
+ # or
62
+
63
+ with open("sequences.fasta") as f:
64
+ r = FASTAReader(f)
65
+ for seq in r:
66
+ print(seq.id, len(seq.sequence))
67
+ ```
68
+
69
+ Writing FASTA files:
70
+
71
+ ```python
72
+ from biofiles.fasta import FASTAWriter
73
+ from biofiles.types.sequence import Sequence
74
+
75
+ seq = Sequence(id="SEQ", description="Important sequence", sequence="GAGAGA")
76
+
77
+ with FASTAWriter("output.fasta") as w:
78
+ w.write(seq)
79
+ ```
80
+
81
+ Reading GFF genome annotations:
82
+
83
+ ```python
84
+ from biofiles.gff import GFFReader
85
+ from biofiles.types.feature import Gene
86
+
87
+ with GFFReader("GCF_009914755.1_T2T-CHM13v2.0_genomic.gff") as r:
88
+ for feature in r:
89
+ if isinstance(feature, Gene):
90
+ print(feature.name, len(feature.exons))
91
+ ```
92
+
93
+ ## License
94
+
95
+ MIT license, see [License](LICENSE).
@@ -0,0 +1,16 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ biofiles/__init__.py
5
+ biofiles/common.py
6
+ biofiles/fasta.py
7
+ biofiles/gff.py
8
+ biofiles/repeatmasker.py
9
+ biofiles.egg-info/PKG-INFO
10
+ biofiles.egg-info/SOURCES.txt
11
+ biofiles.egg-info/dependency_links.txt
12
+ biofiles.egg-info/top_level.txt
13
+ biofiles/types/__init__.py
14
+ biofiles/types/feature.py
15
+ biofiles/types/repeat.py
16
+ biofiles/types/sequence.py
@@ -0,0 +1 @@
1
+ biofiles
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "biofiles"
7
+ version = "0.0.1"
8
+ authors = [
9
+ { name="Tigran Saluev", email="tigran@saluev.com" },
10
+ ]
11
+ maintainers = [
12
+ { name="Tigran Saluev", email="tigran@saluev.com" },
13
+ ]
14
+ description = "Pure-Python, zero-dependency collection of bioinformatics-related file readers and writers"
15
+ readme = "README.md"
16
+ license = {file = "LICENSE"}
17
+ requires-python = ">=3.10"
18
+ classifiers = [
19
+ "Programming Language :: Python :: 3",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Operating System :: OS Independent",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+