bioseqkit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bioseqkit/__init__.py +57 -0
- bioseqkit/cli.py +140 -0
- bioseqkit/entrez.py +39 -0
- bioseqkit/index.py +164 -0
- bioseqkit/io.py +182 -0
- bioseqkit/kmer.py +141 -0
- bioseqkit/py.typed +0 -0
- bioseqkit/stats.py +111 -0
- bioseqkit/transform.py +95 -0
- bioseqkit-0.1.0.dist-info/METADATA +141 -0
- bioseqkit-0.1.0.dist-info/RECORD +14 -0
- bioseqkit-0.1.0.dist-info/WHEEL +4 -0
- bioseqkit-0.1.0.dist-info/entry_points.txt +2 -0
- bioseqkit-0.1.0.dist-info/licenses/LICENSE +21 -0
bioseqkit/__init__.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""bioseqkit: a lightweight, dependency-free biological sequence toolkit.
|
|
2
|
+
|
|
3
|
+
The package provides pure-Python FASTA/FASTQ parsing, sequence statistics,
|
|
4
|
+
transformations (reverse complement, six-frame translation), k-mer analysis,
|
|
5
|
+
minimizer sampling and FAI-like random-access indexing.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from bioseqkit.io import (
|
|
9
|
+
FastaRecord,
|
|
10
|
+
FastqRecord,
|
|
11
|
+
parse_fasta,
|
|
12
|
+
parse_fastq,
|
|
13
|
+
write_fasta,
|
|
14
|
+
)
|
|
15
|
+
from bioseqkit.stats import (
|
|
16
|
+
SeqStats,
|
|
17
|
+
base_composition,
|
|
18
|
+
gc_content,
|
|
19
|
+
n_ratio,
|
|
20
|
+
sequence_stats,
|
|
21
|
+
)
|
|
22
|
+
from bioseqkit.transform import reverse_complement, six_frame_translation, translate
|
|
23
|
+
from bioseqkit.kmer import (
|
|
24
|
+
canonical_kmer,
|
|
25
|
+
count_kmers,
|
|
26
|
+
count_kmers_parallel,
|
|
27
|
+
minimizers,
|
|
28
|
+
top_kmers,
|
|
29
|
+
)
|
|
30
|
+
from bioseqkit.index import FaidxIndex, build_faidx, fetch
|
|
31
|
+
|
|
32
|
+
__version__ = "0.1.0"
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
"FastaRecord",
|
|
36
|
+
"FastqRecord",
|
|
37
|
+
"parse_fasta",
|
|
38
|
+
"parse_fastq",
|
|
39
|
+
"write_fasta",
|
|
40
|
+
"SeqStats",
|
|
41
|
+
"sequence_stats",
|
|
42
|
+
"gc_content",
|
|
43
|
+
"n_ratio",
|
|
44
|
+
"base_composition",
|
|
45
|
+
"reverse_complement",
|
|
46
|
+
"translate",
|
|
47
|
+
"six_frame_translation",
|
|
48
|
+
"count_kmers",
|
|
49
|
+
"count_kmers_parallel",
|
|
50
|
+
"top_kmers",
|
|
51
|
+
"canonical_kmer",
|
|
52
|
+
"minimizers",
|
|
53
|
+
"FaidxIndex",
|
|
54
|
+
"build_faidx",
|
|
55
|
+
"fetch",
|
|
56
|
+
"__version__",
|
|
57
|
+
]
|
bioseqkit/cli.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""Command-line interface for bioseqkit.
|
|
2
|
+
|
|
3
|
+
Implemented with :mod:`argparse` (standard library) to keep the core package
|
|
4
|
+
dependency-free. Sub-commands: ``stats``, ``revcomp``, ``translate``,
|
|
5
|
+
``kmer``, ``minimizer``, ``index`` and ``fetch``.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import json
|
|
12
|
+
import sys
|
|
13
|
+
from typing import Sequence
|
|
14
|
+
|
|
15
|
+
from bioseqkit import __version__
|
|
16
|
+
from bioseqkit.index import build_faidx, fetch as fetch_region
|
|
17
|
+
from bioseqkit.io import FastaRecord, parse_fasta, write_fasta
|
|
18
|
+
from bioseqkit.kmer import count_kmers, count_kmers_parallel, minimizers, top_kmers
|
|
19
|
+
from bioseqkit.stats import sequence_stats
|
|
20
|
+
from bioseqkit.transform import reverse_complement, six_frame_translation
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _read_sequences(path: str) -> list[FastaRecord]:
    """Read every FASTA record from ``path`` and return them as a list."""
    loaded: list[FastaRecord] = []
    loaded.extend(parse_fasta(path))
    return loaded
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def cmd_stats(args: argparse.Namespace) -> int:
    """Print aggregate statistics for the FASTA file ``args.input`` as JSON.

    Returns the process exit code ``0``.
    """
    sequences = (record.sequence for record in _read_sequences(args.input))
    summary = sequence_stats(sequences).as_dict()
    print(json.dumps(summary, indent=2))
    return 0
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def cmd_revcomp(args: argparse.Namespace) -> int:
    """Write the reverse complement of every record in ``args.input`` to stdout.

    Each output description is the input description with ``" revcomp"``
    appended (whitespace-trimmed). Returns the process exit code ``0``.
    """
    flipped: list[FastaRecord] = []
    for record in _read_sequences(args.input):
        label = (record.description + " revcomp").strip()
        flipped.append(FastaRecord(record.id, label, reverse_complement(record.sequence)))
    write_fasta(flipped, sys.stdout)
    return 0
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def cmd_translate(args: argparse.Namespace) -> int:
    """Write the six-frame translation of every record in ``args.input`` to stdout.

    One FASTA record is emitted per frame, named ``<id>_frame<frame name>``
    with an empty description. Returns the process exit code ``0``.
    """
    proteins = [
        FastaRecord(f"{record.id}_frame{frame.name}", "", frame.protein)
        for record in _read_sequences(args.input)
        for frame in six_frame_translation(record.sequence)
    ]
    write_fasta(proteins, sys.stdout)
    return 0
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def cmd_kmer(args: argparse.Namespace) -> int:
    """Print the ``args.top`` most frequent k-mers of ``args.input``.

    Output is one ``kmer<TAB>count`` line per k-mer. When ``args.threads`` is
    greater than 1 the counting is delegated to ``count_kmers_parallel``;
    otherwise every sequence is counted in-process and merged.

    Raises:
        ValueError: if ``args.k`` is not a positive integer.

    Returns the process exit code ``0``.
    """
    # Function-scope import: this is the only command that needs Counter.
    from collections import Counter

    seqs = [rec.sequence for rec in _read_sequences(args.input)]
    # Validate k once, up front, so both the serial and the parallel path
    # reject a bad value before any chunking or counting work starts.
    if args.k <= 0:
        raise ValueError("k must be a positive integer")
    if args.threads and args.threads > 1:
        counts = count_kmers_parallel(seqs, args.k, canonical=args.canonical, workers=args.threads)
    else:
        # Merge in place; every per-sequence count is positive, so ``update``
        # yields the same totals as repeated ``+`` without rebuilding the
        # accumulated Counter for each sequence.
        counts = Counter()
        for seq in seqs:
            counts.update(count_kmers(seq, args.k, canonical=args.canonical))
    for kmer, count in top_kmers(counts, args.top):
        print(f"{kmer}\t{count}")
    return 0
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def cmd_minimizer(args: argparse.Namespace) -> int:
    """Print the minimizers of every record in ``args.input``.

    Output is one ``id<TAB>position<TAB>minimizer`` line per selected k-mer.
    Canonical k-mers are used unless ``args.no_canonical`` is set.
    Returns the process exit code ``0``.
    """
    use_canonical = not args.no_canonical
    for record in _read_sequences(args.input):
        hits = minimizers(record.sequence, args.k, args.w, canonical=use_canonical)
        for position, kmer in hits:
            print(f"{record.id}\t{position}\t{kmer}")
    return 0
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def cmd_index(args: argparse.Namespace) -> int:
    """Build an index for ``args.input``, write it out and report the result.

    Returns the process exit code ``0``.
    """
    built = build_faidx(args.input)
    written_to = built.write()
    n_records = len(built.records)
    print(f"Wrote index: {written_to} ({n_records} sequences)")
    return 0
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def cmd_fetch(args: argparse.Namespace) -> int:
    """Print the sub-sequence ``args.region`` of ``args.input`` as FASTA.

    The region string itself is used as the record header and the sequence is
    wrapped at 70 columns. Returns the process exit code ``0``.
    """
    region = args.region
    bases = fetch_region(args.input, region)
    print(f">{region}")
    cursor = 0
    while cursor < len(bases):
        print(bases[cursor : cursor + 70])
        cursor += 70
    return 0
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``bioseqkit`` argument parser with all of its sub-commands.

    Every sub-parser registers its handler under the ``func`` default, so a
    caller can dispatch with ``args.func(args)`` after parsing. A sub-command
    is required.
    """
    # The module docstring is None when docstrings are stripped (python -OO);
    # fall back to no description instead of failing on ``None.split``.
    summary = __doc__.split("\n")[0] if __doc__ else None
    parser = argparse.ArgumentParser(prog="bioseqkit", description=summary)
    parser.add_argument("--version", action="version", version=f"bioseqkit {__version__}")
    sub = parser.add_subparsers(dest="command", required=True)

    p = sub.add_parser("stats", help="sequence statistics (JSON)")
    p.add_argument("input", help="FASTA/FASTA.gz file")
    p.set_defaults(func=cmd_stats)

    p = sub.add_parser("revcomp", help="reverse complement")
    p.add_argument("input")
    p.set_defaults(func=cmd_revcomp)

    p = sub.add_parser("translate", help="six-frame translation")
    p.add_argument("input")
    p.set_defaults(func=cmd_translate)

    p = sub.add_parser("kmer", help="k-mer frequency analysis")
    p.add_argument("input")
    p.add_argument("-k", type=int, default=5, help="k-mer size (default 5)")
    p.add_argument("--top", type=int, default=10, help="report top-N k-mers")
    p.add_argument("--canonical", action="store_true", help="merge reverse-complement k-mers")
    p.add_argument("-t", "--threads", type=int, default=1, help="parallel worker processes")
    p.set_defaults(func=cmd_kmer)

    p = sub.add_parser("minimizer", help="minimizer sampling")
    p.add_argument("input")
    p.add_argument("-k", type=int, default=15, help="k-mer size (default 15)")
    p.add_argument("-w", type=int, default=10, help="window size (default 10)")
    p.add_argument("--no-canonical", action="store_true", help="disable canonical k-mers")
    p.set_defaults(func=cmd_minimizer)

    p = sub.add_parser("index", help="build a FAI-like index")
    p.add_argument("input")
    p.set_defaults(func=cmd_index)

    p = sub.add_parser("fetch", help="fetch a sub-sequence (chr:start-end)")
    p.add_argument("input")
    p.add_argument("region", help="region string, e.g. chr1:1000-2000")
    p.set_defaults(func=cmd_fetch)

    return parser
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Parse ``argv`` and run the selected sub-command.

    ``argv`` is handed to ``parse_args`` unchanged. The return value is
    whatever the sub-command handler returns.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
if __name__ == "__main__": # pragma: no cover
|
|
140
|
+
raise SystemExit(main())
|
bioseqkit/entrez.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Optional NCBI Entrez download helper.
|
|
2
|
+
|
|
3
|
+
Fetches reference sequences from the NCBI E-utilities ``efetch`` endpoint using
|
|
4
|
+
only the standard library (``urllib``), so no third-party HTTP client is
|
|
5
|
+
required. Network access is required at call time.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import urllib.parse
|
|
11
|
+
import urllib.request
|
|
12
|
+
|
|
13
|
+
__all__ = ["efetch_fasta"]
|
|
14
|
+
|
|
15
|
+
_EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def efetch_fasta(
    accession: str,
    db: str = "nuccore",
    email: str | None = None,
    api_key: str | None = None,
    timeout: float = 30.0,
) -> str:
    """Request ``accession`` from the efetch endpoint and return the reply text.

    The query asks for ``rettype=fasta`` / ``retmode=text``. ``email`` and
    ``api_key`` are added to the query only when they are truthy. The response
    body is decoded as UTF-8.
    """
    query = {"db": db, "id": accession, "rettype": "fasta", "retmode": "text"}
    optional = (("email", email), ("api_key", api_key))
    query.update({key: value for key, value in optional if value})
    request = urllib.request.Request(
        f"{_EFETCH}?{urllib.parse.urlencode(query)}",
        headers={"User-Agent": "bioseqkit/0.1"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:  # noqa: S310
        payload = response.read()
    return payload.decode("utf-8")
|
bioseqkit/index.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""FAI-like FASTA indexing for random access.
|
|
2
|
+
|
|
3
|
+
Mirrors the ``samtools faidx`` (``*.fai``) format so that arbitrary
|
|
4
|
+
sub-sequences can be fetched without reading the whole file. Each index line
|
|
5
|
+
holds: name, sequence length, byte offset of the first base, number of bases
|
|
6
|
+
per line and number of bytes per line (including the newline).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
|
|
14
|
+
__all__ = ["FaidxRecord", "FaidxIndex", "build_faidx", "fetch", "parse_region"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
class FaidxRecord:
    """One entry of the index, serialised as a single tab-separated line."""

    name: str  # sequence name
    length: int  # sequence length
    offset: int  # byte offset of the first base
    linebases: int  # number of bases per line
    linewidth: int  # number of bytes per line (including the newline)

    def to_line(self) -> str:
        """Render the record as five tab-separated columns (no newline)."""
        columns = (self.name, self.length, self.offset, self.linebases, self.linewidth)
        return "\t".join(str(column) for column in columns)

    @classmethod
    def from_line(cls, line: str) -> "FaidxRecord":
        """Parse one tab-separated index line; the four numeric columns become ints."""
        fields = line.rstrip("\n").split("\t")
        name, length, offset, linebases, linewidth = fields
        return cls(name, *map(int, (length, offset, linebases, linewidth)))
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class FaidxIndex:
    """An in-memory FASTA index bound to a plain-text FASTA file.

    ``records`` maps a sequence name to its index entry; ``fasta_path`` is the
    file the byte offsets in those entries refer to.
    """

    def __init__(self, fasta_path: str, records: dict[str, FaidxRecord]):
        self.fasta_path = fasta_path
        self.records = records

    def names(self) -> list[str]:
        """Return the indexed sequence names in insertion order."""
        return list(self.records)

    def write(self, fai_path: str | None = None) -> str:
        """Write one line per record to ``fai_path`` and return the path used.

        Defaults to ``<fasta_path>.fai`` when ``fai_path`` is not given.
        """
        path = fai_path or self.fasta_path + ".fai"
        with open(path, "w") as fh:
            for rec in self.records.values():
                fh.write(rec.to_line() + "\n")
        return path

    @classmethod
    def load(cls, fasta_path: str, fai_path: str | None = None) -> "FaidxIndex":
        """Load an index from ``fai_path`` (default ``<fasta_path>.fai``).

        Blank lines are skipped; a repeated name keeps the last entry read.
        """
        path = fai_path or fasta_path + ".fai"
        records: dict[str, FaidxRecord] = {}
        with open(path) as fh:
            for line in fh:
                if line.strip():
                    rec = FaidxRecord.from_line(line)
                    records[rec.name] = rec
        return cls(fasta_path, records)

    def fetch(self, name: str, start: int | None = None, end: int | None = None) -> str:
        """Fetch a sub-sequence using 1-based inclusive ``start``/``end``.

        With no coordinates the whole sequence is returned.

        Raises:
            KeyError: if ``name`` is not in the index.
            ValueError: if the region lies outside the sequence, or the index
                entry reports no bases per line for a non-empty region.
        """
        if name not in self.records:
            raise KeyError(f"Sequence {name!r} not found in index")
        rec = self.records[name]
        s = 0 if start is None else start - 1
        e = rec.length if end is None else end
        if s < 0 or e > rec.length or s > e:
            raise ValueError(f"Region out of bounds for {name} (length {rec.length})")
        n_bases = e - s
        if n_bases == 0:
            # Empty region or empty record: nothing to read, and an empty
            # record has linebases == 0, which must not reach divmod below.
            return ""
        if rec.linebases <= 0:
            raise ValueError(f"Corrupt index entry for {name}: no bases per line")
        newline_bytes = rec.linewidth - rec.linebases
        start_line, start_col = divmod(s, rec.linebases)
        byte_start = rec.offset + start_line * rec.linewidth + start_col
        # Read enough bytes to cover the requested bases plus newlines; the
        # extra line of slack may run into the next record's header.
        n_lines = (start_col + n_bases) // rec.linebases
        n_read = n_bases + n_lines * newline_bytes + rec.linewidth
        with open(self.fasta_path, "rb") as fh:
            fh.seek(byte_start)
            raw = fh.read(n_read)
        # Truncate BEFORE decoding so over-read bytes (e.g. a following header
        # containing non-ASCII text) can never trigger a decode error.
        bases = raw.replace(b"\n", b"").replace(b"\r", b"")[:n_bases]
        return bases.decode("ascii")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def build_faidx(fasta_path: str) -> FaidxIndex:
    """Scan the plain-text FASTA file ``fasta_path`` and index every record.

    The file is read in binary mode so that byte offsets are exact. A record's
    name is the first whitespace-delimited token of its header line.

    Raises:
        ValueError: if ``fasta_path`` ends with ``.gz``, or if a record fails
            the line-length validation.
    """
    if fasta_path.endswith(".gz"):
        raise ValueError("faidx requires an uncompressed FASTA file")

    index: dict[str, FaidxRecord] = {}
    current: str | None = None  # name of the record being scanned
    n_bases = data_start = bases_per_line = bytes_per_line = 0
    seen_lengths: list[int] = []
    cursor = 0  # byte position of the line currently being examined

    with open(fasta_path, "rb") as stream:
        for chunk in stream:
            if chunk.startswith(b">"):
                # A new header closes the record scanned so far.
                if current is not None:
                    _validate_lines(current, seen_lengths)
                    index[current] = FaidxRecord(
                        current, n_bases, data_start, bases_per_line, bytes_per_line
                    )
                title = chunk[1:].decode("ascii", "replace").strip()
                current = title.split()[0] if title else ""
                n_bases = bases_per_line = bytes_per_line = 0
                seen_lengths = []
                data_start = cursor + len(chunk)
            elif current is not None:
                payload = len(chunk.rstrip(b"\r\n"))
                if bases_per_line == 0:
                    # Line geometry is taken from the first non-empty line.
                    bases_per_line, bytes_per_line = payload, len(chunk)
                seen_lengths.append(payload)
                n_bases += payload
            cursor += len(chunk)
        # The final record has no following header to close it.
        if current is not None:
            _validate_lines(current, seen_lengths)
            index[current] = FaidxRecord(
                current, n_bases, data_start, bases_per_line, bytes_per_line
            )
    return FaidxIndex(fasta_path, index)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _validate_lines(name: str, line_lengths: list[int]) -> None:
|
|
136
|
+
if len(line_lengths) > 1:
|
|
137
|
+
body = line_lengths[:-1]
|
|
138
|
+
if len(set(body)) > 1:
|
|
139
|
+
raise ValueError(f"Inconsistent line lengths in record {name!r}; cannot index")
|
|
140
|
+
if line_lengths[-1] > body[0]:
|
|
141
|
+
raise ValueError(f"Last line longer than others in record {name!r}")
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def parse_region(region: str) -> tuple[str, int | None, int | None]:
    """Parse a ``chr:start-end`` region string (1-based, inclusive).

    Accepted forms (thousands separators ``,`` in the span are ignored):

    * ``chr`` - whole sequence, returns ``(chr, None, None)``
    * ``chr:pos`` - a single position, returns ``(chr, pos, pos)``
    * ``chr:start-end`` - closed range
    * ``chr:start-`` - from ``start`` to the end, ``end`` is ``None``
    * ``chr:-end`` - from the beginning to ``end``, ``start`` is ``None``

    Only the last ``:`` separates the name from the span, so names that
    themselves contain ``:`` work when a span is given.

    Raises:
        ValueError: if the span is empty or is not made of integers.
    """
    if ":" not in region:
        return region, None, None
    name, span = region.rsplit(":", 1)
    span = span.replace(",", "")
    start_s, dash, end_s = span.partition("-")
    try:
        if not dash:
            # No range separator: a single position.
            position = int(span)
            return name, position, position
        if not start_s.strip() and not end_s.strip():
            raise ValueError("empty range")
        # An omitted side means "open-ended" and is reported as None.
        start = int(start_s) if start_s.strip() else None
        end = int(end_s) if end_s.strip() else None
    except ValueError:
        raise ValueError(
            f"Invalid region {region!r}: expected 'name', 'name:pos' or 'name:start-end'"
        ) from None
    return name, start, end
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def fetch(fasta_path: str, region: str) -> str:
    """Return the bases of ``region`` from ``fasta_path`` in a single call.

    An existing ``<fasta_path>.fai`` is loaded; otherwise the index is built
    in memory by scanning the FASTA file.
    """
    if os.path.exists(fasta_path + ".fai"):
        index = FaidxIndex.load(fasta_path)
    else:
        index = build_faidx(fasta_path)
    seq_name, first, last = parse_region(region)
    return index.fetch(seq_name, first, last)
|
bioseqkit/io.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Sequence file I/O.
|
|
2
|
+
|
|
3
|
+
Pure-Python, streaming FASTA/FASTQ parsers implemented with the
|
|
4
|
+
iterator/generator pattern so that arbitrarily large files can be processed
|
|
5
|
+
with constant memory. Both plain-text and gzip-compressed files are supported
|
|
6
|
+
transparently (detected by the ``.gz`` extension or the gzip magic bytes).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import gzip
|
|
12
|
+
import io as _io
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from typing import IO, Iterable, Iterator
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"FastaRecord",
|
|
18
|
+
"FastqRecord",
|
|
19
|
+
"open_text",
|
|
20
|
+
"parse_fasta",
|
|
21
|
+
"parse_fastq",
|
|
22
|
+
"write_fasta",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(frozen=True)
class FastaRecord:
    """An immutable FASTA entry: identifier, description and sequence text."""

    id: str
    description: str
    sequence: str

    def __len__(self) -> int:
        """Return the number of characters in ``sequence``."""
        n_chars = len(self.sequence)
        return n_chars
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True)
class FastqRecord:
    """An immutable FASTQ entry: identifier, description, sequence and quality string."""

    id: str
    description: str
    sequence: str
    quality: str

    def __len__(self) -> int:
        """Return the number of characters in ``sequence``."""
        n_chars = len(self.sequence)
        return n_chars

    def phred_scores(self, offset: int = 33) -> list[int]:
        """Return ``ord(char) - offset`` for every character of ``quality``."""
        return [code - offset for code in map(ord, self.quality)]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _is_gzip(path: str) -> bool:
|
|
56
|
+
if path.endswith(".gz"):
|
|
57
|
+
return True
|
|
58
|
+
try:
|
|
59
|
+
with open(path, "rb") as fh:
|
|
60
|
+
return fh.read(2) == b"\x1f\x8b"
|
|
61
|
+
except OSError:
|
|
62
|
+
return False
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def open_text(path: str) -> IO[str]:
    """Return a text-mode stream for ``path``, decompressing gzip input on the fly."""
    opener = gzip.open if _is_gzip(path) else open
    return opener(path, "rt")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _split_header(header: str) -> tuple[str, str]:
|
|
73
|
+
header = header.strip()
|
|
74
|
+
if not header:
|
|
75
|
+
return "", ""
|
|
76
|
+
parts = header.split(None, 1)
|
|
77
|
+
seq_id = parts[0]
|
|
78
|
+
description = parts[1] if len(parts) > 1 else ""
|
|
79
|
+
return seq_id, description
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def parse_fasta(source: str | IO[str]) -> Iterator[FastaRecord]:
    """Lazily yield one :class:`FastaRecord` per ``>`` header found in ``source``.

    ``source`` is a path or an already-open text stream. Whitespace-only lines
    are skipped, and the sequence lines of a record are stripped and joined.
    A stream opened here is closed when the generator finishes; a stream
    passed in by the caller is left open.

    Raises:
        ValueError: if sequence data appears before the first header.
    """
    handle, own = _as_handle(source)
    try:
        current_id: str | None = None  # None until the first header is seen
        current_desc = ""
        pieces: list[str] = []
        for raw in handle:
            text = raw.rstrip("\r\n")
            if text.strip() == "":
                continue
            if not text.startswith(">"):
                if current_id is None:
                    raise ValueError("FASTA sequence data before any '>' header")
                pieces.append(text.strip())
                continue
            # A new header flushes the record collected so far.
            if current_id is not None:
                yield FastaRecord(current_id, current_desc, "".join(pieces))
            current_id, current_desc = _split_header(text[1:])
            pieces = []
        if current_id is not None:
            yield FastaRecord(current_id, current_desc, "".join(pieces))
    finally:
        if own:
            handle.close()
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def parse_fastq(source: str | IO[str]) -> Iterator[FastqRecord]:
    """Lazily yield :class:`FastqRecord` objects from a path or text stream.

    Each record is read as four lines: an ``@`` header, the sequence, a ``+``
    separator and the quality string. Blank lines are skipped only while
    looking for the next header. A stream opened here is closed when the
    generator finishes.

    Raises:
        ValueError: on a bad header or separator, a truncated record, or a
            sequence/quality length mismatch.
    """
    handle, own = _as_handle(source)
    try:
        lines = iter(handle)
        header = _next_nonblank(lines)
        while header is not None:
            if not header.startswith("@"):
                raise ValueError(f"FASTQ header must start with '@': {header!r}")
            bases = _require(lines, "sequence")
            separator = _require(lines, "'+' separator")
            if not separator.startswith("+"):
                raise ValueError(f"FASTQ separator must start with '+': {separator!r}")
            scores = _require(lines, "quality")
            if len(bases) != len(scores):
                raise ValueError(
                    "FASTQ sequence and quality length mismatch "
                    f"({len(bases)} vs {len(scores)})"
                )
            seq_id, description = _split_header(header[1:])
            yield FastqRecord(seq_id, description, bases, scores)
            header = _next_nonblank(lines)
    finally:
        if own:
            handle.close()
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def write_fasta(records: Iterable[FastaRecord], handle: IO[str], width: int = 70) -> int:
    """Serialise ``records`` to ``handle`` in FASTA format and return how many were written.

    The header is ``>id`` or ``>id description`` when a description is
    present. Sequences are split into lines of ``width`` characters; a falsy
    or non-positive ``width`` writes each sequence on a single line.
    """
    wrap = bool(width) and width > 0
    written = 0
    for record in records:
        title = f"{record.id} {record.description}" if record.description else record.id
        handle.write(f">{title}\n")
        body = record.sequence
        if wrap:
            pieces = [body[pos : pos + width] for pos in range(0, len(body), width)]
        else:
            pieces = [body]
        for piece in pieces:
            handle.write(piece + "\n")
        written += 1
    return written
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _as_handle(source: str | IO[str]) -> tuple[IO[str], bool]:
|
|
163
|
+
if isinstance(source, str):
|
|
164
|
+
return open_text(source), True
|
|
165
|
+
if isinstance(source, (_io.TextIOBase,)) or hasattr(source, "read"):
|
|
166
|
+
return source, False
|
|
167
|
+
raise TypeError(f"Unsupported source type: {type(source)!r}")
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _next_nonblank(it: Iterator[str]) -> str | None:
|
|
171
|
+
for raw in it:
|
|
172
|
+
line = raw.rstrip("\r\n")
|
|
173
|
+
if line.strip():
|
|
174
|
+
return line
|
|
175
|
+
return None
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _require(it: Iterator[str], what: str) -> str:
|
|
179
|
+
try:
|
|
180
|
+
return next(it).rstrip("\r\n")
|
|
181
|
+
except StopIteration as exc: # noqa: F841
|
|
182
|
+
raise ValueError(f"Truncated FASTQ record: missing {what} line") from None
|
bioseqkit/kmer.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""k-mer analysis: counting, top-k, canonical k-mers, parallel counting and minimizers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections import Counter
|
|
6
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
7
|
+
from typing import Iterable, Iterator
|
|
8
|
+
|
|
9
|
+
from bioseqkit.transform import reverse_complement
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"iter_kmers",
|
|
13
|
+
"count_kmers",
|
|
14
|
+
"count_kmers_parallel",
|
|
15
|
+
"top_kmers",
|
|
16
|
+
"canonical_kmer",
|
|
17
|
+
"minimizers",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def iter_kmers(sequence: str, k: int) -> Iterator[str]:
    """Generate every length-``k`` substring of ``sequence``, left to right.

    Nothing is yielded when ``sequence`` is shorter than ``k``. Being a
    generator, the ``k`` check runs when iteration starts.

    Raises:
        ValueError: if ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    last_start = len(sequence) - k
    start = 0
    while start <= last_start:
        yield sequence[start : start + k]
        start += 1
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def canonical_kmer(kmer: str) -> str:
    """Pick whichever of ``kmer`` and its reverse complement sorts first.

    When the two compare equal, ``kmer`` itself is returned.
    """
    return min(kmer, reverse_complement(kmer))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def count_kmers(
    sequence: str,
    k: int,
    canonical: bool = False,
    skip_ambiguous: bool = True,
) -> Counter[str]:
    """Tally the k-mers of one sequence (upper-cased first).

    With ``canonical`` set, each k-mer is replaced by its canonical form
    before counting, so a k-mer and its reverse complement share one entry.
    With ``skip_ambiguous`` set, k-mers containing any character other than
    ``A``, ``C``, ``G`` or ``T`` are left out.
    """
    unambiguous = frozenset("ACGT")
    normalise = canonical_kmer if canonical else (lambda kmer: kmer)
    tally: Counter[str] = Counter()
    for kmer in iter_kmers(sequence.upper(), k):
        if skip_ambiguous and not unambiguous.issuperset(kmer):
            continue
        tally[normalise(kmer)] += 1
    return tally
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _count_chunk(args: tuple[str, int, bool, bool]) -> Counter[str]:
    """Unpack one ``(sequence, k, canonical, skip_ambiguous)`` task and count it.

    Takes a single tuple so it can be used directly with ``pool.map``.
    """
    chunk, size, merge_rc, drop_ambiguous = args
    return count_kmers(chunk, size, merge_rc, drop_ambiguous)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _chunk_sequence(sequence: str, n_chunks: int, k: int) -> list[str]:
|
|
63
|
+
"""Split a sequence into ``n_chunks`` overlapping chunks (overlap = k - 1).
|
|
64
|
+
|
|
65
|
+
The overlap guarantees k-mers spanning chunk boundaries are still counted.
|
|
66
|
+
"""
|
|
67
|
+
n = len(sequence)
|
|
68
|
+
if n_chunks <= 1 or n < 2 * k:
|
|
69
|
+
return [sequence]
|
|
70
|
+
size = max(k, n // n_chunks)
|
|
71
|
+
chunks: list[str] = []
|
|
72
|
+
start = 0
|
|
73
|
+
while start < n:
|
|
74
|
+
end = min(n, start + size)
|
|
75
|
+
chunks.append(sequence[start : min(n, end + k - 1)])
|
|
76
|
+
start = end
|
|
77
|
+
return chunks
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def count_kmers_parallel(
    sequences: Iterable[str],
    k: int,
    canonical: bool = False,
    skip_ambiguous: bool = True,
    workers: int = 4,
) -> Counter[str]:
    """Count k-mers over many sequences and return the merged tally.

    Every sequence is upper-cased and cut into overlapping chunks. The chunks
    are counted in a process pool of ``workers`` processes, or in-process when
    ``workers <= 1`` or only one chunk exists, and the partial counters are
    added together. An empty input gives an empty counter.
    """
    pieces = [
        piece
        for seq in sequences
        for piece in _chunk_sequence(seq.upper(), workers, k)
    ]
    merged: Counter[str] = Counter()
    if not pieces:
        return merged

    if workers <= 1 or len(pieces) == 1:
        # Not worth starting a pool: count in this process.
        for piece in pieces:
            merged += count_kmers(piece, k, canonical, skip_ambiguous)
        return merged

    jobs = [(piece, k, canonical, skip_ambiguous) for piece in pieces]
    with ProcessPoolExecutor(max_workers=workers) as pool:
        for partial in pool.map(_count_chunk, jobs):
            merged += partial
    return merged
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def top_kmers(counts: Counter[str], n: int = 10) -> list[tuple[str, int]]:
    """Return the ``n`` highest-count ``(k-mer, count)`` pairs, most frequent first."""
    ranked = counts.most_common(n)
    return ranked
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def minimizers(sequence: str, k: int, w: int, canonical: bool = True) -> list[tuple[int, str]]:
    """Return ``(position, k-mer)`` pairs chosen by a sliding-window minimum.

    The upper-cased sequence is cut into k-mers (canonicalised unless
    ``canonical`` is False). For every window of ``w`` consecutive k-mers the
    smallest one is picked, the leftmost on ties. A pick identical to the
    previous window's pick is not repeated. If there are fewer than ``w``
    k-mers, a single window covering all of them is used.

    Raises:
        ValueError: if ``k`` or ``w`` is not positive.
    """
    if k <= 0 or w <= 0:
        raise ValueError("k and w must be positive integers")
    kmers = list(iter_kmers(sequence.upper(), k))
    if not kmers:
        return []
    keys = [canonical_kmer(kmer) for kmer in kmers] if canonical else kmers
    picks: list[tuple[int, str]] = []
    for begin in range(max(1, len(kmers) - w + 1)):
        span = keys[begin : begin + w]
        smallest = min(span)
        choice = (begin + span.index(smallest), smallest)
        # The last appended pick is always the previous window's pick.
        if not picks or picks[-1] != choice:
            picks.append(choice)
    return picks
|
bioseqkit/py.typed
ADDED
|
File without changes
|
bioseqkit/stats.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Sequence statistics.
|
|
2
|
+
|
|
3
|
+
Length distribution, GC content, N-base ratio and base-composition matrix,
|
|
4
|
+
computed with the Python standard library only.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from collections import Counter
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from typing import Iterable
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"SeqStats",
|
|
15
|
+
"gc_content",
|
|
16
|
+
"n_ratio",
|
|
17
|
+
"base_composition",
|
|
18
|
+
"sequence_stats",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def gc_content(sequence: str) -> float:
    """Return the share of characters in ``sequence`` that are G or C, in either case.

    An empty sequence yields ``0.0``.
    """
    if not sequence:
        return 0.0
    hits = sum(1 for base in sequence if base in "GCgc")
    return hits / len(sequence)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def n_ratio(sequence: str) -> float:
    """Return the share of characters in ``sequence`` that are ``N`` or ``n``.

    An empty sequence yields ``0.0``.
    """
    if not sequence:
        return 0.0
    ambiguous = sum(sequence.count(symbol) for symbol in ("N", "n"))
    return ambiguous / len(sequence)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def base_composition(sequence: str) -> dict[str, int]:
    """Count each character of the upper-cased ``sequence``.

    Keys appear in order of first occurrence.
    """
    tally: dict[str, int] = {}
    for base in sequence.upper():
        tally[base] = tally.get(base, 0) + 1
    return tally
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class SeqStats:
    """Summary figures for a set of sequences; every field defaults to empty/zero."""

    n_seqs: int = 0
    total_length: int = 0
    min_length: int = 0
    max_length: int = 0
    lengths: list[int] = field(default_factory=list)
    gc_content: float = 0.0
    n_ratio: float = 0.0
    base_counts: dict[str, int] = field(default_factory=dict)

    @property
    def mean_length(self) -> float:
        """``total_length / n_seqs``, or ``0.0`` when ``n_seqs`` is zero."""
        if not self.n_seqs:
            return 0.0
        return self.total_length / self.n_seqs

    def n50(self) -> int:
        """Return the N50 of ``lengths``.

        Lengths are visited longest first; the result is the length at which
        the running sum reaches half of ``total_length``. Returns ``0`` when
        ``lengths`` is empty or the threshold is never reached.
        """
        if not self.lengths:
            return 0
        threshold = self.total_length / 2
        running = 0
        for length in reversed(sorted(self.lengths)):
            running += length
            if running >= threshold:
                return length
        return 0

    def as_dict(self) -> dict[str, object]:
        """Return the statistics as a plain dict with rounded float fields.

        ``lengths`` is not included; ``mean_length`` and ``n50`` are.
        """
        summary: dict[str, object] = {}
        summary["n_seqs"] = self.n_seqs
        summary["total_length"] = self.total_length
        summary["min_length"] = self.min_length
        summary["max_length"] = self.max_length
        summary["mean_length"] = round(self.mean_length, 3)
        summary["n50"] = self.n50()
        summary["gc_content"] = round(self.gc_content, 6)
        summary["n_ratio"] = round(self.n_ratio, 6)
        summary["base_counts"] = self.base_counts
        return summary
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def sequence_stats(sequences: Iterable[str]) -> SeqStats:
    """Build a :class:`SeqStats` summary from an iterable of sequences.

    The iterable is consumed once. Base counting is case-insensitive:
    each sequence is upper-cased before it is tallied.
    """
    summary = SeqStats()
    tally: Counter[str] = Counter()
    gc_total = 0
    n_total = 0
    for record in sequences:
        size = len(record)
        summary.n_seqs += 1
        summary.total_length += size
        summary.lengths.append(size)
        normalized = record.upper()
        tally.update(normalized)
        gc_total += normalized.count("G") + normalized.count("C")
        n_total += normalized.count("N")

    seen_lengths = summary.lengths
    if seen_lengths:
        summary.min_length = min(seen_lengths)
        summary.max_length = max(seen_lengths)

    # Ratios keep their 0.0 defaults when no bases were seen.
    denominator = summary.total_length
    if denominator:
        summary.gc_content = gc_total / denominator
        summary.n_ratio = n_total / denominator

    summary.base_counts = dict(tally)
    return summary
|
bioseqkit/transform.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Sequence transformations: reverse complement and translation.
|
|
2
|
+
|
|
3
|
+
Implements DNA reverse complement and the standard genetic code, including
|
|
4
|
+
six-frame translation (three forward frames + three reverse-complement frames).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
|
|
11
|
+
# Names exported by a star-import of this module.
__all__ = [
    "reverse_complement",
    "complement",
    "translate",
    "six_frame_translation",
    "Frame",
    "CODON_TABLE",
]
|
|
19
|
+
|
|
20
|
+
# Translation table for ``str.translate``: maps each nucleotide code to its
# complement in both upper and lower case. ``U`` is complemented to ``A``,
# ``N`` stays ``N``, and the IUPAC ambiguity codes are swapped pairwise
# (R<->Y, K<->M, B<->V, D<->H) while ``S`` and ``W`` map to themselves.
_COMPLEMENT = str.maketrans(
    "ACGTUNacgtunRYSWKMBDHVryswkmbdhv",
    "TGCAANtgcaanYRSWMKVHDByrswmkvhdb",
)
|
|
24
|
+
|
|
25
|
+
# Standard genetic code (NCBI transl_table=1).
# Keys are upper-case DNA codons (``T``, not ``U``); values are one-letter
# amino-acid codes, with ``*`` marking the stop codons TAA, TAG and TGA.
CODON_TABLE: dict[str, str] = {
    "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
    "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*",
    "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def complement(sequence: str) -> str:
    """Complement every base of ``sequence`` without reversing it.

    IUPAC ambiguity codes are handled and case is preserved. Characters
    with no entry in the complement table pass through unchanged, so the
    result always has the same length as the input.
    """
    complemented = sequence.translate(_COMPLEMENT)
    return complemented
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def reverse_complement(sequence: str) -> str:
    """Complement ``sequence`` base by base, then read it back to front."""
    complemented = sequence.translate(_COMPLEMENT)
    return "".join(reversed(complemented))
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def translate(sequence: str, unknown: str = "X") -> str:
    """Translate ``sequence`` codon by codon, starting at its first base.

    The input is upper-cased and ``U`` is read as ``T``. A trailing
    partial codon (one or two leftover bases) is ignored, and any codon
    missing from ``CODON_TABLE`` is rendered as ``unknown``.
    """
    normalized = sequence.upper().replace("U", "T")
    # Length rounded down to a whole number of codons.
    usable = len(normalized) - len(normalized) % 3
    return "".join(
        CODON_TABLE.get(normalized[start : start + 3], unknown)
        for start in range(0, usable, 3)
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass(frozen=True)
class Frame:
    """Immutable record holding the translation of one reading frame."""

    strand: str  # '+' or '-'
    offset: int  # 0, 1 or 2
    protein: str

    @property
    def name(self) -> str:
        """Frame label: the strand sign followed by the 1-based offset."""
        frame_number = self.offset + 1
        return "{}{}".format(self.strand, frame_number)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def six_frame_translation(sequence: str, unknown: str = "X") -> list[Frame]:
    """Translate ``sequence`` in all six reading frames.

    The result lists the forward strand at offsets 0, 1 and 2 (frames
    ``+1``, ``+2``, ``+3``) followed by the reverse complement at the same
    offsets (frames ``-1``, ``-2``, ``-3``). ``unknown`` is forwarded to
    ``translate`` for codons it cannot resolve.
    """
    plus_strand = sequence.upper().replace("U", "T")
    minus_strand = reverse_complement(plus_strand)
    return [
        Frame(sign, shift, translate(template[shift:], unknown))
        for sign, template in (("+", plus_strand), ("-", minus_strand))
        for shift in range(3)
    ]
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bioseqkit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight, dependency-free biological sequence processing toolkit (FASTA/FASTQ, stats, k-mer, minimizer, indexing).
|
|
5
|
+
Author-email: Jilai Cheng <chengjilai@sjtu.edu.cn>
|
|
6
|
+
License: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: bioinformatics,fasta,fastq,kmer,minimizer,sequence
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Provides-Extra: docs
|
|
11
|
+
Requires-Dist: myst-parser>=2.0; extra == 'docs'
|
|
12
|
+
Requires-Dist: sphinx>=7.0; extra == 'docs'
|
|
13
|
+
Provides-Extra: net
|
|
14
|
+
Requires-Dist: requests>=2.28; extra == 'net'
|
|
15
|
+
Provides-Extra: viz
|
|
16
|
+
Requires-Dist: matplotlib>=3.7; extra == 'viz'
|
|
17
|
+
Requires-Dist: seaborn>=0.12; extra == 'viz'
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# bioseqkit
|
|
21
|
+
|
|
22
|
+
A lightweight, **dependency-free** biological sequence processing toolkit built
|
|
23
|
+
from scratch in pure Python. `bioseqkit` implements FASTA/FASTQ parsing,
|
|
24
|
+
sequence statistics, transformations, k-mer / minimizer analysis and FAI-like
|
|
25
|
+
random-access indexing, exposed both as a Python API and a command-line tool.
|
|
26
|
+
|
|
27
|
+
The project is a teaching implementation for **BIO2502 (Programming Languages
|
|
28
|
+
for Biological Computing)**: it deliberately re-implements the low-level I/O,
|
|
29
|
+
streaming and indexing logic instead of relying on Biopython, so the core
|
|
30
|
+
design patterns of bioinformatics data handling are made explicit.
|
|
31
|
+
|
|
32
|
+
## Features
|
|
33
|
+
|
|
34
|
+
- **Streaming FASTA/FASTQ parsers** (`io`) — generator based, constant memory,
|
|
35
|
+
transparent gzip support, Phred quality decoding.
|
|
36
|
+
- **Statistics** (`stats`) — length distribution, N50, GC content, N-base
|
|
37
|
+
ratio, per-base composition counts.
|
|
38
|
+
- **Transformations** (`transform`) — reverse complement (IUPAC aware) and
|
|
39
|
+
six-frame translation with the standard genetic code.
|
|
40
|
+
- **k-mer analysis** (`kmer`) — counting, top-k, canonical k-mers,
|
|
41
|
+
**multi-process** parallel counting, and **minimizer** sampling.
|
|
42
|
+
- **FAI-like indexing** (`index`) — `samtools faidx`-compatible index for
|
|
43
|
+
`chr:start-end` random access without scanning the whole file.
|
|
44
|
+
- **CLI** (`cli`) — `stats`, `revcomp`, `translate`, `kmer`, `minimizer`,
|
|
45
|
+
`index`, `fetch`.
|
|
46
|
+
- **NCBI download** (`entrez`) — fetch reference sequences via E-utilities
|
|
47
|
+
(standard-library HTTP only).
|
|
48
|
+
|
|
49
|
+
## Project layout
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
bioseqkit/
|
|
53
|
+
├── pyproject.toml # src-layout, PEP 621 metadata, console script
|
|
54
|
+
├── README.md
|
|
55
|
+
├── LICENSE
|
|
56
|
+
├── environment.yml # conda environment
|
|
57
|
+
├── requirements.txt
|
|
58
|
+
├── src/bioseqkit/
|
|
59
|
+
│ ├── __init__.py # public API
|
|
60
|
+
│ ├── io.py # FASTA/FASTQ parsers
|
|
61
|
+
│ ├── stats.py # sequence statistics
|
|
62
|
+
│ ├── transform.py # revcomp + six-frame translation
|
|
63
|
+
│ ├── kmer.py # k-mer / minimizer (serial + parallel)
|
|
64
|
+
│ ├── index.py # FAI-like random-access index
|
|
65
|
+
│ ├── entrez.py # NCBI download helper
|
|
66
|
+
│ └── cli.py # argparse CLI
|
|
67
|
+
├── tests/ # pytest suite (io/stats/transform/kmer/index/cli)
|
|
68
|
+
├── examples/
|
|
69
|
+
│ ├── demo.ipynb # Jupyter demo (stats, GC, k-mer spectrum, ...)
|
|
70
|
+
│ └── example_data/sample.fa
|
|
71
|
+
├── docs/ # Sphinx documentation
|
|
72
|
+
└── .github/workflows/ci.yml
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Installation
|
|
76
|
+
|
|
77
|
+
Requires Python >= 3.10. The core package has **no runtime dependencies**.
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# with uv (recommended)
|
|
81
|
+
uv pip install -e .
|
|
82
|
+
|
|
83
|
+
# or plain pip
|
|
84
|
+
pip install -e .
|
|
85
|
+
|
|
86
|
+
# with optional extras (plots for the notebook / NCBI download / docs)
|
|
87
|
+
pip install -e ".[viz,net,docs]"
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Command-line usage
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
bioseqkit stats examples/example_data/sample.fa # JSON statistics
|
|
94
|
+
bioseqkit revcomp examples/example_data/sample.fa # reverse complement
|
|
95
|
+
bioseqkit translate examples/example_data/sample.fa # six-frame translation
|
|
96
|
+
bioseqkit kmer examples/example_data/sample.fa -k 5 --top 10 --canonical
|
|
97
|
+
bioseqkit kmer examples/example_data/sample.fa -k 5 -t 4 # parallel
|
|
98
|
+
bioseqkit minimizer examples/example_data/sample.fa -k 15 -w 10
|
|
99
|
+
bioseqkit index examples/example_data/sample.fa # write *.fai
|
|
100
|
+
bioseqkit fetch examples/example_data/sample.fa seq2:1-16
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Python API
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
import bioseqkit as bsk
|
|
107
|
+
|
|
108
|
+
for rec in bsk.parse_fasta("examples/example_data/sample.fa"):
|
|
109
|
+
print(rec.id, len(rec), bsk.gc_content(rec.sequence))
|
|
110
|
+
|
|
111
|
+
print(bsk.reverse_complement("ATGC")) # -> GCAT
|
|
112
|
+
print(bsk.translate("ATGGCCTAA")) # -> MA*
|
|
113
|
+
|
|
114
|
+
counts = bsk.count_kmers("ACGTACGTACGT", k=3, canonical=True)
|
|
115
|
+
print(bsk.top_kmers(counts, 3))
|
|
116
|
+
|
|
117
|
+
idx = bsk.build_faidx("examples/example_data/sample.fa")
|
|
118
|
+
print(idx.fetch("seq2", 1, 16))
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Testing
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
uv run --with pytest pytest -q # 39 tests
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Continuous integration (GitHub Actions) runs `ruff` linting and the `pytest`
|
|
128
|
+
suite on Python 3.10–3.12 for every push.
|
|
129
|
+
|
|
130
|
+
## Data sources
|
|
131
|
+
|
|
132
|
+
- NCBI Nucleotide: <https://www.ncbi.nlm.nih.gov/nucleotide/>
|
|
133
|
+
- UCSC Genome Browser: <https://genome.ucsc.edu/>
|
|
134
|
+
|
|
135
|
+
The bundled `examples/example_data/sample.fa` is a small synthetic sequence for
|
|
136
|
+
offline testing; `demo.ipynb` will download real data from NCBI when a network
|
|
137
|
+
connection is available and fall back to the bundled file otherwise.
|
|
138
|
+
|
|
139
|
+
## License
|
|
140
|
+
|
|
141
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
bioseqkit/__init__.py,sha256=WPNQec42lcU4LJpPE8jM8Y_fFk1bOGL6MtpVhf7JXzY,1261
|
|
2
|
+
bioseqkit/cli.py,sha256=5lUd-el13pi7ZYis5RQQJbtPz_OecZbNTN5IDKpwn3k,4938
|
|
3
|
+
bioseqkit/entrez.py,sha256=33vTm0Lv7m--P_OdiIw_5wj8zgrrvAebKP--PQnT1oM,1144
|
|
4
|
+
bioseqkit/index.py,sha256=y9KcvGW6ndoKgYLP34sGsCNkSbZ2uf3jU3R2yeUEqCo,6038
|
|
5
|
+
bioseqkit/io.py,sha256=_YXwA2tI6xamdHf1sEdHKczpP8SY7YAiEQNPrgenSFM,5586
|
|
6
|
+
bioseqkit/kmer.py,sha256=JFsAF6BXEfgcb3Dq9NBHVoI7vapI34ctMvS8rFUnZM8,4557
|
|
7
|
+
bioseqkit/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
bioseqkit/stats.py,sha256=QHwkda7nP-ithSaB30QTsXZdR_WrUOQIoWqPYpgAthw,3176
|
|
9
|
+
bioseqkit/transform.py,sha256=lU66dRisQUEpRNHSZC9maz4mAXmcHBr1QnLGFfo5Dug,3110
|
|
10
|
+
bioseqkit-0.1.0.dist-info/METADATA,sha256=o4-fLIUZweJ9NMeDV57QqXvesFYp-_5EyXL4Sch4vfc,5228
|
|
11
|
+
bioseqkit-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
12
|
+
bioseqkit-0.1.0.dist-info/entry_points.txt,sha256=xZmtCloPHVsircuQzpq51XSbYxc4zCLrgtf2SVcva6U,49
|
|
13
|
+
bioseqkit-0.1.0.dist-info/licenses/LICENSE,sha256=8TV7QZtAVFsQL2J6cdUiz2zWvFiWzf7vJ4ix5o3bzK8,1068
|
|
14
|
+
bioseqkit-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jilai Cheng
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|