bioseqkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bioseqkit/__init__.py ADDED
@@ -0,0 +1,57 @@
1
+ """bioseqkit: a lightweight, dependency-free biological sequence toolkit.
2
+
3
+ The package provides pure-Python FASTA/FASTQ parsing, sequence statistics,
4
+ transformations (reverse complement, six-frame translation), k-mer analysis,
5
+ minimizer sampling and FAI-like random-access indexing.
6
+ """
7
+
8
+ from bioseqkit.io import (
9
+ FastaRecord,
10
+ FastqRecord,
11
+ parse_fasta,
12
+ parse_fastq,
13
+ write_fasta,
14
+ )
15
+ from bioseqkit.stats import (
16
+ SeqStats,
17
+ base_composition,
18
+ gc_content,
19
+ n_ratio,
20
+ sequence_stats,
21
+ )
22
+ from bioseqkit.transform import reverse_complement, six_frame_translation, translate
23
+ from bioseqkit.kmer import (
24
+ canonical_kmer,
25
+ count_kmers,
26
+ count_kmers_parallel,
27
+ minimizers,
28
+ top_kmers,
29
+ )
30
+ from bioseqkit.index import FaidxIndex, build_faidx, fetch
31
+
32
+ __version__ = "0.1.0"
33
+
34
+ __all__ = [
35
+ "FastaRecord",
36
+ "FastqRecord",
37
+ "parse_fasta",
38
+ "parse_fastq",
39
+ "write_fasta",
40
+ "SeqStats",
41
+ "sequence_stats",
42
+ "gc_content",
43
+ "n_ratio",
44
+ "base_composition",
45
+ "reverse_complement",
46
+ "translate",
47
+ "six_frame_translation",
48
+ "count_kmers",
49
+ "count_kmers_parallel",
50
+ "top_kmers",
51
+ "canonical_kmer",
52
+ "minimizers",
53
+ "FaidxIndex",
54
+ "build_faidx",
55
+ "fetch",
56
+ "__version__",
57
+ ]
bioseqkit/cli.py ADDED
@@ -0,0 +1,140 @@
1
+ """Command-line interface for bioseqkit.
2
+
3
+ Implemented with :mod:`argparse` (standard library) to keep the core package
4
+ dependency-free. Sub-commands: ``stats``, ``revcomp``, ``translate``,
5
+ ``kmer``, ``minimizer``, ``index`` and ``fetch``.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import json
12
+ import sys
13
+ from typing import Sequence
14
+
15
+ from bioseqkit import __version__
16
+ from bioseqkit.index import build_faidx, fetch as fetch_region
17
+ from bioseqkit.io import FastaRecord, parse_fasta, write_fasta
18
+ from bioseqkit.kmer import count_kmers, count_kmers_parallel, minimizers, top_kmers
19
+ from bioseqkit.stats import sequence_stats
20
+ from bioseqkit.transform import reverse_complement, six_frame_translation
21
+
22
+
23
def _read_sequences(path: str) -> list[FastaRecord]:
    """Parse the FASTA file at ``path`` and return all of its records as a list."""
    return [record for record in parse_fasta(path)]
25
+
26
+
27
def cmd_stats(args: argparse.Namespace) -> int:
    """Handle ``stats``: print aggregate sequence statistics as indented JSON.

    Reads every record from ``args.input`` and returns exit status 0.
    """
    loaded = _read_sequences(args.input)
    summary = sequence_stats(record.sequence for record in loaded).as_dict()
    print(json.dumps(summary, indent=2))
    return 0
32
+
33
+
34
def cmd_revcomp(args: argparse.Namespace) -> int:
    """Handle ``revcomp``: write the reverse complement of each record to stdout.

    Every output record keeps its id and gets " revcomp" appended to its
    description. Returns exit status 0.
    """
    flipped: list[FastaRecord] = []
    for record in _read_sequences(args.input):
        # strip() removes the leading space when the description was empty.
        label = (record.description + " revcomp").strip()
        flipped.append(FastaRecord(record.id, label, reverse_complement(record.sequence)))
    write_fasta(flipped, sys.stdout)
    return 0
41
+
42
+
43
def cmd_translate(args: argparse.Namespace) -> int:
    """Handle ``translate``: write all six reading-frame translations to stdout.

    Each protein is emitted as a FASTA record named ``<id>_frame<frame name>``
    with an empty description. Returns exit status 0.
    """
    proteins = [
        FastaRecord(f"{record.id}_frame{frame.name}", "", frame.protein)
        for record in _read_sequences(args.input)
        for frame in six_frame_translation(record.sequence)
    ]
    write_fasta(proteins, sys.stdout)
    return 0
50
+
51
+
52
def cmd_kmer(args: argparse.Namespace) -> int:
    """Handle ``kmer``: print the ``args.top`` most frequent k-mers, tab-separated.

    Uses the process-pool counter when more than one thread is requested and
    sums per-sequence counters otherwise. Returns exit status 0.
    """
    sequences = [record.sequence for record in _read_sequences(args.input)]
    if bool(args.threads) and args.threads > 1:
        counts = count_kmers_parallel(
            sequences,
            args.k,
            canonical=args.canonical,
            workers=args.threads,
        )
    else:
        # Seed the sum with an empty counter of the same type count_kmers returns.
        empty = type(count_kmers("", args.k))()
        per_sequence = (
            count_kmers(sequence, args.k, canonical=args.canonical) for sequence in sequences
        )
        counts = sum(per_sequence, start=empty)
    for kmer, count in top_kmers(counts, args.top):
        print(f"{kmer}\t{count}")
    return 0
64
+
65
+
66
def cmd_minimizer(args: argparse.Namespace) -> int:
    """Handle ``minimizer``: print ``id<TAB>position<TAB>minimizer`` for each hit.

    Canonical k-mers are used unless ``args.no_canonical`` is set.
    Returns exit status 0.
    """
    for record in _read_sequences(args.input):
        hits = minimizers(record.sequence, args.k, args.w, canonical=not args.no_canonical)
        for position, kmer in hits:
            print(f"{record.id}\t{position}\t{kmer}")
    return 0
71
+
72
+
73
def cmd_index(args: argparse.Namespace) -> int:
    """Handle ``index``: build the index for ``args.input`` and write it to disk.

    Prints the written path and the number of indexed sequences.
    Returns exit status 0.
    """
    built = build_faidx(args.input)
    written_to = built.write()
    n_sequences = len(built.records)
    print(f"Wrote index: {written_to} ({n_sequences} sequences)")
    return 0
78
+
79
+
80
def cmd_fetch(args: argparse.Namespace) -> int:
    """Handle ``fetch``: print the requested region as FASTA wrapped at 70 columns.

    The region string itself is used as the record header.
    Returns exit status 0.
    """
    sequence = fetch_region(args.input, args.region)
    print(f">{args.region}")
    cursor = 0
    while cursor < len(sequence):
        print(sequence[cursor : cursor + 70])
        cursor += 70
    return 0
87
+
88
+
89
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level ``bioseqkit`` parser with all of its sub-commands.

    Every sub-parser stores its handler under ``func`` via ``set_defaults``;
    a sub-command is mandatory (``required=True``).

    Returns:
        The fully configured :class:`argparse.ArgumentParser`.
    """
    # The module docstring is None when docstrings are stripped (python -OO);
    # fall back to an empty description instead of failing on None.split().
    summary = (__doc__ or "").split("\n")[0]
    parser = argparse.ArgumentParser(prog="bioseqkit", description=summary)
    parser.add_argument("--version", action="version", version=f"bioseqkit {__version__}")
    sub = parser.add_subparsers(dest="command", required=True)

    p = sub.add_parser("stats", help="sequence statistics (JSON)")
    p.add_argument("input", help="FASTA/FASTA.gz file")
    p.set_defaults(func=cmd_stats)

    p = sub.add_parser("revcomp", help="reverse complement")
    p.add_argument("input")
    p.set_defaults(func=cmd_revcomp)

    p = sub.add_parser("translate", help="six-frame translation")
    p.add_argument("input")
    p.set_defaults(func=cmd_translate)

    p = sub.add_parser("kmer", help="k-mer frequency analysis")
    p.add_argument("input")
    p.add_argument("-k", type=int, default=5, help="k-mer size (default 5)")
    p.add_argument("--top", type=int, default=10, help="report top-N k-mers")
    p.add_argument("--canonical", action="store_true", help="merge reverse-complement k-mers")
    p.add_argument("-t", "--threads", type=int, default=1, help="parallel worker processes")
    p.set_defaults(func=cmd_kmer)

    p = sub.add_parser("minimizer", help="minimizer sampling")
    p.add_argument("input")
    p.add_argument("-k", type=int, default=15, help="k-mer size (default 15)")
    p.add_argument("-w", type=int, default=10, help="window size (default 10)")
    p.add_argument("--no-canonical", action="store_true", help="disable canonical k-mers")
    p.set_defaults(func=cmd_minimizer)

    p = sub.add_parser("index", help="build a FAI-like index")
    p.add_argument("input")
    p.set_defaults(func=cmd_index)

    p = sub.add_parser("fetch", help="fetch a sub-sequence (chr:start-end)")
    p.add_argument("input")
    p.add_argument("region", help="region string, e.g. chr1:1000-2000")
    p.set_defaults(func=cmd_fetch)

    return parser
131
+
132
+
133
def main(argv: Sequence[str] | None = None) -> int:
    """Parse ``argv`` and run the selected sub-command, returning its exit status.

    ``argv=None`` is passed through to ``parse_args`` unchanged.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)
137
+
138
+
139
+ if __name__ == "__main__": # pragma: no cover
140
+ raise SystemExit(main())
bioseqkit/entrez.py ADDED
@@ -0,0 +1,39 @@
1
+ """Optional NCBI Entrez download helper.
2
+
3
+ Fetches reference sequences from the NCBI E-utilities ``efetch`` endpoint using
4
+ only the standard library (``urllib``), so no third-party HTTP client is
5
+ required. Network access is required at call time.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import urllib.parse
11
+ import urllib.request
12
+
13
+ __all__ = ["efetch_fasta"]
14
+
15
+ _EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
16
+
17
+
18
def efetch_fasta(
    accession: str,
    db: str = "nuccore",
    email: str | None = None,
    api_key: str | None = None,
    timeout: float = 30.0,
) -> str:
    """Fetch one FASTA record from the efetch endpoint and return it as text.

    ``email`` and ``api_key`` are added to the query only when they are truthy.
    The response body is decoded as UTF-8.
    """
    query = {"db": db, "id": accession, "rettype": "fasta", "retmode": "text"}
    for key, value in (("email", email), ("api_key", api_key)):
        if value:
            query[key] = value
    request = urllib.request.Request(
        f"{_EFETCH}?{urllib.parse.urlencode(query)}",
        headers={"User-Agent": "bioseqkit/0.1"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:  # noqa: S310
        payload = response.read()
    return payload.decode("utf-8")
bioseqkit/index.py ADDED
@@ -0,0 +1,164 @@
1
+ """FAI-like FASTA indexing for random access.
2
+
3
+ Mirrors the ``samtools faidx`` (``*.fai``) format so that arbitrary
4
+ sub-sequences can be fetched without reading the whole file. Each index line
5
+ holds: name, sequence length, byte offset of the first base, number of bases
6
+ per line and number of bytes per line (including the newline).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ from dataclasses import dataclass
13
+
14
+ __all__ = ["FaidxRecord", "FaidxIndex", "build_faidx", "fetch", "parse_region"]
15
+
16
+
17
@dataclass(frozen=True)
class FaidxRecord:
    """One index entry: the five tab-separated columns of an index line."""

    name: str
    length: int
    offset: int
    linebases: int
    linewidth: int

    def to_line(self) -> str:
        """Serialise the record as one tab-separated line (no trailing newline)."""
        columns = (self.name, self.length, self.offset, self.linebases, self.linewidth)
        return "\t".join(f"{column}" for column in columns)

    @classmethod
    def from_line(cls, line: str) -> "FaidxRecord":
        """Parse a line produced by :meth:`to_line`; a trailing newline is allowed.

        Raises :class:`ValueError` if the line does not have exactly five
        columns or a numeric column is not an integer.
        """
        columns = line.rstrip("\n").split("\t")
        name, length, offset, linebases, linewidth = columns
        return cls(name, int(length), int(offset), int(linebases), int(linewidth))
32
+
33
+
34
class FaidxIndex:
    """An in-memory FASTA index bound to a plain-text FASTA file.

    ``records`` maps each sequence name to its index entry (name, length,
    byte offset of the first base, bases per line, bytes per line).
    """

    def __init__(self, fasta_path: str, records: dict[str, FaidxRecord]):
        self.fasta_path = fasta_path
        self.records = records

    def names(self) -> list[str]:
        """Return the indexed sequence names in ``records`` order."""
        return list(self.records)

    def write(self, fai_path: str | None = None) -> str:
        """Write one index line per record and return the path written.

        Defaults to ``<fasta_path>.fai`` when ``fai_path`` is not given.
        """
        path = fai_path or self.fasta_path + ".fai"
        with open(path, "w") as fh:
            for rec in self.records.values():
                fh.write(rec.to_line() + "\n")
        return path

    @classmethod
    def load(cls, fasta_path: str, fai_path: str | None = None) -> "FaidxIndex":
        """Load an index file (default ``<fasta_path>.fai``), skipping blank lines.

        When a name occurs more than once the last entry wins.
        """
        path = fai_path or fasta_path + ".fai"
        records: dict[str, FaidxRecord] = {}
        with open(path) as fh:
            for line in fh:
                if line.strip():
                    rec = FaidxRecord.from_line(line)
                    records[rec.name] = rec
        return cls(fasta_path, records)

    def fetch(self, name: str, start: int | None = None, end: int | None = None) -> str:
        """Fetch a sub-sequence using 1-based inclusive ``start``/``end``.

        With no coordinates the whole sequence is returned. An empty span
        (including the whole of a zero-length record) yields ``""``.

        Raises:
            KeyError: ``name`` is not in the index.
            ValueError: the region falls outside the sequence, or the index
                entry reports a non-positive number of bases per line.
        """
        if name not in self.records:
            raise KeyError(f"Sequence {name!r} not found in index")
        rec = self.records[name]
        s = 0 if start is None else start - 1
        e = rec.length if end is None else end
        if s < 0 or e > rec.length or s > e:
            raise ValueError(f"Region out of bounds for {name} (length {rec.length})")
        n_bases = e - s
        if n_bases == 0:
            # Nothing to read; this also covers empty records, whose
            # linebases is 0 and would otherwise divide by zero below.
            return ""
        if rec.linebases <= 0:
            raise ValueError(f"Corrupt index entry for {name}: linebases must be positive")
        newline_bytes = rec.linewidth - rec.linebases
        start_line, start_col = divmod(s, rec.linebases)
        byte_start = rec.offset + start_line * rec.linewidth + start_col
        # Read enough bytes to cover the requested bases plus newlines; one
        # extra line of slack means the read may run into the next record.
        n_lines = (start_col + n_bases) // rec.linebases
        n_read = n_bases + n_lines * newline_bytes + rec.linewidth
        with open(self.fasta_path, "rb") as fh:
            fh.seek(byte_start)
            raw = fh.read(n_read)
        bases = raw.replace(b"\n", b"").replace(b"\r", b"")
        # Truncate before decoding so over-read bytes (for example a following
        # header containing non-ASCII text) cannot trigger a decode error.
        return bases[:n_bases].decode("ascii")
86
+
87
+
88
def build_faidx(fasta_path: str) -> FaidxIndex:
    """Scan a plain-text FASTA file and build a :class:`FaidxIndex`.

    The file is read in binary mode so that all offsets are byte offsets.
    The record name is the first whitespace-delimited token of the header
    (empty string for a bare ``>``); a later record with the same name
    replaces the earlier one.

    Raises :class:`ValueError` for gzip files (which are not seekable by base)
    or for records with inconsistent line lengths.
    """
    if fasta_path.endswith(".gz"):
        raise ValueError("faidx requires an uncompressed FASTA file")
    records: dict[str, FaidxRecord] = {}
    with open(fasta_path, "rb") as fh:
        # State of the record currently being scanned.
        name: str | None = None
        length = 0  # total bases seen so far in this record
        offset = 0  # byte offset of the record's first sequence line
        linebases = 0  # bases on the first non-empty sequence line
        linewidth = 0  # bytes on that line, including the line terminator
        line_lengths: list[int] = []
        pos = 0  # byte offset of the start of the current line

        def flush() -> None:
            # Validate and store the record scanned so far, if there is one.
            nonlocal name
            if name is not None:
                _validate_lines(name, line_lengths)
                records[name] = FaidxRecord(name, length, offset, linebases, linewidth)

        for raw in fh:
            if raw.startswith(b">"):
                flush()
                header = raw[1:].decode("ascii", "replace").strip()
                name = header.split()[0] if header else ""
                length = 0
                linebases = 0
                linewidth = 0
                line_lengths = []
                # Sequence data starts right after this header line.
                offset = pos + len(raw)
            else:
                stripped = raw.rstrip(b"\r\n")
                # Lines before the first header are skipped (name is still None).
                if name is not None:
                    if linebases == 0:
                        linebases = len(stripped)
                        linewidth = len(raw)
                    line_lengths.append(len(stripped))
                    length += len(stripped)
            # Advance after using pos, so offset above points past the header.
            pos += len(raw)
        flush()
    return FaidxIndex(fasta_path, records)
133
+
134
+
135
+ def _validate_lines(name: str, line_lengths: list[int]) -> None:
136
+ if len(line_lengths) > 1:
137
+ body = line_lengths[:-1]
138
+ if len(set(body)) > 1:
139
+ raise ValueError(f"Inconsistent line lengths in record {name!r}; cannot index")
140
+ if line_lengths[-1] > body[0]:
141
+ raise ValueError(f"Last line longer than others in record {name!r}")
142
+
143
+
144
def parse_region(region: str) -> tuple[str, int | None, int | None]:
    """Split a ``chr:start-end`` region string into ``(name, start, end)``.

    The name is everything before the last ``:``. Commas in the coordinates
    are ignored. ``chr`` alone gives ``(chr, None, None)``, and ``chr:pos``
    gives ``(chr, pos, pos)``. Non-integer coordinates raise
    :class:`ValueError`.
    """
    name, colon, span = region.rpartition(":")
    if not colon:
        return region, None, None
    span = span.replace(",", "")
    start_text, dash, end_text = span.partition("-")
    if dash:
        return name, int(start_text), int(end_text)
    position = int(span)
    return name, position, position
157
+
158
+
159
def fetch(fasta_path: str, region: str) -> str:
    """Fetch ``region`` from ``fasta_path`` in one call.

    An existing ``<fasta_path>.fai`` is loaded; otherwise the index is built
    in memory from the FASTA file.
    """
    if os.path.exists(fasta_path + ".fai"):
        index = FaidxIndex.load(fasta_path)
    else:
        index = build_faidx(fasta_path)
    name, start, end = parse_region(region)
    return index.fetch(name, start, end)
bioseqkit/io.py ADDED
@@ -0,0 +1,182 @@
1
+ """Sequence file I/O.
2
+
3
+ Pure-Python, streaming FASTA/FASTQ parsers implemented with the
4
+ iterator/generator pattern so that arbitrarily large files can be processed
5
+ with constant memory. Both plain-text and gzip-compressed files are supported
6
+ transparently (detected by the ``.gz`` extension or the gzip magic bytes).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import gzip
12
+ import io as _io
13
+ from dataclasses import dataclass
14
+ from typing import IO, Iterable, Iterator
15
+
16
+ __all__ = [
17
+ "FastaRecord",
18
+ "FastqRecord",
19
+ "open_text",
20
+ "parse_fasta",
21
+ "parse_fastq",
22
+ "write_fasta",
23
+ ]
24
+
25
+
26
@dataclass(frozen=True)
class FastaRecord:
    """An immutable FASTA entry: identifier, description and sequence text."""

    id: str
    description: str
    sequence: str

    def __len__(self) -> int:
        """Return the number of characters in the sequence."""
        n_residues = len(self.sequence)
        return n_residues
36
+
37
+
38
@dataclass(frozen=True)
class FastqRecord:
    """An immutable FASTQ entry: identifier, description, sequence and quality."""

    id: str
    description: str
    sequence: str
    quality: str

    def __len__(self) -> int:
        """Return the number of characters in the sequence."""
        n_residues = len(self.sequence)
        return n_residues

    def phred_scores(self, offset: int = 33) -> list[int]:
        """Return one integer per quality character: its code point minus ``offset``."""
        return [code - offset for code in map(ord, self.quality)]
53
+
54
+
55
+ def _is_gzip(path: str) -> bool:
56
+ if path.endswith(".gz"):
57
+ return True
58
+ try:
59
+ with open(path, "rb") as fh:
60
+ return fh.read(2) == b"\x1f\x8b"
61
+ except OSError:
62
+ return False
63
+
64
+
65
def open_text(path: str) -> IO[str]:
    """Return a text-mode stream for ``path``, using ``gzip.open`` for gzip files."""
    opener = gzip.open if _is_gzip(path) else open
    return opener(path, "rt")
70
+
71
+
72
+ def _split_header(header: str) -> tuple[str, str]:
73
+ header = header.strip()
74
+ if not header:
75
+ return "", ""
76
+ parts = header.split(None, 1)
77
+ seq_id = parts[0]
78
+ description = parts[1] if len(parts) > 1 else ""
79
+ return seq_id, description
80
+
81
+
82
def parse_fasta(source: str | IO[str]) -> Iterator[FastaRecord]:
    """Lazily yield :class:`FastaRecord` objects from a path or text stream.

    Blank lines are skipped and the lines of a multi-line record are joined
    after stripping surrounding whitespace. Sequence text appearing before
    the first ``>`` header raises :class:`ValueError`. A handle opened here
    is closed when iteration finishes; a caller-supplied stream is left open.
    """
    handle, own = _as_handle(source)
    try:
        current: tuple[str, str] | None = None  # (id, description) of the open record
        pieces: list[str] = []
        for raw in handle:
            text = raw.rstrip("\r\n")
            if not text.strip():
                continue
            if not text.startswith(">"):
                if current is None:
                    raise ValueError("FASTA sequence data before any '>' header")
                pieces.append(text.strip())
                continue
            # A new header closes the record collected so far.
            if current is not None:
                yield FastaRecord(current[0], current[1], "".join(pieces))
            current = _split_header(text[1:])
            pieces = []
        if current is not None:
            yield FastaRecord(current[0], current[1], "".join(pieces))
    finally:
        if own:
            handle.close()
113
+
114
+
115
def parse_fastq(source: str | IO[str]) -> Iterator[FastqRecord]:
    """Lazily yield :class:`FastqRecord` objects from a path or text stream.

    Blank lines between records are skipped; within a record the sequence,
    separator and quality lines are taken as the next three lines.
    Raises :class:`ValueError` for a header not starting with ``@``, a
    separator not starting with ``+``, or sequence and quality lines of
    different lengths. A handle opened here is closed when iteration
    finishes; a caller-supplied stream is left open.
    """
    handle, own = _as_handle(source)
    try:
        lines = iter(handle)
        header = _next_nonblank(lines)
        while header is not None:
            if not header.startswith("@"):
                raise ValueError(f"FASTQ header must start with '@': {header!r}")
            bases = _require(lines, "sequence")
            separator = _require(lines, "'+' separator")
            if not separator.startswith("+"):
                raise ValueError(f"FASTQ separator must start with '+': {separator!r}")
            quals = _require(lines, "quality")
            if len(bases) != len(quals):
                raise ValueError(
                    "FASTQ sequence and quality length mismatch "
                    f"({len(bases)} vs {len(quals)})"
                )
            seq_id, description = _split_header(header[1:])
            yield FastqRecord(seq_id, description, bases, quals)
            header = _next_nonblank(lines)
    finally:
        if own:
            handle.close()
141
+
142
+
143
def write_fasta(records: Iterable[FastaRecord], handle: IO[str], width: int = 70) -> int:
    """Write ``records`` to ``handle`` in FASTA format and return how many were written.

    The header is the id, followed by a space and the description when one is
    present. Sequences are wrapped at ``width`` columns; a falsy or
    non-positive ``width`` writes each sequence on a single line.
    """
    written = 0
    for record in records:
        if record.description:
            title = f"{record.id} {record.description}"
        else:
            title = record.id
        handle.write(f">{title}\n")
        residues = record.sequence
        if width and width > 0:
            cursor = 0
            while cursor < len(residues):
                handle.write(residues[cursor : cursor + width] + "\n")
                cursor += width
        else:
            handle.write(residues + "\n")
        written += 1
    return written
160
+
161
+
162
+ def _as_handle(source: str | IO[str]) -> tuple[IO[str], bool]:
163
+ if isinstance(source, str):
164
+ return open_text(source), True
165
+ if isinstance(source, (_io.TextIOBase,)) or hasattr(source, "read"):
166
+ return source, False
167
+ raise TypeError(f"Unsupported source type: {type(source)!r}")
168
+
169
+
170
+ def _next_nonblank(it: Iterator[str]) -> str | None:
171
+ for raw in it:
172
+ line = raw.rstrip("\r\n")
173
+ if line.strip():
174
+ return line
175
+ return None
176
+
177
+
178
+ def _require(it: Iterator[str], what: str) -> str:
179
+ try:
180
+ return next(it).rstrip("\r\n")
181
+ except StopIteration as exc: # noqa: F841
182
+ raise ValueError(f"Truncated FASTQ record: missing {what} line") from None
bioseqkit/kmer.py ADDED
@@ -0,0 +1,141 @@
1
+ """k-mer analysis: counting, top-k, canonical k-mers, parallel counting and minimizers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections import Counter
6
+ from concurrent.futures import ProcessPoolExecutor
7
+ from typing import Iterable, Iterator
8
+
9
+ from bioseqkit.transform import reverse_complement
10
+
11
+ __all__ = [
12
+ "iter_kmers",
13
+ "count_kmers",
14
+ "count_kmers_parallel",
15
+ "top_kmers",
16
+ "canonical_kmer",
17
+ "minimizers",
18
+ ]
19
+
20
+
21
def iter_kmers(sequence: str, k: int) -> Iterator[str]:
    """Generate every length-``k`` substring of ``sequence``, left to right.

    Nothing is produced when the sequence is shorter than ``k``. A
    non-positive ``k`` raises :class:`ValueError` when iteration starts.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    last_start = len(sequence) - k
    start = 0
    while start <= last_start:
        yield sequence[start : start + k]
        start += 1
27
+
28
+
29
def canonical_kmer(kmer: str) -> str:
    """Return whichever of ``kmer`` and its reverse complement sorts first."""
    return min(kmer, reverse_complement(kmer))
33
+
34
+
35
def count_kmers(
    sequence: str,
    k: int,
    canonical: bool = False,
    skip_ambiguous: bool = True,
) -> Counter[str]:
    """Tally the k-mers of one sequence after upper-casing it.

    With ``canonical`` each k-mer is counted under its canonical form, so a
    k-mer and its reverse complement share one entry. With
    ``skip_ambiguous`` any k-mer containing a character other than ``A``,
    ``C``, ``G`` or ``T`` is left out.
    """
    alphabet = set("ACGT")
    tally: Counter[str] = Counter()
    for kmer in iter_kmers(sequence.upper(), k):
        if skip_ambiguous and not alphabet.issuperset(kmer):
            continue
        key = canonical_kmer(kmer) if canonical else kmer
        tally[key] += 1
    return tally
55
+
56
+
57
def _count_chunk(args: tuple[str, int, bool, bool]) -> Counter[str]:
    """Unpack one ``(sequence, k, canonical, skip_ambiguous)`` task and count its k-mers.

    Takes a single tuple so it can be used directly with ``pool.map``.
    """
    chunk, size, use_canonical, drop_ambiguous = args
    return count_kmers(chunk, size, use_canonical, drop_ambiguous)
60
+
61
+
62
+ def _chunk_sequence(sequence: str, n_chunks: int, k: int) -> list[str]:
63
+ """Split a sequence into ``n_chunks`` overlapping chunks (overlap = k - 1).
64
+
65
+ The overlap guarantees k-mers spanning chunk boundaries are still counted.
66
+ """
67
+ n = len(sequence)
68
+ if n_chunks <= 1 or n < 2 * k:
69
+ return [sequence]
70
+ size = max(k, n // n_chunks)
71
+ chunks: list[str] = []
72
+ start = 0
73
+ while start < n:
74
+ end = min(n, start + size)
75
+ chunks.append(sequence[start : min(n, end + k - 1)])
76
+ start = end
77
+ return chunks
78
+
79
+
80
def count_kmers_parallel(
    sequences: Iterable[str],
    k: int,
    canonical: bool = False,
    skip_ambiguous: bool = True,
    workers: int = 4,
) -> Counter[str]:
    """Count k-mers over many sequences, optionally across worker processes.

    Every sequence is upper-cased and cut into overlapping chunks. The
    chunks are counted in the current process when ``workers <= 1`` or only
    one chunk exists, and in a process pool of ``workers`` processes
    otherwise. The per-chunk counters are added into one result.
    """
    pieces = [
        piece
        for sequence in sequences
        for piece in _chunk_sequence(sequence.upper(), workers, k)
    ]
    merged: Counter[str] = Counter()
    if not pieces:
        return merged
    if workers <= 1 or len(pieces) == 1:
        # Not worth starting a pool: count serially.
        for piece in pieces:
            merged += count_kmers(piece, k, canonical, skip_ambiguous)
        return merged

    jobs = [(piece, k, canonical, skip_ambiguous) for piece in pieces]
    with ProcessPoolExecutor(max_workers=workers) as pool:
        for partial in pool.map(_count_chunk, jobs):
            merged += partial
    return merged
109
+
110
+
111
def top_kmers(counts: Counter[str], n: int = 10) -> list[tuple[str, int]]:
    """Return up to ``n`` ``(k-mer, count)`` pairs, highest count first."""
    ranked = counts.most_common(n)
    return ranked
114
+
115
+
116
def minimizers(sequence: str, k: int, w: int, canonical: bool = True) -> list[tuple[int, str]]:
    """Return ``(position, k-mer)`` minimizers over sliding windows of ``w`` k-mers.

    The sequence is upper-cased first. In each window the smallest k-mer in
    string order is picked (by canonical form when ``canonical`` is set);
    ties go to the leftmost occurrence. A pick identical to the previous
    window's pick is not repeated. When there are fewer than ``w`` k-mers a
    single, shorter window is used. Non-positive ``k`` or ``w`` raises
    :class:`ValueError`.
    """
    if not (k > 0 and w > 0):
        raise ValueError("k and w must be positive integers")
    keys = [
        canonical_kmer(kmer) if canonical else kmer
        for kmer in iter_kmers(sequence.upper(), k)
    ]
    if not keys:
        return []
    picks: list[tuple[int, str]] = []
    previous: tuple[int, str] | None = None
    for start in range(max(1, len(keys) - w + 1)):
        window = keys[start : start + w]
        smallest = min(window)
        candidate = (start + window.index(smallest), smallest)
        if candidate == previous:
            continue
        picks.append(candidate)
        previous = candidate
    return picks
bioseqkit/py.typed ADDED
File without changes
bioseqkit/stats.py ADDED
@@ -0,0 +1,111 @@
1
+ """Sequence statistics.
2
+
3
+ Length distribution, GC content, N-base ratio and base-composition matrix,
4
+ computed with the Python standard library only.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from collections import Counter
10
+ from dataclasses import dataclass, field
11
+ from typing import Iterable
12
+
13
+ __all__ = [
14
+ "SeqStats",
15
+ "gc_content",
16
+ "n_ratio",
17
+ "base_composition",
18
+ "sequence_stats",
19
+ ]
20
+
21
+
22
def gc_content(sequence: str) -> float:
    """Return the share of characters that are ``G`` or ``C`` in either case.

    An empty sequence gives 0.0.
    """
    if not sequence:
        return 0.0
    n_gc = sum(1 for base in sequence if base in "GCgc")
    return n_gc / len(sequence)
31
+
32
+
33
def n_ratio(sequence: str) -> float:
    """Return the share of characters that are ``N`` or ``n``.

    An empty sequence gives 0.0.
    """
    if not sequence:
        return 0.0
    n_unknown = sum(sequence.count(symbol) for symbol in ("N", "n"))
    return n_unknown / len(sequence)
39
+
40
+
41
def base_composition(sequence: str) -> dict[str, int]:
    """Count each character of the upper-cased sequence.

    Keys appear in order of first occurrence.
    """
    tally: dict[str, int] = {}
    for base in sequence.upper():
        tally[base] = tally.get(base, 0) + 1
    return tally
44
+
45
+
46
@dataclass
class SeqStats:
    """Summary statistics for a set of sequences.

    All fields default to zero or empty; ``lengths`` holds one entry per
    sequence.
    """

    n_seqs: int = 0
    total_length: int = 0
    min_length: int = 0
    max_length: int = 0
    lengths: list[int] = field(default_factory=list)
    gc_content: float = 0.0
    n_ratio: float = 0.0
    base_counts: dict[str, int] = field(default_factory=dict)

    @property
    def mean_length(self) -> float:
        """Average sequence length, or 0.0 when there are no sequences."""
        if not self.n_seqs:
            return 0.0
        return self.total_length / self.n_seqs

    def n50(self) -> int:
        """Return the N50 of the length distribution.

        Lengths are walked from longest to shortest; the result is the first
        length at which the running sum reaches half of ``total_length``.
        Returns 0 when there are no lengths or the threshold is never reached.
        """
        if not self.lengths:
            return 0
        threshold = self.total_length / 2
        ordered = sorted(self.lengths, reverse=True)
        running = 0
        for length in ordered:
            running += length
            if running >= threshold:
                return length
        return 0

    def as_dict(self) -> dict[str, object]:
        """Return the statistics as a plain dict, with floats rounded.

        ``lengths`` is not included; ``mean_length`` and ``n50`` are.
        """
        return dict(
            n_seqs=self.n_seqs,
            total_length=self.total_length,
            min_length=self.min_length,
            max_length=self.max_length,
            mean_length=round(self.mean_length, 3),
            n50=self.n50(),
            gc_content=round(self.gc_content, 6),
            n_ratio=round(self.n_ratio, 6),
            base_counts=self.base_counts,
        )
87
+
88
+
89
def sequence_stats(sequences: Iterable[str]) -> SeqStats:
    """Build a :class:`SeqStats` summary from an iterable of sequences.

    Character counts are taken on the upper-cased sequences. GC content and
    N ratio are fractions of the total length and stay 0.0 when the total
    length is zero.
    """
    summary = SeqStats()
    tally: Counter[str] = Counter()
    for sequence in sequences:
        seq_len = len(sequence)
        summary.n_seqs += 1
        summary.total_length += seq_len
        summary.lengths.append(seq_len)
        tally.update(sequence.upper())
    if summary.lengths:
        summary.min_length = min(summary.lengths)
        summary.max_length = max(summary.lengths)
    if summary.total_length:
        # Reading a missing key from a Counter gives 0 without adding the key.
        summary.gc_content = (tally["G"] + tally["C"]) / summary.total_length
        summary.n_ratio = tally["N"] / summary.total_length
    summary.base_counts = dict(tally)
    return summary
bioseqkit/transform.py ADDED
@@ -0,0 +1,95 @@
1
+ """Sequence transformations: reverse complement and translation.
2
+
3
+ Implements DNA reverse complement and the standard genetic code, including
4
+ six-frame translation (three forward frames + three reverse-complement frames).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+
11
+ __all__ = [
12
+ "reverse_complement",
13
+ "complement",
14
+ "translate",
15
+ "six_frame_translation",
16
+ "Frame",
17
+ "CODON_TABLE",
18
+ ]
19
+
20
+ _COMPLEMENT = str.maketrans(
21
+ "ACGTUNacgtunRYSWKMBDHVryswkmbdhv",
22
+ "TGCAANtgcaanYRSWMKVHDByrswmkvhdb",
23
+ )
24
+
25
+ # Standard genetic code (NCBI transl_table=1).
26
+ CODON_TABLE: dict[str, str] = {
27
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
28
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
29
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
30
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
31
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
32
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
33
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
34
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
35
+ "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*",
36
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
37
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
38
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
39
+ "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W",
40
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
41
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
42
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
43
+ }
44
+
45
+
46
def complement(sequence: str) -> str:
    """Map each base to its complement via the module's translation table.

    Characters not in the table are left unchanged, and order is preserved.
    """
    complemented = sequence.translate(_COMPLEMENT)
    return complemented
49
+
50
+
51
def reverse_complement(sequence: str) -> str:
    """Complement every base of ``sequence`` and return the result reversed."""
    complemented = sequence.translate(_COMPLEMENT)
    return "".join(reversed(complemented))
54
+
55
+
56
def translate(sequence: str, unknown: str = "X") -> str:
    """Translate a nucleotide sequence codon by codon, starting at its first base.

    The sequence is upper-cased and ``U`` is replaced by ``T`` before lookup.
    One or two leftover bases at the end are ignored. Codons missing from
    ``CODON_TABLE`` are rendered as ``unknown``.
    """
    dna = sequence.upper().replace("U", "T")
    n_codons = len(dna) // 3
    return "".join(
        CODON_TABLE.get(dna[3 * index : 3 * index + 3], unknown)
        for index in range(n_codons)
    )
68
+
69
+
70
@dataclass(frozen=True)
class Frame:
    """A translated reading frame: strand, offset and the resulting protein."""

    strand: str  # '+' or '-'
    offset: int  # 0, 1 or 2
    protein: str

    @property
    def name(self) -> str:
        """Frame label: the strand followed by the 1-based offset, e.g. ``+1``."""
        frame_number = self.offset + 1
        return f"{self.strand}{frame_number}"
81
+
82
+
83
def six_frame_translation(sequence: str, unknown: str = "X") -> list[Frame]:
    """Translate ``sequence`` in all six reading frames.

    The result lists the forward strand at offsets 0, 1 and 2 (frames
    ``+1/+2/+3``) followed by the reverse complement at offsets 0, 1 and 2
    (frames ``-1/-2/-3``). The input is upper-cased and ``U`` is replaced
    by ``T`` before the reverse complement is taken.
    """
    forward = sequence.upper().replace("U", "T")
    templates = {"+": forward, "-": reverse_complement(forward)}
    return [
        Frame(sign, shift, translate(template[shift:], unknown))
        for sign, template in templates.items()
        for shift in range(3)
    ]
@@ -0,0 +1,141 @@
1
+ Metadata-Version: 2.4
2
+ Name: bioseqkit
3
+ Version: 0.1.0
4
+ Summary: A lightweight, dependency-free biological sequence processing toolkit (FASTA/FASTQ, stats, k-mer, minimizer, indexing).
5
+ Author-email: Jilai Cheng <chengjilai@sjtu.edu.cn>
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Keywords: bioinformatics,fasta,fastq,kmer,minimizer,sequence
9
+ Requires-Python: >=3.10
10
+ Provides-Extra: docs
11
+ Requires-Dist: myst-parser>=2.0; extra == 'docs'
12
+ Requires-Dist: sphinx>=7.0; extra == 'docs'
13
+ Provides-Extra: net
14
+ Requires-Dist: requests>=2.28; extra == 'net'
15
+ Provides-Extra: viz
16
+ Requires-Dist: matplotlib>=3.7; extra == 'viz'
17
+ Requires-Dist: seaborn>=0.12; extra == 'viz'
18
+ Description-Content-Type: text/markdown
19
+
20
+ # bioseqkit
21
+
22
+ A lightweight, **dependency-free** biological sequence processing toolkit built
23
+ from scratch in pure Python. `bioseqkit` implements FASTA/FASTQ parsing,
24
+ sequence statistics, transformations, k-mer / minimizer analysis and FAI-like
25
+ random-access indexing, exposed both as a Python API and a command-line tool.
26
+
27
+ The project is a teaching implementation for **BIO2502 (Programming Languages
28
+ for Biological Computing)**: it deliberately re-implements the low-level I/O,
29
+ streaming and indexing logic instead of relying on Biopython, so the core
30
+ design patterns of bioinformatics data handling are made explicit.
31
+
32
+ ## Features
33
+
34
+ - **Streaming FASTA/FASTQ parsers** (`io`) — generator based, constant memory,
35
+ transparent gzip support, Phred quality decoding.
36
+ - **Statistics** (`stats`) — length distribution, N50, GC content, N-base
37
+ ratio, base-composition matrix.
38
+ - **Transformations** (`transform`) — reverse complement (IUPAC aware) and
39
+ six-frame translation with the standard genetic code.
40
+ - **k-mer analysis** (`kmer`) — counting, top-k, canonical k-mers,
41
+ **multi-process** parallel counting, and **minimizer** sampling.
42
+ - **FAI-like indexing** (`index`) — `samtools faidx`-compatible index for
43
+ `chr:start-end` random access without scanning the whole file.
44
+ - **CLI** (`cli`) — `stats`, `revcomp`, `translate`, `kmer`, `minimizer`,
45
+ `index`, `fetch`.
46
+ - **NCBI download** (`entrez`) — fetch reference sequences via E-utilities
47
+ (standard-library HTTP only).
48
+
49
+ ## Project layout
50
+
51
+ ```
52
+ bioseqkit/
53
+ ├── pyproject.toml # src-layout, PEP 621 metadata, console script
54
+ ├── README.md
55
+ ├── LICENSE
56
+ ├── environment.yml # conda environment
57
+ ├── requirements.txt
58
+ ├── src/bioseqkit/
59
+ │ ├── __init__.py # public API
60
+ │ ├── io.py # FASTA/FASTQ parsers
61
+ │ ├── stats.py # sequence statistics
62
+ │ ├── transform.py # revcomp + six-frame translation
63
+ │ ├── kmer.py # k-mer / minimizer (serial + parallel)
64
+ │ ├── index.py # FAI-like random-access index
65
+ │ ├── entrez.py # NCBI download helper
66
+ │ └── cli.py # argparse CLI
67
+ ├── tests/ # pytest suite (io/stats/transform/kmer/index/cli)
68
+ ├── examples/
69
+ │ ├── demo.ipynb # Jupyter demo (stats, GC, k-mer spectrum, ...)
70
+ │ └── example_data/sample.fa
71
+ ├── docs/ # Sphinx documentation
72
+ └── .github/workflows/ci.yml
73
+ ```
74
+
75
+ ## Installation
76
+
77
+ Requires Python >= 3.10. The core package has **no runtime dependencies**.
78
+
79
+ ```bash
80
+ # with uv (recommended)
81
+ uv pip install -e .
82
+
83
+ # or plain pip
84
+ pip install -e .
85
+
86
+ # with optional extras (plots for the notebook / NCBI download / docs)
87
+ pip install -e ".[viz,net,docs]"
88
+ ```
89
+
90
+ ## Command-line usage
91
+
92
+ ```bash
93
+ bioseqkit stats examples/example_data/sample.fa # JSON statistics
94
+ bioseqkit revcomp examples/example_data/sample.fa # reverse complement
95
+ bioseqkit translate examples/example_data/sample.fa # six-frame translation
96
+ bioseqkit kmer examples/example_data/sample.fa -k 5 --top 10 --canonical
97
+ bioseqkit kmer examples/example_data/sample.fa -k 5 -t 4 # parallel
98
+ bioseqkit minimizer examples/example_data/sample.fa -k 15 -w 10
99
+ bioseqkit index examples/example_data/sample.fa # write *.fai
100
+ bioseqkit fetch examples/example_data/sample.fa seq2:1-16
101
+ ```
102
+
103
+ ## Python API
104
+
105
+ ```python
106
+ import bioseqkit as bsk
107
+
108
+ for rec in bsk.parse_fasta("examples/example_data/sample.fa"):
109
+ print(rec.id, len(rec), bsk.gc_content(rec.sequence))
110
+
111
+ print(bsk.reverse_complement("ATGC")) # -> GCAT
112
+ print(bsk.translate("ATGGCCTAA")) # -> MA*
113
+
114
+ counts = bsk.count_kmers("ACGTACGTACGT", k=3, canonical=True)
115
+ print(bsk.top_kmers(counts, 3))
116
+
117
+ idx = bsk.build_faidx("examples/example_data/sample.fa")
118
+ print(idx.fetch("seq2", 1, 16))
119
+ ```
120
+
121
+ ## Testing
122
+
123
+ ```bash
124
+ uv run --with pytest pytest -q # 39 tests
125
+ ```
126
+
127
+ Continuous integration (GitHub Actions) runs `ruff` linting and the `pytest`
128
+ suite on Python 3.10–3.12 for every push.
129
+
130
+ ## Data sources
131
+
132
+ - NCBI Nucleotide: <https://www.ncbi.nlm.nih.gov/nucleotide/>
133
+ - UCSC Genome Browser: <https://genome.ucsc.edu/>
134
+
135
+ The bundled `examples/example_data/sample.fa` is a small synthetic FASTA file for
136
+ offline testing; `demo.ipynb` will download real data from NCBI when a network
137
+ connection is available and fall back to the bundled file otherwise.
138
+
139
+ ## License
140
+
141
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,14 @@
1
+ bioseqkit/__init__.py,sha256=WPNQec42lcU4LJpPE8jM8Y_fFk1bOGL6MtpVhf7JXzY,1261
2
+ bioseqkit/cli.py,sha256=5lUd-el13pi7ZYis5RQQJbtPz_OecZbNTN5IDKpwn3k,4938
3
+ bioseqkit/entrez.py,sha256=33vTm0Lv7m--P_OdiIw_5wj8zgrrvAebKP--PQnT1oM,1144
4
+ bioseqkit/index.py,sha256=y9KcvGW6ndoKgYLP34sGsCNkSbZ2uf3jU3R2yeUEqCo,6038
5
+ bioseqkit/io.py,sha256=_YXwA2tI6xamdHf1sEdHKczpP8SY7YAiEQNPrgenSFM,5586
6
+ bioseqkit/kmer.py,sha256=JFsAF6BXEfgcb3Dq9NBHVoI7vapI34ctMvS8rFUnZM8,4557
7
+ bioseqkit/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ bioseqkit/stats.py,sha256=QHwkda7nP-ithSaB30QTsXZdR_WrUOQIoWqPYpgAthw,3176
9
+ bioseqkit/transform.py,sha256=lU66dRisQUEpRNHSZC9maz4mAXmcHBr1QnLGFfo5Dug,3110
10
+ bioseqkit-0.1.0.dist-info/METADATA,sha256=o4-fLIUZweJ9NMeDV57QqXvesFYp-_5EyXL4Sch4vfc,5228
11
+ bioseqkit-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
12
+ bioseqkit-0.1.0.dist-info/entry_points.txt,sha256=xZmtCloPHVsircuQzpq51XSbYxc4zCLrgtf2SVcva6U,49
13
+ bioseqkit-0.1.0.dist-info/licenses/LICENSE,sha256=8TV7QZtAVFsQL2J6cdUiz2zWvFiWzf7vJ4ix5o3bzK8,1068
14
+ bioseqkit-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ bioseqkit = bioseqkit.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jilai Cheng
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.