seqstatx 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,90 @@
1
+ Metadata-Version: 2.4
2
+ Name: seqstatx
3
+ Version: 0.1.0
4
+ Summary: Fast sequence statistics for FASTA/FASTQ files — N50, GC%, length distributions and more
5
+ Author-email: Wendy Bui <wendybuinta@gmail.com>
6
+ License: MIT
7
+ Keywords: bioinformatics,genomics,fasta,fastq,sequence,qc
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Provides-Extra: dev
14
+ Requires-Dist: pytest; extra == "dev"
15
+ Requires-Dist: pytest-cov; extra == "dev"
16
+
17
+ # seqstats
18
+
19
+ [![CI](https://github.com/perhapsstrawberries/seqstats/actions/workflows/ci.yml/badge.svg)](https://github.com/perhapsstrawberries/seqstats/actions/workflows/ci.yml)
20
+ [![PyPI](https://img.shields.io/pypi/v/seqstatx)](https://pypi.org/project/seqstatx/)
21
+ ![Python](https://img.shields.io/badge/python-3.10%2B-blue)
22
+ ![License](https://img.shields.io/badge/license-MIT-green)
23
+
24
+ Fast sequence statistics for FASTA and FASTQ files — works on plain or gzipped inputs, no dependencies.
25
+
26
+ ```
27
+ file seqs total_bp gc% mean_len min_len max_len N50 N90
28
+ -----------------------------------------------------------------------------------------------------------------
29
+ GRCh38.primary_assembly.fa 194 3,088,286,401 40.93 15,918,992 970 248,956,422 153,373,213 40,103,529
30
+ SRR10045678_1.fastq.gz 10000000 1,510,000,000 50.21 151.0 151 151 151 151
31
+ ```
32
+
33
+ ## Install
34
+
35
+ ```bash
36
+ pip install seqstatx
37
+ ```
38
+
39
+ Or for development:
40
+
41
+ ```bash
42
+ git clone https://github.com/perhapsstrawberries/seqstats.git
43
+ cd seqstats
44
+ pip install -e .
45
+ ```
46
+
47
+ ## Usage
48
+
49
+ ```bash
50
+ # single file
51
+ seqstatx genome.fa
52
+
53
+ # multiple files, gzipped FASTQ
54
+ seqstatx sample1.fastq.gz sample2.fastq.gz
55
+
56
+ # TSV output for downstream parsing
57
+ seqstatx --tsv *.fa > stats.tsv
58
+
59
+ # pipe to column for alignment
60
+ seqstatx --tsv *.fastq.gz | column -t
61
+ ```
62
+
63
+ ## Metrics
64
+
65
+ | Column | Description |
66
+ |--------|-------------|
67
+ | `seqs` | Number of sequences / reads |
68
+ | `total_bp` | Total base pairs |
69
+ | `gc%` | GC content (%) |
70
+ | `mean_len` | Mean sequence length |
71
+ | `min_len` / `max_len` | Shortest / longest sequence |
72
+ | `N50` | Largest length L such that sequences of length ≥ L contain at least 50% of all bases |
73
+ | `N90` | Largest length L such that sequences of length ≥ L contain at least 90% of all bases |
74
+
75
+ ## Supported formats
76
+
77
+ | Extension | Format |
78
+ |-----------|--------|
79
+ | `.fa` `.fna` `.fasta` | FASTA |
80
+ | `.fq` `.fastq` | FASTQ |
81
+ | `.fa.gz` `.fastq.gz` etc. | gzipped variants |
82
+
83
+ ## Why
84
+
85
+ Existing tools (seqkit, seqtk) are great but require installation of compiled binaries.
86
+ `seqstats` (published on PyPI as `seqstatx`) is pure Python 3.10+ with zero dependencies, so it can be pip-installed into any HPC or Conda environment.
87
+
88
+ ## License
89
+
90
+ MIT
@@ -0,0 +1,74 @@
1
+ # seqstats
2
+
3
+ [![CI](https://github.com/perhapsstrawberries/seqstats/actions/workflows/ci.yml/badge.svg)](https://github.com/perhapsstrawberries/seqstats/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/seqstatx)](https://pypi.org/project/seqstatx/)
5
+ ![Python](https://img.shields.io/badge/python-3.10%2B-blue)
6
+ ![License](https://img.shields.io/badge/license-MIT-green)
7
+
8
+ Fast sequence statistics for FASTA and FASTQ files — works on plain or gzipped inputs, no dependencies.
9
+
10
+ ```
11
+ file seqs total_bp gc% mean_len min_len max_len N50 N90
12
+ -----------------------------------------------------------------------------------------------------------------
13
+ GRCh38.primary_assembly.fa 194 3,088,286,401 40.93 15,918,992 970 248,956,422 153,373,213 40,103,529
14
+ SRR10045678_1.fastq.gz 10000000 1,510,000,000 50.21 151.0 151 151 151 151
15
+ ```
16
+
17
+ ## Install
18
+
19
+ ```bash
20
+ pip install seqstatx
21
+ ```
22
+
23
+ Or for development:
24
+
25
+ ```bash
26
+ git clone https://github.com/perhapsstrawberries/seqstats.git
27
+ cd seqstats
28
+ pip install -e .
29
+ ```
30
+
31
+ ## Usage
32
+
33
+ ```bash
34
+ # single file
35
+ seqstatx genome.fa
36
+
37
+ # multiple files, gzipped FASTQ
38
+ seqstatx sample1.fastq.gz sample2.fastq.gz
39
+
40
+ # TSV output for downstream parsing
41
+ seqstatx --tsv *.fa > stats.tsv
42
+
43
+ # pipe to column for alignment
44
+ seqstatx --tsv *.fastq.gz | column -t
45
+ ```
46
+
47
+ ## Metrics
48
+
49
+ | Column | Description |
50
+ |--------|-------------|
51
+ | `seqs` | Number of sequences / reads |
52
+ | `total_bp` | Total base pairs |
53
+ | `gc%` | GC content (%) |
54
+ | `mean_len` | Mean sequence length |
55
+ | `min_len` / `max_len` | Shortest / longest sequence |
56
+ | `N50` | Largest length L such that sequences of length ≥ L contain at least 50% of all bases |
57
+ | `N90` | Largest length L such that sequences of length ≥ L contain at least 90% of all bases |
58
+
59
+ ## Supported formats
60
+
61
+ | Extension | Format |
62
+ |-----------|--------|
63
+ | `.fa` `.fna` `.fasta` | FASTA |
64
+ | `.fq` `.fastq` | FASTQ |
65
+ | `.fa.gz` `.fastq.gz` etc. | gzipped variants |
66
+
67
+ ## Why
68
+
69
+ Existing tools (seqkit, seqtk) are great but require installation of compiled binaries.
70
+ `seqstats` (published on PyPI as `seqstatx`) is pure Python 3.10+ with zero dependencies, so it can be pip-installed into any HPC or Conda environment.
71
+
72
+ ## License
73
+
74
+ MIT
@@ -0,0 +1,29 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "seqstatx"
7
+ version = "0.1.0"
8
+ description = "Fast sequence statistics for FASTA/FASTQ files — N50, GC%, length distributions and more"
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ requires-python = ">=3.10"
12
+ authors = [{ name = "Wendy Bui", email = "wendybuinta@gmail.com" }]
13
+ keywords = ["bioinformatics", "genomics", "fasta", "fastq", "sequence", "qc"]
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
18
+ ]
19
+ dependencies = []
20
+
21
+ [project.scripts]
22
+ seqstatx = "seqstats.cli:main"
23
+
24
+ [project.optional-dependencies]
25
+ dev = ["pytest", "pytest-cov"]
26
+
27
+ [tool.pytest.ini_options]
28
+ testpaths = ["tests"]
29
+ addopts = "-q"
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,84 @@
1
+ """Command-line interface for seqstats."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ from seqstats.core import SeqStats, parse_file
10
+ from seqstats import __version__
11
+
12
+ _COLS = ["file", "seqs", "total_bp", "gc%", "mean_len", "min_len", "max_len", "N50", "N90"]
13
+ _COL_W = [30, 8, 12, 7, 10, 8, 8, 10, 10]
14
+
15
+
16
def _fmt(stats: SeqStats) -> list[str]:
    """Convert *stats* into display strings, one per output column.

    The name is cut to its first 29 characters, GC% is shown with two
    decimals and the mean length with one; every other value is a plain
    ``str()`` conversion.
    """
    cells = [stats.name[:29], str(stats.n_seqs), str(stats.total_bases)]
    cells.append(f"{stats.gc_pct:.2f}")
    cells.append(f"{stats.mean_len:.1f}")
    cells.extend(
        str(value)
        for value in (stats.min_len, stats.max_len, stats.n50, stats.n90)
    )
    return cells
28
+
29
+
30
def _header() -> str:
    """Build the header line of the plain-text table.

    Each column title is left-justified to its configured width.
    """
    padded = []
    for title, width in zip(_COLS, _COL_W):
        padded.append(f"{title:<{width}}")
    return " ".join(padded)
32
+
33
+
34
def _row(stats: SeqStats) -> str:
    """Build one data line of the plain-text table for *stats*.

    Values come from ``_fmt`` and are left-justified to the column widths.
    """
    cells = _fmt(stats)
    return " ".join(f"{cell:<{width}}" for cell, width in zip(cells, _COL_W))
36
+
37
+
38
def _tsv_header() -> str:
    """Return the column names joined by tab characters."""
    separator = "\t"
    return separator.join(_COLS)
40
+
41
+
42
def _tsv_row(stats: SeqStats) -> str:
    """Render *stats* as one tab-separated line.

    ``_fmt`` shortens the name so it fits the fixed-width table. TSV output
    is meant for downstream parsing, so the full, untruncated name is put
    back here; otherwise long file names sharing a 29-character prefix would
    become indistinguishable.
    """
    fields = _fmt(stats)
    fields[0] = stats.name  # undo the table-only truncation
    return "\t".join(fields)
44
+
45
+
46
def main(argv: list[str] | None = None) -> None:
    """Run the ``seqstatx`` command line.

    Parses *argv*, reports every input path that does not exist on stderr
    and exits with status 1 if there is any, then prints a header followed
    by one line of statistics per file — tab-separated when ``--tsv`` is
    given, otherwise as a fixed-width table with a dashed rule.
    """
    usage_examples = """examples:
seqstatx genome.fa
seqstatx *.fastq.gz
seqstatx --tsv reads.fq.gz > stats.tsv
seqstatx --tsv sample1.fa sample2.fa | column -t"""

    parser = argparse.ArgumentParser(
        prog="seqstatx",
        description="Compute sequence statistics for FASTA/FASTQ files.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    parser.add_argument("files", nargs="+", type=Path, metavar="FILE")
    parser.add_argument("--tsv", action="store_true", help="output tab-separated values")
    parser.add_argument("--version", action="version", version=f"seqstatx {__version__}")
    args = parser.parse_args(argv)

    # Report all missing inputs at once before giving up.
    absent = [candidate for candidate in args.files if not candidate.exists()]
    for candidate in absent:
        print(f"[seqstats] file not found: {candidate}", file=sys.stderr)
    if absent:
        sys.exit(1)

    if args.tsv:
        render = _tsv_row
        print(_tsv_header())
    else:
        render = _row
        print(_header())
        # NOTE(review): the rule width assumes a two-character gap between
        # columns — confirm it matches the joiner used by _header/_row.
        rule_width = sum(_COL_W) + 2 * (len(_COL_W) - 1)
        print("-" * rule_width)

    for path in args.files:
        print(render(parse_file(path)))
81
+
82
+
83
+ if __name__ == "__main__":
84
+ main()
@@ -0,0 +1,125 @@
1
+ """Core sequence statistics logic."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import gzip
6
+ import sys
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+
10
+
11
@dataclass
class SeqStats:
    """Accumulate summary statistics over a set of sequences.

    Sequences are fed in one at a time through :meth:`add`. The derived
    metrics (GC%, mean/min/max length, N50, N90) are computed on demand from
    the running counters and the recorded per-sequence lengths.
    """

    name: str  # label of the data set
    n_seqs: int = 0  # number of sequences added so far
    total_bases: int = 0  # sum of the lengths of all added sequences
    gc_count: int = 0  # number of G/C characters seen, upper or lower case
    _lengths: list[int] = field(default_factory=list, repr=False)  # one length per sequence

    def add(self, seq: str) -> None:
        """Record one sequence: bump the counters and remember its length."""
        n = len(seq)
        self.n_seqs += 1
        self.total_bases += n
        self.gc_count += sum(seq.count(base) for base in "GCgc")
        self._lengths.append(n)

    @property
    def gc_pct(self) -> float:
        """GC content as a percentage of all bases; 0.0 when there are none."""
        return 100.0 * self.gc_count / self.total_bases if self.total_bases else 0.0

    @property
    def mean_len(self) -> float:
        """Mean sequence length; 0.0 when no sequence has been added."""
        return self.total_bases / self.n_seqs if self.n_seqs else 0.0

    @property
    def min_len(self) -> int:
        """Length of the shortest sequence; 0 when no sequence has been added."""
        return min(self._lengths) if self._lengths else 0

    @property
    def max_len(self) -> int:
        """Length of the longest sequence; 0 when no sequence has been added."""
        return max(self._lengths) if self._lengths else 0

    def nx(self, percent: float) -> int:
        """Return the Nx length for the given *percent*.

        Lengths are visited from longest to shortest while summing them; the
        result is the length at which the running sum first reaches
        *percent* % of ``total_bases``.

        Args:
            percent: Coverage target, greater than 0 and at most 100.

        Returns:
            The Nx length, or 0 when no sequence has been added.

        Raises:
            ValueError: If *percent* is outside ``(0, 100]``.
        """
        if not 0 < percent <= 100:
            raise ValueError(f"percent must be in (0, 100], got {percent!r}")
        if not self._lengths:
            return 0
        # Compare covered/total >= percent/100 in cross-multiplied form so
        # that integer percentages are evaluated without float rounding.
        target = self.total_bases * percent
        covered = 0
        for length in sorted(self._lengths, reverse=True):
            covered += length
            if covered * 100 >= target:
                return length
        return 0

    @property
    def n50(self) -> int:
        """N50: sequences at least this long hold at least half of all bases."""
        return self.nx(50)

    @property
    def n90(self) -> int:
        """N90: sequences at least this long hold at least 90% of all bases."""
        return self.nx(90)
67
+
68
+
69
def _open(path: Path):
    """Open *path* for reading as text, decompressing gzip transparently.

    Compression is inferred from the final suffix only (``.gz`` or
    ``.gzip``). The comparison ignores case so that a name such as
    ``READS.FASTQ.GZ`` — which ``parse_file`` recognises through its
    lower-cased extension check — is decompressed rather than read as if
    the compressed bytes were text.

    Returns:
        A text-mode file object; the caller closes it (normally via ``with``).
    """
    if path.suffix.lower() in (".gz", ".gzip"):
        return gzip.open(path, "rt")
    return open(path, "r")
74
+
75
+
76
def parse_fasta(path: Path) -> SeqStats:
    """Read a FASTA file and return its statistics.

    Blank lines are ignored. Lines starting with ``>`` close the record
    collected so far; all other lines are stripped of trailing whitespace
    and concatenated into the current record. A record is only counted when
    at least one sequence line was collected for it.
    """
    stats = SeqStats(name=path.name)
    pieces: list[str] = []

    with _open(path) as handle:
        for raw in handle:
            text = raw.rstrip()
            if text == "":
                continue
            if text[0] != ">":
                pieces.append(text)
                continue
            # Header line: emit the record gathered before it, if any.
            if pieces:
                stats.add("".join(pieces))
                pieces.clear()

    # The last record has no following header to trigger its emission.
    if pieces:
        stats.add("".join(pieces))
        pieces.clear()
    return stats
96
+
97
+
98
def parse_fastq(path: Path) -> SeqStats:
    """Read a FASTQ file and return its statistics.

    The file is consumed four lines at a time (header, sequence, ``+``
    separator, quality); only the sequence line is used. Reading stops when
    the header read returns an empty string, and records whose sequence
    line is empty after stripping are not counted.
    """
    stats = SeqStats(name=path.name)
    with _open(path) as handle:
        # iter(callable, sentinel) ends at EOF, where readline() returns "".
        for _header_line in iter(handle.readline, ""):
            bases = handle.readline().rstrip()
            handle.readline()  # "+" separator line
            handle.readline()  # quality line
            if bases:
                stats.add(bases)
    return stats
111
+
112
+
113
def parse_file(path: Path) -> SeqStats:
    """Pick a parser for *path* and return the resulting statistics.

    The format is chosen from the lower-cased file name: FASTA for
    ``.fa``/``.fna``/``.fasta`` and FASTQ for ``.fq``/``.fastq``, each
    optionally followed by ``.gz``.

    For any other name the first non-blank line of a regular file is
    inspected: a leading ``@`` selects the FASTQ parser, anything else falls
    back to FASTA. Without this check a FASTQ file with an unrecognised
    extension would be read as FASTA and reported as one bogus sequence.

    If the fallback raises, a message is printed to stderr and the process
    exits with status 1.
    """
    name = path.name.lower()
    if name.endswith((".fa", ".fna", ".fasta", ".fa.gz", ".fna.gz", ".fasta.gz")):
        return parse_fasta(path)
    if name.endswith((".fq", ".fastq", ".fq.gz", ".fastq.gz")):
        return parse_fastq(path)

    # Unknown extension: sniff the content, defaulting to FASTA.
    try:
        first = ""
        # Only peek into regular files: a pipe or FIFO cannot be re-read
        # after the peek, so those keep the plain FASTA fallback.
        if path.is_file():
            with _open(path) as fh:
                first = next((line for line in fh if line.strip()), "")
        if first.lstrip().startswith("@"):
            return parse_fastq(path)
        return parse_fasta(path)
    except Exception:
        # Deliberately broad: any read/decode failure here means the input
        # is not something this tool can interpret.
        print(f"[seqstats] could not detect format for {path.name}", file=sys.stderr)
        sys.exit(1)
@@ -0,0 +1,90 @@
1
+ Metadata-Version: 2.4
2
+ Name: seqstatx
3
+ Version: 0.1.0
4
+ Summary: Fast sequence statistics for FASTA/FASTQ files — N50, GC%, length distributions and more
5
+ Author-email: Wendy Bui <wendybuinta@gmail.com>
6
+ License: MIT
7
+ Keywords: bioinformatics,genomics,fasta,fastq,sequence,qc
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Provides-Extra: dev
14
+ Requires-Dist: pytest; extra == "dev"
15
+ Requires-Dist: pytest-cov; extra == "dev"
16
+
17
+ # seqstats
18
+
19
+ [![CI](https://github.com/perhapsstrawberries/seqstats/actions/workflows/ci.yml/badge.svg)](https://github.com/perhapsstrawberries/seqstats/actions/workflows/ci.yml)
20
+ [![PyPI](https://img.shields.io/pypi/v/seqstatx)](https://pypi.org/project/seqstatx/)
21
+ ![Python](https://img.shields.io/badge/python-3.10%2B-blue)
22
+ ![License](https://img.shields.io/badge/license-MIT-green)
23
+
24
+ Fast sequence statistics for FASTA and FASTQ files — works on plain or gzipped inputs, no dependencies.
25
+
26
+ ```
27
+ file seqs total_bp gc% mean_len min_len max_len N50 N90
28
+ -----------------------------------------------------------------------------------------------------------------
29
+ GRCh38.primary_assembly.fa 194 3,088,286,401 40.93 15,918,992 970 248,956,422 153,373,213 40,103,529
30
+ SRR10045678_1.fastq.gz 10000000 1,510,000,000 50.21 151.0 151 151 151 151
31
+ ```
32
+
33
+ ## Install
34
+
35
+ ```bash
36
+ pip install seqstatx
37
+ ```
38
+
39
+ Or for development:
40
+
41
+ ```bash
42
+ git clone https://github.com/perhapsstrawberries/seqstats.git
43
+ cd seqstats
44
+ pip install -e .
45
+ ```
46
+
47
+ ## Usage
48
+
49
+ ```bash
50
+ # single file
51
+ seqstatx genome.fa
52
+
53
+ # multiple files, gzipped FASTQ
54
+ seqstatx sample1.fastq.gz sample2.fastq.gz
55
+
56
+ # TSV output for downstream parsing
57
+ seqstatx --tsv *.fa > stats.tsv
58
+
59
+ # pipe to column for alignment
60
+ seqstatx --tsv *.fastq.gz | column -t
61
+ ```
62
+
63
+ ## Metrics
64
+
65
+ | Column | Description |
66
+ |--------|-------------|
67
+ | `seqs` | Number of sequences / reads |
68
+ | `total_bp` | Total base pairs |
69
+ | `gc%` | GC content (%) |
70
+ | `mean_len` | Mean sequence length |
71
+ | `min_len` / `max_len` | Shortest / longest sequence |
72
+ | `N50` | Largest length L such that sequences of length ≥ L contain at least 50% of all bases |
73
+ | `N90` | Largest length L such that sequences of length ≥ L contain at least 90% of all bases |
74
+
75
+ ## Supported formats
76
+
77
+ | Extension | Format |
78
+ |-----------|--------|
79
+ | `.fa` `.fna` `.fasta` | FASTA |
80
+ | `.fq` `.fastq` | FASTQ |
81
+ | `.fa.gz` `.fastq.gz` etc. | gzipped variants |
82
+
83
+ ## Why
84
+
85
+ Existing tools (seqkit, seqtk) are great but require installation of compiled binaries.
86
+ `seqstats` (published on PyPI as `seqstatx`) is pure Python 3.10+ with zero dependencies, so it can be pip-installed into any HPC or Conda environment.
87
+
88
+ ## License
89
+
90
+ MIT
@@ -0,0 +1,13 @@
1
+ README.md
2
+ pyproject.toml
3
+ seqstats/__init__.py
4
+ seqstats/cli.py
5
+ seqstats/core.py
6
+ seqstatx.egg-info/PKG-INFO
7
+ seqstatx.egg-info/SOURCES.txt
8
+ seqstatx.egg-info/dependency_links.txt
9
+ seqstatx.egg-info/entry_points.txt
10
+ seqstatx.egg-info/requires.txt
11
+ seqstatx.egg-info/top_level.txt
12
+ tests/test_cli.py
13
+ tests/test_core.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ seqstatx = seqstats.cli:main
@@ -0,0 +1,4 @@
1
+
2
+ [dev]
3
+ pytest
4
+ pytest-cov
@@ -0,0 +1 @@
1
+ seqstats
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,57 @@
1
+ """Tests for the seqstats CLI."""
2
+
3
+ import subprocess
4
+ import sys
5
+
6
+ import pytest
7
+
8
+ from seqstats.cli import main
9
+
10
+ FASTA_TEXT = ">seq1\nACGTACGTGG\n>seq2\nGGGGCCCCAAAATTTT\n>seq3\nATAT\n"
11
+
12
+
13
def test_cli_default_output(tmp_path, capsys):
    """The default table mentions the input file and the column titles."""
    fasta = tmp_path / "test.fa"
    fasta.write_text(FASTA_TEXT)
    main([str(fasta)])
    printed = capsys.readouterr().out
    for expected in ("test.fa", "N50", "gc%"):
        assert expected in printed
21
+
22
+
23
def test_cli_tsv_output(tmp_path, capsys):
    """``--tsv`` prints a tab-separated header and an equally wide data row."""
    fasta = tmp_path / "test.fa"
    fasta.write_text(FASTA_TEXT)
    main(["--tsv", str(fasta)])
    rows = capsys.readouterr().out.strip().split("\n")
    header = rows[0]
    assert header.startswith("file\tseqs")
    # The data row must have as many tab-separated fields as the header.
    data = rows[1]
    assert len(data.split("\t")) == len(header.split("\t"))
32
+
33
+
34
def test_cli_missing_file_exits(tmp_path):
    """A path that does not exist makes ``main`` exit with status 1."""
    absent = tmp_path / "nope.fa"
    with pytest.raises(SystemExit) as caught:
        main([str(absent)])
    assert caught.value.code == 1
38
+
39
+
40
def test_cli_multiple_files(tmp_path, capsys):
    """Two inputs in TSV mode yield a header plus one row per file."""
    inputs = [tmp_path / "a.fa", tmp_path / "b.fa"]
    for fasta in inputs:
        fasta.write_text(FASTA_TEXT)
    main(["--tsv", *(str(fasta) for fasta in inputs)])
    rows = capsys.readouterr().out.strip().split("\n")
    assert len(rows) == 3  # header + 2 rows
49
+
50
+
51
def test_cli_version():
    """``python -m seqstats.cli --version`` prints the program name and exits 0."""
    result = subprocess.run(
        [sys.executable, "-m", "seqstats.cli", "--version"],
        capture_output=True,
        text=True,
    )
    # Check the exit status as well, so a run that prints the name and then
    # fails cannot pass the test.
    assert result.returncode == 0
    assert "seqstatx" in result.stdout
@@ -0,0 +1,135 @@
1
+ """Tests for seqstats.core — values hand-verified."""
2
+
3
+ import gzip
4
+ from pathlib import Path
5
+
6
+ import pytest
7
+
8
+ from seqstats.core import SeqStats, parse_fasta, parse_fastq, parse_file
9
+
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # SeqStats unit tests
13
+ # ---------------------------------------------------------------------------
14
+
15
def test_empty_stats():
    """A fresh accumulator reports zero for every metric."""
    fresh = SeqStats(name="empty")
    for attr in ("n_seqs", "total_bases"):
        assert getattr(fresh, attr) == 0
    assert fresh.gc_pct == 0.0
    assert fresh.mean_len == 0.0
    for attr in ("min_len", "max_len", "n50", "n90"):
        assert getattr(fresh, attr) == 0
25
+
26
+
27
def test_single_sequence():
    """With one 10 bp sequence every length metric equals 10."""
    stats = SeqStats(name="one")
    # A C G T A C G T G G -> 2 x C + 4 x G = 6 GC bases out of 10
    stats.add("ACGTACGTGG")
    assert stats.n_seqs == 1
    assert stats.total_bases == 10
    assert stats.gc_count == 6
    assert stats.gc_pct == pytest.approx(60.0)
    assert stats.mean_len == 10.0
    assert stats.min_len == 10
    assert stats.max_len == 10
    assert stats.n50 == 10
    assert stats.n90 == 10
39
+
40
+
41
def test_gc_case_insensitive():
    """Lower-case g/c count towards GC just like upper-case G/C."""
    stats = SeqStats(name="mixed")
    # a c g t A C G T -> c, g, C, G = 4 GC bases out of 8
    stats.add("acgtACGT")
    assert stats.gc_count == 4
    assert stats.gc_pct == pytest.approx(50.0)
46
+
47
+
48
def test_n50_n90():
    """N50 and N90 for lengths 16, 10 and 4 (30 bases in total)."""
    stats = SeqStats(name="n50")
    for length in (16, 10, 4):
        stats.add("G" * length)
    # N50: half of 30 is 15; the first sequence (16) already covers it.
    assert stats.n50 == 16
    # N90: 90% of 30 is 27; 16 -> 26 -> 30, reached only with the 4 bp one.
    assert stats.n90 == 4
58
+
59
+
60
def test_n50_classic_example():
    """Classic example: lengths 2..10 sum to 54, so the N50 target is 27."""
    stats = SeqStats(name="classic")
    for length in range(2, 11):
        stats.add("A" * length)
    # Descending: 10, 9, 8 -> running sum 10, 19, 27; 27 >= 27 at length 8.
    assert stats.n50 == 8
67
+
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # Parser tests
71
+ # ---------------------------------------------------------------------------
72
+
73
+ FASTA_TEXT = ">seq1\nACGTACGTGG\n>seq2\nGGGGCCCCAAAATTTT\n>seq3\nATAT\n"
74
+ FASTQ_TEXT = "@read1\nACGTACGTGC\n+\nIIIIIIIIII\n@read2\nGGCCGGCCAA\n+\nIIIIIIIIII\n"
75
+
76
+
77
def test_parse_fasta(tmp_path):
    """Three single-line FASTA records are counted and measured."""
    fasta = tmp_path / "test.fa"
    fasta.write_text(FASTA_TEXT)
    stats = parse_fasta(fasta)
    assert stats.n_seqs == 3
    assert stats.total_bases == 30  # 10 + 16 + 4
    assert stats.min_len == 4
    assert stats.max_len == 16
    assert stats.gc_pct == pytest.approx(46.666, abs=0.01)
86
+
87
+
88
def test_parse_fasta_multiline(tmp_path):
    """A sequence wrapped over several lines is joined into one record."""
    wrapped = tmp_path / "wrapped.fa"
    wrapped.write_text(">seq1\nACGT\nACGT\nGG\n")
    stats = parse_fasta(wrapped)
    assert stats.n_seqs == 1
    assert stats.total_bases == 10
95
+
96
+
97
def test_parse_fastq(tmp_path):
    """Two FASTQ reads of 10 bp each are counted with the expected GC%."""
    fastq = tmp_path / "test.fq"
    fastq.write_text(FASTQ_TEXT)
    stats = parse_fastq(fastq)
    assert stats.n_seqs == 2
    assert stats.total_bases == 20
    assert stats.gc_pct == pytest.approx(70.0)
104
+
105
+
106
def test_parse_gzipped_fastq(tmp_path):
    """A gzip-compressed FASTQ file is parsed like the plain one."""
    packed = tmp_path / "test.fq.gz"
    with gzip.open(packed, "wt") as out:
        out.write(FASTQ_TEXT)
    stats = parse_fastq(packed)
    assert stats.n_seqs == 2
    assert stats.total_bases == 20
113
+
114
+
115
def test_parse_file_detects_fasta(tmp_path):
    """``parse_file`` routes a ``.fasta`` file to the FASTA parser."""
    fasta = tmp_path / "genome.fasta"
    fasta.write_text(FASTA_TEXT)
    stats = parse_file(fasta)
    assert stats.n_seqs == 3
120
+
121
+
122
def test_parse_file_detects_fastq_gz(tmp_path):
    """``parse_file`` routes a ``.fastq.gz`` file to the FASTQ parser."""
    packed = tmp_path / "reads.fastq.gz"
    with gzip.open(packed, "wt") as out:
        out.write(FASTQ_TEXT)
    stats = parse_file(packed)
    assert stats.n_seqs == 2
127
+ assert s.n_seqs == 2
128
+
129
+
130
def test_parse_fasta_blank_lines(tmp_path):
    """Blank lines between and after records do not affect the counts."""
    fasta = tmp_path / "blanks.fa"
    fasta.write_text(">seq1\nACGT\n\n>seq2\nGGGG\n\n")
    stats = parse_fasta(fasta)
    assert stats.n_seqs == 2
    assert stats.total_bases == 8