PyPI - seqstatx - Versions diffs - 0.1.0__tar.gz - Mend

seqstatx 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

seqstatx-0.1.0/PKG-INFO +90 -0
seqstatx-0.1.0/README.md +74 -0
seqstatx-0.1.0/pyproject.toml +29 -0
seqstatx-0.1.0/seqstats/__init__.py +1 -0
seqstatx-0.1.0/seqstats/cli.py +84 -0
seqstatx-0.1.0/seqstats/core.py +125 -0
seqstatx-0.1.0/seqstatx.egg-info/PKG-INFO +90 -0
seqstatx-0.1.0/seqstatx.egg-info/SOURCES.txt +13 -0
seqstatx-0.1.0/seqstatx.egg-info/dependency_links.txt +1 -0
seqstatx-0.1.0/seqstatx.egg-info/entry_points.txt +2 -0
seqstatx-0.1.0/seqstatx.egg-info/requires.txt +4 -0
seqstatx-0.1.0/seqstatx.egg-info/top_level.txt +1 -0
seqstatx-0.1.0/setup.cfg +4 -0
seqstatx-0.1.0/tests/test_cli.py +57 -0
seqstatx-0.1.0/tests/test_core.py +135 -0

seqstatx-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,90 @@
+Metadata-Version: 2.4
+Name: seqstatx
+Version: 0.1.0
+Summary: Fast sequence statistics for FASTA/FASTQ files — N50, GC%, length distributions and more
+Author-email: Wendy Bui <wendybuinta@gmail.com>
+License: MIT
+Keywords: bioinformatics,genomics,fasta,fastq,sequence,qc
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+# seqstats
+[![CI](https://github.com/perhapsstrawberries/seqstats/actions/workflows/ci.yml/badge.svg)](https://github.com/perhapsstrawberries/seqstats/actions/workflows/ci.yml)
+[![PyPI](https://img.shields.io/pypi/v/seqstatx)](https://pypi.org/project/seqstatx/)
+![Python](https://img.shields.io/badge/python-3.10%2B-blue)
+![License](https://img.shields.io/badge/license-MIT-green)
+Fast sequence statistics for FASTA and FASTQ files — works on plain or gzipped inputs, no dependencies.
+```
+file                            seqs      total_bp      gc%      mean_len    min_len   max_len   N50         N90
+-----------------------------------------------------------------------------------------------------------------
+GRCh38.primary_assembly.fa      194       3,088,286,401  40.93   15,918,992  970       248,956,422  153,373,213  40,103,529
+SRR10045678_1.fastq.gz          10000000  1,510,000,000  50.21   151.0       151       151          151          151
+```
+## Install
+```bash
+pip install seqstatx
+```
+Or for development:
+```bash
+git clone https://github.com/perhapsstrawberries/seqstats.git
+cd seqstats
+pip install -e .
+```
+## Usage
+```bash
+# single file
+seqstatx genome.fa
+# multiple files, gzipped FASTQ
+seqstatx sample1.fastq.gz sample2.fastq.gz
+# TSV output for downstream parsing
+seqstatx --tsv *.fa > stats.tsv
+# pipe to column for alignment
+seqstatx --tsv *.fastq.gz | column -t
+```
+## Metrics
+| Column | Description |
+|--------|-------------|
+| `seqs` | Number of sequences / reads |
+| `total_bp` | Total base pairs |
+| `gc%` | GC content (%) |
+| `mean_len` | Mean sequence length |
+| `min_len` / `max_len` | Shortest / longest sequence |
+| `N50` | Length L such that sequences of length ≥ L together contain at least 50% of all bases |
+| `N90` | Length L such that sequences of length ≥ L together contain at least 90% of all bases |
+## Supported formats
+| Extension | Format |
+|-----------|--------|
+| `.fa` `.fna` `.fasta` | FASTA |
+| `.fq` `.fastq` | FASTQ |
+| `.fa.gz` `.fastq.gz` etc. | gzipped variants |
+## Why
+Existing tools (seqkit, seqtk) are great but require installation of compiled binaries.
+`seqstats` is pure Python 3.10+, zero dependencies, pip-installable from any HPC or Conda environment.
+## License
+MIT

seqstatx-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,74 @@
+# seqstats
+[![CI](https://github.com/perhapsstrawberries/seqstats/actions/workflows/ci.yml/badge.svg)](https://github.com/perhapsstrawberries/seqstats/actions/workflows/ci.yml)
+[![PyPI](https://img.shields.io/pypi/v/seqstatx)](https://pypi.org/project/seqstatx/)
+![Python](https://img.shields.io/badge/python-3.10%2B-blue)
+![License](https://img.shields.io/badge/license-MIT-green)
+Fast sequence statistics for FASTA and FASTQ files — works on plain or gzipped inputs, no dependencies.
+```
+file                            seqs      total_bp      gc%      mean_len    min_len   max_len   N50         N90
+-----------------------------------------------------------------------------------------------------------------
+GRCh38.primary_assembly.fa      194       3,088,286,401  40.93   15,918,992  970       248,956,422  153,373,213  40,103,529
+SRR10045678_1.fastq.gz          10000000  1,510,000,000  50.21   151.0       151       151          151          151
+```
+## Install
+```bash
+pip install seqstatx
+```
+Or for development:
+```bash
+git clone https://github.com/perhapsstrawberries/seqstats.git
+cd seqstats
+pip install -e .
+```
+## Usage
+```bash
+# single file
+seqstatx genome.fa
+# multiple files, gzipped FASTQ
+seqstatx sample1.fastq.gz sample2.fastq.gz
+# TSV output for downstream parsing
+seqstatx --tsv *.fa > stats.tsv
+# pipe to column for alignment
+seqstatx --tsv *.fastq.gz | column -t
+```
+## Metrics
+| Column | Description |
+|--------|-------------|
+| `seqs` | Number of sequences / reads |
+| `total_bp` | Total base pairs |
+| `gc%` | GC content (%) |
+| `mean_len` | Mean sequence length |
+| `min_len` / `max_len` | Shortest / longest sequence |
+| `N50` | Length L such that sequences of length ≥ L together contain at least 50% of all bases |
+| `N90` | Length L such that sequences of length ≥ L together contain at least 90% of all bases |
+## Supported formats
+| Extension | Format |
+|-----------|--------|
+| `.fa` `.fna` `.fasta` | FASTA |
+| `.fq` `.fastq` | FASTQ |
+| `.fa.gz` `.fastq.gz` etc. | gzipped variants |
+## Why
+Existing tools (seqkit, seqtk) are great but require installation of compiled binaries.
+`seqstats` is pure Python 3.10+, zero dependencies, pip-installable from any HPC or Conda environment.
+## License
+MIT

seqstatx-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,29 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "seqstatx"
+version = "0.1.0"
+description = "Fast sequence statistics for FASTA/FASTQ files — N50, GC%, length distributions and more"
+readme = "README.md"
+license = { text = "MIT" }
+requires-python = ">=3.10"
+authors = [{ name = "Wendy Bui", email = "wendybuinta@gmail.com" }]
+keywords = ["bioinformatics", "genomics", "fasta", "fastq", "sequence", "qc"]
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Topic :: Scientific/Engineering :: Bio-Informatics",
+]
+dependencies = []
+[project.scripts]
+seqstatx = "seqstats.cli:main"
+[project.optional-dependencies]
+dev = ["pytest", "pytest-cov"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+addopts = "-q"

seqstatx-0.1.0/seqstats/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.1.0"

seqstatx-0.1.0/seqstats/cli.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""Command-line interface for seqstats."""
+from __future__ import annotations
+import argparse
+import sys
+from pathlib import Path
+from seqstats.core import SeqStats, parse_file
+from seqstats import __version__
+# Column names in output order; shared by the aligned table and the TSV writer.
+_COLS = ["file", "seqs", "total_bp", "gc%", "mean_len", "min_len", "max_len", "N50", "N90"]
+# Minimum display width of each table column, in the same order as _COLS.
+_COL_W = [30, 8, 12, 7, 10, 8, 8, 10, 10]
+# Longest file name that fits the first table column with one pad character.
+_NAME_MAX = _COL_W[0] - 1
+def _fmt(stats: SeqStats, max_name: int | None = _NAME_MAX) -> list[str]:
+    """Render one SeqStats record as strings ordered like _COLS.
+
+    Args:
+        stats: Statistics to render.
+        max_name: Maximum number of characters kept from the file name;
+            None keeps the full name.
+
+    Returns:
+        One string per column: GC% with two decimals, mean length with one,
+        everything else as plain integers.
+    """
+    return [
+        stats.name[:max_name],
+        str(stats.n_seqs),
+        str(stats.total_bases),
+        f"{stats.gc_pct:.2f}",
+        f"{stats.mean_len:.1f}",
+        str(stats.min_len),
+        str(stats.max_len),
+        str(stats.n50),
+        str(stats.n90),
+    ]
+def _header() -> str:
+    """Return the heading line of the aligned table."""
+    return "  ".join(c.ljust(w) for c, w in zip(_COLS, _COL_W))
+def _row(stats: SeqStats) -> str:
+    """Return one aligned table line; the file name is cut to fit its column."""
+    return "  ".join(v.ljust(w) for v, w in zip(_fmt(stats), _COL_W))
+def _tsv_header() -> str:
+    """Return the tab-separated heading line."""
+    return "\t".join(_COLS)
+def _tsv_row(stats: SeqStats) -> str:
+    """Return one tab-separated line.
+
+    The file name is written in full: TSV output is meant for downstream
+    parsing, where a truncated name could no longer identify its file.
+    """
+    return "\t".join(_fmt(stats, max_name=None))
+def main(argv: list[str] | None = None) -> None:
+    """Run the seqstatx command-line interface.
+
+    Args:
+        argv: Arguments to parse; None lets argparse read the process
+            arguments.
+
+    Every FILE that does not exist is reported on stderr and the process
+    exits with status 1 before any statistics are printed.
+    """
+    usage_examples = (
+        "examples:\n"
+        "  seqstatx genome.fa\n"
+        "  seqstatx *.fastq.gz\n"
+        "  seqstatx --tsv reads.fq.gz > stats.tsv\n"
+        "  seqstatx --tsv sample1.fa sample2.fa | column -t"
+    )
+    parser = argparse.ArgumentParser(
+        prog="seqstatx",
+        description="Compute sequence statistics for FASTA/FASTQ files.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=usage_examples,
+    )
+    parser.add_argument("files", nargs="+", type=Path, metavar="FILE")
+    parser.add_argument("--tsv", action="store_true", help="output tab-separated values")
+    parser.add_argument("--version", action="version", version=f"seqstatx {__version__}")
+    args = parser.parse_args(argv)
+    # Check every path up front so all missing files are listed in one run.
+    absent = [candidate for candidate in args.files if not candidate.exists()]
+    for candidate in absent:
+        print(f"[seqstats] file not found: {candidate}", file=sys.stderr)
+    if absent:
+        sys.exit(1)
+    # Pick the heading lines and the row renderer once, then stream the rows.
+    if args.tsv:
+        heading = [_tsv_header()]
+        render = _tsv_row
+    else:
+        # The rule spans all columns plus the two-space gaps between them.
+        rule = "-" * sum(_COL_W) + "-" * (2 * (len(_COL_W) - 1))
+        heading = [_header(), rule]
+        render = _row
+    for line in heading:
+        print(line)
+    for path in args.files:
+        print(render(parse_file(path)))
+if __name__ == "__main__":
+    main()

seqstatx-0.1.0/seqstats/core.py ADDED Viewed

@@ -0,0 +1,125 @@
+"""Core sequence statistics logic."""
+from __future__ import annotations
+import gzip
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+@dataclass
+class SeqStats:
+    """Accumulated length and GC statistics for a set of sequences.
+
+    Sequences are fed in one at a time through add(); the summary values
+    (gc_pct, mean_len, min_len, max_len, n50, n90) are derived on access.
+    """
+    # Label for this set of sequences.
+    name: str
+    # Number of sequences added so far.
+    n_seqs: int = 0
+    # Sum of the lengths of all added sequences.
+    total_bases: int = 0
+    # Number of G/C characters (upper or lower case) seen so far.
+    gc_count: int = 0
+    # Length of every added sequence, in insertion order.
+    _lengths: list[int] = field(default_factory=list, repr=False)
+    def add(self, seq: str) -> None:
+        """Fold one sequence into the running totals."""
+        n = len(seq)
+        self.n_seqs += 1
+        self.total_bases += n
+        self.gc_count += sum(seq.count(base) for base in "GCgc")
+        self._lengths.append(n)
+    @property
+    def gc_pct(self) -> float:
+        """GC content as a percentage of total_bases (0.0 when empty)."""
+        return 100.0 * self.gc_count / self.total_bases if self.total_bases else 0.0
+    @property
+    def mean_len(self) -> float:
+        """Mean sequence length (0.0 when no sequences were added)."""
+        return self.total_bases / self.n_seqs if self.n_seqs else 0.0
+    @property
+    def min_len(self) -> int:
+        """Shortest sequence length (0 when no sequences were added)."""
+        return min(self._lengths) if self._lengths else 0
+    @property
+    def max_len(self) -> int:
+        """Longest sequence length (0 when no sequences were added)."""
+        return max(self._lengths) if self._lengths else 0
+    def _nx(self, numerator: int, denominator: int) -> int:
+        """Return the Nx length for the fraction numerator/denominator.
+
+        Walks the lengths from longest to shortest and returns the first
+        length at which the running total reaches that fraction of
+        total_bases, or 0 if it never does (for example with no sequences).
+        """
+        # Compare denominator * running >= numerator * total in integers so
+        # the threshold is exact; a float threshold loses precision once the
+        # totals no longer fit a double exactly.
+        target = numerator * self.total_bases
+        running = 0
+        for length in sorted(self._lengths, reverse=True):
+            running += length
+            if denominator * running >= target:
+                return length
+        return 0
+    @property
+    def n50(self) -> int:
+        """Length at which the longest sequences cover at least 50% of bases."""
+        return self._nx(1, 2)
+    @property
+    def n90(self) -> int:
+        """Length at which the longest sequences cover at least 90% of bases."""
+        return self._nx(9, 10)
+def _open(path: Path):
+    """Open *path* for text reading, decompressing gzip files on the fly.
+
+    Args:
+        path: File to open; a ".gz" or ".gzip" suffix (any letter case)
+            selects gzip decompression.
+
+    Returns:
+        A text-mode file object; the caller is responsible for closing it.
+    """
+    # Lower-case the suffix so READS.FQ.GZ is decompressed like reads.fq.gz;
+    # parse_file() matches extensions case-insensitively as well.
+    if path.suffix.lower() in (".gz", ".gzip"):
+        return gzip.open(path, "rt")
+    return open(path, "r")
+def parse_fasta(path: Path) -> SeqStats:
+    """Collect statistics for every sequence in a FASTA file.
+
+    Lines of one record are concatenated, so wrapped sequences count as a
+    single sequence. Blank lines are ignored, and a header that is followed
+    by no sequence lines does not add a sequence.
+    """
+    stats = SeqStats(name=path.name)
+    chunks: list[str] = []
+    with _open(path) as handle:
+        for raw in handle:
+            text = raw.rstrip()
+            if text.startswith(">"):
+                # New record: commit whatever was collected for the previous one.
+                if chunks:
+                    stats.add("".join(chunks))
+                    chunks = []
+            elif text:
+                chunks.append(text)
+    # The last record has no following header to trigger its commit.
+    if chunks:
+        stats.add("".join(chunks))
+    return stats
+def parse_fastq(path: Path) -> SeqStats:
+    """Collect statistics for every read in a FASTQ file.
+
+    Each record is read as exactly four lines (header, sequence, "+"
+    separator, quality), so sequences wrapped over several lines are not
+    supported. Reads whose sequence line is empty are not counted.
+    """
+    stats = SeqStats(name=path.name)
+    with _open(path) as fh:
+        while True:
+            header = fh.readline()
+            if not header:
+                break
+            if not header.strip():
+                # A blank line is not a header; consuming it as one would
+                # shift the four-line framing of every following record.
+                continue
+            seq = fh.readline().rstrip()
+            fh.readline()  # "+" separator line
+            fh.readline()  # quality line
+            if seq:
+                stats.add(seq)
+    return stats
+def parse_file(path: Path) -> SeqStats:
+    """Pick a parser from the file extension and return its statistics.
+
+    The extension is matched case-insensitively. Files with an unrecognised
+    extension are parsed as FASTA; if that raises, a message is written to
+    stderr and the process exits with status 1.
+    """
+    fasta_exts = (".fa", ".fna", ".fasta", ".fa.gz", ".fna.gz", ".fasta.gz")
+    fastq_exts = (".fq", ".fastq", ".fq.gz", ".fastq.gz")
+    lowered = path.name.lower()
+    if lowered.endswith(fasta_exts):
+        return parse_fasta(path)
+    if lowered.endswith(fastq_exts):
+        return parse_fastq(path)
+    # Unknown extension: FASTA is the fallback guess.
+    try:
+        result = parse_fasta(path)
+    except Exception:
+        print(f"[seqstats] could not detect format for {path.name}", file=sys.stderr)
+        sys.exit(1)
+    return result

seqstatx-0.1.0/seqstatx.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,90 @@
+Metadata-Version: 2.4
+Name: seqstatx
+Version: 0.1.0
+Summary: Fast sequence statistics for FASTA/FASTQ files — N50, GC%, length distributions and more
+Author-email: Wendy Bui <wendybuinta@gmail.com>
+License: MIT
+Keywords: bioinformatics,genomics,fasta,fastq,sequence,qc
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+# seqstats
+[![CI](https://github.com/perhapsstrawberries/seqstats/actions/workflows/ci.yml/badge.svg)](https://github.com/perhapsstrawberries/seqstats/actions/workflows/ci.yml)
+[![PyPI](https://img.shields.io/pypi/v/seqstatx)](https://pypi.org/project/seqstatx/)
+![Python](https://img.shields.io/badge/python-3.10%2B-blue)
+![License](https://img.shields.io/badge/license-MIT-green)
+Fast sequence statistics for FASTA and FASTQ files — works on plain or gzipped inputs, no dependencies.
+```
+file                            seqs      total_bp      gc%      mean_len    min_len   max_len   N50         N90
+-----------------------------------------------------------------------------------------------------------------
+GRCh38.primary_assembly.fa      194       3,088,286,401  40.93   15,918,992  970       248,956,422  153,373,213  40,103,529
+SRR10045678_1.fastq.gz          10000000  1,510,000,000  50.21   151.0       151       151          151          151
+```
+## Install
+```bash
+pip install seqstatx
+```
+Or for development:
+```bash
+git clone https://github.com/perhapsstrawberries/seqstats.git
+cd seqstats
+pip install -e .
+```
+## Usage
+```bash
+# single file
+seqstatx genome.fa
+# multiple files, gzipped FASTQ
+seqstatx sample1.fastq.gz sample2.fastq.gz
+# TSV output for downstream parsing
+seqstatx --tsv *.fa > stats.tsv
+# pipe to column for alignment
+seqstatx --tsv *.fastq.gz | column -t
+```
+## Metrics
+| Column | Description |
+|--------|-------------|
+| `seqs` | Number of sequences / reads |
+| `total_bp` | Total base pairs |
+| `gc%` | GC content (%) |
+| `mean_len` | Mean sequence length |
+| `min_len` / `max_len` | Shortest / longest sequence |
+| `N50` | Length L such that sequences of length ≥ L together contain at least 50% of all bases |
+| `N90` | Length L such that sequences of length ≥ L together contain at least 90% of all bases |
+## Supported formats
+| Extension | Format |
+|-----------|--------|
+| `.fa` `.fna` `.fasta` | FASTA |
+| `.fq` `.fastq` | FASTQ |
+| `.fa.gz` `.fastq.gz` etc. | gzipped variants |
+## Why
+Existing tools (seqkit, seqtk) are great but require installation of compiled binaries.
+`seqstats` is pure Python 3.10+, zero dependencies, pip-installable from any HPC or Conda environment.
+## License
+MIT

seqstatx-0.1.0/seqstatx.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,13 @@
+README.md
+pyproject.toml
+seqstats/__init__.py
+seqstats/cli.py
+seqstats/core.py
+seqstatx.egg-info/PKG-INFO
+seqstatx.egg-info/SOURCES.txt
+seqstatx.egg-info/dependency_links.txt
+seqstatx.egg-info/entry_points.txt
+seqstatx.egg-info/requires.txt
+seqstatx.egg-info/top_level.txt
+tests/test_cli.py
+tests/test_core.py

seqstatx-0.1.0/seqstatx.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

seqstatx-0.1.0/seqstatx.egg-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ seqstatx = seqstats.cli:main

seqstatx-0.1.0/seqstatx.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,4 @@
+[dev]
+pytest
+pytest-cov

seqstatx-0.1.0/seqstatx.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ seqstats

seqstatx-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

seqstatx-0.1.0/tests/test_cli.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""Tests for the seqstats CLI."""
+import subprocess
+import sys
+import pytest
+from seqstats.cli import main
+# Three records of 10, 16 and 4 bases.
+FASTA_TEXT = ">seq1\nACGTACGTGG\n>seq2\nGGGGCCCCAAAATTTT\n>seq3\nATAT\n"
+def test_cli_default_output(tmp_path, capsys):
+    """The aligned table shows the file name and the column headings."""
+    fasta = tmp_path / "test.fa"
+    fasta.write_text(FASTA_TEXT)
+    main([str(fasta)])
+    printed = capsys.readouterr().out
+    for expected in ("test.fa", "N50", "gc%"):
+        assert expected in printed
+def test_cli_tsv_output(tmp_path, capsys):
+    """--tsv prints a tab-separated header and a data row of equal width."""
+    fasta = tmp_path / "test.fa"
+    fasta.write_text(FASTA_TEXT)
+    main(["--tsv", str(fasta)])
+    rows = capsys.readouterr().out.strip().split("\n")
+    header_fields = rows[0].split("\t")
+    data_fields = rows[1].split("\t")
+    assert rows[0].startswith("file\tseqs")
+    # The data row must carry exactly one value per header column.
+    assert len(data_fields) == len(header_fields)
+def test_cli_missing_file_exits(tmp_path):
+    """A path that does not exist makes the CLI exit with status 1."""
+    absent = tmp_path / "nope.fa"
+    with pytest.raises(SystemExit) as caught:
+        main([str(absent)])
+    assert caught.value.code == 1
+def test_cli_multiple_files(tmp_path, capsys):
+    """Two input files yield one header line followed by two data rows."""
+    inputs = [tmp_path / "a.fa", tmp_path / "b.fa"]
+    for fasta in inputs:
+        fasta.write_text(FASTA_TEXT)
+    main(["--tsv", *(str(fasta) for fasta in inputs)])
+    rows = capsys.readouterr().out.strip().split("\n")
+    assert len(rows) == 3  # header + 2 rows
+def test_cli_version():
+    """`python -m seqstats.cli --version` succeeds and names the program."""
+    result = subprocess.run(
+        [sys.executable, "-m", "seqstats.cli", "--version"],
+        capture_output=True,
+        text=True,
+    )
+    # Check the exit status too, so a crash that still printed is caught.
+    assert result.returncode == 0
+    assert "seqstatx" in result.stdout

seqstatx-0.1.0/tests/test_core.py ADDED Viewed

@@ -0,0 +1,135 @@
+"""Tests for seqstats.core — values hand-verified."""
+import gzip
+from pathlib import Path
+import pytest
+from seqstats.core import SeqStats, parse_fasta, parse_fastq, parse_file
+# ---------------------------------------------------------------------------
+# SeqStats unit tests
+# ---------------------------------------------------------------------------
+def test_empty_stats():
+    """A freshly created SeqStats reports zero for every metric."""
+    fresh = SeqStats(name="empty")
+    assert (fresh.n_seqs, fresh.total_bases) == (0, 0)
+    assert fresh.gc_pct == 0.0
+    assert fresh.mean_len == 0.0
+    assert (fresh.min_len, fresh.max_len) == (0, 0)
+    assert (fresh.n50, fresh.n90) == (0, 0)
+def test_single_sequence():
+    """One 10 bp sequence: every length metric is 10 and GC is 60%."""
+    stats = SeqStats(name="one")
+    # A C G T A C G T G G -> 2 C + 4 G = 6 GC bases out of 10
+    stats.add("ACGTACGTGG")
+    assert stats.n_seqs == 1
+    assert stats.total_bases == 10
+    assert stats.gc_count == 6
+    assert stats.gc_pct == pytest.approx(60.0)
+    assert stats.mean_len == 10.0
+    assert stats.min_len == 10
+    assert stats.max_len == 10
+    assert stats.n50 == 10
+    assert stats.n90 == 10
+def test_gc_case_insensitive():
+    """Lower-case g/c count towards GC exactly like upper-case G/C."""
+    stats = SeqStats(name="mixed")
+    # c, g, C and G are the GC bases: 4 of the 8 characters
+    stats.add("acgtACGT")
+    assert stats.gc_count == 4
+    assert stats.gc_pct == pytest.approx(50.0)
+def test_n50_n90():
+    """N50 and N90 for lengths 16, 10 and 4 (30 bases in total)."""
+    stats = SeqStats(name="n50")
+    for length in (16, 10, 4):
+        stats.add("G" * length)
+    # N50: half of 30 is 15, and the longest sequence alone (16) reaches it
+    assert stats.n50 == 16
+    # N90: 90% of 30 is 27; running totals 16, 26, 30 -> reached at length 4
+    assert stats.n90 == 4
+def test_n50_classic_example():
+    """Textbook example: lengths 2..10 sum to 54, so N50 is 8."""
+    stats = SeqStats(name="classic")
+    for length in range(2, 11):
+        stats.add("A" * length)
+    # Longest first: 10, 9, 8 -> running totals 10, 19, 27; 27 >= 54 / 2
+    assert stats.n50 == 8
+# ---------------------------------------------------------------------------
+# Parser tests
+# ---------------------------------------------------------------------------
+# FASTA fixture: three records of 10, 16 and 4 bases.
+FASTA_TEXT = ">seq1\nACGTACGTGG\n>seq2\nGGGGCCCCAAAATTTT\n>seq3\nATAT\n"
+# FASTQ fixture: two reads of 10 bases each.
+FASTQ_TEXT = "@read1\nACGTACGTGC\n+\nIIIIIIIIII\n@read2\nGGCCGGCCAA\n+\nIIIIIIIIII\n"
+def test_parse_fasta(tmp_path):
+    """parse_fasta counts the records and reports lengths and GC content."""
+    fasta = tmp_path / "test.fa"
+    fasta.write_text(FASTA_TEXT)
+    stats = parse_fasta(fasta)
+    assert stats.n_seqs == 3
+    assert stats.total_bases == 30  # 10 + 16 + 4
+    assert (stats.min_len, stats.max_len) == (4, 16)
+    assert stats.gc_pct == pytest.approx(46.666, abs=0.01)
+def test_parse_fasta_multiline(tmp_path):
+    """A sequence wrapped over several lines is joined into one record."""
+    wrapped = tmp_path / "wrapped.fa"
+    wrapped.write_text(">seq1\nACGT\nACGT\nGG\n")
+    stats = parse_fasta(wrapped)
+    assert (stats.n_seqs, stats.total_bases) == (1, 10)
+def test_parse_fastq(tmp_path):
+    """parse_fastq counts both reads and their combined GC content."""
+    fastq = tmp_path / "test.fq"
+    fastq.write_text(FASTQ_TEXT)
+    stats = parse_fastq(fastq)
+    assert (stats.n_seqs, stats.total_bases) == (2, 20)
+    assert stats.gc_pct == pytest.approx(70.0)
+def test_parse_gzipped_fastq(tmp_path):
+    """parse_fastq reads gzip-compressed input."""
+    archive = tmp_path / "test.fq.gz"
+    with gzip.open(archive, "wt") as out:
+        out.write(FASTQ_TEXT)
+    stats = parse_fastq(archive)
+    assert (stats.n_seqs, stats.total_bases) == (2, 20)
+def test_parse_file_detects_fasta(tmp_path):
+    """parse_file treats a .fasta file as FASTA."""
+    genome = tmp_path / "genome.fasta"
+    genome.write_text(FASTA_TEXT)
+    assert parse_file(genome).n_seqs == 3
+def test_parse_file_detects_fastq_gz(tmp_path):
+    """parse_file treats a .fastq.gz file as gzipped FASTQ."""
+    reads = tmp_path / "reads.fastq.gz"
+    with gzip.open(reads, "wt") as out:
+        out.write(FASTQ_TEXT)
+    assert parse_file(reads).n_seqs == 2
+def test_parse_fasta_blank_lines(tmp_path):
+    """Blank lines between and after records are ignored."""
+    spaced = tmp_path / "blanks.fa"
+    spaced.write_text(">seq1\nACGT\n\n>seq2\nGGGG\n\n")
+    stats = parse_fasta(spaced)
+    assert (stats.n_seqs, stats.total_bases) == (2, 8)