bioseqkit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. bioseqkit-0.1.0/.github/workflows/ci.yml +28 -0
  2. bioseqkit-0.1.0/.gitignore +17 -0
  3. bioseqkit-0.1.0/.python-version +1 -0
  4. bioseqkit-0.1.0/LICENSE +21 -0
  5. bioseqkit-0.1.0/PKG-INFO +141 -0
  6. bioseqkit-0.1.0/README.md +122 -0
  7. bioseqkit-0.1.0/docs/conf.py +19 -0
  8. bioseqkit-0.1.0/docs/index.rst +36 -0
  9. bioseqkit-0.1.0/docs/quickstart.md +21 -0
  10. bioseqkit-0.1.0/environment.yml +17 -0
  11. bioseqkit-0.1.0/examples/demo.ipynb +222 -0
  12. bioseqkit-0.1.0/examples/example_data/sample.fa +5 -0
  13. bioseqkit-0.1.0/homework.md +3 -0
  14. bioseqkit-0.1.0/pyproject.toml +40 -0
  15. bioseqkit-0.1.0/references.bib +61 -0
  16. bioseqkit-0.1.0/report.pdf +1598 -2
  17. bioseqkit-0.1.0/report.typ +214 -0
  18. bioseqkit-0.1.0/requirements.txt +9 -0
  19. bioseqkit-0.1.0/src/bioseqkit/__init__.py +57 -0
  20. bioseqkit-0.1.0/src/bioseqkit/cli.py +140 -0
  21. bioseqkit-0.1.0/src/bioseqkit/entrez.py +39 -0
  22. bioseqkit-0.1.0/src/bioseqkit/index.py +164 -0
  23. bioseqkit-0.1.0/src/bioseqkit/io.py +182 -0
  24. bioseqkit-0.1.0/src/bioseqkit/kmer.py +141 -0
  25. bioseqkit-0.1.0/src/bioseqkit/py.typed +0 -0
  26. bioseqkit-0.1.0/src/bioseqkit/stats.py +111 -0
  27. bioseqkit-0.1.0/src/bioseqkit/transform.py +95 -0
  28. bioseqkit-0.1.0/tests/conftest.py +50 -0
  29. bioseqkit-0.1.0/tests/test_cli.py +53 -0
  30. bioseqkit-0.1.0/tests/test_index.py +53 -0
  31. bioseqkit-0.1.0/tests/test_io.py +67 -0
  32. bioseqkit-0.1.0/tests/test_kmer.py +60 -0
  33. bioseqkit-0.1.0/tests/test_stats.py +41 -0
  34. bioseqkit-0.1.0/tests/test_transform.py +40 -0
  35. bioseqkit-0.1.0/uv.lock +1918 -0
  36. bioseqkit-0.1.0//345/274/200/351/242/230/346/212/245/345/221/212.pdf +5060 -3
@@ -0,0 +1,28 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - name: Set up Python ${{ matrix.python-version }}
18
+ uses: actions/setup-python@v5
19
+ with:
20
+ python-version: ${{ matrix.python-version }}
21
+ - name: Install
22
+ run: |
23
+ python -m pip install --upgrade pip
24
+ pip install -e ".[viz]" pytest ruff
25
+ - name: Lint
26
+ run: ruff check src tests
27
+ - name: Test
28
+ run: pytest -q
@@ -0,0 +1,17 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # Generated artifacts
13
+ *.fai
14
+ .pytest_cache/
15
+ .ruff_cache/
16
+ docs/_build/
17
+ .ipynb_checkpoints/
@@ -0,0 +1 @@
1
+ 3.14
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jilai Cheng
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,141 @@
1
+ Metadata-Version: 2.4
2
+ Name: bioseqkit
3
+ Version: 0.1.0
4
+ Summary: A lightweight, dependency-free biological sequence processing toolkit (FASTA/FASTQ, stats, k-mer, minimizer, indexing).
5
+ Author-email: Jilai Cheng <chengjilai@sjtu.edu.cn>
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Keywords: bioinformatics,fasta,fastq,kmer,minimizer,sequence
9
+ Requires-Python: >=3.10
10
+ Provides-Extra: docs
11
+ Requires-Dist: myst-parser>=2.0; extra == 'docs'
12
+ Requires-Dist: sphinx>=7.0; extra == 'docs'
13
+ Provides-Extra: net
14
+ Requires-Dist: requests>=2.28; extra == 'net'
15
+ Provides-Extra: viz
16
+ Requires-Dist: matplotlib>=3.7; extra == 'viz'
17
+ Requires-Dist: seaborn>=0.12; extra == 'viz'
18
+ Description-Content-Type: text/markdown
19
+
20
+ # bioseqkit
21
+
22
+ A lightweight, **dependency-free** biological sequence processing toolkit built
23
+ from scratch in pure Python. `bioseqkit` implements FASTA/FASTQ parsing,
24
+ sequence statistics, transformations, k-mer / minimizer analysis and FAI-like
25
+ random-access indexing, exposed both as a Python API and a command-line tool.
26
+
27
+ The project is a teaching implementation for **BIO2502 (Programming Languages
28
+ for Biological Computing)**: it deliberately re-implements the low-level I/O,
29
+ streaming and indexing logic instead of relying on Biopython, so the core
30
+ design patterns of bioinformatics data handling are made explicit.
31
+
32
+ ## Features
33
+
34
+ - **Streaming FASTA/FASTQ parsers** (`io`) — generator based, constant memory,
35
+ transparent gzip support, Phred quality decoding.
36
+ - **Statistics** (`stats`) — length distribution, N50, GC content, N-base
37
+ ratio, base-composition matrix.
38
+ - **Transformations** (`transform`) — reverse complement (IUPAC aware) and
39
+ six-frame translation with the standard genetic code.
40
+ - **k-mer analysis** (`kmer`) — counting, top-k, canonical k-mers,
41
+ **multi-process** parallel counting, and **minimizer** sampling.
42
+ - **FAI-like indexing** (`index`) — `samtools faidx`-compatible index for
43
+ `chr:start-end` random access without scanning the whole file.
44
+ - **CLI** (`cli`) — `stats`, `revcomp`, `translate`, `kmer`, `minimizer`,
45
+ `index`, `fetch`.
46
+ - **NCBI download** (`entrez`) — fetch reference sequences via E-utilities
47
+ (standard-library HTTP only).
48
+
49
+ ## Project layout
50
+
51
+ ```
52
+ bioseqkit/
53
+ ├── pyproject.toml # src-layout, PEP 621 metadata, console script
54
+ ├── README.md
55
+ ├── LICENSE
56
+ ├── environment.yml # conda environment
57
+ ├── requirements.txt
58
+ ├── src/bioseqkit/
59
+ │ ├── __init__.py # public API
60
+ │ ├── io.py # FASTA/FASTQ parsers
61
+ │ ├── stats.py # sequence statistics
62
+ │ ├── transform.py # revcomp + six-frame translation
63
+ │ ├── kmer.py # k-mer / minimizer (serial + parallel)
64
+ │ ├── index.py # FAI-like random-access index
65
+ │ ├── entrez.py # NCBI download helper
66
+ │ └── cli.py # argparse CLI
67
+ ├── tests/ # pytest suite (io/stats/transform/kmer/index/cli)
68
+ ├── examples/
69
+ │ ├── demo.ipynb # Jupyter demo (stats, GC, k-mer spectrum, ...)
70
+ │ └── example_data/sample.fa
71
+ ├── docs/ # Sphinx documentation
72
+ └── .github/workflows/ci.yml
73
+ ```
74
+
75
+ ## Installation
76
+
77
+ Requires Python >= 3.10. The core package has **no runtime dependencies**.
78
+
79
+ ```bash
80
+ # with uv (recommended)
81
+ uv pip install -e .
82
+
83
+ # or plain pip
84
+ pip install -e .
85
+
86
+ # with optional extras (plots for the notebook / NCBI download / docs)
87
+ pip install -e ".[viz,net,docs]"
88
+ ```
89
+
90
+ ## Command-line usage
91
+
92
+ ```bash
93
+ bioseqkit stats examples/example_data/sample.fa # JSON statistics
94
+ bioseqkit revcomp examples/example_data/sample.fa # reverse complement
95
+ bioseqkit translate examples/example_data/sample.fa # six-frame translation
96
+ bioseqkit kmer examples/example_data/sample.fa -k 5 --top 10 --canonical
97
+ bioseqkit kmer examples/example_data/sample.fa -k 5 -t 4 # parallel
98
+ bioseqkit minimizer examples/example_data/sample.fa -k 15 -w 10
99
+ bioseqkit index examples/example_data/sample.fa # write *.fai
100
+ bioseqkit fetch examples/example_data/sample.fa seq2:1-16
101
+ ```
102
+
103
+ ## Python API
104
+
105
+ ```python
106
+ import bioseqkit as bsk
107
+
108
+ for rec in bsk.parse_fasta("examples/example_data/sample.fa"):
109
+ print(rec.id, len(rec), bsk.gc_content(rec.sequence))
110
+
111
+ print(bsk.reverse_complement("ATGC")) # -> GCAT
112
+ print(bsk.translate("ATGGCCTAA")) # -> MA*
113
+
114
+ counts = bsk.count_kmers("ACGTACGTACGT", k=3, canonical=True)
115
+ print(bsk.top_kmers(counts, 3))
116
+
117
+ idx = bsk.build_faidx("examples/example_data/sample.fa")
118
+ print(idx.fetch("seq2", 1, 16))
119
+ ```
120
+
121
+ ## Testing
122
+
123
+ ```bash
124
+ uv run --with pytest pytest -q # 39 tests
125
+ ```
126
+
127
+ Continuous integration (GitHub Actions) runs `ruff` linting and the `pytest`
128
+ suite on Python 3.10–3.12 for every push and pull request targeting `main`.
129
+
130
+ ## Data sources
131
+
132
+ - NCBI Nucleotide: <https://www.ncbi.nlm.nih.gov/nucleotide/>
133
+ - UCSC Genome Browser: <https://genome.ucsc.edu/>
134
+
135
+ The bundled `examples/example_data/sample.fa` is a small synthetic sequence for
136
+ offline testing; `demo.ipynb` will download real data from NCBI when a network
137
+ connection is available and fall back to the bundled file otherwise.
138
+
139
+ ## License
140
+
141
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,122 @@
1
+ # bioseqkit
2
+
3
+ A lightweight, **dependency-free** biological sequence processing toolkit built
4
+ from scratch in pure Python. `bioseqkit` implements FASTA/FASTQ parsing,
5
+ sequence statistics, transformations, k-mer / minimizer analysis and FAI-like
6
+ random-access indexing, exposed both as a Python API and a command-line tool.
7
+
8
+ The project is a teaching implementation for **BIO2502 (Programming Languages
9
+ for Biological Computing)**: it deliberately re-implements the low-level I/O,
10
+ streaming and indexing logic instead of relying on Biopython, so the core
11
+ design patterns of bioinformatics data handling are made explicit.
12
+
13
+ ## Features
14
+
15
+ - **Streaming FASTA/FASTQ parsers** (`io`) — generator based, constant memory,
16
+ transparent gzip support, Phred quality decoding.
17
+ - **Statistics** (`stats`) — length distribution, N50, GC content, N-base
18
+ ratio, base-composition matrix.
19
+ - **Transformations** (`transform`) — reverse complement (IUPAC aware) and
20
+ six-frame translation with the standard genetic code.
21
+ - **k-mer analysis** (`kmer`) — counting, top-k, canonical k-mers,
22
+ **multi-process** parallel counting, and **minimizer** sampling.
23
+ - **FAI-like indexing** (`index`) — `samtools faidx`-compatible index for
24
+ `chr:start-end` random access without scanning the whole file.
25
+ - **CLI** (`cli`) — `stats`, `revcomp`, `translate`, `kmer`, `minimizer`,
26
+ `index`, `fetch`.
27
+ - **NCBI download** (`entrez`) — fetch reference sequences via E-utilities
28
+ (standard-library HTTP only).
29
+
30
+ ## Project layout
31
+
32
+ ```
33
+ bioseqkit/
34
+ ├── pyproject.toml # src-layout, PEP 621 metadata, console script
35
+ ├── README.md
36
+ ├── LICENSE
37
+ ├── environment.yml # conda environment
38
+ ├── requirements.txt
39
+ ├── src/bioseqkit/
40
+ │ ├── __init__.py # public API
41
+ │ ├── io.py # FASTA/FASTQ parsers
42
+ │ ├── stats.py # sequence statistics
43
+ │ ├── transform.py # revcomp + six-frame translation
44
+ │ ├── kmer.py # k-mer / minimizer (serial + parallel)
45
+ │ ├── index.py # FAI-like random-access index
46
+ │ ├── entrez.py # NCBI download helper
47
+ │ └── cli.py # argparse CLI
48
+ ├── tests/ # pytest suite (io/stats/transform/kmer/index/cli)
49
+ ├── examples/
50
+ │ ├── demo.ipynb # Jupyter demo (stats, GC, k-mer spectrum, ...)
51
+ │ └── example_data/sample.fa
52
+ ├── docs/ # Sphinx documentation
53
+ └── .github/workflows/ci.yml
54
+ ```
55
+
56
+ ## Installation
57
+
58
+ Requires Python >= 3.10. The core package has **no runtime dependencies**.
59
+
60
+ ```bash
61
+ # with uv (recommended)
62
+ uv pip install -e .
63
+
64
+ # or plain pip
65
+ pip install -e .
66
+
67
+ # with optional extras (plots for the notebook / NCBI download / docs)
68
+ pip install -e ".[viz,net,docs]"
69
+ ```
70
+
71
+ ## Command-line usage
72
+
73
+ ```bash
74
+ bioseqkit stats examples/example_data/sample.fa # JSON statistics
75
+ bioseqkit revcomp examples/example_data/sample.fa # reverse complement
76
+ bioseqkit translate examples/example_data/sample.fa # six-frame translation
77
+ bioseqkit kmer examples/example_data/sample.fa -k 5 --top 10 --canonical
78
+ bioseqkit kmer examples/example_data/sample.fa -k 5 -t 4 # parallel
79
+ bioseqkit minimizer examples/example_data/sample.fa -k 15 -w 10
80
+ bioseqkit index examples/example_data/sample.fa # write *.fai
81
+ bioseqkit fetch examples/example_data/sample.fa seq2:1-16
82
+ ```
83
+
84
+ ## Python API
85
+
86
+ ```python
87
+ import bioseqkit as bsk
88
+
89
+ for rec in bsk.parse_fasta("examples/example_data/sample.fa"):
90
+ print(rec.id, len(rec), bsk.gc_content(rec.sequence))
91
+
92
+ print(bsk.reverse_complement("ATGC")) # -> GCAT
93
+ print(bsk.translate("ATGGCCTAA")) # -> MA*
94
+
95
+ counts = bsk.count_kmers("ACGTACGTACGT", k=3, canonical=True)
96
+ print(bsk.top_kmers(counts, 3))
97
+
98
+ idx = bsk.build_faidx("examples/example_data/sample.fa")
99
+ print(idx.fetch("seq2", 1, 16))
100
+ ```
101
+
102
+ ## Testing
103
+
104
+ ```bash
105
+ uv run --with pytest pytest -q # 39 tests
106
+ ```
107
+
108
+ Continuous integration (GitHub Actions) runs `ruff` linting and the `pytest`
109
+ suite on Python 3.10–3.12 for every push and pull request targeting `main`.
110
+
111
+ ## Data sources
112
+
113
+ - NCBI Nucleotide: <https://www.ncbi.nlm.nih.gov/nucleotide/>
114
+ - UCSC Genome Browser: <https://genome.ucsc.edu/>
115
+
116
+ The bundled `examples/example_data/sample.fa` is a small synthetic sequence for
117
+ offline testing; `demo.ipynb` will download real data from NCBI when a network
118
+ connection is available and fall back to the bundled file otherwise.
119
+
120
+ ## License
121
+
122
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,19 @@
1
+ """Sphinx configuration for bioseqkit."""
2
+
3
+ project = "bioseqkit"
4
+ author = "Jilai Cheng"
5
+ release = "0.1.0"
6
+
7
+ extensions = [
8
+ "sphinx.ext.autodoc",
9
+ "sphinx.ext.napoleon",
10
+ "sphinx.ext.viewcode",
11
+ "myst_parser",
12
+ ]
13
+
14
+ autodoc_typehints = "description"
15
+ templates_path = ["_templates"]
16
+ exclude_patterns = ["_build"]
17
+
18
+ html_theme = "alabaster"
19
+ source_suffix = {".rst": "restructuredtext", ".md": "markdown"}
@@ -0,0 +1,36 @@
1
+ bioseqkit documentation
2
+ ========================
3
+
4
+ *bioseqkit* is a lightweight, dependency-free biological sequence processing
5
+ toolkit: pure-Python FASTA/FASTQ parsing, statistics, transformations, k-mer /
6
+ minimizer analysis and FAI-like random-access indexing.
7
+
8
+ .. toctree::
9
+ :maxdepth: 2
10
+ :caption: Contents
11
+
12
+ quickstart
13
+
14
+ API reference
15
+ -------------
16
+
17
+ .. automodule:: bioseqkit.io
18
+ :members:
19
+
20
+ .. automodule:: bioseqkit.stats
21
+ :members:
22
+
23
+ .. automodule:: bioseqkit.transform
24
+ :members:
25
+
26
+ .. automodule:: bioseqkit.kmer
27
+ :members:
28
+
29
+ .. automodule:: bioseqkit.index
30
+ :members:
31
+
32
+ Indices
33
+ -------
34
+
35
+ * :ref:`genindex`
36
+ * :ref:`modindex`
@@ -0,0 +1,21 @@
1
+ # Quickstart
2
+
3
+ Install the package and run the CLI:
4
+
5
+ ```bash
6
+ pip install -e .
7
+ bioseqkit stats examples/example_data/sample.fa
8
+ ```
9
+
10
+ Use the Python API:
11
+
12
+ ```python
13
+ import bioseqkit as bsk
14
+
15
+ records = list(bsk.parse_fasta("examples/example_data/sample.fa"))
16
+ stats = bsk.sequence_stats(r.sequence for r in records)
17
+ print(stats.as_dict())
18
+
19
+ print(bsk.reverse_complement("ATGC")) # GCAT
20
+ print(bsk.count_kmers("ACGTACGT", 3).most_common(3))
21
+ ```
@@ -0,0 +1,17 @@
1
+ name: bioseqkit
2
+ channels:
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - python>=3.10
7
+ - pip
8
+ - matplotlib
9
+ - seaborn
10
+ - jupyter
11
+ - pip:
12
+ - -e .
13
+ - pytest
14
+ - pytest-benchmark
15
+ - ruff
16
+ - sphinx
17
+ - myst-parser
@@ -0,0 +1,222 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# bioseqkit demo\n",
8
+ "\n",
9
+ "This notebook demonstrates the core features of **bioseqkit** on a real\n",
10
+ "sequence. It tries to download the human mitochondrial genome (`chrM`,\n",
11
+ "accession NC_012920.1) from NCBI; if there is no network access it\n",
12
+ "falls back to the bundled `example_data/sample.fa`.\n"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "markdown",
17
+ "metadata": {},
18
+ "source": [
19
+ "## 1. Load data (NCBI with local fallback)"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "from pathlib import Path\n",
29
+ "import bioseqkit as bsk\n",
30
+ "\n",
31
+ "DATA = Path('example_data/sample.fa')\n",
32
+ "records = list(bsk.parse_fasta(str(DATA)))\n",
33
+ "\n",
34
+ "# Optional: fetch a real genome from NCBI (needs internet).\n",
35
+ "try:\n",
36
+ " from bioseqkit.entrez import efetch_fasta\n",
37
+ " import io\n",
38
+ " fasta_text = efetch_fasta('NC_012920.1', email='you@example.com') # human chrM\n",
39
+ " records = list(bsk.parse_fasta(io.StringIO(fasta_text)))\n",
40
+ " print('Downloaded from NCBI:', records[0].id, records[0].description)\n",
41
+ "except Exception as exc:\n",
42
+ " print('Using local example data (no network):', exc)\n",
43
+ "\n",
44
+ "for r in records:\n",
45
+ " print(r.id, len(r), r.description)\n"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "markdown",
50
+ "metadata": {},
51
+ "source": [
52
+ "## 2. Basic statistics"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": null,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "stats = bsk.sequence_stats(r.sequence for r in records)\n",
62
+ "import json\n",
63
+ "print(json.dumps(stats.as_dict(), indent=2))\n"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "markdown",
68
+ "metadata": {},
69
+ "source": [
70
+ "## 3. GC content & length distribution"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": null,
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": [
79
+ "import matplotlib.pyplot as plt\n",
80
+ "\n",
81
+ "gc = [bsk.gc_content(r.sequence) for r in records]\n",
82
+ "lengths = [len(r) for r in records]\n",
83
+ "\n",
84
+ "fig, axes = plt.subplots(1, 2, figsize=(10, 4))\n",
85
+ "axes[0].bar([r.id for r in records], gc, color='steelblue')\n",
86
+ "axes[0].set_ylabel('GC content'); axes[0].set_title('GC content per sequence')\n",
87
+ "axes[1].bar([r.id for r in records], lengths, color='indianred')\n",
88
+ "axes[1].set_ylabel('length (bp)'); axes[1].set_title('Sequence length')\n",
89
+ "plt.tight_layout(); plt.show()\n"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "metadata": {},
95
+ "source": [
96
+ "## 4. k-mer spectrum"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": null,
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "from collections import Counter\n",
106
+ "k = 4\n",
107
+ "counts = Counter()\n",
108
+ "for r in records:\n",
109
+ " counts += bsk.count_kmers(r.sequence, k, canonical=True)\n",
110
+ "\n",
111
+ "print('Top-10 canonical %d-mers:' % k)\n",
112
+ "for kmer, c in bsk.top_kmers(counts, 10):\n",
113
+ " print(f' {kmer}\\t{c}')\n",
114
+ "\n",
115
+ "# k-mer spectrum: histogram of k-mer multiplicities\n",
116
+ "spectrum = Counter(counts.values())\n",
117
+ "xs = sorted(spectrum)\n",
118
+ "plt.figure(figsize=(6, 4))\n",
119
+ "plt.bar(xs, [spectrum[x] for x in xs], color='seagreen')\n",
120
+ "plt.xlabel('k-mer multiplicity'); plt.ylabel('# distinct k-mers')\n",
121
+ "plt.title(f'{k}-mer spectrum'); plt.tight_layout(); plt.show()\n"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "markdown",
126
+ "metadata": {},
127
+ "source": [
128
+ "## 5. Six-frame translation vs. known CDS"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": null,
134
+ "metadata": {},
135
+ "outputs": [],
136
+ "source": [
137
+ "seq = records[0].sequence\n",
138
+ "for frame in bsk.six_frame_translation(seq):\n",
139
+ " print(frame.name, frame.protein[:60])\n"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "markdown",
144
+ "metadata": {},
145
+ "source": [
146
+ "## 6. Minimizer distribution"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": null,
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": [
155
+ "mins = bsk.minimizers(records[0].sequence, k=8, w=5)\n",
156
+ "positions = [p for p, _ in mins]\n",
157
+ "print(f'{len(mins)} minimizers selected from {len(records[0])} bp')\n",
158
+ "plt.figure(figsize=(8, 2))\n",
159
+ "plt.eventplot(positions, colors='black')\n",
160
+ "plt.xlabel('position (bp)'); plt.title('Minimizer positions'); plt.yticks([])\n",
161
+ "plt.tight_layout(); plt.show()\n"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "markdown",
166
+ "metadata": {},
167
+ "source": [
168
+ "## 7. Random access with the FAI-like index"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": null,
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "from bioseqkit.index import build_faidx\n",
178
+ "idx = build_faidx(str(DATA))\n",
179
+ "name = idx.names()[0]\n",
180
+ "print('Index records:', idx.names())\n",
181
+ "print('Fetch', name, '1-20:', idx.fetch(name, 1, 20))\n"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "markdown",
186
+ "metadata": {},
187
+ "source": [
188
+ "## 8. Multi-process k-mer benchmark\n",
189
+ "Compare single-process vs. multi-process k-mer counting on a larger sequence."
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": null,
195
+ "metadata": {},
196
+ "outputs": [],
197
+ "source": [
198
+ "import time\n",
199
+ "big = (records[0].sequence * 2000)\n",
200
+ "print(f'sequence length: {len(big):,} bp')\n",
201
+ "\n",
202
+ "t0 = time.perf_counter(); c1 = bsk.count_kmers(big, 8, canonical=True); t1 = time.perf_counter()\n",
203
+ "c4 = bsk.count_kmers_parallel([big], 8, canonical=True, workers=4); t2 = time.perf_counter()\n",
204
+ "print(f'serial: {t1 - t0:.3f}s')\n",
205
+ "print(f'parallel: {t2 - t1:.3f}s (identical result: {c1 == c4})')\n"
206
+ ]
207
+ }
208
+ ],
209
+ "metadata": {
210
+ "kernelspec": {
211
+ "display_name": "Python 3",
212
+ "language": "python",
213
+ "name": "python3"
214
+ },
215
+ "language_info": {
216
+ "name": "python",
217
+ "version": "3.x"
218
+ }
219
+ },
220
+ "nbformat": 4,
221
+ "nbformat_minor": 5
222
+ }
@@ -0,0 +1,5 @@
1
+ >seq1 synthetic demo sequence
2
+ ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGACGTACGTACGTACGTACGTAC
3
+ GTACGTACGTACGGCCTTAAGGCCTTAAGGCCNNNNACGTACGTACGTACGTACGTACGTA
4
+ >seq2 gc-rich fragment
5
+ GGGGCCCCGGGGCCCCGGGGCCCCGGGGCCCCATATATATATATGCGCGCGCGCGCGCGCG
@@ -0,0 +1,3 @@
1
+ 课程项目报告(typst),需包含摘要、背景、项目设计、测试结果、讨论、参考文献
2
+ 项目代码需要完整、规范的结构,以及详细的说明README.md
3
+ 录制的介绍项目视频,包含项目介绍和演示,格式为mp4