PyPI - bioseqkits - Versions diffs - 0.1.0__tar.gz - Mend

bioseqkits 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

bioseqkits-0.1.0/PKG-INFO +151 -0
bioseqkits-0.1.0/README.md +123 -0
bioseqkits-0.1.0/pyproject.toml +48 -0
bioseqkits-0.1.0/setup.cfg +4 -0
bioseqkits-0.1.0/setup.py +37 -0
bioseqkits-0.1.0/src/bioseqkit/__init__.py +3 -0
bioseqkits-0.1.0/src/bioseqkits/__init__.py +3 -0
bioseqkits-0.1.0/src/bioseqkits/cli.py +228 -0
bioseqkits-0.1.0/src/bioseqkits/models.py +162 -0
bioseqkits-0.1.0/src/bioseqkits/operations.py +123 -0
bioseqkits-0.1.0/src/bioseqkits/parser.py +126 -0
bioseqkits-0.1.0/src/bioseqkits/stats.py +79 -0
bioseqkits-0.1.0/src/bioseqkits/utils.py +133 -0
bioseqkits-0.1.0/src/bioseqkits.egg-info/PKG-INFO +151 -0
bioseqkits-0.1.0/src/bioseqkits.egg-info/SOURCES.txt +18 -0
bioseqkits-0.1.0/src/bioseqkits.egg-info/dependency_links.txt +1 -0
bioseqkits-0.1.0/src/bioseqkits.egg-info/entry_points.txt +2 -0
bioseqkits-0.1.0/src/bioseqkits.egg-info/requires.txt +13 -0
bioseqkits-0.1.0/src/bioseqkits.egg-info/top_level.txt +2 -0
bioseqkits-0.1.0/tests/test_bioseqkit.py +66 -0

bioseqkits-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,151 @@
+Metadata-Version: 2.4
+Name: bioseqkits
+Version: 0.1.0
+Summary: Lightweight Python toolkit for FASTA/FASTQ parsing and bioinformatics sequence analysis.
+Author: Ouyang Qinglang, Lin Yushu, Liu Yijun
+Author-email: Ouyang Qinglang <ouyang725@sjtu.edu.cn>, Lin Yushu <benzi0228@sjtu.edu.cn>, Liu Yijun <liu_yijun@sjtu.edu.cn>
+License: MIT
+Project-URL: Homepage, https://github.com/Neuromancer-P/bioseqkits
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Operating System :: OS Independent
+Classifier: Intended Audience :: Science/Research
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Classifier: License :: OSI Approved :: MIT License
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Provides-Extra: test
+Requires-Dist: pytest>=7.0; extra == "test"
+Provides-Extra: notebook
+Requires-Dist: matplotlib>=3.5; extra == "notebook"
+Requires-Dist: seaborn>=0.11; extra == "notebook"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0; extra == "dev"
+Requires-Dist: jupyter; extra == "dev"
+Requires-Dist: black; extra == "dev"
+Requires-Dist: isort; extra == "dev"
+Dynamic: author
+Dynamic: requires-python
+# bioseqkits
+A lightweight modular Python toolkit for FASTA/FASTQ parsing and sequence analysis, implemented from scratch with pure Python.
+## Project Overview
+`bioseqkits` is a teaching-oriented toolkit that recreates core FASTA/FASTQ parsing and sequence analysis logic without relying on `Bio.SeqIO` or other sequence parsing libraries. It uses Python generators to parse sequence files in a streaming fashion, supports plain and gzip-compressed input, and exposes common bioinformatics operations through both a Python API and a command-line interface.
+This repository currently includes:
+- `src/bioseqkits/` package with parser, sequence model, operations, stats, and CLI modules
+- pytest-based unit tests under `tests/`
+- Jupyter notebooks for interactive sequence analysis and validation
+## Key Features
+### 1. FASTA / FASTQ Parsing
+- Pure Python parser for FASTA and FASTQ files
+- Supports both plain and gzip-compressed files
+- Generator-based streaming parsing for low memory usage
+- Handles multi-line sequences, blank lines, and common FASTQ formatting edge cases
+### 2. Sequence Operations
+- `reverse_complement` for DNA reverse complement computation
+- `dna_to_rna` and `rna_to_dna` conversion
+- `translate` for frame-specific translation
+- `six_frame_translation` for all six reading frames
+- `kmer_frequency` and `top_k_kmers` for k-mer analysis
+### 3. Sequence Statistics
+- Per-record sequence length
+- GC content calculation
+- N base ratio calculation
+- Base composition percentages for A/C/G/T/N
+- Convenience functions like `calculate_sequence_stats`
+### 4. Command-Line Interface (CLI)
+Supported subcommands:
+- `stats` — sequence statistics table output
+- `revcomp` — reverse complement sequence generation
+- `translate` — protein translation of input sequences
+### 5. Testing and Notebooks
+- `pytest` test suite for parser and algorithm edge cases
+- Notebook examples for FASTA analysis and validation
+- Current notebook file: `analysis_sequence_fasta.ipynb`
+## Installation
+### Clone repository
+```bash
+git clone https://github.com/Neuromancer-P/bioseqkit.git
+cd bioseqkit
+```
+### Install package
+```bash
+pip install .
+```
+### Install development dependencies
+```bash
+pip install -e ".[dev]"
+```
+## Quick Start
+### Python API example
+```python
+from bioseqkits.parser import parse_fasta
+from bioseqkits.operations import reverse_complement, dna_to_rna
+from bioseqkits.stats import calculate_sequence_stats
+with open('tests/data/sequence.fasta') as fh:
+    for record in parse_fasta(fh):
+        stats = calculate_sequence_stats(record)
+        rc = reverse_complement(record.seq)
+        rna = dna_to_rna(record.seq)
+        print(record.id, stats['length'], stats['gc_content'], rc[:20], rna[:20])
+```
+### Run CLI commands
+```bash
+bioseqkits stats tests/data/sequence.fasta
+bioseqkits revcomp tests/data/sequence.fasta -o output_revcomp.fasta
+bioseqkits translate tests/data/sequence.fasta --frame 0
+```
+### Run tests
+```bash
+pytest tests/ -v
+```
+## Project Structure
+```
+src/bioseqkits/
+├── cli.py
+├── models.py
+├── operations.py
+├── parser.py
+├── stats.py
+├── utils.py
+├── __init__.py
+tests/
+├── __init__.py
+├── conftest.py
+├── test_bioseqkit.py
+├── data/
+│   └── sequence.fasta
+analysis_sequence_fasta.ipynb
+bioseqkit_test.ipynb
+pyproject.toml
+README.md
+setup.py
+```
+## Packaging
+This project is configured with `pyproject.toml` and uses `setuptools` for packaging. The package exposes a console script named `bioseqkits` that points to `bioseqkits.cli:main`.
+## License
+MIT License

bioseqkits-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,123 @@
+# bioseqkits
+A lightweight modular Python toolkit for FASTA/FASTQ parsing and sequence analysis, implemented from scratch with pure Python.
+## Project Overview
+`bioseqkits` is a teaching-oriented toolkit that recreates core FASTA/FASTQ parsing and sequence analysis logic without relying on `Bio.SeqIO` or other sequence parsing libraries. It uses Python generators to parse sequence files in a streaming fashion, supports plain and gzip-compressed input, and exposes common bioinformatics operations through both a Python API and a command-line interface.
+This repository currently includes:
+- `src/bioseqkits/` package with parser, sequence model, operations, stats, and CLI modules
+- pytest-based unit tests under `tests/`
+- Jupyter notebooks for interactive sequence analysis and validation
+## Key Features
+### 1. FASTA / FASTQ Parsing
+- Pure Python parser for FASTA and FASTQ files
+- Supports both plain and gzip-compressed files
+- Generator-based streaming parsing for low memory usage
+- Handles multi-line sequences, blank lines, and common FASTQ formatting edge cases
+### 2. Sequence Operations
+- `reverse_complement` for DNA reverse complement computation
+- `dna_to_rna` and `rna_to_dna` conversion
+- `translate` for frame-specific translation
+- `six_frame_translation` for all six reading frames
+- `kmer_frequency` and `top_k_kmers` for k-mer analysis
+### 3. Sequence Statistics
+- Per-record sequence length
+- GC content calculation
+- N base ratio calculation
+- Base composition percentages for A/C/G/T/N
+- Convenience functions like `calculate_sequence_stats`
+### 4. Command-Line Interface (CLI)
+Supported subcommands:
+- `stats` — sequence statistics table output
+- `revcomp` — reverse complement sequence generation
+- `translate` — protein translation of input sequences
+### 5. Testing and Notebooks
+- `pytest` test suite for parser and algorithm edge cases
+- Notebook examples for FASTA analysis and validation
+- Current notebook file: `analysis_sequence_fasta.ipynb`
+## Installation
+### Clone repository
+```bash
+git clone https://github.com/Neuromancer-P/bioseqkit.git
+cd bioseqkit
+```
+### Install package
+```bash
+pip install .
+```
+### Install development dependencies
+```bash
+pip install -e ".[dev]"
+```
+## Quick Start
+### Python API example
+```python
+from bioseqkits.parser import parse_fasta
+from bioseqkits.operations import reverse_complement, dna_to_rna
+from bioseqkits.stats import calculate_sequence_stats
+with open('tests/data/sequence.fasta') as fh:
+    for record in parse_fasta(fh):
+        stats = calculate_sequence_stats(record)
+        rc = reverse_complement(record.seq)
+        rna = dna_to_rna(record.seq)
+        print(record.id, stats['length'], stats['gc_content'], rc[:20], rna[:20])
+```
+### Run CLI commands
+```bash
+bioseqkits stats tests/data/sequence.fasta
+bioseqkits revcomp tests/data/sequence.fasta -o output_revcomp.fasta
+bioseqkits translate tests/data/sequence.fasta --frame 0
+```
+### Run tests
+```bash
+pytest tests/ -v
+```
+## Project Structure
+```
+src/bioseqkits/
+├── cli.py
+├── models.py
+├── operations.py
+├── parser.py
+├── stats.py
+├── utils.py
+├── __init__.py
+tests/
+├── __init__.py
+├── conftest.py
+├── test_bioseqkit.py
+├── data/
+│   └── sequence.fasta
+analysis_sequence_fasta.ipynb
+bioseqkit_test.ipynb
+pyproject.toml
+README.md
+setup.py
+```
+## Packaging
+This project is configured with `pyproject.toml` and uses `setuptools` for packaging. The package exposes a console script named `bioseqkits` that points to `bioseqkits.cli:main`.
+## License
+MIT License

bioseqkits-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,48 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "bioseqkits"
+version = "0.1.0"
+authors = [
+  { name = "Ouyang Qinglang", email = "ouyang725@sjtu.edu.cn" },
+  { name = "Lin Yushu", email = "benzi0228@sjtu.edu.cn" },
+  { name = "Liu Yijun", email = "liu_yijun@sjtu.edu.cn" }
+]
+description = "Lightweight Python toolkit for FASTA/FASTQ parsing and bioinformatics sequence analysis."
+readme = {file = "README.md", content-type = "text/markdown"}
+license = { text = "MIT" }
+classifiers = [
+  "Programming Language :: Python :: 3.10",
+  "Operating System :: OS Independent",
+  "Intended Audience :: Science/Research",
+  "Topic :: Scientific/Engineering :: Bio-Informatics",
+  "License :: OSI Approved :: MIT License"
+]
+requires-python = ">=3.9"
+dependencies = []
+[project.optional-dependencies]
+test = [
+  "pytest>=7.0",
+]
+notebook = [
+  "matplotlib>=3.5",
+  "seaborn>=0.11",
+]
+dev = [
+  "pytest>=7.0",
+  "jupyter",
+  "black",
+  "isort"
+]
+[project.urls]
+"Homepage" = "https://github.com/Neuromancer-P/bioseqkits"
+[project.scripts]
+bioseqkits = "bioseqkits.cli:main"
+[tool.setuptools.packages.find]
+where = ["src"]

bioseqkits-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

bioseqkits-0.1.0/setup.py ADDED Viewed

@@ -0,0 +1,37 @@
+from pathlib import Path
+from setuptools import setup, find_packages
+# Locate README.md next to this file rather than relative to the current directory.
+PROJECT_ROOT = Path(__file__).resolve().parent
+README_TEXT = PROJECT_ROOT.joinpath("README.md").read_text(encoding="utf-8")
+# (name, email) pairs; joined below into the comma-separated strings passed to setup().
+AUTHORS = [
+    ("Ouyang Qinglang", "ouyang725@sjtu.edu.cn"),
+    ("Lin Yushu", "benzi0228@sjtu.edu.cn"),
+    ("Liu Yijun", "liu_yijun@sjtu.edu.cn"),
+]
+# Optional dependency groups, installable as extras (e.g. ".[dev]").
+OPTIONAL_DEPENDENCIES = {
+    "test": ["pytest>=7.0"],
+    "notebook": ["matplotlib>=3.5", "seaborn>=0.11"],
+    "dev": ["pytest>=7.0", "jupyter", "black", "isort"],
+}
+TROVE_CLASSIFIERS = [
+    "Programming Language :: Python :: 3.10",
+    "Operating System :: OS Independent",
+    "Intended Audience :: Science/Research",
+    "Topic :: Scientific/Engineering :: Bio-Informatics",
+    "License :: OSI Approved :: MIT License",
+]
+setup(
+    name="bioseqkits",
+    version="0.1.0",
+    description="Lightweight Python toolkit for FASTA/FASTQ parsing and bioinformatics sequence analysis.",
+    long_description=README_TEXT,
+    long_description_content_type="text/markdown",
+    author=", ".join(name for name, _ in AUTHORS),
+    author_email=", ".join(email for _, email in AUTHORS),
+    license="MIT",
+    python_requires=">=3.9",
+    # Packages live under src/ (src-layout).
+    packages=find_packages(where="src"),
+    package_dir={"": "src"},
+    install_requires=[],
+    extras_require=OPTIONAL_DEPENDENCIES,
+    entry_points={
+        "console_scripts": [
+            "bioseqkits=bioseqkits.cli:main",
+        ],
+    },
+    classifiers=TROVE_CLASSIFIERS,
+)

bioseqkits-0.1.0/src/bioseqkit/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""bioseqkits - A lightweight FASTA/FASTQ sequence processing toolkit."""
+__version__ = "0.1.0"

bioseqkits-0.1.0/src/bioseqkits/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""bioseqkits - A lightweight FASTA/FASTQ sequence processing toolkit."""
+__version__ = "0.1.0"

bioseqkits-0.1.0/src/bioseqkits/cli.py ADDED Viewed

@@ -0,0 +1,228 @@
+"""
+src/bioseqkits/cli.py
+Command-line interface module.
+"""
+import argparse
+import sys
+from typing import List, Dict, Optional
+from pathlib import Path
+from bioseqkits.parser import parse_fasta, parse_fastq
+from bioseqkits.stats import calculate_sequence_stats
+from bioseqkits.operations import reverse_complement, six_frame_translation
+from bioseqkits.utils import FileFormat, open_sequence_file, detect_format
+from bioseqkits.models import SeqRecord
+def cmd_stats(args):
+    """Run the ``stats`` subcommand.
+    Parses every record of ``args.input``, computes its statistics with
+    ``calculate_sequence_stats`` and prints all rows in the layout named by
+    ``args.format`` ('table' or 'csv'). Any exception is reported on stderr
+    and the process exits with status 1.
+    """
+    source_path = args.input
+    layout = args.format
+    try:
+        # Pick the parser that matches the detected file format.
+        detected = detect_format(source_path)
+        with open_sequence_file(source_path) as handle:
+            if detected == FileFormat.FASTA:
+                records = parse_fasta(handle)
+            elif detected == FileFormat.FASTQ:
+                records = parse_fastq(handle)
+            else:
+                print(f"错误：不支持的文件格式 '{detected}'", file=sys.stderr)
+                sys.exit(1)
+            # Rows are collected inside the `with` block because the parser
+            # is consumed while the handle is still open.
+            rows = [
+                {'id': rec.id, 'description': rec.description, **rec_stats}
+                for rec in records
+                for rec_stats in (calculate_sequence_stats(rec),)
+            ]
+        # Emit the collected rows in the requested layout.
+        printer = {'table': print_table, 'csv': print_csv}.get(layout)
+        if printer is not None:
+            printer(rows)
+    except FileNotFoundError:
+        print(f"错误：文件 '{source_path}' 不存在", file=sys.stderr)
+        sys.exit(1)
+    except Exception as exc:
+        print(f"错误：{exc}", file=sys.stderr)
+        sys.exit(1)
+def cmd_revcomp(args):
+    """Run the ``revcomp`` subcommand.
+    Reads every record of ``args.input`` and writes its reverse complement as
+    a ``>`` header line followed by the sequence, to ``args.output`` when it
+    is set and to standard output otherwise. Any exception is reported on
+    stderr and the process exits with status 1.
+    """
+    from contextlib import nullcontext
+    file_path = args.input
+    output_file = args.output
+    try:
+        file_format = detect_format(file_path)
+        with open_sequence_file(file_path) as file_handle:
+            if file_format == FileFormat.FASTA:
+                parser = parse_fasta(file_handle)
+            elif file_format == FileFormat.FASTQ:
+                parser = parse_fastq(file_handle)
+            else:
+                print(f"错误：不支持的文件格式 '{file_format}'", file=sys.stderr)
+                sys.exit(1)
+            # One write loop serves both destinations; nullcontext leaves
+            # sys.stdout open when the `with` block ends.
+            destination = open(output_file, 'w') if output_file else nullcontext(sys.stdout)
+            with destination as out_handle:
+                for seq_record in parser:
+                    rc_seq = reverse_complement(seq_record.seq)
+                    out_handle.write(f">{seq_record.id}_revcomp {seq_record.description}\n")
+                    out_handle.write(f"{rc_seq}\n")
+            if output_file:
+                print(f"✅ 反向互补序列已写入：{output_file}")
+    except FileNotFoundError as e:
+        # The missing path may be the output location rather than the input,
+        # so report the path carried by the exception when there is one.
+        missing_path = e.filename if e.filename is not None else file_path
+        print(f"错误：文件 '{missing_path}' 不存在", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"错误：{e}", file=sys.stderr)
+        sys.exit(1)
+def cmd_translate(args):
+    """Run the ``translate`` subcommand.
+    Prints a ``>`` header line and a protein line for every input record:
+    one entry for the reading frame given in ``args.frame``, or one entry per
+    result of ``six_frame_translation`` when ``args.frame`` is None. Any
+    exception is reported on stderr and the process exits with status 1.
+    """
+    file_path = args.input
+    frame = args.frame
+    try:
+        file_format = detect_format(file_path)
+        with open_sequence_file(file_path) as file_handle:
+            if file_format == FileFormat.FASTA:
+                parser = parse_fasta(file_handle)
+            elif file_format == FileFormat.FASTQ:
+                parser = parse_fastq(file_handle)
+            else:
+                print(f"错误：不支持的文件格式 '{file_format}'", file=sys.stderr)
+                sys.exit(1)
+            # The mode is fixed for the whole run, so it is decided once here
+            # instead of once per record.
+            if frame is None:
+                # Six-frame mode. Labels are 1-based ("Frame 1".."Frame 3",
+                # then "RC Frame 1".."RC Frame 3"), whereas single-frame mode
+                # below echoes the 0-based --frame value unchanged.
+                for seq_record in parser:
+                    proteins = six_frame_translation(seq_record.seq)
+                    for i, protein in enumerate(proteins):
+                        frame_name = f"Frame {i+1}" if i < 3 else f"RC Frame {i-2}"
+                        print(f">{seq_record.id} {frame_name}")
+                        print(protein)
+            else:
+                # Single-frame mode; the import is resolved once, outside the loop.
+                from bioseqkits.operations import translate
+                for seq_record in parser:
+                    protein = translate(seq_record.seq, frame)
+                    print(f">{seq_record.id} Frame {frame}")
+                    print(protein)
+    except FileNotFoundError:
+        print(f"错误：文件 '{file_path}' 不存在", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"错误：{e}", file=sys.stderr)
+        sys.exit(1)
+def print_table(results: List[Dict]):
+    """Print statistics rows as a fixed-width text table on stdout.
+    Every column is right-aligned to 12 characters and joined with " | ";
+    float values are shown with two decimals and missing keys as blanks.
+    Prints "无数据" and returns when ``results`` is empty.
+    """
+    if not results:
+        print("无数据")
+        return
+    columns = ('id', 'length', 'gc_content', 'n_ratio', 'A', 'C', 'G', 'T', 'N')
+    width = 12
+    separator = " | "
+    def render(cell):
+        # Floats get two decimals; everything else is stringified as-is.
+        if isinstance(cell, float):
+            return format(cell, f">{width}.2f")
+        return str(cell).rjust(width)
+    # Header line followed by a rule of the same length.
+    title_line = separator.join(name.rjust(width) for name in columns)
+    print(title_line)
+    print("-" * len(title_line))
+    # One line per record.
+    for record in results:
+        print(separator.join(render(record.get(name, '')) for name in columns))
+def print_csv(results: List[Dict]):
+    """Print statistics rows as CSV on stdout.
+    Writes a header line followed by one line per record with the columns
+    id, description, length, gc_content, n_ratio, A, C, G, T, N. Keys missing
+    from a row are written as empty fields. Prints nothing when ``results``
+    is empty.
+    """
+    import csv
+    if not results:
+        return
+    columns = ['id', 'description', 'length', 'gc_content', 'n_ratio', 'A', 'C', 'G', 'T', 'N']
+    # The csv module quotes fields containing commas, double quotes or
+    # embedded line breaks, so a description can never split a row.
+    # lineterminator="\n" matches the line ending print() would produce.
+    writer = csv.writer(sys.stdout, lineterminator="\n")
+    writer.writerow(columns)
+    for row in results:
+        # str() fixes the text form of every value before it reaches the
+        # writer (for example None is written as "None", not as an empty field).
+        writer.writerow([str(row.get(col, '')) for col in columns])
+def main():
+    """Entry point: build the argument parser and dispatch to a subcommand.
+    Registers the ``stats``, ``revcomp`` and ``translate`` subcommands, parses
+    the command line and calls the handler attached to the chosen one. Prints
+    the help text and exits with status 1 when no subcommand is given.
+    """
+    top_parser = argparse.ArgumentParser(
+        prog='bioseqkits',
+        description='生物序列处理工具包',
+    )
+    commands = top_parser.add_subparsers(dest='command', help='可用命令')
+    # Every subcommand takes the same positional input argument.
+    input_help = '输入文件路径 (FASTA/FASTQ)'
+    # stats: per-record statistics, printed as a table or as CSV.
+    stats_cmd = commands.add_parser('stats', help='序列统计分析')
+    stats_cmd.add_argument('input', help=input_help)
+    stats_cmd.add_argument(
+        '--format',
+        choices=['table', 'csv'],
+        default='table',
+        help='输出格式 (默认: table)',
+    )
+    stats_cmd.set_defaults(func=cmd_stats)
+    # revcomp: reverse complement, optionally written to a file.
+    revcomp_cmd = commands.add_parser('revcomp', help='反向互补序列')
+    revcomp_cmd.add_argument('input', help=input_help)
+    revcomp_cmd.add_argument('--output', '-o', help='输出文件路径 (默认: 终端)')
+    revcomp_cmd.set_defaults(func=cmd_revcomp)
+    # translate: a single reading frame, or all six when --frame is omitted.
+    translate_cmd = commands.add_parser('translate', help='六框翻译')
+    translate_cmd.add_argument('input', help=input_help)
+    translate_cmd.add_argument(
+        '--frame',
+        type=int,
+        choices=[0, 1, 2],
+        help='阅读框 (0-2，默认: 六框翻译)',
+    )
+    translate_cmd.set_defaults(func=cmd_translate)
+    namespace = top_parser.parse_args()
+    # Without a subcommand there is no handler to call: show usage and fail.
+    if namespace.command is None:
+        top_parser.print_help()
+        sys.exit(1)
+    namespace.func(namespace)
+if __name__ == '__main__':
+    main()