bioseqkits 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,151 @@
1
+ Metadata-Version: 2.4
2
+ Name: bioseqkits
3
+ Version: 0.1.0
4
+ Summary: Lightweight Python toolkit for FASTA/FASTQ parsing and bioinformatics sequence analysis.
5
+ Author: Ouyang Qinglang, Lin Yushu, Liu Yijun
6
+ Author-email: Ouyang Qinglang <ouyang725@sjtu.edu.cn>, Lin Yushu <benzi0228@sjtu.edu.cn>, Liu Yijun <liu_yijun@sjtu.edu.cn>
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/Neuromancer-P/bioseqkits
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Requires-Python: >=3.9
15
+ Description-Content-Type: text/markdown
16
+ Provides-Extra: test
17
+ Requires-Dist: pytest>=7.0; extra == "test"
18
+ Provides-Extra: notebook
19
+ Requires-Dist: matplotlib>=3.5; extra == "notebook"
20
+ Requires-Dist: seaborn>=0.11; extra == "notebook"
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest>=7.0; extra == "dev"
23
+ Requires-Dist: jupyter; extra == "dev"
24
+ Requires-Dist: black; extra == "dev"
25
+ Requires-Dist: isort; extra == "dev"
26
+ Dynamic: author
27
+ Dynamic: requires-python
28
+
29
+ # bioseqkits
30
+
31
+ A lightweight modular Python toolkit for FASTA/FASTQ parsing and sequence analysis, implemented from scratch with pure Python.
32
+
33
+ ## Project Overview
34
+
35
+ `bioseqkits` is a teaching-oriented toolkit that recreates core FASTA/FASTQ parsing and sequence analysis logic without relying on `Bio.SeqIO` or other sequence parsing libraries. It uses Python generators to parse sequence files in a streaming fashion, supports plain and gzip-compressed input, and exposes common bioinformatics operations through both a Python API and a command-line interface.
36
+
37
+ This repository currently includes:
38
+ - `src/bioseqkits/` package with parser, sequence model, operations, stats, and CLI modules
39
+ - pytest-based unit tests under `tests/`
40
+ - Jupyter notebooks for interactive sequence analysis and validation
41
+
42
+ ## Key Features
43
+
44
+ ### 1. FASTA / FASTQ Parsing
45
+ - Pure Python parser for FASTA and FASTQ files
46
+ - Supports both plain and gzip-compressed files
47
+ - Generator-based streaming parsing for low memory usage
48
+ - Handles multi-line sequences, blank lines, and common FASTQ formatting edge cases
49
+
50
+ ### 2. Sequence Operations
51
+ - `reverse_complement` for DNA reverse complement computation
52
+ - `dna_to_rna` and `rna_to_dna` conversion
53
+ - `translate` for frame-specific translation
54
+ - `six_frame_translation` for all six reading frames
55
+ - `kmer_frequency` and `top_k_kmers` for k-mer analysis
56
+
57
+ ### 3. Sequence Statistics
58
+ - Per-record sequence length
59
+ - GC content calculation
60
+ - N base ratio calculation
61
+ - Base composition percentages for A/C/G/T/N
62
+ - Convenience functions like `calculate_sequence_stats`
63
+
64
+ ### 4. Command-Line Interface (CLI)
65
+ Supported subcommands:
66
+ - `stats` — sequence statistics table output
67
+ - `revcomp` — reverse complement sequence generation
68
+ - `translate` — protein translation of input sequences
69
+
70
+ ### 5. Testing and Notebooks
71
+ - `pytest` test suite for parser and algorithm edge cases
72
+ - Notebook examples for FASTA analysis and validation
73
+ - Current notebook file: `analysis_sequence_fasta.ipynb`
74
+
75
+ ## Installation
76
+
77
+ ### Clone repository
78
+ ```bash
79
+ git clone https://github.com/Neuromancer-P/bioseqkits.git
80
+ cd bioseqkits
81
+ ```
82
+
83
+ ### Install package
84
+ ```bash
85
+ pip install .
86
+ ```
87
+
88
+ ### Install development dependencies
89
+ ```bash
90
+ pip install -e .[dev]
91
+ ```
92
+
93
+ ## Quick Start
94
+
95
+ ### Python API example
96
+ ```python
97
+ from bioseqkits.parser import parse_fasta
98
+ from bioseqkits.operations import reverse_complement, dna_to_rna
99
+ from bioseqkits.stats import calculate_sequence_stats
100
+
101
+ with open('tests/data/sequence.fasta') as fh:
102
+ for record in parse_fasta(fh):
103
+ stats = calculate_sequence_stats(record)
104
+ rc = reverse_complement(record.seq)
105
+ rna = dna_to_rna(record.seq)
106
+ print(record.id, stats['length'], stats['gc_content'], rc[:20], rna[:20])
107
+ ```
108
+
109
+ ### Run CLI commands
110
+ ```bash
111
+ bioseqkits stats tests/data/sequence.fasta
112
+ bioseqkits revcomp tests/data/sequence.fasta -o output_revcomp.fasta
113
+ bioseqkits translate tests/data/sequence.fasta --frame 0
114
+ ```
115
+
116
+ ### Run tests
117
+ ```bash
118
+ pytest tests/ -v
119
+ ```
120
+
121
+ ## Project Structure
122
+
123
+ ```
124
+ src/bioseqkits/
125
+ ├── cli.py
126
+ ├── models.py
127
+ ├── operations.py
128
+ ├── parser.py
129
+ ├── stats.py
130
+ ├── utils.py
131
+ ├── __init__.py
132
+ tests/
133
+ ├── __init__.py
134
+ ├── conftest.py
135
+ ├── test_bioseqkit.py
136
+ ├── data/
137
+ │ └── sequence.fasta
138
+ analysis_sequence_fasta.ipynb
139
+ bioseqkit_test.ipynb
140
+ pyproject.toml
141
+ README.md
142
+ setup.py
143
+ ```
144
+
145
+ ## Packaging
146
+
147
+ This project is configured with `pyproject.toml` and uses `setuptools` for packaging. The package exposes a console script named `bioseqkits` that points to `bioseqkits.cli:main`.
148
+
149
+ ## License
150
+
151
+ MIT License
@@ -0,0 +1,123 @@
1
+ # bioseqkits
2
+
3
+ A lightweight modular Python toolkit for FASTA/FASTQ parsing and sequence analysis, implemented from scratch with pure Python.
4
+
5
+ ## Project Overview
6
+
7
+ `bioseqkits` is a teaching-oriented toolkit that recreates core FASTA/FASTQ parsing and sequence analysis logic without relying on `Bio.SeqIO` or other sequence parsing libraries. It uses Python generators to parse sequence files in a streaming fashion, supports plain and gzip-compressed input, and exposes common bioinformatics operations through both a Python API and a command-line interface.
8
+
9
+ This repository currently includes:
10
+ - `src/bioseqkits/` package with parser, sequence model, operations, stats, and CLI modules
11
+ - pytest-based unit tests under `tests/`
12
+ - Jupyter notebooks for interactive sequence analysis and validation
13
+
14
+ ## Key Features
15
+
16
+ ### 1. FASTA / FASTQ Parsing
17
+ - Pure Python parser for FASTA and FASTQ files
18
+ - Supports both plain and gzip-compressed files
19
+ - Generator-based streaming parsing for low memory usage
20
+ - Handles multi-line sequences, blank lines, and common FASTQ formatting edge cases
21
+
22
+ ### 2. Sequence Operations
23
+ - `reverse_complement` for DNA reverse complement computation
24
+ - `dna_to_rna` and `rna_to_dna` conversion
25
+ - `translate` for frame-specific translation
26
+ - `six_frame_translation` for all six reading frames
27
+ - `kmer_frequency` and `top_k_kmers` for k-mer analysis
28
+
29
+ ### 3. Sequence Statistics
30
+ - Per-record sequence length
31
+ - GC content calculation
32
+ - N base ratio calculation
33
+ - Base composition percentages for A/C/G/T/N
34
+ - Convenience functions like `calculate_sequence_stats`
35
+
36
+ ### 4. Command-Line Interface (CLI)
37
+ Supported subcommands:
38
+ - `stats` — sequence statistics table output
39
+ - `revcomp` — reverse complement sequence generation
40
+ - `translate` — protein translation of input sequences
41
+
42
+ ### 5. Testing and Notebooks
43
+ - `pytest` test suite for parser and algorithm edge cases
44
+ - Notebook examples for FASTA analysis and validation
45
+ - Current notebook file: `analysis_sequence_fasta.ipynb`
46
+
47
+ ## Installation
48
+
49
+ ### Clone repository
50
+ ```bash
51
+ git clone https://github.com/Neuromancer-P/bioseqkits.git
52
+ cd bioseqkits
53
+ ```
54
+
55
+ ### Install package
56
+ ```bash
57
+ pip install .
58
+ ```
59
+
60
+ ### Install development dependencies
61
+ ```bash
62
+ pip install -e .[dev]
63
+ ```
64
+
65
+ ## Quick Start
66
+
67
+ ### Python API example
68
+ ```python
69
+ from bioseqkits.parser import parse_fasta
70
+ from bioseqkits.operations import reverse_complement, dna_to_rna
71
+ from bioseqkits.stats import calculate_sequence_stats
72
+
73
+ with open('tests/data/sequence.fasta') as fh:
74
+ for record in parse_fasta(fh):
75
+ stats = calculate_sequence_stats(record)
76
+ rc = reverse_complement(record.seq)
77
+ rna = dna_to_rna(record.seq)
78
+ print(record.id, stats['length'], stats['gc_content'], rc[:20], rna[:20])
79
+ ```
80
+
81
+ ### Run CLI commands
82
+ ```bash
83
+ bioseqkits stats tests/data/sequence.fasta
84
+ bioseqkits revcomp tests/data/sequence.fasta -o output_revcomp.fasta
85
+ bioseqkits translate tests/data/sequence.fasta --frame 0
86
+ ```
87
+
88
+ ### Run tests
89
+ ```bash
90
+ pytest tests/ -v
91
+ ```
92
+
93
+ ## Project Structure
94
+
95
+ ```
96
+ src/bioseqkits/
97
+ ├── cli.py
98
+ ├── models.py
99
+ ├── operations.py
100
+ ├── parser.py
101
+ ├── stats.py
102
+ ├── utils.py
103
+ ├── __init__.py
104
+ tests/
105
+ ├── __init__.py
106
+ ├── conftest.py
107
+ ├── test_bioseqkit.py
108
+ ├── data/
109
+ │ └── sequence.fasta
110
+ analysis_sequence_fasta.ipynb
111
+ bioseqkit_test.ipynb
112
+ pyproject.toml
113
+ README.md
114
+ setup.py
115
+ ```
116
+
117
+ ## Packaging
118
+
119
+ This project is configured with `pyproject.toml` and uses `setuptools` for packaging. The package exposes a console script named `bioseqkits` that points to `bioseqkits.cli:main`.
120
+
121
+ ## License
122
+
123
+ MIT License
@@ -0,0 +1,48 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "bioseqkits"
7
+ version = "0.1.0"
8
+ authors = [
9
+ { name = "Ouyang Qinglang", email = "ouyang725@sjtu.edu.cn" },
10
+ { name = "Lin Yushu", email = "benzi0228@sjtu.edu.cn" },
11
+ { name = "Liu Yijun", email = "liu_yijun@sjtu.edu.cn" }
12
+ ]
13
+ description = "Lightweight Python toolkit for FASTA/FASTQ parsing and bioinformatics sequence analysis."
14
+ readme = {file = "README.md", content-type = "text/markdown"}
15
+ license = { text = "MIT" }
16
+ classifiers = [
17
+ "Programming Language :: Python :: 3.10",
18
+ "Operating System :: OS Independent",
19
+ "Intended Audience :: Science/Research",
20
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
21
+ "License :: OSI Approved :: MIT License"
22
+ ]
23
+ requires-python = ">=3.9"
24
+ dependencies = []
25
+
26
+ [project.optional-dependencies]
27
+ test = [
28
+ "pytest>=7.0",
29
+ ]
30
+ notebook = [
31
+ "matplotlib>=3.5",
32
+ "seaborn>=0.11",
33
+ ]
34
+ dev = [
35
+ "pytest>=7.0",
36
+ "jupyter",
37
+ "black",
38
+ "isort"
39
+ ]
40
+
41
+ [project.urls]
42
+ "Homepage" = "https://github.com/Neuromancer-P/bioseqkits"
43
+
44
+ [project.scripts]
45
+ bioseqkits = "bioseqkits.cli:main"
46
+
47
+ [tool.setuptools.packages.find]
48
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,37 @@
1
+ from pathlib import Path
2
+ from setuptools import setup, find_packages
3
+
4
# Directory that contains this setup script; README.md is read relative to it
# so the build does not depend on the current working directory.
here = Path(__file__).resolve().parent
long_description = here.joinpath("README.md").read_text(encoding="utf-8")

setup(
    # --- identity -----------------------------------------------------------
    name="bioseqkits",
    version="0.1.0",
    license="MIT",
    author="Ouyang Qinglang, Lin Yushu, Liu Yijun",
    author_email="ouyang725@sjtu.edu.cn, benzi0228@sjtu.edu.cn, liu_yijun@sjtu.edu.cn",
    # --- description shown on the package index -----------------------------
    description="Lightweight Python toolkit for FASTA/FASTQ parsing and bioinformatics sequence analysis.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    classifiers=[
        "Programming Language :: Python :: 3.10",
        "Operating System :: OS Independent",
        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
        "License :: OSI Approved :: MIT License",
    ],
    # --- package discovery: sources live under the src/ directory -----------
    package_dir={"": "src"},
    packages=find_packages(where="src"),
    # --- requirements: no runtime dependencies, only optional extras --------
    python_requires=">=3.9",
    install_requires=[],
    extras_require={
        "test": ["pytest>=7.0"],
        "notebook": ["matplotlib>=3.5", "seaborn>=0.11"],
        "dev": ["pytest>=7.0", "jupyter", "black", "isort"],
    },
    # --- console script `bioseqkits` dispatches to bioseqkits.cli:main ------
    entry_points={
        "console_scripts": [
            "bioseqkits=bioseqkits.cli:main",
        ],
    },
)
@@ -0,0 +1,3 @@
1
+ """bioseqkits - A lightweight FASTA/FASTQ sequence processing toolkit."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,3 @@
1
+ """bioseqkits - A lightweight FASTA/FASTQ sequence processing toolkit."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,228 @@
1
+ """
2
+ src/bioseqkits/cli.py
3
+ Command-line interface module.
4
+ """
5
+ import argparse
6
+ import sys
7
+ from typing import List, Dict, Optional
8
+ from pathlib import Path
9
+
10
+ from bioseqkits.parser import parse_fasta, parse_fastq
11
+ from bioseqkits.stats import calculate_sequence_stats
12
+ from bioseqkits.operations import reverse_complement, six_frame_translation
13
+ from bioseqkits.utils import FileFormat, open_sequence_file, detect_format
14
+ from bioseqkits.models import SeqRecord
15
+
16
+
17
def cmd_stats(args):
    """Run the ``stats`` subcommand.

    Reads every record from ``args.input``, using the FASTA or FASTQ parser
    according to what ``detect_format`` reports, computes statistics for each
    record with ``calculate_sequence_stats`` and prints all rows in the
    layout named by ``args.format`` (``'table'`` or ``'csv'``).

    Any failure is reported on stderr and ends the process with exit
    status 1.
    """
    source_path = args.input
    layout = args.format

    try:
        detected = detect_format(source_path)

        rows = []
        with open_sequence_file(source_path) as handle:
            if detected == FileFormat.FASTA:
                records = parse_fasta(handle)
            elif detected == FileFormat.FASTQ:
                records = parse_fastq(handle)
            else:
                print(f"错误：不支持的文件格式 '{detected}'", file=sys.stderr)
                sys.exit(1)

            # Each row is the record identity followed by its statistics;
            # statistic keys win if they collide with 'id' / 'description'.
            for record in records:
                row = {'id': record.id, 'description': record.description}
                row.update(calculate_sequence_stats(record))
                rows.append(row)

        # Unknown layouts fall through silently, as with the if/elif form.
        emit = {'table': print_table, 'csv': print_csv}.get(layout)
        if emit is not None:
            emit(rows)

    except FileNotFoundError:
        print(f"错误：文件 '{source_path}' 不存在", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        print(f"错误：{exc}", file=sys.stderr)
        sys.exit(1)
57
+
58
+
59
def cmd_revcomp(args):
    """Run the ``revcomp`` subcommand.

    Parses ``args.input`` (FASTA or FASTQ, as reported by ``detect_format``)
    and emits the reverse complement of every record as FASTA-style text:
    a ``>{id}_revcomp {description}`` header line followed by the sequence.
    Output goes to ``args.output`` when given, otherwise to stdout.

    Any failure is reported on stderr and ends the process with exit
    status 1.
    """
    import contextlib

    file_path = args.input
    output_file = args.output

    try:
        file_format = detect_format(file_path)

        with open_sequence_file(file_path) as file_handle:
            if file_format == FileFormat.FASTA:
                parser = parse_fasta(file_handle)
            elif file_format == FileFormat.FASTQ:
                parser = parse_fastq(file_handle)
            else:
                print(f"错误：不支持的文件格式 '{file_format}'", file=sys.stderr)
                sys.exit(1)

            # One write loop serves both destinations.  The file is opened
            # with an explicit encoding so the result does not depend on the
            # platform locale; stdout is wrapped in nullcontext so it is not
            # closed when the block ends.
            if output_file:
                sink = open(output_file, 'w', encoding='utf-8')
            else:
                sink = contextlib.nullcontext(sys.stdout)

            with sink as out_handle:
                for seq_record in parser:
                    rc_seq = reverse_complement(seq_record.seq)
                    # rstrip() avoids a dangling space after the id when the
                    # record's description is empty.
                    header = f">{seq_record.id}_revcomp {seq_record.description}".rstrip()
                    out_handle.write(f"{header}\n{rc_seq}\n")

            if output_file:
                print(f"✅ 反向互补序列已写入：{output_file}")

    except FileNotFoundError:
        print(f"错误：文件 '{file_path}' 不存在", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"错误：{e}", file=sys.stderr)
        sys.exit(1)
96
+
97
+
98
def cmd_translate(args):
    """Run the ``translate`` subcommand.

    Parses ``args.input`` (FASTA or FASTQ, as reported by ``detect_format``)
    and prints protein translations as FASTA-style text on stdout.  When
    ``args.frame`` is ``None`` every record is passed to
    ``six_frame_translation``; otherwise it is translated in the single
    requested frame.

    Any failure is reported on stderr and ends the process with exit
    status 1.
    """
    file_path = args.input
    frame = args.frame

    try:
        file_format = detect_format(file_path)

        with open_sequence_file(file_path) as file_handle:
            if file_format == FileFormat.FASTA:
                parser = parse_fasta(file_handle)
            elif file_format == FileFormat.FASTQ:
                parser = parse_fastq(file_handle)
            else:
                print(f"错误：不支持的文件格式 '{file_format}'", file=sys.stderr)
                sys.exit(1)

            # Resolved once per invocation rather than once per record.
            from bioseqkits.operations import translate

            for seq_record in parser:
                if frame is None:
                    # Six-frame mode: the labels assume the first three
                    # results are forward frames and the last three are
                    # reverse-complement frames -- TODO confirm against
                    # six_frame_translation.
                    proteins = six_frame_translation(seq_record.seq)
                    for i, protein in enumerate(proteins):
                        frame_name = f"Frame {i+1}" if i < 3 else f"RC Frame {i-2}"
                        print(f">{seq_record.id} {frame_name}")
                        print(protein)
                else:
                    # NOTE(review): this label echoes the 0-based --frame
                    # value, whereas six-frame mode labels frames from 1.
                    protein = translate(seq_record.seq, frame)
                    print(f">{seq_record.id} Frame {frame}")
                    print(protein)

    except FileNotFoundError:
        print(f"错误：文件 '{file_path}' 不存在", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"错误：{e}", file=sys.stderr)
        sys.exit(1)
136
+
137
+
138
def print_table(results: List[Dict]):
    """Print statistics rows as a fixed-width, pipe-separated table.

    Each column is right-aligned in 12 characters.  Float cells are shown
    with two decimals, everything else via ``str``; keys missing from a row
    are rendered as blank cells.  An empty ``results`` prints a single
    "no data" line instead of a table.
    """
    if not results:
        print("无数据")
        return

    width = 12
    separator = " | "
    columns = ('id', 'length', 'gc_content', 'n_ratio', 'A', 'C', 'G', 'T', 'N')

    def render(cell):
        # Floats get a fixed two-decimal form; other values are stringified.
        if isinstance(cell, float):
            return f"{cell:>{width}.2f}"
        return str(cell).rjust(width)

    header = separator.join(name.rjust(width) for name in columns)
    print(header)
    print("-" * len(header))

    for record in results:
        print(separator.join(render(record.get(name, '')) for name in columns))
162
+
163
+
164
def print_csv(results: List[Dict]):
    """Print statistics rows to stdout as CSV.

    The first line is the column header; each following line is one row of
    ``results``.  Keys missing from a row become empty fields.  Nothing at
    all is printed when ``results`` is empty.

    Quoting is delegated to the standard ``csv`` module, so fields that
    contain commas, double quotes or line breaks are quoted and embedded
    quotes are doubled.
    """
    import csv

    if not results:
        return

    columns = ['id', 'description', 'length', 'gc_content', 'n_ratio', 'A', 'C', 'G', 'T', 'N']

    # lineterminator="\n" keeps the same line endings print() produced.
    writer = csv.writer(sys.stdout, lineterminator="\n")
    writer.writerow(columns)
    for row in results:
        # Stringify up front so every value is rendered exactly as str()
        # would render it (e.g. None stays "None").
        writer.writerow([str(row.get(col, '')) for col in columns])
185
+
186
+
187
def main():
    """Command-line entry point.

    Builds the ``bioseqkits`` argument parser with the ``stats``,
    ``revcomp`` and ``translate`` subcommands, parses the process arguments
    and calls the handler registered for the chosen subcommand.  When no
    subcommand is given, the help text is printed and the process exits
    with status 1.
    """
    root = argparse.ArgumentParser(
        prog='bioseqkits',
        description='生物序列处理工具包',
    )
    commands = root.add_subparsers(dest='command', help='可用命令')

    # (subcommand name, help text, handler) for every subcommand.
    specs = (
        ('stats', '序列统计分析', cmd_stats),
        ('revcomp', '反向互补序列', cmd_revcomp),
        ('translate', '六框翻译', cmd_translate),
    )

    # Every subcommand takes the same positional input path and stores its
    # handler under ``func`` so dispatch below is a single call.
    by_name = {}
    for name, summary, handler in specs:
        sub = commands.add_parser(name, help=summary)
        sub.add_argument('input', help='输入文件路径 (FASTA/FASTQ)')
        sub.set_defaults(func=handler)
        by_name[name] = sub

    # Subcommand-specific options.
    by_name['stats'].add_argument(
        '--format',
        choices=['table', 'csv'],
        default='table',
        help='输出格式 (默认: table)',
    )
    by_name['revcomp'].add_argument(
        '--output', '-o',
        help='输出文件路径 (默认: 终端)',
    )
    by_name['translate'].add_argument(
        '--frame',
        type=int,
        choices=[0, 1, 2],
        help='阅读框 (0-2，默认: 六框翻译)',
    )

    namespace = root.parse_args()

    if namespace.command is None:
        root.print_help()
        sys.exit(1)

    namespace.func(namespace)


if __name__ == '__main__':
    main()