FracSim 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. fracsim-1.0.0/LICENSE +21 -0
  2. fracsim-1.0.0/PKG-INFO +68 -0
  3. fracsim-1.0.0/README.md +35 -0
  4. fracsim-1.0.0/fracsim/FracSim.egg-info/PKG-INFO +68 -0
  5. fracsim-1.0.0/fracsim/FracSim.egg-info/SOURCES.txt +27 -0
  6. fracsim-1.0.0/fracsim/FracSim.egg-info/dependency_links.txt +1 -0
  7. fracsim-1.0.0/fracsim/FracSim.egg-info/entry_points.txt +2 -0
  8. fracsim-1.0.0/fracsim/FracSim.egg-info/requires.txt +4 -0
  9. fracsim-1.0.0/fracsim/FracSim.egg-info/top_level.txt +4 -0
  10. fracsim-1.0.0/fracsim/input_layer/__init__.py +13 -0
  11. fracsim-1.0.0/fracsim/input_layer/cli.py +119 -0
  12. fracsim-1.0.0/fracsim/input_layer/file_reader.py +243 -0
  13. fracsim-1.0.0/fracsim/input_layer/parser/__init__.py +6 -0
  14. fracsim-1.0.0/fracsim/input_layer/parser/fasta_parser.py +84 -0
  15. fracsim-1.0.0/fracsim/input_layer/parser/fastq_parser.py +102 -0
  16. fracsim-1.0.0/fracsim/output_layer/__init__.py +7 -0
  17. fracsim-1.0.0/fracsim/output_layer/console.py +91 -0
  18. fracsim-1.0.0/fracsim/output_layer/file_writer.py +66 -0
  19. fracsim-1.0.0/fracsim/output_layer/formatter.py +159 -0
  20. fracsim-1.0.0/fracsim/process_layer/__init__.py +16 -0
  21. fracsim-1.0.0/fracsim/process_layer/ani.py +83 -0
  22. fracsim-1.0.0/fracsim/process_layer/jaccard.py +50 -0
  23. fracsim-1.0.0/fracsim/process_layer/kmer_sketch.py +265 -0
  24. fracsim-1.0.0/fracsim/process_layer/models.py +72 -0
  25. fracsim-1.0.0/fracsim/utils/__init__.py +6 -0
  26. fracsim-1.0.0/fracsim/utils/hash.py +58 -0
  27. fracsim-1.0.0/fracsim/utils/validator.py +64 -0
  28. fracsim-1.0.0/setup.cfg +4 -0
  29. fracsim-1.0.0/setup.py +39 -0
fracsim-1.0.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Julian
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
fracsim-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,68 @@
1
+ Metadata-Version: 2.4
2
+ Name: FracSim
3
+ Version: 1.0.0
4
+ Summary: a FracMinHash-based genome similarity estimator for bacteria
5
+ Home-page: https://github.com/zhuyu534/FracSim.git
6
+ Author: YuZhu
7
+ Author-email: zhuyu1068@gmail.com
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Requires-Python: >=3.8
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: mmh3>=4.0.0
20
+ Requires-Dist: numpy>=1.21.0
21
+ Requires-Dist: pytest>=7.0.0
22
+ Requires-Dist: pytest-cov>=4.0.0
23
+ Dynamic: author
24
+ Dynamic: author-email
25
+ Dynamic: classifier
26
+ Dynamic: description
27
+ Dynamic: description-content-type
28
+ Dynamic: home-page
29
+ Dynamic: license-file
30
+ Dynamic: requires-dist
31
+ Dynamic: requires-python
32
+ Dynamic: summary
33
+
34
+ ## <img src="images/logo.png" width="40" height="40" style="border-radius: 50%; vertical-align: text-bottom; margin-right: 8px;"> FracSim: fast bacterial genome similarity estimation using FracMinHash sketching
35
+
36
+ [![Latest Version](https://img.shields.io/github/v/release/zhuyu534/FracSim?color=red)](https://github.com/zhuyu534/FracSim/releases)
37
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
38
+ [![Platform](https://img.shields.io/badge/platform-linux%20%7C%20macos%20%7C%20windows-lightgrey?color=orange)](https://github.com/zhuyu534/FracSim)
39
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
40
+
41
+ </div>
42
+
43
+ FracSim is a **fast** and **accurate** tool for estimating **bacterial** genome similarity, based on the **FracMinHash** genome sketching algorithm. It compresses large genomes into compact hash sets to rapidly compute **Jaccard similarity** and **ANI (Average Nucleotide Identity)** between genomes.
44
+
45
+ Whether for **species identification**, **strain typing**, or **large-scale genome comparison**, FracSim significantly reduces memory usage and computation time while maintaining high accuracy.
46
+
47
+ Documentation: https://zhuyu534.github.io/FracSim
48
+
49
+ ## ✨ Features
50
+
51
+ - **Fast**: Uses FracMinHash sketching to dramatically lower memory footprint and runtime.
52
+ - **Accurate**: Provides Jaccard index and ANI (Average Nucleotide Identity) estimates.
53
+ - **Flexible**: Supports FASTA/Q formats, configurable k‑mer size and sampling rate.
54
+ - **Easy to use**: Clean command‑line interface with multi‑threading support.
55
+ - **Open source**: MIT licensed – contributions and usage are welcome.
56
+
57
+ ## 📦 Installation
58
+
59
+ ### Requirements
60
+ - Python 3.8 or higher
61
+ - pip package manager
62
+
63
+ ### Install from source
64
+
65
+ ```bash
66
+ git clone https://github.com/zhuyu534/FracSim.git
67
+ cd FracSim
68
+ pip install -e .
@@ -0,0 +1,35 @@
1
+ ## <img src="images/logo.png" width="40" height="40" style="border-radius: 50%; vertical-align: text-bottom; margin-right: 8px;"> FracSim: fast bacterial genome similarity estimation using FracMinHash sketching
2
+
3
+ [![Latest Version](https://img.shields.io/github/v/release/zhuyu534/FracSim?color=red)](https://github.com/zhuyu534/FracSim/releases)
4
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
5
+ [![Platform](https://img.shields.io/badge/platform-linux%20%7C%20macos%20%7C%20windows-lightgrey?color=orange)](https://github.com/zhuyu534/FracSim)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+
8
+ </div>
9
+
10
+ FracSim is a **fast** and **accurate** tool for estimating **bacterial** genome similarity, based on the **FracMinHash** genome sketching algorithm. It compresses large genomes into compact hash sets to rapidly compute **Jaccard similarity** and **ANI (Average Nucleotide Identity)** between genomes.
11
+
12
+ Whether for **species identification**, **strain typing**, or **large-scale genome comparison**, FracSim significantly reduces memory usage and computation time while maintaining high accuracy.
13
+
14
+ Documentation: https://zhuyu534.github.io/FracSim
15
+
16
+ ## ✨ Features
17
+
18
+ - **Fast**: Uses FracMinHash sketching to dramatically lower memory footprint and runtime.
19
+ - **Accurate**: Provides Jaccard index and ANI (Average Nucleotide Identity) estimates.
20
+ - **Flexible**: Supports FASTA/Q formats, configurable k‑mer size and sampling rate.
21
+ - **Easy to use**: Clean command‑line interface with multi‑threading support.
22
+ - **Open source**: MIT licensed – contributions and usage are welcome.
23
+
24
+ ## 📦 Installation
25
+
26
+ ### Requirements
27
+ - Python 3.8 or higher
28
+ - pip package manager
29
+
30
+ ### Install from source
31
+
32
+ ```bash
33
+ git clone https://github.com/zhuyu534/FracSim.git
34
+ cd FracSim
35
+ pip install -e .
@@ -0,0 +1,68 @@
1
+ Metadata-Version: 2.4
2
+ Name: FracSim
3
+ Version: 1.0.0
4
+ Summary: a FracMinHash-based genome similarity estimator for bacteria
5
+ Home-page: https://github.com/zhuyu534/FracSim.git
6
+ Author: YuZhu
7
+ Author-email: zhuyu1068@gmail.com
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Requires-Python: >=3.8
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: mmh3>=4.0.0
20
+ Requires-Dist: numpy>=1.21.0
21
+ Requires-Dist: pytest>=7.0.0
22
+ Requires-Dist: pytest-cov>=4.0.0
23
+ Dynamic: author
24
+ Dynamic: author-email
25
+ Dynamic: classifier
26
+ Dynamic: description
27
+ Dynamic: description-content-type
28
+ Dynamic: home-page
29
+ Dynamic: license-file
30
+ Dynamic: requires-dist
31
+ Dynamic: requires-python
32
+ Dynamic: summary
33
+
34
+ ## <img src="images/logo.png" width="40" height="40" style="border-radius: 50%; vertical-align: text-bottom; margin-right: 8px;"> FracSim: fast bacterial genome similarity estimation using FracMinHash sketching
35
+
36
+ [![Latest Version](https://img.shields.io/github/v/release/zhuyu534/FracSim?color=red)](https://github.com/zhuyu534/FracSim/releases)
37
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
38
+ [![Platform](https://img.shields.io/badge/platform-linux%20%7C%20macos%20%7C%20windows-lightgrey?color=orange)](https://github.com/zhuyu534/FracSim)
39
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
40
+
41
+ </div>
42
+
43
+ FracSim is a **fast** and **accurate** tool for estimating **bacterial** genome similarity, based on the **FracMinHash** genome sketching algorithm. It compresses large genomes into compact hash sets to rapidly compute **Jaccard similarity** and **ANI (Average Nucleotide Identity)** between genomes.
44
+
45
+ Whether for **species identification**, **strain typing**, or **large-scale genome comparison**, FracSim significantly reduces memory usage and computation time while maintaining high accuracy.
46
+
47
+ Documentation: https://zhuyu534.github.io/FracSim
48
+
49
+ ## ✨ Features
50
+
51
+ - **Fast**: Uses FracMinHash sketching to dramatically lower memory footprint and runtime.
52
+ - **Accurate**: Provides Jaccard index and ANI (Average Nucleotide Identity) estimates.
53
+ - **Flexible**: Supports FASTA/Q formats, configurable k‑mer size and sampling rate.
54
+ - **Easy to use**: Clean command‑line interface with multi‑threading support.
55
+ - **Open source**: MIT licensed – contributions and usage are welcome.
56
+
57
+ ## 📦 Installation
58
+
59
+ ### Requirements
60
+ - Python 3.8 or higher
61
+ - pip package manager
62
+
63
+ ### Install from source
64
+
65
+ ```bash
66
+ git clone https://github.com/zhuyu534/FracSim.git
67
+ cd FracSim
68
+ pip install -e .
@@ -0,0 +1,27 @@
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ fracsim/FracSim.egg-info/PKG-INFO
5
+ fracsim/FracSim.egg-info/SOURCES.txt
6
+ fracsim/FracSim.egg-info/dependency_links.txt
7
+ fracsim/FracSim.egg-info/entry_points.txt
8
+ fracsim/FracSim.egg-info/requires.txt
9
+ fracsim/FracSim.egg-info/top_level.txt
10
+ fracsim/input_layer/__init__.py
11
+ fracsim/input_layer/cli.py
12
+ fracsim/input_layer/file_reader.py
13
+ fracsim/input_layer/parser/__init__.py
14
+ fracsim/input_layer/parser/fasta_parser.py
15
+ fracsim/input_layer/parser/fastq_parser.py
16
+ fracsim/output_layer/__init__.py
17
+ fracsim/output_layer/console.py
18
+ fracsim/output_layer/file_writer.py
19
+ fracsim/output_layer/formatter.py
20
+ fracsim/process_layer/__init__.py
21
+ fracsim/process_layer/ani.py
22
+ fracsim/process_layer/jaccard.py
23
+ fracsim/process_layer/kmer_sketch.py
24
+ fracsim/process_layer/models.py
25
+ fracsim/utils/__init__.py
26
+ fracsim/utils/hash.py
27
+ fracsim/utils/validator.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ fracsim = fracsim.main:main
@@ -0,0 +1,4 @@
1
+ mmh3>=4.0.0
2
+ numpy>=1.21.0
3
+ pytest>=7.0.0
4
+ pytest-cov>=4.0.0
@@ -0,0 +1,4 @@
1
+ input_layer
2
+ output_layer
3
+ process_layer
4
+ utils
@@ -0,0 +1,13 @@
1
+ """Input layer package: re-exports the CLI argument parser, the file reader and the FASTA/FASTQ parsers."""
2
+
3
+ from .cli import parse_arguments
4
+ from .file_reader import FileReader
5
+ from .parser.fasta_parser import FastaParser
6
+ from .parser.fastq_parser import FastqParser
7
+
8
+ __all__ = [
9
+ 'parse_arguments',
10
+ 'FileReader',
11
+ 'FastaParser',
12
+ 'FastqParser'
13
+ ]
@@ -0,0 +1,119 @@
1
+ """命令行参数解析模块"""
2
+
3
+ import argparse #命令行解析工具
4
+ from ..version import __version__, __description__
5
+
6
+
7
def parse_arguments(argv=None):
    """
    Build the FracSim command-line parser, parse the arguments and validate them.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse reads them from ``sys.argv[1:]``, which is the
            behaviour callers had before this parameter existed.

    Returns:
        argparse.Namespace: the parsed and range-checked arguments.

    Note:
        Invalid values are reported through ``parser.error``, which prints the
        usage message and terminates the process.
    """
    parser = argparse.ArgumentParser(
        usage="%(prog)s [-h] (-i INPUT [INPUT ...] | -l LIST) [options]",
        description=__description__,
        # Raw formatter keeps the line breaks of the examples in the epilog.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "\n"
            "Examples:\n"
            "%(prog)s -i genome1.fasta genome2.fasta -k 21 -s 0.01\n"
            "%(prog)s -l genomes.txt -k 21 -s 0.001 -o output_dir/results.csv\n"
            "%(prog)s -i genome1.fastq.gz genome2.fastq.gz --threads 4 --ani -V\n"
        ),
    )

    # Input source: exactly one of -i / -l must be given.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        '-i', '--input',
        nargs='+',  # accepts one or more file paths
        help='从基因组文件输入,多个文件之间使用空格分隔'
    )
    input_group.add_argument(
        '-l', '--list',
        help='从包含基因组文件路径的列表文件输入(每行一个文件路径)'
    )

    # Core algorithm parameters (k-mer size, sampling rate, hash seed).
    parser.add_argument(
        '-k', '--kmer-size',
        type=int,
        default=31,
        help='k-mer长度,取值范围[1-64],默认值31'
    )
    parser.add_argument(
        '-s', '--scaled',
        type=float,
        default=0.01,
        help='FracMinHash采样率,取值范围(0,1],默认值0.01'
    )
    parser.add_argument(
        '--seed',
        type=int,
        default=42,
        help='哈希函数随机种子,默认值42'
    )

    # Computation options.
    parser.add_argument(
        '-a', '--ani',
        action='store_true',
        help='计算输出ANI值(默认只计算Jaccard指数)'
    )
    parser.add_argument(
        '-t', '--threads',
        type=int,
        default=1,
        help='计算线程数,默认值1'
    )
    parser.add_argument(
        '-m', '--min-similarity',
        type=float,
        default=0.00,
        help='最小相似度阈值,只输出大于该值的结果,默认0.0'
    )

    # Output options.
    parser.add_argument(
        '-o', '--output',
        help='输出文件路径'
    )
    parser.add_argument(
        '-f', '--format',
        choices=['table', 'json', 'csv'],
        default='table',
        help='结果输出格式,默认表格格式'
    )

    # Version and verbosity.
    parser.add_argument(
        '-v', '--version',
        action='version',
        version=f'%(prog)s {__version__}',
        help='版本信息'
    )
    parser.add_argument(
        '-V', '--verbose',
        action='store_true',
        help='输出详细信息'
    )

    args = parser.parse_args(argv)

    # Range validation. The float checks are written as positive range tests
    # ("not (lo < x <= hi)") so that NaN, for which every comparison is False,
    # is rejected instead of slipping through.
    if not (1 <= args.kmer_size <= 64):
        parser.error("k-mer长度必须在1-64之间")

    if not (0 < args.scaled <= 1):
        parser.error("采样率必须在(0,1]范围内")

    if not (0 <= args.min_similarity <= 1):
        parser.error("最小相似度必须在0-1之间")

    return args
@@ -0,0 +1,243 @@
1
+ """文件读取器模块"""
2
+
3
+ import os # 文件路径和存在性检查
4
+ import sys # 错误输出信息
5
+ import gzip
6
+ import bz2
7
+ import lzma
8
+ import zipfile
9
+ import io
10
+ from typing import List, Generator, Optional,Tuple # 类型提示
11
+ from .parser.fasta_parser import FastaParser # 导入两个解析器
12
+ from .parser.fastq_parser import FastqParser
13
+ from ..process_layer.models import GenomeData # 导入 GenomeData 类
14
+
15
class FileReader:
    """Read genome files, plain or compressed, and dispatch them to the FASTA/FASTQ parsers."""

    # File suffix -> compression tag understood by _decompress_to_memory.
    _COMPRESS_EXTS = {'.gz': 'gz', '.bz2': 'bz2', '.xz': 'xz', '.zip': 'zip'}
    # Suffixes (after removing any compression suffix) routed to each parser.
    _FASTA_EXTS = ('.fasta', '.fa', '.fna', '.ffn', '.frn')
    _FASTQ_EXTS = ('.fastq', '.fq')
    # Chunk size used when copying decompressed bytes into memory.
    _CHUNK_SIZE = 8192

    def __init__(self, verbose=False):
        """
        Create a reader with one FASTA parser and one FASTQ parser.

        Args:
            verbose: when true, progress information is written to stderr.
        """
        self.verbose = verbose
        self.fasta_parser = FastaParser()
        self.fastq_parser = FastqParser()

    def get_file_list(self, input_files: List[str], list_file: Optional[str] = None) -> List[str]:
        """
        Collect every input path and keep only those that exist on disk.

        Args:
            input_files: paths given directly (may be None or empty).
            list_file: optional text file with one path per line; blank lines
                and lines starting with '#' are ignored.

        Returns:
            List[str]: the existing paths, in the order they were supplied.

        Note:
            The process exits with status 1 when the list file cannot be read
            or when no existing path remains.
        """
        file_list = []

        if input_files:
            file_list.extend(input_files)

        if list_file:
            try:
                with open(list_file, 'r') as f:
                    for line in f:
                        line = line.strip()
                        if line and not line.startswith('#'):
                            file_list.append(line)
            except (OSError, UnicodeError) as e:
                sys.stderr.write(f"读取列表文件失败 {list_file}: {e}\n")
                sys.exit(1)

        # Missing paths only produce a warning; the remaining ones are used.
        valid_files = []
        for file_path in file_list:
            if os.path.exists(file_path):
                valid_files.append(file_path)
            else:
                sys.stderr.write(f"警告: 文件不存在 {file_path}\n")

        if not valid_files:
            sys.stderr.write("错误: 没有有效的输入文件\n")
            sys.exit(1)

        if self.verbose:
            sys.stderr.write(f"找到 {len(valid_files)} 个有效文件\n")

        return valid_files

    def _detect_compression(self, file_path: str) -> Tuple[Optional[str], str]:
        """
        Infer the compression type and the underlying format suffix from the file name.

        Returns:
            (compression, base_ext): compression is None, 'gz', 'bz2', 'xz' or
            'zip'; base_ext is the lower-cased suffix that remains once a
            compression suffix has been removed (e.g. '.fasta' for
            'x.fasta.gz').
        """
        ext = os.path.splitext(file_path)[1].lower()
        compression = self._COMPRESS_EXTS.get(ext)
        if compression is None:
            return None, ext
        # Strip the compression suffix and look at the suffix underneath it.
        base_path = file_path[: -len(ext)]
        return compression, os.path.splitext(base_path)[1].lower()

    @classmethod
    def _copy_stream(cls, src, dst) -> None:
        """Copy all bytes from src to dst in fixed-size chunks."""
        while True:
            chunk = src.read(cls._CHUNK_SIZE)
            if not chunk:
                break
            dst.write(chunk)

    def _decompress_to_memory(self, file_path: str, compression: str) -> io.TextIOBase:
        """
        Decompress a file fully into memory and return it as a UTF-8 text stream.

        Args:
            file_path: path of the compressed file.
            compression: one of 'gz', 'bz2', 'xz', 'zip'. For ZIP archives the
                first non-directory member is used.

        Returns:
            A text stream positioned at the start of the decompressed data.

        Raises:
            IOError: when the compression type is unsupported, the archive has
                no readable member, or decompression fails. The original
                exception is attached as the cause.
        """
        openers = {'gz': gzip.open, 'bz2': bz2.open, 'xz': lzma.open}
        mem_buffer = io.BytesIO()

        try:
            if compression in openers:
                with openers[compression](file_path, 'rb') as src:
                    self._copy_stream(src, mem_buffer)
            elif compression == 'zip':
                with zipfile.ZipFile(file_path, 'r') as zipf:
                    member = next(
                        (name for name in zipf.namelist() if not name.endswith('/')),
                        None,
                    )
                    if member is None:
                        raise ValueError(f"ZIP文件中没有可读取的文件: {file_path}")
                    with zipf.open(member, 'r') as src:
                        self._copy_stream(src, mem_buffer)
            else:
                raise ValueError(f"不支持的压缩格式: {compression}")
        except Exception as e:
            raise IOError(f"解压文件失败 {file_path}: {e}") from e

        mem_buffer.seek(0)
        # TextIOWrapper keeps its own reference to the underlying buffer, so no
        # extra attribute is needed to keep the BytesIO alive.
        return io.TextIOWrapper(mem_buffer, encoding='utf-8')

    def read_sequences(self, file_path: str) -> Generator[tuple, None, None]:
        """
        Yield the records of a sequence file, plain or compressed (.gz/.bz2/.xz/.zip).

        The parser is chosen from the file suffix; unknown suffixes fall back
        to content-based detection. Compressed files are decompressed into
        memory before being parsed.

        Yields:
            tuple: (sequence id, sequence string, quality scores), as produced
            by the selected parser.

        Note:
            Any error is reported on stderr and ends the iteration; it is not
            propagated to the caller.
        """
        compression, base_ext = self._detect_compression(file_path)

        try:
            if base_ext in self._FASTA_EXTS:
                parser = self.fasta_parser
            elif base_ext in self._FASTQ_EXTS:
                parser = self.fastq_parser
            else:
                # Unknown suffix: decide from the file content instead.
                yield from self._auto_detect_parser(file_path, compression)
                return

            if compression:
                source = self._decompress_to_memory(file_path, compression)
            else:
                source = file_path
            yield from parser.parse(source)

        except Exception as e:
            sys.stderr.write(f"读取文件失败 {file_path}: {e}\n")

    @staticmethod
    def _first_non_blank_char(handle) -> Optional[str]:
        """Return the first non-whitespace character of a text stream, or None if there is none."""
        for line in handle:
            stripped = line.strip()
            if stripped:
                return stripped[0]
        return None

    def _parser_for_marker(self, first_char: Optional[str]):
        """Map the leading character of a file ('>' or '@') to the matching parser."""
        if not first_char:
            raise ValueError("文件为空")
        if first_char == '>':
            return self.fasta_parser
        if first_char == '@':
            return self.fastq_parser
        raise ValueError(f"无法识别的文件格式,首字符: '{first_char}'")

    def _auto_detect_parser(self, file_path: str, compression: Optional[str] = None) -> Generator[tuple, None, None]:
        """
        Detect FASTA vs FASTQ from the first non-blank character and parse the file.

        Args:
            file_path: path of the file. An already-open in-memory text stream
                is also accepted.
            compression: compression tag of file_path, or None for plain input.
                Compressed input is decompressed into memory first.

        Yields:
            The records produced by the selected parser.

        Raises:
            ValueError: when the input is empty, its format is not recognised,
                or reading/decompression fails.
        """
        try:
            if compression:
                source = self._decompress_to_memory(file_path, compression)
            else:
                source = file_path

            if isinstance(source, io.TextIOBase):
                # In-memory stream: sniff, rewind, then parse the same stream.
                with source:
                    parser = self._parser_for_marker(self._first_non_blank_char(source))
                    source.seek(0)
                    yield from parser.parse(source)
            else:
                # Path on disk: close the sniffing handle before the parser
                # opens the file itself, so no second handle stays open.
                with open(source, 'r') as handle:
                    first_char = self._first_non_blank_char(handle)
                parser = self._parser_for_marker(first_char)
                yield from parser.parse(source)

        except Exception as e:
            raise ValueError(f"自动检测失败: {e}") from e

    def read_genome(self, file_path: str) -> "GenomeData":
        """
        Read every sequence of one genome file into a single GenomeData object.

        Args:
            file_path: path of the genome file.

        Returns:
            GenomeData: built with the file path, the file's base name as
            seq_id, and the list of sequence strings read from the file.
        """
        # The file name (without directories) serves as the genome identifier.
        genome_id = os.path.basename(file_path)
        sequences = [sequence for _, sequence, _ in self.read_sequences(file_path)]

        return GenomeData(
            file_path=file_path,
            seq_id=genome_id,
            sequences=sequences
        )
243
+
@@ -0,0 +1,6 @@
1
+ """Parser package: re-exports the FASTA and FASTQ parsers."""
2
+
3
+ from .fasta_parser import FastaParser
4
+ from .fastq_parser import FastqParser
5
+
6
+ __all__ = ['FastaParser', 'FastqParser']