FracSim 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fracsim-1.0.0/LICENSE +21 -0
- fracsim-1.0.0/PKG-INFO +68 -0
- fracsim-1.0.0/README.md +35 -0
- fracsim-1.0.0/fracsim/FracSim.egg-info/PKG-INFO +68 -0
- fracsim-1.0.0/fracsim/FracSim.egg-info/SOURCES.txt +27 -0
- fracsim-1.0.0/fracsim/FracSim.egg-info/dependency_links.txt +1 -0
- fracsim-1.0.0/fracsim/FracSim.egg-info/entry_points.txt +2 -0
- fracsim-1.0.0/fracsim/FracSim.egg-info/requires.txt +4 -0
- fracsim-1.0.0/fracsim/FracSim.egg-info/top_level.txt +4 -0
- fracsim-1.0.0/fracsim/input_layer/__init__.py +13 -0
- fracsim-1.0.0/fracsim/input_layer/cli.py +119 -0
- fracsim-1.0.0/fracsim/input_layer/file_reader.py +243 -0
- fracsim-1.0.0/fracsim/input_layer/parser/__init__.py +6 -0
- fracsim-1.0.0/fracsim/input_layer/parser/fasta_parser.py +84 -0
- fracsim-1.0.0/fracsim/input_layer/parser/fastq_parser.py +102 -0
- fracsim-1.0.0/fracsim/output_layer/__init__.py +7 -0
- fracsim-1.0.0/fracsim/output_layer/console.py +91 -0
- fracsim-1.0.0/fracsim/output_layer/file_writer.py +66 -0
- fracsim-1.0.0/fracsim/output_layer/formatter.py +159 -0
- fracsim-1.0.0/fracsim/process_layer/__init__.py +16 -0
- fracsim-1.0.0/fracsim/process_layer/ani.py +83 -0
- fracsim-1.0.0/fracsim/process_layer/jaccard.py +50 -0
- fracsim-1.0.0/fracsim/process_layer/kmer_sketch.py +265 -0
- fracsim-1.0.0/fracsim/process_layer/models.py +72 -0
- fracsim-1.0.0/fracsim/utils/__init__.py +6 -0
- fracsim-1.0.0/fracsim/utils/hash.py +58 -0
- fracsim-1.0.0/fracsim/utils/validator.py +64 -0
- fracsim-1.0.0/setup.cfg +4 -0
- fracsim-1.0.0/setup.py +39 -0
fracsim-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Julian
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
fracsim-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: FracSim
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: a FracMinHash-based genome similarity estimator for bacteria
|
|
5
|
+
Home-page: https://github.com/zhuyu534/FracSim.git
|
|
6
|
+
Author: YuZhu
|
|
7
|
+
Author-email: zhuyu1068@gmail.com
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: mmh3>=4.0.0
|
|
20
|
+
Requires-Dist: numpy>=1.21.0
|
|
21
|
+
Requires-Dist: pytest>=7.0.0
|
|
22
|
+
Requires-Dist: pytest-cov>=4.0.0
|
|
23
|
+
Dynamic: author
|
|
24
|
+
Dynamic: author-email
|
|
25
|
+
Dynamic: classifier
|
|
26
|
+
Dynamic: description
|
|
27
|
+
Dynamic: description-content-type
|
|
28
|
+
Dynamic: home-page
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
Dynamic: requires-dist
|
|
31
|
+
Dynamic: requires-python
|
|
32
|
+
Dynamic: summary
|
|
33
|
+
|
|
34
|
+
## <img src="images/logo.png" width="40" height="40" style="border-radius: 50%; vertical-align: text-bottom; margin-right: 8px;"> FracSim:fast bacterial genome similarity estimation using FracMinHash sketching
|
|
35
|
+
|
|
36
|
+
[](https://github.com/zhuyu534/FracSim/releases)
|
|
37
|
+
[](https://www.python.org/downloads/)
|
|
38
|
+
[](https://github.com/zhuyu534/FracSim)
|
|
39
|
+
[](https://opensource.org/licenses/MIT)
|
|
40
|
+
|
|
41
|
+
</div>
|
|
42
|
+
|
|
43
|
+
FracSim is a **fast** and **accurate** tool for estimating **bacterial** genome similarity, based on the **FracMinHash** genome sketching algorithm. It compresses large genomes into compact hash sets to rapidly compute **Jaccard similarity** and **ANI (Average Nucleotide Identity)** between genomes.
|
|
44
|
+
|
|
45
|
+
Whether for **species identification**, **strain typing**, or **large-scale genome comparison**, FracSim significantly reduces memory usage and computation time while maintaining high accuracy.
|
|
46
|
+
|
|
47
|
+
Documents: https://zhuyu534.github.io/FracSim
|
|
48
|
+
|
|
49
|
+
## ✨ Features
|
|
50
|
+
|
|
51
|
+
- **Fast**: Uses FracMinHash sketching to dramatically lower memory footprint and runtime.
|
|
52
|
+
- **Accurate**: Provides Jaccard index and ANI (Average Nucleotide Identity) estimates.
|
|
53
|
+
- **Flexible**: Supports FASTA/Q formats, configurable k‑mer size and sampling rate.
|
|
54
|
+
- **Easy to use**: Clean command‑line interface with multi‑threading support.
|
|
55
|
+
- **Open source**: MIT licensed – contributions and usage are welcome.
|
|
56
|
+
|
|
57
|
+
## 📦 Installation
|
|
58
|
+
|
|
59
|
+
### Requirements
|
|
60
|
+
- Python 3.8 or higher
|
|
61
|
+
- pip package manager
|
|
62
|
+
|
|
63
|
+
### Install from source
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
git clone https://github.com/zhuyu534/FracSim.git
|
|
67
|
+
cd FracSim
|
|
68
|
+
pip install -e .
|
fracsim-1.0.0/README.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
## <img src="images/logo.png" width="40" height="40" style="border-radius: 50%; vertical-align: text-bottom; margin-right: 8px;"> FracSim: fast bacterial genome similarity estimation using FracMinHash sketching
|
|
2
|
+
|
|
3
|
+
<div align="center">

[](https://github.com/zhuyu534/FracSim/releases)
|
|
4
|
+
[](https://www.python.org/downloads/)
|
|
5
|
+
[](https://github.com/zhuyu534/FracSim)
|
|
6
|
+
[](https://opensource.org/licenses/MIT)
|
|
7
|
+
|
|
8
|
+
</div>
|
|
9
|
+
|
|
10
|
+
FracSim is a **fast** and **accurate** tool for estimating **bacterial** genome similarity, based on the **FracMinHash** genome sketching algorithm. It compresses large genomes into compact hash sets to rapidly compute **Jaccard similarity** and **ANI (Average Nucleotide Identity)** between genomes.
|
|
11
|
+
|
|
12
|
+
Whether for **species identification**, **strain typing**, or **large-scale genome comparison**, FracSim significantly reduces memory usage and computation time while maintaining high accuracy.
|
|
13
|
+
|
|
14
|
+
Documentation: https://zhuyu534.github.io/FracSim
|
|
15
|
+
|
|
16
|
+
## ✨ Features
|
|
17
|
+
|
|
18
|
+
- **Fast**: Uses FracMinHash sketching to dramatically lower memory footprint and runtime.
|
|
19
|
+
- **Accurate**: Provides Jaccard index and ANI (Average Nucleotide Identity) estimates.
|
|
20
|
+
- **Flexible**: Supports FASTA and FASTQ formats, with configurable k‑mer size and sampling rate.
|
|
21
|
+
- **Easy to use**: Clean command‑line interface with multi‑threading support.
|
|
22
|
+
- **Open source**: MIT licensed – contributions and usage are welcome.
|
|
23
|
+
|
|
24
|
+
## 📦 Installation
|
|
25
|
+
|
|
26
|
+
### Requirements
|
|
27
|
+
- Python 3.8 or higher
|
|
28
|
+
- pip package manager
|
|
29
|
+
|
|
30
|
+
### Install from source
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
git clone https://github.com/zhuyu534/FracSim.git
|
|
34
|
+
cd FracSim
|
|
35
|
+
pip install -e .
```
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: FracSim
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: a FracMinHash-based genome similarity estimator for bacteria
|
|
5
|
+
Home-page: https://github.com/zhuyu534/FracSim.git
|
|
6
|
+
Author: YuZhu
|
|
7
|
+
Author-email: zhuyu1068@gmail.com
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: mmh3>=4.0.0
|
|
20
|
+
Requires-Dist: numpy>=1.21.0
|
|
21
|
+
Requires-Dist: pytest>=7.0.0
|
|
22
|
+
Requires-Dist: pytest-cov>=4.0.0
|
|
23
|
+
Dynamic: author
|
|
24
|
+
Dynamic: author-email
|
|
25
|
+
Dynamic: classifier
|
|
26
|
+
Dynamic: description
|
|
27
|
+
Dynamic: description-content-type
|
|
28
|
+
Dynamic: home-page
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
Dynamic: requires-dist
|
|
31
|
+
Dynamic: requires-python
|
|
32
|
+
Dynamic: summary
|
|
33
|
+
|
|
34
|
+
## <img src="images/logo.png" width="40" height="40" style="border-radius: 50%; vertical-align: text-bottom; margin-right: 8px;"> FracSim:fast bacterial genome similarity estimation using FracMinHash sketching
|
|
35
|
+
|
|
36
|
+
[](https://github.com/zhuyu534/FracSim/releases)
|
|
37
|
+
[](https://www.python.org/downloads/)
|
|
38
|
+
[](https://github.com/zhuyu534/FracSim)
|
|
39
|
+
[](https://opensource.org/licenses/MIT)
|
|
40
|
+
|
|
41
|
+
</div>
|
|
42
|
+
|
|
43
|
+
FracSim is a **fast** and **accurate** tool for estimating **bacterial** genome similarity, based on the **FracMinHash** genome sketching algorithm. It compresses large genomes into compact hash sets to rapidly compute **Jaccard similarity** and **ANI (Average Nucleotide Identity)** between genomes.
|
|
44
|
+
|
|
45
|
+
Whether for **species identification**, **strain typing**, or **large-scale genome comparison**, FracSim significantly reduces memory usage and computation time while maintaining high accuracy.
|
|
46
|
+
|
|
47
|
+
Documents: https://zhuyu534.github.io/FracSim
|
|
48
|
+
|
|
49
|
+
## ✨ Features
|
|
50
|
+
|
|
51
|
+
- **Fast**: Uses FracMinHash sketching to dramatically lower memory footprint and runtime.
|
|
52
|
+
- **Accurate**: Provides Jaccard index and ANI (Average Nucleotide Identity) estimates.
|
|
53
|
+
- **Flexible**: Supports FASTA/Q formats, configurable k‑mer size and sampling rate.
|
|
54
|
+
- **Easy to use**: Clean command‑line interface with multi‑threading support.
|
|
55
|
+
- **Open source**: MIT licensed – contributions and usage are welcome.
|
|
56
|
+
|
|
57
|
+
## 📦 Installation
|
|
58
|
+
|
|
59
|
+
### Requirements
|
|
60
|
+
- Python 3.8 or higher
|
|
61
|
+
- pip package manager
|
|
62
|
+
|
|
63
|
+
### Install from source
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
git clone https://github.com/zhuyu534/FracSim.git
|
|
67
|
+
cd FracSim
|
|
68
|
+
pip install -e .
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
setup.py
|
|
4
|
+
fracsim/FracSim.egg-info/PKG-INFO
|
|
5
|
+
fracsim/FracSim.egg-info/SOURCES.txt
|
|
6
|
+
fracsim/FracSim.egg-info/dependency_links.txt
|
|
7
|
+
fracsim/FracSim.egg-info/entry_points.txt
|
|
8
|
+
fracsim/FracSim.egg-info/requires.txt
|
|
9
|
+
fracsim/FracSim.egg-info/top_level.txt
|
|
10
|
+
fracsim/input_layer/__init__.py
|
|
11
|
+
fracsim/input_layer/cli.py
|
|
12
|
+
fracsim/input_layer/file_reader.py
|
|
13
|
+
fracsim/input_layer/parser/__init__.py
|
|
14
|
+
fracsim/input_layer/parser/fasta_parser.py
|
|
15
|
+
fracsim/input_layer/parser/fastq_parser.py
|
|
16
|
+
fracsim/output_layer/__init__.py
|
|
17
|
+
fracsim/output_layer/console.py
|
|
18
|
+
fracsim/output_layer/file_writer.py
|
|
19
|
+
fracsim/output_layer/formatter.py
|
|
20
|
+
fracsim/process_layer/__init__.py
|
|
21
|
+
fracsim/process_layer/ani.py
|
|
22
|
+
fracsim/process_layer/jaccard.py
|
|
23
|
+
fracsim/process_layer/kmer_sketch.py
|
|
24
|
+
fracsim/process_layer/models.py
|
|
25
|
+
fracsim/utils/__init__.py
|
|
26
|
+
fracsim/utils/hash.py
|
|
27
|
+
fracsim/utils/validator.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""输入层模块"""
|
|
2
|
+
|
|
3
|
+
from .cli import parse_arguments
|
|
4
|
+
from .file_reader import FileReader
|
|
5
|
+
from .parser.fasta_parser import FastaParser
|
|
6
|
+
from .parser.fastq_parser import FastqParser
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
'parse_arguments',
|
|
10
|
+
'FileReader',
|
|
11
|
+
'FastaParser',
|
|
12
|
+
'FastqParser'
|
|
13
|
+
]
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""命令行参数解析模块"""
|
|
2
|
+
|
|
3
|
+
import argparse #命令行解析工具
|
|
4
|
+
from ..version import __version__, __description__
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def parse_arguments(argv=None):
    """
    Build the FracSim command-line parser, parse the arguments and validate them.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``, which is what a
            zero-argument call has always done.

    Returns:
        argparse.Namespace: Parsed options with the attributes ``input``,
        ``list``, ``kmer_size``, ``scaled``, ``seed``, ``ani``, ``threads``,
        ``min_similarity``, ``output``, ``format`` and ``verbose``.

    Raises:
        SystemExit: Raised by argparse for ``-h``/``-v``, for malformed
            arguments, and (via ``parser.error``) for out-of-range values.
    """
    parser = argparse.ArgumentParser(
        usage="%(prog)s [-h] (-i INPUT [INPUT ...] | -l LIST) [options]",
        description=__description__,
        # Keep the epilog's line breaks exactly as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "\n"
            "Examples:\n"
            "%(prog)s -i genome1.fasta genome2.fasta -k 21 -s 0.01\n"
            "%(prog)s -l genomes.txt -k 21 -s 0.001 -o output_dir/results.csv\n"
            "%(prog)s -i genome1.fastq.gz genome2.fastq.gz --threads 4 --ani -V\n"
        ),
    )

    # Input source: exactly one of -i / -l must be supplied.
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument(
        '-i', '--input',
        nargs='+',  # one or more file paths
        help='从基因组文件输入,多个文件之间使用空格分隔',
    )
    source.add_argument(
        '-l', '--list',
        help='从包含基因组文件路径的列表文件输入(每行一个文件路径)',
    )

    # Core algorithm parameters (k-mer size, sampling rate, hash seed).
    parser.add_argument(
        '-k', '--kmer-size',
        type=int,
        default=31,
        help='k-mer长度,取值范围[1-64],默认值31',
    )
    parser.add_argument(
        '-s', '--scaled',
        type=float,
        default=0.01,
        help='FracMinHash采样率,取值范围(0,1],默认值0.01',
    )
    parser.add_argument(
        '--seed',
        type=int,
        default=42,
        help='哈希函数随机种子,默认值42',
    )

    # Computation options.
    parser.add_argument(
        '-a', '--ani',
        action='store_true',
        help='计算输出ANI值(默认只计算Jaccard指数)',
    )
    parser.add_argument(
        '-t', '--threads',
        type=int,
        default=1,
        help='计算线程数,默认值1',
    )
    parser.add_argument(
        '-m', '--min-similarity',
        type=float,
        default=0.00,
        help='最小相似度阈值,只输出大于该值的结果,默认0.0',
    )

    # Output options.
    parser.add_argument(
        '-o', '--output',
        help='输出文件路径',
    )
    parser.add_argument(
        '-f', '--format',
        choices=['table', 'json', 'csv'],
        default='table',
        help='结果输出格式,默认表格格式',
    )

    # Version and verbosity.
    parser.add_argument(
        '-v', '--version',
        action='version',
        version=f'%(prog)s {__version__}',
        help='版本信息',
    )
    parser.add_argument(
        '-V', '--verbose',
        action='store_true',
        help='输出详细信息',
    )

    args = parser.parse_args(argv)

    # Each check is phrased as "value lies inside the range" and then negated.
    # Every comparison with NaN is False, so a NaN (which type=float accepts
    # as "nan") is rejected here instead of slipping past an "x < lo or x > hi"
    # style test.
    range_checks = (
        (1 <= args.kmer_size <= 64, "k-mer长度必须在1-64之间"),
        (0 < args.scaled <= 1, "采样率必须在(0,1]范围内"),
        (0 <= args.min_similarity <= 1, "最小相似度必须在0-1之间"),
    )
    for in_range, message in range_checks:
        if not in_range:
            parser.error(message)  # prints usage + message, exits with code 2

    return args
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""文件读取器模块"""
|
|
2
|
+
|
|
3
|
+
import os # 文件路径和存在性检查
|
|
4
|
+
import sys # 错误输出信息
|
|
5
|
+
import gzip
|
|
6
|
+
import bz2
|
|
7
|
+
import lzma
|
|
8
|
+
import zipfile
|
|
9
|
+
import io
|
|
10
|
+
from typing import List, Generator, Optional,Tuple # 类型提示
|
|
11
|
+
from .parser.fasta_parser import FastaParser # 导入两个解析器
|
|
12
|
+
from .parser.fastq_parser import FastqParser
|
|
13
|
+
from ..process_layer.models import GenomeData # 导入 GenomeData 类
|
|
14
|
+
|
|
15
|
+
class FileReader:
    """
    Read genome sequence files (FASTA / FASTQ), optionally compressed.

    Plain files are handed to the parsers by path. Files ending in
    ``.gz``, ``.bz2``, ``.xz`` or ``.zip`` are fully decompressed into memory
    first and handed to the parsers as a text stream.
    """

    # Compression suffix -> internal compression tag.
    _COMPRESS_EXTS = {'.gz': 'gz', '.bz2': 'bz2', '.xz': 'xz', '.zip': 'zip'}
    # Single-stream codecs, each opened through its stdlib ``open`` function.
    _STREAM_OPENERS = {'gz': gzip.open, 'bz2': bz2.open, 'xz': lzma.open}
    # Extensions (compression suffix already removed) routed to each parser.
    _FASTA_EXTS = ('.fasta', '.fa', '.fna', '.ffn', '.frn')
    _FASTQ_EXTS = ('.fastq', '.fq')

    def __init__(self, verbose=False):
        """
        Create a reader with one FASTA and one FASTQ parser.

        Args:
            verbose: When true, progress information is written to stderr.
        """
        self.verbose = verbose
        self.fasta_parser = FastaParser()
        self.fastq_parser = FastqParser()

    def get_file_list(self, input_files: Optional[List[str]], list_file: Optional[str] = None) -> List[str]:
        """
        Collect every input path and keep only those that exist.

        Args:
            input_files: Paths given directly; may be ``None`` or empty.
            list_file: Optional text file with one path per line. Blank lines
                and lines starting with ``#`` are ignored.

        Returns:
            List[str]: Existing paths, in the order they were supplied.

        Exits the process with status 1 when the list file cannot be read or
        when no supplied path exists; each missing path produces a warning on
        stderr.
        """
        file_list = []

        if input_files:
            file_list.extend(input_files)

        if list_file:
            try:
                with open(list_file, 'r') as f:
                    stripped = (line.strip() for line in f)
                    file_list.extend(
                        entry for entry in stripped
                        if entry and not entry.startswith('#')
                    )
            except (OSError, UnicodeError) as e:
                # Unreadable or undecodable list file: nothing sensible to do.
                sys.stderr.write(f"读取列表文件失败 {list_file}: {e}\n")
                sys.exit(1)

        # Keep only paths that exist; warn about the rest.
        valid_files = []
        for file_path in file_list:
            if os.path.exists(file_path):
                valid_files.append(file_path)
            else:
                sys.stderr.write(f"警告: 文件不存在 {file_path}\n")

        if not valid_files:
            sys.stderr.write("错误: 没有有效的输入文件\n")
            sys.exit(1)

        if self.verbose:
            sys.stderr.write(f"找到 {len(valid_files)} 个有效文件\n")

        return valid_files

    # -------- compression detection ---------
    def _detect_compression(self, file_path: str) -> Tuple[Optional[str], str]:
        """
        Classify a path by its (lower-cased) extension.

        Returns:
            ``(compression, base_ext)`` where ``compression`` is ``None``,
            ``'gz'``, ``'bz2'``, ``'xz'`` or ``'zip'`` and ``base_ext`` is the
            extension that remains once a compression suffix is removed
            (e.g. ``"g.fasta.gz"`` -> ``('gz', '.fasta')``).
        """
        ext = os.path.splitext(file_path)[1].lower()
        compression = self._COMPRESS_EXTS.get(ext)
        if compression is None:
            return None, ext
        # Drop the compression suffix and look at the extension underneath it.
        inner_path = file_path[: -len(ext)]
        return compression, os.path.splitext(inner_path)[1].lower()

    @staticmethod
    def _copy_stream(src, dst, chunk_size=8192):
        """Copy everything readable from ``src`` into ``dst`` in fixed-size chunks."""
        while chunk := src.read(chunk_size):
            dst.write(chunk)

    # ---------- decompress into memory ----------
    def _decompress_to_memory(self, file_path: str, compression: str) -> io.TextIOBase:
        """
        Decompress a whole file into memory and return it as a UTF-8 text stream.

        Args:
            file_path: Path of the compressed file.
            compression: ``'gz'``, ``'bz2'``, ``'xz'`` or ``'zip'``. For ZIP
                archives only the first non-directory member is read.

        Returns:
            io.TextIOBase: Text stream positioned at the start of the data.

        Raises:
            IOError: On any failure (unsupported format, empty archive, read
                error). The underlying exception is chained as ``__cause__``.
        """
        mem_buffer = io.BytesIO()

        try:
            if compression in self._STREAM_OPENERS:
                with self._STREAM_OPENERS[compression](file_path, 'rb') as src:
                    self._copy_stream(src, mem_buffer)
            elif compression == 'zip':
                with zipfile.ZipFile(file_path, 'r') as zipf:
                    # First entry that is not a directory, if any.
                    member = next(
                        (name for name in zipf.namelist() if not name.endswith('/')),
                        None,
                    )
                    if member is None:
                        raise ValueError(f"ZIP文件中没有可读取的文件: {file_path}")
                    with zipf.open(member, 'r') as src:
                        self._copy_stream(src, mem_buffer)
            else:
                raise ValueError(f"不支持的压缩格式: {compression}")
        except Exception as e:
            raise IOError(f"解压文件失败 {file_path}: {e}") from e

        mem_buffer.seek(0)
        # The parsers work on text. The wrapper holds its own reference to the
        # byte buffer (exposed as ``.buffer``), so nothing else has to keep it
        # alive.
        return io.TextIOWrapper(mem_buffer, encoding='utf-8')

    def read_sequences(self, file_path: str) -> Generator[tuple, None, None]:
        """
        Yield the sequences of one file, choosing the parser from its extension.

        Compressed files (.gz/.bz2/.xz/.zip) are decompressed into memory
        before parsing. Unknown extensions fall back to content sniffing.

        Yields:
            tuple: (sequence id, sequence string, quality scores), as produced
            by the parser.

        Any exception raised while reading is reported on stderr and swallowed,
        so the generator simply ends early on failure.
        """
        compression, base_ext = self._detect_compression(file_path)

        try:
            if base_ext in self._FASTA_EXTS:
                parser = self.fasta_parser
            elif base_ext in self._FASTQ_EXTS:
                parser = self.fastq_parser
            else:
                # Unknown extension: decide from the file's first character.
                yield from self._auto_detect_parser(file_path, compression)
                return

            if compression:
                # Compressed: parse the in-memory text stream.
                source = self._decompress_to_memory(file_path, compression)
            else:
                source = file_path
            yield from parser.parse(source)

        except Exception as e:
            sys.stderr.write(f"读取文件失败 {file_path}: {e}\n")

    # ---------- format auto-detection ----------
    def _auto_detect_parser(self, file_path: str, compression: Optional[str] = None) -> Generator[tuple, None, None]:
        """
        Pick a parser from the first non-blank character and parse with it.

        ``>`` selects the FASTA parser and ``@`` the FASTQ parser.

        Args:
            file_path: A path, or (on the internal recursive call made for
                compressed input) an already-open text stream.
            compression: Compression tag; when set, the file is decompressed
                into memory and this method is re-entered with the stream.

        Raises:
            ValueError: If the input is empty, the first character is not
                recognised, or reading fails. The original exception is
                chained as ``__cause__``.
        """
        try:
            if compression:
                mem_file = self._decompress_to_memory(file_path, compression)
                # Re-enter with the decompressed stream and no compression.
                yield from self._auto_detect_parser(mem_file, compression=None)
                return

            # file_path is either a plain path or an in-memory text stream.
            is_stream = isinstance(file_path, io.TextIOBase)
            f = file_path if is_stream else open(file_path, 'r')

            with f:
                # First character of the first non-blank line.
                first_char = None
                for line in f:
                    line = line.strip()
                    if line:
                        first_char = line[0]
                        break
                if not first_char:
                    raise ValueError("文件为空")

                f.seek(0)  # rewind so a stream is parsed from the beginning

                if first_char == '>':
                    parser = self.fasta_parser
                elif first_char == '@':
                    parser = self.fastq_parser
                else:
                    raise ValueError(f"无法识别的文件格式,首字符: '{first_char}'")

                # Streams are parsed in place; paths are handed over by name.
                yield from parser.parse(f if is_stream else file_path)

        except Exception as e:
            raise ValueError(f"自动检测失败: {e}") from e

    def read_genome(self, file_path: str) -> "GenomeData":
        """
        Read every sequence of one genome file into a GenomeData object.

        Args:
            file_path: Path of the genome file.

        Returns:
            GenomeData: Built with ``file_path``, the file's base name as
            ``seq_id``, and the list of sequence strings that were read.
        """
        genome_id = os.path.basename(file_path)  # the file name is the genome id
        sequences = [sequence for _, sequence, _ in self.read_sequences(file_path)]

        return GenomeData(
            file_path=file_path,
            seq_id=genome_id,
            sequences=sequences
        )
|
|
243
|
+
|