py2ls 0.1.10.12__py3-none-any.whl → 0.2.7.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of py2ls might be problematic. Click here for more details.
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/ImageLoader.py +621 -0
- py2ls/__init__.py +7 -5
- py2ls/apptainer2ls.py +3940 -0
- py2ls/batman.py +164 -42
- py2ls/bio.py +2595 -0
- py2ls/cell_image_clf.py +1632 -0
- py2ls/container2ls.py +4635 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/email/email_html_template.html +88 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/hyper_param_tabrepo_2024.py +1753 -0
- py2ls/data/mygenes_fields_241022.txt +355 -0
- py2ls/data/re_common_pattern.json +173 -0
- py2ls/data/sns_info.json +74 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/styles/stylelib/.DS_Store +0 -0
- py2ls/data/styles/stylelib/grid.mplstyle +15 -0
- py2ls/data/styles/stylelib/high-contrast.mplstyle +6 -0
- py2ls/data/styles/stylelib/high-vis.mplstyle +4 -0
- py2ls/data/styles/stylelib/ieee.mplstyle +15 -0
- py2ls/data/styles/stylelib/light.mplstyl +6 -0
- py2ls/data/styles/stylelib/muted.mplstyle +6 -0
- py2ls/data/styles/stylelib/nature-reviews-latex.mplstyle +616 -0
- py2ls/data/styles/stylelib/nature-reviews.mplstyle +616 -0
- py2ls/data/styles/stylelib/nature.mplstyle +31 -0
- py2ls/data/styles/stylelib/no-latex.mplstyle +10 -0
- py2ls/data/styles/stylelib/notebook.mplstyle +36 -0
- py2ls/data/styles/stylelib/paper.mplstyle +290 -0
- py2ls/data/styles/stylelib/paper2.mplstyle +305 -0
- py2ls/data/styles/stylelib/retro.mplstyle +4 -0
- py2ls/data/styles/stylelib/sans.mplstyle +10 -0
- py2ls/data/styles/stylelib/scatter.mplstyle +7 -0
- py2ls/data/styles/stylelib/science.mplstyle +48 -0
- py2ls/data/styles/stylelib/std-colors.mplstyle +4 -0
- py2ls/data/styles/stylelib/vibrant.mplstyle +6 -0
- py2ls/data/tiles.csv +146 -0
- py2ls/data/usages_pd.json +1417 -0
- py2ls/data/usages_sns.json +31 -0
- py2ls/docker2ls.py +5446 -0
- py2ls/ec2ls.py +61 -0
- py2ls/fetch_update.py +145 -0
- py2ls/ich2ls.py +1955 -296
- py2ls/im2.py +8242 -0
- py2ls/image_ml2ls.py +2100 -0
- py2ls/ips.py +33909 -3418
- py2ls/ml2ls.py +7700 -0
- py2ls/mol.py +289 -0
- py2ls/mount2ls.py +1307 -0
- py2ls/netfinder.py +873 -351
- py2ls/nl2ls.py +283 -0
- py2ls/ocr.py +1581 -458
- py2ls/plot.py +10394 -314
- py2ls/rna2ls.py +311 -0
- py2ls/ssh2ls.md +456 -0
- py2ls/ssh2ls.py +5933 -0
- py2ls/ssh2ls_v01.py +2204 -0
- py2ls/stats.py +66 -172
- py2ls/temp20251124.py +509 -0
- py2ls/translator.py +2 -0
- py2ls/utils/decorators.py +3564 -0
- py2ls/utils_bio.py +3453 -0
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/METADATA +113 -224
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/RECORD +72 -16
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/WHEEL +0 -0
py2ls/utils_bio.py
ADDED
|
@@ -0,0 +1,3453 @@
|
|
|
1
|
+
"""
|
|
2
|
+
内容包含:
|
|
3
|
+
1.基础使用
|
|
4
|
+
2.GEO数据获取
|
|
5
|
+
3.SRA数据获取
|
|
6
|
+
4.数据搜索功能
|
|
7
|
+
5.批量处理
|
|
8
|
+
6.缓存管理
|
|
9
|
+
7.高级功能
|
|
10
|
+
8.故障排除
|
|
11
|
+
|
|
12
|
+
多数据源支持:GEO, TCGA, SRA, ArrayExpress, ENCODE, 单细胞数据等
|
|
13
|
+
多种数据格式:表达矩阵、临床数据、突变数据、FASTQ文件等
|
|
14
|
+
智能缓存:自动缓存下载的数据,避免重复下载
|
|
15
|
+
并行下载:支持多线程并行下载大型文件
|
|
16
|
+
数据搜索:内置数据集搜索功能
|
|
17
|
+
批量处理:支持批量下载多个数据集
|
|
18
|
+
配置管理:支持YAML/JSON配置文件
|
|
19
|
+
历史记录:记录所有下载操作
|
|
20
|
+
向后兼容:保持与现有GEO函数的兼容性
|
|
21
|
+
错误处理:完善的错误处理和日志记录
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# 1. 简单使用(自动优先fastq-dump)
|
|
25
|
+
fetcher = BioDataFetcher(dir_save="./my_cache", prefer_fastq_dump=True)
|
|
26
|
+
result = fetcher.fetch_data("SRR1635435", data_type='sra', data_format='fastq')
|
|
27
|
+
|
|
28
|
+
# 2. 使用配置文件
|
|
29
|
+
fetcher = BioDataFetcher(dir_save="./my_cache", config_file="./config.yaml")
|
|
30
|
+
|
|
31
|
+
# 3. 强制指定方法
|
|
32
|
+
result = fetcher.fetch_data(
|
|
33
|
+
dataset_ids="SRR1635435",
|
|
34
|
+
data_type='sra',
|
|
35
|
+
data_format='fastq',
|
|
36
|
+
download_method='fastq_dump' # 或 'ftp'
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# 4. 传递fastq-dump参数
|
|
40
|
+
result = fetcher.fetch_data(
|
|
41
|
+
dataset_ids="SRR1635435",
|
|
42
|
+
data_type='sra',
|
|
43
|
+
data_format='fastq',
|
|
44
|
+
split_files=True,
|
|
45
|
+
gzip_output=True,
|
|
46
|
+
threads=4
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
import os
|
|
52
|
+
import re
|
|
53
|
+
import pandas as pd
|
|
54
|
+
import numpy as np
|
|
55
|
+
from typing import Union, Dict, List, Optional, Any, Tuple, Callable
|
|
56
|
+
import logging
|
|
57
|
+
from pathlib import Path
|
|
58
|
+
import warnings
|
|
59
|
+
from datetime import datetime
|
|
60
|
+
import json
|
|
61
|
+
import yaml
|
|
62
|
+
from dataclasses import dataclass, field
|
|
63
|
+
from enum import Enum
|
|
64
|
+
import hashlib
|
|
65
|
+
import pickle
|
|
66
|
+
from tqdm import tqdm
|
|
67
|
+
import time
|
|
68
|
+
import requests
|
|
69
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
logger = logging.getLogger(__name__)
|
|
73
|
+
|
|
74
|
+
# 导入现有的GEO函数
|
|
75
|
+
try:
|
|
76
|
+
from . import bio as geo_utils
|
|
77
|
+
GEO_UTILS_AVAILABLE = True
|
|
78
|
+
except ImportError:
|
|
79
|
+
GEO_UTILS_AVAILABLE = False
|
|
80
|
+
warnings.warn("GEO utils not available. Make sure bio.py is in the same directory")
|
|
81
|
+
|
|
82
|
+
# 可能需要的额外库(可选择安装)
|
|
83
|
+
try:
|
|
84
|
+
import GEOparse
|
|
85
|
+
GEOPARSE_AVAILABLE = True
|
|
86
|
+
except ImportError:
|
|
87
|
+
GEOPARSE_AVAILABLE = False
|
|
88
|
+
warnings.warn("GEOparse not available. Install with: pip install GEOparse")
|
|
89
|
+
|
|
90
|
+
try:
|
|
91
|
+
from pysradb import SRAweb
|
|
92
|
+
SRADB_AVAILABLE = True
|
|
93
|
+
except ImportError:
|
|
94
|
+
SRADB_AVAILABLE = False
|
|
95
|
+
warnings.warn("pysradb not available. Install with: pip install pysradb")
|
|
96
|
+
|
|
97
|
+
try:
|
|
98
|
+
import gseapy as gp
|
|
99
|
+
GSEAPY_AVAILABLE = True
|
|
100
|
+
except ImportError:
|
|
101
|
+
GSEAPY_AVAILABLE = False
|
|
102
|
+
|
|
103
|
+
try:
|
|
104
|
+
import mygene
|
|
105
|
+
MYGENE_AVAILABLE = True
|
|
106
|
+
except ImportError:
|
|
107
|
+
MYGENE_AVAILABLE = False
|
|
108
|
+
|
|
109
|
+
try:
|
|
110
|
+
import requests
|
|
111
|
+
REQUESTS_AVAILABLE = True
|
|
112
|
+
except ImportError:
|
|
113
|
+
REQUESTS_AVAILABLE = False
|
|
114
|
+
warnings.warn("requests not available. Install with: pip install requests")
|
|
115
|
+
|
|
116
|
+
# 配置日志
|
|
117
|
+
logging.basicConfig(
|
|
118
|
+
level=logging.INFO,
|
|
119
|
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
120
|
+
)
|
|
121
|
+
logger = logging.getLogger(__name__)
|
|
122
|
+
|
|
123
|
+
# 数据类型枚举
|
|
124
|
+
class DataSource(Enum):
|
|
125
|
+
GEO = "geo" # Gene Expression Omnibus
|
|
126
|
+
SRA = "sra" # Sequence Read Archive
|
|
127
|
+
TCGA = "tcga" # The Cancer Genome Atlas
|
|
128
|
+
ENCODE = "encode" # ENCODE Project
|
|
129
|
+
ARRAY_EXPRESS = "arrayexpress" # ArrayExpress
|
|
130
|
+
DDBJ = "ddbj" # DNA Data Bank of Japan
|
|
131
|
+
EGA = "ega" # European Genome-phenome Archive
|
|
132
|
+
SINGLE_CELL = "single_cell" # 单细胞数据
|
|
133
|
+
PROTEIN_ATLAS = "protein_atlas" # Human Protein Atlas
|
|
134
|
+
STRINGDB = "stringdb" # STRING数据库
|
|
135
|
+
KEGG = "kegg" # KEGG通路
|
|
136
|
+
REACTOME = "reactome" # Reactome通路
|
|
137
|
+
CUSTOM = "custom" # 自定义数据源
|
|
138
|
+
|
|
139
|
+
@classmethod
|
|
140
|
+
def from_accession(cls, accession: str) -> 'DataSource':
|
|
141
|
+
"""根据accession自动推断数据源"""
|
|
142
|
+
accession = accession.upper()
|
|
143
|
+
|
|
144
|
+
# GEO数据集
|
|
145
|
+
if re.match(r'^GSE\d+$', accession) or re.match(r'^GDS\d+$', accession):
|
|
146
|
+
return cls.GEO
|
|
147
|
+
|
|
148
|
+
# SRA数据集
|
|
149
|
+
elif re.match(r'^(SRR|ERR|DRR)\d+$', accession):
|
|
150
|
+
return cls.SRA
|
|
151
|
+
|
|
152
|
+
# TCGA项目
|
|
153
|
+
elif re.match(r'^TCGA-[A-Z0-9]+$', accession) or accession.startswith('TCGA_'):
|
|
154
|
+
return cls.TCGA
|
|
155
|
+
|
|
156
|
+
# ENCODE数据集
|
|
157
|
+
elif re.match(r'^ENC[SR]\d+$', accession):
|
|
158
|
+
return cls.ENCODE
|
|
159
|
+
|
|
160
|
+
# ArrayExpress
|
|
161
|
+
elif re.match(r'^E-[A-Z]{4}-\d+$', accession):
|
|
162
|
+
return cls.ARRAY_EXPRESS
|
|
163
|
+
|
|
164
|
+
# DDBJ
|
|
165
|
+
elif re.match(r'^(DRA|DRS|DRX|DRZ)\d+$', accession):
|
|
166
|
+
return cls.DDBJ
|
|
167
|
+
|
|
168
|
+
# 单细胞数据集(常见格式)
|
|
169
|
+
elif re.match(r'^SC\d+$', accession) or 'SC' in accession:
|
|
170
|
+
return cls.SINGLE_CELL
|
|
171
|
+
|
|
172
|
+
# 默认返回GEO
|
|
173
|
+
else:
|
|
174
|
+
return cls.GEO
|
|
175
|
+
|
|
176
|
+
class DataFormat(Enum):
|
|
177
|
+
EXPRESSION = "expression" # 表达矩阵
|
|
178
|
+
COUNTS = "counts" # 原始计数
|
|
179
|
+
FASTQ = "fastq" # FASTQ文件
|
|
180
|
+
BAM = "bam" # BAM文件
|
|
181
|
+
METADATA = "metadata" # 元数据
|
|
182
|
+
CLINICAL = "clinical" # 临床数据
|
|
183
|
+
MUTATIONS = "mutations" # 突变数据
|
|
184
|
+
PROBE = "probe" # 探针信息
|
|
185
|
+
ANNOTATION = "annotation" # 注释信息
|
|
186
|
+
NETWORK = "network" # 网络数据
|
|
187
|
+
PATHWAY = "pathway" # 通路数据
|
|
188
|
+
|
|
189
|
+
@classmethod
|
|
190
|
+
def infer_format(cls, data_type: DataSource, **kwargs) -> 'DataFormat':
|
|
191
|
+
"""根据数据源和其他参数推断数据格式"""
|
|
192
|
+
platform = kwargs.get('platform', '').lower()
|
|
193
|
+
data_format = kwargs.get('data_format', '').lower()
|
|
194
|
+
|
|
195
|
+
# 如果有明确指定的格式,使用它
|
|
196
|
+
if data_format:
|
|
197
|
+
for fmt in cls:
|
|
198
|
+
if fmt.value == data_format:
|
|
199
|
+
return fmt
|
|
200
|
+
|
|
201
|
+
# 根据数据源推断
|
|
202
|
+
if data_type == DataSource.GEO:
|
|
203
|
+
return cls.EXPRESSION
|
|
204
|
+
elif data_type == DataSource.SRA:
|
|
205
|
+
return cls.FASTQ if kwargs.get('download_fastq', False) else cls.METADATA
|
|
206
|
+
elif data_type == DataSource.TCGA:
|
|
207
|
+
if platform == 'clinical':
|
|
208
|
+
return cls.CLINICAL
|
|
209
|
+
elif platform == 'mutations':
|
|
210
|
+
return cls.MUTATIONS
|
|
211
|
+
else:
|
|
212
|
+
return cls.EXPRESSION
|
|
213
|
+
elif data_type == DataSource.ENCODE:
|
|
214
|
+
return cls.BAM if 'chip' in platform else cls.EXPRESSION
|
|
215
|
+
else:
|
|
216
|
+
return cls.METADATA
|
|
217
|
+
class FastqDumpDownloader:
|
|
218
|
+
"""
|
|
219
|
+
使用fastq-dump下载SRA数据的下载器
|
|
220
|
+
更可靠,支持更多功能
|
|
221
|
+
"""
|
|
222
|
+
|
|
223
|
+
def __init__(self, cache_dir: str = "./sra_fastqdump", use_prefetch: bool = True):
|
|
224
|
+
"""
|
|
225
|
+
Parameters:
|
|
226
|
+
-----------
|
|
227
|
+
cache_dir : str
|
|
228
|
+
缓存目录
|
|
229
|
+
use_prefetch : bool
|
|
230
|
+
是否使用prefetch先下载.sra文件(推荐)
|
|
231
|
+
"""
|
|
232
|
+
self.cache_dir = Path(cache_dir)
|
|
233
|
+
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
234
|
+
self.use_prefetch = use_prefetch
|
|
235
|
+
|
|
236
|
+
# 查找工具
|
|
237
|
+
self.fastq_dump_path = shutil.which("fastq-dump") or shutil.which("fastq-dump.exe")
|
|
238
|
+
self.prefetch_path = shutil.which("prefetch") or shutil.which("prefetch.exe")
|
|
239
|
+
self.fasterq_dump_path = shutil.which("fasterq-dump") or shutil.which("fasterq-dump.exe")
|
|
240
|
+
|
|
241
|
+
print(f"工具状态:")
|
|
242
|
+
print(f" fastq-dump: {'✅ 可用' if self.fastq_dump_path else '❌ 未找到'}")
|
|
243
|
+
print(f" prefetch: {'✅ 可用' if self.prefetch_path else '❌ 未找到'}")
|
|
244
|
+
print(f" fasterq-dump: {'✅ 可用' if self.fasterq_dump_path else '❌ 未找到'}")
|
|
245
|
+
|
|
246
|
+
def download_with_fastq_dump(self,
|
|
247
|
+
accession: str,
|
|
248
|
+
output_dir: Optional[Path] = None,
|
|
249
|
+
split_files: bool = True,
|
|
250
|
+
gzip_output: bool = True,
|
|
251
|
+
max_retries: int = 3) -> Dict[str, Any]:
|
|
252
|
+
"""
|
|
253
|
+
使用fastq-dump下载数据
|
|
254
|
+
|
|
255
|
+
Parameters:
|
|
256
|
+
-----------
|
|
257
|
+
accession : str
|
|
258
|
+
SRA accession (SRR, ERR, DRR)
|
|
259
|
+
output_dir : Path
|
|
260
|
+
输出目录
|
|
261
|
+
split_files : bool
|
|
262
|
+
是否拆分文件(paired-end数据拆分为 _1.fastq 和 _2.fastq)
|
|
263
|
+
gzip_output : bool
|
|
264
|
+
是否gzip压缩输出
|
|
265
|
+
max_retries : int
|
|
266
|
+
最大重试次数
|
|
267
|
+
|
|
268
|
+
Returns:
|
|
269
|
+
--------
|
|
270
|
+
Dict: 下载结果
|
|
271
|
+
"""
|
|
272
|
+
import time
|
|
273
|
+
|
|
274
|
+
if output_dir is None:
|
|
275
|
+
output_dir = self.cache_dir / accession
|
|
276
|
+
else:
|
|
277
|
+
output_dir = Path(output_dir) / accession
|
|
278
|
+
|
|
279
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
280
|
+
|
|
281
|
+
if not self.fastq_dump_path:
|
|
282
|
+
return {
|
|
283
|
+
'accession': accession,
|
|
284
|
+
'success': False,
|
|
285
|
+
'error': 'fastq-dump not found. Please install SRA Toolkit.',
|
|
286
|
+
'step': 'tool_check'
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
print(f"使用fastq-dump下载: {accession}")
|
|
290
|
+
print(f"输出目录: {output_dir}")
|
|
291
|
+
print(f"拆分文件: {split_files}")
|
|
292
|
+
print(f"gzip压缩: {gzip_output}")
|
|
293
|
+
print("-" * 50)
|
|
294
|
+
|
|
295
|
+
results = {}
|
|
296
|
+
|
|
297
|
+
# 方法1:使用prefetch + fastq-dump(推荐)
|
|
298
|
+
if self.use_prefetch and self.prefetch_path:
|
|
299
|
+
print("方法1: prefetch + fastq-dump")
|
|
300
|
+
result = self._download_with_prefetch(
|
|
301
|
+
accession=accession,
|
|
302
|
+
output_dir=output_dir,
|
|
303
|
+
split_files=split_files,
|
|
304
|
+
gzip_output=gzip_output,
|
|
305
|
+
max_retries=max_retries
|
|
306
|
+
)
|
|
307
|
+
results['prefetch_method'] = result
|
|
308
|
+
|
|
309
|
+
if result.get('success', False):
|
|
310
|
+
print("✅ prefetch方法成功")
|
|
311
|
+
return self._format_result(accession, output_dir, result)
|
|
312
|
+
|
|
313
|
+
# 方法2:直接使用fastq-dump
|
|
314
|
+
print("\n方法2: 直接使用fastq-dump")
|
|
315
|
+
result = self._download_direct(
|
|
316
|
+
accession=accession,
|
|
317
|
+
output_dir=output_dir,
|
|
318
|
+
split_files=split_files,
|
|
319
|
+
gzip_output=gzip_output,
|
|
320
|
+
max_retries=max_retries
|
|
321
|
+
)
|
|
322
|
+
results['direct_method'] = result
|
|
323
|
+
|
|
324
|
+
if result.get('success', False):
|
|
325
|
+
print("✅ 直接方法成功")
|
|
326
|
+
return self._format_result(accession, output_dir, result)
|
|
327
|
+
|
|
328
|
+
# 方法3:使用fasterq-dump(如果可用)
|
|
329
|
+
if self.fasterq_dump_path:
|
|
330
|
+
print("\n方法3: 使用fasterq-dump(更快)")
|
|
331
|
+
result = self._download_with_fasterq_dump(
|
|
332
|
+
accession=accession,
|
|
333
|
+
output_dir=output_dir,
|
|
334
|
+
split_files=split_files,
|
|
335
|
+
gzip_output=gzip_output,
|
|
336
|
+
max_retries=max_retries
|
|
337
|
+
)
|
|
338
|
+
results['fasterq_method'] = result
|
|
339
|
+
|
|
340
|
+
if result.get('success', False):
|
|
341
|
+
print("✅ fasterq-dump方法成功")
|
|
342
|
+
return self._format_result(accession, output_dir, result)
|
|
343
|
+
|
|
344
|
+
# 所有方法都失败
|
|
345
|
+
print("❌ 所有方法都失败")
|
|
346
|
+
return {
|
|
347
|
+
'accession': accession,
|
|
348
|
+
'success': False,
|
|
349
|
+
'error': 'All download methods failed',
|
|
350
|
+
'results': results,
|
|
351
|
+
'output_dir': str(output_dir)
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
def _download_with_prefetch(self, accession, output_dir, split_files, gzip_output, max_retries):
|
|
355
|
+
"""使用prefetch下载.sra文件,然后用fastq-dump转换"""
|
|
356
|
+
import time
|
|
357
|
+
|
|
358
|
+
sra_dir = output_dir / ".sra_cache"
|
|
359
|
+
sra_dir.mkdir(exist_ok=True)
|
|
360
|
+
|
|
361
|
+
# 步骤1: 使用prefetch下载.sra文件
|
|
362
|
+
print(" 步骤1: 使用prefetch下载.sra文件...")
|
|
363
|
+
|
|
364
|
+
prefetch_cmd = [
|
|
365
|
+
self.prefetch_path,
|
|
366
|
+
accession,
|
|
367
|
+
"-O", str(sra_dir),
|
|
368
|
+
"--progress" # 显示进度
|
|
369
|
+
]
|
|
370
|
+
|
|
371
|
+
try:
|
|
372
|
+
print(f" 运行: {' '.join(prefetch_cmd)}")
|
|
373
|
+
result = subprocess.run(
|
|
374
|
+
prefetch_cmd,
|
|
375
|
+
capture_output=True,
|
|
376
|
+
text=True,
|
|
377
|
+
timeout=600, # 10分钟超时
|
|
378
|
+
check=True
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
print(f" prefetch完成: {result.stdout[-200:] if result.stdout else '无输出'}")
|
|
382
|
+
|
|
383
|
+
# 查找下载的.sra文件
|
|
384
|
+
sra_files = list(sra_dir.glob(f"**/{accession}.sra"))
|
|
385
|
+
if not sra_files:
|
|
386
|
+
sra_files = list(sra_dir.glob(f"**/*.sra"))
|
|
387
|
+
|
|
388
|
+
if not sra_files:
|
|
389
|
+
return {'success': False, 'error': 'No .sra file found after prefetch'}
|
|
390
|
+
|
|
391
|
+
sra_file = sra_files[0]
|
|
392
|
+
print(f" 找到.sra文件: {sra_file} ({sra_file.stat().st_size/1024/1024:.1f} MB)")
|
|
393
|
+
|
|
394
|
+
# 步骤2: 使用fastq-dump转换
|
|
395
|
+
return self._run_fastq_dump(
|
|
396
|
+
input_file=str(sra_file),
|
|
397
|
+
output_dir=output_dir,
|
|
398
|
+
split_files=split_files,
|
|
399
|
+
gzip_output=gzip_output
|
|
400
|
+
)
|
|
401
|
+
|
|
402
|
+
except subprocess.TimeoutExpired:
|
|
403
|
+
return {'success': False, 'error': 'prefetch timed out after 10 minutes'}
|
|
404
|
+
except subprocess.CalledProcessError as e:
|
|
405
|
+
return {'success': False, 'error': f'prefetch failed: {e.stderr[:200]}'}
|
|
406
|
+
except Exception as e:
|
|
407
|
+
return {'success': False, 'error': f'prefetch error: {type(e).__name__}: {e}'}
|
|
408
|
+
|
|
409
|
+
def _download_direct(self, accession, output_dir, split_files, gzip_output, max_retries):
|
|
410
|
+
"""直接使用fastq-dump下载(不先下载.sra文件)"""
|
|
411
|
+
print(" 直接下载并转换...")
|
|
412
|
+
|
|
413
|
+
# 构建fastq-dump命令
|
|
414
|
+
cmd = self._build_fastq_dump_command(
|
|
415
|
+
accession=accession,
|
|
416
|
+
output_dir=output_dir,
|
|
417
|
+
split_files=split_files,
|
|
418
|
+
gzip_output=gzip_output
|
|
419
|
+
)
|
|
420
|
+
|
|
421
|
+
try:
|
|
422
|
+
print(f" 运行: {' '.join(cmd)}")
|
|
423
|
+
result = subprocess.run(
|
|
424
|
+
cmd,
|
|
425
|
+
capture_output=True,
|
|
426
|
+
text=True,
|
|
427
|
+
timeout=900, # 15分钟超时(可能较长)
|
|
428
|
+
check=True
|
|
429
|
+
)
|
|
430
|
+
|
|
431
|
+
print(f" fastq-dump输出: {result.stdout[-500:] if result.stdout else '无输出'}")
|
|
432
|
+
|
|
433
|
+
# 检查生成的文件
|
|
434
|
+
return self._check_output_files(output_dir, accession, split_files, gzip_output)
|
|
435
|
+
|
|
436
|
+
except subprocess.TimeoutExpired:
|
|
437
|
+
return {'success': False, 'error': 'fastq-dump timed out after 15 minutes'}
|
|
438
|
+
except subprocess.CalledProcessError as e:
|
|
439
|
+
error_msg = e.stderr[:500] if e.stderr else str(e)
|
|
440
|
+
return {'success': False, 'error': f'fastq-dump failed: {error_msg}'}
|
|
441
|
+
except Exception as e:
|
|
442
|
+
return {'success': False, 'error': f'fastq-dump error: {type(e).__name__}: {e}'}
|
|
443
|
+
|
|
444
|
+
def _download_with_fasterq_dump(self, accession, output_dir, split_files, gzip_output, max_retries):
|
|
445
|
+
"""使用fasterq-dump(更快版本)"""
|
|
446
|
+
print(" 使用fasterq-dump...")
|
|
447
|
+
|
|
448
|
+
# 构建fasterq-dump命令
|
|
449
|
+
cmd = [
|
|
450
|
+
self.fasterq_dump_path,
|
|
451
|
+
accession,
|
|
452
|
+
"-O", str(output_dir),
|
|
453
|
+
"-e", "4", # 使用4个线程
|
|
454
|
+
"-p" # 显示进度
|
|
455
|
+
]
|
|
456
|
+
|
|
457
|
+
if split_files:
|
|
458
|
+
cmd.append("--split-files")
|
|
459
|
+
|
|
460
|
+
try:
|
|
461
|
+
print(f" 运行: {' '.join(cmd)}")
|
|
462
|
+
result = subprocess.run(
|
|
463
|
+
cmd,
|
|
464
|
+
capture_output=True,
|
|
465
|
+
text=True,
|
|
466
|
+
timeout=600, # 10分钟超时
|
|
467
|
+
check=True
|
|
468
|
+
)
|
|
469
|
+
|
|
470
|
+
print(f" fasterq-dump输出: {result.stdout[-500:] if result.stdout else '无输出'}")
|
|
471
|
+
|
|
472
|
+
# 如果需要gzip,使用并行gzip
|
|
473
|
+
if gzip_output:
|
|
474
|
+
self._gzip_files(output_dir)
|
|
475
|
+
|
|
476
|
+
return self._check_output_files(output_dir, accession, split_files, gzip_output)
|
|
477
|
+
|
|
478
|
+
except subprocess.TimeoutExpired:
|
|
479
|
+
return {'success': False, 'error': 'fasterq-dump timed out'}
|
|
480
|
+
except subprocess.CalledProcessError as e:
|
|
481
|
+
error_msg = e.stderr[:500] if e.stderr else str(e)
|
|
482
|
+
return {'success': False, 'error': f'fasterq-dump failed: {error_msg}'}
|
|
483
|
+
except Exception as e:
|
|
484
|
+
return {'success': False, 'error': f'fasterq-dump error: {type(e).__name__}: {e}'}
|
|
485
|
+
|
|
486
|
+
def _build_fastq_dump_command(self, accession, output_dir, split_files, gzip_output):
|
|
487
|
+
"""构建fastq-dump命令"""
|
|
488
|
+
cmd = [
|
|
489
|
+
self.fastq_dump_path,
|
|
490
|
+
accession,
|
|
491
|
+
"--outdir", str(output_dir),
|
|
492
|
+
"--skip-technical", # 跳过技术读取
|
|
493
|
+
"--readids", # 在读取ID中包含原始名称
|
|
494
|
+
"--dumpbase", # 以碱基形式格式化序列
|
|
495
|
+
"--clip", # 移除适配器和质量修剪
|
|
496
|
+
]
|
|
497
|
+
|
|
498
|
+
if split_files:
|
|
499
|
+
cmd.append("--split-files")
|
|
500
|
+
|
|
501
|
+
if gzip_output:
|
|
502
|
+
cmd.append("--gzip")
|
|
503
|
+
|
|
504
|
+
# 添加其他有用选项
|
|
505
|
+
cmd.extend([
|
|
506
|
+
"--read-filter", "pass", # 只保留通过的读取
|
|
507
|
+
"--origfmt" # 保持原始格式
|
|
508
|
+
])
|
|
509
|
+
|
|
510
|
+
return cmd
|
|
511
|
+
|
|
512
|
+
def _run_fastq_dump(self, input_file, output_dir, split_files, gzip_output):
|
|
513
|
+
"""运行fastq-dump转换.sra文件"""
|
|
514
|
+
cmd = [
|
|
515
|
+
self.fastq_dump_path,
|
|
516
|
+
input_file,
|
|
517
|
+
"--outdir", str(output_dir),
|
|
518
|
+
"--skip-technical",
|
|
519
|
+
"--readids",
|
|
520
|
+
"--dumpbase",
|
|
521
|
+
"--clip",
|
|
522
|
+
]
|
|
523
|
+
|
|
524
|
+
if split_files:
|
|
525
|
+
cmd.append("--split-files")
|
|
526
|
+
|
|
527
|
+
if gzip_output:
|
|
528
|
+
cmd.append("--gzip")
|
|
529
|
+
|
|
530
|
+
try:
|
|
531
|
+
print(f" 运行fastq-dump: {' '.join(cmd)}")
|
|
532
|
+
result = subprocess.run(
|
|
533
|
+
cmd,
|
|
534
|
+
capture_output=True,
|
|
535
|
+
text=True,
|
|
536
|
+
timeout=300, # 5分钟超时(.sra文件已本地存在)
|
|
537
|
+
check=True
|
|
538
|
+
)
|
|
539
|
+
|
|
540
|
+
print(f" fastq-dump完成")
|
|
541
|
+
return self._check_output_files(output_dir, Path(input_file).stem, split_files, gzip_output)
|
|
542
|
+
|
|
543
|
+
except subprocess.CalledProcessError as e:
|
|
544
|
+
return {'success': False, 'error': f'fastq-dump conversion failed: {e.stderr[:200]}'}
|
|
545
|
+
except Exception as e:
|
|
546
|
+
return {'success': False, 'error': f'fastq-dump error: {type(e).__name__}: {e}'}
|
|
547
|
+
|
|
548
|
+
def _gzip_files(self, output_dir):
|
|
549
|
+
"""并行gzip文件(如果fastq-dump没有自动gzip)"""
|
|
550
|
+
import gzip
|
|
551
|
+
import shutil
|
|
552
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
553
|
+
|
|
554
|
+
fastq_files = list(output_dir.glob("*.fastq"))
|
|
555
|
+
|
|
556
|
+
if not fastq_files:
|
|
557
|
+
return
|
|
558
|
+
|
|
559
|
+
print(f" 压缩 {len(fastq_files)} 个fastq文件...")
|
|
560
|
+
|
|
561
|
+
def compress_file(fastq_path):
|
|
562
|
+
gzip_path = fastq_path.with_suffix('.fastq.gz')
|
|
563
|
+
|
|
564
|
+
try:
|
|
565
|
+
with open(fastq_path, 'rb') as f_in:
|
|
566
|
+
with gzip.open(gzip_path, 'wb') as f_out:
|
|
567
|
+
shutil.copyfileobj(f_in, f_out)
|
|
568
|
+
|
|
569
|
+
# 删除原始文件
|
|
570
|
+
fastq_path.unlink()
|
|
571
|
+
return True
|
|
572
|
+
except Exception as e:
|
|
573
|
+
print(f" 压缩失败 {fastq_path.name}: {e}")
|
|
574
|
+
return False
|
|
575
|
+
|
|
576
|
+
# 并行压缩
|
|
577
|
+
with ThreadPoolExecutor(max_workers=4) as executor:
|
|
578
|
+
results = list(executor.map(compress_file, fastq_files))
|
|
579
|
+
|
|
580
|
+
success_count = sum(results)
|
|
581
|
+
print(f" 压缩完成: {success_count}/{len(fastq_files)} 成功")
|
|
582
|
+
|
|
583
|
+
def _check_output_files(self, output_dir, accession, split_files, gzip_output):
|
|
584
|
+
"""检查输出文件"""
|
|
585
|
+
# 查找生成的文件
|
|
586
|
+
patterns = []
|
|
587
|
+
if gzip_output:
|
|
588
|
+
patterns.extend([f"{accession}*.fastq.gz", f"{accession}*.fq.gz"])
|
|
589
|
+
else:
|
|
590
|
+
patterns.extend([f"{accession}*.fastq", f"{accession}*.fq"])
|
|
591
|
+
|
|
592
|
+
files = []
|
|
593
|
+
for pattern in patterns:
|
|
594
|
+
files.extend(output_dir.glob(pattern))
|
|
595
|
+
|
|
596
|
+
files = [str(f) for f in files if f.exists() and f.stat().st_size > 0]
|
|
597
|
+
|
|
598
|
+
if files:
|
|
599
|
+
total_size = sum(Path(f).stat().st_size for f in files)
|
|
600
|
+
return {
|
|
601
|
+
'success': True,
|
|
602
|
+
'files': files,
|
|
603
|
+
'file_count': len(files),
|
|
604
|
+
'total_size_bytes': total_size,
|
|
605
|
+
'total_size_mb': total_size / (1024 * 1024)
|
|
606
|
+
}
|
|
607
|
+
else:
|
|
608
|
+
return {'success': False, 'error': 'No output files found'}
|
|
609
|
+
|
|
610
|
+
def _format_result(self, accession, output_dir, result):
|
|
611
|
+
"""格式化结果"""
|
|
612
|
+
return {
|
|
613
|
+
'accession': accession,
|
|
614
|
+
'success': True,
|
|
615
|
+
'files': result.get('files', []),
|
|
616
|
+
'file_count': result.get('file_count', 0),
|
|
617
|
+
'total_size_mb': result.get('total_size_mb', 0),
|
|
618
|
+
'output_dir': str(output_dir),
|
|
619
|
+
'method': 'fastq-dump'
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
# 测试使用
|
|
623
|
+
def test_fastq_dump_downloader():
|
|
624
|
+
"""测试fastq-dump下载器"""
|
|
625
|
+
print("测试fastq-dump下载器")
|
|
626
|
+
print("=" * 60)
|
|
627
|
+
|
|
628
|
+
downloader = FastqDumpDownloader(cache_dir="./fastqdump_test")
|
|
629
|
+
|
|
630
|
+
# 测试小文件
|
|
631
|
+
result = downloader.download_with_fastq_dump(
|
|
632
|
+
accession="SRR390728", # 小文件,约1MB
|
|
633
|
+
output_dir="./test_output",
|
|
634
|
+
split_files=True,
|
|
635
|
+
gzip_output=True,
|
|
636
|
+
max_retries=2
|
|
637
|
+
)
|
|
638
|
+
|
|
639
|
+
print(f"\n结果:")
|
|
640
|
+
print(f" 成功: {result['success']}")
|
|
641
|
+
print(f" 文件数: {result.get('file_count', 0)}")
|
|
642
|
+
print(f" 总大小: {result.get('total_size_mb', 0):.2f} MB")
|
|
643
|
+
|
|
644
|
+
if result['success'] and result.get('files'):
|
|
645
|
+
print(f" 文件列表:")
|
|
646
|
+
for filepath in result['files']:
|
|
647
|
+
size_mb = Path(filepath).stat().st_size / (1024 * 1024)
|
|
648
|
+
print(f" - {Path(filepath).name} ({size_mb:.2f} MB)")
|
|
649
|
+
|
|
650
|
+
return result
|
|
651
|
+
|
|
652
|
+
# test_fastq_dump_downloader()
|
|
653
|
+
|
|
654
|
+
class SRADownloader:
|
|
655
|
+
"""
|
|
656
|
+
独立的SRA数据下载器,不依赖pysradb
|
|
657
|
+
直接使用ENA和NCBI API
|
|
658
|
+
"""
|
|
659
|
+
|
|
660
|
+
def __init__(self, cache_dir: str = "./sra_data", max_workers: int = 4):
|
|
661
|
+
self.cache_dir = Path(cache_dir)
|
|
662
|
+
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
663
|
+
self.max_workers = max_workers
|
|
664
|
+
self.session = requests.Session()
|
|
665
|
+
self.session.headers.update({
|
|
666
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
|
667
|
+
})
|
|
668
|
+
|
|
669
|
+
def get_metadata(self, accession: str) -> Dict[str, Any]:
|
|
670
|
+
"""
|
|
671
|
+
从ENA获取SRA元数据
|
|
672
|
+
参数可以是:SRR/ERR/DRR运行号,SRS样本号,SRX实验号
|
|
673
|
+
"""
|
|
674
|
+
# 尝试不同API端点
|
|
675
|
+
endpoints = [
|
|
676
|
+
self._get_ena_metadata,
|
|
677
|
+
self._get_ncbi_metadata,
|
|
678
|
+
]
|
|
679
|
+
|
|
680
|
+
for endpoint in endpoints:
|
|
681
|
+
try:
|
|
682
|
+
metadata = endpoint(accession)
|
|
683
|
+
if metadata:
|
|
684
|
+
return metadata
|
|
685
|
+
except Exception as e:
|
|
686
|
+
logger.debug(f"{endpoint.__name__} failed: {e}")
|
|
687
|
+
|
|
688
|
+
return {'error': f'无法获取 {accession} 的元数据'}
|
|
689
|
+
|
|
690
|
+
def _get_ena_metadata(self, accession: str) -> Dict[str, Any]:
|
|
691
|
+
"""使用ENA API获取元数据"""
|
|
692
|
+
base_url = "https://www.ebi.ac.uk/ena/portal/api/search"
|
|
693
|
+
|
|
694
|
+
# 根据accession类型确定结果类型
|
|
695
|
+
if accession.startswith(('SRR', 'ERR', 'DRR')):
|
|
696
|
+
result_type = 'read_run'
|
|
697
|
+
elif accession.startswith(('SRS', 'ERS', 'DRS')):
|
|
698
|
+
result_type = 'sample'
|
|
699
|
+
elif accession.startswith(('SRX', 'ERX', 'DRX')):
|
|
700
|
+
result_type = 'experiment'
|
|
701
|
+
else:
|
|
702
|
+
result_type = 'read_run' # 默认
|
|
703
|
+
|
|
704
|
+
fields = [
|
|
705
|
+
'accession', 'secondary_sample_accession', 'run_accession',
|
|
706
|
+
'experiment_accession', 'study_accession', 'submission_accession',
|
|
707
|
+
'instrument_platform', 'instrument_model', 'library_layout',
|
|
708
|
+
'library_selection', 'library_source', 'library_strategy',
|
|
709
|
+
'read_count', 'base_count', 'sample_alias', 'sample_title',
|
|
710
|
+
'experiment_title', 'study_title', 'fastq_ftp', 'submitted_ftp',
|
|
711
|
+
'sra_ftp', 'first_public', 'last_updated'
|
|
712
|
+
]
|
|
713
|
+
|
|
714
|
+
params = {
|
|
715
|
+
'result': result_type,
|
|
716
|
+
'query': f'accession="{accession}" OR run_accession="{accession}"',
|
|
717
|
+
'fields': ','.join(fields),
|
|
718
|
+
'format': 'json',
|
|
719
|
+
'limit': 1
|
|
720
|
+
}
|
|
721
|
+
|
|
722
|
+
try:
|
|
723
|
+
response = self.session.get(base_url, params=params, timeout=30)
|
|
724
|
+
response.raise_for_status()
|
|
725
|
+
|
|
726
|
+
data = response.json()
|
|
727
|
+
if data and isinstance(data, list) and len(data) > 0:
|
|
728
|
+
return data[0]
|
|
729
|
+
except Exception as e:
|
|
730
|
+
logger.error(f"ENA metadata API error: {e}")
|
|
731
|
+
|
|
732
|
+
return {}
|
|
733
|
+
|
|
734
|
+
def _get_ncbi_metadata(self, accession: str) -> Dict[str, Any]:
|
|
735
|
+
"""使用NCBI API获取元数据(备用)"""
|
|
736
|
+
# Entrez API
|
|
737
|
+
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
|
738
|
+
|
|
739
|
+
# 搜索
|
|
740
|
+
search_params = {
|
|
741
|
+
'db': 'sra',
|
|
742
|
+
'term': f'{accession}[Accession]',
|
|
743
|
+
'retmax': 1,
|
|
744
|
+
'retmode': 'json'
|
|
745
|
+
}
|
|
746
|
+
|
|
747
|
+
try:
|
|
748
|
+
search_response = self.session.get(base_url + "esearch.fcgi", params=search_params)
|
|
749
|
+
search_data = search_response.json()
|
|
750
|
+
|
|
751
|
+
ids = search_data.get('esearchresult', {}).get('idlist', [])
|
|
752
|
+
if not ids:
|
|
753
|
+
return {}
|
|
754
|
+
|
|
755
|
+
# 获取摘要
|
|
756
|
+
summary_params = {
|
|
757
|
+
'db': 'sra',
|
|
758
|
+
'id': ids[0],
|
|
759
|
+
'retmode': 'json'
|
|
760
|
+
}
|
|
761
|
+
|
|
762
|
+
summary_response = self.session.get(base_url + "esummary.fcgi", params=summary_params)
|
|
763
|
+
summary_data = summary_response.json()
|
|
764
|
+
|
|
765
|
+
result = summary_data.get('result', {}).get(ids[0], {})
|
|
766
|
+
|
|
767
|
+
# 转换为标准格式
|
|
768
|
+
metadata = {
|
|
769
|
+
'accession': accession,
|
|
770
|
+
'title': result.get('title', ''),
|
|
771
|
+
'organism': result.get('organism', ''),
|
|
772
|
+
'platform': result.get('platform', ''),
|
|
773
|
+
'library_strategy': result.get('librarystrategy', ''),
|
|
774
|
+
'library_source': result.get('librarysource', ''),
|
|
775
|
+
'library_selection': result.get('libraryselection', ''),
|
|
776
|
+
'instrument': result.get('instrument', ''),
|
|
777
|
+
}
|
|
778
|
+
|
|
779
|
+
return metadata
|
|
780
|
+
|
|
781
|
+
except Exception as e:
|
|
782
|
+
logger.error(f"NCBI metadata API error: {e}")
|
|
783
|
+
return {}
|
|
784
|
+
|
|
785
|
+
def get_fastq_links(self, accession: str) -> List[str]:
|
|
786
|
+
"""获取FASTQ下载链接"""
|
|
787
|
+
metadata = self.get_metadata(accession)
|
|
788
|
+
|
|
789
|
+
links = []
|
|
790
|
+
|
|
791
|
+
# 从元数据中提取FASTQ链接
|
|
792
|
+
for field in ['fastq_ftp', 'submitted_ftp', 'sra_ftp']:
|
|
793
|
+
if field in metadata and metadata[field]:
|
|
794
|
+
ftp_links = str(metadata[field]).split(';')
|
|
795
|
+
for link in ftp_links:
|
|
796
|
+
link = link.strip()
|
|
797
|
+
if link:
|
|
798
|
+
if not link.startswith(('http://', 'https://', 'ftp://')):
|
|
799
|
+
link = f"ftp://{link}"
|
|
800
|
+
links.append(link)
|
|
801
|
+
|
|
802
|
+
# 如果没有找到链接,生成默认链接
|
|
803
|
+
if not links:
|
|
804
|
+
links = self._generate_default_links(accession)
|
|
805
|
+
|
|
806
|
+
return list(set(links)) # 去重
|
|
807
|
+
|
|
808
|
+
def _generate_default_links(self, accession: str) -> List[str]:
|
|
809
|
+
"""生成默认的ENA FTP链接"""
|
|
810
|
+
links = []
|
|
811
|
+
|
|
812
|
+
# ENA标准FTP路径模式
|
|
813
|
+
# ftp://ftp.sra.ebi.ac.uk/vol1/fastq/XXXnnn/XXXnnnXXX/
|
|
814
|
+
|
|
815
|
+
if accession.startswith(('SRR', 'ERR', 'DRR')):
|
|
816
|
+
# 提取前6位
|
|
817
|
+
prefix = accession[:6]
|
|
818
|
+
# 尝试不同路径模式
|
|
819
|
+
patterns = [
|
|
820
|
+
f"ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{prefix}/{accession}/{accession}.fastq.gz",
|
|
821
|
+
f"ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{prefix}/{accession}/{accession}_1.fastq.gz",
|
|
822
|
+
f"ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{prefix}/{accession}/{accession}_2.fastq.gz",
|
|
823
|
+
f"ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{prefix}/00{accession[-1]}/{accession}/{accession}.fastq.gz",
|
|
824
|
+
]
|
|
825
|
+
links.extend(patterns)
|
|
826
|
+
|
|
827
|
+
return links
|
|
828
|
+
|
|
829
|
+
def download_fastq(self,
|
|
830
|
+
accession: str,
|
|
831
|
+
output_dir: Optional[Path] = None,
|
|
832
|
+
max_files: int = 10) -> Dict[str, Any]:
|
|
833
|
+
"""下载FASTQ文件"""
|
|
834
|
+
if output_dir is None:
|
|
835
|
+
output_dir = self.cache_dir / accession
|
|
836
|
+
else:
|
|
837
|
+
output_dir = Path(output_dir) / accession
|
|
838
|
+
|
|
839
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
840
|
+
|
|
841
|
+
# 获取下载链接
|
|
842
|
+
links = self.get_fastq_links(accession)
|
|
843
|
+
|
|
844
|
+
if not links:
|
|
845
|
+
return {
|
|
846
|
+
'accession': accession,
|
|
847
|
+
'success': False,
|
|
848
|
+
'error': 'No download links found',
|
|
849
|
+
'files': []
|
|
850
|
+
}
|
|
851
|
+
|
|
852
|
+
logger.info(f"Found {len(links)} download links for {accession}")
|
|
853
|
+
|
|
854
|
+
# 限制下载文件数量
|
|
855
|
+
if len(links) > max_files:
|
|
856
|
+
logger.info(f"Limiting to {max_files} files")
|
|
857
|
+
links = links[:max_files]
|
|
858
|
+
|
|
859
|
+
# 并行下载
|
|
860
|
+
downloaded_files = []
|
|
861
|
+
|
|
862
|
+
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
|
863
|
+
# 提交下载任务
|
|
864
|
+
future_to_url = {
|
|
865
|
+
executor.submit(self._download_file, url, output_dir): url
|
|
866
|
+
for url in links
|
|
867
|
+
}
|
|
868
|
+
|
|
869
|
+
# 使用进度条
|
|
870
|
+
for future in tqdm(as_completed(future_to_url),
|
|
871
|
+
total=len(links),
|
|
872
|
+
desc=f"Downloading {accession}"):
|
|
873
|
+
url = future_to_url[future]
|
|
874
|
+
try:
|
|
875
|
+
result = future.result(timeout=300)
|
|
876
|
+
if result['success']:
|
|
877
|
+
downloaded_files.append(result['filepath'])
|
|
878
|
+
else:
|
|
879
|
+
logger.error(f"Failed to download {url}: {result.get('error')}")
|
|
880
|
+
except Exception as e:
|
|
881
|
+
logger.error(f"Download task failed for {url}: {e}")
|
|
882
|
+
|
|
883
|
+
return {
|
|
884
|
+
'accession': accession,
|
|
885
|
+
'success': len(downloaded_files) > 0,
|
|
886
|
+
'files': downloaded_files,
|
|
887
|
+
'output_dir': str(output_dir),
|
|
888
|
+
'metadata': self.get_metadata(accession)
|
|
889
|
+
}
|
|
890
|
+
|
|
891
|
+
def _download_file(self, url: str, output_dir: Path) -> Dict[str, Any]:
|
|
892
|
+
"""下载单个文件"""
|
|
893
|
+
filename = self._extract_filename(url)
|
|
894
|
+
filepath = output_dir / filename
|
|
895
|
+
|
|
896
|
+
# 检查文件是否已存在
|
|
897
|
+
if filepath.exists():
|
|
898
|
+
file_size = filepath.stat().st_size
|
|
899
|
+
if file_size > 1024: # 大于1KB认为文件完整
|
|
900
|
+
logger.debug(f"File already exists: {filepath}")
|
|
901
|
+
return {
|
|
902
|
+
'success': True,
|
|
903
|
+
'filepath': str(filepath),
|
|
904
|
+
'size': file_size,
|
|
905
|
+
'cached': True
|
|
906
|
+
}
|
|
907
|
+
|
|
908
|
+
try:
|
|
909
|
+
# 根据URL协议选择下载方法
|
|
910
|
+
if url.startswith('ftp://'):
|
|
911
|
+
result = self._download_ftp(url, filepath)
|
|
912
|
+
elif url.startswith('http'):
|
|
913
|
+
result = self._download_http(url, filepath)
|
|
914
|
+
else:
|
|
915
|
+
result = {'success': False, 'error': f'Unsupported protocol: {url}'}
|
|
916
|
+
|
|
917
|
+
if result['success']:
|
|
918
|
+
logger.info(f"Downloaded: {filename}")
|
|
919
|
+
|
|
920
|
+
return result
|
|
921
|
+
|
|
922
|
+
except Exception as e:
|
|
923
|
+
logger.error(f"Download failed for {url}: {e}")
|
|
924
|
+
return {'success': False, 'error': str(e)}
|
|
925
|
+
|
|
926
|
+
def _extract_filename(self, url: str) -> str:
|
|
927
|
+
"""从URL提取文件名"""
|
|
928
|
+
# 移除查询参数
|
|
929
|
+
if '?' in url:
|
|
930
|
+
url = url.split('?')[0]
|
|
931
|
+
|
|
932
|
+
# 获取最后一部分作为文件名
|
|
933
|
+
filename = url.split('/')[-1]
|
|
934
|
+
|
|
935
|
+
# 如果文件名为空,使用默认名
|
|
936
|
+
if not filename or filename.endswith('/'):
|
|
937
|
+
return "unknown_file.fastq.gz"
|
|
938
|
+
|
|
939
|
+
return filename
|
|
940
|
+
|
|
941
|
+
def _download_http(self, url: str, filepath: Path) -> Dict[str, Any]:
|
|
942
|
+
"""下载HTTP/HTTPS文件"""
|
|
943
|
+
try:
|
|
944
|
+
response = self.session.get(url, stream=True, timeout=60)
|
|
945
|
+
response.raise_for_status()
|
|
946
|
+
|
|
947
|
+
total_size = int(response.headers.get('content-length', 0))
|
|
948
|
+
|
|
949
|
+
with open(filepath, 'wb') as f:
|
|
950
|
+
downloaded = 0
|
|
951
|
+
for chunk in response.iter_content(chunk_size=8192):
|
|
952
|
+
if chunk:
|
|
953
|
+
f.write(chunk)
|
|
954
|
+
downloaded += len(chunk)
|
|
955
|
+
|
|
956
|
+
actual_size = filepath.stat().st_size
|
|
957
|
+
|
|
958
|
+
return {
|
|
959
|
+
'success': True,
|
|
960
|
+
'filepath': str(filepath),
|
|
961
|
+
'size': actual_size,
|
|
962
|
+
'expected_size': total_size
|
|
963
|
+
}
|
|
964
|
+
|
|
965
|
+
except Exception as e:
|
|
966
|
+
return {'success': False, 'error': f'HTTP download failed: {e}'}
|
|
967
|
+
|
|
968
|
+
def _download_ftp(self, url: str, filepath: Path) -> Dict[str, Any]:
|
|
969
|
+
"""下载FTP文件"""
|
|
970
|
+
import ftplib
|
|
971
|
+
from urllib.parse import urlparse
|
|
972
|
+
|
|
973
|
+
try:
|
|
974
|
+
# 解析FTP URL
|
|
975
|
+
parsed = urlparse(url)
|
|
976
|
+
hostname = parsed.hostname
|
|
977
|
+
path = parsed.path
|
|
978
|
+
|
|
979
|
+
if not hostname:
|
|
980
|
+
return {'success': False, 'error': 'Invalid FTP URL'}
|
|
981
|
+
|
|
982
|
+
# 连接FTP服务器
|
|
983
|
+
ftp = ftplib.FTP(hostname, timeout=30)
|
|
984
|
+
ftp.login() # 匿名登录
|
|
985
|
+
|
|
986
|
+
# 提取目录和文件名
|
|
987
|
+
if '/' in path:
|
|
988
|
+
dir_path = '/'.join(path.split('/')[:-1]) or '/'
|
|
989
|
+
filename = path.split('/')[-1]
|
|
990
|
+
else:
|
|
991
|
+
dir_path = '/'
|
|
992
|
+
filename = path
|
|
993
|
+
|
|
994
|
+
# 切换到目录
|
|
995
|
+
if dir_path != '/':
|
|
996
|
+
try:
|
|
997
|
+
ftp.cwd(dir_path)
|
|
998
|
+
except:
|
|
999
|
+
# 如果目录不存在,尝试创建路径
|
|
1000
|
+
pass
|
|
1001
|
+
|
|
1002
|
+
# 获取文件大小
|
|
1003
|
+
try:
|
|
1004
|
+
ftp.sendcmd("TYPE I") # 二进制模式
|
|
1005
|
+
file_size = ftp.size(filename)
|
|
1006
|
+
except:
|
|
1007
|
+
file_size = 0
|
|
1008
|
+
|
|
1009
|
+
# 下载文件
|
|
1010
|
+
with open(filepath, 'wb') as f:
|
|
1011
|
+
ftp.retrbinary(f"RETR {filename}", f.write)
|
|
1012
|
+
|
|
1013
|
+
ftp.quit()
|
|
1014
|
+
|
|
1015
|
+
actual_size = filepath.stat().st_size
|
|
1016
|
+
|
|
1017
|
+
return {
|
|
1018
|
+
'success': True,
|
|
1019
|
+
'filepath': str(filepath),
|
|
1020
|
+
'size': actual_size,
|
|
1021
|
+
'expected_size': file_size
|
|
1022
|
+
}
|
|
1023
|
+
|
|
1024
|
+
except Exception as e:
|
|
1025
|
+
return {'success': False, 'error': f'FTP download failed: {e}'}
|
|
1026
|
+
|
|
1027
|
+
def batch_download(self,
|
|
1028
|
+
accessions: List[str],
|
|
1029
|
+
output_dir: Optional[Path] = None) -> Dict[str, Any]:
|
|
1030
|
+
"""批量下载多个accession"""
|
|
1031
|
+
results = {}
|
|
1032
|
+
|
|
1033
|
+
for accession in tqdm(accessions, desc="Processing accessions"):
|
|
1034
|
+
try:
|
|
1035
|
+
result = self.download_fastq(accession, output_dir)
|
|
1036
|
+
results[accession] = result
|
|
1037
|
+
except Exception as e:
|
|
1038
|
+
results[accession] = {
|
|
1039
|
+
'accession': accession,
|
|
1040
|
+
'success': False,
|
|
1041
|
+
'error': str(e)
|
|
1042
|
+
}
|
|
1043
|
+
|
|
1044
|
+
# 统计
|
|
1045
|
+
total = len(accessions)
|
|
1046
|
+
successful = sum(1 for r in results.values() if r.get('success', False))
|
|
1047
|
+
|
|
1048
|
+
return {
|
|
1049
|
+
'total': total,
|
|
1050
|
+
'successful': successful,
|
|
1051
|
+
'failed': total - successful,
|
|
1052
|
+
'results': results
|
|
1053
|
+
}
|
|
1054
|
+
@dataclass
|
|
1055
|
+
class DatasetConfig:
|
|
1056
|
+
"""数据集配置"""
|
|
1057
|
+
dataset_id: str
|
|
1058
|
+
data_type: DataSource
|
|
1059
|
+
data_format: DataFormat
|
|
1060
|
+
organism: Optional[str] = None
|
|
1061
|
+
platform: Optional[str] = None
|
|
1062
|
+
samples: Optional[List[str]] = None
|
|
1063
|
+
force_download: bool = False
|
|
1064
|
+
custom_params: Dict[str, Any] = field(default_factory=dict)
|
|
1065
|
+
|
|
1066
|
+
@classmethod
|
|
1067
|
+
def from_accession(cls, accession: str, **kwargs) -> 'DatasetConfig':
|
|
1068
|
+
"""从accession创建配置"""
|
|
1069
|
+
data_type = DataSource.from_accession(accession)
|
|
1070
|
+
data_format = DataFormat.infer_format(data_type, **kwargs)
|
|
1071
|
+
|
|
1072
|
+
return cls(
|
|
1073
|
+
dataset_id=accession,
|
|
1074
|
+
data_type=data_type,
|
|
1075
|
+
data_format=data_format,
|
|
1076
|
+
organism=kwargs.get('organism'),
|
|
1077
|
+
platform=kwargs.get('platform'),
|
|
1078
|
+
samples=kwargs.get('samples'),
|
|
1079
|
+
force_download=kwargs.get('force_download', False),
|
|
1080
|
+
custom_params={k: v for k, v in kwargs.items()
|
|
1081
|
+
if k not in ['dataset_id', 'organism', 'platform',
|
|
1082
|
+
'samples', 'force_download']}
|
|
1083
|
+
)
|
|
1084
|
+
|
|
1085
|
+
class CacheManager:
|
|
1086
|
+
"""缓存管理器"""
|
|
1087
|
+
|
|
1088
|
+
def __init__(self, cache_dir: Path):
|
|
1089
|
+
self.cache_dir = cache_dir
|
|
1090
|
+
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
1091
|
+
self.metadata_file = cache_dir / "cache_metadata.json"
|
|
1092
|
+
self._load_metadata()
|
|
1093
|
+
|
|
1094
|
+
def _load_metadata(self):
|
|
1095
|
+
"""加载缓存元数据"""
|
|
1096
|
+
if self.metadata_file.exists():
|
|
1097
|
+
with open(self.metadata_file, 'r') as f:
|
|
1098
|
+
self.metadata = json.load(f)
|
|
1099
|
+
else:
|
|
1100
|
+
self.metadata = {}
|
|
1101
|
+
|
|
1102
|
+
def _save_metadata(self):
|
|
1103
|
+
"""保存缓存元数据"""
|
|
1104
|
+
with open(self.metadata_file, 'w') as f:
|
|
1105
|
+
json.dump(self.metadata, f, indent=2)
|
|
1106
|
+
|
|
1107
|
+
def get_cache_key(self, config: DatasetConfig) -> str:
|
|
1108
|
+
"""生成缓存键"""
|
|
1109
|
+
key_parts = [
|
|
1110
|
+
config.dataset_id,
|
|
1111
|
+
config.data_type.value,
|
|
1112
|
+
config.data_format.value,
|
|
1113
|
+
config.organism or 'any',
|
|
1114
|
+
config.platform or 'any',
|
|
1115
|
+
str(sorted(config.samples)) if config.samples else 'all'
|
|
1116
|
+
]
|
|
1117
|
+
key_string = '|'.join(key_parts)
|
|
1118
|
+
return hashlib.md5(key_string.encode()).hexdigest()
|
|
1119
|
+
|
|
1120
|
+
def get_cache_path(self, config: DatasetConfig) -> Path:
|
|
1121
|
+
"""获取缓存路径"""
|
|
1122
|
+
cache_key = self.get_cache_key(config)
|
|
1123
|
+
cache_dir = self.cache_dir / config.data_type.value
|
|
1124
|
+
cache_dir.mkdir(exist_ok=True)
|
|
1125
|
+
return cache_dir / f"{cache_key}.pkl"
|
|
1126
|
+
|
|
1127
|
+
def exists(self, config: DatasetConfig) -> bool:
|
|
1128
|
+
"""检查缓存是否存在"""
|
|
1129
|
+
cache_path = self.get_cache_path(config)
|
|
1130
|
+
return cache_path.exists()
|
|
1131
|
+
|
|
1132
|
+
def load(self, config: DatasetConfig) -> Optional[Any]:
|
|
1133
|
+
"""从缓存加载数据"""
|
|
1134
|
+
cache_path = self.get_cache_path(config)
|
|
1135
|
+
|
|
1136
|
+
if cache_path.exists():
|
|
1137
|
+
try:
|
|
1138
|
+
with open(cache_path, 'rb') as f:
|
|
1139
|
+
data = pickle.load(f)
|
|
1140
|
+
|
|
1141
|
+
# 更新访问时间
|
|
1142
|
+
cache_key = self.get_cache_key(config)
|
|
1143
|
+
self.metadata[cache_key] = {
|
|
1144
|
+
'last_accessed': datetime.now().isoformat(),
|
|
1145
|
+
'dataset_id': config.dataset_id,
|
|
1146
|
+
'data_type': config.data_type.value,
|
|
1147
|
+
'data_format': config.data_format.value
|
|
1148
|
+
}
|
|
1149
|
+
self._save_metadata()
|
|
1150
|
+
|
|
1151
|
+
logger.info(f"Loaded from cache: {cache_path}")
|
|
1152
|
+
return data
|
|
1153
|
+
except Exception as e:
|
|
1154
|
+
logger.warning(f"Failed to load cache: {e}")
|
|
1155
|
+
|
|
1156
|
+
return None
|
|
1157
|
+
|
|
1158
|
+
def save(self, config: DatasetConfig, data: Any):
|
|
1159
|
+
"""保存数据到缓存"""
|
|
1160
|
+
cache_path = self.get_cache_path(config)
|
|
1161
|
+
|
|
1162
|
+
try:
|
|
1163
|
+
with open(cache_path, 'wb') as f:
|
|
1164
|
+
pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
|
|
1165
|
+
|
|
1166
|
+
# 更新元数据
|
|
1167
|
+
cache_key = self.get_cache_key(config)
|
|
1168
|
+
self.metadata[cache_key] = {
|
|
1169
|
+
'created': datetime.now().isoformat(),
|
|
1170
|
+
'last_accessed': datetime.now().isoformat(),
|
|
1171
|
+
'dataset_id': config.dataset_id,
|
|
1172
|
+
'data_type': config.data_type.value,
|
|
1173
|
+
'data_format': config.data_format.value,
|
|
1174
|
+
'size': cache_path.stat().st_size if cache_path.exists() else 0
|
|
1175
|
+
}
|
|
1176
|
+
self._save_metadata()
|
|
1177
|
+
|
|
1178
|
+
logger.info(f"Saved to cache: {cache_path}")
|
|
1179
|
+
except Exception as e:
|
|
1180
|
+
logger.error(f"Failed to save cache: {e}")
|
|
1181
|
+
|
|
1182
|
+
def clear_cache(self, data_type: Optional[str] = None, older_than_days: Optional[int] = None):
|
|
1183
|
+
"""清理缓存"""
|
|
1184
|
+
cache_files = list(self.cache_dir.rglob("*.pkl"))
|
|
1185
|
+
|
|
1186
|
+
for cache_file in cache_files:
|
|
1187
|
+
try:
|
|
1188
|
+
if data_type and data_type not in str(cache_file):
|
|
1189
|
+
continue
|
|
1190
|
+
|
|
1191
|
+
if older_than_days:
|
|
1192
|
+
file_age = datetime.now().timestamp() - cache_file.stat().st_mtime
|
|
1193
|
+
if file_age < older_than_days * 86400:
|
|
1194
|
+
continue
|
|
1195
|
+
|
|
1196
|
+
cache_file.unlink()
|
|
1197
|
+
logger.info(f"Removed cache: {cache_file}")
|
|
1198
|
+
except Exception as e:
|
|
1199
|
+
logger.error(f"Failed to remove cache {cache_file}: {e}")
|
|
1200
|
+
|
|
1201
|
+
self._load_metadata() # 重新加载元数据
|
|
1202
|
+
|
|
1203
|
+
class BioDataFetcher:
|
|
1204
|
+
"""
|
|
1205
|
+
生物信息学数据获取器终极版
|
|
1206
|
+
支持多数据源、自动类型推断、智能缓存和并行下载
|
|
1207
|
+
"""
|
|
1208
|
+
|
|
1209
|
+
def __init__(self, dir_save: str = "./bio_data_cache",
|
|
1210
|
+
config_file: Optional[str] = None,
|
|
1211
|
+
auto_infer: bool = True,
|
|
1212
|
+
prefer_fastq_dump: bool = True):
|
|
1213
|
+
"""
|
|
1214
|
+
初始化数据获取器
|
|
1215
|
+
|
|
1216
|
+
Parameters:
|
|
1217
|
+
-----------
|
|
1218
|
+
dir_save : str
|
|
1219
|
+
数据缓存目录
|
|
1220
|
+
config_file : str
|
|
1221
|
+
配置文件路径(YAML或JSON格式)
|
|
1222
|
+
auto_infer : bool
|
|
1223
|
+
是否启用自动类型推断
|
|
1224
|
+
prefer_fastq_dump : bool
|
|
1225
|
+
是否优先使用fastq-dump下载SRA数据
|
|
1226
|
+
"""
|
|
1227
|
+
self.dir_save = Path(dir_save)
|
|
1228
|
+
self.auto_infer = auto_infer
|
|
1229
|
+
self.prefer_fastq_dump = prefer_fastq_dump
|
|
1230
|
+
# 初始化缓存管理器
|
|
1231
|
+
self.cache = CacheManager(self.dir_save)
|
|
1232
|
+
|
|
1233
|
+
# 加载配置
|
|
1234
|
+
self.config = self._load_config(config_file)
|
|
1235
|
+
|
|
1236
|
+
# 数据源API客户端
|
|
1237
|
+
self.sra_client = None
|
|
1238
|
+
self.mygene_client = None
|
|
1239
|
+
self._init_clients()
|
|
1240
|
+
# 检查fastq-dump是否可用
|
|
1241
|
+
self.fastq_dump_available = self._check_fastq_dump_available()
|
|
1242
|
+
|
|
1243
|
+
# 数据源处理器映射 - 使用字符串键确保一致性
|
|
1244
|
+
self.data_processors = {
|
|
1245
|
+
'geo': self._process_geo,
|
|
1246
|
+
'sra': self._process_sra,
|
|
1247
|
+
'tcga': self._process_tcga,
|
|
1248
|
+
'encode': self._process_encode,
|
|
1249
|
+
'arrayexpress': self._process_array_express,
|
|
1250
|
+
'single_cell': self._process_single_cell,
|
|
1251
|
+
'custom': self._process_custom,
|
|
1252
|
+
# 同时支持枚举键的别名
|
|
1253
|
+
DataSource.GEO: self._process_geo,
|
|
1254
|
+
DataSource.SRA: self._process_sra,
|
|
1255
|
+
DataSource.TCGA: self._process_tcga,
|
|
1256
|
+
DataSource.ENCODE: self._process_encode,
|
|
1257
|
+
DataSource.ARRAY_EXPRESS: self._process_array_express,
|
|
1258
|
+
DataSource.SINGLE_CELL: self._process_single_cell,
|
|
1259
|
+
DataSource.CUSTOM: self._process_custom,
|
|
1260
|
+
}
|
|
1261
|
+
|
|
1262
|
+
# 注册数据库API信息
|
|
1263
|
+
self.database_apis = {
|
|
1264
|
+
'ncbi': {
|
|
1265
|
+
'base_url': 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/',
|
|
1266
|
+
'formats': ['xml', 'json'],
|
|
1267
|
+
'rate_limit': 3
|
|
1268
|
+
},
|
|
1269
|
+
'ena': {
|
|
1270
|
+
'base_url': 'https://www.ebi.ac.uk/ena/portal/api/',
|
|
1271
|
+
'formats': ['json'],
|
|
1272
|
+
'rate_limit': 10
|
|
1273
|
+
},
|
|
1274
|
+
'gdc': {
|
|
1275
|
+
'base_url': 'https://api.gdc.cancer.gov/',
|
|
1276
|
+
'formats': ['json'],
|
|
1277
|
+
'rate_limit': 5
|
|
1278
|
+
},
|
|
1279
|
+
'encode': {
|
|
1280
|
+
'base_url': 'https://www.encodeproject.org/',
|
|
1281
|
+
'formats': ['json'],
|
|
1282
|
+
'rate_limit': 5
|
|
1283
|
+
}
|
|
1284
|
+
}
|
|
1285
|
+
|
|
1286
|
+
logger.info(f"BioDataFetcher initialized with cache dir: {self.dir_save}")
|
|
1287
|
+
if self.fastq_dump_available and self.prefer_fastq_dump:
|
|
1288
|
+
logger.info("fastq-dump available, will use it for SRA downloads")
|
|
1289
|
+
def _check_fastq_dump_available(self) -> bool:
|
|
1290
|
+
"""检查fastq-dump是否可用"""
|
|
1291
|
+
import shutil
|
|
1292
|
+
|
|
1293
|
+
# 检查主要工具
|
|
1294
|
+
tools_to_check = ['fastq-dump', 'prefetch']
|
|
1295
|
+
available_tools = []
|
|
1296
|
+
|
|
1297
|
+
for tool in tools_to_check:
|
|
1298
|
+
path = shutil.which(tool)
|
|
1299
|
+
if path:
|
|
1300
|
+
available_tools.append((tool, path))
|
|
1301
|
+
logger.debug(f"{tool} found: {path}")
|
|
1302
|
+
else:
|
|
1303
|
+
logger.debug(f"{tool} not found in PATH")
|
|
1304
|
+
|
|
1305
|
+
if len(available_tools) >= 1: # 至少需要fastq-dump
|
|
1306
|
+
logger.info(f"fastq-dump tools available: {[t[0] for t in available_tools]}")
|
|
1307
|
+
return True
|
|
1308
|
+
else:
|
|
1309
|
+
install_fastq_dump_helper()
|
|
1310
|
+
logger.warning("fastq-dump not available. SRA downloads may use FTP fallback.")
|
|
1311
|
+
return False
|
|
1312
|
+
|
|
1313
|
+
def _load_config(self, config_file: Optional[str]) -> Dict:
|
|
1314
|
+
"""加载配置文件"""
|
|
1315
|
+
default_config = {
|
|
1316
|
+
'max_retries': 3,
|
|
1317
|
+
'timeout': 30,
|
|
1318
|
+
'batch_size': 10,
|
|
1319
|
+
'prefer_cached': True,
|
|
1320
|
+
'download_fastq': False,
|
|
1321
|
+
'parallel_downloads': 4,
|
|
1322
|
+
'ncbi_api_key': None,
|
|
1323
|
+
'ensembl_api_key': None,
|
|
1324
|
+
'max_cache_size_gb': 10,
|
|
1325
|
+
'auto_normalize': True,
|
|
1326
|
+
'gene_id_conversion': True,
|
|
1327
|
+
'quality_control': True,
|
|
1328
|
+
'prefer_fastq_dump': True, # 是否优先使用fastq-dump
|
|
1329
|
+
'fastq_dump_split_files': True, # 是否拆分文件
|
|
1330
|
+
'fastq_dump_gzip_output': True, # 是否gzip压缩
|
|
1331
|
+
'fastq_dump_use_prefetch': True, # 是否使用prefetch
|
|
1332
|
+
'fastq_dump_threads': 4, # 线程数
|
|
1333
|
+
'fastq_dump_max_retries': 2, # 最大重试次数
|
|
1334
|
+
}
|
|
1335
|
+
|
|
1336
|
+
if config_file and Path(config_file).exists():
|
|
1337
|
+
try:
|
|
1338
|
+
with open(config_file, 'r') as f:
|
|
1339
|
+
if config_file.endswith('.yaml') or config_file.endswith('.yml'):
|
|
1340
|
+
user_config = yaml.safe_load(f)
|
|
1341
|
+
elif config_file.endswith('.json'):
|
|
1342
|
+
user_config = json.load(f)
|
|
1343
|
+
else:
|
|
1344
|
+
logger.warning(f"Unsupported config file format: {config_file}")
|
|
1345
|
+
return default_config
|
|
1346
|
+
|
|
1347
|
+
# 合并配置
|
|
1348
|
+
default_config.update(user_config)
|
|
1349
|
+
except Exception as e:
|
|
1350
|
+
logger.error(f"Error loading config file: {e}")
|
|
1351
|
+
|
|
1352
|
+
return default_config
|
|
1353
|
+
|
|
1354
|
+
def _init_clients(self):
|
|
1355
|
+
"""初始化API客户端"""
|
|
1356
|
+
if SRADB_AVAILABLE:
|
|
1357
|
+
try:
|
|
1358
|
+
self.sra_client = SRAweb()
|
|
1359
|
+
logger.info("SRAweb client initialized")
|
|
1360
|
+
except Exception as e:
|
|
1361
|
+
logger.warning(f"Failed to initialize SRAweb client: {e}")
|
|
1362
|
+
|
|
1363
|
+
if MYGENE_AVAILABLE:
|
|
1364
|
+
try:
|
|
1365
|
+
self.mygene_client = mygene.MyGeneInfo()
|
|
1366
|
+
logger.info("MyGene client initialized")
|
|
1367
|
+
except Exception as e:
|
|
1368
|
+
logger.warning(f"Failed to initialize MyGene client: {e}")
|
|
1369
|
+
|
|
1370
|
+
def fetch_data(self,
|
|
1371
|
+
dataset_ids: Union[str, List[str]],
|
|
1372
|
+
data_type: Optional[str] = None,
|
|
1373
|
+
data_format: Optional[str] = None,
|
|
1374
|
+
organism: Optional[str] = None,
|
|
1375
|
+
platform: Optional[str] = None,
|
|
1376
|
+
samples: Optional[List[str]] = None,
|
|
1377
|
+
force_download: bool = False,
|
|
1378
|
+
**kwargs) -> Dict[str, Any]:
|
|
1379
|
+
"""
|
|
1380
|
+
通用数据获取函数(智能版)
|
|
1381
|
+
|
|
1382
|
+
Parameters:
|
|
1383
|
+
-----------
|
|
1384
|
+
dataset_ids : Union[str, List[str]]
|
|
1385
|
+
数据集ID或ID列表
|
|
1386
|
+
data_type : Optional[str]
|
|
1387
|
+
数据类型,如未指定则自动推断
|
|
1388
|
+
data_format : Optional[str]
|
|
1389
|
+
数据格式,如未指定则自动推断
|
|
1390
|
+
organism : Optional[str]
|
|
1391
|
+
物种
|
|
1392
|
+
platform : Optional[str]
|
|
1393
|
+
平台类型
|
|
1394
|
+
samples : Optional[List[str]]
|
|
1395
|
+
指定样本ID列表
|
|
1396
|
+
force_download : bool
|
|
1397
|
+
强制重新下载,忽略缓存
|
|
1398
|
+
|
|
1399
|
+
Returns:
|
|
1400
|
+
--------
|
|
1401
|
+
Dict[str, Any]: 包含数据和元数据的字典
|
|
1402
|
+
"""
|
|
1403
|
+
if isinstance(dataset_ids, str):
|
|
1404
|
+
dataset_ids = [dataset_ids]
|
|
1405
|
+
|
|
1406
|
+
results = {}
|
|
1407
|
+
|
|
1408
|
+
for dataset_id in dataset_ids:
|
|
1409
|
+
try:
|
|
1410
|
+
# 自动推断数据类型
|
|
1411
|
+
inferred_type = data_type or self._infer_data_type(dataset_id)
|
|
1412
|
+
|
|
1413
|
+
# 创建数据集配置
|
|
1414
|
+
config = DatasetConfig(
|
|
1415
|
+
dataset_id=dataset_id,
|
|
1416
|
+
data_type=DataSource(inferred_type),
|
|
1417
|
+
data_format=DataFormat(data_format or 'expression'),
|
|
1418
|
+
organism=organism,
|
|
1419
|
+
platform=platform,
|
|
1420
|
+
samples=samples,
|
|
1421
|
+
force_download=force_download,
|
|
1422
|
+
custom_params=kwargs
|
|
1423
|
+
)
|
|
1424
|
+
|
|
1425
|
+
# 获取数据
|
|
1426
|
+
result = self._fetch_with_config(config)
|
|
1427
|
+
results[dataset_id] = result
|
|
1428
|
+
|
|
1429
|
+
except Exception as e:
|
|
1430
|
+
logger.error(f"Failed to fetch data for {dataset_id}: {e}")
|
|
1431
|
+
results[dataset_id] = {
|
|
1432
|
+
'error': str(e),
|
|
1433
|
+
'traceback': self._format_exception(e)
|
|
1434
|
+
}
|
|
1435
|
+
|
|
1436
|
+
# 记录下载历史
|
|
1437
|
+
self._record_download_history(dataset_ids)
|
|
1438
|
+
|
|
1439
|
+
return results
|
|
1440
|
+
|
|
1441
|
+
def _infer_data_type(self, dataset_id: str) -> str:
|
|
1442
|
+
"""根据数据集ID推断数据类型"""
|
|
1443
|
+
if self.auto_infer:
|
|
1444
|
+
return DataSource.from_accession(dataset_id).value
|
|
1445
|
+
|
|
1446
|
+
# 使用启发式规则
|
|
1447
|
+
dataset_id = dataset_id.upper()
|
|
1448
|
+
|
|
1449
|
+
# GEO系列
|
|
1450
|
+
if dataset_id.startswith('GSE') or dataset_id.startswith('GDS'):
|
|
1451
|
+
return 'geo'
|
|
1452
|
+
|
|
1453
|
+
# SRA运行
|
|
1454
|
+
elif dataset_id.startswith(('SRR', 'ERR', 'DRR')):
|
|
1455
|
+
return 'sra'
|
|
1456
|
+
|
|
1457
|
+
# TCGA项目
|
|
1458
|
+
elif dataset_id.startswith('TCGA'):
|
|
1459
|
+
return 'tcga'
|
|
1460
|
+
|
|
1461
|
+
# ENCODE实验
|
|
1462
|
+
elif dataset_id.startswith('ENC'):
|
|
1463
|
+
return 'encode'
|
|
1464
|
+
|
|
1465
|
+
# ArrayExpress
|
|
1466
|
+
elif re.match(r'^E-[A-Z]{4}-\d+$', dataset_id):
|
|
1467
|
+
return 'arrayexpress'
|
|
1468
|
+
|
|
1469
|
+
# 默认使用GEO
|
|
1470
|
+
else:
|
|
1471
|
+
return 'geo'
|
|
1472
|
+
|
|
1473
|
+
def _fetch_with_config(self, config: DatasetConfig) -> Any:
|
|
1474
|
+
"""使用配置获取数据"""
|
|
1475
|
+
dataset_id = config.dataset_id
|
|
1476
|
+
|
|
1477
|
+
# 检查缓存
|
|
1478
|
+
if not config.force_download and self.config['prefer_cached']:
|
|
1479
|
+
cached_data = self.cache.load(config)
|
|
1480
|
+
if cached_data is not None:
|
|
1481
|
+
logger.info(f"Using cached data for {dataset_id}")
|
|
1482
|
+
return cached_data
|
|
1483
|
+
|
|
1484
|
+
logger.info(f"Fetching data for {dataset_id} [{config.data_type.value}/{config.data_format.value}]")
|
|
1485
|
+
|
|
1486
|
+
# 根据数据类型选择处理器
|
|
1487
|
+
# 将枚举类型转换为字符串键进行查找
|
|
1488
|
+
data_type_key = config.data_type.value if isinstance(config.data_type, DataSource) else config.data_type
|
|
1489
|
+
|
|
1490
|
+
processor = self.data_processors.get(data_type_key)
|
|
1491
|
+
if not processor:
|
|
1492
|
+
# 如果直接查找失败,尝试使用枚举值查找
|
|
1493
|
+
if isinstance(config.data_type, DataSource):
|
|
1494
|
+
processor = self.data_processors.get(config.data_type)
|
|
1495
|
+
|
|
1496
|
+
if not processor:
|
|
1497
|
+
# 最后尝试将字符串转换为枚举
|
|
1498
|
+
try:
|
|
1499
|
+
enum_type = DataSource(data_type_key)
|
|
1500
|
+
processor = self.data_processors.get(enum_type)
|
|
1501
|
+
except ValueError:
|
|
1502
|
+
pass
|
|
1503
|
+
|
|
1504
|
+
if not processor:
|
|
1505
|
+
raise ValueError(f"No processor for data type: {config.data_type} (key: {data_type_key})")
|
|
1509
|
+
|
|
1510
|
+
# 获取数据
|
|
1511
|
+
data = processor(config)
|
|
1512
|
+
|
|
1513
|
+
# 后处理
|
|
1514
|
+
data = self._post_process(data, config)
|
|
1515
|
+
|
|
1516
|
+
# 保存到缓存
|
|
1517
|
+
if not config.force_download:
|
|
1518
|
+
self.cache.save(config, data)
|
|
1519
|
+
|
|
1520
|
+
return data
|
|
1521
|
+
|
|
1522
|
+
def _process_geo(self, config: DatasetConfig) -> Any:
|
|
1523
|
+
"""处理GEO数据"""
|
|
1524
|
+
if not GEO_UTILS_AVAILABLE:
|
|
1525
|
+
raise ImportError("GEO utilities not available")
|
|
1526
|
+
|
|
1527
|
+
# 使用现有的GEO函数
|
|
1528
|
+
geo_data = geo_utils.load_geo(
|
|
1529
|
+
datasets=config.dataset_id,
|
|
1530
|
+
dir_save=str(self.dir_save / "geo"),
|
|
1531
|
+
verbose=config.custom_params.get('verbose', False)
|
|
1532
|
+
)
|
|
1533
|
+
|
|
1534
|
+
# 根据格式提取数据
|
|
1535
|
+
if config.data_format == DataFormat.EXPRESSION:
|
|
1536
|
+
data = geo_utils.get_data(
|
|
1537
|
+
geo=geo_data,
|
|
1538
|
+
dataset=config.dataset_id,
|
|
1539
|
+
verbose=config.custom_params.get('verbose', False)
|
|
1540
|
+
)
|
|
1541
|
+
elif config.data_format == DataFormat.METADATA:
|
|
1542
|
+
data = geo_utils.get_meta(
|
|
1543
|
+
geo=geo_data,
|
|
1544
|
+
dataset=config.dataset_id,
|
|
1545
|
+
verbose=config.custom_params.get('verbose', False)
|
|
1546
|
+
)
|
|
1547
|
+
elif config.data_format == DataFormat.PROBE:
|
|
1548
|
+
data = geo_utils.get_probe(
|
|
1549
|
+
geo=geo_data,
|
|
1550
|
+
dataset=config.dataset_id,
|
|
1551
|
+
platform_id=config.platform,
|
|
1552
|
+
verbose=config.custom_params.get('verbose', False)
|
|
1553
|
+
)
|
|
1554
|
+
else:
|
|
1555
|
+
raise ValueError(f"Unsupported GEO format: {config.data_format}")
|
|
1556
|
+
|
|
1557
|
+
# 过滤样本
|
|
1558
|
+
if config.samples:
|
|
1559
|
+
data = self._filter_samples(data, config.samples)
|
|
1560
|
+
|
|
1561
|
+
return data
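# Illustrative sketch of driving the GEO path above through the public API
# (the GSE accession is only an example; requires the optional geo_utils dependency):
#   fetcher = BioDataFetcher(dir_save="./bio_data_cache")
#   meta = fetcher.fetch_data("GSE158055", data_type="geo", data_format="metadata")
#   expr = fetcher.fetch_data("GSE158055", data_type="geo", data_format="expression")
#   # each call returns {"GSE158055": <DataFrame or error dict>}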
|
|
1562
|
+
def _process_sra(self, config: DatasetConfig) -> Any:
|
|
1563
|
+
"""
|
|
1564
|
+
Smart SRA processor.
|
|
1565
|
+
Prefers fastq-dump and falls back to FTP download when it fails.
|
|
1566
|
+
"""
|
|
1567
|
+
dataset_id = config.dataset_id
|
|
1568
|
+
|
|
1569
|
+
if config.data_format == DataFormat.METADATA:
|
|
1570
|
+
# 元数据仍然使用原来的方法
|
|
1571
|
+
return self._process_sra_original(config)
|
|
1572
|
+
|
|
1573
|
+
elif config.data_format == DataFormat.FASTQ:
|
|
1574
|
+
# FASTQ下载:优先使用fastq-dump
|
|
1575
|
+
logger.info(f"Processing SRA FASTQ: {dataset_id}")
|
|
1576
|
+
|
|
1577
|
+
# 检查是否强制使用某种方法
|
|
1578
|
+
force_method = config.custom_params.get('download_method')
|
|
1579
|
+
|
|
1580
|
+
if force_method == 'fastq_dump' or (self.prefer_fastq_dump and self.fastq_dump_available and force_method != 'ftp'):
|
|
1581
|
+
# 尝试使用fastq-dump
|
|
1582
|
+
logger.info(f"Attempting fastq-dump for {dataset_id}")
|
|
1583
|
+
result = self._download_with_fastq_dump(config)
|
|
1584
|
+
|
|
1585
|
+
if result.get('success', False):
|
|
1586
|
+
logger.info(f"fastq-dump successful for {dataset_id}")
|
|
1587
|
+
return result
|
|
1588
|
+
else:
|
|
1589
|
+
logger.warning(f"fastq-dump failed for {dataset_id}: {result.get('error', 'unknown')}")
|
|
1590
|
+
|
|
1591
|
+
# 如果用户没有明确要求fastq-dump,回退到FTP
|
|
1592
|
+
if force_method != 'fastq_dump':
|
|
1593
|
+
logger.info(f"Falling back to FTP for {dataset_id}")
|
|
1594
|
+
return self._download_with_ftp(config)
|
|
1595
|
+
else:
|
|
1596
|
+
return result  # the user explicitly asked for fastq-dump, so return even on failure
|
|
1597
|
+
|
|
1598
|
+
else:
|
|
1599
|
+
# 使用FTP下载
|
|
1600
|
+
logger.info(f"Using FTP for {dataset_id}")
|
|
1601
|
+
return self._download_with_ftp(config)
|
|
1602
|
+
|
|
1603
|
+
else:
|
|
1604
|
+
raise ValueError(f"Unsupported SRA format: {config.data_format}")
|
|
1605
|
+
def _download_with_ftp(self, config: DatasetConfig) -> Dict[str, Any]:
|
|
1606
|
+
"""使用FTP下载(回退方法)"""
|
|
1607
|
+
dataset_id = config.dataset_id
|
|
1608
|
+
|
|
1609
|
+
logger.info(f"Using FTP fallback for {dataset_id}")
|
|
1610
|
+
|
|
1611
|
+
# 使用原来的SRADownloader
|
|
1612
|
+
downloader = SRADownloader(
|
|
1613
|
+
cache_dir=str(self.dir_save / "fastq"),
|
|
1614
|
+
max_workers=config.custom_params.get('parallel_downloads', 4)
|
|
1615
|
+
)
|
|
1616
|
+
|
|
1617
|
+
result = downloader.download_fastq(
|
|
1618
|
+
dataset_id,
|
|
1619
|
+
output_dir=self.dir_save / "fastq",
|
|
1620
|
+
max_files=config.custom_params.get('max_files', 10)
|
|
1621
|
+
)
|
|
1622
|
+
|
|
1623
|
+
# 添加方法标记
|
|
1624
|
+
if isinstance(result, dict):
|
|
1625
|
+
result['download_method'] = 'ftp'
|
|
1626
|
+
|
|
1627
|
+
return result
|
|
1628
|
+
def _process_sra_original(self, config: DatasetConfig) -> Any:
|
|
1629
|
+
"""处理SRA数据 - 使用独立的下载器"""
|
|
1630
|
+
dataset_id = config.dataset_id
|
|
1631
|
+
|
|
1632
|
+
if config.data_format == DataFormat.METADATA:
|
|
1633
|
+
# 使用独立的下载器获取元数据
|
|
1634
|
+
downloader = SRADownloader(cache_dir=str(self.dir_save / "sra"))
|
|
1635
|
+
metadata = downloader.get_metadata(dataset_id)
|
|
1636
|
+
|
|
1637
|
+
# 转换为DataFrame
|
|
1638
|
+
if isinstance(metadata, dict) and metadata:
|
|
1639
|
+
return pd.DataFrame([metadata])
|
|
1640
|
+
else:
|
|
1641
|
+
return pd.DataFrame()
|
|
1642
|
+
|
|
1643
|
+
elif config.data_format == DataFormat.FASTQ:
|
|
1644
|
+
# 使用独立的下载器下载FASTQ
|
|
1645
|
+
downloader = SRADownloader(
|
|
1646
|
+
cache_dir=str(self.dir_save / "fastq"),
|
|
1647
|
+
max_workers=config.custom_params.get('parallel_downloads', 4)
|
|
1648
|
+
)
|
|
1649
|
+
|
|
1650
|
+
result = downloader.download_fastq(
|
|
1651
|
+
dataset_id,
|
|
1652
|
+
output_dir=self.dir_save / "fastq",
|
|
1653
|
+
max_files=config.custom_params.get('max_files', 10)
|
|
1654
|
+
)
|
|
1655
|
+
|
|
1656
|
+
return result
|
|
1657
|
+
|
|
1658
|
+
else:
|
|
1659
|
+
raise ValueError(f"Unsupported SRA format: {config.data_format}")
|
|
1660
|
+
|
|
1661
|
+
def _download_sra_fastq(self, config: DatasetConfig) -> Dict:
|
|
1662
|
+
"""下载SRA FASTQ文件"""
|
|
1663
|
+
import requests
|
|
1664
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
1665
|
+
|
|
1666
|
+
dataset_id = config.dataset_id
|
|
1667
|
+
output_dir = self.dir_save / "fastq" / dataset_id
|
|
1668
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
1669
|
+
|
|
1670
|
+
# 获取下载链接
|
|
1671
|
+
download_links = self._get_sra_download_links(dataset_id)
|
|
1672
|
+
|
|
1673
|
+
if not download_links:
|
|
1674
|
+
raise ValueError(f"No download links found for {dataset_id}")
|
|
1675
|
+
|
|
1676
|
+
logger.info(f"Found {len(download_links)} download links for {dataset_id}")
|
|
1677
|
+
|
|
1678
|
+
# 并行下载
|
|
1679
|
+
downloaded_files = []
|
|
1680
|
+
max_workers = config.custom_params.get('parallel_downloads',
|
|
1681
|
+
self.config['parallel_downloads'])
|
|
1682
|
+
|
|
1683
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
1684
|
+
future_to_url = {
|
|
1685
|
+
executor.submit(self._download_file, url, output_dir, config): url
|
|
1686
|
+
for url in download_links[:10]  # download at most 10 files
|
|
1687
|
+
}
|
|
1688
|
+
|
|
1689
|
+
for future in tqdm(as_completed(future_to_url),
|
|
1690
|
+
total=len(future_to_url),
|
|
1691
|
+
desc=f"Downloading {dataset_id}"):
|
|
1692
|
+
url = future_to_url[future]
|
|
1693
|
+
try:
|
|
1694
|
+
filepath = future.result(timeout=300)
|
|
1695
|
+
if filepath:
|
|
1696
|
+
downloaded_files.append(str(filepath))
|
|
1697
|
+
except Exception as e:
|
|
1698
|
+
logger.error(f"Failed to download {url}: {e}")
|
|
1699
|
+
|
|
1700
|
+
return {
|
|
1701
|
+
'metadata': self._get_sra_metadata(dataset_id),
|
|
1702
|
+
'fastq_files': downloaded_files,
|
|
1703
|
+
'output_dir': str(output_dir)
|
|
1704
|
+
}
|
|
1705
|
+
|
|
1706
|
+
def _get_sra_download_links(self, accession: str) -> List[str]:
|
|
1707
|
+
"""获取SRA下载链接"""
|
|
1708
|
+
try:
|
|
1709
|
+
# 尝试ENA API
|
|
1710
|
+
ena_links = self._get_ena_download_links(accession)
|
|
1711
|
+
if ena_links:
|
|
1712
|
+
return ena_links
|
|
1713
|
+
|
|
1714
|
+
# 尝试NCBI
|
|
1715
|
+
ncbi_links = self._get_ncbi_download_links(accession)
|
|
1716
|
+
if ncbi_links:
|
|
1717
|
+
return ncbi_links
|
|
1718
|
+
|
|
1719
|
+
# 生成默认链接
|
|
1720
|
+
return self._generate_default_links(accession)
|
|
1721
|
+
|
|
1722
|
+
except Exception as e:
|
|
1723
|
+
logger.error(f"Failed to get download links for {accession}: {e}")
|
|
1724
|
+
return []
|
|
1725
|
+
|
|
1726
|
+
def _get_ena_download_links(self, accession: str) -> List[str]:
|
|
1727
|
+
"""从ENA获取下载链接"""
|
|
1728
|
+
if not REQUESTS_AVAILABLE:
|
|
1729
|
+
return []
|
|
1730
|
+
|
|
1731
|
+
try:
|
|
1732
|
+
url = "https://www.ebi.ac.uk/ena/portal/api/filereport"
|
|
1733
|
+
params = {
|
|
1734
|
+
'accession': accession,
|
|
1735
|
+
'result': 'read_run',
|
|
1736
|
+
'fields': 'fastq_ftp',
|
|
1737
|
+
'format': 'json'
|
|
1738
|
+
}
|
|
1739
|
+
|
|
1740
|
+
response = requests.get(url, params=params, timeout=30)
|
|
1741
|
+
response.raise_for_status()
|
|
1742
|
+
|
|
1743
|
+
data = response.json()
|
|
1744
|
+
if data and isinstance(data, list):
|
|
1745
|
+
links = []
|
|
1746
|
+
for item in data:
|
|
1747
|
+
if 'fastq_ftp' in item and item['fastq_ftp']:
|
|
1748
|
+
ftp_links = str(item['fastq_ftp']).split(';')
|
|
1749
|
+
for link in ftp_links:
|
|
1750
|
+
link = link.strip()
|
|
1751
|
+
if link:
|
|
1752
|
+
links.append(f"ftp://{link}")
|
|
1753
|
+
return links
|
|
1754
|
+
except Exception as e:
|
|
1755
|
+
logger.debug(f"ENA API failed: {e}")
|
|
1756
|
+
|
|
1757
|
+
return []
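# Illustrative standalone sketch of the ENA filereport call used above; each JSON
# row carries a 'fastq_ftp' field with ';'-separated host paths (the accession is
# a small public run also used in the examples later in this module):
#   import requests
#   r = requests.get(
#       "https://www.ebi.ac.uk/ena/portal/api/filereport",
#       params={"accession": "SRR390728", "result": "read_run",
#               "fields": "fastq_ftp", "format": "json"},
#       timeout=30,
#   )
#   for row in r.json():
#       print(row.get("fastq_ftp", "").split(";"))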
|
|
1758
|
+
|
|
1759
|
+
def _get_sra_metadata(self, accession: str) -> pd.DataFrame:
|
|
1760
|
+
"""获取SRA元数据"""
|
|
1761
|
+
if self.sra_client:
|
|
1762
|
+
return self.sra_client.search_sra(run_accession=accession, detailed=True)
|
|
1763
|
+
return pd.DataFrame()
|
|
1764
|
+
|
|
1765
|
+
def _download_with_fastq_dump(self, config: DatasetConfig) -> Dict[str, Any]:
|
|
1766
|
+
"""使用fastq-dump下载SRA数据"""
|
|
1767
|
+
import subprocess
|
|
1768
|
+
import shutil
|
|
1769
|
+
import time
|
|
1770
|
+
|
|
1771
|
+
dataset_id = config.dataset_id
|
|
1772
|
+
|
|
1773
|
+
# 提取参数
|
|
1774
|
+
output_dir = self.dir_save / "fastq" / dataset_id
|
|
1775
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
1776
|
+
|
|
1777
|
+
split_files = config.custom_params.get('split_files', True)
|
|
1778
|
+
gzip_output = config.custom_params.get('gzip_output', True)
|
|
1779
|
+
use_prefetch = config.custom_params.get('use_prefetch', True)
|
|
1780
|
+
max_retries = config.custom_params.get('max_retries', 2)
|
|
1781
|
+
threads = config.custom_params.get('threads', 4)
|
|
1782
|
+
|
|
1783
|
+
# 查找工具
|
|
1784
|
+
fastq_dump_path = shutil.which("fastq-dump")
|
|
1785
|
+
prefetch_path = shutil.which("prefetch")
|
|
1786
|
+
fasterq_dump_path = shutil.which("fasterq-dump")
|
|
1787
|
+
if not fastq_dump_path and not fasterq_dump_path:
|
|
1788
|
+
return {
|
|
1789
|
+
'success': False,
|
|
1790
|
+
'error': 'Neither fastq-dump nor fasterq-dump found in PATH',
|
|
1791
|
+
'accession': dataset_id,
|
|
1792
|
+
}
|
|
1793
|
+
|
|
1794
|
+
logger.info(f"Downloading {dataset_id} with fastq-dump")
|
|
1795
|
+
logger.info(f" Output dir: {output_dir}")
|
|
1796
|
+
logger.info(f" Split files: {split_files}")
|
|
1797
|
+
logger.info(f" Gzip output: {gzip_output}")
|
|
1798
|
+
|
|
1799
|
+
results = {}
|
|
1800
|
+
|
|
1801
|
+
# Method 1: prefetch + fastq-dump (if available)
|
|
1802
|
+
if use_prefetch and prefetch_path:
|
|
1803
|
+
logger.info("Method 1: Using prefetch + fastq-dump")
|
|
1804
|
+
result = self._run_prefetch_fastq_dump(
|
|
1805
|
+
accession=dataset_id,
|
|
1806
|
+
fastq_dump_path=fastq_dump_path,
|
|
1807
|
+
prefetch_path=prefetch_path,
|
|
1808
|
+
output_dir=output_dir,
|
|
1809
|
+
split_files=split_files,
|
|
1810
|
+
gzip_output=gzip_output,
|
|
1811
|
+
threads=threads,
|
|
1812
|
+
max_retries=max_retries
|
|
1813
|
+
)
|
|
1814
|
+
results['prefetch_method'] = result
|
|
1815
|
+
|
|
1816
|
+
if result.get('success', False):
|
|
1817
|
+
return self._format_fastq_dump_result(dataset_id, output_dir, result, 'prefetch+fastq-dump')
|
|
1818
|
+
|
|
1819
|
+
# Method 2: fastq-dump directly
|
|
1820
|
+
logger.info("Method 2: Using fastq-dump directly")
|
|
1821
|
+
result = self._run_fastq_dump_direct(
|
|
1822
|
+
accession=dataset_id,
|
|
1823
|
+
fastq_dump_path=fastq_dump_path,
|
|
1824
|
+
output_dir=output_dir,
|
|
1825
|
+
split_files=split_files,
|
|
1826
|
+
gzip_output=gzip_output,
|
|
1827
|
+
threads=threads,
|
|
1828
|
+
max_retries=max_retries
|
|
1829
|
+
)
|
|
1830
|
+
results['direct_method'] = result
|
|
1831
|
+
|
|
1832
|
+
if result.get('success', False):
|
|
1833
|
+
return self._format_fastq_dump_result(dataset_id, output_dir, result, 'fastq-dump')
|
|
1834
|
+
|
|
1835
|
+
# Method 3: fasterq-dump (if available)
|
|
1836
|
+
if fasterq_dump_path:
|
|
1837
|
+
logger.info("Method 3: Using fasterq-dump")
|
|
1838
|
+
result = self._run_fasterq_dump(
|
|
1839
|
+
accession=dataset_id,
|
|
1840
|
+
fasterq_dump_path=fasterq_dump_path,
|
|
1841
|
+
output_dir=output_dir,
|
|
1842
|
+
split_files=split_files,
|
|
1843
|
+
gzip_output=gzip_output,
|
|
1844
|
+
threads=threads,
|
|
1845
|
+
max_retries=max_retries
|
|
1846
|
+
)
|
|
1847
|
+
results['fasterq_method'] = result
|
|
1848
|
+
|
|
1849
|
+
if result.get('success', False):
|
|
1850
|
+
return self._format_fastq_dump_result(dataset_id, output_dir, result, 'fasterq-dump')
|
|
1851
|
+
|
|
1852
|
+
# 所有方法都失败
|
|
1853
|
+
logger.error(f"All fastq-dump methods failed for {dataset_id}")
|
|
1854
|
+
return {
|
|
1855
|
+
'success': False,
|
|
1856
|
+
'error': 'All fastq-dump methods failed',
|
|
1857
|
+
'accession': dataset_id,
|
|
1858
|
+
'results': results,
|
|
1859
|
+
'method': 'fastq-dump'
|
|
1860
|
+
}
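# Illustrative sketch of the knobs consumed above through config.custom_params;
# all are optional and shown here with the defaults read at the top of this method:
#   fetcher.fetch_data(
#       "SRR390728", data_type="sra", data_format="fastq",
#       split_files=True,    # one file per read mate
#       gzip_output=True,    # compress the FASTQ output
#       use_prefetch=True,   # try prefetch + fastq-dump first
#       threads=4,           # used by fasterq-dump / threaded builds
#       max_retries=2,
#   )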
|
|
1861
|
+
|
|
1862
|
+
def _run_prefetch_fastq_dump(self, accession, fastq_dump_path, prefetch_path,
|
|
1863
|
+
output_dir, split_files, gzip_output, threads, max_retries):
|
|
1864
|
+
"""使用prefetch下载.sra文件,然后用fastq-dump转换"""
|
|
1865
|
+
|
|
1866
|
+
sra_dir = output_dir / "sra"
|
|
1867
|
+
sra_dir.mkdir(exist_ok=True)
|
|
1868
|
+
# 步骤1: 使用prefetch
|
|
1869
|
+
prefetch_cmd = [
|
|
1870
|
+
prefetch_path,
|
|
1871
|
+
accession,
|
|
1872
|
+
"-O", str(sra_dir),
|
|
1873
|
+
"--progress"
|
|
1874
|
+
]
|
|
1875
|
+
|
|
1876
|
+
try:
|
|
1877
|
+
logger.info(f"Running prefetch: {' '.join(prefetch_cmd)}")
|
|
1878
|
+
|
|
1879
|
+
# 运行prefetch
|
|
1880
|
+
result = subprocess.run(
|
|
1881
|
+
prefetch_cmd,
|
|
1882
|
+
capture_output=True,
|
|
1883
|
+
text=True,
|
|
1884
|
+
timeout=300,  # 5-minute timeout
|
|
1885
|
+
check=False,  # do not raise immediately on a non-zero exit
|
|
1886
|
+
)
|
|
1887
|
+
|
|
1888
|
+
# 详细记录输出
|
|
1889
|
+
logger.debug(f"prefetch return code: {result.returncode}")
|
|
1890
|
+
if result.stdout:
|
|
1891
|
+
logger.debug(f"prefetch stdout (last 500 chars): {result.stdout[-500:]}")
|
|
1892
|
+
if result.stderr:
|
|
1893
|
+
logger.error(f"prefetch stderr: {result.stderr}")
|
|
1894
|
+
|
|
1895
|
+
if result.returncode != 0:
|
|
1896
|
+
error_msg = f"prefetch failed with code {result.returncode}"
|
|
1897
|
+
if result.stderr:
|
|
1898
|
+
error_msg += f": {result.stderr[:200]}"
|
|
1899
|
+
return {'success': False, 'error': error_msg}
|
|
1900
|
+
|
|
1901
|
+
# 查找.sra文件
|
|
1902
|
+
sra_files = list(sra_dir.glob(f"**/{accession}.sra"))
|
|
1903
|
+
if not sra_files:
|
|
1904
|
+
sra_files = list(sra_dir.glob(f"**/*.sra"))
|
|
1905
|
+
|
|
1906
|
+
if not sra_files:
|
|
1907
|
+
# 列出目录内容帮助调试
|
|
1908
|
+
all_files = list(sra_dir.rglob("*"))
|
|
1909
|
+
file_list = [f"{f.name} ({f.stat().st_size} bytes)" for f in all_files if f.is_file()]
|
|
1910
|
+
logger.warning(f"No .sra files found. Directory contents: {file_list}")
|
|
1911
|
+
return {'success': False, 'error': f'No .sra file found. Files: {file_list}'}
|
|
1912
|
+
|
|
1913
|
+
|
|
1914
|
+
sra_file = sra_files[0]
|
|
1915
|
+
logger.info(f"Found .sra file: {sra_file.name} ({sra_file.stat().st_size/1024/1024:.1f} MB)")
|
|
1916
|
+
|
|
1917
|
+
# 步骤2: 使用fastq-dump转换
|
|
1918
|
+
return self._run_fastq_dump_on_file(
|
|
1919
|
+
sra_file=sra_file,
|
|
1920
|
+
fastq_dump_path=fastq_dump_path,
|
|
1921
|
+
output_dir=output_dir,
|
|
1922
|
+
split_files=split_files,
|
|
1923
|
+
gzip_output=gzip_output,
|
|
1924
|
+
threads=threads
|
|
1925
|
+
)
|
|
1926
|
+
|
|
1927
|
+
except subprocess.TimeoutExpired:
|
|
1928
|
+
return {'success': False, 'error': 'prefetch timed out'}
|
|
1929
|
+
except Exception as e:
|
|
1930
|
+
import traceback
|
|
1931
|
+
error_details = traceback.format_exc()[:500]
|
|
1932
|
+
return {'success': False, 'error': f'prefetch error: {type(e).__name__}: {str(e)[:200]}\n{error_details}'}
|
|
1933
|
+
|
|
1934
|
+
|
|
1935
|
+
def _run_fastq_dump_direct(self, accession, fastq_dump_path, output_dir,
|
|
1936
|
+
split_files, gzip_output, threads, max_retries):
|
|
1937
|
+
"""直接使用fastq-dump下载"""
|
|
1938
|
+
# 构建命令
|
|
1939
|
+
cmd = [
|
|
1940
|
+
fastq_dump_path,
|
|
1941
|
+
accession,
|
|
1942
|
+
"--outdir", str(output_dir),
|
|
1943
|
+
"--skip-technical",
|
|
1944
|
+
"--readids",
|
|
1945
|
+
"--dumpbase",
|
|
1946
|
+
"--clip",
|
|
1947
|
+
"--read-filter", "pass",
|
|
1948
|
+
"--origfmt"
|
|
1949
|
+
]
|
|
1950
|
+
|
|
1951
|
+
if split_files:
|
|
1952
|
+
cmd.append("--split-files")
|
|
1953
|
+
|
|
1954
|
+
if gzip_output:
|
|
1955
|
+
cmd.append("--gzip")
|
|
1956
|
+
|
|
1957
|
+
# Add --threads only when this fastq-dump build supports it (classic fastq-dump builds usually do not; fasterq-dump is the threaded tool)
|
|
1958
|
+
if threads > 1:
|
|
1959
|
+
cmd.extend(["--threads", str(threads)])
|
|
1960
|
+
|
|
1961
|
+
try:
|
|
1962
|
+
logger.info(f"Running fastq-dump: {' '.join(cmd[:10])}...") # 只显示前10个参数
|
|
1963
|
+
|
|
1964
|
+
result = subprocess.run(
|
|
1965
|
+
cmd,
|
|
1966
|
+
capture_output=True,
|
|
1967
|
+
text=True,
|
|
1968
|
+
timeout=300,  # 5-minute timeout
|
|
1969
|
+
check=False
|
|
1970
|
+
)
|
|
1971
|
+
|
|
1972
|
+
logger.debug(f"fastq-dump stdout: {result.stdout[-500:] if result.stdout else ''}")
|
|
1973
|
+
logger.debug(f"fastq-dump stderr: {result.stderr[-500:] if result.stderr else ''}")
|
|
1974
|
+
|
|
1975
|
+
# 检查输出文件
|
|
1976
|
+
return self._check_fastq_output(output_dir, accession, split_files, gzip_output)
|
|
1977
|
+
|
|
1978
|
+
except subprocess.TimeoutExpired:
|
|
1979
|
+
return {'success': False, 'error': 'fastq-dump timed out'}
|
|
1980
|
+
except subprocess.CalledProcessError as e:
|
|
1981
|
+
error_msg = e.stderr[:500] if e.stderr else str(e)
|
|
1982
|
+
return {'success': False, 'error': f'fastq-dump failed: {error_msg}'}
|
|
1983
|
+
except Exception as e:
|
|
1984
|
+
return {'success': False, 'error': f'fastq-dump error: {type(e).__name__}: {str(e)[:200]}'}
|
|
1985
|
+
|
|
1986
|
+
def _run_fasterq_dump(self, accession, fasterq_dump_path, output_dir,
|
|
1987
|
+
split_files, gzip_output, threads, max_retries):
|
|
1988
|
+
"""使用fasterq-dump"""
|
|
1989
|
+
cmd = [
|
|
1990
|
+
fasterq_dump_path,
|
|
1991
|
+
accession,
|
|
1992
|
+
"-O", str(output_dir),
|
|
1993
|
+
"-e", str(threads),
|
|
1994
|
+
"-p", # 显示进度
|
|
1995
|
+
"-t", str(output_dir / "temp") # 临时目录
|
|
1996
|
+
]
|
|
1997
|
+
|
|
1998
|
+
if split_files:
|
|
1999
|
+
cmd.append("--split-files")
|
|
2000
|
+
|
|
2001
|
+
try:
|
|
2002
|
+
logger.info(f"Running fasterq-dump: {' '.join(cmd)}")
|
|
2003
|
+
|
|
2004
|
+
result = subprocess.run(
|
|
2005
|
+
cmd,
|
|
2006
|
+
capture_output=True,
|
|
2007
|
+
text=True,
|
|
2008
|
+
timeout=600,  # 10-minute timeout
|
|
2009
|
+
check=True
|
|
2010
|
+
)
|
|
2011
|
+
|
|
2012
|
+
logger.debug(f"fasterq-dump stdout: {result.stdout[-500:] if result.stdout else ''}")
|
|
2013
|
+
logger.debug(f"fasterq-dump stderr: {result.stderr[-500:] if result.stderr else ''}")
|
|
2014
|
+
|
|
2015
|
+
# 如果需要gzip,压缩文件
|
|
2016
|
+
if gzip_output:
|
|
2017
|
+
self._compress_fastq_files(output_dir)
|
|
2018
|
+
|
|
2019
|
+
return self._check_fastq_output(output_dir, accession, split_files, gzip_output)
|
|
2020
|
+
|
|
2021
|
+
except subprocess.TimeoutExpired:
|
|
2022
|
+
return {'success': False, 'error': 'fasterq-dump timed out'}
|
|
2023
|
+
except subprocess.CalledProcessError as e:
|
|
2024
|
+
error_msg = e.stderr[:500] if e.stderr else str(e)
|
|
2025
|
+
return {'success': False, 'error': f'fasterq-dump failed: {error_msg}'}
|
|
2026
|
+
except Exception as e:
|
|
2027
|
+
return {'success': False, 'error': f'fasterq-dump error: {type(e).__name__}: {str(e)[:200]}'}
|
|
2028
|
+
|
|
2029
|
+
def _run_fastq_dump_on_file(self, sra_file, fastq_dump_path, output_dir,
|
|
2030
|
+
split_files, gzip_output, threads):
|
|
2031
|
+
"""对已有的.sra文件运行fastq-dump"""
|
|
2032
|
+
cmd = [
|
|
2033
|
+
fastq_dump_path,
|
|
2034
|
+
str(sra_file),
|
|
2035
|
+
"--outdir", str(output_dir),
|
|
2036
|
+
"--skip-technical",
|
|
2037
|
+
"--readids",
|
|
2038
|
+
"--dumpbase",
|
|
2039
|
+
"--clip",
|
|
2040
|
+
"--read-filter", "pass",
|
|
2041
|
+
"--origfmt"
|
|
2042
|
+
]
|
|
2043
|
+
|
|
2044
|
+
if split_files:
|
|
2045
|
+
cmd.append("--split-files")
|
|
2046
|
+
|
|
2047
|
+
if gzip_output:
|
|
2048
|
+
cmd.append("--gzip")
|
|
2049
|
+
|
|
2050
|
+
if threads > 1:
|
|
2051
|
+
cmd.extend(["--threads", str(threads)])
|
|
2052
|
+
|
|
2053
|
+
try:
|
|
2054
|
+
logger.info(f"Running fastq-dump on .sra file: {' '.join(cmd[:8])}...")
|
|
2055
|
+
|
|
2056
|
+
result = subprocess.run(
|
|
2057
|
+
cmd,
|
|
2058
|
+
capture_output=True,
|
|
2059
|
+
text=True,
|
|
2060
|
+
timeout=300,  # 5-minute timeout (the .sra file is already local)
|
|
2061
|
+
check=True
|
|
2062
|
+
)
|
|
2063
|
+
|
|
2064
|
+
return self._check_fastq_output(output_dir, sra_file.stem, split_files, gzip_output)
|
|
2065
|
+
|
|
2066
|
+
except subprocess.CalledProcessError as e:
|
|
2067
|
+
return {'success': False, 'error': f'fastq-dump conversion failed: {e.stderr[:200] if e.stderr else str(e)}'}
|
|
2068
|
+
except Exception as e:
|
|
2069
|
+
return {'success': False, 'error': f'fastq-dump error: {type(e).__name__}: {str(e)[:200]}'}
|
|
2070
|
+
|
|
2071
|
+
def _check_fastq_output(self, output_dir, accession, split_files, gzip_output):
|
|
2072
|
+
"""检查fastq输出文件"""
|
|
2074
|
+
|
|
2075
|
+
# 查找文件模式
|
|
2076
|
+
if gzip_output:
|
|
2077
|
+
patterns = [f"{accession}*.fastq.gz", f"{accession}*.fq.gz"]
|
|
2078
|
+
else:
|
|
2079
|
+
patterns = [f"{accession}*.fastq", f"{accession}*.fq"]
|
|
2080
|
+
|
|
2081
|
+
files = []
|
|
2082
|
+
for pattern in patterns:
|
|
2083
|
+
files.extend(output_dir.glob(pattern))
|
|
2084
|
+
|
|
2085
|
+
# 过滤空文件
|
|
2086
|
+
files = [str(f) for f in files if f.exists() and f.stat().st_size > 0]
|
|
2087
|
+
|
|
2088
|
+
if files:
|
|
2089
|
+
total_size = sum(Path(f).stat().st_size for f in files)
|
|
2090
|
+
return {
|
|
2091
|
+
'success': True,
|
|
2092
|
+
'files': files,
|
|
2093
|
+
'file_count': len(files),
|
|
2094
|
+
'total_size_bytes': total_size,
|
|
2095
|
+
'total_size_mb': total_size / (1024 * 1024)
|
|
2096
|
+
}
|
|
2097
|
+
else:
|
|
2098
|
+
# 尝试其他命名模式
|
|
2099
|
+
all_fastq_files = list(output_dir.glob("*.fastq*"))
|
|
2100
|
+
if all_fastq_files:
|
|
2101
|
+
files = [str(f) for f in all_fastq_files if f.stat().st_size > 0]
|
|
2102
|
+
if files:
|
|
2103
|
+
total_size = sum(Path(f).stat().st_size for f in files)
|
|
2104
|
+
return {
|
|
2105
|
+
'success': True,
|
|
2106
|
+
'files': files,
|
|
2107
|
+
'file_count': len(files),
|
|
2108
|
+
'total_size_bytes': total_size,
|
|
2109
|
+
'total_size_mb': total_size / (1024 * 1024),
|
|
2110
|
+
'note': 'Files found with different naming pattern'
|
|
2111
|
+
}
|
|
2112
|
+
|
|
2113
|
+
return {'success': False, 'error': 'No output files found'}
|
|
2114
|
+
|
|
2115
|
+
def _compress_fastq_files(self, output_dir):
|
|
2116
|
+
"""压缩fastq文件"""
|
|
2117
|
+
import gzip
|
|
2118
|
+
import shutil
|
|
2119
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
2120
|
+
|
|
2121
|
+
fastq_files = list(output_dir.glob("*.fastq"))
|
|
2122
|
+
|
|
2123
|
+
if not fastq_files:
|
|
2124
|
+
return
|
|
2125
|
+
|
|
2126
|
+
logger.info(f"Compressing {len(fastq_files)} fastq files...")
|
|
2127
|
+
|
|
2128
|
+
def compress_file(fastq_path):
|
|
2129
|
+
gzip_path = fastq_path.with_suffix('.fastq.gz')
|
|
2130
|
+
|
|
2131
|
+
try:
|
|
2132
|
+
with open(fastq_path, 'rb') as f_in:
|
|
2133
|
+
with gzip.open(gzip_path, 'wb') as f_out:
|
|
2134
|
+
shutil.copyfileobj(f_in, f_out)
|
|
2135
|
+
|
|
2136
|
+
# 删除原始文件
|
|
2137
|
+
fastq_path.unlink()
|
|
2138
|
+
return True
|
|
2139
|
+
except Exception as e:
|
|
2140
|
+
logger.warning(f"Failed to compress {fastq_path.name}: {e}")
|
|
2141
|
+
return False
|
|
2142
|
+
|
|
2143
|
+
# 并行压缩
|
|
2144
|
+
with ThreadPoolExecutor(max_workers=4) as executor:
|
|
2145
|
+
results = list(executor.map(compress_file, fastq_files))
|
|
2146
|
+
|
|
2147
|
+
success_count = sum(results)
|
|
2148
|
+
logger.info(f"Compression complete: {success_count}/{len(fastq_files)} successful")
|
|
2149
|
+
|
|
2150
|
+
def _format_fastq_dump_result(self, accession, output_dir, result, method):
|
|
2151
|
+
"""格式化fastq-dump结果"""
|
|
2152
|
+
formatted = {
|
|
2153
|
+
'accession': accession,
|
|
2154
|
+
'success': result['success'],
|
|
2155
|
+
'files': result.get('files', []),
|
|
2156
|
+
'file_count': result.get('file_count', 0),
|
|
2157
|
+
'total_size_mb': result.get('total_size_mb', 0),
|
|
2158
|
+
'output_dir': str(output_dir),
|
|
2159
|
+
'method': method,
|
|
2160
|
+
'download_method': 'fastq-dump'
|
|
2161
|
+
}
|
|
2162
|
+
|
|
2163
|
+
if 'note' in result:
|
|
2164
|
+
formatted['note'] = result['note']
|
|
2165
|
+
|
|
2166
|
+
return formatted
|
|
2167
|
+
|
|
2168
|
+
|
|
2169
|
+
def _download_file(self, url: str, output_dir: Path, config: DatasetConfig) -> Optional[Path]:
|
|
2170
|
+
"""下载单个文件"""
|
|
2171
|
+
import requests
|
|
2172
|
+
|
|
2173
|
+
filename = url.split('/')[-1].split('?')[0]
|
|
2174
|
+
filepath = output_dir / filename
|
|
2175
|
+
|
|
2176
|
+
# 检查文件是否已存在
|
|
2177
|
+
if filepath.exists() and not config.force_download:
|
|
2178
|
+
file_size = filepath.stat().st_size
|
|
2179
|
+
if file_size > 1000:  # size looks plausible (> 1 KB)
|
|
2180
|
+
logger.debug(f"File already exists: {filepath}")
|
|
2181
|
+
return filepath
|
|
2182
|
+
|
|
2183
|
+
try:
|
|
2184
|
+
if url.startswith('ftp://'):
|
|
2185
|
+
return self._download_ftp_file(url, filepath)
|
|
2186
|
+
else:
|
|
2187
|
+
return self._download_http_file(url, filepath)
|
|
2188
|
+
except Exception as e:
|
|
2189
|
+
logger.error(f"Failed to download {url}: {e}")
|
|
2190
|
+
return None
|
|
2191
|
+
|
|
2192
|
+
def _download_http_file(self, url: str, filepath: Path) -> Path:
|
|
2193
|
+
"""下载HTTP文件"""
|
|
2194
|
+
import requests
|
|
2195
|
+
|
|
2196
|
+
headers = {
|
|
2197
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
|
2198
|
+
}
|
|
2199
|
+
|
|
2200
|
+
response = requests.get(url, stream=True, headers=headers, timeout=60)
|
|
2201
|
+
response.raise_for_status()
|
|
2202
|
+
|
|
2203
|
+
total_size = int(response.headers.get('content-length', 0))
|
|
2204
|
+
|
|
2205
|
+
with open(filepath, 'wb') as f:
|
|
2206
|
+
downloaded = 0
|
|
2207
|
+
for chunk in response.iter_content(chunk_size=8192):
|
|
2208
|
+
if chunk:
|
|
2209
|
+
f.write(chunk)
|
|
2210
|
+
downloaded += len(chunk)
|
|
2211
|
+
|
|
2212
|
+
logger.info(f"Downloaded: {filepath.name} ({downloaded/1024/1024:.1f} MB)")
|
|
2213
|
+
return filepath
|
|
2214
|
+
|
|
2215
|
+
def _process_tcga(self, config: DatasetConfig) -> Any:
|
|
2216
|
+
"""处理TCGA数据"""
|
|
2217
|
+
# 实现TCGA数据下载逻辑
|
|
2218
|
+
raise NotImplementedError("TCGA data fetching not yet implemented")
|
|
2219
|
+
|
|
2220
|
+
def _process_encode(self, config: DatasetConfig) -> Any:
|
|
2221
|
+
"""处理ENCODE数据"""
|
|
2222
|
+
# 实现ENCODE数据下载逻辑
|
|
2223
|
+
raise NotImplementedError("ENCODE data fetching not yet implemented")
|
|
2224
|
+
|
|
2225
|
+
def _process_array_express(self, config: DatasetConfig) -> Any:
|
|
2226
|
+
"""处理ArrayExpress数据"""
|
|
2227
|
+
# 实现ArrayExpress数据下载逻辑
|
|
2228
|
+
raise NotImplementedError("ArrayExpress data fetching not yet implemented")
|
|
2229
|
+
|
|
2230
|
+
def _process_single_cell(self, config: DatasetConfig) -> Any:
|
|
2231
|
+
"""处理单细胞数据"""
|
|
2232
|
+
# 实现单细胞数据下载逻辑
|
|
2233
|
+
raise NotImplementedError("Single-cell data fetching not yet implemented")
|
|
2234
|
+
|
|
2235
|
+
def _process_custom(self, config: DatasetConfig) -> Any:
|
|
2236
|
+
"""处理自定义数据"""
|
|
2237
|
+
# 用户自定义数据处理
|
|
2238
|
+
custom_func = config.custom_params.get('custom_function')
|
|
2239
|
+
if custom_func and callable(custom_func):
|
|
2240
|
+
return custom_func(config.dataset_id, **config.custom_params)
|
|
2241
|
+
|
|
2242
|
+
raise ValueError("No custom function provided for custom data source")
|
|
2243
|
+
|
|
2244
|
+
def _post_process(self, data: Any, config: DatasetConfig) -> Any:
|
|
2245
|
+
"""数据后处理"""
|
|
2246
|
+
if isinstance(data, pd.DataFrame):
|
|
2247
|
+
# 自动归一化
|
|
2248
|
+
if self.config['auto_normalize'] and config.data_format == DataFormat.EXPRESSION:
|
|
2249
|
+
data = self._auto_normalize(data)
|
|
2250
|
+
|
|
2251
|
+
# 基因ID转换
|
|
2252
|
+
if self.config['gene_id_conversion']:
|
|
2253
|
+
data = self._convert_gene_ids(data)
|
|
2254
|
+
|
|
2255
|
+
# 质量控制
|
|
2256
|
+
if self.config['quality_control']:
|
|
2257
|
+
data = self._quality_control(data)
|
|
2258
|
+
|
|
2259
|
+
return data
|
|
2260
|
+
|
|
2261
|
+
def _auto_normalize(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
2262
|
+
"""自动归一化表达数据"""
|
|
2263
|
+
numeric_cols = df.select_dtypes(include=[np.number]).columns
|
|
2264
|
+
|
|
2265
|
+
if len(numeric_cols) == 0:
|
|
2266
|
+
return df
|
|
2267
|
+
|
|
2268
|
+
# 检测数据类型
|
|
2269
|
+
if self._is_raw_counts(df[numeric_cols]):
|
|
2270
|
+
logger.info("Detected raw counts, normalizing with TMM")
|
|
2271
|
+
try:
|
|
2272
|
+
return self._normalize_counts(df, numeric_cols)
|
|
2273
|
+
except Exception as e:
|
|
2274
|
+
logger.warning(f"TMM normalization failed: {e}")
|
|
2275
|
+
|
|
2276
|
+
return df
|
|
2277
|
+
|
|
2278
|
+
def _is_raw_counts(self, df_numeric: pd.DataFrame) -> bool:
|
|
2279
|
+
"""检测是否为原始计数数据"""
|
|
2280
|
+
# 检查是否为整数
|
|
2281
|
+
if not df_numeric.applymap(lambda x: isinstance(x, (int, np.integer))).all().all():
|
|
2282
|
+
return False
|
|
2283
|
+
|
|
2284
|
+
# 检查数值范围
|
|
2285
|
+
max_val = df_numeric.max().max()
|
|
2286
|
+
min_val = df_numeric.min().min()
|
|
2287
|
+
|
|
2288
|
+
# 原始计数通常是正整数,且最大值较大
|
|
2289
|
+
return min_val >= 0 and max_val > 1000
|
|
2290
|
+
|
|
2291
|
+
def _normalize_counts(self, df: pd.DataFrame, numeric_cols: pd.Index) -> pd.DataFrame:
|
|
2292
|
+
"""使用TMM方法归一化计数数据"""
|
|
2294
|
+
import numpy as np
|
|
2295
|
+
|
|
2296
|
+
df_numeric = df[numeric_cols]
|
|
2297
|
+
|
|
2298
|
+
# 简单的TMM-like归一化
|
|
2299
|
+
# 计算几何均值作为参考样本
|
|
2300
|
+
log_counts = np.log1p(df_numeric.values)
|
|
2301
|
+
ref_sample = np.exp(np.mean(log_counts, axis=1))
|
|
2302
|
+
|
|
2303
|
+
# 计算缩放因子
|
|
2304
|
+
scaling_factors = []
|
|
2305
|
+
for col in df_numeric.columns:
|
|
2306
|
+
sample_counts = df_numeric[col].values
|
|
2307
|
+
log_ratio = np.log1p(sample_counts) - np.log1p(ref_sample)
|
|
2308
|
+
m_value = log_ratio - np.median(log_ratio)
|
|
2309
|
+
a_value = 0.5 * (np.log1p(sample_counts) + np.log1p(ref_sample))
|
|
2310
|
+
|
|
2311
|
+
# 修剪极端值
|
|
2312
|
+
trim_frac = 0.3
|
|
2313
|
+
n = len(m_value)
|
|
2314
|
+
trim_n = int(n * trim_frac)
|
|
2315
|
+
indices = np.argsort(a_value)
|
|
2316
|
+
keep_indices = indices[trim_n:n-trim_n]
|
|
2317
|
+
|
|
2318
|
+
# 计算缩放因子
|
|
2319
|
+
scaling_factor = np.exp(np.mean(m_value[keep_indices]))
|
|
2320
|
+
scaling_factors.append(scaling_factor)
|
|
2321
|
+
|
|
2322
|
+
# 应用缩放因子
|
|
2323
|
+
scaling_factors = np.array(scaling_factors)
|
|
2324
|
+
df_normalized = df.copy()
|
|
2325
|
+
for i, col in enumerate(numeric_cols):
|
|
2326
|
+
df_normalized[col] = df_numeric[col] / scaling_factors[i]
|
|
2327
|
+
|
|
2328
|
+
return df_normalized
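# Note: the procedure above is a lightweight TMM-like heuristic (pseudo-reference
# from per-gene mean log1p counts, trimmed mean of re-centred log-ratios), not
# edgeR's exact TMM. Minimal usage sketch on hypothetical counts:
#   import pandas as pd
#   toy = pd.DataFrame({"s1": [10, 100, 1000], "s2": [20, 220, 1900]})
#   norm = self._normalize_counts(toy, toy.columns)
#   # norm has the same shape as toy, with each column divided by its scaling factor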
|
|
2329
|
+
|
|
2330
|
+
def _convert_gene_ids(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
2331
|
+
"""转换基因ID"""
|
|
2332
|
+
if not MYGENE_AVAILABLE or self.mygene_client is None:
|
|
2333
|
+
return df
|
|
2334
|
+
|
|
2335
|
+
# 检测可能的基因ID列
|
|
2336
|
+
gene_id_cols = [col for col in df.columns
|
|
2337
|
+
if col.lower() in ['gene_id', 'gene_symbol', 'entrez', 'ensembl']]
|
|
2338
|
+
|
|
2339
|
+
if not gene_id_cols:
|
|
2340
|
+
return df
|
|
2341
|
+
|
|
2342
|
+
# 使用mygene.info进行ID转换
|
|
2343
|
+
try:
|
|
2344
|
+
gene_ids = df[gene_id_cols[0]].dropna().tolist()
|
|
2345
|
+
results = self.mygene_client.querymany(gene_ids, scopes='symbol', fields='symbol,name')
|
|
2346
|
+
|
|
2347
|
+
# 创建映射
|
|
2348
|
+
id_map = {}
|
|
2349
|
+
for result in results:
|
|
2350
|
+
if 'query' in result and 'symbol' in result:
|
|
2351
|
+
id_map[result['query']] = result['symbol']
|
|
2352
|
+
|
|
2353
|
+
# 应用映射
|
|
2354
|
+
df = df.copy()
|
|
2355
|
+
df[gene_id_cols[0]] = df[gene_id_cols[0]].map(id_map).fillna(df[gene_id_cols[0]])
|
|
2356
|
+
|
|
2357
|
+
except Exception as e:
|
|
2358
|
+
logger.warning(f"Gene ID conversion failed: {e}")
|
|
2359
|
+
|
|
2360
|
+
return df
|
|
2361
|
+
|
|
2362
|
+
def _quality_control(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
2363
|
+
"""质量控制"""
|
|
2364
|
+
numeric_cols = df.select_dtypes(include=[np.number]).columns
|
|
2365
|
+
|
|
2366
|
+
if len(numeric_cols) == 0:
|
|
2367
|
+
return df
|
|
2368
|
+
|
|
2369
|
+
# 移除全为零的行
|
|
2370
|
+
df_numeric = df[numeric_cols]
|
|
2371
|
+
non_zero_rows = (df_numeric != 0).any(axis=1)
|
|
2372
|
+
|
|
2373
|
+
if not non_zero_rows.all():
|
|
2374
|
+
logger.info(f"Removing {sum(~non_zero_rows)} rows with all zeros")
|
|
2375
|
+
df = df[non_zero_rows].copy()
|
|
2376
|
+
|
|
2377
|
+
# 移除低表达基因
|
|
2378
|
+
mean_expression = df_numeric.mean(axis=1)
|
|
2379
|
+
if len(mean_expression) > 1000:  # only filter when the dataset is reasonably large
|
|
2380
|
+
threshold = mean_expression.quantile(0.1)
|
|
2381
|
+
keep_rows = mean_expression >= threshold
|
|
2382
|
+
|
|
2383
|
+
if not keep_rows.all():
|
|
2384
|
+
logger.info(f"Removing {sum(~keep_rows)} low-expression rows")
|
|
2385
|
+
df = df[keep_rows].copy()
|
|
2386
|
+
|
|
2387
|
+
return df
|
|
2388
|
+
|
|
2389
|
+
def _filter_samples(self, data: Any, samples: List[str]) -> Any:
|
|
2390
|
+
"""过滤样本"""
|
|
2391
|
+
if isinstance(data, pd.DataFrame):
|
|
2392
|
+
# 尝试按列名过滤
|
|
2393
|
+
if any(sample in data.columns for sample in samples):
|
|
2394
|
+
return data[samples]
|
|
2395
|
+
# 尝试按索引过滤
|
|
2396
|
+
elif any(sample in data.index for sample in samples):
|
|
2397
|
+
return data.loc[data.index.intersection(samples)]
|
|
2398
|
+
|
|
2399
|
+
return data
|
|
2400
|
+
|
|
2401
|
+
def _record_download_history(self, dataset_ids: List[str]):
|
|
2402
|
+
"""记录下载历史"""
|
|
2403
|
+
history_file = self.dir_save / "download_history.json"
|
|
2404
|
+
|
|
2405
|
+
history = []
|
|
2406
|
+
if history_file.exists():
|
|
2407
|
+
try:
|
|
2408
|
+
with open(history_file, 'r') as f:
|
|
2409
|
+
history = json.load(f)
|
|
2410
|
+
if isinstance(history, dict):
|
|
2411
|
+
history = [history]
|
|
2412
|
+
elif not isinstance(history, list):
|
|
2413
|
+
history = []
|
|
2414
|
+
except:
|
|
2415
|
+
history = []
|
|
2416
|
+
|
|
2417
|
+
for dataset_id in dataset_ids:
|
|
2418
|
+
history.append({
|
|
2419
|
+
'dataset_id': dataset_id,
|
|
2420
|
+
'timestamp': datetime.now().isoformat(),
|
|
2421
|
+
'cache_dir': str(self.dir_save)
|
|
2422
|
+
})
|
|
2423
|
+
|
|
2424
|
+
# 只保留最近100条记录
|
|
2425
|
+
history = history[-100:]
|
|
2426
|
+
|
|
2427
|
+
with open(history_file, 'w') as f:
|
|
2428
|
+
json.dump(history, f, indent=2)
|
|
2429
|
+
|
|
2430
|
+
def _format_exception(self, e: Exception) -> str:
|
|
2431
|
+
"""格式化异常信息"""
|
|
2432
|
+
import traceback
|
|
2433
|
+
return traceback.format_exc()
|
|
2434
|
+
|
|
2435
|
+
# 公共API方法
|
|
2436
|
+
def list_datasets(self,
|
|
2437
|
+
data_type: Optional[str] = None,
|
|
2438
|
+
search_query: Optional[str] = None,
|
|
2439
|
+
organism: Optional[str] = None,
|
|
2440
|
+
limit: int = 50) -> pd.DataFrame:
|
|
2441
|
+
"""列出或搜索数据集"""
|
|
2442
|
+
if search_query:
|
|
2443
|
+
return self._search_datasets(search_query, data_type, organism, limit)
|
|
2444
|
+
|
|
2445
|
+
# 列出缓存的数据集
|
|
2446
|
+
return self.cache_list(data_type)
|
|
2447
|
+
|
|
2448
|
+
def _search_datasets(self,
|
|
2449
|
+
query: str,
|
|
2450
|
+
data_type: Optional[str],
|
|
2451
|
+
organism: Optional[str],
|
|
2452
|
+
limit: int) -> pd.DataFrame:
|
|
2453
|
+
"""搜索数据集"""
|
|
2454
|
+
import requests
|
|
2455
|
+
|
|
2456
|
+
# 根据数据类型选择API
|
|
2457
|
+
if data_type == 'geo' or data_type is None:
|
|
2458
|
+
return self._search_geo(query, organism, limit)
|
|
2459
|
+
elif data_type == 'sra':
|
|
2460
|
+
return self._search_sra(query, limit)
|
|
2461
|
+
else:
|
|
2462
|
+
logger.warning(f"Search not supported for data type: {data_type}")
|
|
2463
|
+
return pd.DataFrame()
|
|
2464
|
+
|
|
2465
|
+
def _search_geo(self, query: str, organism: Optional[str], limit: int) -> pd.DataFrame:
|
|
2466
|
+
"""搜索GEO数据集"""
|
|
2467
|
+
if not REQUESTS_AVAILABLE:
|
|
2468
|
+
return pd.DataFrame()
|
|
2469
|
+
|
|
2470
|
+
try:
|
|
2471
|
+
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
|
2472
|
+
|
|
2473
|
+
search_term = query
|
|
2474
|
+
if organism:
|
|
2475
|
+
search_term += f" AND {organism}[Organism]"
|
|
2476
|
+
|
|
2477
|
+
# 搜索
|
|
2478
|
+
search_params = {
|
|
2479
|
+
'db': 'gds',
|
|
2480
|
+
'term': search_term,
|
|
2481
|
+
'retmax': limit,
|
|
2482
|
+
'retmode': 'json'
|
|
2483
|
+
}
|
|
2484
|
+
|
|
2485
|
+
response = requests.get(base_url + "esearch.fcgi", params=search_params)
|
|
2486
|
+
response.raise_for_status()
|
|
2487
|
+
|
|
2488
|
+
result = response.json()
|
|
2489
|
+
ids = result.get('esearchresult', {}).get('idlist', [])
|
|
2490
|
+
|
|
2491
|
+
if not ids:
|
|
2492
|
+
return pd.DataFrame()
|
|
2493
|
+
|
|
2494
|
+
# 获取详细信息
|
|
2495
|
+
summary_params = {
|
|
2496
|
+
'db': 'gds',
|
|
2497
|
+
'id': ','.join(ids),
|
|
2498
|
+
'retmode': 'json'
|
|
2499
|
+
}
|
|
2500
|
+
|
|
2501
|
+
summary_response = requests.get(base_url + "esummary.fcgi", params=summary_params)
|
|
2502
|
+
summary_result = summary_response.json()
|
|
2503
|
+
|
|
2504
|
+
datasets = []
|
|
2505
|
+
for uid in ids:
|
|
2506
|
+
info = summary_result.get('result', {}).get(uid, {})
|
|
2507
|
+
datasets.append({
|
|
2508
|
+
'accession': info.get('accession', ''),
|
|
2509
|
+
'title': info.get('title', ''),
|
|
2510
|
+
'summary': info.get('summary', '')[:200] + '...' if info.get('summary') else '',
|
|
2511
|
+
'organism': info.get('organism', ''),
|
|
2512
|
+
'platform': info.get('platform', ''),
|
|
2513
|
+
'samples': info.get('samples', 0),
|
|
2514
|
+
'type': info.get('entrytype', ''),
|
|
2515
|
+
'gdstype': info.get('gdstype', ''),
|
|
2516
|
+
'pubmed': info.get('pubmed', ''),
|
|
2517
|
+
})
|
|
2518
|
+
|
|
2519
|
+
return pd.DataFrame(datasets)
|
|
2520
|
+
|
|
2521
|
+
except Exception as e:
|
|
2522
|
+
logger.error(f"Failed to search GEO datasets: {e}")
|
|
2523
|
+
return pd.DataFrame()
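# Illustrative standalone sketch of the two-step E-utilities flow used above
# (esearch for GDS UIDs, esummary for details); the query and organism are examples:
#   import requests
#   base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
#   ids = requests.get(base + "esearch.fcgi", params={
#       "db": "gds", "term": "microglia AND Mus musculus[Organism]",
#       "retmax": 5, "retmode": "json"}).json()["esearchresult"]["idlist"]
#   summaries = requests.get(base + "esummary.fcgi", params={
#       "db": "gds", "id": ",".join(ids), "retmode": "json"}).json()["result"]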
|
|
2524
|
+
|
|
2525
|
+
def _search_sra(self, query: str, limit: int) -> pd.DataFrame:
|
|
2526
|
+
"""搜索SRA数据集"""
|
|
2527
|
+
if not self.sra_client:
|
|
2528
|
+
return pd.DataFrame()
|
|
2529
|
+
|
|
2530
|
+
try:
|
|
2531
|
+
df = self.sra_client.search_sra(query, size=limit)
|
|
2532
|
+
return df
|
|
2533
|
+
except Exception as e:
|
|
2534
|
+
logger.error(f"Failed to search SRA datasets: {e}")
|
|
2535
|
+
return pd.DataFrame()
|
|
2536
|
+
|
|
2537
|
+
def cache_list(self, data_type: Optional[str] = None) -> pd.DataFrame:
|
|
2538
|
+
"""列出缓存的数据集"""
|
|
2539
|
+
cache_files = list(self.dir_save.rglob("*.pkl"))
|
|
2540
|
+
|
|
2541
|
+
datasets = []
|
|
2542
|
+
for file_path in cache_files:
|
|
2543
|
+
try:
|
|
2544
|
+
rel_path = file_path.relative_to(self.dir_save)
|
|
2545
|
+
parts = rel_path.parts
|
|
2546
|
+
|
|
2547
|
+
if len(parts) >= 2:
|
|
2548
|
+
ds_type = parts[0]
|
|
2549
|
+
|
|
2550
|
+
if data_type and ds_type != data_type:
|
|
2551
|
+
continue
|
|
2552
|
+
|
|
2553
|
+
# 尝试从缓存元数据获取信息
|
|
2554
|
+
cache_key = file_path.stem
|
|
2555
|
+
metadata = self.cache.metadata.get(cache_key, {})
|
|
2556
|
+
|
|
2557
|
+
datasets.append({
|
|
2558
|
+
'dataset_id': metadata.get('dataset_id', 'Unknown'),
|
|
2559
|
+
'data_type': ds_type,
|
|
2560
|
+
'data_format': metadata.get('data_format', 'Unknown'),
|
|
2561
|
+
'file_path': str(file_path),
|
|
2562
|
+
'size_mb': file_path.stat().st_size / (1024 * 1024),
|
|
2563
|
+
'created': metadata.get('created', 'Unknown'),
|
|
2564
|
+
'last_accessed': metadata.get('last_accessed', 'Unknown'),
|
|
2565
|
+
})
|
|
2566
|
+
except:
|
|
2567
|
+
continue
|
|
2568
|
+
|
|
2569
|
+
if datasets:
|
|
2570
|
+
df = pd.DataFrame(datasets)
|
|
2571
|
+
return df.sort_values('last_accessed', ascending=False)
|
|
2572
|
+
|
|
2573
|
+
return pd.DataFrame()
|
|
2574
|
+
|
|
2575
|
+
def batch_fetch(self,
|
|
2576
|
+
configs: List[Dict[str, Any]],
|
|
2577
|
+
max_workers: int = 4,
|
|
2578
|
+
progress_bar: bool = True) -> Dict[str, Any]:
|
|
2579
|
+
"""批量获取数据"""
|
|
2580
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
2581
|
+
|
|
2582
|
+
results = {}
|
|
2583
|
+
|
|
2584
|
+
def fetch_task(config_dict: Dict) -> Tuple[str, Any]:
|
|
2585
|
+
"""单个获取任务"""
|
|
2586
|
+
try:
|
|
2587
|
+
dataset_id = config_dict.get('dataset_id', config_dict.get('id'))
|
|
2588
|
+
if not dataset_id:
|
|
2589
|
+
return 'unknown', {'error': 'No dataset_id provided'}
|
|
2590
|
+
|
|
2591
|
+
# 创建配置
|
|
2592
|
+
config = DatasetConfig.from_accession(dataset_id, **config_dict)
|
|
2593
|
+
|
|
2594
|
+
# 获取数据
|
|
2595
|
+
data = self._fetch_with_config(config)
|
|
2596
|
+
return dataset_id, data
|
|
2597
|
+
|
|
2598
|
+
except Exception as e:
|
|
2599
|
+
dataset_id = config_dict.get('dataset_id', config_dict.get('id', 'unknown'))
|
|
2600
|
+
return dataset_id, {'error': str(e)}
|
|
2601
|
+
|
|
2602
|
+
# 使用进度条
|
|
2603
|
+
if progress_bar:
|
|
2604
|
+
configs_iter = tqdm(configs, desc="Batch fetching")
|
|
2605
|
+
else:
|
|
2606
|
+
configs_iter = configs
|
|
2607
|
+
|
|
2608
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
2609
|
+
future_to_config = {
|
|
2610
|
+
executor.submit(fetch_task, config): config
|
|
2611
|
+
for config in configs_iter
|
|
2612
|
+
}
|
|
2613
|
+
|
|
2614
|
+
for future in as_completed(future_to_config):
|
|
2615
|
+
dataset_id, result = future.result()
|
|
2616
|
+
results[dataset_id] = result
|
|
2617
|
+
|
|
2618
|
+
return results
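# Illustrative sketch of the config dictionaries consumed above: each needs a
# 'dataset_id' (or 'id'); the remaining keys are forwarded to
# DatasetConfig.from_accession. The keys mirror the batch example near the end of
# this module; the accessions are small public runs used there:
#   results = fetcher.batch_fetch([
#       {"dataset_id": "SRR390728", "type": "sra", "format": "fastq", "split_files": True},
#       {"dataset_id": "SRR3473776", "type": "sra", "format": "fastq", "split_files": False},
#   ], max_workers=2)
#   # results maps each accession to its data, or to {"error": ...} on failure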
|
|
2619
|
+
|
|
2620
|
+
def get_dataset_info(self, dataset_id: str) -> Dict[str, Any]:
|
|
2621
|
+
"""获取数据集信息"""
|
|
2622
|
+
# 自动推断数据类型
|
|
2623
|
+
data_type = self._infer_data_type(dataset_id)
|
|
2624
|
+
|
|
2625
|
+
info = {
|
|
2626
|
+
'dataset_id': dataset_id,
|
|
2627
|
+
'inferred_type': data_type,
|
|
2628
|
+
'data_source': DataSource.from_accession(dataset_id).value,
|
|
2629
|
+
'cache_status': self._check_cache_status(dataset_id),
|
|
2630
|
+
'available_formats': self._get_available_formats(dataset_id, data_type),
|
|
2631
|
+
}
|
|
2632
|
+
|
|
2633
|
+
# 尝试获取元数据
|
|
2634
|
+
try:
|
|
2635
|
+
if data_type == 'geo':
|
|
2636
|
+
info['metadata'] = self._get_geo_info(dataset_id)
|
|
2637
|
+
elif data_type == 'sra':
|
|
2638
|
+
info['metadata'] = self._get_sra_info(dataset_id)
|
|
2639
|
+
except Exception as e:
|
|
2640
|
+
info['metadata_error'] = str(e)
|
|
2641
|
+
|
|
2642
|
+
return info
|
|
2643
|
+
|
|
2644
|
+
def _check_cache_status(self, dataset_id: str) -> Dict[str, Any]:
|
|
2645
|
+
"""检查缓存状态"""
|
|
2646
|
+
cache_files = list(self.dir_save.rglob(f"*{dataset_id}*.pkl"))
|
|
2647
|
+
|
|
2648
|
+
status = {
|
|
2649
|
+
'cached': len(cache_files) > 0,
|
|
2650
|
+
'files': [],
|
|
2651
|
+
'total_size_mb': 0
|
|
2652
|
+
}
|
|
2653
|
+
|
|
2654
|
+
for file_path in cache_files:
|
|
2655
|
+
status['files'].append({
|
|
2656
|
+
'path': str(file_path),
|
|
2657
|
+
'size_mb': file_path.stat().st_size / (1024 * 1024),
|
|
2658
|
+
'modified': datetime.fromtimestamp(file_path.stat().st_mtime)
|
|
2659
|
+
})
|
|
2660
|
+
status['total_size_mb'] += file_path.stat().st_size / (1024 * 1024)
|
|
2661
|
+
|
|
2662
|
+
return status
|
|
2663
|
+
|
|
2664
|
+
def _get_available_formats(self, dataset_id: str, data_type: str) -> List[str]:
|
|
2665
|
+
"""获取可用数据格式"""
|
|
2666
|
+
if data_type == 'geo':
|
|
2667
|
+
return ['expression', 'metadata', 'probe']
|
|
2668
|
+
elif data_type == 'sra':
|
|
2669
|
+
return ['metadata', 'fastq']
|
|
2670
|
+
elif data_type == 'tcga':
|
|
2671
|
+
return ['expression', 'clinical', 'mutations']
|
|
2672
|
+
else:
|
|
2673
|
+
return ['metadata']
|
|
2674
|
+
|
|
2675
|
+
def _get_geo_info(self, dataset_id: str) -> Dict[str, Any]:
|
|
2676
|
+
"""获取GEO数据集信息"""
|
|
2677
|
+
if not REQUESTS_AVAILABLE:
|
|
2678
|
+
return {}
|
|
2679
|
+
|
|
2680
|
+
try:
|
|
2681
|
+
url = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
|
|
2682
|
+
params = {'acc': dataset_id, 'targ': 'self', 'form': 'xml', 'view': 'quick'}
|
|
2683
|
+
|
|
2684
|
+
response = requests.get(url, params=params, timeout=10)
|
|
2685
|
+
if response.ok:
|
|
2686
|
+
# 解析HTML获取基本信息
|
|
2687
|
+
import re
|
|
2688
|
+
html = response.text
|
|
2689
|
+
|
|
2690
|
+
info = {}
|
|
2691
|
+
|
|
2692
|
+
# 提取标题
|
|
2693
|
+
title_match = re.search(r'<title>(.*?)</title>', html)
|
|
2694
|
+
if title_match:
|
|
2695
|
+
info['title'] = title_match.group(1)
|
|
2696
|
+
|
|
2697
|
+
# 提取样本数
|
|
2698
|
+
samples_match = re.search(r'Samples?:\s*(\d+)', html)
|
|
2699
|
+
if samples_match:
|
|
2700
|
+
info['samples'] = int(samples_match.group(1))
|
|
2701
|
+
|
|
2702
|
+
# 提取平台
|
|
2703
|
+
platform_match = re.search(r'Platform.*?GPL\d+', html)
|
|
2704
|
+
if platform_match:
|
|
2705
|
+
info['platform'] = platform_match.group(0)
|
|
2706
|
+
|
|
2707
|
+
return info
|
|
2708
|
+
except Exception as e:
|
|
2709
|
+
logger.debug(f"Failed to get GEO info: {e}")
|
|
2710
|
+
|
|
2711
|
+
return {}
|
|
2712
|
+
|
|
2713
|
+
def _get_sra_info(self, dataset_id: str) -> Dict[str, Any]:
|
|
2714
|
+
"""获取SRA数据集信息"""
|
|
2715
|
+
if not self.sra_client:
|
|
2716
|
+
return {}
|
|
2717
|
+
|
|
2718
|
+
try:
|
|
2719
|
+
df = self.sra_client.search_sra(run_accession=dataset_id, detailed=False)
|
|
2720
|
+
if not df.empty:
|
|
2721
|
+
return df.iloc[0].to_dict()
|
|
2722
|
+
except Exception as e:
|
|
2723
|
+
logger.debug(f"Failed to get SRA info: {e}")
|
|
2724
|
+
|
|
2725
|
+
return {}
|
|
2726
|
+
|
|
2727
|
+
def clear_cache(self,
|
|
2728
|
+
data_type: Optional[str] = None,
|
|
2729
|
+
older_than_days: Optional[int] = None,
|
|
2730
|
+
confirm: bool = False):
|
|
2731
|
+
"""清理缓存"""
|
|
2732
|
+
if not confirm:
|
|
2733
|
+
logger.warning("Cache clearance requires confirmation. Use confirm=True")
|
|
2734
|
+
return
|
|
2735
|
+
|
|
2736
|
+
self.cache.clear_cache(data_type, older_than_days)
|
|
2737
|
+
|
|
2738
|
+
def export_data(self,
|
|
2739
|
+
dataset_id: str,
|
|
2740
|
+
output_format: str = 'csv',
|
|
2741
|
+
output_dir: Optional[str] = None) -> str:
|
|
2742
|
+
"""导出数据"""
|
|
2743
|
+
# 获取数据
|
|
2744
|
+
data = self.fetch_data(dataset_id)
|
|
2745
|
+
|
|
2746
|
+
if isinstance(data, dict) and 'error' in data:
|
|
2747
|
+
raise ValueError(f"Cannot export: {data['error']}")
|
|
2748
|
+
|
|
2749
|
+
if output_dir is None:
|
|
2750
|
+
output_dir = self.dir_save / "exports"
|
|
2751
|
+
else:
|
|
2752
|
+
output_dir = Path(output_dir)
|
|
2753
|
+
|
|
2754
|
+
output_dir.mkdir(exist_ok=True)
|
|
2755
|
+
|
|
2756
|
+
if output_format == 'csv':
|
|
2757
|
+
if isinstance(data, pd.DataFrame):
|
|
2758
|
+
output_path = output_dir / f"{dataset_id}.csv"
|
|
2759
|
+
data.to_csv(output_path)
|
|
2760
|
+
return str(output_path)
|
|
2761
|
+
else:
|
|
2762
|
+
raise ValueError("Data is not a DataFrame, cannot export as CSV")
|
|
2763
|
+
|
|
2764
|
+
elif output_format == 'excel':
|
|
2765
|
+
if isinstance(data, pd.DataFrame):
|
|
2766
|
+
output_path = output_dir / f"{dataset_id}.xlsx"
|
|
2767
|
+
data.to_excel(output_path, engine='openpyxl')
|
|
2768
|
+
return str(output_path)
|
|
2769
|
+
else:
|
|
2770
|
+
raise ValueError("Data is not a DataFrame, cannot export as Excel")
|
|
2771
|
+
|
|
2772
|
+
elif output_format == 'json':
|
|
2773
|
+
output_path = output_dir / f"{dataset_id}.json"
|
|
2774
|
+
with open(output_path, 'w') as f:
|
|
2775
|
+
if isinstance(data, pd.DataFrame):
|
|
2776
|
+
json.dump(data.to_dict(orient='records'), f, indent=2)
|
|
2777
|
+
else:
|
|
2778
|
+
json.dump(data, f, indent=2)
|
|
2779
|
+
return str(output_path)
|
|
2780
|
+
|
|
2781
|
+
else:
|
|
2782
|
+
raise ValueError(f"Unsupported output format: {output_format}")
|
|
2783
|
+
|
|
2784
|
+
def get_statistics(self) -> Dict[str, Any]:
|
|
2785
|
+
"""获取统计信息"""
|
|
2786
|
+
cache_files = list(self.dir_save.rglob("*.pkl"))
|
|
2787
|
+
|
|
2788
|
+
stats = {
|
|
2789
|
+
'total_datasets': len(set(f.stem for f in cache_files)),
|
|
2790
|
+
'total_files': len(cache_files),
|
|
2791
|
+
'total_size_gb': sum(f.stat().st_size for f in cache_files) / (1024**3),
|
|
2792
|
+
'by_data_type': {},
|
|
2793
|
+
'by_format': {},
|
|
2794
|
+
'recent_downloads': []
|
|
2795
|
+
}
|
|
2796
|
+
|
|
2797
|
+
# 按数据类型统计
|
|
2798
|
+
for file_path in cache_files:
|
|
2799
|
+
try:
|
|
2800
|
+
rel_path = file_path.relative_to(self.dir_save)
|
|
2801
|
+
if len(rel_path.parts) >= 1:
|
|
2802
|
+
data_type = rel_path.parts[0]
|
|
2803
|
+
stats['by_data_type'][data_type] = stats['by_data_type'].get(data_type, 0) + 1
|
|
2804
|
+
except:
|
|
2805
|
+
pass
|
|
2806
|
+
|
|
2807
|
+
# 读取下载历史
|
|
2808
|
+
history_file = self.dir_save / "download_history.json"
|
|
2809
|
+
if history_file.exists():
|
|
2810
|
+
try:
|
|
2811
|
+
with open(history_file, 'r') as f:
|
|
2812
|
+
history = json.load(f)
|
|
2813
|
+
stats['recent_downloads'] = history[-10:] # 最近10次下载
|
|
2814
|
+
except:
|
|
2815
|
+
pass
|
|
2816
|
+
|
|
2817
|
+
return stats
|
|
2818
|
+
|
|
2819
|
+
|
|
2820
|
+
# 简化的使用函数(保持向后兼容)
|
|
2821
|
+
def fetch_data(dataset_ids: Union[str, List[str]],
|
|
2822
|
+
data_type: Optional[str] = None,
|
|
2823
|
+
data_format: Optional[str] = None,
|
|
2824
|
+
organism: Optional[str] = None,
|
|
2825
|
+
platform: Optional[str] = None,
|
|
2826
|
+
samples: Optional[List[str]] = None,
|
|
2827
|
+
force_download: bool = False,
|
|
2828
|
+
dir_save: str = "./bio_data_cache",
|
|
2829
|
+
auto_infer: bool = True,
|
|
2830
|
+
**kwargs) -> Dict[str, Any]:
|
|
2831
|
+
"""
|
|
2832
|
+
Simplified, smart data-fetching function.
|
|
2833
|
+
|
|
2834
|
+
Parameters:
|
|
2835
|
+
-----------
|
|
2836
|
+
dataset_ids : Union[str, List[str]]
|
|
2837
|
+
数据集ID
|
|
2838
|
+
data_type : Optional[str]
|
|
2839
|
+
数据类型,如未指定则自动推断
|
|
2840
|
+
data_format : Optional[str]
|
|
2841
|
+
数据格式,如未指定则自动推断
|
|
2842
|
+
organism : Optional[str]
|
|
2843
|
+
物种
|
|
2844
|
+
platform : Optional[str]
|
|
2845
|
+
平台
|
|
2846
|
+
samples : Optional[List[str]]
|
|
2847
|
+
样本列表
|
|
2848
|
+
force_download : bool
|
|
2849
|
+
强制重新下载
|
|
2850
|
+
dir_save : str
|
|
2851
|
+
缓存目录
|
|
2852
|
+
auto_infer : bool
|
|
2853
|
+
是否启用自动类型推断
|
|
2854
|
+
|
|
2855
|
+
Returns:
|
|
2856
|
+
--------
|
|
2857
|
+
Dict[str, Any]: dictionary of results keyed by dataset ID
|
|
2858
|
+
"""
|
|
2859
|
+
fetcher = BioDataFetcher(dir_save=dir_save, auto_infer=auto_infer)
|
|
2860
|
+
|
|
2861
|
+
return fetcher.fetch_data(
|
|
2862
|
+
dataset_ids=dataset_ids,
|
|
2863
|
+
data_type=data_type,
|
|
2864
|
+
data_format=data_format,
|
|
2865
|
+
organism=organism,
|
|
2866
|
+
platform=platform,
|
|
2867
|
+
samples=samples,
|
|
2868
|
+
force_download=force_download,
|
|
2869
|
+
**kwargs
|
|
2870
|
+
)
|
|
2871
|
+
|
|
2872
|
+
|
|
2873
|
+
# 快速使用函数
|
|
2874
|
+
def quick_fetch(dataset_id: str,
|
|
2875
|
+
dir_save: str = "./bio_data_cache",
|
|
2876
|
+
**kwargs) -> Any:
|
|
2877
|
+
"""
|
|
2878
|
+
Quickly fetch a single dataset with fully automatic inference.
|
|
2879
|
+
|
|
2880
|
+
Parameters:
|
|
2881
|
+
-----------
|
|
2882
|
+
dataset_id : str
|
|
2883
|
+
数据集ID
|
|
2884
|
+
dir_save : str
|
|
2885
|
+
缓存目录
|
|
2886
|
+
**kwargs : 其他参数传递给fetch_data
|
|
2887
|
+
|
|
2888
|
+
Returns:
|
|
2889
|
+
--------
|
|
2890
|
+
Any: the fetched data for the given dataset ID
|
|
2891
|
+
"""
|
|
2892
|
+
return fetch_data(
|
|
2893
|
+
dataset_ids=dataset_id,
|
|
2894
|
+
dir_save=dir_save,
|
|
2895
|
+
auto_infer=True,
|
|
2896
|
+
**kwargs
|
|
2897
|
+
).get(dataset_id)
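# Minimal usage sketch for the convenience helpers above (example accessions;
# network access is required and results land in dir_save):
#   df_meta = quick_fetch("SRR390728", data_format="metadata")
#   all_results = fetch_data(["GSE158055", "SRR390728"], dir_save="./bio_data_cache")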
|
|
2898
|
+
|
|
2899
|
+
|
|
2900
|
+
# 示例配置文件
|
|
2901
|
+
SAMPLE_CONFIG = """
|
|
2902
|
+
# BioDataFetcher 配置文件
|
|
2903
|
+
# 保存为 config.yaml 并使用 fetcher = BioDataFetcher(config_file='config.yaml')
|
|
2904
|
+
|
|
2905
|
+
# 下载设置
|
|
2906
|
+
max_retries: 3
|
|
2907
|
+
timeout: 30
|
|
2908
|
+
parallel_downloads: 4
|
|
2909
|
+
prefer_cached: true
|
|
2910
|
+
|
|
2911
|
+
# 数据处理
|
|
2912
|
+
auto_normalize: true
|
|
2913
|
+
gene_id_conversion: true
|
|
2914
|
+
quality_control: true
|
|
2915
|
+
|
|
2916
|
+
# API密钥(可选)
|
|
2917
|
+
ncbi_api_key: null
|
|
2918
|
+
ensembl_api_key: null
|
|
2919
|
+
|
|
2920
|
+
# 缓存设置
|
|
2921
|
+
max_cache_size_gb: 10
|
|
2922
|
+
|
|
2923
|
+
# 网络设置
|
|
2924
|
+
proxy: null
|
|
2925
|
+
user_agent: "BioDataFetcher/1.0"
|
|
2926
|
+
|
|
2927
|
+
# 日志设置
|
|
2928
|
+
log_level: "INFO"
|
|
2929
|
+
log_file: "bio_data_fetcher.log"
|
|
2930
|
+
"""
|
|
2931
|
+
|
|
2932
|
+
import subprocess
|
|
2933
|
+
import shutil
|
|
2934
|
+
from pathlib import Path
|
|
2935
|
+
|
|
2936
|
+
def check_fastq_dump_available():
|
|
2937
|
+
"""检查fastq-dump是否可用"""
|
|
2938
|
+
# 查找fastq-dump路径
|
|
2939
|
+
fastq_dump_path = shutil.which("fastq-dump")
|
|
2940
|
+
prefetch_path = shutil.which("prefetch")
|
|
2941
|
+
|
|
2942
|
+
print("检查SRA Toolkit工具...")
|
|
2943
|
+
|
|
2944
|
+
if fastq_dump_path:
|
|
2945
|
+
print(f"✅ fastq-dump 找到: {fastq_dump_path}")
|
|
2946
|
+
|
|
2947
|
+
# 检查版本
|
|
2948
|
+
try:
|
|
2949
|
+
result = subprocess.run(
|
|
2950
|
+
[fastq_dump_path, "--version"],
|
|
2951
|
+
capture_output=True,
|
|
2952
|
+
text=True,
|
|
2953
|
+
timeout=5
|
|
2954
|
+
)
|
|
2955
|
+
if result.returncode == 0:
|
|
2956
|
+
print(f" 版本: {result.stdout.strip()}")
|
|
2957
|
+
except:
|
|
2958
|
+
print(" 无法获取版本信息")
|
|
2959
|
+
else:
|
|
2960
|
+
print("❌ fastq-dump 未找到")
|
|
2961
|
+
print(" 请安装 SRA Toolkit: https://github.com/ncbi/sra-tools")
|
|
2962
|
+
|
|
2963
|
+
if prefetch_path:
|
|
2964
|
+
print(f"✅ prefetch 找到: {prefetch_path}")
|
|
2965
|
+
else:
|
|
2966
|
+
print("❌ prefetch 未找到")
|
|
2967
|
+
|
|
2968
|
+
return fastq_dump_path is not None and prefetch_path is not None
|
|
2969
|
+
|
|
2970
|
+
# 检查工具
|
|
2971
|
+
# check_fastq_dump_available()
|
|
2972
|
+
|
|
2973
|
+
def enhance_bio_data_fetcher_with_fastqdump():
|
|
2974
|
+
"""增强BioDataFetcher,添加fastq-dump支持"""
|
|
2975
|
+
|
|
2976
|
+
class EnhancedBioDataFetcher(BioDataFetcher):
|
|
2977
|
+
"""增强版的BioDataFetcher,支持fastq-dump"""
|
|
2978
|
+
|
|
2979
|
+
def __init__(self, *args, **kwargs):
|
|
2980
|
+
super().__init__(*args, **kwargs)
|
|
2981
|
+
self.fastq_downloader = FastqDumpDownloader(
|
|
2982
|
+
cache_dir=str(self.dir_save / "fastqdump")
|
|
2983
|
+
)
|
|
2984
|
+
|
|
2985
|
+
# 覆盖SRA处理器
|
|
2986
|
+
self.data_processors['sra'] = self._process_sra_enhanced
|
|
2987
|
+
if DataSource.SRA in self.data_processors:
|
|
2988
|
+
self.data_processors[DataSource.SRA] = self._process_sra_enhanced
|
|
2989
|
+
|
|
2990
|
+
def _process_sra_enhanced(self, config: DatasetConfig) -> Any:
|
|
2991
|
+
"""增强的SRA处理方法,优先使用fastq-dump"""
|
|
2992
|
+
dataset_id = config.dataset_id
|
|
2993
|
+
|
|
2994
|
+
if config.data_format == DataFormat.METADATA:
|
|
2995
|
+
# 仍然使用原来的方法获取元数据
|
|
2996
|
+
return self._process_sra(config)
|
|
2997
|
+
|
|
2998
|
+
elif config.data_format == DataFormat.FASTQ:
|
|
2999
|
+
print(f"使用fastq-dump下载FASTQ: {dataset_id}")
|
|
3000
|
+
|
|
3001
|
+
# 提取参数
|
|
3002
|
+
split_files = config.custom_params.get('split_files', True)
|
|
3003
|
+
gzip_output = config.custom_params.get('gzip_output', True)
|
|
3004
|
+
use_prefetch = config.custom_params.get('use_prefetch', True)
|
|
3005
|
+
max_retries = config.custom_params.get('max_retries', 3)
|
|
3006
|
+
|
|
3007
|
+
# 使用fastq-dump下载
|
|
3008
|
+
result = self.fastq_downloader.download_with_fastq_dump(
|
|
3009
|
+
accession=dataset_id,
|
|
3010
|
+
output_dir=self.dir_save / "fastq",
|
|
3011
|
+
split_files=split_files,
|
|
3012
|
+
gzip_output=gzip_output,
|
|
3013
|
+
max_retries=max_retries
|
|
3014
|
+
)
|
|
3015
|
+
|
|
3016
|
+
# 如果需要,也获取元数据
|
|
3017
|
+
if result.get('success', False) and config.custom_params.get('include_metadata', True):
|
|
3018
|
+
metadata = self._get_sra_metadata(dataset_id)
|
|
3019
|
+
result['metadata'] = metadata
|
|
3020
|
+
|
|
3021
|
+
return result
|
|
3022
|
+
|
|
3023
|
+
else:
|
|
3024
|
+
raise ValueError(f"Unsupported SRA format: {config.data_format}")
|
|
3025
|
+
|
|
3026
|
+
def download_sra_with_fastqdump(self,
|
|
3027
|
+
accession: str,
|
|
3028
|
+
split_files: bool = True,
|
|
3029
|
+
gzip_output: bool = True,
|
|
3030
|
+
**kwargs) -> Dict[str, Any]:
|
|
3031
|
+
"""
|
|
3032
|
+
专门使用fastq-dump下载SRA数据
|
|
3033
|
+
|
|
3034
|
+
Parameters:
|
|
3035
|
+
-----------
|
|
3036
|
+
accession : str
|
|
3037
|
+
SRA accession
|
|
3038
|
+
split_files : bool
|
|
3039
|
+
是否拆分paired-end文件
|
|
3040
|
+
gzip_output : bool
|
|
3041
|
+
是否gzip压缩
|
|
3042
|
+
**kwargs :
|
|
3043
|
+
其他参数传递给download_with_fastq_dump
|
|
3044
|
+
|
|
3045
|
+
Returns:
|
|
3046
|
+
--------
|
|
3047
|
+
Dict: 下载结果
|
|
3048
|
+
"""
|
|
3049
|
+
return self.fastq_downloader.download_with_fastq_dump(
|
|
3050
|
+
accession=accession,
|
|
3051
|
+
output_dir=self.dir_save / "fastq",
|
|
3052
|
+
split_files=split_files,
|
|
3053
|
+
gzip_output=gzip_output,
|
|
3054
|
+
**kwargs
|
|
3055
|
+
)
|
|
3056
|
+
|
|
3057
|
+
return EnhancedBioDataFetcher
|
|
3058
|
+
|
|
3059
|
+
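# Illustrative shape of the dict returned by download_with_fastq_dump, inferred
# from the keys read above ('success', 'metadata') and in the examples below
# ('files', 'file_count'); the authoritative structure is defined by
# FastqDumpDownloader elsewhere in this module.
_EXAMPLE_FASTQDUMP_RESULT = {
    "success": True,                                             # download completed
    "files": ["SRR390728_1.fastq.gz", "SRR390728_2.fastq.gz"],
    "file_count": 2,
    "metadata": None,                                            # filled when include_metadata=True
}
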
# Usage example
def example_enhanced_fetcher():
    """Use the enhanced BioDataFetcher."""
    print("Using the enhanced BioDataFetcher (with fastq-dump support)")
    print("=" * 60)

    # Create the enhanced fetcher
    EnhancedFetcher = enhance_bio_data_fetcher_with_fastqdump()
    fetcher = EnhancedFetcher(dir_save="./enhanced_cache")

    # Method 1: the unified interface (fastq-dump is selected automatically)
    print("\nMethod 1: unified interface")
    result1 = fetcher.fetch_data(
        dataset_ids="SRR390728",  # small run, good for testing
        data_type='sra',
        data_format='fastq',
        split_files=True,
        gzip_output=True,
        force_download=True
    )

    print(f"Result 1: {result1.get('SRR390728', {}).get('success', False)}")

    # Method 2: call the fastq-dump method directly
    print("\nMethod 2: direct fastq-dump call")
    result2 = fetcher.download_sra_with_fastqdump(
        accession="SRR390728",
        split_files=True,
        gzip_output=True
    )

    print(f"Result 2: success={result2.get('success', False)}, files={result2.get('file_count', 0)}")

    # Method 3: batch download
    print("\nMethod 3: batch download test")
    batch_result = fetcher.batch_fetch([
        {
            'dataset_id': 'SRR390728',
            'type': 'sra',
            'format': 'fastq',
            'split_files': True,
            'gzip_output': True
        },
        {
            'dataset_id': 'SRR3473776',  # another small run
            'type': 'sra',
            'format': 'fastq',
            'split_files': False  # single-end data
        }
    ])

    for acc, res in batch_result.items():
        print(f"  {acc}: success={res.get('success', False)}, files={len(res.get('files', []))}")

    return fetcher, result1, result2, batch_result

# Run the example
# fetcher, r1, r2, batch = example_enhanced_fetcher()

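# A small follow-up sketch: collect the FASTQ paths of the runs that succeeded
# in a batch_fetch result (keys follow their usage in example_enhanced_fetcher
# above; illustrative only).
def _collect_successful_fastqs(batch_result):
    """Illustrative only: map accession -> list of downloaded FASTQ files."""
    return {
        acc: res.get("files", [])
        for acc, res in batch_result.items()
        if res.get("success", False)
    }
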
def setup_sra_toolkit():
    """Help the user install and configure the SRA Toolkit."""
    import platform
    import sys

    print("SRA Toolkit installation assistant")
    print("=" * 60)

    system = platform.system()
    print(f"Operating system: {system}")
    print(f"Python version: {sys.version}")

    # Check whether it is already installed
    fastq_dump_path = shutil.which("fastq-dump")
    prefetch_path = shutil.which("prefetch")

    if fastq_dump_path and prefetch_path:
        print("✅ SRA Toolkit is already installed")
        print(f"   fastq-dump: {fastq_dump_path}")
        print(f"   prefetch: {prefetch_path}")
        return True

    print("❌ SRA Toolkit is not installed or not on PATH")
    print("\nInstallation guide:")

    if system == "Darwin":  # macOS
        print("""
        Method 1: Homebrew (recommended)
           brew install sratoolkit

        Method 2: manual download
           1. Visit: https://github.com/ncbi/sra-tools/wiki/Downloads
           2. Download the macOS build
           3. Unpack and add it to PATH:
              echo 'export PATH=$PATH:/path/to/sratoolkit/bin' >> ~/.zshrc
              source ~/.zshrc
        """)

    elif system == "Linux":
        print("""
        Method 1: package manager
           # Ubuntu/Debian
           sudo apt-get install sra-toolkit

           # CentOS/RHEL/Fedora
           sudo yum install sra-toolkit

        Method 2: manual download
           1. Visit: https://github.com/ncbi/sra-tools/wiki/Downloads
           2. Download the Linux build
           3. Unpack and add it to PATH:
              echo 'export PATH=$PATH:/path/to/sratoolkit/bin' >> ~/.bashrc
              source ~/.bashrc
        """)

    elif system == "Windows":
        print("""
        Method 1: Chocolatey
           choco install sratoolkit

        Method 2: manual download
           1. Visit: https://github.com/ncbi/sra-tools/wiki/Downloads
           2. Download the Windows build
           3. Unpack and add the bin directory to the system PATH
        """)

    else:
        print(f"  Unsupported operating system: {system}")

    print("\nConfiguration suggestions:")
    print("  1. Run 'vdb-config -i' for interactive configuration")
    print("  2. Set the cache directory: vdb-config --set /repository/user/main/public/root=./ncbi_cache")
    print("  3. Test: prefetch SRR390728 && fastq-dump SRR390728")

    return False

def configure_sra_toolkit():
    """Configure the SRA Toolkit (if it is installed)."""
    import subprocess

    print("Configuring the SRA Toolkit")
    print("=" * 50)

    # Check whether it is installed
    if not shutil.which("vdb-config"):
        print("❌ vdb-config not found; please install the SRA Toolkit first")
        return False

    try:
        # Set up the cache directory
        cache_dir = Path.home() / ".ncbi" / "cache"
        cache_dir.mkdir(parents=True, exist_ok=True)

        print(f"Setting cache directory: {cache_dir}")

        # Run vdb-config to apply the configuration
        print("\nRecommended interactive configuration:")
        print("   vdb-config -i")
        print("\nOr configure from the command line:")
        print(f"   vdb-config --set /repository/user/main/public/root={cache_dir}")
        print("   vdb-config --set /repository/user/main/public/apps/http/read-only=true")

        # Try to apply the setting automatically
        try:
            subprocess.run(
                ["vdb-config", "--set", f"/repository/user/main/public/root={cache_dir}"],
                check=True,
                capture_output=True,
                text=True
            )
            print("✅ Cache directory set successfully")
        except Exception:
            print("⚠️ Could not set it automatically; please run vdb-config manually")

        return True

    except Exception as e:
        print(f"❌ Configuration failed: {e}")
        return False

# Run the installation assistant
# setup_sra_toolkit()
# configure_sra_toolkit()

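# A small smoke test mirroring the 'prefetch SRR390728 && fastq-dump SRR390728'
# check suggested above. SRR390728 is a small public run; -X limits fastq-dump to
# the first few spots and -Z writes them to stdout. Illustrative only.
def _sra_toolkit_smoke_test(accession="SRR390728"):
    """Illustrative only: return True if prefetch and fastq-dump both run cleanly."""
    try:
        subprocess.run(["prefetch", accession], check=True, capture_output=True)
        subprocess.run(["fastq-dump", "-X", "5", "-Z", accession],
                       check=True, capture_output=True)
        return True
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False
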
def install_fastq_dump_helper():
    """Provide installation help for fastq-dump."""
    import platform
    import sys
    import subprocess
    import shutil

    print("🔧 fastq-dump installation assistant")
    print("=" * 60)

    # Gather system information
    system = platform.system()
    machine = platform.machine()
    python_version = sys.version_info

    print(f"Operating system: {system} ({machine})")
    print(f"Python version: {sys.version[:20]}")

    # Check the current status
    tools = ['fastq-dump', 'prefetch', 'fasterq-dump']
    available = {}

    for tool in tools:
        path = shutil.which(tool)
        available[tool] = path
        status = "✅ installed" if path else "❌ not installed"
        print(f"{tool}: {status}")
        if path:
            print(f"  Path: {path}")

            # Try to get the version
            try:
                result = subprocess.run(
                    [tool, "--version"],
                    capture_output=True,
                    text=True,
                    timeout=5
                )
                if result.returncode == 0:
                    version_line = result.stdout.split('\n')[0] if result.stdout else "unknown"
                    print(f"  Version: {version_line}")
            except Exception:
                pass

    print("\n" + "=" * 60)
    print("Installation guide:")

    if system == "Darwin":  # macOS
        print("""
        Method 1: Homebrew (recommended)
        ---------------------------------
        1. Install Homebrew (if not yet installed):
           /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"

        2. Install the SRA Toolkit:
           brew install sratoolkit

        3. Verify the installation:
           fastq-dump --version
           prefetch --version

        Method 2: Conda
        ---------------------------------
        1. Install Miniconda or Anaconda
        2. Create an environment and install:
           conda create -n sra-tools -c bioconda sra-tools
           conda activate sra-tools
        3. Verify: fastq-dump --version

        Method 3: manual download
        ---------------------------------
        1. Visit: https://github.com/ncbi/sra-tools/wiki/Downloads
        2. Download the macOS build (.dmg or .tar.gz)
        3. Unpack and add it to PATH:
           echo 'export PATH=$PATH:/path/to/sratoolkit/bin' >> ~/.zshrc
           source ~/.zshrc
        """)

    elif system == "Linux":
        print("""
        Method 1: package manager (Ubuntu/Debian)
        ---------------------------------
        1. Update the package list:
           sudo apt-get update

        2. Install the SRA Toolkit:
           sudo apt-get install sra-toolkit

        3. Verify the installation:
           fastq-dump --version

        Method 2: package manager (CentOS/RHEL/Fedora)
        ---------------------------------
        1. Install the EPEL repository (CentOS/RHEL):
           sudo yum install epel-release

        2. Install the SRA Toolkit:
           sudo yum install sra-toolkit
           or
           sudo dnf install sra-toolkit  (Fedora)

        3. Verify the installation

        Method 3: Conda
        ---------------------------------
        1. Install Miniconda:
           wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
           bash Miniconda3-latest-Linux-x86_64.sh

        2. Install the SRA Toolkit:
           conda install -c bioconda sra-tools

        Method 4: manual download
        ---------------------------------
        1. Visit: https://github.com/ncbi/sra-tools/wiki/Downloads
        2. Download the Linux build (.tar.gz)
        3. Unpack and add it to PATH:
           tar -xzvf sratoolkit.*.tar.gz
           echo 'export PATH=$PATH:/path/to/sratoolkit/bin' >> ~/.bashrc
           source ~/.bashrc
        """)

    elif system == "Windows":
        print("""
        Method 1: Chocolatey (recommended)
        ---------------------------------
        1. Install Chocolatey (if not yet installed):
           Open PowerShell as administrator and run:
           Set-ExecutionPolicy Bypass -Scope Process -Force
           [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072
           iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))

        2. Install the SRA Toolkit:
           choco install sratoolkit

        3. Verify the installation:
           fastq-dump --version

        Method 2: Conda
        ---------------------------------
        1. Install Miniconda: https://docs.conda.io/en/latest/miniconda.html
        2. Install the SRA Toolkit:
           conda install -c bioconda sra-tools

        Method 3: manual download
        ---------------------------------
        1. Visit: https://github.com/ncbi/sra-tools/wiki/Downloads
        2. Download the Windows build (.exe installer)
        3. Run the installer and make sure "Add to PATH" is checked
        """)

    else:
        print(f"⚠️ Unsupported operating system: {system}")
        print("Please visit: https://github.com/ncbi/sra-tools/wiki/Downloads")

    print("\n" + "=" * 60)
    print("Configuration suggestions:")

    if any(available.values()):
        print("Run the following command to configure:")
        print("  vdb-config -i  (interactive configuration)")
        print("\nOr configure from the command line:")
        print("  vdb-config --set /repository/user/main/public/root=./ncbi_cache")
        print("  vdb-config --set /repository/user/main/public/apps/http/read-only=true")
    else:
        print("Please install the SRA Toolkit first, then run the configuration commands above")

    return available

# Run the installation assistant
# install_fastq_dump_helper()


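# Example of acting on the dict returned by install_fastq_dump_helper()
# (tool name -> path or None), kept commented out like the other examples:
# tools = install_fastq_dump_helper()
# if not tools.get("fastq-dump"):
#     raise RuntimeError("fastq-dump not found - install the SRA Toolkit first")
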
if __name__ == "__main__":
    # Demonstrate basic usage
    print("BioDataFetcher demo")
    print("=" * 50)

    # Create a fetcher instance
    fetcher = BioDataFetcher(dir_save="./test_cache")

    # Example 1: auto-detect and fetch GEO data
    print("\n1. Fetching GEO data (auto-detected):")
    geo_data = fetcher.fetch_data("GSE12345")
    print(f"   Retrieved data: {type(geo_data)}")

    # Example 2: fetch SRA metadata
    print("\n2. Fetching SRA metadata:")
    sra_meta = fetcher.fetch_data("SRR1635435", data_format="metadata")
    print(f"   Retrieved metadata: {type(sra_meta)}")

    # Example 3: search datasets
    print("\n3. Searching cancer-related datasets:")
    search_results = fetcher.list_datasets(search_query="cancer RNA-seq", limit=5)
    if not search_results.empty:
        print(f"   Found {len(search_results)} datasets:")
        for _, row in search_results.iterrows():
            print(f"   {row['accession']}: {row['title'][:50]}...")

    # Example 4: inspect the cache
    print("\n4. Listing cached data:")
    cached = fetcher.cache_list()
    if not cached.empty:
        print(f"   {len(cached)} cached datasets")

    # Example 5: get statistics
    print("\n5. Statistics:")
    stats = fetcher.get_statistics()
    print(f"   Total datasets: {stats['total_datasets']}")
    print(f"   Cache size: {stats['total_size_gb']:.2f} GB")