py2ls 0.1.10.12__py3-none-any.whl → 0.2.7.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of py2ls might be problematic.

Files changed (72)
  1. py2ls/.DS_Store +0 -0
  2. py2ls/.git/.DS_Store +0 -0
  3. py2ls/.git/index +0 -0
  4. py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
  5. py2ls/.git/objects/.DS_Store +0 -0
  6. py2ls/.git/refs/.DS_Store +0 -0
  7. py2ls/ImageLoader.py +621 -0
  8. py2ls/__init__.py +7 -5
  9. py2ls/apptainer2ls.py +3940 -0
  10. py2ls/batman.py +164 -42
  11. py2ls/bio.py +2595 -0
  12. py2ls/cell_image_clf.py +1632 -0
  13. py2ls/container2ls.py +4635 -0
  14. py2ls/corr.py +475 -0
  15. py2ls/data/.DS_Store +0 -0
  16. py2ls/data/email/email_html_template.html +88 -0
  17. py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
  18. py2ls/data/hyper_param_tabrepo_2024.py +1753 -0
  19. py2ls/data/mygenes_fields_241022.txt +355 -0
  20. py2ls/data/re_common_pattern.json +173 -0
  21. py2ls/data/sns_info.json +74 -0
  22. py2ls/data/styles/.DS_Store +0 -0
  23. py2ls/data/styles/example/.DS_Store +0 -0
  24. py2ls/data/styles/stylelib/.DS_Store +0 -0
  25. py2ls/data/styles/stylelib/grid.mplstyle +15 -0
  26. py2ls/data/styles/stylelib/high-contrast.mplstyle +6 -0
  27. py2ls/data/styles/stylelib/high-vis.mplstyle +4 -0
  28. py2ls/data/styles/stylelib/ieee.mplstyle +15 -0
  29. py2ls/data/styles/stylelib/light.mplstyl +6 -0
  30. py2ls/data/styles/stylelib/muted.mplstyle +6 -0
  31. py2ls/data/styles/stylelib/nature-reviews-latex.mplstyle +616 -0
  32. py2ls/data/styles/stylelib/nature-reviews.mplstyle +616 -0
  33. py2ls/data/styles/stylelib/nature.mplstyle +31 -0
  34. py2ls/data/styles/stylelib/no-latex.mplstyle +10 -0
  35. py2ls/data/styles/stylelib/notebook.mplstyle +36 -0
  36. py2ls/data/styles/stylelib/paper.mplstyle +290 -0
  37. py2ls/data/styles/stylelib/paper2.mplstyle +305 -0
  38. py2ls/data/styles/stylelib/retro.mplstyle +4 -0
  39. py2ls/data/styles/stylelib/sans.mplstyle +10 -0
  40. py2ls/data/styles/stylelib/scatter.mplstyle +7 -0
  41. py2ls/data/styles/stylelib/science.mplstyle +48 -0
  42. py2ls/data/styles/stylelib/std-colors.mplstyle +4 -0
  43. py2ls/data/styles/stylelib/vibrant.mplstyle +6 -0
  44. py2ls/data/tiles.csv +146 -0
  45. py2ls/data/usages_pd.json +1417 -0
  46. py2ls/data/usages_sns.json +31 -0
  47. py2ls/docker2ls.py +5446 -0
  48. py2ls/ec2ls.py +61 -0
  49. py2ls/fetch_update.py +145 -0
  50. py2ls/ich2ls.py +1955 -296
  51. py2ls/im2.py +8242 -0
  52. py2ls/image_ml2ls.py +2100 -0
  53. py2ls/ips.py +33909 -3418
  54. py2ls/ml2ls.py +7700 -0
  55. py2ls/mol.py +289 -0
  56. py2ls/mount2ls.py +1307 -0
  57. py2ls/netfinder.py +873 -351
  58. py2ls/nl2ls.py +283 -0
  59. py2ls/ocr.py +1581 -458
  60. py2ls/plot.py +10394 -314
  61. py2ls/rna2ls.py +311 -0
  62. py2ls/ssh2ls.md +456 -0
  63. py2ls/ssh2ls.py +5933 -0
  64. py2ls/ssh2ls_v01.py +2204 -0
  65. py2ls/stats.py +66 -172
  66. py2ls/temp20251124.py +509 -0
  67. py2ls/translator.py +2 -0
  68. py2ls/utils/decorators.py +3564 -0
  69. py2ls/utils_bio.py +3453 -0
  70. {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/METADATA +113 -224
  71. {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/RECORD +72 -16
  72. {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/WHEEL +0 -0
py2ls/utils_bio.py ADDED
@@ -0,0 +1,3453 @@
+ """
+ Contents:
+ 1. Basic usage
+ 2. GEO data retrieval
+ 3. SRA data retrieval
+ 4. Dataset search
+ 5. Batch processing
+ 6. Cache management
+ 7. Advanced features
+ 8. Troubleshooting
+
+ Multiple data sources: GEO, TCGA, SRA, ArrayExpress, ENCODE, single-cell data, etc.
+ Multiple data formats: expression matrices, clinical data, mutation data, FASTQ files, etc.
+ Smart caching: downloaded data is cached automatically to avoid re-downloading
+ Parallel downloads: multi-threaded downloads for large files
+ Dataset search: built-in dataset search
+ Batch processing: download several datasets in one call
+ Configuration management: YAML/JSON config files
+ History: every download operation is recorded
+ Backward compatible: keeps compatibility with the existing GEO functions
+ Error handling: thorough error handling and logging
+
+
+ # 1. Simple usage (prefers fastq-dump automatically)
+ fetcher = BioDataFetcher(dir_save="./my_cache", prefer_fastq_dump=True)
+ result = fetcher.fetch_data("SRR1635435", data_type='sra', data_format='fastq')
+
+ # 2. Use a configuration file
+ fetcher = BioDataFetcher(dir_save="./my_cache", config_file="./config.yaml")
+
+ # 3. Force a specific download method
+ result = fetcher.fetch_data(
+     dataset_ids="SRR1635435",
+     data_type='sra',
+     data_format='fastq',
+     download_method='fastq_dump'  # or 'ftp'
+ )
+
+ # 4. Pass fastq-dump options through
+ result = fetcher.fetch_data(
+     dataset_ids="SRR1635435",
+     data_type='sra',
+     data_format='fastq',
+     split_files=True,
+     gzip_output=True,
+     threads=4
+ )
+
+ """
+
+ import os
+ import re
+ import shutil      # used by the downloader classes below (shutil.which)
+ import subprocess  # used by the fastq-dump/prefetch wrappers below
+ import pandas as pd
+ import numpy as np
+ from typing import Union, Dict, List, Optional, Any, Tuple, Callable
+ import logging
+ from pathlib import Path
+ import warnings
+ from datetime import datetime
+ import json
+ import yaml
+ from dataclasses import dataclass, field
+ from enum import Enum
+ import hashlib
+ import pickle
+ from tqdm import tqdm
+ import time
+ import requests
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+
+ logger = logging.getLogger(__name__)
+
+ # Import the existing GEO helpers
+ try:
+     from . import bio as geo_utils
+     GEO_UTILS_AVAILABLE = True
+ except ImportError:
+     GEO_UTILS_AVAILABLE = False
+     warnings.warn("GEO utils not available. Make sure bio.py is in the same directory")
+
+ # Optional extra libraries (install as needed)
+ try:
+     import GEOparse
+     GEOPARSE_AVAILABLE = True
+ except ImportError:
+     GEOPARSE_AVAILABLE = False
+     warnings.warn("GEOparse not available. Install with: pip install GEOparse")
+
+ try:
+     from pysradb import SRAweb
+     SRADB_AVAILABLE = True
+ except ImportError:
+     SRADB_AVAILABLE = False
+     warnings.warn("pysradb not available. Install with: pip install pysradb")
+
+ try:
+     import gseapy as gp
+     GSEAPY_AVAILABLE = True
+ except ImportError:
+     GSEAPY_AVAILABLE = False
+
+ try:
+     import mygene
+     MYGENE_AVAILABLE = True
+ except ImportError:
+     MYGENE_AVAILABLE = False
+
+ try:
+     import requests
+     REQUESTS_AVAILABLE = True
+ except ImportError:
+     REQUESTS_AVAILABLE = False
+     warnings.warn("requests not available. Install with: pip install requests")
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ # Data source enumeration
+ class DataSource(Enum):
+     GEO = "geo"                      # Gene Expression Omnibus
+     SRA = "sra"                      # Sequence Read Archive
+     TCGA = "tcga"                    # The Cancer Genome Atlas
+     ENCODE = "encode"                # ENCODE Project
+     ARRAY_EXPRESS = "arrayexpress"   # ArrayExpress
+     DDBJ = "ddbj"                    # DNA Data Bank of Japan
+     EGA = "ega"                      # European Genome-phenome Archive
+     SINGLE_CELL = "single_cell"      # single-cell data
+     PROTEIN_ATLAS = "protein_atlas"  # Human Protein Atlas
+     STRINGDB = "stringdb"            # STRING database
+     KEGG = "kegg"                    # KEGG pathways
+     REACTOME = "reactome"            # Reactome pathways
+     CUSTOM = "custom"                # custom data source
+
+     @classmethod
+     def from_accession(cls, accession: str) -> 'DataSource':
+         """Infer the data source from an accession string."""
+         accession = accession.upper()
+
+         # GEO datasets
+         if re.match(r'^GSE\d+$', accession) or re.match(r'^GDS\d+$', accession):
+             return cls.GEO
+
+         # SRA runs
+         elif re.match(r'^(SRR|ERR|DRR)\d+$', accession):
+             return cls.SRA
+
+         # TCGA projects
+         elif re.match(r'^TCGA-[A-Z0-9]+$', accession) or accession.startswith('TCGA_'):
+             return cls.TCGA
+
+         # ENCODE datasets
+         elif re.match(r'^ENC[SR]\d+$', accession):
+             return cls.ENCODE
+
+         # ArrayExpress
+         elif re.match(r'^E-[A-Z]{4}-\d+$', accession):
+             return cls.ARRAY_EXPRESS
+
+         # DDBJ
+         elif re.match(r'^(DRA|DRS|DRX|DRZ)\d+$', accession):
+             return cls.DDBJ
+
+         # Single-cell datasets (common formats)
+         elif re.match(r'^SC\d+$', accession) or 'SC' in accession:
+             return cls.SINGLE_CELL
+
+         # Default to GEO
+         else:
+             return cls.GEO
+
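A quick sketch of how the accession-based inference above resolves a few identifiers (illustrative only; apart from the SRR run taken from the module docstring, the accessions are arbitrary examples, and the module is assumed to be importable as py2ls.utils_bio):

    from py2ls.utils_bio import DataSource

    print(DataSource.from_accession("GSE12345"))     # DataSource.GEO
    print(DataSource.from_accession("SRR1635435"))   # DataSource.SRA
    print(DataSource.from_accession("E-MTAB-5061"))  # DataSource.ARRAY_EXPRESS
    print(DataSource.from_accession("DRX000001"))    # DataSource.DDBJ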
+ class DataFormat(Enum):
+     EXPRESSION = "expression"    # expression matrix
+     COUNTS = "counts"            # raw counts
+     FASTQ = "fastq"              # FASTQ files
+     BAM = "bam"                  # BAM files
+     METADATA = "metadata"        # metadata
+     CLINICAL = "clinical"        # clinical data
+     MUTATIONS = "mutations"      # mutation data
+     PROBE = "probe"              # probe information
+     ANNOTATION = "annotation"    # annotation
+     NETWORK = "network"          # network data
+     PATHWAY = "pathway"          # pathway data
+
+     @classmethod
+     def infer_format(cls, data_type: DataSource, **kwargs) -> 'DataFormat':
+         """Infer the data format from the data source and other parameters."""
+         platform = kwargs.get('platform', '').lower()
+         data_format = kwargs.get('data_format', '').lower()
+
+         # If a format was given explicitly, use it
+         if data_format:
+             for fmt in cls:
+                 if fmt.value == data_format:
+                     return fmt
+
+         # Otherwise infer from the data source
+         if data_type == DataSource.GEO:
+             return cls.EXPRESSION
+         elif data_type == DataSource.SRA:
+             return cls.FASTQ if kwargs.get('download_fastq', False) else cls.METADATA
+         elif data_type == DataSource.TCGA:
+             if platform == 'clinical':
+                 return cls.CLINICAL
+             elif platform == 'mutations':
+                 return cls.MUTATIONS
+             else:
+                 return cls.EXPRESSION
+         elif data_type == DataSource.ENCODE:
+             return cls.BAM if 'chip' in platform else cls.EXPRESSION
+         else:
+             return cls.METADATA
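And the format inference that follows it, driven by the keyword arguments (illustrative only):

    from py2ls.utils_bio import DataSource, DataFormat

    print(DataFormat.infer_format(DataSource.SRA, download_fastq=True))   # DataFormat.FASTQ
    print(DataFormat.infer_format(DataSource.TCGA, platform="clinical"))  # DataFormat.CLINICAL
    print(DataFormat.infer_format(DataSource.GEO))                        # DataFormat.EXPRESSION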
+ class FastqDumpDownloader:
+     """
+     Downloader that fetches SRA data with fastq-dump.
+     More reliable than plain FTP and supports more features.
+     """
+
+     def __init__(self, cache_dir: str = "./sra_fastqdump", use_prefetch: bool = True):
+         """
+         Parameters:
+         -----------
+         cache_dir : str
+             Cache directory
+         use_prefetch : bool
+             Whether to fetch the .sra file with prefetch first (recommended)
+         """
+         self.cache_dir = Path(cache_dir)
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         self.use_prefetch = use_prefetch
+
+         # Locate the SRA Toolkit binaries
+         self.fastq_dump_path = shutil.which("fastq-dump") or shutil.which("fastq-dump.exe")
+         self.prefetch_path = shutil.which("prefetch") or shutil.which("prefetch.exe")
+         self.fasterq_dump_path = shutil.which("fasterq-dump") or shutil.which("fasterq-dump.exe")
+
+         print("Tool status:")
+         print(f"  fastq-dump: {'✅ available' if self.fastq_dump_path else '❌ not found'}")
+         print(f"  prefetch: {'✅ available' if self.prefetch_path else '❌ not found'}")
+         print(f"  fasterq-dump: {'✅ available' if self.fasterq_dump_path else '❌ not found'}")
+
+     def download_with_fastq_dump(self,
+                                  accession: str,
+                                  output_dir: Optional[Path] = None,
+                                  split_files: bool = True,
+                                  gzip_output: bool = True,
+                                  max_retries: int = 3) -> Dict[str, Any]:
+         """
+         Download data with fastq-dump.
+
+         Parameters:
+         -----------
+         accession : str
+             SRA accession (SRR, ERR, DRR)
+         output_dir : Path
+             Output directory
+         split_files : bool
+             Whether to split files (paired-end data becomes _1.fastq and _2.fastq)
+         gzip_output : bool
+             Whether to gzip the output
+         max_retries : int
+             Maximum number of retries
+
+         Returns:
+         --------
+         Dict: download result
+         """
+         import time
+
+         if output_dir is None:
+             output_dir = self.cache_dir / accession
+         else:
+             output_dir = Path(output_dir) / accession
+
+         output_dir.mkdir(parents=True, exist_ok=True)
+
+         if not self.fastq_dump_path:
+             return {
+                 'accession': accession,
+                 'success': False,
+                 'error': 'fastq-dump not found. Please install SRA Toolkit.',
+                 'step': 'tool_check'
+             }
+
+         print(f"Downloading with fastq-dump: {accession}")
+         print(f"Output directory: {output_dir}")
+         print(f"Split files: {split_files}")
+         print(f"gzip output: {gzip_output}")
+         print("-" * 50)
+
+         results = {}
+
+         # Method 1: prefetch + fastq-dump (recommended)
+         if self.use_prefetch and self.prefetch_path:
+             print("Method 1: prefetch + fastq-dump")
+             result = self._download_with_prefetch(
+                 accession=accession,
+                 output_dir=output_dir,
+                 split_files=split_files,
+                 gzip_output=gzip_output,
+                 max_retries=max_retries
+             )
+             results['prefetch_method'] = result
+
+             if result.get('success', False):
+                 print("✅ prefetch method succeeded")
+                 return self._format_result(accession, output_dir, result)
+
+         # Method 2: fastq-dump directly
+         print("\nMethod 2: fastq-dump directly")
+         result = self._download_direct(
+             accession=accession,
+             output_dir=output_dir,
+             split_files=split_files,
+             gzip_output=gzip_output,
+             max_retries=max_retries
+         )
+         results['direct_method'] = result
+
+         if result.get('success', False):
+             print("✅ direct method succeeded")
+             return self._format_result(accession, output_dir, result)
+
+         # Method 3: fasterq-dump (if available)
+         if self.fasterq_dump_path:
+             print("\nMethod 3: fasterq-dump (faster)")
+             result = self._download_with_fasterq_dump(
+                 accession=accession,
+                 output_dir=output_dir,
+                 split_files=split_files,
+                 gzip_output=gzip_output,
+                 max_retries=max_retries
+             )
+             results['fasterq_method'] = result
+
+             if result.get('success', False):
+                 print("✅ fasterq-dump method succeeded")
+                 return self._format_result(accession, output_dir, result)
+
+         # Every method failed
+         print("❌ all methods failed")
+         return {
+             'accession': accession,
+             'success': False,
+             'error': 'All download methods failed',
+             'results': results,
+             'output_dir': str(output_dir)
+         }
353
+
354
+ def _download_with_prefetch(self, accession, output_dir, split_files, gzip_output, max_retries):
355
+ """使用prefetch下载.sra文件,然后用fastq-dump转换"""
356
+ import time
357
+
358
+ sra_dir = output_dir / ".sra_cache"
359
+ sra_dir.mkdir(exist_ok=True)
360
+
361
+ # 步骤1: 使用prefetch下载.sra文件
362
+ print(" 步骤1: 使用prefetch下载.sra文件...")
363
+
364
+ prefetch_cmd = [
365
+ self.prefetch_path,
366
+ accession,
367
+ "-O", str(sra_dir),
368
+ "--progress" # 显示进度
369
+ ]
370
+
371
+ try:
372
+ print(f" 运行: {' '.join(prefetch_cmd)}")
373
+ result = subprocess.run(
374
+ prefetch_cmd,
375
+ capture_output=True,
376
+ text=True,
377
+ timeout=600, # 10分钟超时
378
+ check=True
379
+ )
380
+
381
+ print(f" prefetch完成: {result.stdout[-200:] if result.stdout else '无输出'}")
382
+
383
+ # 查找下载的.sra文件
384
+ sra_files = list(sra_dir.glob(f"**/{accession}.sra"))
385
+ if not sra_files:
386
+ sra_files = list(sra_dir.glob(f"**/*.sra"))
387
+
388
+ if not sra_files:
389
+ return {'success': False, 'error': 'No .sra file found after prefetch'}
390
+
391
+ sra_file = sra_files[0]
392
+ print(f" 找到.sra文件: {sra_file} ({sra_file.stat().st_size/1024/1024:.1f} MB)")
393
+
394
+ # 步骤2: 使用fastq-dump转换
395
+ return self._run_fastq_dump(
396
+ input_file=str(sra_file),
397
+ output_dir=output_dir,
398
+ split_files=split_files,
399
+ gzip_output=gzip_output
400
+ )
401
+
402
+ except subprocess.TimeoutExpired:
403
+ return {'success': False, 'error': 'prefetch timed out after 10 minutes'}
404
+ except subprocess.CalledProcessError as e:
405
+ return {'success': False, 'error': f'prefetch failed: {e.stderr[:200]}'}
406
+ except Exception as e:
407
+ return {'success': False, 'error': f'prefetch error: {type(e).__name__}: {e}'}
408
+
409
+ def _download_direct(self, accession, output_dir, split_files, gzip_output, max_retries):
410
+ """直接使用fastq-dump下载(不先下载.sra文件)"""
411
+ print(" 直接下载并转换...")
412
+
413
+ # 构建fastq-dump命令
414
+ cmd = self._build_fastq_dump_command(
415
+ accession=accession,
416
+ output_dir=output_dir,
417
+ split_files=split_files,
418
+ gzip_output=gzip_output
419
+ )
420
+
421
+ try:
422
+ print(f" 运行: {' '.join(cmd)}")
423
+ result = subprocess.run(
424
+ cmd,
425
+ capture_output=True,
426
+ text=True,
427
+ timeout=900, # 15分钟超时(可能较长)
428
+ check=True
429
+ )
430
+
431
+ print(f" fastq-dump输出: {result.stdout[-500:] if result.stdout else '无输出'}")
432
+
433
+ # 检查生成的文件
434
+ return self._check_output_files(output_dir, accession, split_files, gzip_output)
435
+
436
+ except subprocess.TimeoutExpired:
437
+ return {'success': False, 'error': 'fastq-dump timed out after 15 minutes'}
438
+ except subprocess.CalledProcessError as e:
439
+ error_msg = e.stderr[:500] if e.stderr else str(e)
440
+ return {'success': False, 'error': f'fastq-dump failed: {error_msg}'}
441
+ except Exception as e:
442
+ return {'success': False, 'error': f'fastq-dump error: {type(e).__name__}: {e}'}
443
+
444
+ def _download_with_fasterq_dump(self, accession, output_dir, split_files, gzip_output, max_retries):
445
+ """使用fasterq-dump(更快版本)"""
446
+ print(" 使用fasterq-dump...")
447
+
448
+ # 构建fasterq-dump命令
449
+ cmd = [
450
+ self.fasterq_dump_path,
451
+ accession,
452
+ "-O", str(output_dir),
453
+ "-e", "4", # 使用4个线程
454
+ "-p" # 显示进度
455
+ ]
456
+
457
+ if split_files:
458
+ cmd.append("--split-files")
459
+
460
+ try:
461
+ print(f" 运行: {' '.join(cmd)}")
462
+ result = subprocess.run(
463
+ cmd,
464
+ capture_output=True,
465
+ text=True,
466
+ timeout=600, # 10分钟超时
467
+ check=True
468
+ )
469
+
470
+ print(f" fasterq-dump输出: {result.stdout[-500:] if result.stdout else '无输出'}")
471
+
472
+ # 如果需要gzip,使用并行gzip
473
+ if gzip_output:
474
+ self._gzip_files(output_dir)
475
+
476
+ return self._check_output_files(output_dir, accession, split_files, gzip_output)
477
+
478
+ except subprocess.TimeoutExpired:
479
+ return {'success': False, 'error': 'fasterq-dump timed out'}
480
+ except subprocess.CalledProcessError as e:
481
+ error_msg = e.stderr[:500] if e.stderr else str(e)
482
+ return {'success': False, 'error': f'fasterq-dump failed: {error_msg}'}
483
+ except Exception as e:
484
+ return {'success': False, 'error': f'fasterq-dump error: {type(e).__name__}: {e}'}
485
+
486
+ def _build_fastq_dump_command(self, accession, output_dir, split_files, gzip_output):
487
+ """构建fastq-dump命令"""
488
+ cmd = [
489
+ self.fastq_dump_path,
490
+ accession,
491
+ "--outdir", str(output_dir),
492
+ "--skip-technical", # 跳过技术读取
493
+ "--readids", # 在读取ID中包含原始名称
494
+ "--dumpbase", # 以碱基形式格式化序列
495
+ "--clip", # 移除适配器和质量修剪
496
+ ]
497
+
498
+ if split_files:
499
+ cmd.append("--split-files")
500
+
501
+ if gzip_output:
502
+ cmd.append("--gzip")
503
+
504
+ # 添加其他有用选项
505
+ cmd.extend([
506
+ "--read-filter", "pass", # 只保留通过的读取
507
+ "--origfmt" # 保持原始格式
508
+ ])
509
+
510
+ return cmd
511
+
512
+ def _run_fastq_dump(self, input_file, output_dir, split_files, gzip_output):
513
+ """运行fastq-dump转换.sra文件"""
514
+ cmd = [
515
+ self.fastq_dump_path,
516
+ input_file,
517
+ "--outdir", str(output_dir),
518
+ "--skip-technical",
519
+ "--readids",
520
+ "--dumpbase",
521
+ "--clip",
522
+ ]
523
+
524
+ if split_files:
525
+ cmd.append("--split-files")
526
+
527
+ if gzip_output:
528
+ cmd.append("--gzip")
529
+
530
+ try:
531
+ print(f" 运行fastq-dump: {' '.join(cmd)}")
532
+ result = subprocess.run(
533
+ cmd,
534
+ capture_output=True,
535
+ text=True,
536
+ timeout=300, # 5分钟超时(.sra文件已本地存在)
537
+ check=True
538
+ )
539
+
540
+ print(f" fastq-dump完成")
541
+ return self._check_output_files(output_dir, Path(input_file).stem, split_files, gzip_output)
542
+
543
+ except subprocess.CalledProcessError as e:
544
+ return {'success': False, 'error': f'fastq-dump conversion failed: {e.stderr[:200]}'}
545
+ except Exception as e:
546
+ return {'success': False, 'error': f'fastq-dump error: {type(e).__name__}: {e}'}
547
+
548
+ def _gzip_files(self, output_dir):
549
+ """并行gzip文件(如果fastq-dump没有自动gzip)"""
550
+ import gzip
551
+ import shutil
552
+ from concurrent.futures import ThreadPoolExecutor
553
+
554
+ fastq_files = list(output_dir.glob("*.fastq"))
555
+
556
+ if not fastq_files:
557
+ return
558
+
559
+ print(f" 压缩 {len(fastq_files)} 个fastq文件...")
560
+
561
+ def compress_file(fastq_path):
562
+ gzip_path = fastq_path.with_suffix('.fastq.gz')
563
+
564
+ try:
565
+ with open(fastq_path, 'rb') as f_in:
566
+ with gzip.open(gzip_path, 'wb') as f_out:
567
+ shutil.copyfileobj(f_in, f_out)
568
+
569
+ # 删除原始文件
570
+ fastq_path.unlink()
571
+ return True
572
+ except Exception as e:
573
+ print(f" 压缩失败 {fastq_path.name}: {e}")
574
+ return False
575
+
576
+ # 并行压缩
577
+ with ThreadPoolExecutor(max_workers=4) as executor:
578
+ results = list(executor.map(compress_file, fastq_files))
579
+
580
+ success_count = sum(results)
581
+ print(f" 压缩完成: {success_count}/{len(fastq_files)} 成功")
582
+
583
+ def _check_output_files(self, output_dir, accession, split_files, gzip_output):
584
+ """检查输出文件"""
585
+ # 查找生成的文件
586
+ patterns = []
587
+ if gzip_output:
588
+ patterns.extend([f"{accession}*.fastq.gz", f"{accession}*.fq.gz"])
589
+ else:
590
+ patterns.extend([f"{accession}*.fastq", f"{accession}*.fq"])
591
+
592
+ files = []
593
+ for pattern in patterns:
594
+ files.extend(output_dir.glob(pattern))
595
+
596
+ files = [str(f) for f in files if f.exists() and f.stat().st_size > 0]
597
+
598
+ if files:
599
+ total_size = sum(Path(f).stat().st_size for f in files)
600
+ return {
601
+ 'success': True,
602
+ 'files': files,
603
+ 'file_count': len(files),
604
+ 'total_size_bytes': total_size,
605
+ 'total_size_mb': total_size / (1024 * 1024)
606
+ }
607
+ else:
608
+ return {'success': False, 'error': 'No output files found'}
609
+
610
+ def _format_result(self, accession, output_dir, result):
611
+ """格式化结果"""
612
+ return {
613
+ 'accession': accession,
614
+ 'success': True,
615
+ 'files': result.get('files', []),
616
+ 'file_count': result.get('file_count', 0),
617
+ 'total_size_mb': result.get('total_size_mb', 0),
618
+ 'output_dir': str(output_dir),
619
+ 'method': 'fastq-dump'
620
+ }
621
+
622
+ # 测试使用
623
+ def test_fastq_dump_downloader():
624
+ """测试fastq-dump下载器"""
625
+ print("测试fastq-dump下载器")
626
+ print("=" * 60)
627
+
628
+ downloader = FastqDumpDownloader(cache_dir="./fastqdump_test")
629
+
630
+ # 测试小文件
631
+ result = downloader.download_with_fastq_dump(
632
+ accession="SRR390728", # 小文件,约1MB
633
+ output_dir="./test_output",
634
+ split_files=True,
635
+ gzip_output=True,
636
+ max_retries=2
637
+ )
638
+
639
+ print(f"\n结果:")
640
+ print(f" 成功: {result['success']}")
641
+ print(f" 文件数: {result.get('file_count', 0)}")
642
+ print(f" 总大小: {result.get('total_size_mb', 0):.2f} MB")
643
+
644
+ if result['success'] and result.get('files'):
645
+ print(f" 文件列表:")
646
+ for filepath in result['files']:
647
+ size_mb = Path(filepath).stat().st_size / (1024 * 1024)
648
+ print(f" - {Path(filepath).name} ({size_mb:.2f} MB)")
649
+
650
+ return result
651
+
652
+ # test_fastq_dump_downloader()
653
+
654
+ class SRADownloader:
655
+ """
656
+ 独立的SRA数据下载器,不依赖pysradb
657
+ 直接使用ENA和NCBI API
658
+ """
659
+
660
+ def __init__(self, cache_dir: str = "./sra_data", max_workers: int = 4):
661
+ self.cache_dir = Path(cache_dir)
662
+ self.cache_dir.mkdir(parents=True, exist_ok=True)
663
+ self.max_workers = max_workers
664
+ self.session = requests.Session()
665
+ self.session.headers.update({
666
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
667
+ })
668
+
669
+ def get_metadata(self, accession: str) -> Dict[str, Any]:
670
+ """
671
+ 从ENA获取SRA元数据
672
+ 参数可以是:SRR/ERR/DRR运行号,SRS样本号,SRX实验号
673
+ """
674
+ # 尝试不同API端点
675
+ endpoints = [
676
+ self._get_ena_metadata,
677
+ self._get_ncbi_metadata,
678
+ ]
679
+
680
+ for endpoint in endpoints:
681
+ try:
682
+ metadata = endpoint(accession)
683
+ if metadata:
684
+ return metadata
685
+ except Exception as e:
686
+ logger.debug(f"{endpoint.__name__} failed: {e}")
687
+
688
+ return {'error': f'无法获取 {accession} 的元数据'}
689
+
690
+ def _get_ena_metadata(self, accession: str) -> Dict[str, Any]:
691
+ """使用ENA API获取元数据"""
692
+ base_url = "https://www.ebi.ac.uk/ena/portal/api/search"
693
+
694
+ # 根据accession类型确定结果类型
695
+ if accession.startswith(('SRR', 'ERR', 'DRR')):
696
+ result_type = 'read_run'
697
+ elif accession.startswith(('SRS', 'ERS', 'DRS')):
698
+ result_type = 'sample'
699
+ elif accession.startswith(('SRX', 'ERX', 'DRX')):
700
+ result_type = 'experiment'
701
+ else:
702
+ result_type = 'read_run' # 默认
703
+
704
+ fields = [
705
+ 'accession', 'secondary_sample_accession', 'run_accession',
706
+ 'experiment_accession', 'study_accession', 'submission_accession',
707
+ 'instrument_platform', 'instrument_model', 'library_layout',
708
+ 'library_selection', 'library_source', 'library_strategy',
709
+ 'read_count', 'base_count', 'sample_alias', 'sample_title',
710
+ 'experiment_title', 'study_title', 'fastq_ftp', 'submitted_ftp',
711
+ 'sra_ftp', 'first_public', 'last_updated'
712
+ ]
713
+
714
+ params = {
715
+ 'result': result_type,
716
+ 'query': f'accession="{accession}" OR run_accession="{accession}"',
717
+ 'fields': ','.join(fields),
718
+ 'format': 'json',
719
+ 'limit': 1
720
+ }
721
+
722
+ try:
723
+ response = self.session.get(base_url, params=params, timeout=30)
724
+ response.raise_for_status()
725
+
726
+ data = response.json()
727
+ if data and isinstance(data, list) and len(data) > 0:
728
+ return data[0]
729
+ except Exception as e:
730
+ logger.error(f"ENA metadata API error: {e}")
731
+
732
+ return {}
733
+
734
+ def _get_ncbi_metadata(self, accession: str) -> Dict[str, Any]:
735
+ """使用NCBI API获取元数据(备用)"""
736
+ # Entrez API
737
+ base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
738
+
739
+ # 搜索
740
+ search_params = {
741
+ 'db': 'sra',
742
+ 'term': f'{accession}[Accession]',
743
+ 'retmax': 1,
744
+ 'retmode': 'json'
745
+ }
746
+
747
+ try:
748
+ search_response = self.session.get(base_url + "esearch.fcgi", params=search_params)
749
+ search_data = search_response.json()
750
+
751
+ ids = search_data.get('esearchresult', {}).get('idlist', [])
752
+ if not ids:
753
+ return {}
754
+
755
+ # 获取摘要
756
+ summary_params = {
757
+ 'db': 'sra',
758
+ 'id': ids[0],
759
+ 'retmode': 'json'
760
+ }
761
+
762
+ summary_response = self.session.get(base_url + "esummary.fcgi", params=summary_params)
763
+ summary_data = summary_response.json()
764
+
765
+ result = summary_data.get('result', {}).get(ids[0], {})
766
+
767
+ # 转换为标准格式
768
+ metadata = {
769
+ 'accession': accession,
770
+ 'title': result.get('title', ''),
771
+ 'organism': result.get('organism', ''),
772
+ 'platform': result.get('platform', ''),
773
+ 'library_strategy': result.get('librarystrategy', ''),
774
+ 'library_source': result.get('librarysource', ''),
775
+ 'library_selection': result.get('libraryselection', ''),
776
+ 'instrument': result.get('instrument', ''),
777
+ }
778
+
779
+ return metadata
780
+
781
+ except Exception as e:
782
+ logger.error(f"NCBI metadata API error: {e}")
783
+ return {}
784
+
785
+ def get_fastq_links(self, accession: str) -> List[str]:
786
+ """获取FASTQ下载链接"""
787
+ metadata = self.get_metadata(accession)
788
+
789
+ links = []
790
+
791
+ # 从元数据中提取FASTQ链接
792
+ for field in ['fastq_ftp', 'submitted_ftp', 'sra_ftp']:
793
+ if field in metadata and metadata[field]:
794
+ ftp_links = str(metadata[field]).split(';')
795
+ for link in ftp_links:
796
+ link = link.strip()
797
+ if link:
798
+ if not link.startswith(('http://', 'https://', 'ftp://')):
799
+ link = f"ftp://{link}"
800
+ links.append(link)
801
+
802
+ # 如果没有找到链接,生成默认链接
803
+ if not links:
804
+ links = self._generate_default_links(accession)
805
+
806
+ return list(set(links)) # 去重
807
+
808
+ def _generate_default_links(self, accession: str) -> List[str]:
809
+ """生成默认的ENA FTP链接"""
810
+ links = []
811
+
812
+ # ENA标准FTP路径模式
813
+ # ftp://ftp.sra.ebi.ac.uk/vol1/fastq/XXXnnn/XXXnnnXXX/
814
+
815
+ if accession.startswith(('SRR', 'ERR', 'DRR')):
816
+ # 提取前6位
817
+ prefix = accession[:6]
818
+ # 尝试不同路径模式
819
+ patterns = [
820
+ f"ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{prefix}/{accession}/{accession}.fastq.gz",
821
+ f"ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{prefix}/{accession}/{accession}_1.fastq.gz",
822
+ f"ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{prefix}/{accession}/{accession}_2.fastq.gz",
823
+ f"ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{prefix}/00{accession[-1]}/{accession}/{accession}.fastq.gz",
824
+ ]
825
+ links.extend(patterns)
826
+
827
+ return links
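The fallback above just assembles candidate ENA FTP paths from the run accession itself; a minimal sketch of the URL shapes it tries (illustrative only, reusing the run accession from the module docstring):

    accession = "SRR1635435"
    prefix = accession[:6]  # "SRR163"
    print(f"ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{prefix}/{accession}/{accession}.fastq.gz")
    print(f"ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{prefix}/00{accession[-1]}/{accession}/{accession}.fastq.gz")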
828
+
829
+ def download_fastq(self,
830
+ accession: str,
831
+ output_dir: Optional[Path] = None,
832
+ max_files: int = 10) -> Dict[str, Any]:
833
+ """下载FASTQ文件"""
834
+ if output_dir is None:
835
+ output_dir = self.cache_dir / accession
836
+ else:
837
+ output_dir = Path(output_dir) / accession
838
+
839
+ output_dir.mkdir(parents=True, exist_ok=True)
840
+
841
+ # 获取下载链接
842
+ links = self.get_fastq_links(accession)
843
+
844
+ if not links:
845
+ return {
846
+ 'accession': accession,
847
+ 'success': False,
848
+ 'error': 'No download links found',
849
+ 'files': []
850
+ }
851
+
852
+ logger.info(f"Found {len(links)} download links for {accession}")
853
+
854
+ # 限制下载文件数量
855
+ if len(links) > max_files:
856
+ logger.info(f"Limiting to {max_files} files")
857
+ links = links[:max_files]
858
+
859
+ # 并行下载
860
+ downloaded_files = []
861
+
862
+ with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
863
+ # 提交下载任务
864
+ future_to_url = {
865
+ executor.submit(self._download_file, url, output_dir): url
866
+ for url in links
867
+ }
868
+
869
+ # 使用进度条
870
+ for future in tqdm(as_completed(future_to_url),
871
+ total=len(links),
872
+ desc=f"Downloading {accession}"):
873
+ url = future_to_url[future]
874
+ try:
875
+ result = future.result(timeout=300)
876
+ if result['success']:
877
+ downloaded_files.append(result['filepath'])
878
+ else:
879
+ logger.error(f"Failed to download {url}: {result.get('error')}")
880
+ except Exception as e:
881
+ logger.error(f"Download task failed for {url}: {e}")
882
+
883
+ return {
884
+ 'accession': accession,
885
+ 'success': len(downloaded_files) > 0,
886
+ 'files': downloaded_files,
887
+ 'output_dir': str(output_dir),
888
+ 'metadata': self.get_metadata(accession)
889
+ }
890
+
891
+ def _download_file(self, url: str, output_dir: Path) -> Dict[str, Any]:
892
+ """下载单个文件"""
893
+ filename = self._extract_filename(url)
894
+ filepath = output_dir / filename
895
+
896
+ # 检查文件是否已存在
897
+ if filepath.exists():
898
+ file_size = filepath.stat().st_size
899
+ if file_size > 1024: # 大于1KB认为文件完整
900
+ logger.debug(f"File already exists: {filepath}")
901
+ return {
902
+ 'success': True,
903
+ 'filepath': str(filepath),
904
+ 'size': file_size,
905
+ 'cached': True
906
+ }
907
+
908
+ try:
909
+ # 根据URL协议选择下载方法
910
+ if url.startswith('ftp://'):
911
+ result = self._download_ftp(url, filepath)
912
+ elif url.startswith('http'):
913
+ result = self._download_http(url, filepath)
914
+ else:
915
+ result = {'success': False, 'error': f'Unsupported protocol: {url}'}
916
+
917
+ if result['success']:
918
+ logger.info(f"Downloaded: {filename}")
919
+
920
+ return result
921
+
922
+ except Exception as e:
923
+ logger.error(f"Download failed for {url}: {e}")
924
+ return {'success': False, 'error': str(e)}
925
+
926
+ def _extract_filename(self, url: str) -> str:
927
+ """从URL提取文件名"""
928
+ # 移除查询参数
929
+ if '?' in url:
930
+ url = url.split('?')[0]
931
+
932
+ # 获取最后一部分作为文件名
933
+ filename = url.split('/')[-1]
934
+
935
+ # 如果文件名为空,使用默认名
936
+ if not filename or filename.endswith('/'):
937
+ return "unknown_file.fastq.gz"
938
+
939
+ return filename
940
+
941
+ def _download_http(self, url: str, filepath: Path) -> Dict[str, Any]:
942
+ """下载HTTP/HTTPS文件"""
943
+ try:
944
+ response = self.session.get(url, stream=True, timeout=60)
945
+ response.raise_for_status()
946
+
947
+ total_size = int(response.headers.get('content-length', 0))
948
+
949
+ with open(filepath, 'wb') as f:
950
+ downloaded = 0
951
+ for chunk in response.iter_content(chunk_size=8192):
952
+ if chunk:
953
+ f.write(chunk)
954
+ downloaded += len(chunk)
955
+
956
+ actual_size = filepath.stat().st_size
957
+
958
+ return {
959
+ 'success': True,
960
+ 'filepath': str(filepath),
961
+ 'size': actual_size,
962
+ 'expected_size': total_size
963
+ }
964
+
965
+ except Exception as e:
966
+ return {'success': False, 'error': f'HTTP download failed: {e}'}
967
+
968
+ def _download_ftp(self, url: str, filepath: Path) -> Dict[str, Any]:
969
+ """下载FTP文件"""
970
+ import ftplib
971
+ from urllib.parse import urlparse
972
+
973
+ try:
974
+ # 解析FTP URL
975
+ parsed = urlparse(url)
976
+ hostname = parsed.hostname
977
+ path = parsed.path
978
+
979
+ if not hostname:
980
+ return {'success': False, 'error': 'Invalid FTP URL'}
981
+
982
+ # 连接FTP服务器
983
+ ftp = ftplib.FTP(hostname, timeout=30)
984
+ ftp.login() # 匿名登录
985
+
986
+ # 提取目录和文件名
987
+ if '/' in path:
988
+ dir_path = '/'.join(path.split('/')[:-1]) or '/'
989
+ filename = path.split('/')[-1]
990
+ else:
991
+ dir_path = '/'
992
+ filename = path
993
+
994
+ # 切换到目录
995
+ if dir_path != '/':
996
+ try:
997
+ ftp.cwd(dir_path)
998
+ except:
999
+ # 如果目录不存在,尝试创建路径
1000
+ pass
1001
+
1002
+ # 获取文件大小
1003
+ try:
1004
+ ftp.sendcmd("TYPE I") # 二进制模式
1005
+ file_size = ftp.size(filename)
1006
+ except:
1007
+ file_size = 0
1008
+
1009
+ # 下载文件
1010
+ with open(filepath, 'wb') as f:
1011
+ ftp.retrbinary(f"RETR {filename}", f.write)
1012
+
1013
+ ftp.quit()
1014
+
1015
+ actual_size = filepath.stat().st_size
1016
+
1017
+ return {
1018
+ 'success': True,
1019
+ 'filepath': str(filepath),
1020
+ 'size': actual_size,
1021
+ 'expected_size': file_size
1022
+ }
1023
+
1024
+ except Exception as e:
1025
+ return {'success': False, 'error': f'FTP download failed: {e}'}
1026
+
1027
+ def batch_download(self,
1028
+ accessions: List[str],
1029
+ output_dir: Optional[Path] = None) -> Dict[str, Any]:
1030
+ """批量下载多个accession"""
1031
+ results = {}
1032
+
1033
+ for accession in tqdm(accessions, desc="Processing accessions"):
1034
+ try:
1035
+ result = self.download_fastq(accession, output_dir)
1036
+ results[accession] = result
1037
+ except Exception as e:
1038
+ results[accession] = {
1039
+ 'accession': accession,
1040
+ 'success': False,
1041
+ 'error': str(e)
1042
+ }
1043
+
1044
+ # 统计
1045
+ total = len(accessions)
1046
+ successful = sum(1 for r in results.values() if r.get('success', False))
1047
+
1048
+ return {
1049
+ 'total': total,
1050
+ 'successful': successful,
1051
+ 'failed': total - successful,
1052
+ 'results': results
1053
+ }
+ @dataclass
+ class DatasetConfig:
+     """Dataset configuration"""
+     dataset_id: str
+     data_type: DataSource
+     data_format: DataFormat
+     organism: Optional[str] = None
+     platform: Optional[str] = None
+     samples: Optional[List[str]] = None
+     force_download: bool = False
+     custom_params: Dict[str, Any] = field(default_factory=dict)
+
+     @classmethod
+     def from_accession(cls, accession: str, **kwargs) -> 'DatasetConfig':
+         """Build a configuration from an accession."""
+         data_type = DataSource.from_accession(accession)
+         data_format = DataFormat.infer_format(data_type, **kwargs)
+
+         return cls(
+             dataset_id=accession,
+             data_type=data_type,
+             data_format=data_format,
+             organism=kwargs.get('organism'),
+             platform=kwargs.get('platform'),
+             samples=kwargs.get('samples'),
+             force_download=kwargs.get('force_download', False),
+             custom_params={k: v for k, v in kwargs.items()
+                            if k not in ['dataset_id', 'organism', 'platform',
+                                         'samples', 'force_download']}
+         )
+
+ class CacheManager:
+     """Cache manager"""
+
+     def __init__(self, cache_dir: Path):
+         self.cache_dir = cache_dir
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         self.metadata_file = cache_dir / "cache_metadata.json"
+         self._load_metadata()
+
+     def _load_metadata(self):
+         """Load the cache metadata."""
+         if self.metadata_file.exists():
+             with open(self.metadata_file, 'r') as f:
+                 self.metadata = json.load(f)
+         else:
+             self.metadata = {}
+
+     def _save_metadata(self):
+         """Persist the cache metadata."""
+         with open(self.metadata_file, 'w') as f:
+             json.dump(self.metadata, f, indent=2)
+
+     def get_cache_key(self, config: DatasetConfig) -> str:
+         """Build the cache key."""
+         key_parts = [
+             config.dataset_id,
+             config.data_type.value,
+             config.data_format.value,
+             config.organism or 'any',
+             config.platform or 'any',
+             str(sorted(config.samples)) if config.samples else 'all'
+         ]
+         key_string = '|'.join(key_parts)
+         return hashlib.md5(key_string.encode()).hexdigest()
+
+     def get_cache_path(self, config: DatasetConfig) -> Path:
+         """Resolve the cache file path."""
+         cache_key = self.get_cache_key(config)
+         cache_dir = self.cache_dir / config.data_type.value
+         cache_dir.mkdir(exist_ok=True)
+         return cache_dir / f"{cache_key}.pkl"
+
+     def exists(self, config: DatasetConfig) -> bool:
+         """Check whether a cache entry exists."""
+         cache_path = self.get_cache_path(config)
+         return cache_path.exists()
+
+     def load(self, config: DatasetConfig) -> Optional[Any]:
+         """Load data from the cache."""
+         cache_path = self.get_cache_path(config)
+
+         if cache_path.exists():
+             try:
+                 with open(cache_path, 'rb') as f:
+                     data = pickle.load(f)
+
+                 # Refresh the access time
+                 cache_key = self.get_cache_key(config)
+                 self.metadata[cache_key] = {
+                     'last_accessed': datetime.now().isoformat(),
+                     'dataset_id': config.dataset_id,
+                     'data_type': config.data_type.value,
+                     'data_format': config.data_format.value
+                 }
+                 self._save_metadata()
+
+                 logger.info(f"Loaded from cache: {cache_path}")
+                 return data
+             except Exception as e:
+                 logger.warning(f"Failed to load cache: {e}")
+
+         return None
+
+     def save(self, config: DatasetConfig, data: Any):
+         """Save data to the cache."""
+         cache_path = self.get_cache_path(config)
+
+         try:
+             with open(cache_path, 'wb') as f:
+                 pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+             # Update the metadata
+             cache_key = self.get_cache_key(config)
+             self.metadata[cache_key] = {
+                 'created': datetime.now().isoformat(),
+                 'last_accessed': datetime.now().isoformat(),
+                 'dataset_id': config.dataset_id,
+                 'data_type': config.data_type.value,
+                 'data_format': config.data_format.value,
+                 'size': cache_path.stat().st_size if cache_path.exists() else 0
+             }
+             self._save_metadata()
+
+             logger.info(f"Saved to cache: {cache_path}")
+         except Exception as e:
+             logger.error(f"Failed to save cache: {e}")
+
+     def clear_cache(self, data_type: Optional[str] = None, older_than_days: Optional[int] = None):
+         """Remove cache entries, optionally filtered by data type or age."""
+         cache_files = list(self.cache_dir.rglob("*.pkl"))
+
+         for cache_file in cache_files:
+             try:
+                 if data_type and data_type not in str(cache_file):
+                     continue
+
+                 if older_than_days:
+                     file_age = datetime.now().timestamp() - cache_file.stat().st_mtime
+                     if file_age < older_than_days * 86400:
+                         continue
+
+                 cache_file.unlink()
+                 logger.info(f"Removed cache: {cache_file}")
+             except Exception as e:
+                 logger.error(f"Failed to remove cache {cache_file}: {e}")
+
+         self._load_metadata()  # reload the metadata
+
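A small sketch of how DatasetConfig and CacheManager fit together before BioDataFetcher wires them up (illustrative only; the cache directory is an arbitrary example):

    from pathlib import Path
    from py2ls.utils_bio import DatasetConfig, CacheManager

    config = DatasetConfig.from_accession("SRR1635435", download_fastq=True)
    print(config.data_type, config.data_format)  # DataSource.SRA DataFormat.FASTQ

    cache = CacheManager(Path("./bio_data_cache"))
    print(cache.exists(config))          # False until save() has been called
    print(cache.get_cache_path(config))  # bio_data_cache/sra/<md5-of-key>.pkl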
+ class BioDataFetcher:
+     """
+     Ultimate bioinformatics data fetcher.
+     Supports multiple data sources, automatic type inference, smart caching and parallel downloads.
+     """
+
+     def __init__(self, dir_save: str = "./bio_data_cache",
+                  config_file: Optional[str] = None,
+                  auto_infer: bool = True,
+                  prefer_fastq_dump: bool = True):
+         """
+         Initialize the data fetcher.
+
+         Parameters:
+         -----------
+         dir_save : str
+             Data cache directory
+         config_file : str
+             Path to a configuration file (YAML or JSON)
+         auto_infer : bool
+             Whether to enable automatic type inference
+         prefer_fastq_dump : bool
+             Whether to prefer fastq-dump for SRA downloads
+         """
+         self.dir_save = Path(dir_save)
+         self.auto_infer = auto_infer
+         self.prefer_fastq_dump = prefer_fastq_dump
+         # Initialize the cache manager
+         self.cache = CacheManager(self.dir_save)
+
+         # Load the configuration
+         self.config = self._load_config(config_file)
+
+         # Data source API clients
+         self.sra_client = None
+         self.mygene_client = None
+         self._init_clients()
+         # Check whether fastq-dump is available
+         self.fastq_dump_available = self._check_fastq_dump_available()
+
+         # Data source processor map - string keys keep lookups consistent
+         self.data_processors = {
+             'geo': self._process_geo,
+             'sra': self._process_sra,
+             'tcga': self._process_tcga,
+             'encode': self._process_encode,
+             'arrayexpress': self._process_array_express,
+             'single_cell': self._process_single_cell,
+             'custom': self._process_custom,
+             # aliases under the enum keys as well
+             DataSource.GEO: self._process_geo,
+             DataSource.SRA: self._process_sra,
+             DataSource.TCGA: self._process_tcga,
+             DataSource.ENCODE: self._process_encode,
+             DataSource.ARRAY_EXPRESS: self._process_array_express,
+             DataSource.SINGLE_CELL: self._process_single_cell,
+             DataSource.CUSTOM: self._process_custom,
+         }
+
1262
+         # Registered database API endpoints
1263
+ self.database_apis = {
1264
+ 'ncbi': {
1265
+ 'base_url': 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/',
1266
+ 'formats': ['xml', 'json'],
1267
+ 'rate_limit': 3
1268
+ },
1269
+ 'ena': {
1270
+ 'base_url': 'https://www.ebi.ac.uk/ena/portal/api/',
1271
+ 'formats': ['json'],
1272
+ 'rate_limit': 10
1273
+ },
1274
+ 'gdc': {
1275
+ 'base_url': 'https://api.gdc.cancer.gov/',
1276
+ 'formats': ['json'],
1277
+ 'rate_limit': 5
1278
+ },
1279
+ 'encode': {
1280
+ 'base_url': 'https://www.encodeproject.org/',
1281
+ 'formats': ['json'],
1282
+ 'rate_limit': 5
1283
+ }
1284
+ }
1285
+
1286
+ logger.info(f"BioDataFetcher initialized with cache dir: {self.dir_save}")
1287
+ if self.fastq_dump_available and self.prefer_fastq_dump:
1288
+ logger.info("fastq-dump available, will use it for SRA downloads")
+     def _check_fastq_dump_available(self) -> bool:
+         """Check whether fastq-dump is available."""
+         import shutil
+
+         # Look for the main tools
+         tools_to_check = ['fastq-dump', 'prefetch']
+         available_tools = []
+
+         for tool in tools_to_check:
+             path = shutil.which(tool)
+             if path:
+                 available_tools.append((tool, path))
+                 logger.debug(f"{tool} found: {path}")
+             else:
+                 logger.debug(f"{tool} not found in PATH")
+
+         if len(available_tools) >= 1:  # at least fastq-dump is required
+             logger.info(f"fastq-dump tools available: {[t[0] for t in available_tools]}")
+             return True
+         else:
+             install_fastq_dump_helper()
+             logger.warning("fastq-dump not available. SRA downloads may use FTP fallback.")
+             return False
1312
+
+     def _load_config(self, config_file: Optional[str]) -> Dict:
+         """Load the configuration file."""
+         default_config = {
+             'max_retries': 3,
+             'timeout': 30,
+             'batch_size': 10,
+             'prefer_cached': True,
+             'download_fastq': False,
+             'parallel_downloads': 4,
+             'ncbi_api_key': None,
+             'ensembl_api_key': None,
+             'max_cache_size_gb': 10,
+             'auto_normalize': True,
+             'gene_id_conversion': True,
+             'quality_control': True,
+             'prefer_fastq_dump': True,        # prefer fastq-dump for SRA downloads
+             'fastq_dump_split_files': True,   # split paired-end files
+             'fastq_dump_gzip_output': True,   # gzip the output
+             'fastq_dump_use_prefetch': True,  # use prefetch first
+             'fastq_dump_threads': 4,          # number of threads
+             'fastq_dump_max_retries': 2,      # maximum retries
+         }
+
+         if config_file and Path(config_file).exists():
+             try:
+                 with open(config_file, 'r') as f:
+                     if config_file.endswith('.yaml') or config_file.endswith('.yml'):
+                         user_config = yaml.safe_load(f)
+                     elif config_file.endswith('.json'):
+                         user_config = json.load(f)
+                     else:
+                         logger.warning(f"Unsupported config file format: {config_file}")
+                         return default_config
+
+                 # Merge the user configuration over the defaults
+                 default_config.update(user_config)
+             except Exception as e:
+                 logger.error(f"Error loading config file: {e}")
+
+         return default_config
+
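A minimal sketch of a config file driving the defaults above, written and loaded from Python (illustrative only; the override values are arbitrary and use only keys that appear in default_config):

    import yaml

    overrides = {
        "parallel_downloads": 8,
        "prefer_fastq_dump": False,   # fall back to FTP for SRA
        "fastq_dump_gzip_output": True,
    }
    with open("./config.yaml", "w") as f:
        yaml.safe_dump(overrides, f)

    fetcher = BioDataFetcher(dir_save="./my_cache", config_file="./config.yaml")
    print(fetcher.config["parallel_downloads"])  # 8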
1354
+ def _init_clients(self):
1355
+ """初始化API客户端"""
1356
+ if SRADB_AVAILABLE:
1357
+ try:
1358
+ self.sra_client = SRAweb()
1359
+ logger.info("SRAweb client initialized")
1360
+ except Exception as e:
1361
+ logger.warning(f"Failed to initialize SRAweb client: {e}")
1362
+
1363
+ if MYGENE_AVAILABLE:
1364
+ try:
1365
+ self.mygene_client = mygene.MyGeneInfo()
1366
+ logger.info("MyGene client initialized")
1367
+ except Exception as e:
1368
+ logger.warning(f"Failed to initialize MyGene client: {e}")
1369
+
+     def fetch_data(self,
+                    dataset_ids: Union[str, List[str]],
+                    data_type: Optional[str] = None,
+                    data_format: Optional[str] = None,
+                    organism: Optional[str] = None,
+                    platform: Optional[str] = None,
+                    samples: Optional[List[str]] = None,
+                    force_download: bool = False,
+                    **kwargs) -> Dict[str, Any]:
+         """
+         General-purpose data retrieval (smart version).
+
+         Parameters:
+         -----------
+         dataset_ids : Union[str, List[str]]
+             Dataset ID or list of IDs
+         data_type : Optional[str]
+             Data type; inferred automatically when not given
+         data_format : Optional[str]
+             Data format; inferred automatically when not given
+         organism : Optional[str]
+             Organism
+         platform : Optional[str]
+             Platform type
+         samples : Optional[List[str]]
+             List of sample IDs to keep
+         force_download : bool
+             Force a fresh download, ignoring the cache
+
+         Returns:
+         --------
+         Dict[str, Any]: dictionary with the data and its metadata
+         """
+         if isinstance(dataset_ids, str):
+             dataset_ids = [dataset_ids]
+
+         results = {}
+
+         for dataset_id in dataset_ids:
+             try:
+                 # Infer the data type automatically
+                 inferred_type = data_type or self._infer_data_type(dataset_id)
+
+                 # Build the dataset configuration
+                 config = DatasetConfig(
+                     dataset_id=dataset_id,
+                     data_type=DataSource(inferred_type),
+                     data_format=DataFormat(data_format or 'expression'),
+                     organism=organism,
+                     platform=platform,
+                     samples=samples,
+                     force_download=force_download,
+                     custom_params=kwargs
+                 )
+
+                 # Fetch the data
+                 result = self._fetch_with_config(config)
+                 results[dataset_id] = result
+
+             except Exception as e:
+                 logger.error(f"Failed to fetch data for {dataset_id}: {e}")
+                 results[dataset_id] = {
+                     'error': str(e),
+                     'traceback': self._format_exception(e)
+                 }
+
+         # Record the download history
+         self._record_download_history(dataset_ids)
+
+         return results
+
+     def _infer_data_type(self, dataset_id: str) -> str:
+         """Infer the data type from the dataset ID."""
+         if self.auto_infer:
+             return DataSource.from_accession(dataset_id).value
+
+         # Heuristic rules
+         dataset_id = dataset_id.upper()
+
+         # GEO series
+         if dataset_id.startswith('GSE') or dataset_id.startswith('GDS'):
+             return 'geo'
+
+         # SRA runs
+         elif dataset_id.startswith(('SRR', 'ERR', 'DRR')):
+             return 'sra'
+
+         # TCGA projects
+         elif dataset_id.startswith('TCGA'):
+             return 'tcga'
+
+         # ENCODE experiments
+         elif dataset_id.startswith('ENC'):
+             return 'encode'
+
+         # ArrayExpress
+         elif re.match(r'^E-[A-Z]{4}-\d+$', dataset_id):
+             return 'arrayexpress'
+
+         # Default to GEO
+         else:
+             return 'geo'
+
+     def _fetch_with_config(self, config: DatasetConfig) -> Any:
+         """Fetch data according to a configuration."""
+         dataset_id = config.dataset_id
+
+         # Check the cache first
+         if not config.force_download and self.config['prefer_cached']:
+             cached_data = self.cache.load(config)
+             if cached_data is not None:
+                 logger.info(f"Using cached data for {dataset_id}")
+                 return cached_data
+
+         logger.info(f"Fetching data for {dataset_id} [{config.data_type.value}/{config.data_format.value}]")
+
+         # Pick the processor for the data type;
+         # convert the enum to its string key for the lookup
+         data_type_key = config.data_type.value if isinstance(config.data_type, DataSource) else config.data_type
+
+         processor = self.data_processors.get(data_type_key)
+         if not processor:
+             # If the direct lookup fails, try the enum key
+             if isinstance(config.data_type, DataSource):
+                 processor = self.data_processors.get(config.data_type)
+
+             if not processor:
+                 # Finally, try converting the string to the enum
+                 try:
+                     enum_type = DataSource(data_type_key)
+                     processor = self.data_processors.get(enum_type)
+                 except:
+                     pass
+
+         if not processor:
+             raise ValueError(f"No processor for data type: {config.data_type} (key: {data_type_key})")
+
+         # Fetch the data
+         data = processor(config)
+
+         # Post-processing
+         data = self._post_process(data, config)
+
+         # Save to the cache
+         if not config.force_download:
+             self.cache.save(config, data)
+
+         return data
+
1522
+ def _process_geo(self, config: DatasetConfig) -> Any:
1523
+ """处理GEO数据"""
1524
+ if not GEO_UTILS_AVAILABLE:
1525
+ raise ImportError("GEO utilities not available")
1526
+
1527
+ # 使用现有的GEO函数
1528
+ geo_data = geo_utils.load_geo(
1529
+ datasets=config.dataset_id,
1530
+ dir_save=str(self.dir_save / "geo"),
1531
+ verbose=config.custom_params.get('verbose', False)
1532
+ )
1533
+
1534
+ # 根据格式提取数据
1535
+ if config.data_format == DataFormat.EXPRESSION:
1536
+ data = geo_utils.get_data(
1537
+ geo=geo_data,
1538
+ dataset=config.dataset_id,
1539
+ verbose=config.custom_params.get('verbose', False)
1540
+ )
1541
+ elif config.data_format == DataFormat.METADATA:
1542
+ data = geo_utils.get_meta(
1543
+ geo=geo_data,
1544
+ dataset=config.dataset_id,
1545
+ verbose=config.custom_params.get('verbose', False)
1546
+ )
1547
+ elif config.data_format == DataFormat.PROBE:
1548
+ data = geo_utils.get_probe(
1549
+ geo=geo_data,
1550
+ dataset=config.dataset_id,
1551
+ platform_id=config.platform,
1552
+ verbose=config.custom_params.get('verbose', False)
1553
+ )
1554
+ else:
1555
+ raise ValueError(f"Unsupported GEO format: {config.data_format}")
1556
+
1557
+ # 过滤样本
1558
+ if config.samples:
1559
+ data = self._filter_samples(data, config.samples)
1560
+
1561
+ return data
+     def _process_sra(self, config: DatasetConfig) -> Any:
+         """
+         Smart SRA processor.
+         Prefers fastq-dump and falls back to FTP downloads when it fails.
+         """
+         dataset_id = config.dataset_id
+
+         if config.data_format == DataFormat.METADATA:
+             # Metadata still goes through the original path
+             return self._process_sra_original(config)
+
+         elif config.data_format == DataFormat.FASTQ:
+             # FASTQ download: prefer fastq-dump
+             logger.info(f"Processing SRA FASTQ: {dataset_id}")
+
+             # Check whether a specific method is being forced
+             force_method = config.custom_params.get('download_method')
+
+             if force_method == 'fastq_dump' or (self.prefer_fastq_dump and self.fastq_dump_available and force_method != 'ftp'):
+                 # Try fastq-dump first
+                 logger.info(f"Attempting fastq-dump for {dataset_id}")
+                 result = self._download_with_fastq_dump(config)
+
+                 if result.get('success', False):
+                     logger.info(f"fastq-dump successful for {dataset_id}")
+                     return result
+                 else:
+                     logger.warning(f"fastq-dump failed for {dataset_id}: {result.get('error', 'unknown')}")
+
+                     # Fall back to FTP unless the user explicitly asked for fastq-dump
+                     if force_method != 'fastq_dump':
+                         logger.info(f"Falling back to FTP for {dataset_id}")
+                         return self._download_with_ftp(config)
+                     else:
+                         return result  # fastq-dump was requested explicitly, so return even on failure
+
+             else:
+                 # Use FTP
+                 logger.info(f"Using FTP for {dataset_id}")
+                 return self._download_with_ftp(config)
+
+         else:
+             raise ValueError(f"Unsupported SRA format: {config.data_format}")
1605
+ def _download_with_ftp(self, config: DatasetConfig) -> Dict[str, Any]:
1606
+ """使用FTP下载(回退方法)"""
1607
+ dataset_id = config.dataset_id
1608
+
1609
+ logger.info(f"Using FTP fallback for {dataset_id}")
1610
+
1611
+ # 使用原来的SRADownloader
1612
+ downloader = SRADownloader(
1613
+ cache_dir=str(self.dir_save / "fastq"),
1614
+ max_workers=config.custom_params.get('parallel_downloads', 4)
1615
+ )
1616
+
1617
+ result = downloader.download_fastq(
1618
+ dataset_id,
1619
+ output_dir=self.dir_save / "fastq",
1620
+ max_files=config.custom_params.get('max_files', 10)
1621
+ )
1622
+
1623
+ # 添加方法标记
1624
+ if isinstance(result, dict):
1625
+ result['download_method'] = 'ftp'
1626
+
1627
+ return result
1628
+ def _process_sra_original(self, config: DatasetConfig) -> Any:
1629
+ """处理SRA数据 - 使用独立的下载器"""
1630
+ dataset_id = config.dataset_id
1631
+
1632
+ if config.data_format == DataFormat.METADATA:
1633
+ # 使用独立的下载器获取元数据
1634
+ downloader = SRADownloader(cache_dir=str(self.dir_save / "sra"))
1635
+ metadata = downloader.get_metadata(dataset_id)
1636
+
1637
+ # 转换为DataFrame
1638
+ if isinstance(metadata, dict) and metadata:
1639
+ return pd.DataFrame([metadata])
1640
+ else:
1641
+ return pd.DataFrame()
1642
+
1643
+ elif config.data_format == DataFormat.FASTQ:
1644
+ # 使用独立的下载器下载FASTQ
1645
+ downloader = SRADownloader(
1646
+ cache_dir=str(self.dir_save / "fastq"),
1647
+ max_workers=config.custom_params.get('parallel_downloads', 4)
1648
+ )
1649
+
1650
+ result = downloader.download_fastq(
1651
+ dataset_id,
1652
+ output_dir=self.dir_save / "fastq",
1653
+ max_files=config.custom_params.get('max_files', 10)
1654
+ )
1655
+
1656
+ return result
1657
+
1658
+ else:
1659
+ raise ValueError(f"Unsupported SRA format: {config.data_format}")
1660
+
1661
+ def _download_sra_fastq(self, config: DatasetConfig) -> Dict:
1662
+ """下载SRA FASTQ文件"""
1663
+ import requests
1664
+ from concurrent.futures import ThreadPoolExecutor, as_completed
1665
+
1666
+ dataset_id = config.dataset_id
1667
+ output_dir = self.dir_save / "fastq" / dataset_id
1668
+ output_dir.mkdir(parents=True, exist_ok=True)
1669
+
1670
+ # 获取下载链接
1671
+ download_links = self._get_sra_download_links(dataset_id)
1672
+
1673
+ if not download_links:
1674
+ raise ValueError(f"No download links found for {dataset_id}")
1675
+
1676
+ logger.info(f"Found {len(download_links)} download links for {dataset_id}")
1677
+
1678
+ # 并行下载
1679
+ downloaded_files = []
1680
+ max_workers = config.custom_params.get('parallel_downloads',
1681
+ self.config['parallel_downloads'])
1682
+
1683
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
1684
+ future_to_url = {
1685
+ executor.submit(self._download_file, url, output_dir, config): url
1686
+ for url in download_links[:10] # 限制最多下载10个文件
1687
+ }
1688
+
1689
+ for future in tqdm(as_completed(future_to_url),
1690
+ total=len(future_to_url),
1691
+ desc=f"Downloading {dataset_id}"):
1692
+ url = future_to_url[future]
1693
+ try:
1694
+ filepath = future.result(timeout=300)
1695
+ if filepath:
1696
+ downloaded_files.append(str(filepath))
1697
+ except Exception as e:
1698
+ logger.error(f"Failed to download {url}: {e}")
1699
+
1700
+ return {
1701
+ 'metadata': self._get_sra_metadata(dataset_id),
1702
+ 'fastq_files': downloaded_files,
1703
+ 'output_dir': str(output_dir)
1704
+ }
1705
+
1706
+ def _get_sra_download_links(self, accession: str) -> List[str]:
1707
+ """获取SRA下载链接"""
1708
+ try:
1709
+ # 尝试ENA API
1710
+ ena_links = self._get_ena_download_links(accession)
1711
+ if ena_links:
1712
+ return ena_links
1713
+
1714
+ # 尝试NCBI
1715
+ ncbi_links = self._get_ncbi_download_links(accession)
1716
+ if ncbi_links:
1717
+ return ncbi_links
1718
+
1719
+ # 生成默认链接
1720
+ return self._generate_default_links(accession)
1721
+
1722
+ except Exception as e:
1723
+ logger.error(f"Failed to get download links for {accession}: {e}")
1724
+ return []
1725
+
1726
+ def _get_ena_download_links(self, accession: str) -> List[str]:
1727
+ """从ENA获取下载链接"""
1728
+ if not REQUESTS_AVAILABLE:
1729
+ return []
1730
+
1731
+ try:
1732
+ url = "https://www.ebi.ac.uk/ena/portal/api/filereport"
1733
+ params = {
1734
+ 'accession': accession,
1735
+ 'result': 'read_run',
1736
+ 'fields': 'fastq_ftp',
1737
+ 'format': 'json'
1738
+ }
1739
+
1740
+ response = requests.get(url, params=params, timeout=30)
1741
+ response.raise_for_status()
1742
+
1743
+ data = response.json()
1744
+ if data and isinstance(data, list):
1745
+ links = []
1746
+ for item in data:
1747
+ if 'fastq_ftp' in item and item['fastq_ftp']:
1748
+ ftp_links = str(item['fastq_ftp']).split(';')
1749
+ for link in ftp_links:
1750
+ link = link.strip()
1751
+ if link:
1752
+ links.append(f"ftp://{link}")
1753
+ return links
1754
+ except Exception as e:
1755
+ logger.debug(f"ENA API failed: {e}")
1756
+
1757
+ return []
1758
+
1759
+ def _get_sra_metadata(self, accession: str) -> pd.DataFrame:
1760
+ """获取SRA元数据"""
1761
+ if self.sra_client:
1762
+ return self.sra_client.search_sra(run_accession =accession, detailed=True)
1763
+ return pd.DataFrame()
1764
+
1765
+ def _download_with_fastq_dump(self, config: DatasetConfig) -> Dict[str, Any]:
+ """Download SRA data with fastq-dump."""
+ import subprocess
+ import shutil
+ import time
+
+ dataset_id = config.dataset_id
+
+ # Extract parameters
+ output_dir = self.dir_save / "fastq" / dataset_id
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ split_files = config.custom_params.get('split_files', True)
+ gzip_output = config.custom_params.get('gzip_output', True)
+ use_prefetch = config.custom_params.get('use_prefetch', True)
+ max_retries = config.custom_params.get('max_retries', 2)
+ threads = config.custom_params.get('threads', 4)
+
+ # Locate the SRA Toolkit binaries
1784
+ fastq_dump_path = shutil.which("fastq-dump")
1785
+ prefetch_path = shutil.which("prefetch")
1786
+ fasterq_dump_path = shutil.which("fasterq-dump")
1787
+ if not fastq_dump_path and not fasterq_dump_path:
1788
+ return {
1789
+ 'success': False,
1790
+ 'error': 'Neither fastq-dump nor fasterq-dump found in PATH',
1791
+ 'accession': dataset_id,
1792
+ }
1793
+
1794
+ logger.info(f"Downloading {dataset_id} with fastq-dump")
1795
+ logger.info(f" Output dir: {output_dir}")
1796
+ logger.info(f" Split files: {split_files}")
1797
+ logger.info(f" Gzip output: {gzip_output}")
1798
+
1799
+ results = {}
1800
+
1801
+ # Method 1: prefetch + fastq-dump (if available)
1802
+ if use_prefetch and prefetch_path:
1803
+ logger.info("Method 1: Using prefetch + fastq-dump")
1804
+ result = self._run_prefetch_fastq_dump(
1805
+ accession=dataset_id,
1806
+ fastq_dump_path=fastq_dump_path,
1807
+ prefetch_path=prefetch_path,
1808
+ output_dir=output_dir,
1809
+ split_files=split_files,
1810
+ gzip_output=gzip_output,
1811
+ threads=threads,
1812
+ max_retries=max_retries
1813
+ )
1814
+ results['prefetch_method'] = result
1815
+
1816
+ if result.get('success', False):
1817
+ return self._format_fastq_dump_result(dataset_id, output_dir, result, 'prefetch+fastq-dump')
1818
+
1819
+ # Method 2: fastq-dump directly
1820
+ logger.info("Method 2: Using fastq-dump directly")
1821
+ result = self._run_fastq_dump_direct(
1822
+ accession=dataset_id,
1823
+ fastq_dump_path=fastq_dump_path,
1824
+ output_dir=output_dir,
1825
+ split_files=split_files,
1826
+ gzip_output=gzip_output,
1827
+ threads=threads,
1828
+ max_retries=max_retries
1829
+ )
1830
+ results['direct_method'] = result
1831
+
1832
+ if result.get('success', False):
1833
+ return self._format_fastq_dump_result(dataset_id, output_dir, result, 'fastq-dump')
1834
+
1835
+ # Method 3: fasterq-dump (if available)
1836
+ if fasterq_dump_path:
1837
+ logger.info("Method 3: Using fasterq-dump")
1838
+ result = self._run_fasterq_dump(
1839
+ accession=dataset_id,
1840
+ fasterq_dump_path=fasterq_dump_path,
1841
+ output_dir=output_dir,
1842
+ split_files=split_files,
1843
+ gzip_output=gzip_output,
1844
+ threads=threads,
1845
+ max_retries=max_retries
1846
+ )
1847
+ results['fasterq_method'] = result
1848
+
1849
+ if result.get('success', False):
1850
+ return self._format_fastq_dump_result(dataset_id, output_dir, result, 'fasterq-dump')
1851
+
1852
+ # All methods failed
1853
+ logger.error(f"All fastq-dump methods failed for {dataset_id}")
1854
+ return {
1855
+ 'success': False,
1856
+ 'error': 'All fastq-dump methods failed',
1857
+ 'accession': dataset_id,
1858
+ 'results': results,
1859
+ 'method': 'fastq-dump'
1860
+ }
1861
+
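+ # Usage sketch (illustrative): the fastq-dump path reads these keys from
+ # config.custom_params and falls back from prefetch+fastq-dump, to fastq-dump,
+ # to fasterq-dump. Assumes the SRA Toolkit is on PATH and that `fetcher` is a
+ # BioDataFetcher instance; SRR390728 is just a small public test run.
+ fetcher = BioDataFetcher(dir_save="./bio_data_cache")
+ result = fetcher.fetch_data(
+     dataset_ids="SRR390728",
+     data_type="sra",
+     data_format="fastq",
+     split_files=True,      # one file per mate for paired-end runs
+     gzip_output=True,
+     use_prefetch=True,     # try prefetch + fastq-dump first
+     threads=4,
+ )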
1862
+ def _run_prefetch_fastq_dump(self, accession, fastq_dump_path, prefetch_path,
+ output_dir, split_files, gzip_output, threads, max_retries):
+ """Download the .sra file with prefetch, then convert it with fastq-dump."""
+
+ sra_dir = output_dir / "sra"
+ sra_dir.mkdir(exist_ok=True)
+ # Step 1: run prefetch
+ prefetch_cmd = [
+ prefetch_path,
+ accession,
+ "-O", str(sra_dir),
+ "--progress"
+ ]
+
+ try:
+ logger.info(f"Running prefetch: {' '.join(prefetch_cmd)}")
+
+ # Run prefetch
+ result = subprocess.run(
+ prefetch_cmd,
+ capture_output=True,
+ text=True,
+ timeout=300, # 5-minute timeout
+ check=False, # do not raise immediately on a non-zero exit code
+ )
1887
+
1888
+ # Log the output in detail
1889
+ logger.debug(f"prefetch return code: {result.returncode}")
1890
+ if result.stdout:
1891
+ logger.debug(f"prefetch stdout (last 500 chars): {result.stdout[-500:]}")
1892
+ if result.stderr:
1893
+ logger.error(f"prefetch stderr: {result.stderr}")
1894
+
1895
+ if result.returncode != 0:
1896
+ error_msg = f"prefetch failed with code {result.returncode}"
1897
+ if result.stderr:
1898
+ error_msg += f": {result.stderr[:200]}"
1899
+ return {'success': False, 'error': error_msg}
1900
+
1901
+ # Locate the downloaded .sra file
1902
+ sra_files = list(sra_dir.glob(f"**/{accession}.sra"))
1903
+ if not sra_files:
1904
+ sra_files = list(sra_dir.glob(f"**/*.sra"))
1905
+
1906
+ if not sra_files:
1907
+ # List the directory contents to aid debugging
1908
+ all_files = list(sra_dir.rglob("*"))
1909
+ file_list = [f"{f.name} ({f.stat().st_size} bytes)" for f in all_files if f.is_file()]
1910
+ logger.warning(f"No .sra files found. Directory contents: {file_list}")
1911
+ return {'success': False, 'error': f'No .sra file found. Files: {file_list}'}
1912
+
1913
+
1914
+ sra_file = sra_files[0]
1915
+ logger.info(f"Found .sra file: {sra_file.name} ({sra_file.stat().st_size/1024/1024:.1f} MB)")
1916
+
1917
+ # Step 2: convert with fastq-dump
1918
+ return self._run_fastq_dump_on_file(
1919
+ sra_file=sra_file,
1920
+ fastq_dump_path=fastq_dump_path,
1921
+ output_dir=output_dir,
1922
+ split_files=split_files,
1923
+ gzip_output=gzip_output,
1924
+ threads=threads
1925
+ )
1926
+
1927
+ except subprocess.TimeoutExpired:
1928
+ return {'success': False, 'error': 'prefetch timed out'}
1929
+ except Exception as e:
1930
+ import traceback
1931
+ error_details = traceback.format_exc()[:500]
1932
+ return {'success': False, 'error': f'prefetch error: {type(e).__name__}: {str(e)[:200]}\n{error_details}'}
1933
+
1934
+
1935
+ def _run_fastq_dump_direct(self, accession, fastq_dump_path, output_dir,
+ split_files, gzip_output, threads, max_retries):
+ """Download directly with fastq-dump."""
+ # Build the command
1939
+ cmd = [
1940
+ fastq_dump_path,
1941
+ accession,
1942
+ "--outdir", str(output_dir),
1943
+ "--skip-technical",
1944
+ "--readids",
1945
+ "--dumpbase",
1946
+ "--clip",
1947
+ "--read-filter", "pass",
1948
+ "--origfmt"
1949
+ ]
1950
+
1951
+ if split_files:
1952
+ cmd.append("--split-files")
1953
+
1954
+ if gzip_output:
1955
+ cmd.append("--gzip")
1956
+
1957
+ # Add thread support (if the installed version supports it)
1958
+ if threads > 1:
1959
+ cmd.extend(["--threads", str(threads)])
1960
+
1961
+ try:
+ logger.info(f"Running fastq-dump: {' '.join(cmd[:10])}...") # show only the first 10 arguments
+
+ result = subprocess.run(
+ cmd,
+ capture_output=True,
+ text=True,
+ timeout=300, # 5-minute timeout
+ check=False
1970
+ )
1971
+
1972
+ logger.debug(f"fastq-dump stdout: {result.stdout[-500:] if result.stdout else ''}")
1973
+ logger.debug(f"fastq-dump stderr: {result.stderr[-500:] if result.stderr else ''}")
1974
+
1975
+ # Check the output files
1976
+ return self._check_fastq_output(output_dir, accession, split_files, gzip_output)
1977
+
1978
+ except subprocess.TimeoutExpired:
1979
+ return {'success': False, 'error': 'fastq-dump timed out'}
1980
+ except subprocess.CalledProcessError as e:
1981
+ error_msg = e.stderr[:500] if e.stderr else str(e)
1982
+ return {'success': False, 'error': f'fastq-dump failed: {error_msg}'}
1983
+ except Exception as e:
1984
+ return {'success': False, 'error': f'fastq-dump error: {type(e).__name__}: {str(e)[:200]}'}
1985
+
1986
+ def _run_fasterq_dump(self, accession, fasterq_dump_path, output_dir,
+ split_files, gzip_output, threads, max_retries):
+ """Download with fasterq-dump."""
+ cmd = [
+ fasterq_dump_path,
+ accession,
+ "-O", str(output_dir),
+ "-e", str(threads),
+ "-p", # show progress
+ "-t", str(output_dir / "temp") # temporary directory
+ ]
1997
+
1998
+ if split_files:
1999
+ cmd.append("--split-files")
2000
+
2001
+ try:
2002
+ logger.info(f"Running fasterq-dump: {' '.join(cmd)}")
2003
+
2004
+ result = subprocess.run(
2005
+ cmd,
2006
+ capture_output=True,
2007
+ text=True,
2008
+ timeout=600, # 10-minute timeout
2009
+ check=True
2010
+ )
2011
+
2012
+ logger.debug(f"fasterq-dump stdout: {result.stdout[-500:] if result.stdout else ''}")
2013
+ logger.debug(f"fasterq-dump stderr: {result.stderr[-500:] if result.stderr else ''}")
2014
+
2015
+ # Compress the files if gzip output was requested
2016
+ if gzip_output:
2017
+ self._compress_fastq_files(output_dir)
2018
+
2019
+ return self._check_fastq_output(output_dir, accession, split_files, gzip_output)
2020
+
2021
+ except subprocess.TimeoutExpired:
2022
+ return {'success': False, 'error': 'fasterq-dump timed out'}
2023
+ except subprocess.CalledProcessError as e:
2024
+ error_msg = e.stderr[:500] if e.stderr else str(e)
2025
+ return {'success': False, 'error': f'fasterq-dump failed: {error_msg}'}
2026
+ except Exception as e:
2027
+ return {'success': False, 'error': f'fasterq-dump error: {type(e).__name__}: {str(e)[:200]}'}
2028
+
2029
+ def _run_fastq_dump_on_file(self, sra_file, fastq_dump_path, output_dir,
+ split_files, gzip_output, threads):
+ """Run fastq-dump on an existing .sra file."""
2032
+ cmd = [
2033
+ fastq_dump_path,
2034
+ str(sra_file),
2035
+ "--outdir", str(output_dir),
2036
+ "--skip-technical",
2037
+ "--readids",
2038
+ "--dumpbase",
2039
+ "--clip",
2040
+ "--read-filter", "pass",
2041
+ "--origfmt"
2042
+ ]
2043
+
2044
+ if split_files:
2045
+ cmd.append("--split-files")
2046
+
2047
+ if gzip_output:
2048
+ cmd.append("--gzip")
2049
+
2050
+ if threads > 1:
2051
+ cmd.extend(["--threads", str(threads)])
2052
+
2053
+ try:
2054
+ logger.info(f"Running fastq-dump on .sra file: {' '.join(cmd[:8])}...")
2055
+
2056
+ result = subprocess.run(
2057
+ cmd,
2058
+ capture_output=True,
2059
+ text=True,
2060
+ timeout=300, # 5-minute timeout (the file is already local)
2061
+ check=True
2062
+ )
2063
+
2064
+ return self._check_fastq_output(output_dir, sra_file.stem, split_files, gzip_output)
2065
+
2066
+ except subprocess.CalledProcessError as e:
2067
+ return {'success': False, 'error': f'fastq-dump conversion failed: {e.stderr[:200] if e.stderr else str(e)}'}
2068
+ except Exception as e:
2069
+ return {'success': False, 'error': f'fastq-dump error: {type(e).__name__}: {str(e)[:200]}'}
2070
+
2071
+ def _check_fastq_output(self, output_dir, accession, split_files, gzip_output):
+ """Check the FASTQ output files."""
+
+ # File name patterns to search for
+ if gzip_output:
+ patterns = [f"{accession}*.fastq.gz", f"{accession}*.fq.gz"]
+ else:
+ patterns = [f"{accession}*.fastq", f"{accession}*.fq"]
2080
+
2081
+ files = []
2082
+ for pattern in patterns:
2083
+ files.extend(output_dir.glob(pattern))
2084
+
2085
+ # Filter out empty files
2086
+ files = [str(f) for f in files if f.exists() and f.stat().st_size > 0]
2087
+
2088
+ if files:
2089
+ total_size = sum(Path(f).stat().st_size for f in files)
2090
+ return {
2091
+ 'success': True,
2092
+ 'files': files,
2093
+ 'file_count': len(files),
2094
+ 'total_size_bytes': total_size,
2095
+ 'total_size_mb': total_size / (1024 * 1024)
2096
+ }
2097
+ else:
2098
+ # Try other naming patterns
2099
+ all_fastq_files = list(output_dir.glob("*.fastq*"))
2100
+ if all_fastq_files:
2101
+ files = [str(f) for f in all_fastq_files if f.stat().st_size > 0]
2102
+ if files:
2103
+ total_size = sum(Path(f).stat().st_size for f in files)
2104
+ return {
2105
+ 'success': True,
2106
+ 'files': files,
2107
+ 'file_count': len(files),
2108
+ 'total_size_bytes': total_size,
2109
+ 'total_size_mb': total_size / (1024 * 1024),
2110
+ 'note': 'Files found with different naming pattern'
2111
+ }
2112
+
2113
+ return {'success': False, 'error': 'No output files found'}
2114
+
2115
+ def _compress_fastq_files(self, output_dir):
+ """Compress FASTQ files."""
+ import gzip
+ import shutil
+ from concurrent.futures import ThreadPoolExecutor
2120
+
2121
+ fastq_files = list(output_dir.glob("*.fastq"))
2122
+
2123
+ if not fastq_files:
2124
+ return
2125
+
2126
+ logger.info(f"Compressing {len(fastq_files)} fastq files...")
2127
+
2128
+ def compress_file(fastq_path):
2129
+ gzip_path = fastq_path.with_suffix('.fastq.gz')
2130
+
2131
+ try:
2132
+ with open(fastq_path, 'rb') as f_in:
2133
+ with gzip.open(gzip_path, 'wb') as f_out:
2134
+ shutil.copyfileobj(f_in, f_out)
2135
+
2136
+ # Remove the original file
2137
+ fastq_path.unlink()
2138
+ return True
2139
+ except Exception as e:
2140
+ logger.warning(f"Failed to compress {fastq_path.name}: {e}")
2141
+ return False
2142
+
2143
+ # Compress in parallel
2144
+ with ThreadPoolExecutor(max_workers=4) as executor:
2145
+ results = list(executor.map(compress_file, fastq_files))
2146
+
2147
+ success_count = sum(results)
2148
+ logger.info(f"Compression complete: {success_count}/{len(fastq_files)} successful")
2149
+
2150
+ def _format_fastq_dump_result(self, accession, output_dir, result, method):
+ """Format the fastq-dump result."""
2152
+ formatted = {
2153
+ 'accession': accession,
2154
+ 'success': result['success'],
2155
+ 'files': result.get('files', []),
2156
+ 'file_count': result.get('file_count', 0),
2157
+ 'total_size_mb': result.get('total_size_mb', 0),
2158
+ 'output_dir': str(output_dir),
2159
+ 'method': method,
2160
+ 'download_method': 'fastq-dump'
2161
+ }
2162
+
2163
+ if 'note' in result:
2164
+ formatted['note'] = result['note']
2165
+
2166
+ return formatted
2167
+
2168
+
2169
+ def _download_file(self, url: str, output_dir: Path, config: DatasetConfig) -> Optional[Path]:
+ """Download a single file."""
+ import requests
+
+ filename = url.split('/')[-1].split('?')[0]
+ filepath = output_dir / filename
+
+ # Skip the download if the file already exists
+ if filepath.exists() and not config.force_download:
+ file_size = filepath.stat().st_size
+ if file_size > 1000: # file size looks plausible
+ logger.debug(f"File already exists: {filepath}")
+ return filepath
2182
+
2183
+ try:
2184
+ if url.startswith('ftp://'):
2185
+ return self._download_ftp_file(url, filepath)
2186
+ else:
2187
+ return self._download_http_file(url, filepath)
2188
+ except Exception as e:
2189
+ logger.error(f"Failed to download {url}: {e}")
2190
+ return None
2191
+
2192
+ def _download_http_file(self, url: str, filepath: Path) -> Path:
+ """Download a file over HTTP."""
2194
+ import requests
2195
+
2196
+ headers = {
2197
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
2198
+ }
2199
+
2200
+ response = requests.get(url, stream=True, headers=headers, timeout=60)
2201
+ response.raise_for_status()
2202
+
2203
+ total_size = int(response.headers.get('content-length', 0))
2204
+
2205
+ with open(filepath, 'wb') as f:
2206
+ downloaded = 0
2207
+ for chunk in response.iter_content(chunk_size=8192):
2208
+ if chunk:
2209
+ f.write(chunk)
2210
+ downloaded += len(chunk)
2211
+
2212
+ logger.info(f"Downloaded: {filepath.name} ({downloaded/1024/1024:.1f} MB)")
2213
+ return filepath
2214
+
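+ # Variation sketch (illustrative, not part of the original module): the same chunked
+ # download as above, but with a tqdm progress bar driven by the Content-Length header.
+ # The bar total stays unknown when the server does not report a length.
+ import requests
+ from pathlib import Path
+ from tqdm import tqdm
+
+ def download_with_progress(url: str, filepath: Path, chunk_size: int = 8192) -> Path:
+     resp = requests.get(url, stream=True, timeout=60)
+     resp.raise_for_status()
+     total = int(resp.headers.get("content-length", 0))
+     with open(filepath, "wb") as fh, tqdm(total=total or None, unit="B",
+                                           unit_scale=True, desc=filepath.name) as bar:
+         for chunk in resp.iter_content(chunk_size=chunk_size):
+             if chunk:
+                 fh.write(chunk)
+                 bar.update(len(chunk))
+     return filepath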
2215
+ def _process_tcga(self, config: DatasetConfig) -> Any:
+ """Process TCGA data."""
+ # TCGA download logic not implemented yet
+ raise NotImplementedError("TCGA data fetching not yet implemented")
+
+ def _process_encode(self, config: DatasetConfig) -> Any:
+ """Process ENCODE data."""
+ # ENCODE download logic not implemented yet
+ raise NotImplementedError("ENCODE data fetching not yet implemented")
+
+ def _process_array_express(self, config: DatasetConfig) -> Any:
+ """Process ArrayExpress data."""
+ # ArrayExpress download logic not implemented yet
+ raise NotImplementedError("ArrayExpress data fetching not yet implemented")
+
+ def _process_single_cell(self, config: DatasetConfig) -> Any:
+ """Process single-cell data."""
+ # Single-cell download logic not implemented yet
+ raise NotImplementedError("Single-cell data fetching not yet implemented")
+
+ def _process_custom(self, config: DatasetConfig) -> Any:
+ """Process custom data."""
+ # User-defined data processing
+ custom_func = config.custom_params.get('custom_function')
+ if custom_func and callable(custom_func):
+ return custom_func(config.dataset_id, **config.custom_params)
+
+ raise ValueError("No custom function provided for custom data source")
2243
+
2244
+ def _post_process(self, data: Any, config: DatasetConfig) -> Any:
+ """Post-process fetched data."""
+ if isinstance(data, pd.DataFrame):
+ # Automatic normalization
+ if self.config['auto_normalize'] and config.data_format == DataFormat.EXPRESSION:
+ data = self._auto_normalize(data)
+
+ # Gene ID conversion
+ if self.config['gene_id_conversion']:
+ data = self._convert_gene_ids(data)
+
+ # Quality control
+ if self.config['quality_control']:
+ data = self._quality_control(data)
+
+ return data
2260
+
2261
+ def _auto_normalize(self, df: pd.DataFrame) -> pd.DataFrame:
+ """Automatically normalize expression data."""
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
+
+ if len(numeric_cols) == 0:
+ return df
+
+ # Detect the data type
+ if self._is_raw_counts(df[numeric_cols]):
+ logger.info("Detected raw counts, normalizing with TMM")
+ try:
+ return self._normalize_counts(df, numeric_cols)
+ except Exception as e:
+ logger.warning(f"TMM normalization failed: {e}")
+
+ return df
2277
+
2278
+ def _is_raw_counts(self, df_numeric: pd.DataFrame) -> bool:
+ """Detect whether the data are raw counts."""
+ # Check that all values are integers
+ if not df_numeric.applymap(lambda x: isinstance(x, (int, np.integer))).all().all():
+ return False
+
+ # Check the value range
+ max_val = df_numeric.max().max()
+ min_val = df_numeric.min().min()
+
+ # Raw counts are typically non-negative integers with a large maximum
+ return min_val >= 0 and max_val > 1000
2290
+
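+ # Quick check of the raw-count heuristic above (illustrative standalone sketch):
+ # integer values with a large maximum are treated as raw counts, while small float
+ # values (e.g. TPM or log-scale data) are not.
+ import numpy as np
+ import pandas as pd
+
+ raw = pd.DataFrame({"s1": [0, 12, 4500], "s2": [3, 8, 3900]})          # integers, max > 1000
+ tpm = pd.DataFrame({"s1": [0.0, 1.7, 83.2], "s2": [0.4, 2.1, 77.9]})   # floats
+
+ def looks_like_raw_counts(df_numeric: pd.DataFrame) -> bool:
+     if not df_numeric.applymap(lambda x: isinstance(x, (int, np.integer))).all().all():
+         return False
+     return df_numeric.min().min() >= 0 and df_numeric.max().max() > 1000
+
+ print(looks_like_raw_counts(raw))  # True
+ print(looks_like_raw_counts(tpm))  # False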
2291
+ def _normalize_counts(self, df: pd.DataFrame, numeric_cols: pd.Index) -> pd.DataFrame:
+ """Normalize count data with a TMM-like method."""
+ import numpy as np
+
+ df_numeric = df[numeric_cols]
+
+ # Simple TMM-like normalization
+ # Use the per-gene mean of log1p counts (back-transformed) as the reference sample
+ log_counts = np.log1p(df_numeric.values)
+ ref_sample = np.exp(np.mean(log_counts, axis=1))
+
+ # Compute a scaling factor per sample
+ scaling_factors = []
+ for col in df_numeric.columns:
+ sample_counts = df_numeric[col].values
+ log_ratio = np.log1p(sample_counts) - np.log1p(ref_sample)
+ m_value = log_ratio - np.median(log_ratio)
+ a_value = 0.5 * (np.log1p(sample_counts) + np.log1p(ref_sample))
+
+ # Trim extreme values by average intensity (A)
+ trim_frac = 0.3
+ n = len(m_value)
+ trim_n = int(n * trim_frac)
+ indices = np.argsort(a_value)
+ keep_indices = indices[trim_n:n-trim_n]
+
+ # Scaling factor: back-transformed trimmed mean of the centered M values
+ scaling_factor = np.exp(np.mean(m_value[keep_indices]))
+ scaling_factors.append(scaling_factor)
+
+ # Apply the scaling factors
+ scaling_factors = np.array(scaling_factors)
+ df_normalized = df.copy()
+ for i, col in enumerate(numeric_cols):
+ df_normalized[col] = df_numeric[col] / scaling_factors[i]
+
+ return df_normalized
2329
+
2330
+ def _convert_gene_ids(self, df: pd.DataFrame) -> pd.DataFrame:
+ """Convert gene IDs."""
+ if not MYGENE_AVAILABLE or self.mygene_client is None:
+ return df
+
+ # Detect likely gene ID columns
+ gene_id_cols = [col for col in df.columns
+ if col.lower() in ['gene_id', 'gene_symbol', 'entrez', 'ensembl']]
+
+ if not gene_id_cols:
+ return df
+
+ # Use mygene.info for ID conversion
+ try:
+ gene_ids = df[gene_id_cols[0]].dropna().tolist()
+ results = self.mygene_client.querymany(gene_ids, scopes='symbol', fields='symbol,name')
+
+ # Build the mapping
+ id_map = {}
+ for result in results:
+ if 'query' in result and 'symbol' in result:
+ id_map[result['query']] = result['symbol']
+
+ # Apply the mapping
+ df = df.copy()
+ df[gene_id_cols[0]] = df[gene_id_cols[0]].map(id_map).fillna(df[gene_id_cols[0]])
+
+ except Exception as e:
+ logger.warning(f"Gene ID conversion failed: {e}")
+
+ return df
2361
+
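+ # Sketch of the mygene.info lookup the method relies on (illustrative; requires the
+ # optional `mygene` package and network access). Each hit carries the original
+ # 'query' plus the requested fields, which is what the mapping above is built from.
+ import mygene
+
+ mg = mygene.MyGeneInfo()
+ hits = mg.querymany(["TP53", "BRCA1"], scopes="symbol", fields="symbol,name", species="human")
+ id_map = {h["query"]: h["symbol"] for h in hits if "symbol" in h}
+ print(id_map)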
2362
+ def _quality_control(self, df: pd.DataFrame) -> pd.DataFrame:
+ """Basic quality control."""
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
+
+ if len(numeric_cols) == 0:
+ return df
+
+ # Remove rows that are all zeros
+ df_numeric = df[numeric_cols]
+ non_zero_rows = (df_numeric != 0).any(axis=1)
+
+ if not non_zero_rows.all():
+ logger.info(f"Removing {sum(~non_zero_rows)} rows with all zeros")
+ df = df[non_zero_rows].copy()
+
+ # Remove low-expression genes
+ mean_expression = df[numeric_cols].mean(axis=1) # recompute on the (possibly filtered) frame so indices stay aligned
+ if len(mean_expression) > 1000: # only filter when the dataset is large
+ threshold = mean_expression.quantile(0.1)
+ keep_rows = mean_expression >= threshold
+
+ if not keep_rows.all():
+ logger.info(f"Removing {sum(~keep_rows)} low-expression rows")
+ df = df[keep_rows].copy()
+
+ return df
2388
+
2389
+ def _filter_samples(self, data: Any, samples: List[str]) -> Any:
+ """Filter samples."""
+ if isinstance(data, pd.DataFrame):
+ # Try filtering by column names (keep only columns that are actually present)
+ if any(sample in data.columns for sample in samples):
+ return data[[s for s in samples if s in data.columns]]
+ # Try filtering by index
+ elif any(sample in data.index for sample in samples):
+ return data.loc[data.index.intersection(samples)]
+
+ return data
2400
+
2401
+ def _record_download_history(self, dataset_ids: List[str]):
+ """Record download history."""
2403
+ history_file = self.dir_save / "download_history.json"
2404
+
2405
+ history = []
2406
+ if history_file.exists():
2407
+ try:
2408
+ with open(history_file, 'r') as f:
2409
+ history = json.load(f)
2410
+ if isinstance(history, dict):
2411
+ history = [history]
2412
+ elif not isinstance(history, list):
2413
+ history = []
2414
+ except:
2415
+ history = []
2416
+
2417
+ for dataset_id in dataset_ids:
2418
+ history.append({
2419
+ 'dataset_id': dataset_id,
2420
+ 'timestamp': datetime.now().isoformat(),
2421
+ 'cache_dir': str(self.dir_save)
2422
+ })
2423
+
2424
+ # 只保留最近100条记录
2425
+ history = history[-100:]
2426
+
2427
+ with open(history_file, 'w') as f:
2428
+ json.dump(history, f, indent=2)
2429
+
2430
+ def _format_exception(self, e: Exception) -> str:
+ """Format exception information."""
+ import traceback
+ return traceback.format_exc()
+
+ # Public API methods
+ def list_datasets(self,
+ data_type: Optional[str] = None,
+ search_query: Optional[str] = None,
+ organism: Optional[str] = None,
+ limit: int = 50) -> pd.DataFrame:
+ """List or search datasets."""
+ if search_query:
+ return self._search_datasets(search_query, data_type, organism, limit)
+
+ # Otherwise list cached datasets
+ return self.cache_list(data_type)
2447
+
2448
+ def _search_datasets(self,
+ query: str,
+ data_type: Optional[str],
+ organism: Optional[str],
+ limit: int) -> pd.DataFrame:
+ """Search datasets."""
+ import requests
+
+ # Choose the API based on the data type
+ if data_type == 'geo' or data_type is None:
+ return self._search_geo(query, organism, limit)
+ elif data_type == 'sra':
+ return self._search_sra(query, limit)
+ else:
+ logger.warning(f"Search not supported for data type: {data_type}")
+ return pd.DataFrame()
2464
+
2465
+ def _search_geo(self, query: str, organism: Optional[str], limit: int) -> pd.DataFrame:
+ """Search GEO datasets."""
2467
+ if not REQUESTS_AVAILABLE:
2468
+ return pd.DataFrame()
2469
+
2470
+ try:
2471
+ base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
2472
+
2473
+ search_term = query
2474
+ if organism:
2475
+ search_term += f" AND {organism}[Organism]"
2476
+
2477
+ # Search
2478
+ search_params = {
2479
+ 'db': 'gds',
2480
+ 'term': search_term,
2481
+ 'retmax': limit,
2482
+ 'retmode': 'json'
2483
+ }
2484
+
2485
+ response = requests.get(base_url + "esearch.fcgi", params=search_params)
2486
+ response.raise_for_status()
2487
+
2488
+ result = response.json()
2489
+ ids = result.get('esearchresult', {}).get('idlist', [])
2490
+
2491
+ if not ids:
2492
+ return pd.DataFrame()
2493
+
2494
+ # Fetch detailed summaries
2495
+ summary_params = {
2496
+ 'db': 'gds',
2497
+ 'id': ','.join(ids),
2498
+ 'retmode': 'json'
2499
+ }
2500
+
2501
+ summary_response = requests.get(base_url + "esummary.fcgi", params=summary_params)
2502
+ summary_result = summary_response.json()
2503
+
2504
+ datasets = []
2505
+ for uid in ids:
2506
+ info = summary_result.get('result', {}).get(uid, {})
2507
+ datasets.append({
2508
+ 'accession': info.get('accession', ''),
2509
+ 'title': info.get('title', ''),
2510
+ 'summary': info.get('summary', '')[:200] + '...' if info.get('summary') else '',
2511
+ 'organism': info.get('organism', ''),
2512
+ 'platform': info.get('platform', ''),
2513
+ 'samples': info.get('samples', 0),
2514
+ 'type': info.get('entrytype', ''),
2515
+ 'gdstype': info.get('gdstype', ''),
2516
+ 'pubmed': info.get('pubmed', ''),
2517
+ })
2518
+
2519
+ return pd.DataFrame(datasets)
2520
+
2521
+ except Exception as e:
2522
+ logger.error(f"Failed to search GEO datasets: {e}")
2523
+ return pd.DataFrame()
2524
+
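+ # Standalone sketch of the GEO search flow above (illustrative): esearch returns GDS
+ # UIDs for a query, esummary returns per-UID records with 'accession', 'title', etc.
+ import requests
+
+ base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+ ids = requests.get(base + "esearch.fcgi",
+                    params={"db": "gds", "term": "cancer RNA-seq", "retmax": 5,
+                            "retmode": "json"},
+                    timeout=30).json()["esearchresult"]["idlist"]
+ if ids:
+     summary = requests.get(base + "esummary.fcgi",
+                            params={"db": "gds", "id": ",".join(ids), "retmode": "json"},
+                            timeout=30).json()
+     for uid in ids:
+         rec = summary.get("result", {}).get(uid, {})
+         print(rec.get("accession", ""), "-", rec.get("title", "")[:60])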
2525
+ def _search_sra(self, query: str, limit: int) -> pd.DataFrame:
+ """Search SRA datasets."""
2527
+ if not self.sra_client:
2528
+ return pd.DataFrame()
2529
+
2530
+ try:
2531
+ df = self.sra_client.search_sra(query, size=limit)
2532
+ return df
2533
+ except Exception as e:
2534
+ logger.error(f"Failed to search SRA datasets: {e}")
2535
+ return pd.DataFrame()
2536
+
2537
+ def cache_list(self, data_type: Optional[str] = None) -> pd.DataFrame:
+ """List cached datasets."""
2539
+ cache_files = list(self.dir_save.rglob("*.pkl"))
2540
+
2541
+ datasets = []
2542
+ for file_path in cache_files:
2543
+ try:
2544
+ rel_path = file_path.relative_to(self.dir_save)
2545
+ parts = rel_path.parts
2546
+
2547
+ if len(parts) >= 2:
2548
+ ds_type = parts[0]
2549
+
2550
+ if data_type and ds_type != data_type:
2551
+ continue
2552
+
2553
+ # Try to read info from the cache metadata
2554
+ cache_key = file_path.stem
2555
+ metadata = self.cache.metadata.get(cache_key, {})
2556
+
2557
+ datasets.append({
2558
+ 'dataset_id': metadata.get('dataset_id', 'Unknown'),
2559
+ 'data_type': ds_type,
2560
+ 'data_format': metadata.get('data_format', 'Unknown'),
2561
+ 'file_path': str(file_path),
2562
+ 'size_mb': file_path.stat().st_size / (1024 * 1024),
2563
+ 'created': metadata.get('created', 'Unknown'),
2564
+ 'last_accessed': metadata.get('last_accessed', 'Unknown'),
2565
+ })
2566
+ except:
2567
+ continue
2568
+
2569
+ if datasets:
2570
+ df = pd.DataFrame(datasets)
2571
+ return df.sort_values('last_accessed', ascending=False)
2572
+
2573
+ return pd.DataFrame()
2574
+
2575
+ def batch_fetch(self,
2576
+ configs: List[Dict[str, Any]],
2577
+ max_workers: int = 4,
2578
+ progress_bar: bool = True) -> Dict[str, Any]:
2579
+ """批量获取数据"""
2580
+ from concurrent.futures import ThreadPoolExecutor, as_completed
2581
+
2582
+ results = {}
2583
+
2584
+ def fetch_task(config_dict: Dict) -> Tuple[str, Any]:
2585
+ """单个获取任务"""
2586
+ try:
2587
+ dataset_id = config_dict.get('dataset_id', config_dict.get('id'))
2588
+ if not dataset_id:
2589
+ return 'unknown', {'error': 'No dataset_id provided'}
2590
+
2591
+ # Build the config
2592
+ config = DatasetConfig.from_accession(dataset_id, **config_dict)
2593
+
2594
+ # Fetch the data
2595
+ data = self._fetch_with_config(config)
2596
+ return dataset_id, data
2597
+
2598
+ except Exception as e:
2599
+ dataset_id = config_dict.get('dataset_id', config_dict.get('id', 'unknown'))
2600
+ return dataset_id, {'error': str(e)}
2601
+
2602
+ # Optionally wrap the configs in a progress bar
2603
+ if progress_bar:
2604
+ configs_iter = tqdm(configs, desc="Batch fetching")
2605
+ else:
2606
+ configs_iter = configs
2607
+
2608
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
2609
+ future_to_config = {
2610
+ executor.submit(fetch_task, config): config
2611
+ for config in configs_iter
2612
+ }
2613
+
2614
+ for future in as_completed(future_to_config):
2615
+ dataset_id, result = future.result()
2616
+ results[dataset_id] = result
2617
+
2618
+ return results
2619
+
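+ # Usage sketch (illustrative): each dict needs at least 'dataset_id'; any extra keys
+ # are forwarded to DatasetConfig.from_accession. Errors are reported per dataset
+ # instead of aborting the whole batch.
+ fetcher = BioDataFetcher(dir_save="./bio_data_cache")
+ results = fetcher.batch_fetch(
+     configs=[
+         {"dataset_id": "GSE12345"},     # GEO accession, all defaults
+         {"dataset_id": "SRR390728"},    # SRA run accession
+     ],
+     max_workers=2,
+ )
+ for dataset_id, result in results.items():
+     status = "error" if isinstance(result, dict) and "error" in result else "ok"
+     print(dataset_id, status)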
2620
+ def get_dataset_info(self, dataset_id: str) -> Dict[str, Any]:
+ """Get information about a dataset."""
+ # Auto-infer the data type
2623
+ data_type = self._infer_data_type(dataset_id)
2624
+
2625
+ info = {
2626
+ 'dataset_id': dataset_id,
2627
+ 'inferred_type': data_type,
2628
+ 'data_source': DataSource.from_accession(dataset_id).value,
2629
+ 'cache_status': self._check_cache_status(dataset_id),
2630
+ 'available_formats': self._get_available_formats(dataset_id, data_type),
2631
+ }
2632
+
2633
+ # Try to fetch metadata
2634
+ try:
2635
+ if data_type == 'geo':
2636
+ info['metadata'] = self._get_geo_info(dataset_id)
2637
+ elif data_type == 'sra':
2638
+ info['metadata'] = self._get_sra_info(dataset_id)
2639
+ except Exception as e:
2640
+ info['metadata_error'] = str(e)
2641
+
2642
+ return info
2643
+
2644
+ def _check_cache_status(self, dataset_id: str) -> Dict[str, Any]:
2645
+ """检查缓存状态"""
2646
+ cache_files = list(self.dir_save.rglob(f"*{dataset_id}*.pkl"))
2647
+
2648
+ status = {
2649
+ 'cached': len(cache_files) > 0,
2650
+ 'files': [],
2651
+ 'total_size_mb': 0
2652
+ }
2653
+
2654
+ for file_path in cache_files:
2655
+ status['files'].append({
2656
+ 'path': str(file_path),
2657
+ 'size_mb': file_path.stat().st_size / (1024 * 1024),
2658
+ 'modified': datetime.fromtimestamp(file_path.stat().st_mtime)
2659
+ })
2660
+ status['total_size_mb'] += file_path.stat().st_size / (1024 * 1024)
2661
+
2662
+ return status
2663
+
2664
+ def _get_available_formats(self, dataset_id: str, data_type: str) -> List[str]:
2665
+ """获取可用数据格式"""
2666
+ if data_type == 'geo':
2667
+ return ['expression', 'metadata', 'probe']
2668
+ elif data_type == 'sra':
2669
+ return ['metadata', 'fastq']
2670
+ elif data_type == 'tcga':
2671
+ return ['expression', 'clinical', 'mutations']
2672
+ else:
2673
+ return ['metadata']
2674
+
2675
+ def _get_geo_info(self, dataset_id: str) -> Dict[str, Any]:
2676
+ """获取GEO数据集信息"""
2677
+ if not REQUESTS_AVAILABLE:
2678
+ return {}
2679
+
2680
+ try:
2681
+ url = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
2682
+ params = {'acc': dataset_id, 'targ': 'self', 'form': 'xml', 'view': 'quick'}
2683
+
2684
+ response = requests.get(url, params=params, timeout=10)
2685
+ if response.ok:
2686
+ # Parse the returned page for basic information
2687
+ import re
2688
+ html = response.text
2689
+
2690
+ info = {}
2691
+
2692
+ # Extract the title
2693
+ title_match = re.search(r'<title>(.*?)</title>', html)
2694
+ if title_match:
2695
+ info['title'] = title_match.group(1)
2696
+
2697
+ # Extract the sample count
2698
+ samples_match = re.search(r'Samples?:\s*(\d+)', html)
2699
+ if samples_match:
2700
+ info['samples'] = int(samples_match.group(1))
2701
+
2702
+ # Extract the platform
2703
+ platform_match = re.search(r'Platform.*?GPL\d+', html)
2704
+ if platform_match:
2705
+ info['platform'] = platform_match.group(0)
2706
+
2707
+ return info
2708
+ except Exception as e:
2709
+ logger.debug(f"Failed to get GEO info: {e}")
2710
+
2711
+ return {}
2712
+
2713
+ def _get_sra_info(self, dataset_id: str) -> Dict[str, Any]:
+ """Get SRA dataset information."""
+ if not self.sra_client:
+ return {}
+
+ try:
+ df = self.sra_client.search_sra(run_accession=dataset_id, detailed=False)
+ if not df.empty:
+ return df.iloc[0].to_dict()
+ except Exception as e:
+ logger.debug(f"Failed to get SRA info: {e}")
+
+ return {}
2726
+
2727
+ def clear_cache(self,
2728
+ data_type: Optional[str] = None,
2729
+ older_than_days: Optional[int] = None,
2730
+ confirm: bool = False):
2731
+ """清理缓存"""
2732
+ if not confirm:
2733
+ logger.warning("Cache clearance requires confirmation. Use confirm=True")
2734
+ return
2735
+
2736
+ self.cache.clear_cache(data_type, older_than_days)
2737
+
2738
+ def export_data(self,
2739
+ dataset_id: str,
2740
+ output_format: str = 'csv',
2741
+ output_dir: Optional[str] = None) -> str:
2742
+ """导出数据"""
2743
+ # 获取数据
2744
+ data = self.fetch_data(dataset_id)
2745
+
2746
+ if isinstance(data, dict) and 'error' in data:
2747
+ raise ValueError(f"Cannot export: {data['error']}")
2748
+
2749
+ if output_dir is None:
2750
+ output_dir = self.dir_save / "exports"
2751
+ else:
2752
+ output_dir = Path(output_dir)
2753
+
2754
+ output_dir.mkdir(exist_ok=True)
2755
+
2756
+ if output_format == 'csv':
2757
+ if isinstance(data, pd.DataFrame):
2758
+ output_path = output_dir / f"{dataset_id}.csv"
2759
+ data.to_csv(output_path)
2760
+ return str(output_path)
2761
+ else:
2762
+ raise ValueError("Data is not a DataFrame, cannot export as CSV")
2763
+
2764
+ elif output_format == 'excel':
2765
+ if isinstance(data, pd.DataFrame):
2766
+ output_path = output_dir / f"{dataset_id}.xlsx"
2767
+ data.to_excel(output_path, engine='openpyxl')
2768
+ return str(output_path)
2769
+ else:
2770
+ raise ValueError("Data is not a DataFrame, cannot export as Excel")
2771
+
2772
+ elif output_format == 'json':
2773
+ output_path = output_dir / f"{dataset_id}.json"
2774
+ with open(output_path, 'w') as f:
2775
+ if isinstance(data, pd.DataFrame):
2776
+ json.dump(data.to_dict(orient='records'), f, indent=2)
2777
+ else:
2778
+ json.dump(data, f, indent=2)
2779
+ return str(output_path)
2780
+
2781
+ else:
2782
+ raise ValueError(f"Unsupported output format: {output_format}")
2783
+
2784
+ def get_statistics(self) -> Dict[str, Any]:
2785
+ """获取统计信息"""
2786
+ cache_files = list(self.dir_save.rglob("*.pkl"))
2787
+
2788
+ stats = {
2789
+ 'total_datasets': len(set(f.stem for f in cache_files)),
2790
+ 'total_files': len(cache_files),
2791
+ 'total_size_gb': sum(f.stat().st_size for f in cache_files) / (1024**3),
2792
+ 'by_data_type': {},
2793
+ 'by_format': {},
2794
+ 'recent_downloads': []
2795
+ }
2796
+
2797
+ # Count by data type
2798
+ for file_path in cache_files:
2799
+ try:
2800
+ rel_path = file_path.relative_to(self.dir_save)
2801
+ if len(rel_path.parts) >= 1:
2802
+ data_type = rel_path.parts[0]
2803
+ stats['by_data_type'][data_type] = stats['by_data_type'].get(data_type, 0) + 1
2804
+ except:
2805
+ pass
2806
+
2807
+ # Read the download history
2808
+ history_file = self.dir_save / "download_history.json"
2809
+ if history_file.exists():
2810
+ try:
2811
+ with open(history_file, 'r') as f:
2812
+ history = json.load(f)
2813
+ stats['recent_downloads'] = history[-10:] # last 10 downloads
2814
+ except:
2815
+ pass
2816
+
2817
+ return stats
2818
+
2819
+
2820
+ # Simplified convenience functions (kept for backward compatibility)
2821
+ def fetch_data(dataset_ids: Union[str, List[str]],
2822
+ data_type: Optional[str] = None,
2823
+ data_format: Optional[str] = None,
2824
+ organism: Optional[str] = None,
2825
+ platform: Optional[str] = None,
2826
+ samples: Optional[List[str]] = None,
2827
+ force_download: bool = False,
2828
+ dir_save: str = "./bio_data_cache",
2829
+ auto_infer: bool = True,
2830
+ **kwargs) -> Dict[str, Any]:
2831
+ """
2832
+ 简化的数据获取函数(智能版)
2833
+
2834
+ Parameters:
2835
+ -----------
2836
+ dataset_ids : Union[str, List[str]]
2837
+ 数据集ID
2838
+ data_type : Optional[str]
2839
+ 数据类型,如未指定则自动推断
2840
+ data_format : Optional[str]
2841
+ 数据格式,如未指定则自动推断
2842
+ organism : Optional[str]
2843
+ 物种
2844
+ platform : Optional[str]
2845
+ 平台
2846
+ samples : Optional[List[str]]
2847
+ 样本列表
2848
+ force_download : bool
2849
+ 强制重新下载
2850
+ dir_save : str
2851
+ 缓存目录
2852
+ auto_infer : bool
2853
+ 是否启用自动类型推断
2854
+
2855
+ Returns:
2856
+ --------
2857
+ Dict[str, Any]: 数据字典
2858
+ """
2859
+ fetcher = BioDataFetcher(dir_save=dir_save, auto_infer=auto_infer)
2860
+
2861
+ return fetcher.fetch_data(
2862
+ dataset_ids=dataset_ids,
2863
+ data_type=data_type,
2864
+ data_format=data_format,
2865
+ organism=organism,
2866
+ platform=platform,
2867
+ samples=samples,
2868
+ force_download=force_download,
2869
+ **kwargs
2870
+ )
2871
+
2872
+
2873
+ # Quick-use function
2874
+ def quick_fetch(dataset_id: str,
2875
+ dir_save: str = "./bio_data_cache",
2876
+ **kwargs) -> Any:
2877
+ """
2878
+ 快速获取数据(完全自动推断)
2879
+
2880
+ Parameters:
2881
+ -----------
2882
+ dataset_id : str
2883
+ 数据集ID
2884
+ dir_save : str
2885
+ 缓存目录
2886
+ **kwargs : 其他参数传递给fetch_data
2887
+
2888
+ Returns:
2889
+ --------
2890
+ Any: 获取的数据
2891
+ """
2892
+ return fetch_data(
2893
+ dataset_ids=dataset_id,
2894
+ dir_save=dir_save,
2895
+ auto_infer=True,
2896
+ **kwargs
2897
+ ).get(dataset_id)
2898
+
2899
+
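+ # Usage sketch (illustrative): quick_fetch is the one-liner wrapper; fetch_data
+ # returns a dict keyed by dataset ID, so quick_fetch just unwraps the single entry.
+ expr = quick_fetch("GSE12345", dir_save="./bio_data_cache")
+
+ batch = fetch_data(["GSE12345", "SRR390728"], dir_save="./bio_data_cache")
+ for dataset_id, data in batch.items():
+     print(dataset_id, type(data))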
2900
+ # Sample configuration file
+ SAMPLE_CONFIG = """
+ # BioDataFetcher configuration file
+ # Save as config.yaml and use fetcher = BioDataFetcher(config_file='config.yaml')
+
+ # Download settings
+ max_retries: 3
+ timeout: 30
+ parallel_downloads: 4
+ prefer_cached: true
+
+ # Data processing
+ auto_normalize: true
+ gene_id_conversion: true
+ quality_control: true
+
+ # API keys (optional)
+ ncbi_api_key: null
+ ensembl_api_key: null
+
+ # Cache settings
+ max_cache_size_gb: 10
+
+ # Network settings
+ proxy: null
+ user_agent: "BioDataFetcher/1.0"
+
+ # Logging settings
+ log_level: "INFO"
+ log_file: "bio_data_fetcher.log"
+ """
2931
+
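+ # Sketch of writing SAMPLE_CONFIG to disk and loading it back (illustrative; assumes
+ # PyYAML is installed). The constructor call with config_file= mirrors the comment
+ # inside SAMPLE_CONFIG above.
+ import yaml
+ from pathlib import Path
+
+ Path("config.yaml").write_text(SAMPLE_CONFIG)
+ settings = yaml.safe_load(SAMPLE_CONFIG)
+ print(settings["parallel_downloads"], settings["max_cache_size_gb"])
+
+ fetcher = BioDataFetcher(config_file="config.yaml")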
2932
+ import subprocess
2933
+ import shutil
2934
+ from pathlib import Path
2935
+
2936
+ def check_fastq_dump_available():
2937
+ """检查fastq-dump是否可用"""
2938
+ # 查找fastq-dump路径
2939
+ fastq_dump_path = shutil.which("fastq-dump")
2940
+ prefetch_path = shutil.which("prefetch")
2941
+
2942
+ print("检查SRA Toolkit工具...")
2943
+
2944
+ if fastq_dump_path:
2945
+ print(f"✅ fastq-dump 找到: {fastq_dump_path}")
2946
+
2947
+ # 检查版本
2948
+ try:
2949
+ result = subprocess.run(
2950
+ [fastq_dump_path, "--version"],
2951
+ capture_output=True,
2952
+ text=True,
2953
+ timeout=5
2954
+ )
2955
+ if result.returncode == 0:
2956
+ print(f" 版本: {result.stdout.strip()}")
2957
+ except:
2958
+ print(" 无法获取版本信息")
2959
+ else:
2960
+ print("❌ fastq-dump 未找到")
2961
+ print(" 请安装 SRA Toolkit: https://github.com/ncbi/sra-tools")
2962
+
2963
+ if prefetch_path:
2964
+ print(f"✅ prefetch 找到: {prefetch_path}")
2965
+ else:
2966
+ print("❌ prefetch 未找到")
2967
+
2968
+ return fastq_dump_path is not None and prefetch_path is not None
2969
+
2970
+ # 检查工具
2971
+ # check_fastq_dump_available()
2972
+
2973
+ def enhance_bio_data_fetcher_with_fastqdump():
2974
+ """增强BioDataFetcher,添加fastq-dump支持"""
2975
+
2976
+ class EnhancedBioDataFetcher(BioDataFetcher):
2977
+ """增强版的BioDataFetcher,支持fastq-dump"""
2978
+
2979
+ def __init__(self, *args, **kwargs):
2980
+ super().__init__(*args, **kwargs)
2981
+ self.fastq_downloader = FastqDumpDownloader(
2982
+ cache_dir=str(self.dir_save / "fastqdump")
2983
+ )
2984
+
2985
+ # 覆盖SRA处理器
2986
+ self.data_processors['sra'] = self._process_sra_enhanced
2987
+ if DataSource.SRA in self.data_processors:
2988
+ self.data_processors[DataSource.SRA] = self._process_sra_enhanced
2989
+
2990
+ def _process_sra_enhanced(self, config: DatasetConfig) -> Any:
2991
+ """增强的SRA处理方法,优先使用fastq-dump"""
2992
+ dataset_id = config.dataset_id
2993
+
2994
+ if config.data_format == DataFormat.METADATA:
2995
+ # 仍然使用原来的方法获取元数据
2996
+ return self._process_sra(config)
2997
+
2998
+ elif config.data_format == DataFormat.FASTQ:
2999
+ print(f"使用fastq-dump下载FASTQ: {dataset_id}")
3000
+
3001
+ # 提取参数
3002
+ split_files = config.custom_params.get('split_files', True)
3003
+ gzip_output = config.custom_params.get('gzip_output', True)
3004
+ use_prefetch = config.custom_params.get('use_prefetch', True)
3005
+ max_retries = config.custom_params.get('max_retries', 3)
3006
+
3007
+ # 使用fastq-dump下载
3008
+ result = self.fastq_downloader.download_with_fastq_dump(
3009
+ accession=dataset_id,
3010
+ output_dir=self.dir_save / "fastq",
3011
+ split_files=split_files,
3012
+ gzip_output=gzip_output,
3013
+ max_retries=max_retries
3014
+ )
3015
+
3016
+ # 如果需要,也获取元数据
3017
+ if result.get('success', False) and config.custom_params.get('include_metadata', True):
3018
+ metadata = self._get_sra_metadata(dataset_id)
3019
+ result['metadata'] = metadata
3020
+
3021
+ return result
3022
+
3023
+ else:
3024
+ raise ValueError(f"Unsupported SRA format: {config.data_format}")
3025
+
3026
+ def download_sra_with_fastqdump(self,
3027
+ accession: str,
3028
+ split_files: bool = True,
3029
+ gzip_output: bool = True,
3030
+ **kwargs) -> Dict[str, Any]:
3031
+ """
3032
+ 专门使用fastq-dump下载SRA数据
3033
+
3034
+ Parameters:
3035
+ -----------
3036
+ accession : str
3037
+ SRA accession
3038
+ split_files : bool
3039
+ 是否拆分paired-end文件
3040
+ gzip_output : bool
3041
+ 是否gzip压缩
3042
+ **kwargs :
3043
+ 其他参数传递给download_with_fastq_dump
3044
+
3045
+ Returns:
3046
+ --------
3047
+ Dict: 下载结果
3048
+ """
3049
+ return self.fastq_downloader.download_with_fastq_dump(
3050
+ accession=accession,
3051
+ output_dir=self.dir_save / "fastq",
3052
+ split_files=split_files,
3053
+ gzip_output=gzip_output,
3054
+ **kwargs
3055
+ )
3056
+
3057
+ return EnhancedBioDataFetcher
3058
+
3059
+ # 使用示例
3060
+ def example_enhanced_fetcher():
3061
+ """使用增强版的BioDataFetcher"""
3062
+ print("使用增强版BioDataFetcher(支持fastq-dump)")
3063
+ print("=" * 60)
3064
+
3065
+ # 创建增强版fetcher
3066
+ EnhancedFetcher = enhance_bio_data_fetcher_with_fastqdump()
3067
+ fetcher = EnhancedFetcher(dir_save="./enhanced_cache")
3068
+
3069
+ # 方法1:使用统一接口(会自动选择fastq-dump)
3070
+ print("\n方法1:使用统一接口")
3071
+ result1 = fetcher.fetch_data(
3072
+ dataset_ids="SRR390728", # 测试用小文件
3073
+ data_type='sra',
3074
+ data_format='fastq',
3075
+ split_files=True,
3076
+ gzip_output=True,
3077
+ force_download=True
3078
+ )
3079
+
3080
+ print(f"结果1: {result1.get('SRR390728', {}).get('success', False)}")
3081
+
3082
+ # 方法2:直接使用fastq-dump方法
3083
+ print("\n方法2:直接使用fastq-dump方法")
3084
+ result2 = fetcher.download_sra_with_fastqdump(
3085
+ accession="SRR390728",
3086
+ split_files=True,
3087
+ gzip_output=True
3088
+ )
3089
+
3090
+ print(f"结果2: 成功={result2.get('success', False)}, 文件数={result2.get('file_count', 0)}")
3091
+
3092
+ # 方法3:批量下载
3093
+ print("\n方法3:批量下载测试")
3094
+ batch_result = fetcher.batch_fetch([
3095
+ {
3096
+ 'dataset_id': 'SRR390728',
3097
+ 'type': 'sra',
3098
+ 'format': 'fastq',
3099
+ 'split_files': True,
3100
+ 'gzip_output': True
3101
+ },
3102
+ {
3103
+ 'dataset_id': 'SRR3473776', # 另一个小文件
3104
+ 'type': 'sra',
3105
+ 'format': 'fastq',
3106
+ 'split_files': False # 单端数据
3107
+ }
3108
+ ])
3109
+
3110
+ for acc, res in batch_result.items():
3111
+ print(f" {acc}: 成功={res.get('success', False)}, 文件={len(res.get('files', []))}")
3112
+
3113
+ return fetcher, result1, result2, batch_result
3114
+
3115
+ # 运行示例
3116
+ # fetcher, r1, r2, batch = example_enhanced_fetcher()
3117
+
3118
+ def setup_sra_toolkit():
3119
+ """帮助用户安装和配置SRA Toolkit"""
3120
+ import platform
3121
+ import sys
3122
+
3123
+ print("SRA Toolkit 安装助手")
3124
+ print("=" * 60)
3125
+
3126
+ system = platform.system()
3127
+ print(f"操作系统: {system}")
3128
+ print(f"Python版本: {sys.version}")
3129
+
3130
+ # 检查是否已安装
3131
+ fastq_dump_path = shutil.which("fastq-dump")
3132
+ prefetch_path = shutil.which("prefetch")
3133
+
3134
+ if fastq_dump_path and prefetch_path:
3135
+ print("✅ SRA Toolkit 已安装")
3136
+ print(f" fastq-dump: {fastq_dump_path}")
3137
+ print(f" prefetch: {prefetch_path}")
3138
+ return True
3139
+
3140
+ print("❌ SRA Toolkit 未安装或不在PATH中")
3141
+ print("\n安装指南:")
3142
+
3143
+ if system == "Darwin": # macOS
3144
+ print("""
3145
+ 方法1: 使用Homebrew (推荐)
3146
+ brew install sratoolkit
3147
+
3148
+ 方法2: 手动下载
3149
+ 1. 访问: https://github.com/ncbi/sra-tools/wiki/Downloads
3150
+ 2. 下载macOS版本
3151
+ 3. 解压并添加到PATH:
3152
+ echo 'export PATH=$PATH:/path/to/sratoolkit/bin' >> ~/.zshrc
3153
+ source ~/.zshrc
3154
+ """)
3155
+
3156
+ elif system == "Linux":
3157
+ print("""
3158
+ 方法1: 使用包管理器
3159
+ # Ubuntu/Debian
3160
+ sudo apt-get install sra-toolkit
3161
+
3162
+ # CentOS/RHEL/Fedora
3163
+ sudo yum install sra-toolkit
3164
+
3165
+ 方法2: 手动下载
3166
+ 1. 访问: https://github.com/ncbi/sra-tools/wiki/Downloads
3167
+ 2. 下载Linux版本
3168
+ 3. 解压并添加到PATH:
3169
+ echo 'export PATH=$PATH:/path/to/sratoolkit/bin' >> ~/.bashrc
3170
+ source ~/.bashrc
3171
+ """)
3172
+
3173
+ elif system == "Windows":
3174
+ print("""
3175
+ 方法1: 使用Chocolatey
3176
+ choco install sratoolkit
3177
+
3178
+ 方法2: 手动下载
3179
+ 1. 访问: https://github.com/ncbi/sra-tools/wiki/Downloads
3180
+ 2. 下载Windows版本
3181
+ 3. 解压并将bin目录添加到系统PATH
3182
+ """)
3183
+
3184
+ else:
3185
+ print(f" 不支持的操作系统: {system}")
3186
+
3187
+ print("\n配置建议:")
3188
+ print(" 1. 运行 'vdb-config -i' 进行交互式配置")
3189
+ print(" 2. 设置缓存目录: vdb-config --set /repository/user/main/public/root=./ncbi_cache")
3190
+ print(" 3. 测试: prefetch SRR390728 && fastq-dump SRR390728")
3191
+
3192
+ return False
3193
+
3194
+ def configure_sra_toolkit():
3195
+ """配置SRA Toolkit(如果已安装)"""
3196
+ import subprocess
3197
+
3198
+ print("配置SRA Toolkit")
3199
+ print("=" * 50)
3200
+
3201
+ # 检查是否安装
3202
+ if not shutil.which("vdb-config"):
3203
+ print("❌ vdb-config 未找到,请先安装SRA Toolkit")
3204
+ return False
3205
+
3206
+ try:
3207
+ # 设置缓存目录
3208
+ cache_dir = Path.home() / ".ncbi" / "cache"
3209
+ cache_dir.mkdir(parents=True, exist_ok=True)
3210
+
3211
+ print(f"设置缓存目录: {cache_dir}")
3212
+
3213
+ # 运行vdb-config进行配置
3214
+ print("\n建议运行以下命令进行配置:")
3215
+ print(f" vdb-config -i")
3216
+ print("\n或者使用命令行配置:")
3217
+ print(f" vdb-config --set /repository/user/main/public/root={cache_dir}")
3218
+ print(" vdb-config --set /repository/user/main/public/apps/http/read-only=true")
3219
+
3220
+ # 尝试设置
3221
+ try:
3222
+ subprocess.run(
3223
+ ["vdb-config", "--set", f"/repository/user/main/public/root={cache_dir}"],
3224
+ check=True,
3225
+ capture_output=True,
3226
+ text=True
3227
+ )
3228
+ print("✅ 缓存目录设置成功")
3229
+ except:
3230
+ print("⚠️ 无法自动设置,请手动运行vdb-config")
3231
+
3232
+ return True
3233
+
3234
+ except Exception as e:
3235
+ print(f"❌ 配置失败: {e}")
3236
+ return False
3237
+
3238
+ # 运行安装助手
3239
+ # setup_sra_toolkit()
3240
+ # configure_sra_toolkit()
3241
+
3242
+ def install_fastq_dump_helper():
3243
+ """提供fastq-dump安装帮助"""
3244
+ import platform
3245
+ import sys
3246
+ import subprocess
3247
+ import shutil
3248
+
3249
+ print("🔧 fastq-dump 安装助手")
3250
+ print("=" * 60)
3251
+
3252
+ # 获取系统信息
3253
+ system = platform.system()
3254
+ machine = platform.machine()
3255
+ python_version = sys.version_info
3256
+
3257
+ print(f"操作系统: {system} ({machine})")
3258
+ print(f"Python版本: {sys.version[:20]}")
3259
+
3260
+ # 检查当前状态
3261
+ tools = ['fastq-dump', 'prefetch', 'fasterq-dump']
3262
+ available = {}
3263
+
3264
+ for tool in tools:
3265
+ path = shutil.which(tool)
3266
+ available[tool] = path
3267
+ status = "✅ 已安装" if path else "❌ 未安装"
3268
+ print(f"{tool}: {status}")
3269
+ if path:
3270
+ print(f" 路径: {path}")
3271
+
3272
+ # 尝试获取版本
3273
+ try:
3274
+ result = subprocess.run(
3275
+ [tool, "--version"],
3276
+ capture_output=True,
3277
+ text=True,
3278
+ timeout=5
3279
+ )
3280
+ if result.returncode == 0:
3281
+ version_line = result.stdout.split('\n')[0] if result.stdout else "未知"
3282
+ print(f" 版本: {version_line}")
3283
+ except:
3284
+ pass
3285
+
3286
+ print("\n" + "=" * 60)
3287
+ print("安装指南:")
3288
+
3289
+ if system == "Darwin": # macOS
3290
+ print("""
3291
+ 方法1: 使用Homebrew (推荐)
3292
+ ---------------------------------
3293
+ 1. 安装Homebrew (如果尚未安装):
3294
+ /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
3295
+
3296
+ 2. 安装SRA Toolkit:
3297
+ brew install sratoolkit
3298
+
3299
+ 3. 验证安装:
3300
+ fastq-dump --version
3301
+ prefetch --version
3302
+
3303
+ 方法2: 使用Conda
3304
+ ---------------------------------
3305
+ 1. 安装Miniconda或Anaconda
3306
+ 2. 创建环境并安装:
3307
+ conda create -n sra-tools -c bioconda sra-tools
3308
+ conda activate sra-tools
3309
+ 3. 验证: fastq-dump --version
3310
+
3311
+ 方法3: 手动下载
3312
+ ---------------------------------
3313
+ 1. 访问: https://github.com/ncbi/sra-tools/wiki/Downloads
3314
+ 2. 下载macOS版本 (.dmg或.tar.gz)
3315
+ 3. 解压并添加到PATH:
3316
+ echo 'export PATH=$PATH:/path/to/sratoolkit/bin' >> ~/.zshrc
3317
+ source ~/.zshrc
3318
+ """)
3319
+
3320
+ elif system == "Linux":
3321
+ print("""
3322
+ 方法1: 使用包管理器 (Ubuntu/Debian)
3323
+ ---------------------------------
3324
+ 1. 更新包列表:
3325
+ sudo apt-get update
3326
+
3327
+ 2. 安装SRA Toolkit:
3328
+ sudo apt-get install sra-toolkit
3329
+
3330
+ 3. 验证安装:
3331
+ fastq-dump --version
3332
+
3333
+ 方法2: 使用包管理器 (CentOS/RHEL/Fedora)
3334
+ ---------------------------------
3335
+ 1. 安装EPEL仓库 (CentOS/RHEL):
3336
+ sudo yum install epel-release
3337
+
3338
+ 2. 安装SRA Toolkit:
3339
+ sudo yum install sra-toolkit
3340
+
3341
+ sudo dnf install sra-toolkit (Fedora)
3342
+
3343
+ 3. 验证安装
3344
+
3345
+ 方法3: 使用Conda
3346
+ ---------------------------------
3347
+ 1. 安装Miniconda:
3348
+ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
3349
+ bash Miniconda3-latest-Linux-x86_64.sh
3350
+
3351
+ 2. 安装SRA Toolkit:
3352
+ conda install -c bioconda sra-tools
3353
+
3354
+ 方法4: 手动下载
3355
+ ---------------------------------
3356
+ 1. 访问: https://github.com/ncbi/sra-tools/wiki/Downloads
3357
+ 2. 下载Linux版本 (.tar.gz)
3358
+ 3. 解压并添加到PATH:
3359
+ tar -xzvf sratoolkit.*.tar.gz
3360
+ echo 'export PATH=$PATH:/path/to/sratoolkit/bin' >> ~/.bashrc
3361
+ source ~/.bashrc
3362
+ """)
3363
+
3364
+ elif system == "Windows":
3365
+ print("""
3366
+ 方法1: 使用Chocolatey (推荐)
3367
+ ---------------------------------
3368
+ 1. 安装Chocolatey (如果尚未安装):
3369
+ 以管理员身份打开PowerShell,运行:
3370
+ Set-ExecutionPolicy Bypass -Scope Process -Force
3371
+ [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072
3372
+ iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
3373
+
3374
+ 2. 安装SRA Toolkit:
3375
+ choco install sratoolkit
3376
+
3377
+ 3. 验证安装:
3378
+ fastq-dump --version
3379
+
3380
+ 方法2: 使用Conda
3381
+ ---------------------------------
3382
+ 1. 安装Miniconda: https://docs.conda.io/en/latest/miniconda.html
3383
+ 2. 安装SRA Toolkit:
3384
+ conda install -c bioconda sra-tools
3385
+
3386
+ 方法3: 手动下载
3387
+ ---------------------------------
3388
+ 1. 访问: https://github.com/ncbi/sra-tools/wiki/Downloads
3389
+ 2. 下载Windows版本 (.exe安装程序)
3390
+ 3. 运行安装程序,确保勾选"Add to PATH"
3391
+ """)
3392
+
3393
+ else:
3394
+ print(f"⚠️ 不支持的操作系统: {system}")
3395
+ print("请手动访问: https://github.com/ncbi/sra-tools/wiki/Downloads")
3396
+
3397
+ print("\n" + "=" * 60)
3398
+ print("配置建议:")
3399
+
3400
+ if any(available.values()):
3401
+ print("运行以下命令进行配置:")
3402
+ print(" vdb-config -i (交互式配置)")
3403
+ print("\n或使用命令行配置:")
3404
+ print(" vdb-config --set /repository/user/main/public/root=./ncbi_cache")
3405
+ print(" vdb-config --set /repository/user/main/public/apps/http/read-only=true")
3406
+ else:
3407
+ print("请先安装SRA Toolkit,然后运行上述配置命令")
3408
+
3409
+ return available
3410
+
3411
+ # 运行安装助手
3412
+ # install_fastq_dump_helper()
3413
+
3414
+
3415
+
3416
+
3417
+ if __name__ == "__main__":
3418
+ # Demonstrate basic usage
3419
+ print("BioDataFetcher 演示")
3420
+ print("=" * 50)
3421
+
3422
+ # 创建fetcher实例
3423
+ fetcher = BioDataFetcher(dir_save="./test_cache")
3424
+
3425
+ # 示例1: 自动推断并获取GEO数据
3426
+ print("\n1. 获取GEO数据 (自动推断):")
3427
+ geo_data = fetcher.fetch_data("GSE12345")
3428
+ print(f" 获取到数据: {type(geo_data)}")
3429
+
3430
+ # 示例2: 获取SRA元数据
3431
+ print("\n2. 获取SRA元数据:")
3432
+ sra_meta = fetcher.fetch_data("SRR1635435", data_format="metadata")
3433
+ print(f" 获取到元数据: {type(sra_meta)}")
3434
+
3435
+ # 示例3: 搜索数据集
3436
+ print("\n3. 搜索癌症相关数据集:")
3437
+ search_results = fetcher.list_datasets(search_query="cancer RNA-seq", limit=5)
3438
+ if not search_results.empty:
3439
+ print(f" 找到 {len(search_results)} 个数据集:")
3440
+ for _, row in search_results.iterrows():
3441
+ print(f" {row['accession']}: {row['title'][:50]}...")
3442
+
3443
+ # 示例4: 查看缓存
3444
+ print("\n4. 查看缓存数据:")
3445
+ cached = fetcher.cache_list()
3446
+ if not cached.empty:
3447
+ print(f" 有 {len(cached)} 个缓存数据集")
3448
+
3449
+ # 示例5: 获取统计信息
3450
+ print("\n5. 统计信息:")
3451
+ stats = fetcher.get_statistics()
3452
+ print(f" 总数据集数: {stats['total_datasets']}")
3453
+ print(f" 缓存大小: {stats['total_size_gb']:.2f} GB")