ncbi-blast-dbs 0.0.2 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.ruby-version +1 -0
- data/README.md +12 -8
- data/bin/ncbi-blast-dbs +17 -2
- data/lib/http-ncbi-blast-dbs.rake +84 -0
- data/lib/ncbi-blast-dbs.rake +46 -4
- data/ncbi-blast-dbs.gemspec +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: bcce1b77f891ca2abb26a791de778fb9305d0e8a998f971eff97393e11f6ea87
|
4
|
+
data.tar.gz: 1dc5dc4d9f3d2bf0f6dea3b75bc04f1cefbece636598c941dcf08f9c2affba2a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb6f3996b8e344a7e98af27c6187bc71a57cdf3996ed74d84243355c6c12aa9e122081b31b2b006d995fea04011c698cde49ef692c8b8eb6b45a790b2490d17f
|
7
|
+
data.tar.gz: bc9f835b65e8bb3fd7923d697a9b98096f2d5b5b8e9f7e5c0482f0996aece512397fecc70db78426e93bb91b99a199f7b9cd20bb48c0f46b915e5e580b7f7ef5
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.9
|
data/README.md
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
## Fast download BLAST databases from NCBI
|
2
2
|
|
3
3
|
Database files (volumes) are downloaded in parallel: number of threads to use
|
4
|
-
is determined automatically. MD5 checksum is verified and the database
|
5
|
-
extracted upon download.
|
6
|
-
order.
|
7
|
-
|
4
|
+
is determined automatically. MD5 checksum is verified and the database volume
|
5
|
+
extracted upon download. Database volumes are not downloaded in a particular
|
6
|
+
order. The volumes are updated if a newer version is available on the server,
|
7
|
+
or re-downloaded if corrupt. Aborted downloads are safely resumed.
|
8
8
|
|
9
|
-
|
10
|
-
which is a pure Perl script,
|
11
|
-
to `wget` and `md5sum` and is thus
|
9
|
+
`ncbi-blast-dbs` is faster than NCBI's `update_blastdb.pl`. But unlike
|
10
|
+
`update_blastdb.pl`, which is a pure Perl script, `ncbi-blast-dbs` delegates
|
11
|
+
download and checksum verification to `wget` and `md5sum` / `md5` and is thus
|
12
|
+
not as universal.
|
12
13
|
|
13
14
|
### Installation
|
14
15
|
|
@@ -22,7 +23,10 @@ to `wget` and `md5sum` and is thus not as universal.
|
|
22
23
|
|
23
24
|
#### Download all volumes of a BLAST database
|
24
25
|
|
25
|
-
ncbi-blast-dbs nr
|
26
|
+
ncbi-blast-dbs nt nr
|
27
|
+
|
28
|
+
Databases are downloaded one after the other. Volumes of each database are
|
29
|
+
downloaded in parallel. Downloads are placed in the current directory.
|
26
30
|
|
27
31
|
NCBI expects users to submit their email address when downloading data from
|
28
32
|
their FTP server. To comply with that, download as:
|
data/bin/ncbi-blast-dbs
CHANGED
@@ -1,5 +1,20 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require 'rake'
|
4
|
-
|
5
|
-
|
4
|
+
|
5
|
+
# Quit immediately on Ctrl-C. `exit!` terminates the process without running
# at_exit handlers.
trap :INT do
  puts "Quitting ..."
  exit!
end

# `http` on the command line is a mode switch that selects the HTTPS based
# rakefile; it is not the name of a database. Take it out of ARGV so that rake
# does not try to run it as a task.
app_name = ARGV.delete('http') ? 'http-ncbi-blast-dbs' : 'ncbi-blast-dbs'

import "#{File.dirname(__FILE__)}/../lib/#{app_name}.rake"
Rake.application.init app_name
Rake.application.load_imports
Rake.application.top_level
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'uri'
|
3
|
+
# Announce which rakefile is in use (this file: http-ncbi-blast-dbs.rake).
puts "using http-ncbi-blast-dbs.rake"
|
4
|
+
# Downloads tarball at the given URL if a local copy does not exist, or if the
# local copy is older than at the given URL, or if the local copy is corrupt.
# Once the MD5 checksum matches, the tarball is extracted.
#
# url        - Location of the tarball. Its checksum is fetched from
#              "#{url}.md5".
# last_to_do - Name(s) of the volume(s) from which the files common to all
#              volumes of a database (`*.?al`, `taxdb*`) are to be extracted.
#              Accepts a String or an Array, holding full locations or bare
#              file names. With nil (the default) or a name that does not
#              match, the common files are skipped.
def download(url, last_to_do = nil)
  file = File.basename(url)

  # Callers may hand over a single name or a list, and full locations or bare
  # file names. Reduce all of that to basenames before comparing with `file`.
  extract_common = Array(last_to_do).
    map { |name| File.basename(name.to_s) }.
    include?(file)

  # Resume an interrupted download or fetch the file for the first time. If
  # the file on the server is newer, then it is downloaded from start.
  sh "wget -Nc --no-verbose #{url}"
  # If the local copy is already fully retrieved, then the previous command
  # ignores the timestamp. So we check with the server again if the file on
  # the server is newer and if so download the new copy.
  sh "wget -N --no-verbose #{url}"

  # Same two-step fetch for the checksum file.
  sh "wget -Nc --no-verbose #{url}.md5"
  sh "wget -N --no-verbose #{url}.md5"

  # Verify the tarball. Re-download tarball if corrupt; extract otherwise.
  sh "md5sum -c #{file}.md5" do |matched, _|
    if !matched
      sh "rm #{file} #{file}.md5"
      # Retry with the same marker so that the retried volume is still
      # extracted the way the caller asked for.
      download(url, last_to_do)
    elsif extract_common
      # too many tar instances unzipping the same file clutter the system
      sh "tar xfov #{file}"
    else
      # at least nr and nt tarballs have identical files .?al; unsure of others
      sh "tar xfov #{file} --exclude='*.?al' --exclude='taxdb*'"
    end
  end
end
|
33
|
+
|
34
|
+
|
35
|
+
# Fetches the HTML index of NCBI's BLAST database directory over HTTPS and
# returns the location of all database volumes grouped by database name:
#
#   {'nr' => ['ftp.ncbi.nlm.nih.gov/blast/db/nr.00.tar.gz', ...], 'nt' => [...], ...}
#
# The locations are host + path without a scheme; they are handed to wget as
# they are. The database name is the part of the file name before the first
# dot.
def databases
  host, dir = 'ftp.ncbi.nlm.nih.gov', 'blast/db'
  index = Net::HTTP.get_response(URI.parse("https://#{host}/#{dir}/")).body

  # Splitting the page on whitespace isolates tokens such as:
  #   href="tsa_nt.06.tar.gz">tsa_nt.06.tar.gz</a>
  # of which the link text (tsa_nt.06.tar.gz) is kept. Links to anything
  # other than a tarball (e.g. the .md5 files) yield nil and are dropped.
  volumes = index.split.
    map { |token| token[/\Ahref="[^"]*">([^<]*\.tar\.gz)<\/a>\z/, 1] }.
    compact

  volumes.
    map { |volume| File.join(host, dir, volume) }.
    group_by { |location| File.basename(location).split('.')[0] }
end
|
60
|
+
|
61
|
+
|
62
|
+
# Create user-facing task for each database to drive the download of its
# volumes in parallel. The listing is fetched once and shared with the default
# task below.
listing = databases
listing.each do |name, files|
  # Files common to all volumes of a database are extracted from one volume
  # only. `download` compares this marker against the basename of the file it
  # is working on, so pass the basename of the last volume.
  last = File.basename(files.last)
  multitask(name => files.map { |file| task(file) { download(file, last) } })
end

# List name of all databases that can be downloaded if executed without
# any arguments.
task :default do
  puts listing.keys.push('taxdump').join(', ')
end

# Taxonomy dump is a single tarball; there is no last volume to single out.
task :taxdump do
  download('https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz', nil)
end

# The executable selects this rakefile when `http` is given on the command
# line. If that word reaches rake as a task name, this task gives it
# something to run.
task :http do
  puts "using http method"
end
|
data/lib/ncbi-blast-dbs.rake
CHANGED
@@ -1,12 +1,45 @@
|
|
1
1
|
require 'net/ftp'
|
2
2
|
|
3
|
+
# Downloads tarball at the given URL if a local copy does not exist, or if the
# local copy is older than at the given URL, or if the local copy is corrupt.
# Once the MD5 checksum matches, the tarball is extracted.
#
# url - Location of the tarball. Its checksum is fetched from "#{url}.md5".
#
# Exits the process with status 1 if neither `md5sum` nor `md5` is available.
def download(url)
  file = File.basename(url)

  # Resume an interrupted download or fetch the file for the first time. If
  # the file on the server is newer, then it is downloaded from start.
  sh "wget -Nc --no-verbose #{url}"
  # If the local copy is already fully retrieved, then the previous command
  # ignores the timestamp. So we check with the server again if the file on
  # the server is newer and if so download the new copy.
  sh "wget -N --no-verbose #{url}"

  # Download Md5. With -N a checksum file left over from an earlier run is
  # replaced when the server has a newer one. Without it wget would save the
  # new checksum as "#{file}.md5.1" and the stale one would be verified below.
  sh "wget -N --no-verbose #{url}.md5"

  # Verify the tarball using md5sum or md5
  if system("which md5sum > /dev/null")
    matched = system("md5sum -c #{file}.md5")
  elsif system("which md5 > /dev/null")
    # `md5 -q` prints the bare digest; the first field of the .md5 file is
    # the digest to compare against.
    md5_out = %x[md5 -q #{file}].chomp
    md5_actual = File.read("#{file}.md5").split[0]
    matched = md5_out == md5_actual
  else
    puts "Cannot find md5sum or md5. Please install md5sum or md5 and try again"
    exit 1
  end

  # Re-download tarball if corrupt; extract otherwise.
  if matched
    sh "tar xf #{file}"
  else
    sh "rm #{file} #{file}.md5"
    download(url)
  end
end
|
9
37
|
|
38
|
+
# Connects to NCBI's FTP server, gets the URL of all database volumes and
|
39
|
+
# returns them grouped by database name:
|
40
|
+
#
|
41
|
+
# {'nr' => ['ftp://...', ...], 'nt' => [...], ...}
|
42
|
+
#
|
10
43
|
def databases
|
11
44
|
host, dir = 'ftp.ncbi.nlm.nih.gov', 'blast/db'
|
12
45
|
usr, pswd = 'anonymous', ENV['email']
|
@@ -20,10 +53,19 @@ def databases
|
|
20
53
|
end
|
21
54
|
end
|
22
55
|
|
56
|
+
# Create user-facing task for each database to drive the download of its
# volumes in parallel. The listing is fetched once and shared with the default
# task below, instead of asking the server a second time.
listing = databases
listing.each do |name, files|
  multitask(name => files.map { |file| task(file) { download(file) } })
end

# Taxonomy database is different from sequence databases.
task :taxdump do
  download 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
end

# List name of all databases that can be downloaded if executed without
# any arguments.
task :default do
  puts listing.keys.push('taxdump').join(', ')
end
|
data/ncbi-blast-dbs.gemspec
CHANGED
@@ -2,7 +2,7 @@ Gem::Specification.new do |s|
|
|
2
2
|
s.authors = ['Anurag Priyam']
|
3
3
|
s.email = ['anurag08priyam@gmail.com']
|
4
4
|
s.name = 'ncbi-blast-dbs'
|
5
|
-
s.version = '0.0.
|
5
|
+
s.version = '0.0.7'
|
6
6
|
s.summary = 'Fast download BLAST databases from NCBI.'
|
7
7
|
s.description = <<DESC
|
8
8
|
Downloads BLAST databases from NCBI. Database files (volumes) are downloaded in
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ncbi-blast-dbs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Anurag Priyam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-06-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -41,10 +41,12 @@ executables:
|
|
41
41
|
extensions: []
|
42
42
|
extra_rdoc_files: []
|
43
43
|
files:
|
44
|
+
- ".ruby-version"
|
44
45
|
- Gemfile
|
45
46
|
- LICENSE.txt
|
46
47
|
- README.md
|
47
48
|
- bin/ncbi-blast-dbs
|
49
|
+
- lib/http-ncbi-blast-dbs.rake
|
48
50
|
- lib/ncbi-blast-dbs.rake
|
49
51
|
- ncbi-blast-dbs.gemspec
|
50
52
|
homepage: http://github.com/yeban/ncbi-blast-dbs
|
@@ -66,10 +68,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
66
68
|
- !ruby/object:Gem::Version
|
67
69
|
version: '0'
|
68
70
|
requirements: []
|
69
|
-
|
70
|
-
rubygems_version: 2.2.5
|
71
|
+
rubygems_version: 3.0.3
|
71
72
|
signing_key:
|
72
73
|
specification_version: 4
|
73
74
|
summary: Fast download BLAST databases from NCBI.
|
74
75
|
test_files: []
|
75
|
-
has_rdoc:
|