ncbi-blast-dbs 0.0.2 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.ruby-version +1 -0
- data/README.md +12 -8
- data/bin/ncbi-blast-dbs +17 -2
- data/lib/http-ncbi-blast-dbs.rake +84 -0
- data/lib/ncbi-blast-dbs.rake +46 -4
- data/ncbi-blast-dbs.gemspec +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: bcce1b77f891ca2abb26a791de778fb9305d0e8a998f971eff97393e11f6ea87
|
4
|
+
data.tar.gz: 1dc5dc4d9f3d2bf0f6dea3b75bc04f1cefbece636598c941dcf08f9c2affba2a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb6f3996b8e344a7e98af27c6187bc71a57cdf3996ed74d84243355c6c12aa9e122081b31b2b006d995fea04011c698cde49ef692c8b8eb6b45a790b2490d17f
|
7
|
+
data.tar.gz: bc9f835b65e8bb3fd7923d697a9b98096f2d5b5b8e9f7e5c0482f0996aece512397fecc70db78426e93bb91b99a199f7b9cd20bb48c0f46b915e5e580b7f7ef5
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.9
|
data/README.md
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
## Fast download BLAST databases from NCBI
|
2
2
|
|
3
3
|
Database files (volumes) are downloaded in parallel: number of threads to use
|
4
|
-
is determined automatically. MD5 checksum is verified and the database
|
5
|
-
extracted upon download.
|
6
|
-
order.
|
7
|
-
|
4
|
+
is determined automatically. MD5 checksum is verified and the database volume
|
5
|
+
extracted upon download. Database volumes are not downloaded in a particular
|
6
|
+
order. The volumes are updated if a newer version is available on the server,
|
7
|
+
or re-downloaded if corrupt. Aborted downloads are safely resumed.
|
8
8
|
|
9
|
-
|
10
|
-
which is a pure Perl script,
|
11
|
-
to `wget` and `md5sum` and is thus not as universal.
|
9
|
+
`ncbi-blast-dbs` is faster than NCBI's `update_blastdb.pl`. But unlike
|
10
|
+
`update_blastdb.pl`, which is a pure Perl script, `ncbi-blast-dbs` delegates
|
11
|
+
download and checksum verification to `wget` and `md5sum` / `md5` and is thus
|
12
|
+
not as universal.
|
12
13
|
|
13
14
|
### Installation
|
14
15
|
|
@@ -22,7 +23,10 @@ to `wget` and `md5sum` and is thus not as universal.
|
|
22
23
|
|
23
24
|
#### Download all volumes of a BLAST database
|
24
25
|
|
25
|
-
ncbi-blast-dbs nr
|
26
|
+
ncbi-blast-dbs nt nr
|
27
|
+
|
28
|
+
Databases are downloaded one after the other. Volumes of each database are
|
29
|
+
downloaded in parallel. Downloads are placed in the current directory.
|
26
30
|
|
27
31
|
NCBI expects users to submit their email address when downloading data from
|
28
32
|
their FTP server. To comply with that, download as:
|
data/bin/ncbi-blast-dbs
CHANGED
@@ -1,5 +1,20 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require 'rake'
|
4
|
-
|
5
|
-
|
4
|
+
|
5
|
+
# Exit at once on Ctrl-C. `exit!` terminates the process without running
# at_exit handlers.
trap :INT do
  puts "Quitting ..."
  exit!
end

# The literal argument "http" anywhere on the command line selects the
# HTTP(S)-based Rakefile; otherwise the FTP-based one is used. The same name
# is handed to Rake as the application name.
rakefile = ARGV.include?('http') ? 'http-ncbi-blast-dbs' : 'ncbi-blast-dbs'

import "#{File.dirname(__FILE__)}/../lib/#{rakefile}.rake"
Rake.application.init rakefile
Rake.application.load_imports
Rake.application.top_level
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'uri'
|
3
|
+
# Announce which Rakefile has been loaded.
puts "using http-ncbi-blast-dbs.rake"
|
4
|
+
# Downloads tarball at the given URL if a local copy does not exist, or if the
# local copy is older than at the given URL, or if the local copy is corrupt.
# The tarball is then verified against "<url>.md5" and extracted into the
# current directory.
#
# url        - URL of the tarball to fetch.
# last_to_do - URL or file name (or an Array of them) of the volume that is to
#              be extracted in full. Every other volume is extracted without
#              the '*.?al' and 'taxdb*' files so that parallel tar processes
#              do not all unpack the same shared files. Pass nil to always
#              skip those files.
def download(url, last_to_do)
  file = File.basename(url)

  # Compare by file name: `file` is a basename, whereas callers may hand over
  # full URLs, a single String or an Array.
  extract_fully = Array(last_to_do).map { |f| File.basename(f.to_s) }.include?(file)

  # Resume an interrupted download or fetch the file for the first time. If
  # the file on the server is newer, then it is downloaded from start.
  sh "wget -Nc --no-verbose #{url}"
  # If the local copy is already fully retrieved, then the previous command
  # ignores the timestamp. So we check with the server again if the file on
  # the server is newer and if so download the new copy.
  sh "wget -N --no-verbose #{url}"

  # Same two-step fetch for the checksum file.
  sh "wget -Nc --no-verbose #{url}.md5"
  sh "wget -N --no-verbose #{url}.md5"

  # Verify the tarball. Re-download it if corrupt; extract otherwise. The
  # block form of `sh` receives the command's success flag instead of raising.
  sh "md5sum -c #{file}.md5" do |matched, _|
    if !matched
      sh "rm #{file} #{file}.md5"
      download(url, last_to_do)
    elsif extract_fully
      sh "tar xfov #{file}"
    else
      # at least nr and nt tarballs have identical files .?al; unsure of others
      sh "tar xfov #{file} --exclude='*.?al' --exclude='taxdb*'"
    end
  end
end
|
33
|
+
|
34
|
+
|
35
|
+
# Fetches the listing of NCBI's BLAST database directory over HTTPS and
# returns the URL of every database volume, grouped by database name:
#
#   {'nr' => ['https://ftp.ncbi.nlm.nih.gov/blast/db/nr.00.tar.gz', ...], ...}
#
# Raises RuntimeError if the listing cannot be retrieved.
def databases
  host, dir = 'ftp.ncbi.nlm.nih.gov', 'blast/db'
  base_url = "https://#{host}/#{dir}/"

  response = Net::HTTP.get_response(URI.parse(base_url))
  unless response.is_a?(Net::HTTPSuccess)
    raise "Could not list #{base_url}: #{response.code} #{response.message}"
  end

  # The body is split on whitespace, so each link arrives as a token such as:
  #   href="tsa_nt.06.tar.gz">tsa_nt.06.tar.gz</a>
  # from which the link text (tsa_nt.06.tar.gz) is taken. Tokens for the
  # accompanying .md5 files and anything else yield nil and are dropped.
  volumes = response.body.split.map do |token|
    token[/\Ahref="[^"]*">(.+\.tar\.gz)<\/a>\z/, 1]
  end.compact

  # Build the full URL for downstream wget and group volumes by the part of
  # the file name before the first dot (the database name).
  volumes.
    map { |volume| base_url + volume }.
    group_by { |url| File.basename(url).split('.')[0] }
end
|
60
|
+
|
61
|
+
|
62
|
+
# Create user-facing task for each database to drive the download of its
# volumes in parallel.
databases.each do |name, files|
  # `download` compares the volume's file name with its second argument to
  # decide which volume gets extracted in full, so pass a file name here.
  last = File.basename(files.last)
  multitask(name => files.map { |file| task(file) { download(file, last) } })
end

# List name of all databases that can be downloaded if executed without
# any arguments.
task :default do
  puts databases.keys.push('taxdump').join(', ')
end

# Taxonomy dump is a single tarball; it has no "last volume".
task :taxdump do
  download('https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz', nil)
end

# bin/ncbi-blast-dbs selects this Rakefile when "http" is among the command
# line arguments. Rake presumably also sees that argument as a task name, so
# define a task for it that only prints a notice.
task :http do
  puts "using http method"
end
|
data/lib/ncbi-blast-dbs.rake
CHANGED
@@ -1,12 +1,45 @@
|
|
1
1
|
require 'net/ftp'
|
2
2
|
|
3
|
+
# Downloads tarball at the given URL if a local copy does not exist, or if the
# local copy is older than at the given URL, or if the local copy is corrupt.
# The tarball is verified against "<url>.md5" and extracted into the current
# directory.
#
# url - URL of the tarball to fetch.
def download(url)
  file = File.basename(url)

  # Resume an interrupted download or fetch the file for the first time. If
  # the file on the server is newer, then it is downloaded from start.
  sh "wget -Nc --no-verbose #{url}"
  # If the local copy is already fully retrieved, then the previous command
  # ignores the timestamp. So we check with the server again if the file on
  # the server is newer and if so download the new copy.
  sh "wget -N --no-verbose #{url}"

  # Download Md5. -N makes wget refresh an existing checksum file in place;
  # without it a re-run would save the new checksum as "<file>.md5.1" and the
  # stale "<file>.md5" would be used for verification.
  sh "wget -N --no-verbose #{url}.md5"

  # Re-download tarball if corrupt; extract otherwise.
  if md5_matches?(file)
    sh "tar xf #{file}"
  else
    sh "rm #{file} #{file}.md5"
    download(url)
  end
end

# Returns whether the MD5 checksum of `file` agrees with the one recorded in
# "<file>.md5", using whichever of `md5sum` or `md5` is installed. Prints a
# message and exits with status 1 if neither tool is available.
def md5_matches?(file)
  if system("which md5sum > /dev/null")
    system("md5sum -c #{file}.md5")
  elsif system("which md5 > /dev/null")
    expected = File.read("#{file}.md5").split[0]
    %x[md5 -q #{file}].chomp == expected
  else
    puts "Cannot find md5sum or md5. Please install md5sum or md5 and try again"
    exit 1
  end
end
|
9
37
|
|
38
|
+
# Connects to NCBI's FTP server, gets the URL of all database volumes and
|
39
|
+
# returns them grouped by database name:
|
40
|
+
#
|
41
|
+
# {'nr' => ['ftp://...', ...], 'nt' => [...], ...}
|
42
|
+
#
|
10
43
|
def databases
|
11
44
|
host, dir = 'ftp.ncbi.nlm.nih.gov', 'blast/db'
|
12
45
|
usr, pswd = 'anonymous', ENV['email']
|
@@ -20,10 +53,19 @@ def databases
|
|
20
53
|
end
|
21
54
|
end
|
22
55
|
|
56
|
+
# Define one multitask per database. Its prerequisites are per-volume tasks,
# each of which downloads a single volume; multitask runs them in parallel.
databases.each do |db_name, volume_urls|
  volume_tasks = volume_urls.map do |volume_url|
    task(volume_url) { download(volume_url) }
  end
  multitask(db_name => volume_tasks)
end

# The taxonomy dump lives outside the BLAST database directory, so it gets a
# task of its own.
task :taxdump do
  download('ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz')
end

# With no arguments, print the names of everything that can be downloaded.
task :default do
  names = databases.keys + ['taxdump']
  puts names.join(', ')
end
|
data/ncbi-blast-dbs.gemspec
CHANGED
@@ -2,7 +2,7 @@ Gem::Specification.new do |s|
|
|
2
2
|
s.authors = ['Anurag Priyam']
|
3
3
|
s.email = ['anurag08priyam@gmail.com']
|
4
4
|
s.name = 'ncbi-blast-dbs'
|
5
|
-
s.version = '0.0.2'
|
5
|
+
s.version = '0.0.7'
|
6
6
|
s.summary = 'Fast download BLAST databases from NCBI.'
|
7
7
|
s.description = <<DESC
|
8
8
|
Downloads BLAST databases from NCBI. Database files (volumes) are downloaded in
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ncbi-blast-dbs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Anurag Priyam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-06-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -41,10 +41,12 @@ executables:
|
|
41
41
|
extensions: []
|
42
42
|
extra_rdoc_files: []
|
43
43
|
files:
|
44
|
+
- ".ruby-version"
|
44
45
|
- Gemfile
|
45
46
|
- LICENSE.txt
|
46
47
|
- README.md
|
47
48
|
- bin/ncbi-blast-dbs
|
49
|
+
- lib/http-ncbi-blast-dbs.rake
|
48
50
|
- lib/ncbi-blast-dbs.rake
|
49
51
|
- ncbi-blast-dbs.gemspec
|
50
52
|
homepage: http://github.com/yeban/ncbi-blast-dbs
|
@@ -66,10 +68,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
66
68
|
- !ruby/object:Gem::Version
|
67
69
|
version: '0'
|
68
70
|
requirements: []
|
69
|
-
|
70
|
-
rubygems_version: 2.2.5
|
71
|
+
rubygems_version: 3.0.3
|
71
72
|
signing_key:
|
72
73
|
specification_version: 4
|
73
74
|
summary: Fast download BLAST databases from NCBI.
|
74
75
|
test_files: []
|
75
|
-
has_rdoc:
|