dratools 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +120 -0
- data/bin/dratools +8 -0
- data/docs/design.md +80 -0
- data/docs/development.md +39 -0
- data/docs/environment.md +71 -0
- data/docs/usage.md +289 -0
- data/lib/dratools/accession_input_collector.rb +53 -0
- data/lib/dratools/accession_resolver.rb +104 -0
- data/lib/dratools/accession_resource_type_classifier.rb +34 -0
- data/lib/dratools/byte_formatter.rb +25 -0
- data/lib/dratools/checksum_verifier.rb +34 -0
- data/lib/dratools/command_line_interface.rb +138 -0
- data/lib/dratools/commands/base_command.rb +189 -0
- data/lib/dratools/commands/get_command.rb +87 -0
- data/lib/dratools/commands/meta_command.rb +123 -0
- data/lib/dratools/commands/probe_command.rb +55 -0
- data/lib/dratools/commands/runs_command.rb +70 -0
- data/lib/dratools/commands/size_command.rb +163 -0
- data/lib/dratools/commands/tree_command.rb +45 -0
- data/lib/dratools/commands/url_command.rb +118 -0
- data/lib/dratools/config.rb +114 -0
- data/lib/dratools/ddbj_record_fields.rb +56 -0
- data/lib/dratools/ddbj_resource_client.rb +78 -0
- data/lib/dratools/download_candidate.rb +45 -0
- data/lib/dratools/download_candidate_builder.rb +90 -0
- data/lib/dratools/download_service.rb +221 -0
- data/lib/dratools/errors.rb +39 -0
- data/lib/dratools/external_command_runner.rb +115 -0
- data/lib/dratools/run_record_collector.rb +198 -0
- data/lib/dratools/traversal_node.rb +68 -0
- data/lib/dratools/tree_renderer.rb +83 -0
- data/lib/dratools/version.rb +6 -0
- data/lib/dratools.rb +19 -0
- metadata +76 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Dratools
|
|
4
|
+
# Collects the resource type names and JSON key names used in DDBJ Search
# resource records.
module DdbjRecordFields
  # --- resource types ---
  SRA_RUN_RESOURCE_TYPE = 'sra-run'
  SRA_EXPERIMENT_RESOURCE_TYPE = 'sra-experiment'
  SRA_SAMPLE_RESOURCE_TYPE = 'sra-sample'
  SRA_STUDY_RESOURCE_TYPE = 'sra-study'
  SRA_SUBMISSION_RESOURCE_TYPE = 'sra-submission'
  BIOPROJECT_RESOURCE_TYPE = 'bioproject'
  BIOSAMPLE_RESOURCE_TYPE = 'biosample'

  # --- file types ---
  FILE_TYPE_SRA = 'sra'
  FILE_TYPE_FASTQ = 'fastq'
  FILE_TYPE_ALL = 'all'

  # --- JSON keys ---
  DB_XREFS_KEY = 'dbXrefs'
  CHILD_BIOPROJECTS_KEY = 'childBioProjects'
  TYPE_KEY = 'type'
  URL_KEY = 'url'
  FTP_URL_KEY = 'ftpUrl'
  ID_KEY = 'id'
  IDENTIFIER_KEY = 'identifier'
  ACCESSION_KEY = 'accession'
  PRIMARY_ID_KEY = 'primaryId'
  DOWNLOAD_URL_KEY = 'downloadUrl'
  DISTRIBUTION_KEY = 'distribution'
  CONTENT_URL_KEY = 'contentUrl'
  CONTENT_SIZE_KEY = 'contentSize'
  SIZE_KEY = 'size'
  FILE_SIZE_KEY = 'fileSize'
  MD5_KEY = 'md5'
  MD5_SUM_KEY = 'md5sum'
  ENCODING_FORMAT_KEY = 'encodingFormat'

  # Ordered list of record keys.
  # NOTE(review): presumably the fields shown when printing record info -
  # confirm against the callers.
  INFO_FIELD_KEYS = [
    IDENTIFIER_KEY, TYPE_KEY,
    'title', 'description',
    'organism', 'platform', 'instrumentModel',
    'libraryStrategy', 'librarySource', 'librarySelection', 'libraryLayout', 'libraryName',
    'dateCreated', 'dateModified', 'datePublished',
    'status'
  ].freeze
end
|
|
56
|
+
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'net/http'
|
|
5
|
+
require 'timeout'
|
|
6
|
+
require 'uri'
|
|
7
|
+
|
|
8
|
+
require_relative 'errors'
|
|
9
|
+
require_relative 'version'
|
|
10
|
+
|
|
11
|
+
module Dratools
|
|
12
|
+
# Thin HTTP client that calls the DDBJ resource API.
class DdbjResourceClient
  DDBJ_RESOURCE_BASE_URL = 'https://ddbj.nig.ac.jp/resource'
  RESOURCE_RECORD_EXTENSION = '.json'
  HTTPS_SCHEME = 'https'
  HTTP_LOCATION_HEADER = 'location'
  USER_AGENT_HEADER = 'User-Agent'
  DEFAULT_REDIRECT_LIMIT = 5
  DEFAULT_OPEN_TIMEOUT_SECONDS = 10
  DEFAULT_READ_TIMEOUT_SECONDS = 30

  # @param base_url [String] API root; one trailing "/" is stripped
  # @param open_timeout [Numeric] connection timeout in seconds
  # @param read_timeout [Numeric] read timeout in seconds
  def initialize(base_url: DDBJ_RESOURCE_BASE_URL, open_timeout: DEFAULT_OPEN_TIMEOUT_SECONDS,
                 read_timeout: DEFAULT_READ_TIMEOUT_SECONDS)
    @base_url = base_url.delete_suffix('/')
    @open_timeout = open_timeout
    @read_timeout = read_timeout
  end

  # Fetches "<base_url>/<type>/<accession>.json" and returns the parsed JSON.
  #
  # @raise [NotFoundError] on HTTP 404
  # @raise [NetworkError] on any other HTTP status, transport failure,
  #   malformed URL, redirect problem, or unparsable body
  def fetch_resource_record(type, accession)
    fetch_json("#{@base_url}/#{type}/#{accession}#{RESOURCE_RECORD_EXTENSION}")
  end

  private

  # GETs +request_url+, following at most +redirects_remaining+ redirects,
  # and parses the response body as JSON.
  def fetch_json(request_url, redirects_remaining = DEFAULT_REDIRECT_LIMIT)
    request_uri = URI(request_url)
    # Only http(s) URIs respond to #request_uri; reject anything else
    # (e.g. a redirect to ftp://) with a domain error instead of NoMethodError.
    raise NetworkError, "unsupported URL: #{request_url}" unless request_uri.is_a?(URI::HTTP)

    response = get_http_response(request_uri)

    case response
    when Net::HTTPSuccess
      # to_s turns a missing body into "", which surfaces as a ParserError below.
      JSON.parse(response.body.to_s)
    when Net::HTTPRedirection
      raise NetworkError, "too many redirects: #{request_url}" if redirects_remaining <= 0

      location = response[HTTP_LOCATION_HEADER]
      raise NetworkError, "redirect without location: #{request_url}" if location.to_s.empty?

      # URI.join resolves relative Location values against the current URL.
      fetch_json(URI.join(request_uri, location).to_s, redirects_remaining - 1)
    when Net::HTTPNotFound
      raise NotFoundError, "not found: #{request_url}"
    else
      raise NetworkError, "HTTP #{response.code}: #{request_url}"
    end
  rescue JSON::ParserError => error
    raise NetworkError, "invalid JSON from #{request_url}: #{error.message}", cause: error
  rescue Timeout::Error, IOError, SocketError, SystemCallError, URI::InvalidURIError => error
    # URI::InvalidURIError covers both a malformed request URL and a malformed
    # Location header.
    message = "failed to fetch #{request_url}: #{error.class}: #{error.message}"
    raise NetworkError, message, cause: error
  end

  # Performs one GET (no redirect handling) and returns the Net::HTTPResponse.
  def get_http_response(request_uri)
    Net::HTTP.start(
      request_uri.host,
      request_uri.port,
      use_ssl: request_uri.scheme == HTTPS_SCHEME,
      open_timeout: @open_timeout,
      read_timeout: @read_timeout
    ) do |http|
      http.get(request_uri.request_uri, USER_AGENT_HEADER => user_agent)
    end
  end

  # User-Agent value built from the NAME and VERSION constants.
  def user_agent
    "#{NAME}/#{VERSION}"
  end
end
|
|
78
|
+
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'uri'
|
|
4
|
+
|
|
5
|
+
require_relative 'errors'
|
|
6
|
+
|
|
7
|
+
module Dratools
|
|
8
|
+
# Value object representing a single download candidate.
class DownloadCandidate
  FTP_PROTOCOL = 'ftp'
  HTTPS_PROTOCOL = 'https'
  HTTP_PROTOCOL = 'http'
  HTTP_BASED_PROTOCOLS = [HTTPS_PROTOCOL, HTTP_PROTOCOL].freeze

  attr_reader :run_accession, :type, :url, :ftp_url, :size, :md5

  def initialize(type:, run_accession: nil, url: nil, ftp_url: nil, size: nil, md5: nil)
    @run_accession, @type = run_accession, type
    @url, @ftp_url = url, ftp_url
    @size, @md5 = size, md5
  end

  # Picks the URL for +protocol+ (String or Symbol), falling back to the
  # other URL when the preferred one is nil.
  #
  # @raise [InvalidProtocolError] when +protocol+ is not ftp, https or http
  def url_for_protocol(protocol)
    protocol_name = protocol.to_s
    return ftp_url || url if protocol_name == FTP_PROTOCOL
    return url || ftp_url if HTTP_BASED_PROTOCOLS.include?(protocol_name)

    raise InvalidProtocolError, "unknown protocol: #{protocol}"
  end

  # Last path segment of the URL chosen for +protocol+.
  def filename_for_protocol(protocol = HTTPS_PROTOCOL)
    File.basename(path_for_protocol(protocol))
  end

  # True when the path of the URL chosen for +protocol+ ends with "/".
  def directory_url_for_protocol?(protocol = HTTPS_PROTOCOL)
    path_for_protocol(protocol).end_with?('/')
  end

  private

  # Path component of the URL chosen for +protocol+.
  def path_for_protocol(protocol)
    chosen_url = url_for_protocol(protocol)
    URI(chosen_url).path
  end
end
|
|
45
|
+
end
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'ddbj_record_fields'
|
|
4
|
+
require_relative 'download_candidate'
|
|
5
|
+
|
|
6
|
+
module Dratools
|
|
7
|
+
# Builds DownloadCandidate objects from the downloadUrl/distribution entries
# of a DDBJ run record.
class DownloadCandidateBuilder
  # Returns the candidates found in +run_record+, de-duplicated by
  # (type, url, ftp_url).
  #
  # @param run_record [Hash] parsed run record with String keys
  # @return [Array<DownloadCandidate>] possibly empty
  def build_from_run_record(run_record)
    run_accession = run_accession_from(run_record)
    candidates = download_items_from(run_record).filter_map do |download_item|
      build_from_download_item(run_accession, download_item)
    end
    candidates.uniq { |candidate| download_key(candidate) }
  end

  private

  # First non-nil value among the accession-like keys.
  def run_accession_from(run_record)
    run_record[DdbjRecordFields::ACCESSION_KEY] ||
      run_record[DdbjRecordFields::IDENTIFIER_KEY] ||
      run_record[DdbjRecordFields::ID_KEY] ||
      run_record[DdbjRecordFields::PRIMARY_ID_KEY]
  end

  # downloadUrl items followed by distribution items.
  def download_items_from(ddbj_record)
    [DdbjRecordFields::DOWNLOAD_URL_KEY, DdbjRecordFields::DISTRIBUTION_KEY].flat_map do |key|
      item_list(ddbj_record[key])
    end
  end

  # Normalizes a record value to a list of items. A key that is present but
  # null (or holds a scalar) yields no items instead of raising, and a lone
  # object is treated as a one-item list.
  def item_list(value)
    case value
    when Array then value
    when Hash then [value]
    else []
    end
  end

  # Items carrying a contentUrl are handled as distribution entries, everything
  # else as downloadUrl entries; non-Hash items are ignored.
  def build_from_download_item(run_accession, download_item)
    return unless download_item.is_a?(Hash)

    if download_item[DdbjRecordFields::CONTENT_URL_KEY]
      build_from_distribution_item(run_accession, download_item)
    else
      build_from_download_url_item(run_accession, download_item)
    end
  end

  def build_from_distribution_item(run_accession, download_item)
    file_type = file_type_from_distribution(download_item)
    return unless file_type

    DownloadCandidate.new(
      run_accession: run_accession,
      type: file_type,
      url: download_item[DdbjRecordFields::CONTENT_URL_KEY],
      ftp_url: nil,
      size: download_item[DdbjRecordFields::CONTENT_SIZE_KEY],
      md5: download_item[DdbjRecordFields::MD5_KEY] || download_item[DdbjRecordFields::MD5_SUM_KEY]
    )
  end

  def build_from_download_url_item(run_accession, download_item)
    file_type = file_type_from_download_url(download_item)
    return unless file_type

    DownloadCandidate.new(
      run_accession: run_accession,
      type: file_type,
      url: download_item[DdbjRecordFields::URL_KEY],
      ftp_url: download_item[DdbjRecordFields::FTP_URL_KEY],
      size: download_item[DdbjRecordFields::SIZE_KEY] || download_item[DdbjRecordFields::FILE_SIZE_KEY],
      md5: download_item[DdbjRecordFields::MD5_KEY] || download_item[DdbjRecordFields::MD5_SUM_KEY]
    )
  end

  def file_type_from_distribution(download_item)
    file_type_from(download_item[DdbjRecordFields::ENCODING_FORMAT_KEY])
  end

  def file_type_from_download_url(download_item)
    file_type_from(download_item[DdbjRecordFields::TYPE_KEY])
  end

  # Case-insensitively maps +value+ to the sra/fastq file type; nil for
  # anything else, which makes the caller drop the item.
  def file_type_from(value)
    case value.to_s.downcase
    when DdbjRecordFields::FILE_TYPE_SRA
      DdbjRecordFields::FILE_TYPE_SRA
    when DdbjRecordFields::FILE_TYPE_FASTQ
      DdbjRecordFields::FILE_TYPE_FASTQ
    end
  end

  # Identity used for de-duplication.
  def download_key(download)
    [download.type, download.url, download.ftp_url]
  end
end
|
|
90
|
+
end
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi/escape'
|
|
4
|
+
require 'fileutils'
|
|
5
|
+
require 'net/http'
|
|
6
|
+
require 'timeout'
|
|
7
|
+
require 'uri'
|
|
8
|
+
|
|
9
|
+
require_relative 'download_candidate'
|
|
10
|
+
require_relative 'external_command_runner'
|
|
11
|
+
require_relative 'checksum_verifier'
|
|
12
|
+
require_relative 'errors'
|
|
13
|
+
require_relative 'version'
|
|
14
|
+
|
|
15
|
+
module Dratools
|
|
16
|
+
# Handles fetching resolved downloads, probing them, md5 verification and
# the decision whether an existing local file can be kept.
class DownloadService
  # Result of #save_download: the local path and whether the transfer was skipped.
  DownloadResult = Struct.new(:path, :skipped, keyword_init: true) do
    def skipped?
      skipped
    end

    def to_s
      path
    end
  end

  DEFAULT_OUTPUT_DIRECTORY = '.'
  DEFAULT_PROTOCOL = DownloadCandidate::HTTPS_PROTOCOL
  DEFAULT_PROBE_TIMEOUT_SECONDS = 5
  DEFAULT_SIZE_TIMEOUT_SECONDS = 10
  DEFAULT_REDIRECT_LIMIT = 5
  HTTP_LOCATION_HEADER = 'location'
  HTTPS_SCHEME = 'https'
  USER_AGENT_HEADER = 'User-Agent'
  FASTQ_HREF_PATTERN = /href=(?<quote>["'])(?<href>[^"']*\.fastq[^"']*)\k<quote>/i

  # @param runner [#probe_url, #download_url] performs the actual transfers
  # @param checksum_verifier [#verify_md5!, #md5_matches?]
  def initialize(runner: ExternalCommandRunner.new, checksum_verifier: ChecksumVerifier.new)
    @runner = runner
    @checksum_verifier = checksum_verifier
  end

  # Delegates to the runner's probe for the URL chosen for +protocol+.
  def probe_download(download, protocol: DEFAULT_PROTOCOL, timeout: DEFAULT_PROBE_TIMEOUT_SECONDS)
    @runner.probe_url(download.url_for_protocol(protocol), timeout: timeout)
  end

  # Returns an array of Content-Length values (Integer or nil).
  # For a directory URL there is one entry per linked FASTQ file; otherwise a
  # single entry. Any Dratools error, a non-HTTP URL or a missing URL yields [nil].
  def content_lengths(download, protocol: DEFAULT_PROTOCOL, timeout: DEFAULT_SIZE_TIMEOUT_SECONDS)
    download_url = download.url_for_protocol(protocol)
    return [nil] unless http_url?(download_url)

    if download.directory_url_for_protocol?(protocol)
      file_urls = directory_file_urls(download_url, timeout: timeout)
      return [nil] if file_urls.empty?

      return file_urls.map { |file_url| safe_head_content_length(file_url, timeout: timeout) }
    end

    [safe_head_content_length(download_url, timeout: timeout)]
  rescue Error
    [nil]
  end

  # Downloads +download+ into +outdir+ and returns a DownloadResult.
  #
  # @param verify [Boolean] verify md5 after download when the candidate has one
  # @param force [Boolean] delete an existing output file first
  # @param skip_existing [Boolean] keep any existing output file without checks
  # @raise [InvalidRecordError] when the candidate has no URL, the URL points
  #   to a directory, or an existing file is larger than the remote file
  def save_download(
    download,
    outdir: DEFAULT_OUTPUT_DIRECTORY,
    protocol: DEFAULT_PROTOCOL,
    verify: true,
    force: false,
    skip_existing: false
  )
    FileUtils.mkdir_p(outdir)
    download_url = download.url_for_protocol(protocol)
    # Without this guard the URI() call below fails with a bare ArgumentError.
    raise InvalidRecordError, "download has no URL for protocol: #{protocol}" if download_url.to_s.empty?

    if download.directory_url_for_protocol?(protocol)
      raise InvalidRecordError, "download URL points to a directory: #{download_url}"
    end

    output_path = File.join(outdir, download.filename_for_protocol(protocol))
    FileUtils.rm_f(output_path) if force && File.file?(output_path)
    if should_skip_existing?(
      output_path,
      download,
      download_url: download_url,
      skip_existing: skip_existing
    )
      return DownloadResult.new(path: output_path, skipped: true)
    end

    @runner.download_url(download_url, output_path)
    if verify && checksum_available?(download)
      @checksum_verifier.verify_md5!(output_path, download.md5)
    end
    DownloadResult.new(path: output_path, skipped: false)
  end

  private

  # Decides whether an existing file makes the download unnecessary:
  # explicit skip wins, then an md5 comparison, then a size comparison.
  def should_skip_existing?(output_path, download, download_url:, skip_existing:)
    return false unless File.file?(output_path)
    return true if skip_existing
    if checksum_available?(download)
      return @checksum_verifier.md5_matches?(output_path, download.md5)
    end

    existing_file_complete?(output_path, download_url)
  end

  # Compares the local size with the remote Content-Length.
  # Unknown remote size or a shorter local file means "not complete".
  def existing_file_complete?(output_path, download_url)
    remote_size = safe_head_content_length(download_url, timeout: DEFAULT_SIZE_TIMEOUT_SECONDS)
    return false unless remote_size

    local_size = File.size(output_path)
    return true if local_size == remote_size
    return false if local_size < remote_size

    raise InvalidRecordError,
          "existing file is larger than remote file: #{output_path} " \
          "(local=#{local_size}, remote=#{remote_size}); use --force to re-download"
  end

  def checksum_available?(download)
    !download.md5.to_s.strip.empty?
  end

  # True when +url+ parses and has an http/https scheme.
  def http_url?(url)
    DownloadCandidate::HTTP_BASED_PROTOCOLS.include?(URI(url).scheme)
  rescue ArgumentError, TypeError, URI::InvalidURIError
    # Kernel#URI raises ArgumentError for nil and other non-String input.
    false
  end

  # GETs a directory listing and returns the absolute URLs of every href
  # matching FASTQ_HREF_PATTERN, following redirects.
  def directory_file_urls(directory_url, timeout:, redirects_remaining: DEFAULT_REDIRECT_LIMIT)
    request_uri = http_uri_for(directory_url)
    response = get_http_response(request_uri, timeout: timeout)

    case response
    when Net::HTTPSuccess
      response.body.scan(FASTQ_HREF_PATTERN).map do |match|
        # scan yields [quote, href] for the two named groups.
        href = CGI.unescapeHTML(match.last)
        URI.join(request_uri, href).to_s
      end.uniq
    when Net::HTTPRedirection
      directory_file_urls(
        redirect_target(response, request_uri, directory_url, redirects_remaining),
        timeout: timeout,
        redirects_remaining: redirects_remaining - 1
      )
    else
      raise NetworkError, "HTTP #{response.code}: #{directory_url}"
    end
  rescue Timeout::Error, IOError, SocketError, SystemCallError, URI::InvalidURIError => error
    raise NetworkError, fetch_failure_message(directory_url, error), cause: error
  end

  # HEADs +request_url+ (following redirects) and returns its Content-Length,
  # or nil when the header is absent.
  def head_content_length(request_url, timeout:, redirects_remaining: DEFAULT_REDIRECT_LIMIT)
    request_uri = http_uri_for(request_url)
    response = head_http_response(request_uri, timeout: timeout)

    case response
    when Net::HTTPSuccess
      response.content_length
    when Net::HTTPRedirection
      head_content_length(
        redirect_target(response, request_uri, request_url, redirects_remaining),
        timeout: timeout,
        redirects_remaining: redirects_remaining - 1
      )
    else
      raise NetworkError, "HTTP #{response.code}: #{request_url}"
    end
  rescue Timeout::Error, IOError, SocketError, SystemCallError, URI::InvalidURIError => error
    raise NetworkError, fetch_failure_message(request_url, error), cause: error
  end

  # Like #head_content_length but returns nil on any Dratools error.
  def safe_head_content_length(request_url, timeout:)
    head_content_length(request_url, timeout: timeout)
  rescue Error
    nil
  end

  # Parses +url+ and insists on an http(s) URI. Other schemes (for example an
  # ftp:// download URL) have no #request_uri and would otherwise fail with a
  # NoMethodError that safe_head_content_length does not catch.
  def http_uri_for(url)
    uri = URI(url.to_s)
    return uri if uri.is_a?(URI::HTTP)

    raise NetworkError, "unsupported URL for HTTP request: #{url}"
  end

  # Validates a redirect response and returns the absolute URL to follow.
  def redirect_target(response, request_uri, request_url, redirects_remaining)
    raise NetworkError, "too many redirects: #{request_url}" if redirects_remaining <= 0

    location = response[HTTP_LOCATION_HEADER]
    raise NetworkError, "redirect without location: #{request_url}" if location.to_s.empty?

    URI.join(request_uri, location).to_s
  end

  def get_http_response(request_uri, timeout:)
    perform_http_request(:get, request_uri, timeout: timeout)
  end

  def head_http_response(request_uri, timeout:)
    perform_http_request(:head, request_uri, timeout: timeout)
  end

  # Opens a connection and issues a single +http_method+ (:get or :head)
  # request; +timeout+ is used for both the open and the read timeout.
  def perform_http_request(http_method, request_uri, timeout:)
    headers = { USER_AGENT_HEADER => user_agent }
    Net::HTTP.start(
      request_uri.host,
      request_uri.port,
      use_ssl: request_uri.scheme == HTTPS_SCHEME,
      open_timeout: timeout,
      read_timeout: timeout
    ) do |http|
      http.public_send(http_method, request_uri.request_uri, headers)
    end
  end

  # User-Agent value built from the NAME and VERSION constants.
  def user_agent
    "#{NAME}/#{VERSION}"
  end

  def fetch_failure_message(url, error)
    "failed to fetch #{url}: #{error.class}: #{error.message}"
  end
end
|
|
221
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Dratools
  # Base class for the error classes defined in this module.
  class Error < StandardError; end

  # Input-related errors.
  class InputError < Error; end
  class MissingAccessionError < InputError; end
  class InputFileError < InputError; end

  # Errors deriving directly from Error.
  class InvalidOptionError < Error; end
  class InvalidProtocolError < Error; end
  class NotFoundError < Error; end
  class InvalidRecordError < Error; end
  class NetworkError < Error; end
  class CommandError < Error; end
  class ChecksumError < Error; end
  class UnsupportedAccessionError < Error; end
end
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'open3'
|
|
4
|
+
require 'shellwords'
|
|
5
|
+
require 'English'
|
|
6
|
+
|
|
7
|
+
require_relative 'config'
|
|
8
|
+
require_relative 'errors'
|
|
9
|
+
|
|
10
|
+
module Dratools
|
|
11
|
+
# Wrapper that uses curl or wget to check URLs and to download them.
#
# A probe is kept short and silent, while a download streams the external
# command's progress to the terminal. Because very large files are handled,
# downloads rely on stall detection rather than a total time limit.
class ExternalCommandRunner
  CURL_COMMAND = 'curl'
  WGET_COMMAND = 'wget'
  SUPPORTED_COMMANDS = [CURL_COMMAND, WGET_COMMAND].freeze
  COMMAND_NOT_FOUND_MESSAGE = 'curl または wget が見つかりません'
  DEFAULT_PROBE_TIMEOUT_SECONDS = 5
  PROBE_BYTE_RANGE = '0-0'
  SINGLE_ATTEMPT_COUNT = 1

  CURL_PROBE_OPTIONS = ['--location', '--fail', '--silent', '--show-error', '--range'].freeze
  CURL_TIMEOUT_OPTION = '--max-time'
  CURL_CONNECT_TIMEOUT_OPTION = '--connect-timeout'
  CURL_SPEED_LIMIT_OPTION = '--speed-limit'
  CURL_SPEED_TIME_OPTION = '--speed-time'
  CURL_RETRY_OPTION = '--retry'
  CURL_OUTPUT_OPTION = '--output'
  CURL_DOWNLOAD_OPTIONS = ['--location', '--fail', '--continue-at', '-'].freeze

  WGET_PROBE_OPTIONS = ['--spider'].freeze
  WGET_TIMEOUT_OPTION = '--timeout'
  WGET_CONNECT_TIMEOUT_OPTION = '--connect-timeout'
  WGET_READ_TIMEOUT_OPTION = '--read-timeout'
  WGET_TRIES_OPTION = '--tries'
  WGET_WAITRETRY_OPTION = '--waitretry'
  WGET_CONTINUE_OPTION = '--continue'
  WGET_OUTPUT_OPTION = '--output-document'

  # @param preferred [String, nil] command tried before curl and wget; either
  #   a bare name looked up on PATH or a path to an executable
  def initialize(preferred: nil)
    @preferred = preferred
  end

  # Returns the first usable command among the preferred one, curl and wget,
  # or nil when none is executable.
  def available_command
    candidates = [@preferred, *SUPPORTED_COMMANDS].compact
    candidates.find { |command_name| executable_command?(command_name) }
  end

  # Checks that +url+ is reachable. Returns true on success.
  #
  # @raise [CommandError] when no command is available or the probe fails
  def probe_url(url, timeout: DEFAULT_PROBE_TIMEOUT_SECONDS)
    tool = available_command || raise(CommandError, COMMAND_NOT_FOUND_MESSAGE)
    # Keep the check short and limited to a minimal byte range so that a huge
    # file is never actually downloaded.
    command =
      if File.basename(tool) == CURL_COMMAND
        [tool, *CURL_PROBE_OPTIONS, PROBE_BYTE_RANGE, CURL_TIMEOUT_OPTION, timeout.to_s,
         CURL_OUTPUT_OPTION, null_device, url]
      else
        [tool, *WGET_PROBE_OPTIONS, "#{WGET_TIMEOUT_OPTION}=#{timeout}",
         "#{WGET_TRIES_OPTION}=#{SINGLE_ATTEMPT_COUNT}", url]
      end
    run_quietly(command)
  end

  # Downloads +url+ to +output_path+, resuming a partial file, with timeouts
  # and retry settings read from Config. Returns true on success.
  #
  # @raise [CommandError] when no command is available or the download fails
  def download_url(url, output_path)
    tool = available_command || raise(CommandError, COMMAND_NOT_FOUND_MESSAGE)
    command =
      if File.basename(tool) == CURL_COMMAND
        [tool, *CURL_DOWNLOAD_OPTIONS,
         CURL_CONNECT_TIMEOUT_OPTION, Config.download_connect_timeout_seconds.to_s,
         CURL_SPEED_LIMIT_OPTION, Config.download_stall_speed_bytes_per_second.to_s,
         CURL_SPEED_TIME_OPTION, Config.download_stall_timeout_seconds.to_s,
         CURL_RETRY_OPTION, Config.download_retry_count.to_s,
         CURL_OUTPUT_OPTION, output_path, url]
      else
        [tool, WGET_CONTINUE_OPTION,
         "#{WGET_CONNECT_TIMEOUT_OPTION}=#{Config.download_connect_timeout_seconds}",
         "#{WGET_READ_TIMEOUT_OPTION}=#{Config.download_stall_timeout_seconds}",
         "#{WGET_TRIES_OPTION}=#{Config.download_retry_count}",
         "#{WGET_WAITRETRY_OPTION}=#{Config.download_retry_wait_seconds}",
         WGET_OUTPUT_OPTION, output_path, url]
      end
    run_streaming(command)
  end

  private

  # Runs +command+ capturing its output; the captured stdout/stderr is only
  # used in the error message.
  def run_quietly(command)
    out, err, status = Open3.capture3(*command)
    return true if status.success?

    raise CommandError, "#{command.shelljoin}\n#{out}#{err}"
  end

  def run_streaming(command)
    # Passing the command as an array bypasses the shell, and the stderr
    # progress output of curl/wget is shown as-is.
    return true if system(*command)

    status = $CHILD_STATUS
    detail = status ? "exit status: #{status.exitstatus}" : 'command failed'
    raise CommandError, "#{command.shelljoin}\n#{detail}"
  end

  # True when +command_name+ refers to an executable file. A value containing
  # a directory part is checked directly; joining it onto each PATH entry can
  # never match, which would silently ignore a preferred command given as a
  # path. A bare name is searched for on PATH.
  def executable_command?(command_name)
    return executable_file?(command_name) if File.basename(command_name) != command_name

    ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).any? do |directory|
      executable_file?(File.join(directory, command_name))
    end
  end

  def executable_file?(command_path)
    File.file?(command_path) && File.executable?(command_path)
  end

  def null_device
    File::NULL
  end
end
|
|
115
|
+
end
|