dratools 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +120 -0
  4. data/bin/dratools +8 -0
  5. data/docs/design.md +80 -0
  6. data/docs/development.md +39 -0
  7. data/docs/environment.md +71 -0
  8. data/docs/usage.md +289 -0
  9. data/lib/dratools/accession_input_collector.rb +53 -0
  10. data/lib/dratools/accession_resolver.rb +104 -0
  11. data/lib/dratools/accession_resource_type_classifier.rb +34 -0
  12. data/lib/dratools/byte_formatter.rb +25 -0
  13. data/lib/dratools/checksum_verifier.rb +34 -0
  14. data/lib/dratools/command_line_interface.rb +138 -0
  15. data/lib/dratools/commands/base_command.rb +189 -0
  16. data/lib/dratools/commands/get_command.rb +87 -0
  17. data/lib/dratools/commands/meta_command.rb +123 -0
  18. data/lib/dratools/commands/probe_command.rb +55 -0
  19. data/lib/dratools/commands/runs_command.rb +70 -0
  20. data/lib/dratools/commands/size_command.rb +163 -0
  21. data/lib/dratools/commands/tree_command.rb +45 -0
  22. data/lib/dratools/commands/url_command.rb +118 -0
  23. data/lib/dratools/config.rb +114 -0
  24. data/lib/dratools/ddbj_record_fields.rb +56 -0
  25. data/lib/dratools/ddbj_resource_client.rb +78 -0
  26. data/lib/dratools/download_candidate.rb +45 -0
  27. data/lib/dratools/download_candidate_builder.rb +90 -0
  28. data/lib/dratools/download_service.rb +221 -0
  29. data/lib/dratools/errors.rb +39 -0
  30. data/lib/dratools/external_command_runner.rb +115 -0
  31. data/lib/dratools/run_record_collector.rb +198 -0
  32. data/lib/dratools/traversal_node.rb +68 -0
  33. data/lib/dratools/tree_renderer.rb +83 -0
  34. data/lib/dratools/version.rb +6 -0
  35. data/lib/dratools.rb +19 -0
  36. metadata +76 -0
@@ -0,0 +1,198 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'set'
4
+
5
+ require_relative 'config'
6
+ require_relative 'ddbj_record_fields'
7
+ require_relative 'errors'
8
+ require_relative 'traversal_node'
9
+
module Dratools
  # Collects DDBJ sra-run records reachable from a higher-level record such as a BioProject.
  #
  # Linked records are fetched through the injected client and assembled into a
  # TraversalNode tree. Nodes built from a run record carry that record; every
  # other node only describes the link that was followed.
  class RunRecordCollector
    # Captures (resource type, identifier) from a resource or search-entry URL path.
    XREF_URL_PATTERN = %r{/(?:resource|search/entry)/([^/]+)/([^/?#.]+)}

    # Xref types the traversal follows; xrefs of any other type are skipped.
    TRAVERSABLE_XREF_TYPES = [
      DdbjRecordFields::SRA_RUN_RESOURCE_TYPE,
      DdbjRecordFields::SRA_EXPERIMENT_RESOURCE_TYPE,
      DdbjRecordFields::SRA_SAMPLE_RESOURCE_TYPE,
      DdbjRecordFields::SRA_STUDY_RESOURCE_TYPE,
      DdbjRecordFields::SRA_SUBMISSION_RESOURCE_TYPE,
      DdbjRecordFields::BIOPROJECT_RESOURCE_TYPE,
      DdbjRecordFields::BIOSAMPLE_RESOURCE_TYPE
    ].freeze

    # @param client [#fetch_resource_record] fetches a record given (resource type, identifier)
    def initialize(client:)
      @client = client
    end

    # Returns every run record found at or below +ddbj_record+.
    #
    # @param ddbj_record [Hash] record to start from
    # @param seen_keys [Set] xref keys already visited; extended while traversing
    # @return [Array] run records gathered from the traversal tree
    def collect_run_records(ddbj_record, seen_keys = Set.new)
      explore(ddbj_record, seen_keys: seen_keys).run_records
    end

    # Builds the traversal tree rooted at +ddbj_record+.
    #
    # The search stops at the first level that yields runs:
    # 1. a run record is returned as a leaf;
    # 2. when the record has more direct run xrefs than +direct_run_fetch_limit+,
    #    they become unfetched placeholder nodes;
    # 3. otherwise the direct run xrefs are fetched, and if any produced a run
    #    the traversal ends there;
    # 4. otherwise every traversable xref and every child BioProject is explored
    #    recursively.
    #
    # @param ddbj_record [Hash] record to explore
    # @param seen_keys [Set] xref keys already visited; extended while traversing
    # @param relation [Symbol] how this record was reached (a TraversalNode relation)
    # @param tolerant [Boolean] when true, a failed fetch becomes an errored node
    #   instead of aborting the traversal
    # @param direct_run_fetch_limit [Integer, nil] nil disables the placeholder shortcut
    # @return [TraversalNode]
    # @raise [InvalidRecordError] when the record links to too many non-run records
    def explore(ddbj_record, seen_keys: Set.new, relation: TraversalNode::ROOT_RELATION,
                tolerant: false, direct_run_fetch_limit: nil)
      node = node_from_record(ddbj_record, relation: relation)
      return node if run_record?(ddbj_record)

      xrefs = list_field(ddbj_record, DdbjRecordFields::DB_XREFS_KEY)
      run_xrefs = xrefs.select { |xref| sra_run_xref?(xref) }
      if (lightweight_children = lightweight_direct_run_nodes(run_xrefs, direct_run_fetch_limit))
        node.children.concat(lightweight_children)
        return node
      end

      direct_children = explore_edges(run_xrefs, TraversalNode::DB_XREF_RELATION, seen_keys,
                                      tolerant: tolerant,
                                      direct_run_fetch_limit: direct_run_fetch_limit)
      if direct_children.any? { |child| child.run? || child.run_records.any? }
        node.children.concat(direct_children)
        return node
      end

      # Run xrefs visited above are already in seen_keys, so they are not fetched twice.
      recursive_xrefs = xrefs.select { |xref| traversable_xref?(xref) }
      validate_recursive_non_run_xref_count!(ddbj_record, recursive_xrefs)
      db_xref_edges = explore_edges(
        recursive_xrefs,
        TraversalNode::DB_XREF_RELATION,
        seen_keys,
        tolerant: tolerant,
        direct_run_fetch_limit: direct_run_fetch_limit
      )
      child_edges = explore_edges(
        child_bioprojects(ddbj_record),
        TraversalNode::CHILD_BIOPROJECT_RELATION,
        seen_keys,
        tolerant: tolerant,
        direct_run_fetch_limit: direct_run_fetch_limit
      )
      node.children.concat(db_xref_edges + child_edges)
      node
    end

    private

    # Reads a list-valued field, treating a missing key and an explicit nil alike,
    # so a record carrying a null list does not raise NoMethodError mid-traversal.
    def list_field(ddbj_record, key)
      ddbj_record[key] || []
    end

    # Returns unfetched placeholder nodes for the run xrefs when there are more
    # of them than the limit allows, or nil when the shortcut does not apply.
    def lightweight_direct_run_nodes(run_xrefs, direct_run_fetch_limit)
      return nil unless direct_run_fetch_limit && run_xrefs.length > direct_run_fetch_limit

      run_xrefs.map { |xref| node_from_xref(xref, relation: TraversalNode::DB_XREF_RELATION) }
    end

    # Refuses to expand a record that links to more non-run records than
    # Config.max_recursive_non_run_xrefs; no limit is enforced when that is nil/false.
    def validate_recursive_non_run_xref_count!(ddbj_record, xrefs)
      max_xrefs = Config.max_recursive_non_run_xrefs
      return unless max_xrefs

      non_run_xrefs = xrefs.reject { |xref| sra_run_xref?(xref) }
      return if non_run_xrefs.length <= max_xrefs

      accession = record_accession(ddbj_record) || 'record'
      raise InvalidRecordError,
            "#{accession} has #{non_run_xrefs.length} linked non-run records; " \
            'refine to an experiment/sample accession before run expansion'
    end

    # Child BioProject references of the record (empty when absent or nil).
    def child_bioprojects(ddbj_record)
      list_field(ddbj_record, DdbjRecordFields::CHILD_BIOPROJECTS_KEY)
    end

    # Explores each not-yet-seen, traversable xref and returns the resulting nodes.
    # Xrefs without a usable key are skipped because they cannot be de-duplicated.
    def explore_edges(xrefs, relation, seen_keys, tolerant:, direct_run_fetch_limit:)
      xrefs.each_with_object([]) do |xref, children|
        next unless traversable_xref?(xref)

        reference_key = xref_key(xref)
        next if reference_key.empty? || seen_keys.include?(reference_key)

        # Mark before descending so a cycle back to this xref is not followed again.
        seen_keys.add(reference_key)
        children << explore_xref(
          xref,
          relation,
          seen_keys,
          tolerant: tolerant,
          direct_run_fetch_limit: direct_run_fetch_limit
        )
      end
    end

    # Fetches the record behind one xref and explores it. In tolerant mode an
    # Error raised along the way is recorded on a placeholder node instead of
    # propagating.
    def explore_xref(xref, relation, seen_keys, tolerant:, direct_run_fetch_limit:)
      linked_record = fetch_xref_record(xref)
      explore(
        linked_record,
        seen_keys: seen_keys,
        relation: relation,
        tolerant: tolerant,
        direct_run_fetch_limit: direct_run_fetch_limit
      )
    rescue Error => error
      raise unless tolerant

      node_from_xref(xref, relation: relation, error: error.message)
    end

    # De-duplication key for an xref: its URL when present, otherwise its id or
    # identifier; an empty string when it has none of these.
    def xref_key(xref)
      reference_url = xref[DdbjRecordFields::URL_KEY].to_s
      return reference_url unless reference_url.empty?

      (xref[DdbjRecordFields::ID_KEY] || xref[DdbjRecordFields::IDENTIFIER_KEY]).to_s
    end

    # Fetches the linked record, preferring the (type, identifier) pair parsed
    # from the xref URL and falling back to the xref's own type and id.
    def fetch_xref_record(xref)
      if (resource_match = xref[DdbjRecordFields::URL_KEY].to_s.match(XREF_URL_PATTERN))
        @client.fetch_resource_record(resource_match[1], resource_match[2])
      elsif xref[DdbjRecordFields::ID_KEY] || xref[DdbjRecordFields::IDENTIFIER_KEY]
        fetch_xref_by_identifier(xref)
      else
        raise InvalidRecordError, 'sra-run xref has no URL or id'
      end
    end

    # Fetches by the xref's declared type and its id (or identifier).
    def fetch_xref_by_identifier(xref)
      @client.fetch_resource_record(
        xref[DdbjRecordFields::TYPE_KEY],
        xref[DdbjRecordFields::ID_KEY] || xref[DdbjRecordFields::IDENTIFIER_KEY]
      )
    end

    def sra_run_xref?(xref)
      xref[DdbjRecordFields::TYPE_KEY] == DdbjRecordFields::SRA_RUN_RESOURCE_TYPE
    end

    def traversable_xref?(xref)
      TRAVERSABLE_XREF_TYPES.include?(xref[DdbjRecordFields::TYPE_KEY])
    end

    # A record is a run when its type says so; a record without a type is
    # treated as a run when its download-URL field holds an Array.
    def run_record?(ddbj_record)
      record_type = ddbj_record[DdbjRecordFields::TYPE_KEY]
      return record_type == DdbjRecordFields::SRA_RUN_RESOURCE_TYPE if record_type

      ddbj_record[DdbjRecordFields::DOWNLOAD_URL_KEY].is_a?(Array)
    end

    # Node describing a fetched record; only run records are kept on the node.
    def node_from_record(ddbj_record, relation:)
      TraversalNode.new(
        relation: relation,
        type: ddbj_record[DdbjRecordFields::TYPE_KEY] || inferred_record_type(ddbj_record),
        accession: record_accession(ddbj_record),
        object_type: ddbj_record['objectType'],
        record: run_record?(ddbj_record) ? ddbj_record : nil
      )
    end

    # Placeholder node for an xref whose record was not fetched or failed to load.
    def node_from_xref(xref, relation:, error: nil)
      TraversalNode.new(
        relation: relation,
        type: xref[DdbjRecordFields::TYPE_KEY],
        accession: xref[DdbjRecordFields::ID_KEY] || xref[DdbjRecordFields::IDENTIFIER_KEY],
        error: error
      )
    end

    # First available of accession, identifier, id and primary id; nil when none is set.
    def record_accession(ddbj_record)
      ddbj_record[DdbjRecordFields::ACCESSION_KEY] ||
        ddbj_record[DdbjRecordFields::IDENTIFIER_KEY] ||
        ddbj_record[DdbjRecordFields::ID_KEY] ||
        ddbj_record[DdbjRecordFields::PRIMARY_ID_KEY]
    end

    # Type to report for a record that carries no explicit type: the run type
    # when it looks like a run, otherwise nil.
    def inferred_record_type(ddbj_record)
      DdbjRecordFields::SRA_RUN_RESOURCE_TYPE if run_record?(ddbj_record)
    end
  end
end
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
module Dratools
  # A node in the tree produced while walking DDBJ records: either a record
  # that was reached through some relation, or a download leaf used by the
  # tree renderer.
  class TraversalNode
    ROOT_RELATION = :root
    DB_XREF_RELATION = :db_xref
    CHILD_BIOPROJECT_RELATION = :child_bioproject
    DOWNLOAD_RELATION = :download

    attr_reader :relation, :type, :accession, :object_type, :record, :url, :error, :children,
                :download

    # Every attribute is optional; +relation+ defaults to the root relation and
    # +children+ to a fresh empty array.
    def initialize( # rubocop:disable Metrics/ParameterLists
      relation: ROOT_RELATION, type: nil, accession: nil, object_type: nil,
      record: nil, url: nil, error: nil, children: [], download: nil
    )
      @relation = relation
      @children = children
      @type = type
      @accession = accession
      @object_type = object_type
      @record = record
      @url = url
      @download = download
      @error = error
    end

    # True when the node's type is the sra-run resource type.
    def run?
      DdbjRecordFields::SRA_RUN_RESOURCE_TYPE == type
    end

    # True when the node was attached through the download relation.
    def download?
      DOWNLOAD_RELATION == relation
    end

    # True when the node carries a non-empty error message.
    def errored?
      message = error.to_s
      !message.empty?
    end

    # Run records held by this node and all of its descendants, this node first.
    def run_records
      nested = children.flat_map(&:run_records)
      return nested unless run? && record

      [record] + nested
    end

    # Accessions of run nodes in this subtree; download leaves are not descended into.
    def run_accessions
      nested = children.flat_map { |child| child.download? ? [] : child.run_accessions }
      return nested unless run? && accession

      [accession] + nested
    end

    # Download objects attached to this node and its descendants, this node first.
    def downloads
      nested = children.flat_map(&:downloads)
      return nested unless download

      [download] + nested
    end

    # Error messages recorded on this node and its descendants, this node first.
    def errors
      nested = children.flat_map(&:errors)
      return nested unless errored?

      [error] + nested
    end
  end
end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
module Dratools
  # Renders a TraversalNode tree as terminal-friendly text.
  #
  # Each node becomes one line. Children are drawn beneath their parent with
  # box-drawing connectors, and a large group of plain run nodes is collapsed
  # into a single summary line.
  class TreeRenderer
    DEFAULT_SUMMARY_THRESHOLD = 5

    # Connectors drawn in front of a child's label.
    BRANCH_CONNECTOR = '├─ '
    LAST_CONNECTOR = '└─ '
    # Indentation handed down to a child's own children. Both segments are as
    # wide as the connectors so that every nesting level lines up.
    BRANCH_INDENT = '│  '
    LAST_INDENT = '   '

    # @param file_type [String] file type named in the summary line
    # @param summary_threshold [Integer] run groups larger than this are summarized
    def initialize(file_type: DdbjRecordFields::FILE_TYPE_SRA,
                   summary_threshold: DEFAULT_SUMMARY_THRESHOLD)
      @file_type = file_type
      @summary_threshold = summary_threshold
    end

    # @param root [TraversalNode] tree to render
    # @return [String] one line per displayed node, joined with newlines
    def render(root)
      lines = [label_for(root)]
      render_children(root.children, prefix: '', lines: lines)
      lines.join("\n")
    end

    private

    # Appends the lines for +children+ (and, recursively, their descendants) to +lines+.
    def render_children(children, prefix:, lines:)
      display_children = summarized_children(children)
      display_children.each_with_index do |child, index|
        last = index == display_children.length - 1
        connector = last ? LAST_CONNECTOR : BRANCH_CONNECTOR
        lines << "#{prefix}#{connector}#{label_for(child)}"
        next if child.children.empty?

        # Keep the vertical rule only while further siblings follow below.
        child_prefix = "#{prefix}#{last ? LAST_INDENT : BRANCH_INDENT}"
        render_children(child.children, prefix: child_prefix, lines: lines)
      end
    end

    # Replaces a large group of plain run nodes with one synthetic node that
    # reports the count; any other group is returned unchanged.
    def summarized_children(children)
      return children unless summarizable_run_group?(children)

      [
        TraversalNode.new(
          type: DdbjRecordFields::SRA_RUN_RESOURCE_TYPE,
          accession: "#{children.length} records",
          children: [
            TraversalNode.new(
              type: summary_leaf_label(children)
            )
          ]
        )
      ]
    end

    # Text shown beneath a summarized group: nodes without a record were never
    # fetched, so their downloads are reported as not expanded.
    def summary_leaf_label(children)
      return "#{@file_type} downloads not expanded" if children.any? { |child| child.record.nil? }

      "no #{@file_type} downloads"
    end

    # A group is summarized only when it exceeds the threshold and consists
    # solely of run nodes that have neither downloads nor errors to show.
    def summarizable_run_group?(children)
      return false if children.length <= @summary_threshold
      return false unless children.all?(&:run?)

      children.all? { |child| child.downloads.empty? && child.errors.empty? }
    end

    # Label for one node. Download leaves show type and URL; other nodes show
    # relation label, type, accession, object type and error, skipping nil parts.
    def label_for(node)
      return [node.type, node.url].compact.join(' ') if node.download?

      label_parts = []
      label_parts << relation_label(node)
      # For a child BioProject the relation label stands in for the type.
      label_parts << node.type unless node.relation == TraversalNode::CHILD_BIOPROJECT_RELATION
      label_parts << node.accession
      label_parts << node.object_type
      label_parts << "error: #{node.error}" if node.errored?
      label_parts.compact.join(' ')
    end

    # Only the child-BioProject relation is labelled; other relations yield nil.
    def relation_label(node)
      case node.relation
      when TraversalNode::CHILD_BIOPROJECT_RELATION
        'childBioProject'
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
module Dratools
  # Name of the gem.
  NAME = 'dratools'

  # Version of this release.
  VERSION = '0.0.1'
end
data/lib/dratools.rb ADDED
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'dratools/version'
4
+ require_relative 'dratools/errors'
5
+ require_relative 'dratools/config'
6
+ require_relative 'dratools/ddbj_record_fields'
7
+ require_relative 'dratools/byte_formatter'
8
+ require_relative 'dratools/download_candidate'
9
+ require_relative 'dratools/traversal_node'
10
+ require_relative 'dratools/tree_renderer'
11
+ require_relative 'dratools/download_candidate_builder'
12
+ require_relative 'dratools/checksum_verifier'
13
+ require_relative 'dratools/ddbj_resource_client'
14
+ require_relative 'dratools/accession_resource_type_classifier'
15
+ require_relative 'dratools/run_record_collector'
16
+ require_relative 'dratools/accession_resolver'
17
+ require_relative 'dratools/download_service'
18
+ require_relative 'dratools/accession_input_collector'
19
+ require_relative 'dratools/command_line_interface'
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: dratools
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - kojix2
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies: []
12
+ description: DDBJ Search を使って DRA/SRA ファイル URL を解決し、DDBJ から取得する小さな CLI とライブラリです。
13
+ email:
14
+ - 2xijok@gmail.com
15
+ executables:
16
+ - dratools
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - LICENSE.txt
21
+ - README.md
22
+ - bin/dratools
23
+ - docs/design.md
24
+ - docs/development.md
25
+ - docs/environment.md
26
+ - docs/usage.md
27
+ - lib/dratools.rb
28
+ - lib/dratools/accession_input_collector.rb
29
+ - lib/dratools/accession_resolver.rb
30
+ - lib/dratools/accession_resource_type_classifier.rb
31
+ - lib/dratools/byte_formatter.rb
32
+ - lib/dratools/checksum_verifier.rb
33
+ - lib/dratools/command_line_interface.rb
34
+ - lib/dratools/commands/base_command.rb
35
+ - lib/dratools/commands/get_command.rb
36
+ - lib/dratools/commands/meta_command.rb
37
+ - lib/dratools/commands/probe_command.rb
38
+ - lib/dratools/commands/runs_command.rb
39
+ - lib/dratools/commands/size_command.rb
40
+ - lib/dratools/commands/tree_command.rb
41
+ - lib/dratools/commands/url_command.rb
42
+ - lib/dratools/config.rb
43
+ - lib/dratools/ddbj_record_fields.rb
44
+ - lib/dratools/ddbj_resource_client.rb
45
+ - lib/dratools/download_candidate.rb
46
+ - lib/dratools/download_candidate_builder.rb
47
+ - lib/dratools/download_service.rb
48
+ - lib/dratools/errors.rb
49
+ - lib/dratools/external_command_runner.rb
50
+ - lib/dratools/run_record_collector.rb
51
+ - lib/dratools/traversal_node.rb
52
+ - lib/dratools/tree_renderer.rb
53
+ - lib/dratools/version.rb
54
+ homepage: https://github.com/kojix2/dratools
55
+ licenses:
56
+ - MIT
57
+ metadata:
58
+ rubygems_mfa_required: 'true'
59
+ rdoc_options: []
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: '3.0'
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ requirements: []
73
+ rubygems_version: 4.0.10
74
+ specification_version: 4
75
+ summary: DDBJ DRA toolkit
76
+ test_files: []