miga-base 1.3.9.1 → 1.3.9.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/download/ncbi.rb +19 -2
- data/lib/miga/cli/action/get.rb +1 -1
- data/lib/miga/json.rb +14 -5
- data/lib/miga/remote_dataset/base.rb +5 -5
- data/lib/miga/remote_dataset/download.rb +32 -17
- data/lib/miga/result/stats.rb +28 -41
- data/lib/miga/version.rb +1 -1
- data/test/net_test.rb +20 -0
- data/test/remote_dataset_test.rb +1 -1
- metadata +15 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 32295b80e344eec3e534bfef0de472a19c14674c93a50ac6c066a3690be7499c
|
4
|
+
data.tar.gz: fa44c75572f39ae7dc60dabcdc2fcf11d2c17b5a4120dea0ea6fa66a4dc915ff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d3fd8c46e2daf0b0e6ee82435bc91b5d62784a36f2f2e0cff1b254d335dc6cecb9b5aacef2c982c743d026d87163049611a2bc73f99245f5253af8331be284b6
|
7
|
+
data.tar.gz: c3b3c69514dbb2cc035b78380f49789483ff96d510f7b044602e487512911bd8f6243451c38d473e27581a986302ae6f10e2ba4f464886c22273e6ef4ed066aa
|
@@ -26,6 +26,9 @@ module MiGA::Cli::Action::Download::Ncbi
|
|
26
26
|
cli[:scaffold] = true
|
27
27
|
cli[:contig] = true
|
28
28
|
end
|
29
|
+
opt.on('--ncbi-list-json STRING', '::HIDE::') do |v|
|
30
|
+
cli[:ncbi_list_json] = v
|
31
|
+
end
|
29
32
|
end
|
30
33
|
|
31
34
|
def cli_name_modifiers(opt)
|
@@ -35,7 +38,9 @@ module MiGA::Cli::Action::Download::Ncbi
|
|
35
38
|
'Only affects --complete and --chromosome'
|
36
39
|
) { |v| cli[:add_version] = v }
|
37
40
|
# For backwards compatibility
|
38
|
-
|
41
|
+
opt.on('--legacy-name', '::HIDE::') do
|
42
|
+
warn 'Deprecated flag --legacy-name ignored'
|
43
|
+
end
|
39
44
|
end
|
40
45
|
|
41
46
|
def sanitize_cli
|
@@ -49,6 +54,11 @@ module MiGA::Cli::Action::Download::Ncbi
|
|
49
54
|
end
|
50
55
|
|
51
56
|
def remote_list
|
57
|
+
if cli[:ncbi_list_json] && File.size?(cli[:ncbi_list_json])
|
58
|
+
cli.say "Reusing remote list: #{cli[:ncbi_list_json]}"
|
59
|
+
return MiGA::Json.parse(cli[:ncbi_list_json])
|
60
|
+
end
|
61
|
+
|
52
62
|
list = {}
|
53
63
|
query = remote_list_query
|
54
64
|
loop do
|
@@ -66,6 +76,12 @@ module MiGA::Cli::Action::Download::Ncbi
|
|
66
76
|
break unless page[:next_page_token]
|
67
77
|
query[:page_token] = page[:next_page_token]
|
68
78
|
end
|
79
|
+
|
80
|
+
if cli[:ncbi_list_json]
|
81
|
+
cli.say "Saving remote list: #{cli[:ncbi_list_json]}"
|
82
|
+
MiGA::Json.generate_plain(list, cli[:ncbi_list_json])
|
83
|
+
end
|
84
|
+
|
69
85
|
list
|
70
86
|
end
|
71
87
|
|
@@ -80,7 +96,8 @@ module MiGA::Cli::Action::Download::Ncbi
|
|
80
96
|
ds[n] = {
|
81
97
|
ids: [asm], db: :assembly, universe: :ncbi,
|
82
98
|
md: {
|
83
|
-
type: :genome, ncbi_asm: asm,
|
99
|
+
type: :genome, ncbi_asm: asm,
|
100
|
+
strain: r.dig(:organism, :infraspecific_names, :strain)
|
84
101
|
}
|
85
102
|
}
|
86
103
|
date = r.dig(:assembly_info, :release_date)
|
data/lib/miga/cli/action/get.rb
CHANGED
@@ -14,7 +14,7 @@ class MiGA::Cli::Action::Get < MiGA::Cli::Action
|
|
14
14
|
opt.on(
|
15
15
|
'-I', '--ids ID1,ID2,...', Array,
|
16
16
|
'(Mandatory unless -F) IDs in the remote database separated by commas'
|
17
|
-
) { |v| cli[:ids] = v }
|
17
|
+
) { |v| cli[:ids] = v.map(&:strip) }
|
18
18
|
opt.on(
|
19
19
|
'-U', '--universe STRING',
|
20
20
|
"Universe of the remote database. By default: #{cli[:universe]}",
|
data/lib/miga/json.rb
CHANGED
@@ -65,17 +65,26 @@ class MiGA::Json < MiGA::MiGA
|
|
65
65
|
# Generates and returns prettified JSON to represent +obj+.
|
66
66
|
# If +path+ is passed, it saves the JSON in that file.
|
67
67
|
def generate(obj, path = nil)
|
68
|
-
|
69
|
-
File.open(path, 'w') { |fh| fh.print y } unless path.nil?
|
70
|
-
y
|
68
|
+
generate_generic(:pretty_generate, obj, path)
|
71
69
|
end
|
72
70
|
|
73
71
|
##
|
74
72
|
# Generates and returns plain JSON to represent +obj+.
|
75
73
|
# If +path+ is passed, it saves the JSON in that file.
|
76
74
|
def generate_plain(obj, path = nil)
|
77
|
-
|
78
|
-
|
75
|
+
generate_generic(:generate, obj, path)
|
76
|
+
end
|
77
|
+
|
78
|
+
private
|
79
|
+
|
80
|
+
def generate_generic(method, obj, path)
|
81
|
+
y = JSON.send(method, obj)
|
82
|
+
return y unless path
|
83
|
+
|
84
|
+
io = StringIO.new(y)
|
85
|
+
File.open(path, 'w') do |fh|
|
86
|
+
fh.print(io.read(1024)) until io.eof?
|
87
|
+
end
|
79
88
|
y
|
80
89
|
end
|
81
90
|
end
|
@@ -64,8 +64,8 @@ module MiGA::RemoteDataset::Base
|
|
64
64
|
},
|
65
65
|
gtdb: {
|
66
66
|
dbs: {
|
67
|
-
# This is a dummy entry plugged directly to +
|
68
|
-
assembly: { stage: :assembly, format: :
|
67
|
+
# This is a dummy entry plugged directly to +ncbi_asm_get+
|
68
|
+
assembly: { stage: :assembly, format: :fasta, getter: :ncbi_asm },
|
69
69
|
# The 'taxon' namespace actually returns a list of genomes (+format+)
|
70
70
|
taxon: {
|
71
71
|
stage: :metadata, format: :genomes, map_to: [:assembly],
|
@@ -84,8 +84,8 @@ module MiGA::RemoteDataset::Base
|
|
84
84
|
},
|
85
85
|
seqcode: {
|
86
86
|
dbs: {
|
87
|
-
# These are dummy entries plugged directly to +ncbi_*
|
88
|
-
assembly: { stage: :assembly, format: :
|
87
|
+
# These are dummy entries plugged directly to +ncbi_*_get+
|
88
|
+
assembly: { stage: :assembly, format: :fasta, getter: :ncbi_asm },
|
89
89
|
nuccore: { stage: :assembly, format: :fasta, getter: :ncbi_gb },
|
90
90
|
# This is the list of type genomes
|
91
91
|
:'type-genomes' => { stage: :metadata, format: :json }
|
@@ -100,7 +100,7 @@ module MiGA::RemoteDataset::Base
|
|
100
100
|
ncbi: {
|
101
101
|
dbs: {
|
102
102
|
nuccore: { stage: :assembly, format: :fasta, getter: :ncbi_gb },
|
103
|
-
assembly: { stage: :assembly, format: :
|
103
|
+
assembly: { stage: :assembly, format: :fasta, getter: :ncbi_asm },
|
104
104
|
taxonomy: { stage: :metadata, format: :xml }
|
105
105
|
},
|
106
106
|
uri: lambda do |opts|
|
@@ -19,6 +19,10 @@ class MiGA::RemoteDataset
|
|
19
19
|
getter = database_hash[:getter] || :download
|
20
20
|
action = database_hash[:method] || universe_hash[:method]
|
21
21
|
|
22
|
+
# Clean IDs
|
23
|
+
ids =
|
24
|
+
|
25
|
+
# Return options
|
22
26
|
{
|
23
27
|
universe: universe, db: db, ids: ids.is_a?(Array) ? ids : [ids],
|
24
28
|
format: format, file: file, obj: obj,
|
@@ -50,22 +54,37 @@ class MiGA::RemoteDataset
|
|
50
54
|
# Supported +opts+ (Hash) include:
|
51
55
|
# +obj+ (mandatory): MiGA::RemoteDataset
|
52
56
|
# +ids+ (mandatory): String or Array of String
|
53
|
-
# +file
|
57
|
+
# +file+ (mandatory): String, assembly saved here
|
54
58
|
# +extra+: Hash, passed to download
|
55
|
-
# +format+: String,
|
59
|
+
# +format+: String, ignored
|
56
60
|
def ncbi_asm_get(opts)
|
57
|
-
|
58
|
-
|
59
|
-
raise MiGA::RemoteDataMissingError.new(
|
60
|
-
"Missing ftppath_genbank in NCBI Assembly JSON"
|
61
|
-
)
|
62
|
-
end
|
61
|
+
require 'tempfile'
|
62
|
+
require 'zip'
|
63
63
|
|
64
|
-
|
65
|
-
|
66
|
-
:
|
67
|
-
opts[:format], opts[:file], opts[:extra], opts[:obj]
|
64
|
+
zipped = download(
|
65
|
+
:ncbi_datasets_download, :genome, opts[:ids],
|
66
|
+
:zip, nil, opts[:extra], opts[:obj]
|
68
67
|
)
|
68
|
+
zip_tmp = Tempfile.new('asm.zip')
|
69
|
+
zip_tmp.puts zipped
|
70
|
+
zip_tmp.close
|
71
|
+
|
72
|
+
o = ''
|
73
|
+
ofh = opts[:file] ? File.open(opts[:file], 'w') : nil
|
74
|
+
Zip::File.open(zip_tmp.path) do |zfh|
|
75
|
+
zfh.each do |entry|
|
76
|
+
if entry.file? && entry.name =~ /_genomic\.fna$/
|
77
|
+
DEBUG "Extracting: #{entry.name}"
|
78
|
+
entry.get_input_stream do |ifh|
|
79
|
+
cont = ifh.read
|
80
|
+
ofh&.puts cont
|
81
|
+
o += cont
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
ofh&.close
|
87
|
+
o
|
69
88
|
end
|
70
89
|
|
71
90
|
##
|
@@ -77,11 +96,7 @@ class MiGA::RemoteDataset
|
|
77
96
|
return o unless o.strip.empty?
|
78
97
|
|
79
98
|
MiGA::MiGA.DEBUG 'Empty sequence, attempting download from NCBI assembly'
|
80
|
-
opts[:format] = :
|
81
|
-
if opts[:file]
|
82
|
-
File.unlink(opts[:file]) if File.exist? opts[:file]
|
83
|
-
opts[:file] = "#{opts[:file]}.gz"
|
84
|
-
end
|
99
|
+
opts[:format] = :fasta
|
85
100
|
ncbi_asm_get(opts)
|
86
101
|
end
|
87
102
|
|
data/lib/miga/result/stats.rb
CHANGED
@@ -29,33 +29,15 @@ module MiGA::Result::Stats
|
|
29
29
|
seq_opts = { gc: true, x: true, skew: true }
|
30
30
|
if self[:files][:pair1].nil?
|
31
31
|
s = MiGA::MiGA.seqs_length(file_path(:single), :fastq, seq_opts)
|
32
|
-
stats = {
|
33
|
-
reads: s[:n],
|
34
|
-
length_average: [s[:avg], 'bp'],
|
35
|
-
length_standard_deviation: [s[:sd], 'bp'],
|
36
|
-
g_c_content: [s[:gc], '%'],
|
37
|
-
x_content: [s[:x], '%'],
|
38
|
-
g_c_skew: [s[:gc_skew], '%'],
|
39
|
-
a_t_skew: [s[:at_skew], '%']
|
40
|
-
}
|
32
|
+
stats = seqs_length_as_stats_hash(s)
|
41
33
|
else
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
forward_x_content: [s1[:x], '%'],
|
50
|
-
forward_g_c_skew: [s1[:gc_skew], '%'],
|
51
|
-
forward_a_t_skew: [s1[:at_skew], '%'],
|
52
|
-
reverse_length_average: [s2[:avg], 'bp'],
|
53
|
-
reverse_length_standard_deviation: [s2[:sd], 'bp'],
|
54
|
-
reverse_g_c_content: [s2[:gc], '%'],
|
55
|
-
reverse_x_content: [s2[:x], '%'],
|
56
|
-
reverse_g_c_skew: [s2[:gc_skew], '%'],
|
57
|
-
reverse_a_t_skew: [s2[:at_skew], '%']
|
58
|
-
}
|
34
|
+
stats = { read_pairs: nil }
|
35
|
+
{ pair1: :forward, pair2: :reverse }.each do |pair, direction|
|
36
|
+
s = MiGA::MiGA.seqs_length(file_path(pair), :fastq, seq_opts)
|
37
|
+
seqs_length_as_stats_hash(s).each do |k, v|
|
38
|
+
stats[k == :reads ? :read_pairs : :"#{direction}_#{k}"] ||= v
|
39
|
+
end
|
40
|
+
end
|
59
41
|
end
|
60
42
|
stats
|
61
43
|
end
|
@@ -63,15 +45,7 @@ module MiGA::Result::Stats
|
|
63
45
|
def compute_stats_trimmed_fasta
|
64
46
|
f = self[:files][:coupled].nil? ? file_path(:single) : file_path(:coupled)
|
65
47
|
s = MiGA::MiGA.seqs_length(f, :fasta, gc: true, x: true, skew: true)
|
66
|
-
|
67
|
-
reads: s[:n],
|
68
|
-
length_average: [s[:avg], 'bp'],
|
69
|
-
length_standard_deviation: [s[:sd], 'bp'],
|
70
|
-
g_c_content: [s[:gc], '%'],
|
71
|
-
x_content: [s[:x], '%'],
|
72
|
-
g_c_skew: [s[:gc_skew], '%'],
|
73
|
-
a_t_skew: [s[:at_skew], '%']
|
74
|
-
}
|
48
|
+
seqs_length_as_stats_hash(s)
|
75
49
|
end
|
76
50
|
|
77
51
|
def compute_stats_assembly
|
@@ -79,16 +53,17 @@ module MiGA::Result::Stats
|
|
79
53
|
file_path(:largecontigs), :fasta,
|
80
54
|
n50: true, gc: true, x: true, skew: true
|
81
55
|
)
|
56
|
+
h = seqs_length_as_stats_hash(s)
|
82
57
|
{
|
83
58
|
contigs: s[:n],
|
84
59
|
n50: [s[:n50], 'bp'],
|
85
60
|
total_length: [s[:tot], 'bp'],
|
86
|
-
longest_sequence: [s[:max], 'bp']
|
87
|
-
|
88
|
-
x_content
|
89
|
-
|
90
|
-
|
91
|
-
|
61
|
+
longest_sequence: [s[:max], 'bp']
|
62
|
+
}.tap do |stats|
|
63
|
+
%i[g_c_content x_content g_c_skew a_t_skew].each do |i|
|
64
|
+
stats[i] = h[i]
|
65
|
+
end
|
66
|
+
end
|
92
67
|
end
|
93
68
|
|
94
69
|
def compute_stats_cds
|
@@ -253,4 +228,16 @@ module MiGA::Result::Stats
|
|
253
228
|
add_file(:raw_report, "#{source.name}.ess/log")
|
254
229
|
add_file(:report, "#{source.name}.ess/log.domain")
|
255
230
|
end
|
231
|
+
|
232
|
+
def seqs_length_as_stats_hash(s)
|
233
|
+
{
|
234
|
+
reads: s[:n],
|
235
|
+
length_average: [s[:avg], 'bp'],
|
236
|
+
length_standard_deviation: [s[:sd], 'bp'],
|
237
|
+
g_c_content: [s[:gc], '%'],
|
238
|
+
x_content: [s[:x], '%'],
|
239
|
+
g_c_skew: [s[:gc_skew], '%'],
|
240
|
+
a_t_skew: [s[:at_skew], '%']
|
241
|
+
}
|
242
|
+
end
|
256
243
|
end
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.3, 9, 1].freeze
|
15
|
+
VERSION = [1.3, 9, 3].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
data/test/net_test.rb
CHANGED
@@ -47,4 +47,24 @@ class FormatTest < Test::Unit::TestCase
|
|
47
47
|
### m.download_file_ftp(:miga_db, '../api_test.txt', f)
|
48
48
|
### assert_equal('miga', File.read(f).chomp)
|
49
49
|
end
|
50
|
+
|
51
|
+
def test_encoding
|
52
|
+
# Test original encoding
|
53
|
+
t1 = '()!@*#àøo'
|
54
|
+
t2 = "#{t1}"
|
55
|
+
assert_equal(t1, t2)
|
56
|
+
assert_equal(t1, MiGA::MiGA.normalize_encoding(t2))
|
57
|
+
|
58
|
+
# Test with a different encoding
|
59
|
+
t2 = t2.encode('windows-1252')
|
60
|
+
assert_equal('Windows-1252', t2.encoding.to_s)
|
61
|
+
assert_not_equal(t1, t2)
|
62
|
+
assert_equal(t1, MiGA::MiGA.normalize_encoding(t2))
|
63
|
+
|
64
|
+
# Test with a different encoding wrongly declared
|
65
|
+
t2.force_encoding('utf-8')
|
66
|
+
assert_equal('UTF-8', t2.encoding.to_s)
|
67
|
+
assert_not_equal(t1, t2)
|
68
|
+
assert_equal(t1, MiGA::MiGA.normalize_encoding(t2))
|
69
|
+
end
|
50
70
|
end
|
data/test/remote_dataset_test.rb
CHANGED
@@ -142,7 +142,7 @@ class RemoteDatasetTest < Test::Unit::TestCase
|
|
142
142
|
|
143
143
|
def test_missing_data
|
144
144
|
declare_remote_access
|
145
|
-
rd = MiGA::RemoteDataset.new('
|
145
|
+
rd = MiGA::RemoteDataset.new('XYZ_GCA_000484975.1', :assembly, :ncbi)
|
146
146
|
assert_raise(MiGA::RemoteDataMissingError) { rd.save_to(project, 'bad') }
|
147
147
|
end
|
148
148
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.9.1
|
4
|
+
version: 1.3.9.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rubyzip
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: rake
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|