miga-base 1.2.17.1 → 1.2.17.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/remote_dataset/download.rb +1 -1
- data/lib/miga/remote_dataset.rb +9 -4
- data/lib/miga/version.rb +2 -2
- data/utils/enveomics/Manifest/Tasks/mapping.json +39 -11
- data/utils/enveomics/Manifest/Tasks/remote.json +2 -1
- data/utils/enveomics/Scripts/BedGraph.tad.rb +98 -53
- data/utils/enveomics/Scripts/SRA.download.bash +14 -2
- data/utils/enveomics/Tests/low-cov.bg.gz +0 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +5 -5
- data/utils/enveomics/enveomics.R/R/autoprune.R +99 -87
- data/utils/enveomics/enveomics.R/R/barplot.R +116 -97
- data/utils/enveomics/enveomics.R/R/cliopts.R +65 -59
- data/utils/enveomics/enveomics.R/R/df2dist.R +96 -58
- data/utils/enveomics/enveomics.R/R/growthcurve.R +166 -148
- data/utils/enveomics/enveomics.R/R/recplot.R +201 -136
- data/utils/enveomics/enveomics.R/R/recplot2.R +371 -304
- data/utils/enveomics/enveomics.R/R/tribs.R +318 -263
- data/utils/enveomics/enveomics.R/R/utils.R +30 -20
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +4 -3
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +2 -2
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +3 -3
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +7 -4
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +7 -4
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +4 -0
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +25 -17
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +10 -0
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +14 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +20 -1
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +2 -3
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +5 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +50 -42
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +5 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +3 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +3 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +3 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +3 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +9 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +3 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +3 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +0 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +4 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +5 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +11 -7
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +5 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +3 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +2 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +3 -3
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +2 -2
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +3 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +3 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +6 -3
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +2 -2
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +3 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +3 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +3 -0
- metadata +3 -37
- data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +0 -69
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/README.md +0 -189
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +0 -112
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +0 -23
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +0 -44
- data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +0 -50
- data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +0 -37
- data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +0 -68
- data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +0 -49
- data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +0 -80
- data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +0 -57
- data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +0 -63
- data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +0 -38
- data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +0 -73
- data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +0 -21
- data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +0 -72
- data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +0 -98
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
- data/utils/enveomics/Pipelines/blast.pbs/README.md +0 -127
- data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +0 -109
- data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +0 -128
- data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +0 -16
- data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +0 -22
- data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +0 -26
- data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +0 -89
- data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +0 -29
- data/utils/enveomics/Pipelines/idba.pbs/README.md +0 -49
- data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +0 -95
- data/utils/enveomics/Pipelines/idba.pbs/run.pbs +0 -56
- data/utils/enveomics/Pipelines/trim.pbs/README.md +0 -54
- data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +0 -70
- data/utils/enveomics/Pipelines/trim.pbs/run.pbs +0 -130
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 57016515d9789927d6eda96e04be04dd9d47e6f330ea8f2f489971d7f0ad3845
|
4
|
+
data.tar.gz: 3e42b22e637fc5ad47405eaac88fcc7ba00d94abfe86035c8a8a613b54e06bbe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c2b8cf60bfc487960e66d4a35a61fcb691d166baf31656e115c28dd3854276e062f6c9325d2d4ff924a563327a01c05f4ecfd570004deb54db05b065b955281a
|
7
|
+
data.tar.gz: 6124879ca61700d6cf58aaa27c2db405874b1fdcaeed829b835662eeb92f6a59383f1fb2ed5bb073d5b97d1bef2ed15f0580821a267cd6f5f0c65ef2dbb2f1bc
|
@@ -61,7 +61,7 @@ class MiGA::RemoteDataset
|
|
61
61
|
# Supported +opts+ (Hash) are the same as #download_rest and #ncbi_asm_rest.
|
62
62
|
def ncbi_gb_rest(opts)
|
63
63
|
# Simply use defaults, but ensure that the URL can be properly formed
|
64
|
-
o = download_rest(opts.merge(universe: :ncbi, db: :nuccore
|
64
|
+
o = download_rest(opts.merge(universe: :ncbi, db: :nuccore))
|
65
65
|
return o unless o.strip.empty?
|
66
66
|
|
67
67
|
MiGA::MiGA.DEBUG 'Empty sequence, attempting download from NCBI assembly'
|
data/lib/miga/remote_dataset.rb
CHANGED
@@ -134,7 +134,10 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
134
134
|
# Get taxonomy
|
135
135
|
@metadata[:tax] = get_gtdb_taxonomy
|
136
136
|
when :seqcode
|
137
|
-
#
|
137
|
+
# Taxonomy already defined
|
138
|
+
# Copy IDs over to allow additional metadata linked
|
139
|
+
@metadata[:ncbi_asm] = @metadata[:seqcode_asm]
|
140
|
+
@metadata[:ncbi_nuccore] = @metadata[:seqcode_nuccore]
|
138
141
|
end
|
139
142
|
|
140
143
|
if metadata[:get_ncbi_taxonomy]
|
@@ -276,6 +279,10 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
276
279
|
def get_type_status_ncbi_asm(metadata)
|
277
280
|
return metadata if ncbi_asm_json_doc.nil?
|
278
281
|
|
282
|
+
metadata[:suspect] = (ncbi_asm_json_doc['exclfromrefseq'] || [])
|
283
|
+
metadata[:suspect] = nil if metadata[:suspect].empty?
|
284
|
+
return metadata if metadata[:is_type] # If predefined, as in SeqCode
|
285
|
+
|
279
286
|
from_type = ncbi_asm_json_doc['from_type']
|
280
287
|
from_type = ncbi_asm_json_doc['fromtype'] if from_type.nil?
|
281
288
|
case from_type
|
@@ -292,8 +299,6 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
292
299
|
metadata[:is_type] = true
|
293
300
|
metadata[:type_rel] = from_type
|
294
301
|
end
|
295
|
-
metadata[:suspect] = (ncbi_asm_json_doc['exclfromrefseq'] || [])
|
296
|
-
metadata[:suspect] = nil if metadata[:suspect].empty?
|
297
302
|
MiGA.DEBUG "Got type: #{from_type}"
|
298
303
|
metadata
|
299
304
|
end
|
@@ -306,7 +311,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
306
311
|
File.open("#{base}.start", 'w') { |ofh| ofh.puts Time.now.to_s }
|
307
312
|
if udb[:format] == :fasta_gz
|
308
313
|
download "#{l_ctg}.gz"
|
309
|
-
system "gzip -
|
314
|
+
system "gzip -fd '#{l_ctg}.gz'"
|
310
315
|
else
|
311
316
|
download l_ctg
|
312
317
|
end
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.2, 17,
|
15
|
+
VERSION = [1.2, 17, 3].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem relese.
|
23
|
-
VERSION_DATE = Date.new(2023, 2,
|
23
|
+
VERSION_DATE = Date.new(2023, 2, 22)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
@@ -2,33 +2,61 @@
|
|
2
2
|
"tasks": [
|
3
3
|
{
|
4
4
|
"task": "BedGraph.tad.rb",
|
5
|
-
"description": [
|
6
|
-
"
|
7
|
-
|
5
|
+
"description": [
|
6
|
+
"Estimates the truncated average sequencing depth (TAD)",
|
7
|
+
"from a BedGraph file."
|
8
|
+
],
|
9
|
+
"warn": [
|
10
|
+
"This script doesn't consider zero-coverage positions if",
|
8
11
|
"missing from the file. If you produce your BedGraph file with",
|
9
12
|
"bedtools genomecov and want to consider zero-coverage position, be",
|
10
|
-
"sure to use -bga (not -bg)."
|
11
|
-
|
12
|
-
|
13
|
+
"sure to use -bga (not -bg)."
|
14
|
+
],
|
15
|
+
"see_also": [
|
16
|
+
"BedGraph.window.rb", "BlastTab.seqdepth.pl", "BlastTab.seqdepth_ZIP.pl"
|
17
|
+
],
|
13
18
|
"help_arg": "--help",
|
14
19
|
"options": [
|
15
20
|
{
|
16
21
|
"opt": "--input",
|
17
22
|
"arg": "in_file",
|
18
23
|
"mandatory": true,
|
19
|
-
"description":
|
24
|
+
"description": [
|
25
|
+
"Input BedGraph file.",
|
26
|
+
"Supports compression with .gz extension, use - for STDIN."
|
27
|
+
]
|
28
|
+
},
|
29
|
+
{
|
30
|
+
"opt": "--output",
|
31
|
+
"arg": "out_file",
|
32
|
+
"default": "-",
|
33
|
+
"description": [
|
34
|
+
"Output tab-delimited values (by default, STDOUT).",
|
35
|
+
"Supports compression with .gz extension, use - for STDOUT."
|
36
|
+
]
|
20
37
|
},
|
21
38
|
{
|
22
39
|
"opt": "--range",
|
23
40
|
"arg": "float",
|
24
41
|
"default": 0.5,
|
25
|
-
"description": [
|
26
|
-
"
|
42
|
+
"description": [
|
43
|
+
"Central range to consider, between 0 and 1.",
|
44
|
+
"By default: inter-quartile range (0.5)."
|
45
|
+
]
|
46
|
+
},
|
47
|
+
{
|
48
|
+
"opt": "--name",
|
49
|
+
"arg": "string",
|
50
|
+
"description": [
|
51
|
+
"Name (ID) of the sequence, added as first column."
|
52
|
+
]
|
27
53
|
},
|
28
54
|
{
|
29
55
|
"opt": "--per-seq",
|
30
|
-
"description": [
|
31
|
-
"
|
56
|
+
"description": [
|
57
|
+
"Calculate averages per reference sequence, not total.",
|
58
|
+
"Assumes a sorted BedGraph file."
|
59
|
+
]
|
32
60
|
},
|
33
61
|
{
|
34
62
|
"opt": "--length",
|
@@ -184,7 +184,8 @@
|
|
184
184
|
{
|
185
185
|
"task": "SRA.download.bash",
|
186
186
|
"description": ["Downloads the set of runs from a project, sample, or",
|
187
|
-
"experiment in SRA."
|
187
|
+
"experiment in SRA. If the expected file already exists, skips the",
|
188
|
+
"file if the MD5 hash matches."],
|
188
189
|
"help_arg": "",
|
189
190
|
"requires": [
|
190
191
|
{
|
@@ -1,93 +1,138 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
3
|
+
$VERSION = 1.01
|
4
|
+
$:.push File.expand_path('../lib', __FILE__)
|
5
|
+
require 'enveomics_rb/enveomics'
|
4
6
|
|
5
|
-
o = {range: 0.5, perseq: false, length: false}
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
7
|
+
o = { range: 0.5, perseq: false, length: false, o: '-' }
|
8
|
+
OptionParser.new do |opts|
|
9
|
+
opts.version = $VERSION
|
10
|
+
banner = <<~BANNER
|
11
|
+
Estimates the truncated average sequencing depth (TAD) from a BedGraph file
|
10
12
|
|
11
|
-
|
12
|
-
|
13
|
-
|
13
|
+
IMPORTANT: This script doesn't consider zero-coverage positions if missing
|
14
|
+
from the file. If you produce your BedGraph file with bedtools genomecov and
|
15
|
+
want to consider zero-coverage position, be sure to use -bga (not -bg).
|
16
|
+
BANNER
|
17
|
+
Enveomics.opt_banner(opts, banner, "#{File.basename($0)} [options]")
|
14
18
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
'Input BedGraph file
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
'
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
19
|
+
opts.separator 'Mandatory'
|
20
|
+
opts.on(
|
21
|
+
'-i', '--input PATH',
|
22
|
+
'Input BedGraph file',
|
23
|
+
'Supports compression with .gz extension, use - for STDIN'
|
24
|
+
) { |v| o[:i] = v }
|
25
|
+
|
26
|
+
opts.separator ''
|
27
|
+
opts.separator 'Other Options'
|
28
|
+
opts.on(
|
29
|
+
'-o', '--out PATH',
|
30
|
+
'Output tab-delimited values (by default, STDOUT)',
|
31
|
+
'Supports compression with .gz extension, use - for STDOUT'
|
32
|
+
) { |v| o[:o] = v }
|
33
|
+
opts.on(
|
34
|
+
'-r', '--range FLOAT', Float,
|
35
|
+
'Central range to consider, between 0 and 1',
|
36
|
+
"By default: #{o[:range]} (inter-quartile range)"
|
37
|
+
) { |v| o[:range] = v }
|
38
|
+
opts.on(
|
39
|
+
'-n', '--name STRING',
|
40
|
+
'Name (ID) of the sequence (added as first column)'
|
41
|
+
) { |v| o[:name] = v }
|
42
|
+
opts.on(
|
43
|
+
'-s', '--per-seq',
|
44
|
+
'Calculate averages per reference sequence, not total',
|
45
|
+
'Assumes a sorted BedGraph file'
|
46
|
+
) { |v| o[:perseq] = v }
|
47
|
+
opts.on(
|
48
|
+
'-l', '--length',
|
49
|
+
'Add sequence length to the output'
|
50
|
+
) { |v| o[:length] = v }
|
51
|
+
opts.on(
|
52
|
+
'-b', '--breadth',
|
53
|
+
'Add sequencing breadth to the output'
|
54
|
+
) { |v| o[:breadth] = v }
|
55
|
+
opts.on('-h', '--help', 'Display this screen') do
|
56
|
+
puts opts
|
31
57
|
exit
|
32
58
|
end
|
33
|
-
|
59
|
+
opts.separator ''
|
34
60
|
end.parse!
|
35
|
-
|
61
|
+
raise Enveomics::OptionError.new('-i is mandatory') if o[:i].nil?
|
36
62
|
|
63
|
+
##
|
64
|
+
# Pad an array to include all index values up to +r+ entries:
|
65
|
+
# - d: Array of [ depth => counts ]
|
66
|
+
# - idx: Array of [ depth, depth, ... ]
|
67
|
+
# - r: Expected number of entries in the array
|
37
68
|
def pad(d, idx, r)
|
38
69
|
idx.each do |i|
|
39
70
|
next if d[i].nil?
|
71
|
+
|
40
72
|
d[i] -= r
|
41
73
|
break unless d[i] < 0
|
74
|
+
|
42
75
|
r = -d[i]
|
43
76
|
d[i] = nil
|
44
77
|
end
|
45
78
|
d
|
46
79
|
end
|
47
80
|
|
81
|
+
##
|
82
|
+
# Report the results for:
|
83
|
+
# - sq: Contig ID
|
84
|
+
# - d: Array of [ depth => counts ]
|
85
|
+
# - ln: Length of the sequence
|
86
|
+
# - o: CLI Options
|
48
87
|
def report(sq, d, ln, o)
|
49
88
|
# Estimate padding ranges
|
50
|
-
pad = (1.0-o[:range])/2.0
|
51
|
-
r = (pad*ln).round
|
89
|
+
pad = (1.0 - o[:range]) / 2.0
|
90
|
+
r = (pad * ln).round
|
91
|
+
zeroes = d[0].to_i
|
52
92
|
|
53
|
-
# Pad
|
54
|
-
d = pad(d, d.each_index.to_a, r+0)
|
55
|
-
d = pad(d, d.each_index.to_a.reverse, r+0)
|
93
|
+
# Pad (truncation)
|
94
|
+
d = pad(d, d.each_index.to_a, r + 0)
|
95
|
+
d = pad(d, d.each_index.to_a.reverse, r + 0)
|
56
96
|
|
57
97
|
# Average
|
58
98
|
y = [0.0]
|
59
99
|
unless d.compact.empty?
|
60
|
-
s = d.each_with_index.to_a.map{ |v,i| v.nil? ? 0 : i*v }.inject(0
|
61
|
-
y[0] = s.to_f/d.compact.inject(:+)
|
100
|
+
s = d.each_with_index.to_a.map { |v, i| v.nil? ? 0 : i * v }.inject(0, :+)
|
101
|
+
y[0] = s.to_f / d.compact.inject(:+)
|
62
102
|
end
|
63
103
|
|
64
104
|
# Report
|
65
105
|
y.unshift(sq) if o[:perseq]
|
106
|
+
y.unshift(o[:name]) if o[:name]
|
66
107
|
y << ln if o[:length]
|
67
|
-
|
108
|
+
y << (ln - zeroes).to_f / ln if o[:breadth]
|
109
|
+
y.join("\t")
|
68
110
|
end
|
69
111
|
|
70
112
|
# Read BedGraph
|
71
|
-
d = []
|
113
|
+
d = [] # [ depth => count ]
|
72
114
|
ln = 0
|
73
115
|
pre_sq = nil
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
r.map! { |j| j.to_i }
|
85
|
-
l = r[1]-r[0]
|
86
|
-
d[ r[2] ] ||= 0
|
87
|
-
d[ r[2] ] += l
|
88
|
-
ln += l
|
89
|
-
pre_sq = sq
|
116
|
+
ifh = reader(o[:i])
|
117
|
+
ofh = writer(o[:o])
|
118
|
+
ifh.each_line do |i|
|
119
|
+
next if i =~ /^#/
|
120
|
+
r = i.chomp.split("\t")
|
121
|
+
sq = r.shift # Contig ID
|
122
|
+
if o[:perseq] && !pre_sq.nil? && pre_sq != sq
|
123
|
+
ofh.puts(report(pre_sq, d, ln, o))
|
124
|
+
d = []
|
125
|
+
ln = 0
|
90
126
|
end
|
127
|
+
r.map!(&:to_i) # From, To, Depth
|
128
|
+
l = r[1] - r[0] # Window length: To - From
|
129
|
+
d[ r[2] ] ||= 0
|
130
|
+
d[ r[2] ] += l # Add these "l" positions with depth "Depth"
|
131
|
+
ln += l
|
132
|
+
pre_sq = sq
|
91
133
|
end
|
92
|
-
report(pre_sq, d, ln, o)
|
134
|
+
ofh.puts(report(pre_sq, d, ln, o))
|
135
|
+
|
136
|
+
ifh.close
|
137
|
+
ofh.close
|
93
138
|
|
@@ -4,13 +4,14 @@ DATA_LINK="https://www.ebi.ac.uk/ena/portal/api/filereport"
|
|
4
4
|
DATA_OPS="result=read_run&fields=run_accession,fastq_ftp,fastq_md5"
|
5
5
|
SRX=$1
|
6
6
|
DIR=${2:-$SRX}
|
7
|
-
VERSION=
|
7
|
+
VERSION=2.0
|
8
8
|
|
9
9
|
if [[ "$SRX" == "" ]] ; then
|
10
10
|
echo "
|
11
11
|
[Enveomics Collection: $(basename "$0" .bash) $VERSION]
|
12
12
|
|
13
13
|
Downloads the set of runs from a project, sample, or experiment in SRA.
|
14
|
+
If the expected file already exists, skips the file if the MD5 hash matches.
|
14
15
|
|
15
16
|
Usage:
|
16
17
|
$(basename "$0") <SRA-ID>[ <dir>]
|
@@ -42,9 +43,20 @@ tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
|
|
42
43
|
echo "o $srr" >&2
|
43
44
|
for uri in $(echo "$ftp" | tr ";" " ") ; do
|
44
45
|
file="$dir/$(basename $uri)"
|
46
|
+
|
47
|
+
# Check if it exists and it's complete
|
48
|
+
if [[ -s "$file" ]] ; then
|
49
|
+
md5obs=$(md5value "$file" 2> /dev/null)
|
50
|
+
if [[ "$md5;" == "$md5obs;"* ]] ; then
|
51
|
+
md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
|
52
|
+
continue
|
53
|
+
fi
|
54
|
+
fi
|
55
|
+
|
56
|
+
# Otherwise, download and check MD5
|
45
57
|
curl "$uri" -o "$file"
|
46
58
|
md5obs=$(md5value "$file" 2> /dev/null)
|
47
|
-
if [[ "$md5" == "$md5obs"* ]] ; then
|
59
|
+
if [[ "$md5;" == "$md5obs;"* ]] ; then
|
48
60
|
md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
|
49
61
|
else
|
50
62
|
echo "Corrupt file: $file" >&2
|
Binary file
|
@@ -1,7 +1,7 @@
|
|
1
1
|
Package: enveomics.R
|
2
|
-
Version: 1.
|
3
|
-
Authors@R: c(person("Luis M.","Rodriguez-R",role=c("aut","cre"),
|
4
|
-
email="lmrodriguezr@gmail.com"))
|
2
|
+
Version: 1.9.0
|
3
|
+
Authors@R: c(person("Luis M.", "Rodriguez-R", role = c("aut", "cre"),
|
4
|
+
email = "lmrodriguezr@gmail.com"))
|
5
5
|
Title: Various Utilities for Microbial Genomics and Metagenomics
|
6
6
|
Description: A collection of functions for microbial ecology and other
|
7
7
|
applications of genomics and metagenomics. Companion package for the
|
@@ -9,7 +9,7 @@ Description: A collection of functions for microbial ecology and other
|
|
9
9
|
<DOI:10.7287/peerj.preprints.1900v1>).
|
10
10
|
Author: Luis M. Rodriguez-R [aut, cre]
|
11
11
|
Maintainer: Luis M. Rodriguez-R <lmrodriguezr@gmail.com>
|
12
|
-
URL: http://enve-omics.ce.gatech.edu/enveomics
|
12
|
+
URL: http://enve-omics.ce.gatech.edu/enveomics/
|
13
13
|
Depends:
|
14
14
|
R (>= 2.9),
|
15
15
|
stats,
|
@@ -28,4 +28,4 @@ Suggests:
|
|
28
28
|
License: Artistic-2.0
|
29
29
|
LazyData: yes
|
30
30
|
Encoding: UTF-8
|
31
|
-
RoxygenNote: 7.
|
31
|
+
RoxygenNote: 7.1.2
|
@@ -22,134 +22,146 @@
|
|
22
22
|
#'
|
23
23
|
#' @export
|
24
24
|
|
25
|
-
enve.prune.dist <- function
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
){
|
34
|
-
if(!requireNamespace("ape", quietly=TRUE))
|
25
|
+
enve.prune.dist <- function(
|
26
|
+
t,
|
27
|
+
dist.quantile = 0.25,
|
28
|
+
min_dist,
|
29
|
+
quiet = FALSE,
|
30
|
+
max_iters = 100,
|
31
|
+
min_nodes_random = 4e4,
|
32
|
+
random_nodes_frx = 1
|
33
|
+
) {
|
34
|
+
if (!requireNamespace("ape", quietly = TRUE))
|
35
35
|
stop('Unavailable ape library.');
|
36
|
-
if(is.character(t)) t <- ape::read.tree(t)
|
37
|
-
if(missing(min_dist)){
|
38
|
-
if(dist.quantile>0){
|
39
|
-
min_dist <- as.numeric(quantile(t$edge.length, dist.quantile))
|
40
|
-
}else{
|
41
|
-
min_dist <- as.numeric(min(t$edge.length[t$edge.length>0]))
|
36
|
+
if (is.character(t)) t <- ape::read.tree(t)
|
37
|
+
if (missing(min_dist)) {
|
38
|
+
if (dist.quantile > 0) {
|
39
|
+
min_dist <- as.numeric(quantile(t$edge.length, dist.quantile))
|
40
|
+
} else {
|
41
|
+
min_dist <- as.numeric(min(t$edge.length[t$edge.length>0]))
|
42
42
|
}
|
43
43
|
}
|
44
|
-
if(!quiet)
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
44
|
+
if (!quiet)
|
45
|
+
cat("\nObjective minimum distance: ", min_dist, ", initial tips: ",
|
46
|
+
length(t$tip.label), "\n", sep = "")
|
47
|
+
|
48
|
+
round <- 1
|
49
|
+
while (round <= max_iters) {
|
50
|
+
if (length(t$tip.label) > min_nodes_random) {
|
51
|
+
if (!quiet)
|
52
|
+
cat(" | Iter: ", round - 1, ", Tips: ", length(t$tip.label),
|
53
|
+
", reducing tip-pairs.\n", sep = "")
|
54
|
+
rnd.nodes <- sample(t$tip.label, length(t$tip.label) * random_nodes_frx)
|
55
|
+
t <- enve.__prune.reduce(t, rnd.nodes, min_dist, quiet)
|
56
|
+
} else {
|
57
|
+
if (!quiet) cat(" Gathering distances...\r")
|
58
|
+
d <- cophenetic(t)
|
59
|
+
diag(d) <- NA
|
60
|
+
if(!quiet)
|
61
|
+
cat(" | Iter: ", round - 1, ", Tips: ", length(t$tip.label),
|
62
|
+
", Median distance: ", median(d, na.rm = TRUE),
|
63
|
+
", Minimum distance: ", min(d, na.rm = TRUE), "\n", sep = "")
|
60
64
|
# Run iteration
|
61
|
-
if(min(d, na.rm=TRUE) < min_dist){
|
62
|
-
t <- enve.__prune.iter(t, d, min_dist, quiet)
|
63
|
-
}else{
|
64
|
-
break
|
65
|
+
if (min(d, na.rm = TRUE) < min_dist) {
|
66
|
+
t <- enve.__prune.iter(t, d, min_dist, quiet)
|
67
|
+
} else {
|
68
|
+
break
|
65
69
|
}
|
66
70
|
}
|
67
|
-
round <- round + 1
|
71
|
+
round <- round + 1
|
68
72
|
}
|
69
|
-
return(t)
|
73
|
+
return(t)
|
70
74
|
}
|
71
75
|
|
72
76
|
#' Enveomics: Prune Reduce (Internal Function)
|
73
77
|
#'
|
74
78
|
#' Internal function for \code{\link{enve.prune.dist}}.
|
75
79
|
#'
|
76
|
-
#' @param t A \strong{phylo} object
|
77
|
-
#' @param nodes Vector of nodes
|
78
|
-
#' @param min_dist Minimum distance
|
79
|
-
#' @param quiet If running quietly
|
80
|
+
#' @param t A \strong{phylo} object.
|
81
|
+
#' @param nodes Vector of nodes.
|
82
|
+
#' @param min_dist Minimum distance.
|
83
|
+
#' @param quiet If running quietly.
|
84
|
+
#'
|
85
|
+
#' @return A \strong{phylo} object.
|
80
86
|
#'
|
81
87
|
#' @author Luis M. Rodriguez-R [aut, cre]
|
82
88
|
#'
|
83
89
|
#' @export
|
84
90
|
|
85
|
-
enve.__prune.reduce <- function
|
86
|
-
(
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
+
enve.__prune.reduce <- function(t, nodes, min_dist, quiet) {
|
92
|
+
if (!quiet) pb <- txtProgressBar(1, length(nodes), style = 3)
|
93
|
+
for (i in 1:length(nodes)) {
|
94
|
+
node.name <- nodes[i]
|
95
|
+
if (!quiet) setTxtProgressBar(pb, i)
|
96
|
+
|
91
97
|
# Get node ID
|
92
|
-
node <- which(t$tip.label==node.name)
|
93
|
-
if(length(node)==0) next
|
98
|
+
node <- which(t$tip.label == node.name)
|
99
|
+
if (length(node) == 0) next
|
100
|
+
|
94
101
|
# Get parent and distance to parent
|
95
|
-
parent.node <- t$edge[
|
102
|
+
parent.node <- t$edge[t$edge[, 2] == node, 1]
|
103
|
+
|
96
104
|
# Get edges to parent
|
97
|
-
parent.edges <- which(t$edge[,1]==parent.node)
|
98
|
-
stopit <- FALSE
|
99
|
-
for(j in parent.edges){
|
100
|
-
for(k in parent.edges){
|
101
|
-
if(j != k & t$edge[j,2]<length(t$tip.label) &
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
+
parent.edges <- which(t$edge[, 1] == parent.node)
|
106
|
+
stopit <- FALSE
|
107
|
+
for (j in parent.edges) {
|
108
|
+
for (k in parent.edges) {
|
109
|
+
if (j != k & t$edge[j,2]<length(t$tip.label) &
|
110
|
+
t$edge[k,2]<length(t$tip.label) &
|
111
|
+
sum(t$edge.length[c(j,k)]) < min_dist) {
|
112
|
+
t <- ape::drop.tip(t, t$edge[k,2])
|
113
|
+
stopit <- TRUE
|
114
|
+
break
|
105
115
|
}
|
106
116
|
}
|
107
|
-
if(stopit) break
|
117
|
+
if (stopit) break
|
108
118
|
}
|
109
119
|
}
|
110
|
-
if(!quiet) cat(
|
111
|
-
return(t)
|
120
|
+
if (!quiet) cat("\n")
|
121
|
+
return(t)
|
112
122
|
}
|
113
123
|
|
114
124
|
#' Enveomics: Prune Iter (Internal Function)
|
115
125
|
#'
|
116
126
|
#' Internal function for \code{\link{enve.prune.dist}}.
|
117
127
|
#'
|
118
|
-
#' @param t A \strong{phylo} object
|
119
|
-
#' @param dist Cophenetic distance matrix
|
120
|
-
#' @param min_dist Minimum distance
|
121
|
-
#' @param quiet If running quietly
|
128
|
+
#' @param t A \strong{phylo} object.
|
129
|
+
#' @param dist Cophenetic distance matrix.
|
130
|
+
#' @param min_dist Minimum distance.
|
131
|
+
#' @param quiet If running quietly.
|
132
|
+
#'
|
133
|
+
#' @return Returns a \strong{phylo} object.
|
122
134
|
#'
|
123
135
|
#' @author Luis M. Rodriguez-R [aut, cre]
|
124
136
|
#'
|
125
137
|
#' @export
|
126
138
|
|
127
|
-
enve.__prune.iter <- function
|
128
|
-
(t
|
129
|
-
|
130
|
-
min_dist,
|
131
|
-
quiet){
|
132
|
-
ori_len <- length(t$tip.label);
|
139
|
+
enve.__prune.iter <- function(t, dist, min_dist, quiet) {
|
140
|
+
ori_len <- length(t$tip.label)
|
141
|
+
|
133
142
|
# Prune
|
134
|
-
if(!quiet) pb <- txtProgressBar(1, ncol(dist)-1, style=3)
|
135
|
-
ignore <- c()
|
136
|
-
for(i in 1:(ncol(dist)-1)){
|
137
|
-
if(i %in% ignore) next
|
138
|
-
for(j in (i+1):nrow(dist)){
|
139
|
-
if(dist[j, i]<min_dist){
|
140
|
-
t <- ape::drop.tip(t, rownames(dist)[j])
|
141
|
-
ignore <- c(ignore, j)
|
142
|
-
break
|
143
|
+
if (!quiet) pb <- txtProgressBar(1, ncol(dist) - 1, style = 3)
|
144
|
+
ignore <- c()
|
145
|
+
for (i in 1:(ncol(dist) - 1)) {
|
146
|
+
if (i %in% ignore) next
|
147
|
+
for (j in (i + 1):nrow(dist)) {
|
148
|
+
if (dist[j, i] < min_dist) {
|
149
|
+
t <- ape::drop.tip(t, rownames(dist)[j])
|
150
|
+
ignore <- c(ignore, j)
|
151
|
+
break
|
143
152
|
}
|
144
153
|
}
|
145
|
-
if(!quiet) setTxtProgressBar(pb, i)
|
154
|
+
if (!quiet) setTxtProgressBar(pb, i)
|
146
155
|
}
|
147
|
-
if(!quiet) cat(
|
156
|
+
if(!quiet) cat("\n")
|
157
|
+
|
148
158
|
# Check if it droped tips
|
149
|
-
cur_len <- length(t$tip.label)
|
150
|
-
if(cur_len == ori_len){
|
151
|
-
stop(
|
159
|
+
cur_len <- length(t$tip.label)
|
160
|
+
if (cur_len == ori_len){
|
161
|
+
stop(
|
162
|
+
"Internal error: small edge found in tree, with no equivalent in distance matrix.\n"
|
163
|
+
)
|
152
164
|
}
|
153
|
-
return(t)
|
165
|
+
return(t)
|
154
166
|
}
|
155
167
|
|