miga-base 1.2.17.1 → 1.2.17.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (93) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/remote_dataset/download.rb +1 -1
  3. data/lib/miga/remote_dataset.rb +9 -4
  4. data/lib/miga/version.rb +2 -2
  5. data/utils/enveomics/Manifest/Tasks/mapping.json +39 -11
  6. data/utils/enveomics/Manifest/Tasks/remote.json +2 -1
  7. data/utils/enveomics/Scripts/BedGraph.tad.rb +98 -53
  8. data/utils/enveomics/Scripts/SRA.download.bash +14 -2
  9. data/utils/enveomics/Tests/low-cov.bg.gz +0 -0
  10. data/utils/enveomics/enveomics.R/DESCRIPTION +5 -5
  11. data/utils/enveomics/enveomics.R/R/autoprune.R +99 -87
  12. data/utils/enveomics/enveomics.R/R/barplot.R +116 -97
  13. data/utils/enveomics/enveomics.R/R/cliopts.R +65 -59
  14. data/utils/enveomics/enveomics.R/R/df2dist.R +96 -58
  15. data/utils/enveomics/enveomics.R/R/growthcurve.R +166 -148
  16. data/utils/enveomics/enveomics.R/R/recplot.R +201 -136
  17. data/utils/enveomics/enveomics.R/R/recplot2.R +371 -304
  18. data/utils/enveomics/enveomics.R/R/tribs.R +318 -263
  19. data/utils/enveomics/enveomics.R/R/utils.R +30 -20
  20. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +4 -3
  21. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +2 -2
  22. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +3 -3
  23. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +7 -4
  24. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +7 -4
  25. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +4 -0
  26. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +25 -17
  27. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +10 -0
  28. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +8 -2
  29. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +14 -0
  30. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +20 -1
  31. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +2 -3
  32. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +5 -2
  33. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +50 -42
  34. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +5 -2
  35. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +3 -0
  36. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +3 -0
  37. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +3 -0
  38. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +3 -0
  39. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +9 -4
  40. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +3 -0
  41. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +3 -3
  42. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +0 -2
  43. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +4 -0
  44. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +5 -0
  45. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +11 -7
  46. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +5 -1
  47. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +3 -0
  48. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +2 -2
  49. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +3 -3
  50. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +2 -2
  51. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +3 -0
  52. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +3 -0
  53. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +6 -3
  54. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +2 -2
  55. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +3 -0
  56. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +3 -0
  57. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +3 -0
  58. metadata +3 -37
  59. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +0 -69
  60. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
  61. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
  62. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
  63. data/utils/enveomics/Pipelines/assembly.pbs/README.md +0 -189
  64. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +0 -112
  65. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +0 -23
  66. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +0 -44
  67. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +0 -50
  68. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +0 -37
  69. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +0 -68
  70. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +0 -49
  71. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +0 -80
  72. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +0 -57
  73. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +0 -63
  74. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +0 -38
  75. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +0 -73
  76. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +0 -21
  77. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +0 -72
  78. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +0 -98
  79. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
  80. data/utils/enveomics/Pipelines/blast.pbs/README.md +0 -127
  81. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +0 -109
  82. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +0 -128
  83. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +0 -16
  84. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +0 -22
  85. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +0 -26
  86. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +0 -89
  87. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +0 -29
  88. data/utils/enveomics/Pipelines/idba.pbs/README.md +0 -49
  89. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +0 -95
  90. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +0 -56
  91. data/utils/enveomics/Pipelines/trim.pbs/README.md +0 -54
  92. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +0 -70
  93. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +0 -130
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 608607327562dd08edc9f866aeeb566407eb85f0adcf8538c37789962c387f72
4
- data.tar.gz: b21cb37dcae1eab3551d2058f21543221a9b6b9a5b6c834074a4d7c6c60a7102
3
+ metadata.gz: 57016515d9789927d6eda96e04be04dd9d47e6f330ea8f2f489971d7f0ad3845
4
+ data.tar.gz: 3e42b22e637fc5ad47405eaac88fcc7ba00d94abfe86035c8a8a613b54e06bbe
5
5
  SHA512:
6
- metadata.gz: 6708285348840ed44251d64d003c477a0887db497ccfaf97ce17f398b650156420897fc07036cbc395b4107c71bdb6638ad756442e364e530363a36aca8ec9a3
7
- data.tar.gz: fce107bebd89fd53f07d0ca5b814564b76c00b6faacbb0c89b91c0a114b2c5d5911b446fa27f1db5cecab3d74dee2ab3c2d6b10538a3e33056ab2b2ba766c2e9
6
+ metadata.gz: c2b8cf60bfc487960e66d4a35a61fcb691d166baf31656e115c28dd3854276e062f6c9325d2d4ff924a563327a01c05f4ecfd570004deb54db05b065b955281a
7
+ data.tar.gz: 6124879ca61700d6cf58aaa27c2db405874b1fdcaeed829b835662eeb92f6a59383f1fb2ed5bb073d5b97d1bef2ed15f0580821a267cd6f5f0c65ef2dbb2f1bc
@@ -61,7 +61,7 @@ class MiGA::RemoteDataset
61
61
  # Supported +opts+ (Hash) are the same as #download_rest and #ncbi_asm_rest.
62
62
  def ncbi_gb_rest(opts)
63
63
  # Simply use defaults, but ensure that the URL can be properly formed
64
- o = download_rest(opts.merge(universe: :ncbi, db: :nuccore, format: :gb))
64
+ o = download_rest(opts.merge(universe: :ncbi, db: :nuccore))
65
65
  return o unless o.strip.empty?
66
66
 
67
67
  MiGA::MiGA.DEBUG 'Empty sequence, attempting download from NCBI assembly'
@@ -134,7 +134,10 @@ class MiGA::RemoteDataset < MiGA::MiGA
134
134
  # Get taxonomy
135
135
  @metadata[:tax] = get_gtdb_taxonomy
136
136
  when :seqcode
137
- # Do nothing, taxonomy already defined
137
+ # Taxonomy already defined
138
+ # Copy IDs over to allow additional metadata to be linked
139
+ @metadata[:ncbi_asm] = @metadata[:seqcode_asm]
140
+ @metadata[:ncbi_nuccore] = @metadata[:seqcode_nuccore]
138
141
  end
139
142
 
140
143
  if metadata[:get_ncbi_taxonomy]
@@ -276,6 +279,10 @@ class MiGA::RemoteDataset < MiGA::MiGA
276
279
  def get_type_status_ncbi_asm(metadata)
277
280
  return metadata if ncbi_asm_json_doc.nil?
278
281
 
282
+ metadata[:suspect] = (ncbi_asm_json_doc['exclfromrefseq'] || [])
283
+ metadata[:suspect] = nil if metadata[:suspect].empty?
284
+ return metadata if metadata[:is_type] # If predefined, as in SeqCode
285
+
279
286
  from_type = ncbi_asm_json_doc['from_type']
280
287
  from_type = ncbi_asm_json_doc['fromtype'] if from_type.nil?
281
288
  case from_type
@@ -292,8 +299,6 @@ class MiGA::RemoteDataset < MiGA::MiGA
292
299
  metadata[:is_type] = true
293
300
  metadata[:type_rel] = from_type
294
301
  end
295
- metadata[:suspect] = (ncbi_asm_json_doc['exclfromrefseq'] || [])
296
- metadata[:suspect] = nil if metadata[:suspect].empty?
297
302
  MiGA.DEBUG "Got type: #{from_type}"
298
303
  metadata
299
304
  end
@@ -306,7 +311,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
306
311
  File.open("#{base}.start", 'w') { |ofh| ofh.puts Time.now.to_s }
307
312
  if udb[:format] == :fasta_gz
308
313
  download "#{l_ctg}.gz"
309
- system "gzip -d '#{l_ctg}.gz'"
314
+ system "gzip -fd '#{l_ctg}.gz'"
310
315
  else
311
316
  download l_ctg
312
317
  end
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.2, 17, 1].freeze
15
+ VERSION = [1.2, 17, 3].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2023, 2, 14)
23
+ VERSION_DATE = Date.new(2023, 2, 22)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -2,33 +2,61 @@
2
2
  "tasks": [
3
3
  {
4
4
  "task": "BedGraph.tad.rb",
5
- "description": ["Estimates the truncated average sequencing depth (TAD)",
6
- "from a BedGraph file."],
7
- "warn": ["This script doesn't consider zero-coverage positions if",
5
+ "description": [
6
+ "Estimates the truncated average sequencing depth (TAD)",
7
+ "from a BedGraph file."
8
+ ],
9
+ "warn": [
10
+ "This script doesn't consider zero-coverage positions if",
8
11
  "missing from the file. If you produce your BedGraph file with",
9
12
  "bedtools genomecov and want to consider zero-coverage position, be",
10
- "sure to use -bga (not -bg)."],
11
- "see_also": ["BedGraph.window.rb",
12
- "BlastTab.seqdepth.pl", "BlastTab.seqdepth_ZIP.pl"],
13
+ "sure to use -bga (not -bg)."
14
+ ],
15
+ "see_also": [
16
+ "BedGraph.window.rb", "BlastTab.seqdepth.pl", "BlastTab.seqdepth_ZIP.pl"
17
+ ],
13
18
  "help_arg": "--help",
14
19
  "options": [
15
20
  {
16
21
  "opt": "--input",
17
22
  "arg": "in_file",
18
23
  "mandatory": true,
19
- "description": "Input BedGraph file."
24
+ "description": [
25
+ "Input BedGraph file.",
26
+ "Supports compression with .gz extension, use - for STDIN."
27
+ ]
28
+ },
29
+ {
30
+ "opt": "--output",
31
+ "arg": "out_file",
32
+ "default": "-",
33
+ "description": [
34
+ "Output tab-delimited values (by default, STDOUT).",
35
+ "Supports compression with .gz extension, use - for STDOUT."
36
+ ]
20
37
  },
21
38
  {
22
39
  "opt": "--range",
23
40
  "arg": "float",
24
41
  "default": 0.5,
25
- "description": ["Central range to consider, between 0 and 1. By",
26
- "default: inter-quartile range (0.5)."]
42
+ "description": [
43
+ "Central range to consider, between 0 and 1.",
44
+ "By default: inter-quartile range (0.5)."
45
+ ]
46
+ },
47
+ {
48
+ "opt": "--name",
49
+ "arg": "string",
50
+ "description": [
51
+ "Name (ID) of the sequence, added as first column."
52
+ ]
27
53
  },
28
54
  {
29
55
  "opt": "--per-seq",
30
- "description": ["Calculate averages per reference sequence, not",
31
- "total. Assumes a sorted BedGraph file."]
56
+ "description": [
57
+ "Calculate averages per reference sequence, not total.",
58
+ "Assumes a sorted BedGraph file."
59
+ ]
32
60
  },
33
61
  {
34
62
  "opt": "--length",
@@ -184,7 +184,8 @@
184
184
  {
185
185
  "task": "SRA.download.bash",
186
186
  "description": ["Downloads the set of runs from a project, sample, or",
187
- "experiment in SRA."],
187
+ "experiment in SRA. If the expected file already exists, skips the",
188
+ "file if the MD5 hash matches."],
188
189
  "help_arg": "",
189
190
  "requires": [
190
191
  {
@@ -1,93 +1,138 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'optparse'
3
+ $VERSION = 1.01
4
+ $:.push File.expand_path('../lib', __FILE__)
5
+ require 'enveomics_rb/enveomics'
4
6
 
5
- o = {range: 0.5, perseq: false, length: false}
6
- ARGV << '-h' if ARGV.empty?
7
- OptionParser.new do |opt|
8
- opt.banner = "
9
- Estimates the truncated average sequencing depth (TAD) from a BedGraph file.
7
+ o = { range: 0.5, perseq: false, length: false, o: '-' }
8
+ OptionParser.new do |opts|
9
+ opts.version = $VERSION
10
+ banner = <<~BANNER
11
+ Estimates the truncated average sequencing depth (TAD) from a BedGraph file
10
12
 
11
- IMPORTANT: This script doesn't consider zero-coverage positions if missing
12
- from the file. If you produce your BedGraph file with bedtools genomecov and
13
- want to consider zero-coverage position, be sure to use -bga (not -bg).
13
+ IMPORTANT: This script doesn't consider zero-coverage positions if missing
14
+ from the file. If you produce your BedGraph file with bedtools genomecov and
15
+ want to consider zero-coverage position, be sure to use -bga (not -bg).
16
+ BANNER
17
+ Enveomics.opt_banner(opts, banner, "#{File.basename($0)} [options]")
14
18
 
15
- Usage: #{$0} [options]"
16
- opt.separator ''
17
- opt.on('-i', '--input PATH',
18
- 'Input BedGraph file (mandatory).'){ |v| o[:i]=v }
19
- opt.on('-r', '--range FLOAT',
20
- 'Central range to consider, between 0 and 1.',
21
- "By default: #{o[:range]} (inter-quartile range)."
22
- ){ |v| o[:range]=v.to_f }
23
- opt.on('-s', '--per-seq',
24
- 'Calculate averages per reference sequence, not total.',
25
- 'Assumes a sorted BedGraph file.'
26
- ){ |v| o[:perseq] = v }
27
- opt.on('-l', '--length',
28
- 'Add sequence length to the output.'){ |v| o[:length] = v }
29
- opt.on('-h', '--help', 'Display this screen.') do
30
- puts opt
19
+ opts.separator 'Mandatory'
20
+ opts.on(
21
+ '-i', '--input PATH',
22
+ 'Input BedGraph file',
23
+ 'Supports compression with .gz extension, use - for STDIN'
24
+ ) { |v| o[:i] = v }
25
+
26
+ opts.separator ''
27
+ opts.separator 'Other Options'
28
+ opts.on(
29
+ '-o', '--out PATH',
30
+ 'Output tab-delimited values (by default, STDOUT)',
31
+ 'Supports compression with .gz extension, use - for STDOUT'
32
+ ) { |v| o[:o] = v }
33
+ opts.on(
34
+ '-r', '--range FLOAT', Float,
35
+ 'Central range to consider, between 0 and 1',
36
+ "By default: #{o[:range]} (inter-quartile range)"
37
+ ) { |v| o[:range] = v }
38
+ opts.on(
39
+ '-n', '--name STRING',
40
+ 'Name (ID) of the sequence (added as first column)'
41
+ ) { |v| o[:name] = v }
42
+ opts.on(
43
+ '-s', '--per-seq',
44
+ 'Calculate averages per reference sequence, not total',
45
+ 'Assumes a sorted BedGraph file'
46
+ ) { |v| o[:perseq] = v }
47
+ opts.on(
48
+ '-l', '--length',
49
+ 'Add sequence length to the output'
50
+ ) { |v| o[:length] = v }
51
+ opts.on(
52
+ '-b', '--breadth',
53
+ 'Add sequencing breadth to the output'
54
+ ) { |v| o[:breadth] = v }
55
+ opts.on('-h', '--help', 'Display this screen') do
56
+ puts opts
31
57
  exit
32
58
  end
33
- opt.separator ''
59
+ opts.separator ''
34
60
  end.parse!
35
- abort '-i is mandatory.' if o[:i].nil?
61
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:i].nil?
36
62
 
63
+ ##
64
+ # Pad an array to include all index values up to +r+ entries:
65
+ # - d: Array of [ depth => counts ]
66
+ # - idx: Array of [ depth, depth, ... ]
67
+ # - r: Expected number of entries in the array
37
68
  def pad(d, idx, r)
38
69
  idx.each do |i|
39
70
  next if d[i].nil?
71
+
40
72
  d[i] -= r
41
73
  break unless d[i] < 0
74
+
42
75
  r = -d[i]
43
76
  d[i] = nil
44
77
  end
45
78
  d
46
79
  end
47
80
 
81
+ ##
82
+ # Report the results for:
83
+ # - sq: Contig ID
84
+ # - d: Array of [ depth => counts ]
85
+ # - ln: Length of the sequence
86
+ # - o: CLI Options
48
87
  def report(sq, d, ln, o)
49
88
  # Estimate padding ranges
50
- pad = (1.0-o[:range])/2.0
51
- r = (pad*ln).round
89
+ pad = (1.0 - o[:range]) / 2.0
90
+ r = (pad * ln).round
91
+ zeroes = d[0].to_i
52
92
 
53
- # Pad
54
- d = pad(d, d.each_index.to_a, r+0)
55
- d = pad(d, d.each_index.to_a.reverse, r+0)
93
+ # Pad (truncation)
94
+ d = pad(d, d.each_index.to_a, r + 0)
95
+ d = pad(d, d.each_index.to_a.reverse, r + 0)
56
96
 
57
97
  # Average
58
98
  y = [0.0]
59
99
  unless d.compact.empty?
60
- s = d.each_with_index.to_a.map{ |v,i| v.nil? ? 0 : i*v }.inject(0,:+)
61
- y[0] = s.to_f/d.compact.inject(:+)
100
+ s = d.each_with_index.to_a.map { |v, i| v.nil? ? 0 : i * v }.inject(0, :+)
101
+ y[0] = s.to_f / d.compact.inject(:+)
62
102
  end
63
103
 
64
104
  # Report
65
105
  y.unshift(sq) if o[:perseq]
106
+ y.unshift(o[:name]) if o[:name]
66
107
  y << ln if o[:length]
67
- puts y.join("\t")
108
+ y << (ln - zeroes).to_f / ln if o[:breadth]
109
+ y.join("\t")
68
110
  end
69
111
 
70
112
  # Read BedGraph
71
- d = []
113
+ d = [] # [ depth => count ]
72
114
  ln = 0
73
115
  pre_sq = nil
74
- File.open(o[:i], "r") do |ifh|
75
- ifh.each_line do |i|
76
- next if i =~ /^#/
77
- r = i.chomp.split("\t")
78
- sq = r.shift
79
- if o[:perseq] and !pre_sq.nil? and pre_sq!=sq
80
- report(pre_sq, d, ln, o)
81
- d = []
82
- ln = 0
83
- end
84
- r.map! { |j| j.to_i }
85
- l = r[1]-r[0]
86
- d[ r[2] ] ||= 0
87
- d[ r[2] ] += l
88
- ln += l
89
- pre_sq = sq
116
+ ifh = reader(o[:i])
117
+ ofh = writer(o[:o])
118
+ ifh.each_line do |i|
119
+ next if i =~ /^#/
120
+ r = i.chomp.split("\t")
121
+ sq = r.shift # Contig ID
122
+ if o[:perseq] && !pre_sq.nil? && pre_sq != sq
123
+ ofh.puts(report(pre_sq, d, ln, o))
124
+ d = []
125
+ ln = 0
90
126
  end
127
+ r.map!(&:to_i) # From, To, Depth
128
+ l = r[1] - r[0] # Window length: To - From
129
+ d[ r[2] ] ||= 0
130
+ d[ r[2] ] += l # Add these "l" positions with depth "Depth"
131
+ ln += l
132
+ pre_sq = sq
91
133
  end
92
- report(pre_sq, d, ln, o)
134
+ ofh.puts(report(pre_sq, d, ln, o))
135
+
136
+ ifh.close
137
+ ofh.close
93
138
 
@@ -4,13 +4,14 @@ DATA_LINK="https://www.ebi.ac.uk/ena/portal/api/filereport"
4
4
  DATA_OPS="result=read_run&fields=run_accession,fastq_ftp,fastq_md5"
5
5
  SRX=$1
6
6
  DIR=${2:-$SRX}
7
- VERSION=1.0
7
+ VERSION=2.0
8
8
 
9
9
  if [[ "$SRX" == "" ]] ; then
10
10
  echo "
11
11
  [Enveomics Collection: $(basename "$0" .bash) $VERSION]
12
12
 
13
13
  Downloads the set of runs from a project, sample, or experiment in SRA.
14
+ If the expected file already exists, skips the file if the MD5 hash matches.
14
15
 
15
16
  Usage:
16
17
  $(basename "$0") <SRA-ID>[ <dir>]
@@ -42,9 +43,20 @@ tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
42
43
  echo "o $srr" >&2
43
44
  for uri in $(echo "$ftp" | tr ";" " ") ; do
44
45
  file="$dir/$(basename $uri)"
46
+
47
+ # Check if it exists and it's complete
48
+ if [[ -s "$file" ]] ; then
49
+ md5obs=$(md5value "$file" 2> /dev/null)
50
+ if [[ "$md5;" == "$md5obs;"* ]] ; then
51
+ md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
52
+ continue
53
+ fi
54
+ fi
55
+
56
+ # Otherwise, download and check MD5
45
57
  curl "$uri" -o "$file"
46
58
  md5obs=$(md5value "$file" 2> /dev/null)
47
- if [[ "$md5" == "$md5obs"* ]] ; then
59
+ if [[ "$md5;" == "$md5obs;"* ]] ; then
48
60
  md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
49
61
  else
50
62
  echo "Corrupt file: $file" >&2
Binary file
@@ -1,7 +1,7 @@
1
1
  Package: enveomics.R
2
- Version: 1.8.0
3
- Authors@R: c(person("Luis M.","Rodriguez-R",role=c("aut","cre"),
4
- email="lmrodriguezr@gmail.com"))
2
+ Version: 1.9.0
3
+ Authors@R: c(person("Luis M.", "Rodriguez-R", role = c("aut", "cre"),
4
+ email = "lmrodriguezr@gmail.com"))
5
5
  Title: Various Utilities for Microbial Genomics and Metagenomics
6
6
  Description: A collection of functions for microbial ecology and other
7
7
  applications of genomics and metagenomics. Companion package for the
@@ -9,7 +9,7 @@ Description: A collection of functions for microbial ecology and other
9
9
  <DOI:10.7287/peerj.preprints.1900v1>).
10
10
  Author: Luis M. Rodriguez-R [aut, cre]
11
11
  Maintainer: Luis M. Rodriguez-R <lmrodriguezr@gmail.com>
12
- URL: http://enve-omics.ce.gatech.edu/enveomics
12
+ URL: http://enve-omics.ce.gatech.edu/enveomics/
13
13
  Depends:
14
14
  R (>= 2.9),
15
15
  stats,
@@ -28,4 +28,4 @@ Suggests:
28
28
  License: Artistic-2.0
29
29
  LazyData: yes
30
30
  Encoding: UTF-8
31
- RoxygenNote: 7.0.2
31
+ RoxygenNote: 7.1.2
@@ -22,134 +22,146 @@
22
22
  #'
23
23
  #' @export
24
24
 
25
- enve.prune.dist <- function
26
- (t,
27
- dist.quantile=0.25,
28
- min_dist,
29
- quiet=FALSE,
30
- max_iters=100,
31
- min_nodes_random=4e4,
32
- random_nodes_frx=1
33
- ){
34
- if(!requireNamespace("ape", quietly=TRUE))
25
+ enve.prune.dist <- function(
26
+ t,
27
+ dist.quantile = 0.25,
28
+ min_dist,
29
+ quiet = FALSE,
30
+ max_iters = 100,
31
+ min_nodes_random = 4e4,
32
+ random_nodes_frx = 1
33
+ ) {
34
+ if (!requireNamespace("ape", quietly = TRUE))
35
35
  stop('Unavailable ape library.');
36
- if(is.character(t)) t <- ape::read.tree(t)
37
- if(missing(min_dist)){
38
- if(dist.quantile>0){
39
- min_dist <- as.numeric(quantile(t$edge.length, dist.quantile));
40
- }else{
41
- min_dist <- as.numeric(min(t$edge.length[t$edge.length>0]));
36
+ if (is.character(t)) t <- ape::read.tree(t)
37
+ if (missing(min_dist)) {
38
+ if (dist.quantile > 0) {
39
+ min_dist <- as.numeric(quantile(t$edge.length, dist.quantile))
40
+ } else {
41
+ min_dist <- as.numeric(min(t$edge.length[t$edge.length>0]))
42
42
  }
43
43
  }
44
- if(!quiet) cat('\nObjective minimum distance: ',min_dist,', initial tips: ',length(t$tip.label),'\n', sep='');
45
- round=1;
46
- while(round <= max_iters){
47
- if(length(t$tip.label) > min_nodes_random){
48
- if(!quiet) cat(' | Iter: ',round-1,', Tips: ', length(t$tip.label),
49
- ', reducing tip-pairs.\n', sep='');
50
- rnd.nodes <- sample(t$tip.label, length(t$tip.label)*random_nodes_frx);
51
- t <- enve.__prune.reduce(t, rnd.nodes, min_dist, quiet);
52
- }else{
53
- if(!quiet) cat(' Gathering distances...\r');
54
- d <- cophenetic(t);
55
- diag(d) <- NA;
56
- if(!quiet) cat(' | Iter: ',round-1,', Tips: ', length(t$tip.label),
57
- ', Median distance: ', median(d, na.rm=TRUE),
58
- ', Minimum distance: ', min(d, na.rm=TRUE),
59
- '\n', sep='');
44
+ if (!quiet)
45
+ cat("\nObjective minimum distance: ", min_dist, ", initial tips: ",
46
+ length(t$tip.label), "\n", sep = "")
47
+
48
+ round <- 1
49
+ while (round <= max_iters) {
50
+ if (length(t$tip.label) > min_nodes_random) {
51
+ if (!quiet)
52
+ cat(" | Iter: ", round - 1, ", Tips: ", length(t$tip.label),
53
+ ", reducing tip-pairs.\n", sep = "")
54
+ rnd.nodes <- sample(t$tip.label, length(t$tip.label) * random_nodes_frx)
55
+ t <- enve.__prune.reduce(t, rnd.nodes, min_dist, quiet)
56
+ } else {
57
+ if (!quiet) cat(" Gathering distances...\r")
58
+ d <- cophenetic(t)
59
+ diag(d) <- NA
60
+ if(!quiet)
61
+ cat(" | Iter: ", round - 1, ", Tips: ", length(t$tip.label),
62
+ ", Median distance: ", median(d, na.rm = TRUE),
63
+ ", Minimum distance: ", min(d, na.rm = TRUE), "\n", sep = "")
60
64
  # Run iteration
61
- if(min(d, na.rm=TRUE) < min_dist){
62
- t <- enve.__prune.iter(t, d, min_dist, quiet);
63
- }else{
64
- break;
65
+ if (min(d, na.rm = TRUE) < min_dist) {
66
+ t <- enve.__prune.iter(t, d, min_dist, quiet)
67
+ } else {
68
+ break
65
69
  }
66
70
  }
67
- round <- round + 1;
71
+ round <- round + 1
68
72
  }
69
- return(t);
73
+ return(t)
70
74
  }
71
75
 
72
76
  #' Enveomics: Prune Reduce (Internal Function)
73
77
  #'
74
78
  #' Internal function for \code{\link{enve.prune.dist}}.
75
79
  #'
76
- #' @param t A \strong{phylo} object
77
- #' @param nodes Vector of nodes
78
- #' @param min_dist Minimum distance
79
- #' @param quiet If running quietly
80
+ #' @param t A \strong{phylo} object.
81
+ #' @param nodes Vector of nodes.
82
+ #' @param min_dist Minimum distance.
83
+ #' @param quiet If running quietly.
84
+ #'
85
+ #' @return A \strong{phylo} object.
80
86
  #'
81
87
  #' @author Luis M. Rodriguez-R [aut, cre]
82
88
  #'
83
89
  #' @export
84
90
 
85
- enve.__prune.reduce <- function
86
- (t, nodes, min_dist, quiet){
87
- if(!quiet) pb <- txtProgressBar(1, length(nodes), style=3);
88
- for(i in 1:length(nodes)){
89
- node.name <- nodes[i];
90
- if(!quiet) setTxtProgressBar(pb, i);
91
+ enve.__prune.reduce <- function(t, nodes, min_dist, quiet) {
92
+ if (!quiet) pb <- txtProgressBar(1, length(nodes), style = 3)
93
+ for (i in 1:length(nodes)) {
94
+ node.name <- nodes[i]
95
+ if (!quiet) setTxtProgressBar(pb, i)
96
+
91
97
  # Get node ID
92
- node <- which(t$tip.label==node.name);
93
- if(length(node)==0) next;
98
+ node <- which(t$tip.label == node.name)
99
+ if (length(node) == 0) next
100
+
94
101
  # Get parent and distance to parent
95
- parent.node <- t$edge[ t$edge[,2]==node, 1];
102
+ parent.node <- t$edge[t$edge[, 2] == node, 1]
103
+
96
104
  # Get edges to parent
97
- parent.edges <- which(t$edge[,1]==parent.node);
98
- stopit <- FALSE;
99
- for(j in parent.edges){
100
- for(k in parent.edges){
101
- if(j != k & t$edge[j,2]<length(t$tip.label) & t$edge[k,2]<length(t$tip.label) & sum(t$edge.length[c(j,k)]) < min_dist){
102
- t <- ape::drop.tip(t, t$edge[k,2]);
103
- stopit <- TRUE;
104
- break;
105
+ parent.edges <- which(t$edge[, 1] == parent.node)
106
+ stopit <- FALSE
107
+ for (j in parent.edges) {
108
+ for (k in parent.edges) {
109
+ if (j != k & t$edge[j,2]<length(t$tip.label) &
110
+ t$edge[k,2]<length(t$tip.label) &
111
+ sum(t$edge.length[c(j,k)]) < min_dist) {
112
+ t <- ape::drop.tip(t, t$edge[k,2])
113
+ stopit <- TRUE
114
+ break
105
115
  }
106
116
  }
107
- if(stopit) break;
117
+ if (stopit) break
108
118
  }
109
119
  }
110
- if(!quiet) cat('\n');
111
- return(t);
120
+ if (!quiet) cat("\n")
121
+ return(t)
112
122
  }
113
123
 
114
124
  #' Enveomics: Prune Iter (Internal Function)
115
125
  #'
116
126
  #' Internal function for \code{\link{enve.prune.dist}}.
117
127
  #'
118
- #' @param t A \strong{phylo} object
119
- #' @param dist Cophenetic distance matrix
120
- #' @param min_dist Minimum distance
121
- #' @param quiet If running quietly
128
+ #' @param t A \strong{phylo} object.
129
+ #' @param dist Cophenetic distance matrix.
130
+ #' @param min_dist Minimum distance.
131
+ #' @param quiet If running quietly.
132
+ #'
133
+ #' @return Returns a \strong{phylo} object.
122
134
  #'
123
135
  #' @author Luis M. Rodriguez-R [aut, cre]
124
136
  #'
125
137
  #' @export
126
138
 
127
- enve.__prune.iter <- function
128
- (t,
129
- dist,
130
- min_dist,
131
- quiet){
132
- ori_len <- length(t$tip.label);
139
+ enve.__prune.iter <- function(t, dist, min_dist, quiet) {
140
+ ori_len <- length(t$tip.label)
141
+
133
142
  # Prune
134
- if(!quiet) pb <- txtProgressBar(1, ncol(dist)-1, style=3);
135
- ignore <- c();
136
- for(i in 1:(ncol(dist)-1)){
137
- if(i %in% ignore) next;
138
- for(j in (i+1):nrow(dist)){
139
- if(dist[j, i]<min_dist){
140
- t <- ape::drop.tip(t, rownames(dist)[j]);
141
- ignore <- c(ignore, j);
142
- break;
143
+ if (!quiet) pb <- txtProgressBar(1, ncol(dist) - 1, style = 3)
144
+ ignore <- c()
145
+ for (i in 1:(ncol(dist) - 1)) {
146
+ if (i %in% ignore) next
147
+ for (j in (i + 1):nrow(dist)) {
148
+ if (dist[j, i] < min_dist) {
149
+ t <- ape::drop.tip(t, rownames(dist)[j])
150
+ ignore <- c(ignore, j)
151
+ break
143
152
  }
144
153
  }
145
- if(!quiet) setTxtProgressBar(pb, i);
154
+ if (!quiet) setTxtProgressBar(pb, i)
146
155
  }
147
- if(!quiet) cat('\n');
156
+ if(!quiet) cat("\n")
157
+
148
158
  # Check if it dropped tips
149
- cur_len <- length(t$tip.label);
150
- if(cur_len == ori_len){
151
- stop("Internal error: small edge found in tree, with no equivalent in distance matrix.\n");
159
+ cur_len <- length(t$tip.label)
160
+ if (cur_len == ori_len){
161
+ stop(
162
+ "Internal error: small edge found in tree, with no equivalent in distance matrix.\n"
163
+ )
152
164
  }
153
- return(t);
165
+ return(t)
154
166
  }
155
167