miga-base 1.2.17.1 → 1.2.17.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. checksums.yaml +4 -4
  2. data/lib/miga/remote_dataset/download.rb +1 -1
  3. data/lib/miga/remote_dataset.rb +9 -4
  4. data/lib/miga/version.rb +2 -2
  5. data/utils/enveomics/Manifest/Tasks/mapping.json +39 -11
  6. data/utils/enveomics/Manifest/Tasks/remote.json +2 -1
  7. data/utils/enveomics/Scripts/BedGraph.tad.rb +98 -53
  8. data/utils/enveomics/Scripts/SRA.download.bash +14 -2
  9. data/utils/enveomics/Tests/low-cov.bg.gz +0 -0
  10. data/utils/enveomics/enveomics.R/DESCRIPTION +5 -5
  11. data/utils/enveomics/enveomics.R/R/autoprune.R +99 -87
  12. data/utils/enveomics/enveomics.R/R/barplot.R +116 -97
  13. data/utils/enveomics/enveomics.R/R/cliopts.R +65 -59
  14. data/utils/enveomics/enveomics.R/R/df2dist.R +96 -58
  15. data/utils/enveomics/enveomics.R/R/growthcurve.R +166 -148
  16. data/utils/enveomics/enveomics.R/R/recplot.R +201 -136
  17. data/utils/enveomics/enveomics.R/R/recplot2.R +371 -304
  18. data/utils/enveomics/enveomics.R/R/tribs.R +318 -263
  19. data/utils/enveomics/enveomics.R/R/utils.R +30 -20
  20. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +4 -3
  21. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +2 -2
  22. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +3 -3
  23. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +7 -4
  24. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +7 -4
  25. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +4 -0
  26. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +25 -17
  27. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +10 -0
  28. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +8 -2
  29. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +14 -0
  30. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +20 -1
  31. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +2 -3
  32. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +5 -2
  33. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +50 -42
  34. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +5 -2
  35. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +3 -0
  36. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +3 -0
  37. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +3 -0
  38. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +3 -0
  39. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +9 -4
  40. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +3 -0
  41. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +3 -3
  42. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +0 -2
  43. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +4 -0
  44. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +5 -0
  45. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +11 -7
  46. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +5 -1
  47. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +3 -0
  48. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +2 -2
  49. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +3 -3
  50. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +2 -2
  51. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +3 -0
  52. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +3 -0
  53. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +6 -3
  54. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +2 -2
  55. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +3 -0
  56. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +3 -0
  57. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +3 -0
  58. metadata +3 -37
  59. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +0 -69
  60. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
  61. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
  62. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
  63. data/utils/enveomics/Pipelines/assembly.pbs/README.md +0 -189
  64. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +0 -112
  65. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +0 -23
  66. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +0 -44
  67. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +0 -50
  68. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +0 -37
  69. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +0 -68
  70. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +0 -49
  71. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +0 -80
  72. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +0 -57
  73. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +0 -63
  74. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +0 -38
  75. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +0 -73
  76. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +0 -21
  77. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +0 -72
  78. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +0 -98
  79. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
  80. data/utils/enveomics/Pipelines/blast.pbs/README.md +0 -127
  81. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +0 -109
  82. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +0 -128
  83. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +0 -16
  84. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +0 -22
  85. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +0 -26
  86. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +0 -89
  87. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +0 -29
  88. data/utils/enveomics/Pipelines/idba.pbs/README.md +0 -49
  89. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +0 -95
  90. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +0 -56
  91. data/utils/enveomics/Pipelines/trim.pbs/README.md +0 -54
  92. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +0 -70
  93. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +0 -130
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 608607327562dd08edc9f866aeeb566407eb85f0adcf8538c37789962c387f72
4
- data.tar.gz: b21cb37dcae1eab3551d2058f21543221a9b6b9a5b6c834074a4d7c6c60a7102
3
+ metadata.gz: 57016515d9789927d6eda96e04be04dd9d47e6f330ea8f2f489971d7f0ad3845
4
+ data.tar.gz: 3e42b22e637fc5ad47405eaac88fcc7ba00d94abfe86035c8a8a613b54e06bbe
5
5
  SHA512:
6
- metadata.gz: 6708285348840ed44251d64d003c477a0887db497ccfaf97ce17f398b650156420897fc07036cbc395b4107c71bdb6638ad756442e364e530363a36aca8ec9a3
7
- data.tar.gz: fce107bebd89fd53f07d0ca5b814564b76c00b6faacbb0c89b91c0a114b2c5d5911b446fa27f1db5cecab3d74dee2ab3c2d6b10538a3e33056ab2b2ba766c2e9
6
+ metadata.gz: c2b8cf60bfc487960e66d4a35a61fcb691d166baf31656e115c28dd3854276e062f6c9325d2d4ff924a563327a01c05f4ecfd570004deb54db05b065b955281a
7
+ data.tar.gz: 6124879ca61700d6cf58aaa27c2db405874b1fdcaeed829b835662eeb92f6a59383f1fb2ed5bb073d5b97d1bef2ed15f0580821a267cd6f5f0c65ef2dbb2f1bc
@@ -61,7 +61,7 @@ class MiGA::RemoteDataset
61
61
  # Supported +opts+ (Hash) are the same as #download_rest and #ncbi_asm_rest.
62
62
  def ncbi_gb_rest(opts)
63
63
  # Simply use defaults, but ensure that the URL can be properly formed
64
- o = download_rest(opts.merge(universe: :ncbi, db: :nuccore, format: :gb))
64
+ o = download_rest(opts.merge(universe: :ncbi, db: :nuccore))
65
65
  return o unless o.strip.empty?
66
66
 
67
67
  MiGA::MiGA.DEBUG 'Empty sequence, attempting download from NCBI assembly'
@@ -134,7 +134,10 @@ class MiGA::RemoteDataset < MiGA::MiGA
134
134
  # Get taxonomy
135
135
  @metadata[:tax] = get_gtdb_taxonomy
136
136
  when :seqcode
137
- # Do nothing, taxonomy already defined
137
+ # Taxonomy already defined
138
+ # Copy IDs over to allow additional metadata linked
139
+ @metadata[:ncbi_asm] = @metadata[:seqcode_asm]
140
+ @metadata[:ncbi_nuccore] = @metadata[:seqcode_nuccore]
138
141
  end
139
142
 
140
143
  if metadata[:get_ncbi_taxonomy]
@@ -276,6 +279,10 @@ class MiGA::RemoteDataset < MiGA::MiGA
276
279
  def get_type_status_ncbi_asm(metadata)
277
280
  return metadata if ncbi_asm_json_doc.nil?
278
281
 
282
+ metadata[:suspect] = (ncbi_asm_json_doc['exclfromrefseq'] || [])
283
+ metadata[:suspect] = nil if metadata[:suspect].empty?
284
+ return metadata if metadata[:is_type] # If predefined, as in SeqCode
285
+
279
286
  from_type = ncbi_asm_json_doc['from_type']
280
287
  from_type = ncbi_asm_json_doc['fromtype'] if from_type.nil?
281
288
  case from_type
@@ -292,8 +299,6 @@ class MiGA::RemoteDataset < MiGA::MiGA
292
299
  metadata[:is_type] = true
293
300
  metadata[:type_rel] = from_type
294
301
  end
295
- metadata[:suspect] = (ncbi_asm_json_doc['exclfromrefseq'] || [])
296
- metadata[:suspect] = nil if metadata[:suspect].empty?
297
302
  MiGA.DEBUG "Got type: #{from_type}"
298
303
  metadata
299
304
  end
@@ -306,7 +311,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
306
311
  File.open("#{base}.start", 'w') { |ofh| ofh.puts Time.now.to_s }
307
312
  if udb[:format] == :fasta_gz
308
313
  download "#{l_ctg}.gz"
309
- system "gzip -d '#{l_ctg}.gz'"
314
+ system "gzip -fd '#{l_ctg}.gz'"
310
315
  else
311
316
  download l_ctg
312
317
  end
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.2, 17, 1].freeze
15
+ VERSION = [1.2, 17, 3].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2023, 2, 14)
23
+ VERSION_DATE = Date.new(2023, 2, 22)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -2,33 +2,61 @@
2
2
  "tasks": [
3
3
  {
4
4
  "task": "BedGraph.tad.rb",
5
- "description": ["Estimates the truncated average sequencing depth (TAD)",
6
- "from a BedGraph file."],
7
- "warn": ["This script doesn't consider zero-coverage positions if",
5
+ "description": [
6
+ "Estimates the truncated average sequencing depth (TAD)",
7
+ "from a BedGraph file."
8
+ ],
9
+ "warn": [
10
+ "This script doesn't consider zero-coverage positions if",
8
11
  "missing from the file. If you produce your BedGraph file with",
9
12
  "bedtools genomecov and want to consider zero-coverage position, be",
10
- "sure to use -bga (not -bg)."],
11
- "see_also": ["BedGraph.window.rb",
12
- "BlastTab.seqdepth.pl", "BlastTab.seqdepth_ZIP.pl"],
13
+ "sure to use -bga (not -bg)."
14
+ ],
15
+ "see_also": [
16
+ "BedGraph.window.rb", "BlastTab.seqdepth.pl", "BlastTab.seqdepth_ZIP.pl"
17
+ ],
13
18
  "help_arg": "--help",
14
19
  "options": [
15
20
  {
16
21
  "opt": "--input",
17
22
  "arg": "in_file",
18
23
  "mandatory": true,
19
- "description": "Input BedGraph file."
24
+ "description": [
25
+ "Input BedGraph file.",
26
+ "Supports compression with .gz extension, use - for STDIN."
27
+ ]
28
+ },
29
+ {
30
+ "opt": "--output",
31
+ "arg": "out_file",
32
+ "default": "-",
33
+ "description": [
34
+ "Output tab-delimited values (by default, STDOUT).",
35
+ "Supports compression with .gz extension, use - for STDOUT."
36
+ ]
20
37
  },
21
38
  {
22
39
  "opt": "--range",
23
40
  "arg": "float",
24
41
  "default": 0.5,
25
- "description": ["Central range to consider, between 0 and 1. By",
26
- "default: inter-quartile range (0.5)."]
42
+ "description": [
43
+ "Central range to consider, between 0 and 1.",
44
+ "By default: inter-quartile range (0.5)."
45
+ ]
46
+ },
47
+ {
48
+ "opt": "--name",
49
+ "arg": "string",
50
+ "description": [
51
+ "Name (ID) of the sequence, added as first column."
52
+ ]
27
53
  },
28
54
  {
29
55
  "opt": "--per-seq",
30
- "description": ["Calculate averages per reference sequence, not",
31
- "total. Assumes a sorted BedGraph file."]
56
+ "description": [
57
+ "Calculate averages per reference sequence, not total.",
58
+ "Assumes a sorted BedGraph file."
59
+ ]
32
60
  },
33
61
  {
34
62
  "opt": "--length",
@@ -184,7 +184,8 @@
184
184
  {
185
185
  "task": "SRA.download.bash",
186
186
  "description": ["Downloads the set of runs from a project, sample, or",
187
- "experiment in SRA."],
187
+ "experiment in SRA. If the expected file already exists, skips the",
188
+ "file if the MD5 hash matches."],
188
189
  "help_arg": "",
189
190
  "requires": [
190
191
  {
@@ -1,93 +1,138 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'optparse'
3
+ $VERSION = 1.01
4
+ $:.push File.expand_path('../lib', __FILE__)
5
+ require 'enveomics_rb/enveomics'
4
6
 
5
- o = {range: 0.5, perseq: false, length: false}
6
- ARGV << '-h' if ARGV.empty?
7
- OptionParser.new do |opt|
8
- opt.banner = "
9
- Estimates the truncated average sequencing depth (TAD) from a BedGraph file.
7
+ o = { range: 0.5, perseq: false, length: false, o: '-' }
8
+ OptionParser.new do |opts|
9
+ opts.version = $VERSION
10
+ banner = <<~BANNER
11
+ Estimates the truncated average sequencing depth (TAD) from a BedGraph file
10
12
 
11
- IMPORTANT: This script doesn't consider zero-coverage positions if missing
12
- from the file. If you produce your BedGraph file with bedtools genomecov and
13
- want to consider zero-coverage position, be sure to use -bga (not -bg).
13
+ IMPORTANT: This script doesn't consider zero-coverage positions if missing
14
+ from the file. If you produce your BedGraph file with bedtools genomecov and
15
+ want to consider zero-coverage position, be sure to use -bga (not -bg).
16
+ BANNER
17
+ Enveomics.opt_banner(opts, banner, "#{File.basename($0)} [options]")
14
18
 
15
- Usage: #{$0} [options]"
16
- opt.separator ''
17
- opt.on('-i', '--input PATH',
18
- 'Input BedGraph file (mandatory).'){ |v| o[:i]=v }
19
- opt.on('-r', '--range FLOAT',
20
- 'Central range to consider, between 0 and 1.',
21
- "By default: #{o[:range]} (inter-quartile range)."
22
- ){ |v| o[:range]=v.to_f }
23
- opt.on('-s', '--per-seq',
24
- 'Calculate averages per reference sequence, not total.',
25
- 'Assumes a sorted BedGraph file.'
26
- ){ |v| o[:perseq] = v }
27
- opt.on('-l', '--length',
28
- 'Add sequence length to the output.'){ |v| o[:length] = v }
29
- opt.on('-h', '--help', 'Display this screen.') do
30
- puts opt
19
+ opts.separator 'Mandatory'
20
+ opts.on(
21
+ '-i', '--input PATH',
22
+ 'Input BedGraph file',
23
+ 'Supports compression with .gz extension, use - for STDIN'
24
+ ) { |v| o[:i] = v }
25
+
26
+ opts.separator ''
27
+ opts.separator 'Other Options'
28
+ opts.on(
29
+ '-o', '--out PATH',
30
+ 'Output tab-delimited values (by default, STDOUT)',
31
+ 'Supports compression with .gz extension, use - for STDOUT'
32
+ ) { |v| o[:o] = v }
33
+ opts.on(
34
+ '-r', '--range FLOAT', Float,
35
+ 'Central range to consider, between 0 and 1',
36
+ "By default: #{o[:range]} (inter-quartile range)"
37
+ ) { |v| o[:range] = v }
38
+ opts.on(
39
+ '-n', '--name STRING',
40
+ 'Name (ID) of the sequence (added as first column)'
41
+ ) { |v| o[:name] = v }
42
+ opts.on(
43
+ '-s', '--per-seq',
44
+ 'Calculate averages per reference sequence, not total',
45
+ 'Assumes a sorted BedGraph file'
46
+ ) { |v| o[:perseq] = v }
47
+ opts.on(
48
+ '-l', '--length',
49
+ 'Add sequence length to the output'
50
+ ) { |v| o[:length] = v }
51
+ opts.on(
52
+ '-b', '--breadth',
53
+ 'Add sequencing breadth to the output'
54
+ ) { |v| o[:breadth] = v }
55
+ opts.on('-h', '--help', 'Display this screen') do
56
+ puts opts
31
57
  exit
32
58
  end
33
- opt.separator ''
59
+ opts.separator ''
34
60
  end.parse!
35
- abort '-i is mandatory.' if o[:i].nil?
61
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:i].nil?
36
62
 
63
+ ##
64
+ # Pad an array to include all index values up to +r+ entries:
65
+ # - d: Array of [ depth => counts ]
66
+ # - idx: Array of [ depth, depth, ... ]
67
+ # - r: Expected number of entries in the array
37
68
  def pad(d, idx, r)
38
69
  idx.each do |i|
39
70
  next if d[i].nil?
71
+
40
72
  d[i] -= r
41
73
  break unless d[i] < 0
74
+
42
75
  r = -d[i]
43
76
  d[i] = nil
44
77
  end
45
78
  d
46
79
  end
47
80
 
81
+ ##
82
+ # Report the results for:
83
+ # - sq: Contig ID
84
+ # - d: Array of [ depth => counts ]
85
+ # - ln: Length of the sequence
86
+ # - o: CLI Options
48
87
  def report(sq, d, ln, o)
49
88
  # Estimate padding ranges
50
- pad = (1.0-o[:range])/2.0
51
- r = (pad*ln).round
89
+ pad = (1.0 - o[:range]) / 2.0
90
+ r = (pad * ln).round
91
+ zeroes = d[0].to_i
52
92
 
53
- # Pad
54
- d = pad(d, d.each_index.to_a, r+0)
55
- d = pad(d, d.each_index.to_a.reverse, r+0)
93
+ # Pad (truncation)
94
+ d = pad(d, d.each_index.to_a, r + 0)
95
+ d = pad(d, d.each_index.to_a.reverse, r + 0)
56
96
 
57
97
  # Average
58
98
  y = [0.0]
59
99
  unless d.compact.empty?
60
- s = d.each_with_index.to_a.map{ |v,i| v.nil? ? 0 : i*v }.inject(0,:+)
61
- y[0] = s.to_f/d.compact.inject(:+)
100
+ s = d.each_with_index.to_a.map { |v, i| v.nil? ? 0 : i * v }.inject(0, :+)
101
+ y[0] = s.to_f / d.compact.inject(:+)
62
102
  end
63
103
 
64
104
  # Report
65
105
  y.unshift(sq) if o[:perseq]
106
+ y.unshift(o[:name]) if o[:name]
66
107
  y << ln if o[:length]
67
- puts y.join("\t")
108
+ y << (ln - zeroes).to_f / ln if o[:breadth]
109
+ y.join("\t")
68
110
  end
69
111
 
70
112
  # Read BedGraph
71
- d = []
113
+ d = [] # [ depth => count ]
72
114
  ln = 0
73
115
  pre_sq = nil
74
- File.open(o[:i], "r") do |ifh|
75
- ifh.each_line do |i|
76
- next if i =~ /^#/
77
- r = i.chomp.split("\t")
78
- sq = r.shift
79
- if o[:perseq] and !pre_sq.nil? and pre_sq!=sq
80
- report(pre_sq, d, ln, o)
81
- d = []
82
- ln = 0
83
- end
84
- r.map! { |j| j.to_i }
85
- l = r[1]-r[0]
86
- d[ r[2] ] ||= 0
87
- d[ r[2] ] += l
88
- ln += l
89
- pre_sq = sq
116
+ ifh = reader(o[:i])
117
+ ofh = writer(o[:o])
118
+ ifh.each_line do |i|
119
+ next if i =~ /^#/
120
+ r = i.chomp.split("\t")
121
+ sq = r.shift # Contig ID
122
+ if o[:perseq] && !pre_sq.nil? && pre_sq != sq
123
+ ofh.puts(report(pre_sq, d, ln, o))
124
+ d = []
125
+ ln = 0
90
126
  end
127
+ r.map!(&:to_i) # From, To, Depth
128
+ l = r[1] - r[0] # Window length: To - From
129
+ d[ r[2] ] ||= 0
130
+ d[ r[2] ] += l # Add these "l" positions with depth "Depth"
131
+ ln += l
132
+ pre_sq = sq
91
133
  end
92
- report(pre_sq, d, ln, o)
134
+ ofh.puts(report(pre_sq, d, ln, o))
135
+
136
+ ifh.close
137
+ ofh.close
93
138
 
@@ -4,13 +4,14 @@ DATA_LINK="https://www.ebi.ac.uk/ena/portal/api/filereport"
4
4
  DATA_OPS="result=read_run&fields=run_accession,fastq_ftp,fastq_md5"
5
5
  SRX=$1
6
6
  DIR=${2:-$SRX}
7
- VERSION=1.0
7
+ VERSION=2.0
8
8
 
9
9
  if [[ "$SRX" == "" ]] ; then
10
10
  echo "
11
11
  [Enveomics Collection: $(basename "$0" .bash) $VERSION]
12
12
 
13
13
  Downloads the set of runs from a project, sample, or experiment in SRA.
14
+ If the expected file already exists, skips the file if the MD5 hash matches.
14
15
 
15
16
  Usage:
16
17
  $(basename "$0") <SRA-ID>[ <dir>]
@@ -42,9 +43,20 @@ tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
42
43
  echo "o $srr" >&2
43
44
  for uri in $(echo "$ftp" | tr ";" " ") ; do
44
45
  file="$dir/$(basename $uri)"
46
+
47
+ # Check if it exists and it's complete
48
+ if [[ -s "$file" ]] ; then
49
+ md5obs=$(md5value "$file" 2> /dev/null)
50
+ if [[ "$md5;" == "$md5obs;"* ]] ; then
51
+ md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
52
+ continue
53
+ fi
54
+ fi
55
+
56
+ # Otherwise, download and check MD5
45
57
  curl "$uri" -o "$file"
46
58
  md5obs=$(md5value "$file" 2> /dev/null)
47
- if [[ "$md5" == "$md5obs"* ]] ; then
59
+ if [[ "$md5;" == "$md5obs;"* ]] ; then
48
60
  md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
49
61
  else
50
62
  echo "Corrupt file: $file" >&2
Binary file
@@ -1,7 +1,7 @@
1
1
  Package: enveomics.R
2
- Version: 1.8.0
3
- Authors@R: c(person("Luis M.","Rodriguez-R",role=c("aut","cre"),
4
- email="lmrodriguezr@gmail.com"))
2
+ Version: 1.9.0
3
+ Authors@R: c(person("Luis M.", "Rodriguez-R", role = c("aut", "cre"),
4
+ email = "lmrodriguezr@gmail.com"))
5
5
  Title: Various Utilities for Microbial Genomics and Metagenomics
6
6
  Description: A collection of functions for microbial ecology and other
7
7
  applications of genomics and metagenomics. Companion package for the
@@ -9,7 +9,7 @@ Description: A collection of functions for microbial ecology and other
9
9
  <DOI:10.7287/peerj.preprints.1900v1>).
10
10
  Author: Luis M. Rodriguez-R [aut, cre]
11
11
  Maintainer: Luis M. Rodriguez-R <lmrodriguezr@gmail.com>
12
- URL: http://enve-omics.ce.gatech.edu/enveomics
12
+ URL: http://enve-omics.ce.gatech.edu/enveomics/
13
13
  Depends:
14
14
  R (>= 2.9),
15
15
  stats,
@@ -28,4 +28,4 @@ Suggests:
28
28
  License: Artistic-2.0
29
29
  LazyData: yes
30
30
  Encoding: UTF-8
31
- RoxygenNote: 7.0.2
31
+ RoxygenNote: 7.1.2
@@ -22,134 +22,146 @@
22
22
  #'
23
23
  #' @export
24
24
 
25
- enve.prune.dist <- function
26
- (t,
27
- dist.quantile=0.25,
28
- min_dist,
29
- quiet=FALSE,
30
- max_iters=100,
31
- min_nodes_random=4e4,
32
- random_nodes_frx=1
33
- ){
34
- if(!requireNamespace("ape", quietly=TRUE))
25
+ enve.prune.dist <- function(
26
+ t,
27
+ dist.quantile = 0.25,
28
+ min_dist,
29
+ quiet = FALSE,
30
+ max_iters = 100,
31
+ min_nodes_random = 4e4,
32
+ random_nodes_frx = 1
33
+ ) {
34
+ if (!requireNamespace("ape", quietly = TRUE))
35
35
  stop('Unavailable ape library.');
36
- if(is.character(t)) t <- ape::read.tree(t)
37
- if(missing(min_dist)){
38
- if(dist.quantile>0){
39
- min_dist <- as.numeric(quantile(t$edge.length, dist.quantile));
40
- }else{
41
- min_dist <- as.numeric(min(t$edge.length[t$edge.length>0]));
36
+ if (is.character(t)) t <- ape::read.tree(t)
37
+ if (missing(min_dist)) {
38
+ if (dist.quantile > 0) {
39
+ min_dist <- as.numeric(quantile(t$edge.length, dist.quantile))
40
+ } else {
41
+ min_dist <- as.numeric(min(t$edge.length[t$edge.length>0]))
42
42
  }
43
43
  }
44
- if(!quiet) cat('\nObjective minimum distance: ',min_dist,', initial tips: ',length(t$tip.label),'\n', sep='');
45
- round=1;
46
- while(round <= max_iters){
47
- if(length(t$tip.label) > min_nodes_random){
48
- if(!quiet) cat(' | Iter: ',round-1,', Tips: ', length(t$tip.label),
49
- ', reducing tip-pairs.\n', sep='');
50
- rnd.nodes <- sample(t$tip.label, length(t$tip.label)*random_nodes_frx);
51
- t <- enve.__prune.reduce(t, rnd.nodes, min_dist, quiet);
52
- }else{
53
- if(!quiet) cat(' Gathering distances...\r');
54
- d <- cophenetic(t);
55
- diag(d) <- NA;
56
- if(!quiet) cat(' | Iter: ',round-1,', Tips: ', length(t$tip.label),
57
- ', Median distance: ', median(d, na.rm=TRUE),
58
- ', Minimum distance: ', min(d, na.rm=TRUE),
59
- '\n', sep='');
44
+ if (!quiet)
45
+ cat("\nObjective minimum distance: ", min_dist, ", initial tips: ",
46
+ length(t$tip.label), "\n", sep = "")
47
+
48
+ round <- 1
49
+ while (round <= max_iters) {
50
+ if (length(t$tip.label) > min_nodes_random) {
51
+ if (!quiet)
52
+ cat(" | Iter: ", round - 1, ", Tips: ", length(t$tip.label),
53
+ ", reducing tip-pairs.\n", sep = "")
54
+ rnd.nodes <- sample(t$tip.label, length(t$tip.label) * random_nodes_frx)
55
+ t <- enve.__prune.reduce(t, rnd.nodes, min_dist, quiet)
56
+ } else {
57
+ if (!quiet) cat(" Gathering distances...\r")
58
+ d <- cophenetic(t)
59
+ diag(d) <- NA
60
+ if(!quiet)
61
+ cat(" | Iter: ", round - 1, ", Tips: ", length(t$tip.label),
62
+ ", Median distance: ", median(d, na.rm = TRUE),
63
+ ", Minimum distance: ", min(d, na.rm = TRUE), "\n", sep = "")
60
64
  # Run iteration
61
- if(min(d, na.rm=TRUE) < min_dist){
62
- t <- enve.__prune.iter(t, d, min_dist, quiet);
63
- }else{
64
- break;
65
+ if (min(d, na.rm = TRUE) < min_dist) {
66
+ t <- enve.__prune.iter(t, d, min_dist, quiet)
67
+ } else {
68
+ break
65
69
  }
66
70
  }
67
- round <- round + 1;
71
+ round <- round + 1
68
72
  }
69
- return(t);
73
+ return(t)
70
74
  }
71
75
 
72
76
  #' Enveomics: Prune Reduce (Internal Function)
73
77
  #'
74
78
  #' Internal function for \code{\link{enve.prune.dist}}.
75
79
  #'
76
- #' @param t A \strong{phylo} object
77
- #' @param nodes Vector of nodes
78
- #' @param min_dist Minimum distance
79
- #' @param quiet If running quietly
80
+ #' @param t A \strong{phylo} object.
81
+ #' @param nodes Vector of nodes.
82
+ #' @param min_dist Minimum distance.
83
+ #' @param quiet If running quietly.
84
+ #'
85
+ #' @return A \strong{phylo} object.
80
86
  #'
81
87
  #' @author Luis M. Rodriguez-R [aut, cre]
82
88
  #'
83
89
  #' @export
84
90
 
85
- enve.__prune.reduce <- function
86
- (t, nodes, min_dist, quiet){
87
- if(!quiet) pb <- txtProgressBar(1, length(nodes), style=3);
88
- for(i in 1:length(nodes)){
89
- node.name <- nodes[i];
90
- if(!quiet) setTxtProgressBar(pb, i);
91
+ enve.__prune.reduce <- function(t, nodes, min_dist, quiet) {
92
+ if (!quiet) pb <- txtProgressBar(1, length(nodes), style = 3)
93
+ for (i in 1:length(nodes)) {
94
+ node.name <- nodes[i]
95
+ if (!quiet) setTxtProgressBar(pb, i)
96
+
91
97
  # Get node ID
92
- node <- which(t$tip.label==node.name);
93
- if(length(node)==0) next;
98
+ node <- which(t$tip.label == node.name)
99
+ if (length(node) == 0) next
100
+
94
101
  # Get parent and distance to parent
95
- parent.node <- t$edge[ t$edge[,2]==node, 1];
102
+ parent.node <- t$edge[t$edge[, 2] == node, 1]
103
+
96
104
  # Get edges to parent
97
- parent.edges <- which(t$edge[,1]==parent.node);
98
- stopit <- FALSE;
99
- for(j in parent.edges){
100
- for(k in parent.edges){
101
- if(j != k & t$edge[j,2]<length(t$tip.label) & t$edge[k,2]<length(t$tip.label) & sum(t$edge.length[c(j,k)]) < min_dist){
102
- t <- ape::drop.tip(t, t$edge[k,2]);
103
- stopit <- TRUE;
104
- break;
105
+ parent.edges <- which(t$edge[, 1] == parent.node)
106
+ stopit <- FALSE
107
+ for (j in parent.edges) {
108
+ for (k in parent.edges) {
109
+ if (j != k & t$edge[j,2]<length(t$tip.label) &
110
+ t$edge[k,2]<length(t$tip.label) &
111
+ sum(t$edge.length[c(j,k)]) < min_dist) {
112
+ t <- ape::drop.tip(t, t$edge[k,2])
113
+ stopit <- TRUE
114
+ break
105
115
  }
106
116
  }
107
- if(stopit) break;
117
+ if (stopit) break
108
118
  }
109
119
  }
110
- if(!quiet) cat('\n');
111
- return(t);
120
+ if (!quiet) cat("\n")
121
+ return(t)
112
122
  }
113
123
 
114
124
  #' Enveomics: Prune Iter (Internal Function)
115
125
  #'
116
126
  #' Internal function for \code{\link{enve.prune.dist}}.
117
127
  #'
118
- #' @param t A \strong{phylo} object
119
- #' @param dist Cophenetic distance matrix
120
- #' @param min_dist Minimum distance
121
- #' @param quiet If running quietly
128
+ #' @param t A \strong{phylo} object.
129
+ #' @param dist Cophenetic distance matrix.
130
+ #' @param min_dist Minimum distance.
131
+ #' @param quiet If running quietly.
132
+ #'
133
+ #' @return Returns a \strong{phylo} object.
122
134
  #'
123
135
  #' @author Luis M. Rodriguez-R [aut, cre]
124
136
  #'
125
137
  #' @export
126
138
 
127
- enve.__prune.iter <- function
128
- (t,
129
- dist,
130
- min_dist,
131
- quiet){
132
- ori_len <- length(t$tip.label);
139
+ enve.__prune.iter <- function(t, dist, min_dist, quiet) {
140
+ ori_len <- length(t$tip.label)
141
+
133
142
  # Prune
134
- if(!quiet) pb <- txtProgressBar(1, ncol(dist)-1, style=3);
135
- ignore <- c();
136
- for(i in 1:(ncol(dist)-1)){
137
- if(i %in% ignore) next;
138
- for(j in (i+1):nrow(dist)){
139
- if(dist[j, i]<min_dist){
140
- t <- ape::drop.tip(t, rownames(dist)[j]);
141
- ignore <- c(ignore, j);
142
- break;
143
+ if (!quiet) pb <- txtProgressBar(1, ncol(dist) - 1, style = 3)
144
+ ignore <- c()
145
+ for (i in 1:(ncol(dist) - 1)) {
146
+ if (i %in% ignore) next
147
+ for (j in (i + 1):nrow(dist)) {
148
+ if (dist[j, i] < min_dist) {
149
+ t <- ape::drop.tip(t, rownames(dist)[j])
150
+ ignore <- c(ignore, j)
151
+ break
143
152
  }
144
153
  }
145
- if(!quiet) setTxtProgressBar(pb, i);
154
+ if (!quiet) setTxtProgressBar(pb, i)
146
155
  }
147
- if(!quiet) cat('\n');
156
+ if(!quiet) cat("\n")
157
+
148
158
  # Check if it dropped tips
149
- cur_len <- length(t$tip.label);
150
- if(cur_len == ori_len){
151
- stop("Internal error: small edge found in tree, with no equivalent in distance matrix.\n");
159
+ cur_len <- length(t$tip.label)
160
+ if (cur_len == ori_len){
161
+ stop(
162
+ "Internal error: small edge found in tree, with no equivalent in distance matrix.\n"
163
+ )
152
164
  }
153
- return(t);
165
+ return(t)
154
166
  }
155
167