miga-base 0.3.1.6 → 0.3.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/actions/ncbi_get.rb +57 -42
  3. data/lib/miga/result/base.rb +7 -0
  4. data/lib/miga/result/dates.rb +42 -0
  5. data/lib/miga/result.rb +4 -0
  6. data/lib/miga/version.rb +1 -1
  7. data/scripts/essential_genes.bash +5 -4
  8. data/utils/enveomics/Makefile +1 -1
  9. data/utils/enveomics/Manifest/Tasks/aasubs.json +75 -75
  10. data/utils/enveomics/Manifest/Tasks/blasttab.json +194 -185
  11. data/utils/enveomics/Manifest/Tasks/distances.json +130 -130
  12. data/utils/enveomics/Manifest/Tasks/fasta.json +51 -3
  13. data/utils/enveomics/Manifest/Tasks/fastq.json +161 -126
  14. data/utils/enveomics/Manifest/Tasks/graphics.json +111 -111
  15. data/utils/enveomics/Manifest/Tasks/mapping.json +30 -0
  16. data/utils/enveomics/Manifest/Tasks/ogs.json +308 -265
  17. data/utils/enveomics/Manifest/Tasks/other.json +451 -449
  18. data/utils/enveomics/Manifest/Tasks/remote.json +1 -1
  19. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +18 -10
  20. data/utils/enveomics/Manifest/Tasks/tables.json +250 -250
  21. data/utils/enveomics/Manifest/Tasks/trees.json +52 -52
  22. data/utils/enveomics/Manifest/Tasks/variants.json +4 -4
  23. data/utils/enveomics/Manifest/categories.json +12 -4
  24. data/utils/enveomics/Manifest/examples.json +1 -1
  25. data/utils/enveomics/Scripts/BedGraph.tad.rb +71 -0
  26. data/utils/enveomics/Scripts/BlastTab.recplot2.R +23 -22
  27. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  28. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  29. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +272 -258
  30. data/utils/enveomics/Scripts/aai.rb +13 -6
  31. data/utils/enveomics/Scripts/ani.rb +2 -2
  32. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  33. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +12 -14
  34. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +2 -2
  35. data/utils/enveomics/Scripts/rbm.rb +23 -14
  36. data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
  37. data/utils/enveomics/enveomics.R/R/barplot.R +2 -2
  38. metadata +9 -2
@@ -10,58 +10,58 @@
10
10
  "help_arg": "--help",
11
11
  "options": [
12
12
  {
13
- "name": "Input tree",
14
- "opt": "--t",
15
- "arg": "in_file",
16
- "mandatory": true,
17
- "description": "A tree to prune in Newick format."
18
- },
19
- {
20
- "opt": "--dist-quantile",
21
- "arg": "float",
22
- "default": 0.25,
23
- "description": "The quantile of edge lengths."
24
- },
25
- {
26
- "opt": "--min_dist",
27
- "arg": "float",
28
- "description": ["The minimum distance to allow between two tips. If",
29
- "not set, dist.quantile is used instead to calculate it."]
30
- },
31
- {
32
- "opt": "--quiet",
33
- "description": ["Boolean indicating if the function must run without",
34
- "output."]
35
- },
36
- {
37
- "opt": "--max_iters",
38
- "arg": "integer",
39
- "default": 1000,
40
- "description": "Maximum number of iterations."
41
- },
42
- {
43
- "opt": "--min_nodes_random",
44
- "arg": "integer",
45
- "default": 40000,
46
- "description": ["Minimum number of nodes to trigger 'tip-pairs'",
47
- "nodes sampling. This sampling is less reproducible and more",
48
- "computationally expensive, but it's the only solution if the",
49
- "cophenetic matrix exceeds 2^31-1 entries; above that, it cannot",
50
- "be represented in R."]
51
- },
52
- {
53
- "opt": "--random_nodes_frx",
54
- "arg": "float",
55
- "default": 1.0,
56
- "description": ["Fraction of the nodes to be sampled if more than",
57
- "'Min nodes random'."]
58
- },
59
- {
60
- "arg": "out_file",
61
- "mandatory": true,
62
- "description": ["Output file in Newick format containing the pruned",
63
- "tree."]
64
- }
13
+ "name": "Input tree",
14
+ "opt": "--t",
15
+ "arg": "in_file",
16
+ "mandatory": true,
17
+ "description": "A tree to prune in Newick format."
18
+ },
19
+ {
20
+ "opt": "--dist-quantile",
21
+ "arg": "float",
22
+ "default": 0.25,
23
+ "description": "The quantile of edge lengths."
24
+ },
25
+ {
26
+ "opt": "--min_dist",
27
+ "arg": "float",
28
+ "description": ["The minimum distance to allow between two tips. If",
29
+ "not set, dist.quantile is used instead to calculate it."]
30
+ },
31
+ {
32
+ "opt": "--quiet",
33
+ "description": ["Boolean indicating if the function must run without",
34
+ "output."]
35
+ },
36
+ {
37
+ "opt": "--max_iters",
38
+ "arg": "integer",
39
+ "default": 1000,
40
+ "description": "Maximum number of iterations."
41
+ },
42
+ {
43
+ "opt": "--min_nodes_random",
44
+ "arg": "integer",
45
+ "default": 40000,
46
+ "description": ["Minimum number of nodes to trigger 'tip-pairs'",
47
+ "nodes sampling. This sampling is less reproducible and more",
48
+ "computationally expensive, but it's the only solution if the",
49
+ "cophenetic matrix exceeds 2^31-1 entries; above that, it cannot",
50
+ "be represented in R."]
51
+ },
52
+ {
53
+ "opt": "--random_nodes_frx",
54
+ "arg": "float",
55
+ "default": 1.0,
56
+ "description": ["Fraction of the nodes to be sampled if more than",
57
+ "'Min nodes random'."]
58
+ },
59
+ {
60
+ "arg": "out_file",
61
+ "mandatory": true,
62
+ "description": ["Output file in Newick format containing the pruned",
63
+ "tree."]
64
+ }
65
65
  ]
66
66
  }
67
67
  ]
@@ -58,10 +58,10 @@
58
58
  "description": "Minimum information content (in bits, from 0 to 1).",
59
59
  "default": 0.0
60
60
  },
61
- {
62
- "opt": "--indels",
63
- "description": "Process indels."
64
- }
61
+ {
62
+ "opt": "--indels",
63
+ "description": "Process indels."
64
+ }
65
65
  ]
66
66
  },
67
67
  {
@@ -2,13 +2,15 @@
2
2
  "categories": {
3
3
  "Sequence similarity search": {
4
4
  "Statistics": [
5
+ "BedGraph.tad.rb",
5
6
  "BlastPairwise.AAsubs.pl",
6
7
  "BlastTab.advance.bash",
7
8
  "BlastTab.recplot2.R",
8
9
  "BlastTab.seqdepth.pl",
9
10
  "BlastTab.seqdepth_nomedian.pl",
10
11
  "BlastTab.seqdepth_ZIP.pl",
11
- "BlastTab.sumPerHit.pl"
12
+ "BlastTab.sumPerHit.pl",
13
+ "FastQ.test-error.rb"
12
14
  ],
13
15
  "Manipulation": [
14
16
  "BlastTab.addlen.rb",
@@ -33,7 +35,8 @@
33
35
  "FastA.gc.pl",
34
36
  "FastA.length.pl",
35
37
  "FastA.N50.pl",
36
- "FastA.qlen.pl"
38
+ "FastA.qlen.pl",
39
+ "FastQ.test-error.rb"
37
40
  ],
38
41
  "Manipulation": [
39
42
  "FastA.filter.pl",
@@ -41,11 +44,12 @@
41
44
  "FastA.filterN.pl",
42
45
  "FastA.fragment.rb",
43
46
  "FastA.interpose.pl",
44
- "FastA.per_file.pl",
47
+ "FastA.per_file.pl",
45
48
  "FastA.rename.pl",
46
49
  "FastA.revcom.pl",
47
50
  "FastA.slider.pl",
48
51
  "FastA.split.pl",
52
+ "FastA.split.rb",
49
53
  "FastA.subsample.pl",
50
54
  "FastA.tag.rb",
51
55
  "FastA.wrap.rb",
@@ -86,7 +90,7 @@
86
90
  "Table.df2dist.R",
87
91
  "Table.filter.pl",
88
92
  "Table.merge.pl",
89
- "Table.replace.rb",
93
+ "Table.replace.rb",
90
94
  "Table.round.rb",
91
95
  "Table.split.pl"
92
96
  ],
@@ -126,6 +130,10 @@
126
130
  "Aln.cat.rb",
127
131
  "Aln.convert.pl",
128
132
  "BlastPairwise.AAsubs.pl"
133
+ ],
134
+ "Clustering": [
135
+ "ogs.mcl.rb",
136
+ "clust.rand.rb"
129
137
  ]
130
138
  }
131
139
  }
@@ -56,7 +56,7 @@
56
56
  "task": "BlastTab.recplot2.R",
57
57
  "description": ["Generates recruitment plots for a comparison",
58
58
  "between a virome containing HIV and the HIV-1 genome."],
59
- "values": ["hiv_mix-hiv1.blast.tsv",50,100,null,null,null,null,null,
59
+ "values": ["hiv_mix-hiv1.blast.tsv",50,100,null,null,null,null,null,"NA",
60
60
  "hiv_mix-hiv1.Rdata","hiv_mix-hiv1.pdf",null,null]
61
61
  },
62
62
  {
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "optparse"
4
+
5
# Command-line options. :range is the central fraction of positions used to
# estimate the truncated average depth; :i is the path to the input file.
o = {range:0.5}
# Show the help screen when the script is called without arguments.
ARGV << "-h" if ARGV.empty?
OptionParser.new do |opt|
  opt.banner = "
Estimates the truncated average sequencing depth (TAD) from a BedGraph file.

IMPORTANT: This script doesn't consider zero-coverage positions if missing
from the file. If you produce your BedGraph file with bedtools genomecov and
want to consider zero-coverage position, be sure to use -bga (not -bg).

Usage: #{$0} [options]"
  opt.separator ""
  opt.on("-i", "--input PATH",
    "Input BedGraph file (mandatory)."){ |v| o[:i]=v }
  opt.on("-r", "--range FLOAT",
    "Central range to consider, between 0 and 1.",
    "By default: #{o[:range]} (inter-quartile range)."
    ){ |v| o[:range]=v.to_f }
  opt.on("-h", "--help", "Display this screen.") do
    puts opt
    exit
  end
  opt.separator ""
end.parse!
abort "-i is mandatory." if o[:i].nil?
# The trimming size is derived from (1 - range)/2, so a range outside [0, 1]
# would produce a negative or oversized trim and a meaningless estimate.
abort "-r must be between 0 and 1." unless o[:range].between?(0.0, 1.0)
30
+
31
# Removes +r+ positions from the histogram +d+, visiting its bins in the order
# given by +idx+.
#
# +d+ is an Array where each non-nil entry is a count; nil entries are skipped.
# Bins are consumed one at a time: a bin holding fewer positions than still
# need to be removed is set to nil and the shortfall carries over to the next
# bin; the first bin able to absorb the remainder keeps what is left (possibly
# zero) and the walk stops.
#
# Modifies +d+ in place and returns it.
def pad(d, idx, r)
  carry = r
  idx.each do |pos|
    count = d[pos]
    next if count.nil?
    rest = count - carry
    if rest < 0
      # Bin exhausted: drop it and keep trimming with the shortfall
      carry = -rest
      d[pos] = nil
    else
      d[pos] = rest
      break
    end
  end
  d
end
41
+
42
# Read BedGraph: d[depth] accumulates the number of positions reported at that
# depth, and ln is the total number of positions in the file.
d = []
ln = 0
File.open(o[:i], "r") do |ifh|
  ifh.each_line do |i|
    # Skip comments, blank lines, and "track"/"browser" header lines, none of
    # which carry the tab-separated start/end/depth columns parsed below.
    next if i =~ /^#/ || i =~ /^\s*$/ || i =~ /^(track|browser)\b/
    # Columns after the sequence name: start, end, depth
    r = i.chomp.split("\t")[1 .. -1].map{ |j| j.to_i }
    l = r[1]-r[0]
    d[ r[2] ] ||= 0
    d[ r[2] ] += l
    ln += l
  end
end

# Estimate padding ranges: number of positions to trim from each end.
# (Named trim_frx so it does not shadow the pad method called below.)
trim_frx = (1.0-o[:range])/2.0
r = (trim_frx*ln).round

# Pad: trim the lowest depths first, then the highest
d = pad(d, d.each_index.to_a, r+0)
d = pad(d, d.each_index.to_a.reverse, r+0)

# Average
kept = d.compact.inject(0, :+)
if kept.zero?
  # Nothing left after trimming; avoid dividing by zero (NaN)
  p 0.0
else
  s = d.each_with_index.to_a.map{ |v,i| v.nil? ? 0 : i*v }.inject(0,:+)
  p s.to_f/kept
end
71
+
@@ -1,40 +1,41 @@
1
1
  #!/usr/bin/env Rscript
2
- #
2
+
3
3
  # @author Luis M. Rodriguez-R
4
- # @update Jan-05-2016
5
4
  # @license artistic license 2.0
6
- #
7
5
 
8
6
  #= Load stuff
9
7
  suppressPackageStartupMessages(library(enveomics.R))
10
- args <- commandArgs(trailingOnly = F)
8
+ args <- commandArgs(trailingOnly = FALSE)
11
9
  enveomics_R <- file.path(dirname(
12
- sub("^--file=", "", args[grep("^--file=", args)])),
13
- "lib", "enveomics.R")
10
+ sub("^--file=", "", args[grep("^--file=", args)])),
11
+ "lib", "enveomics.R")
14
12
 
15
13
  #= Generate interface
16
14
  opt <- enve.cliopts(enve.recplot2,
17
- file.path(enveomics_R, "man", "enve.recplot2.Rd"),
18
- positional_arguments=c(1,4),
19
- usage="usage: %prog [options] output.Rdata [output.pdf [width height]]",
20
- mandatory=c("prefix"),
21
- o_desc=list(pos.breaks="Breaks in the positions histogram.",
22
- id.breaks="Breaks in the identity histogram.",
23
- id.summary="Function summarizing the identity bins. By default: sum."),
24
- p_desc=paste("","Produce recruitment plot objects provided that",
25
- "BlastTab.catsbj.pl has been previously executed.", sep="\n\t"),
26
- ignore=c("plot"),
27
-
28
- defaults=c(id.metric="identity"))
15
+ file.path(enveomics_R, "man", "enve.recplot2.Rd"),
16
+ positional_arguments=c(1,4),
17
+ usage="usage: %prog [options] output.Rdata [output.pdf [width height]]",
18
+ mandatory=c("prefix"),
19
+ o_desc=list(pos.breaks="Breaks in the positions histogram.",
20
+ id.breaks="Breaks in the identity histogram.",
21
+ id.summary="Function summarizing the identity bins. By default: sum.",
22
+ peaks.col="Color of peaks, mandatory for peak-finding (e.g., darkred)."),
23
+ p_desc=paste("","Produce recruitment plot objects provided that",
24
+ "BlastTab.catsbj.pl has been previously executed.", sep="\n\t"),
25
+ ignore=c("plot"),
26
+ defaults=c(id.metric="identity", peaks.col=NA))
29
27
 
30
28
  #= Run it!
31
29
  if(length(opt$args)>1){
32
- args = as.list(opt$args[-1])
33
- for(i in 2:3) if(length(args)>=i) args[[i]] <- as.numeric(args[[i]])
34
- do.call("pdf", args)
30
+ args = as.list(opt$args[-1])
31
+ for(i in 2:3) if(length(args)>=i) args[[i]] <- as.numeric(args[[i]])
32
+ do.call("pdf", args)
35
33
  }else{
36
- opt$options[["plot"]] <- FALSE
34
+ opt$options[["plot"]] <- FALSE
37
35
  }
36
+ pc <- opt$options[["peaks.col"]]
37
+ if(!is.na(pc) && pc=="NA") opt$options[["peaks.col"]] <- NA
38
38
  rp <- do.call("enve.recplot2", opt$options)
39
39
  save(rp, file=opt$args[1])
40
40
  if(length(opt$args)>1) dev.off()
41
+
@@ -0,0 +1,79 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @license Artistic-2.0
5
+ #
6
+
7
+ require "optparse"
8
+
9
# Command-line options and their defaults:
#   :n    number of output files
#   :lett use letters (instead of numbers) as output index
#   :dc   use lowercase (instead of uppercase) letters
#   :z    use zero-padded numbers
#   :out  format of the output filenames (receives prefix and index)
#   :q    run quietly
o = {q:false, n:12, lett:false, dc:false, z:false, out:"%s.%s.fa"}
ARGV << "-h" if ARGV.size==0

OptionParser.new do |opt|
  opt.banner = "
Evenly splits a multi-FastA file into multiple multi-FastA files.

Usage: #{$0} [options]"
  opt.separator ""
  opt.separator "Mandatory"
  opt.on("-i", "--input PATH", "Input FastA file."){ |v| o[:i] = v}
  opt.on("-p", "--prefix PATH", "Prefix of output FastA files."){ |v| o[:p] = v}
  opt.separator ""
  opt.separator "Options"
  opt.on("-n", "--number INT",
    "Number of output files to produce. By default: #{o[:n]}."
    ){ |v| o[:n] = v.to_i }
  opt.on("-z", "--zero-padded",
    "Use zero-padded numbers as output index."){ o[:lett]=false; o[:z]=true }
  opt.on("-l", "--lowercase-letters",
    "Use lowercase letters as output index."){ o[:lett]=true ; o[:dc]=true }
  # Reset :dc so that -u wins when it follows an earlier -l
  opt.on("-u", "--uppercase-letters",
    "Use uppercase letters as output index."){ o[:lett]=true ; o[:dc]=false }
  opt.on("-o", "--out STR",
    "Format of output filenames, where %s are replaced by prefix and index.",
    "By default: #{o[:out]}."){ |v| o[:out] = v }
  # Use the `true` keyword: the TRUE constant is no longer defined in
  # current Ruby releases.
  opt.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = true }
  opt.on("-h", "--help", "Display this screen.") do
    puts opt
    exit
  end
  opt.separator ""
end.parse!
abort "-i is mandatory." if o[:i].nil?
abort "-p is mandatory." if o[:p].nil?
# The number of files is used as a divisor when distributing sequences
abort "-n must be a positive integer." unless o[:n] > 0
44
+
45
# Build one index label per output file
ofh = []
idx = if o[:lett]
  # Smallest number of letters (at least one) able to label o[:n] files.
  # Integer arithmetic avoids the rounding of Math::log and the zero-width
  # label it yields for a single file.
  k = 1
  k += 1 while 26**k < o[:n]
  r = o[:dc] ? ["a","z"] : ["A","Z"]
  ((r[0]*k) .. (r[1]*k)).first(o[:n])
elsif o[:z]
  # Pad to the number of digits of the largest index, so that all labels
  # have the same width even when o[:n] is a power of ten.
  k = o[:n].to_s.size
  (1 .. o[:n]).map{ |i| "%0#{k}d" % i }
else
  (1 .. o[:n]).map{ |i| i.to_s }
end

begin
  # Open all output files up front
  idx.each do |i|
    fn = o[:out] % [o[:p], i]
    ofh << File.open(fn, "w")
  end

  # Distribute sequences in round-robin order. i is the zero-based index of
  # the sequence being read (-1 before the first defline) and seq buffers its
  # lines until the next defline or the end of the file.
  i = -1
  seq = ""
  File.open(o[:i], "r") do |ifh|
    ifh.each_line do |ln|
      next if ln =~ /^;/
      if ln =~ /^>/
        ofh[i % o[:n]].print seq
        i += 1
        seq = ""
      end
      seq << ln
    end
    ofh[i % o[:n]].print seq
  end
ensure
  # Close whatever was opened, even if reading or writing failed
  ofh.each{ |fh| fh.close }
end

$stderr.puts "Sequences: #{i+1}.", "Files: #{o[:n]}." unless o[:q]
79
+
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+
5
# Command-line options: paths to the FastQ (:fastq), tabular BLAST (:blast),
# and output (:out) files, plus the quiet flag (:q).
o = {q:false, key:2}
ARGV << '-h' if ARGV.empty?
OptionParser.new do |opts|
  opts.banner = "
Compares the estimated error of sequencing reads (Q-score) with
observed mismatches (identity against a know reference sequence).

Usage: #{$0} [options]"
  opts.separator ""
  opts.separator "Mandatory"
  opts.on("-f", "--fastq FILE",
    "Path to the FastQ file containing the sequences."){ |v| o[:fastq] = v }
  opts.on("-b", "--blast FILE",
    "Path to the tabular BLAST file mapping reads to reference sequences."
    ){ |v| o[:blast] = v }
  opts.on("-o", "--out FILE",
    "Path to the output tab-delimited file to create."){ |v| o[:out] = v }
  opts.separator ""
  opts.separator "Other Options"
  # Use the `true` keyword: the TRUE constant is no longer defined in
  # current Ruby releases.
  opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = true }
  opts.on("-h", "--help", "Display this screen") do
    puts opts
    exit
  end
  opts.separator ""
end.parse!
abort "-f is mandatory" if o[:fastq].nil?
abort "-b is mandatory" if o[:blast].nil?
abort "-o is mandatory" if o[:out].nil?
34
+
35
# Read the Q scores and estimate expected mismatches
mm = {} # <- Hash with read IDs as key, and arrays as values:
        # [ expected mismatches, variance of mismatches, length ]
$stderr.puts "Reading FastQ file" unless o[:q]
File.open(o[:fastq], "r") do |fh|
  id = nil
  fh.each_line do |ln|
    # Records are read as groups of four lines: line 1 of each group holds the
    # read ID and line 4 the quality string. The handle's own line counter is
    # used instead of the global $.
    case fh.lineno % 4
    when 1
      ln =~ /^@(\S+)/ or raise "Unexpected defline format: #{ln}"
      id = $1
      $stderr.print " #{mm.size} reads...\r" unless o[:q]
    when 0
      ln.chomp!
      # I'm assuming ALWAYS Phred+33!!!
      # Per-base error probability: 10^(-Q/10)
      err = ln.each_char.map do |c|
        q = (c.ord - 33).to_f
        10.0**(-q/10.0)
      end
      # Sum of the per-base probabilities and of p*(1-p); the 0.0 seed keeps
      # both numeric for an empty quality line.
      mu = err.inject(0.0, :+)
      var = err.map{ |e| e*(1.0-e) }.inject(0.0, :+)
      mm[id] = [mu, var, err.size]
    end
  end
  $stderr.puts " Found: #{mm.size} reads." unless o[:q]
end
58
+
59
# Write one output row per alignment whose query is a read found in the FastQ
# file. The block form closes the output file even if parsing fails.
File.open(o[:out], "w") do |ofh|
  ofh.puts %w[id obs_subs obs_id aln_len obs_ins obs_del obs_gap mu var len].join("\t")

  # Read Identities and compare against expectation
  $stderr.puts "Reading Tabular BLAST file" unless o[:q]
  File.open(o[:blast], "r") do |fh|
    k = 0
    fh.each_line do |ln|
      # Assumes the standard 12-column tabular BLAST layout -- TODO confirm:
      # 0:query 1:subject 2:identity 3:alignment length 4:mismatches
      # 5:gap openings 6:q.start 7:q.end 8:s.start 9:s.end
      r = ln.chomp.split("\t")
      id = r[0]
      next if mm[id].nil?
      k += 1
      $stderr.print " #{k} alignments...\r" unless o[:q]
      aln_len = r[3].to_i
      # Coordinates are inclusive, so a span covers |end - start| + 1 positions
      q_span = (r[7].to_i - r[6].to_i).abs + 1
      s_span = (r[9].to_i - r[8].to_i).abs + 1
      # Reported mismatches plus the unaligned bases at both ends of the read
      obs_m = r[4].to_i + (r[6].to_i - 1) + (mm[id][2] - r[7].to_i)
      # Alignment columns not covered by the query / subject span (gap columns)
      obs_del = aln_len - q_span
      obs_ins = aln_len - s_span
      ofh.puts(([id, obs_m, r[2], r[7].to_i - r[6].to_i + 1,
        obs_ins, obs_del, r[5]] + mm[id]).join("\t"))
    end
    $stderr.puts " Found #{k} alignments." unless o[:q]
  end
end