miga-base 0.3.6.3 → 0.3.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA256:
3
- metadata.gz: f989fccd5161208979728e293bfc89d18e24e1ca6941b18a804063749cce56e9
4
- data.tar.gz: 7568cb56974c18279428a50f750a5302beae8063a190def53734eb985fe4b56b
2
+ SHA1:
3
+ metadata.gz: 38acfea63f14c8cca837d84fbca92fa70d134e76
4
+ data.tar.gz: 6c6f955e4ba5c3a90ead50a3ac38896df4febcf1
5
5
  SHA512:
6
- metadata.gz: 8d026577fd38d399d643fbc9ef7c01ffa7969d8206d6f859655b9d04e41d7c3e75b1432581add654c5b3d58ce6b2bedf566783c2587165f811e12f6fbf87b695
7
- data.tar.gz: 4fae078b18a74314a7654d5c7a83ca7277555977b299931e33ddfabc3356cfa82994b00c6950a646eb8c16289f4cdb34f686574fc9505aacfec9ae826242c742
6
+ metadata.gz: d4303872589a4d02f75d392957dfbcc06f5d3e1f78107a521833f98c3d11b155dbb16bb386e1ab50723d2cb12bcd2bba77ab87033c6a7916f49ce68ce59b4941
7
+ data.tar.gz: 05134a38b3082e0a5982beb5f23d2a882ce4bba23c34cda1d81bd697288ac841861b6f2a07eeeafe691fed53816125f3de3fd75bf9a7fbea3d49a5975558cb3f
data/actions/init.rb CHANGED
@@ -154,7 +154,7 @@ end
154
154
 
155
155
  # Check for R packages
156
156
  $stderr.puts "Looking for R packages:"
157
- %w(enveomics.R ape phangorn phytools cluster vegan).each do |pkg|
157
+ %w(enveomics.R ape cluster vegan).each do |pkg|
158
158
  $stderr.print "Testing #{pkg}... "
159
159
  `echo "library('#{pkg}')" | #{paths["R"].shellescape} --vanilla -q 2>&1`
160
160
  if $?.success?
data/actions/tax_dist.rb CHANGED
@@ -3,34 +3,34 @@
3
3
  # @package MiGA
4
4
  # @license Artistic-2.0
5
5
 
6
- require "miga/tax_index"
7
- require "zlib"
8
- require "tmpdir"
6
+ require 'miga/tax_index'
7
+ require 'zlib'
8
+ require 'tmpdir'
9
9
 
10
- o = {q:true, format: :json}
10
+ o = {q: true, format: :json}
11
11
  OptionParser.new do |opt|
12
12
  opt_banner(opt)
13
13
  opt_object(opt, o, [:project])
14
14
  opt_filter_datasets(opt, o)
15
- opt.on("-i", "--index FILE",
16
- "Pre-calculated tax-index (in tabular format) to be used.",
17
- "If passed, dataset filtering arguments are ignored."
18
- ){ |v| o[:index]=v }
15
+ opt.on('-i', '--index FILE',
16
+ 'Pre-calculated tax-index (in tabular format) to be used.',
17
+ 'If passed, dataset filtering arguments are ignored.'
18
+ ){ |v| o[:index] = v }
19
19
  opt_common(opt, o)
20
20
  end.parse!
21
21
 
22
22
  ##=> Functions <=
23
23
  # Returns the _canonical_ ID between strings +a+ and +b+.
24
- def cannid(a, b) ; [a, b].sort.join("-") ; end
24
+ def cannid(a, b) ; (a > b ? [b, a] : [a, b]).join('-') ; end
25
25
 
26
26
  ##=> Main <=
27
- opt_require(o, project:"-P")
27
+ opt_require(o, project: '-P')
28
28
 
29
- $stderr.puts "Loading project." unless o[:q]
29
+ $stderr.puts 'Loading project.' unless o[:q]
30
30
  p = MiGA::Project.load(o[:project])
31
31
  raise "Impossible to load project: #{o[:project]}" if p.nil?
32
32
 
33
- metric = p.is_clade? ? "ani" : "aai"
33
+ metric = p.is_clade? ? 'ani' : 'aai'
34
34
  res_n = "#{metric}_distances"
35
35
  $stderr.puts "Reading distances (1-#{metric.upcase})." unless o[:q]
36
36
  res = p.result res_n
@@ -38,31 +38,33 @@ raise "#{res_n} not yet calculated." if res.nil?
38
38
  matrix = res.file_path(:matrix)
39
39
  raise "#{res_n} has no matrix." if matrix.nil?
40
40
  dist = {}
41
- mfh = matrix=~/\.gz$/ ? Zlib::GzipReader.open(matrix) : File.open(matrix,"r")
41
+ mfh = matrix =~ /\.gz$/ ? Zlib::GzipReader.open(matrix) : File.open(matrix, 'r')
42
42
  mfh.each_line do |ln|
43
43
  next if mfh.lineno==1
44
- row = ln.chomp.split(/\t/)
45
- dist[cannid(row[1], row[2])] = [row[3], 0, ["root:biota"]]
44
+ row = ln.chomp.split("\t")
45
+ dist[cannid(row[1], row[2])] = [row[3], row[5], row[6], 0, ['root:biota']]
46
+ $stderr.print(" Ln:#{mfh.lineno} \r") if !o[:q] and (mfh.lineno % 1_000) == 0
46
47
  end
48
+ $stderr.puts " Lines: #{mfh.lineno}" unless o[:q]
47
49
  mfh.close
48
50
 
49
51
  Dir.mktmpdir do |dir|
50
52
  if o[:index].nil?
51
- $stderr.puts "Loading datasets." unless o[:q]
53
+ $stderr.puts 'Loading datasets.' unless o[:q]
52
54
  ds = p.datasets
53
55
  ds.select!{ |d| not d.metadata[:tax].nil? }
54
56
  ds = filter_datasets!(ds, o)
55
57
 
56
- $stderr.puts "Indexing taxonomy." unless o[:q]
58
+ $stderr.puts 'Indexing taxonomy.' unless o[:q]
57
59
  tax_index = MiGA::TaxIndex.new
58
60
  ds.each { |d| tax_index << d }
59
- tab = File.expand_path("index.tab", dir)
60
- File.open(tab, "w") { |fh| fh.print tax_index.to_tab }
61
+ tab = File.expand_path('index.tab', dir)
62
+ File.open(tab, 'w') { |fh| fh.print tax_index.to_tab }
61
63
  else
62
64
  tab = o[:index]
63
65
  end
64
-
65
- $stderr.puts "Traversing taxonomy." unless o[:q]
66
+
67
+ $stderr.puts 'Traversing taxonomy.' unless o[:q]
66
68
  rank_i = 0
67
69
  MiGA::Taxonomy.KNOWN_RANKS.each do |rank|
68
70
  $stderr.print "o #{rank}: " unless o[:q]
@@ -70,13 +72,13 @@ Dir.mktmpdir do |dir|
70
72
  rank_i += 1
71
73
  in_rank = nil
72
74
  ds_name = []
73
- File.open(tab, "r") do |fh|
75
+ File.open(tab, 'r') do |fh|
74
76
  fh.each_line do |ln|
75
77
  if ln =~ /^ {#{(rank_i-1)*2}}\S+:\S+:/
76
78
  in_rank = nil
77
79
  ds_name = []
78
80
  elsif ln =~ /^ {#{rank_i*2}}(#{rank}:(\S+)):/
79
- in_rank = $2=="?" ? nil : $1
81
+ in_rank = $2 == '?' ? nil : $1
80
82
  ds_name = []
81
83
  elsif ln =~ /^ *# (\S+)/ and not in_rank.nil?
82
84
  ds_i = $1
@@ -85,8 +87,8 @@ Dir.mktmpdir do |dir|
85
87
  k = cannid(ds_i, ds_j)
86
88
  next if dist[k].nil?
87
89
  rank_n += 1
88
- dist[k][1] = rank_i
89
- dist[k][2].unshift in_rank
90
+ dist[k][3] = rank_i
91
+ dist[k][4].unshift in_rank
90
92
  end
91
93
  end
92
94
  end
@@ -95,6 +97,10 @@ Dir.mktmpdir do |dir|
95
97
  end
96
98
  end
97
99
 
100
+ $stderr.puts 'Generating report.' unless o[:q]
98
101
  dist.keys.each do |k|
99
- puts (k.split("-") + dist[k].flatten).join("\t")
102
+ dist[k][5] = dist[k][4].reverse.join(' ')
103
+ dist[k][4] = dist[k][4].first
104
+ puts (k.split('-') + dist[k]).join("\t")
100
105
  end
106
+
data/bin/miga CHANGED
@@ -3,43 +3,43 @@
3
3
  # @package MiGA
4
4
  # @license Artistic-2.0
5
5
 
6
- $:.push File.expand_path("../lib", File.dirname(__FILE__))
6
+ $:.push File.expand_path('../../lib', __FILE__)
7
7
 
8
- require "optparse"
9
- require "miga"
8
+ require 'optparse'
9
+ require 'miga'
10
10
 
11
11
  ##=> Global variables <=
12
12
 
13
13
  $task_desc = {
14
14
  # Projects
15
- new: "Creates an empty MiGA project.",
16
- about: "Displays information about a MiGA project.",
17
- plugins: "Lists or (un)installs plugins in a MiGA project.",
18
- doctor: "Performs consistency checks on a MiGA project.",
15
+ new: 'Creates an empty MiGA project',
16
+ about: 'Displays information about a MiGA project',
17
+ plugins: 'Lists or (un)installs plugins in a MiGA project',
18
+ doctor: 'Performs consistency checks on a MiGA project',
19
19
  # Datasets
20
- add: "Creates an empty dataset in a pre-existing MiGA project.",
21
- get: "Downloads a dataset from public databases into a MiGA project.",
22
- ncbi_get: "Downloads all genomes in a taxon from NCBI into a MiGA project.",
23
- rm: "Removes a dataset from an MiGA project.",
24
- find: "Finds unregistered datasets based on result files.",
25
- ln: "Link datasets (including results) from one project to another.",
26
- ls: "Lists all registered datasets in an MiGA project.",
20
+ add: 'Creates an empty dataset in a pre-existing MiGA project',
21
+ get: 'Downloads a dataset from public databases into a MiGA project',
22
+ ncbi_get: 'Downloads all genomes in a taxon from NCBI into a MiGA project',
23
+ rm: 'Removes a dataset from an MiGA project',
24
+ find: 'Finds unregistered datasets based on result files',
25
+ ln: 'Link datasets (including results) from one project to another',
26
+ ls: 'Lists all registered datasets in an MiGA project',
27
27
  # Results
28
- add_result: "Registers a result.",
29
- stats: "Extracts statistics for the given result.",
30
- files: "Lists all registered files from the results of a dataset or project.",
31
- run: "Executes locally one step analysis producing the given result.",
32
- summary: "Generates a summary table for the statistics of all datasets.",
28
+ add_result: 'Registers a result',
29
+ stats: 'Extracts statistics for the given result',
30
+ files: 'Lists registered files from the results of a dataset or project',
31
+ run: 'Executes locally one step analysis producing the given result',
32
+ summary: 'Generates a summary table for the statistics of all datasets',
33
33
  # System
34
- init: "Initialize MiGA to process new projects.",
35
- daemon: "Controls the daemon of a MiGA project.",
36
- date: "Returns the current date in standard MiGA format.",
37
- console: "Opens an IRB console with MiGA.",
34
+ init: 'Initialize MiGA to process new projects',
35
+ daemon: 'Controls the daemon of a MiGA project',
36
+ date: 'Returns the current date in standard MiGA format',
37
+ console: 'Opens an IRB console with MiGA',
38
38
  # Taxonomy
39
- tax_set: "Registers taxonomic information for datasets.",
40
- tax_test: "Returns test of taxonomic distributions for query datasets.",
41
- tax_index: "Creates a taxonomy-indexed list of the datasets.",
42
- tax_dist: "Estimates distributions of distance by taxonomy.",
39
+ tax_set: 'Registers taxonomic information for datasets',
40
+ tax_test: 'Returns test of taxonomic distributions for query datasets',
41
+ tax_index: 'Creates a taxonomy-indexed list of the datasets',
42
+ tax_dist: 'Estimates distributions of distance by taxonomy',
43
43
  }
44
44
 
45
45
  $task_alias = {
@@ -178,14 +178,14 @@ def filter_datasets!(ds, o)
178
178
  end
179
179
 
180
180
  def add_metadata(o, obj)
181
- o[:metadata].split(",").each do |pair|
182
- (k,v) = pair.split("=")
181
+ o[:metadata].split(',').each do |pair|
182
+ (k,v) = pair.split('=')
183
183
  case v
184
184
  when 'true'; v = true
185
185
  when 'false'; v = false
186
186
  when 'nil'; v = nil
187
187
  end
188
- if k=='_step'
188
+ if k == '_step'
189
189
  obj.metadata["_try_#{v}"] ||= 0
190
190
  obj.metadata["_try_#{v}"] += 1
191
191
  end
@@ -205,20 +205,20 @@ ARGV[0] = $task_alias[ARGV[0].to_sym] unless
205
205
  ARGV[0].nil? or $task_alias[ARGV[0].to_sym].nil?
206
206
 
207
207
  case ARGV[0].to_s
208
- when "-v", "--version"
208
+ when '-v', '--version'
209
209
  puts MiGA::MiGA.VERSION
210
- when "-V", "--long-version"
210
+ when '-V', '--long-version'
211
211
  puts MiGA::MiGA.LONG_VERSION
212
- when "-C", "--citation"
212
+ when '-C', '--citation'
213
213
  puts MiGA::MiGA.CITATION
214
- when "console"
215
- require "irb"
216
- require "irb/completion"
214
+ when 'console'
215
+ require 'irb'
216
+ require 'irb/completion'
217
217
  ARGV.shift
218
218
  IRB.start
219
219
  when *execs
220
220
  $task = ARGV.shift.to_sym
221
- ARGV << "-h" if ARGV.empty? and not [:date, :init].include? $task
221
+ ARGV << '-h' if ARGV.empty? and not [:date, :init].include? $task
222
222
  begin
223
223
  load File.expand_path("../actions/#{$task}.rb", File.dirname(__FILE__))
224
224
  rescue => err
@@ -233,7 +233,7 @@ Microbial Genomes Atlas.
233
233
 
234
234
  Usage: #{$0} {action} [options]
235
235
 
236
- #{ MiGA::MiGA.tabulate([:action, :description], $task_desc.to_a).join("\n")}
236
+ #{ MiGA::MiGA.tabulate([:action, :description], $task_desc.to_a).join("\n") }
237
237
 
238
238
  generic options:
239
239
  -h, --help Display this screen.
data/lib/miga/daemon.rb CHANGED
@@ -99,11 +99,17 @@ class MiGA::Daemon < MiGA::MiGA
99
99
  status = JSON.parse(File.read(f_path), symbolize_names: true)
100
100
  status.keys.each do |i|
101
101
  status[i].map! do |j|
102
- j.tap { |k| k[:ds] = project.dataset(k[:ds_name]) unless k[:ds].nil? }
102
+ j.tap do |k|
103
+ unless k[:ds].nil? or k[:ds_name] == 'miga-project'
104
+ k[:ds] = project.dataset(k[:ds_name])
105
+ end
106
+ k[:job] = k[:job].to_sym unless k[:job].nil?
107
+ end
103
108
  end
104
109
  end
105
110
  @jobs_running = status[:jobs_running]
106
111
  @jobs_to_run = status[:jobs_to_run]
112
+ say "- jobs left running: #{@jobs_running.size}"
107
113
  purge!
108
114
  say "- jobs running: #{@jobs_running.size}"
109
115
  say "- jobs to run: #{@jobs_to_run.size}"
@@ -171,12 +177,12 @@ class MiGA::Daemon < MiGA::MiGA
171
177
  ##
172
178
  # Get the task with key symbol +job+ in dataset +ds+. For project-wide tasks
173
179
  # let +ds+ be nil.
174
- def get_job(job, ds=nil)
180
+ def get_job(job, ds = nil)
175
181
  (jobs_to_run + jobs_running).find do |j|
176
- if ds==nil
177
- j[:ds].nil? and j[:job]==job
182
+ if ds.nil?
183
+ j[:ds].nil? and j[:job] == job
178
184
  else
179
- (! j[:ds].nil?) and j[:ds].name==ds.name and j[:job]==job
185
+ (! j[:ds].nil?) and j[:ds].name == ds.name and j[:job] == job
180
186
  end
181
187
  end
182
188
  end
data/lib/miga/version.rb CHANGED
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.3, 6, 3]
13
+ VERSION = [0.3, 7, 0]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
data/scripts/ogs.bash CHANGED
@@ -11,7 +11,9 @@ cd "$PROJECT/data/10.clades/03.ogs"
11
11
  # Initialize
12
12
  miga date > "miga-project.start"
13
13
 
14
- DS=$(miga list_datasets -P "$PROJECT" --ref --no-multi)
14
+ DS=$(miga ls -P "$PROJECT" --ref --no-multi)
15
+ MIN_ID=$(miga about -P "$PROJECT" -m ogs_identity)
16
+ [[ $MIN_ID == "?" ]] && MIN_ID=80
15
17
  if [[ ! -s miga-project.ogs ]] ; then
16
18
  # Extract RBMs
17
19
  if [[ ! -s miga-project.abc ]] ; then
@@ -19,7 +21,7 @@ if [[ ! -s miga-project.ogs ]] ; then
19
21
  for i in $DS ; do
20
22
  file="miga-project.tmp/$i.abc"
21
23
  [[ -s "$file" ]] && continue
22
- echo "SELECT seq1,id1,seq2,id2,bitscore from rbm;" \
24
+ echo "SELECT seq1,id1,seq2,id2,bitscore from rbm where id >= $MIN_ID;" \
23
25
  | sqlite3 "../../09.distances/02.aai/$i.db" | tr "\\|" " " \
24
26
  | awk '{ print $1">"$2"'"\\t"'"$3">"$4"'"\\t"'"$5 }' \
25
27
  > "$file.tmp"
@@ -6,18 +6,19 @@ require 'miga'
6
6
  ARGV[1] or abort "Usage: #{$0} path/to/project threads"
7
7
 
8
8
  $stderr.puts "Cleaning databases..."
9
- ds_list = MiGA::Project.load(ARGV[0]).datasets.
10
- select(&:is_ref?).select(&:is_active?)
11
-
9
+ p = MiGA::Project.load(ARGV[0])
10
+ ds_names = p.dataset_names
12
11
  thr = ARGV[1].to_i
13
12
 
14
13
  (0 .. thr-1).each do |t|
15
14
  fork do
16
15
  k = -1
17
- ds_list.each do |i|
16
+ ds_names.each do |i|
18
17
  k = (k+1) % thr
19
18
  next unless k == t
20
- i.cleanup_distances!
19
+ d = p.dataset(i)
20
+ next unless d.is_ref? and d.is_active?
21
+ d.cleanup_distances!
21
22
  end
22
23
  end
23
24
  end
@@ -21,6 +21,7 @@ module MiGA::DistanceRunner::Commands
21
21
  ##
22
22
  # Estimates AAI against +target+ using hAAI
23
23
  def haai(target)
24
+ return nil if opts[:haai_p] == 'no'
24
25
  haai = aai_cmd(tmp_file('ess_genes.fa'),
25
26
  target.result(:essential_genes).file_path(:ess_genes),
26
27
  dataset.name, target.name, tmp_dbs[:haai],
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @update: Oct 07 2015
5
+ # @license: artistic license 2.0
6
+ #
7
+ use strict;
8
+ use warnings;
9
+ use List::Util qw/sum min max/;
10
+
11
+ my ($seqs, $minlen, $n__) = @ARGV;
12
+ $seqs or die "
13
+ Description:
14
+ Calculates the N50 value of a set of sequences. Alternatively, it
15
+ can calculate other N** values. It also calculates the total number
16
+ of sequences and the total added length.
17
+
18
+ Usage:
19
+ $0 seqs.fa[ minlen[ **]]
20
+
21
+ seqs.fa A FastA file containing the sequences.
22
+ minlen (optional) The minimum length to take into consideration.
23
+ By default: 0.
24
+ ** Value N** to calculate. By default: 50 (N50).
25
+ ";
26
+ $minlen ||= 0;
27
+ $n__ ||= 50;
28
+
29
+ my @len = ();
30
+ open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
31
+ while(<SEQ>){
32
+ if(/^>/){
33
+ push @len, 0;
34
+ }else{
35
+ next if /^;/;
36
+ chomp;
37
+ s/\W//g;
38
+ $len[-1]+=length $_;
39
+ }
40
+ }
41
+ close SEQ;
42
+ @len = sort { $a <=> $b } map { $_>=$minlen?$_:() } @len;
43
+ my $tot = (sum(@len) || 0);
44
+
45
+ my $thr = $n__*$tot/100;
46
+ my $pos = 0;
47
+ for(@len){
48
+ $pos+= $_;
49
+ if($pos>=$thr){
50
+ print "N$n__: $_\n";
51
+ last;
52
+ }
53
+ }
54
+ print "Sequences: ".scalar(@len)."\n";
55
+ print "Total length: $tot\n";
56
+
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author Luis M. Rodriguez-R
4
+ # @update Oct-07-2015
5
+ # @license artistic license 2.0
6
+ #
7
+
8
+ use warnings;
9
+ use strict;
10
+
11
+ my($file, $content, $stretch) = @ARGV;
12
+ $file or die <<HELP
13
+
14
+ Description:
15
+ Filter sequences by N-content and presence of long homopolymers.
16
+ Usage:
17
+ $0 sequences.fa [content [stretch]] > filtered.fa
18
+ Where:
19
+ sequences.fa Input file in FastA format
20
+ content A number between 0 and 1 indicating the maximum proportion of Ns
21
+ (1 to turn off, 0.5 by default)
22
+ stretch A number indicating the maximum number of consecutive identical
23
+ nucleotides allowed (0 to turn off, 100 by default)
24
+ filtered.fa Filtered set of sequences.
25
+
26
+ HELP
27
+ ;
28
+ ($content ||= 0.5)+=0;
29
+ ($stretch ||= 100)+=0;
30
+
31
+ my $good = 0;
32
+ my $N = 0;
33
+
34
+ FASTA: {
35
+ local $/ = "\n>";
36
+ open FILE, "<", $file or die "I can not open the file: $file: $!\n";
37
+ SEQ: while(<FILE>){
38
+ $N++;
39
+ s/^;.*//gm;
40
+ s/>//g;
41
+ my($n,$s) = split /\n/, $_, 2;
42
+ (my $clean = $s) =~ s/[^ACTGN]//g;
43
+ if($content < 1){
44
+ (my $Ns = $clean) =~ s/[^N]//g;
45
+ next SEQ if length($Ns)>length($clean)*$content;
46
+ }
47
+ if($stretch > 0){
48
+ for my $nuc (qw(A C T G N)){
49
+ next SEQ if $clean =~ m/[$nuc]{$stretch}/;
50
+ }
51
+ }
52
+ print ">$n\n$s\n";
53
+ $good++;
54
+ }
55
+ close FILE;
56
+ print STDERR "Total sequences: $N\nAfter filtering: $good\n";
57
+ }
58
+
59
+
60
+