miga-base 0.3.6.3 → 0.3.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA256:
3
- metadata.gz: f989fccd5161208979728e293bfc89d18e24e1ca6941b18a804063749cce56e9
4
- data.tar.gz: 7568cb56974c18279428a50f750a5302beae8063a190def53734eb985fe4b56b
2
+ SHA1:
3
+ metadata.gz: 38acfea63f14c8cca837d84fbca92fa70d134e76
4
+ data.tar.gz: 6c6f955e4ba5c3a90ead50a3ac38896df4febcf1
5
5
  SHA512:
6
- metadata.gz: 8d026577fd38d399d643fbc9ef7c01ffa7969d8206d6f859655b9d04e41d7c3e75b1432581add654c5b3d58ce6b2bedf566783c2587165f811e12f6fbf87b695
7
- data.tar.gz: 4fae078b18a74314a7654d5c7a83ca7277555977b299931e33ddfabc3356cfa82994b00c6950a646eb8c16289f4cdb34f686574fc9505aacfec9ae826242c742
6
+ metadata.gz: d4303872589a4d02f75d392957dfbcc06f5d3e1f78107a521833f98c3d11b155dbb16bb386e1ab50723d2cb12bcd2bba77ab87033c6a7916f49ce68ce59b4941
7
+ data.tar.gz: 05134a38b3082e0a5982beb5f23d2a882ce4bba23c34cda1d81bd697288ac841861b6f2a07eeeafe691fed53816125f3de3fd75bf9a7fbea3d49a5975558cb3f
data/actions/init.rb CHANGED
@@ -154,7 +154,7 @@ end
154
154
 
155
155
  # Check for R packages
156
156
  $stderr.puts "Looking for R packages:"
157
- %w(enveomics.R ape phangorn phytools cluster vegan).each do |pkg|
157
+ %w(enveomics.R ape cluster vegan).each do |pkg|
158
158
  $stderr.print "Testing #{pkg}... "
159
159
  `echo "library('#{pkg}')" | #{paths["R"].shellescape} --vanilla -q 2>&1`
160
160
  if $?.success?
data/actions/tax_dist.rb CHANGED
@@ -3,34 +3,34 @@
3
3
  # @package MiGA
4
4
  # @license Artistic-2.0
5
5
 
6
- require "miga/tax_index"
7
- require "zlib"
8
- require "tmpdir"
6
+ require 'miga/tax_index'
7
+ require 'zlib'
8
+ require 'tmpdir'
9
9
 
10
- o = {q:true, format: :json}
10
+ o = {q: true, format: :json}
11
11
  OptionParser.new do |opt|
12
12
  opt_banner(opt)
13
13
  opt_object(opt, o, [:project])
14
14
  opt_filter_datasets(opt, o)
15
- opt.on("-i", "--index FILE",
16
- "Pre-calculated tax-index (in tabular format) to be used.",
17
- "If passed, dataset filtering arguments are ignored."
18
- ){ |v| o[:index]=v }
15
+ opt.on('-i', '--index FILE',
16
+ 'Pre-calculated tax-index (in tabular format) to be used.',
17
+ 'If passed, dataset filtering arguments are ignored.'
18
+ ){ |v| o[:index] = v }
19
19
  opt_common(opt, o)
20
20
  end.parse!
21
21
 
22
22
  ##=> Functions <=
23
23
  # Returns the _canonical_ ID between strings +a+ and +b+.
24
- def cannid(a, b) ; [a, b].sort.join("-") ; end
24
+ def cannid(a, b) ; (a > b ? [b, a] : [a, b]).join('-') ; end
25
25
 
26
26
  ##=> Main <=
27
- opt_require(o, project:"-P")
27
+ opt_require(o, project: '-P')
28
28
 
29
- $stderr.puts "Loading project." unless o[:q]
29
+ $stderr.puts 'Loading project.' unless o[:q]
30
30
  p = MiGA::Project.load(o[:project])
31
31
  raise "Impossible to load project: #{o[:project]}" if p.nil?
32
32
 
33
- metric = p.is_clade? ? "ani" : "aai"
33
+ metric = p.is_clade? ? 'ani' : 'aai'
34
34
  res_n = "#{metric}_distances"
35
35
  $stderr.puts "Reading distances (1-#{metric.upcase})." unless o[:q]
36
36
  res = p.result res_n
@@ -38,31 +38,33 @@ raise "#{res_n} not yet calculated." if res.nil?
38
38
  matrix = res.file_path(:matrix)
39
39
  raise "#{res_n} has no matrix." if matrix.nil?
40
40
  dist = {}
41
- mfh = matrix=~/\.gz$/ ? Zlib::GzipReader.open(matrix) : File.open(matrix,"r")
41
+ mfh = matrix =~ /\.gz$/ ? Zlib::GzipReader.open(matrix) : File.open(matrix, 'r')
42
42
  mfh.each_line do |ln|
43
43
  next if mfh.lineno==1
44
- row = ln.chomp.split(/\t/)
45
- dist[cannid(row[1], row[2])] = [row[3], 0, ["root:biota"]]
44
+ row = ln.chomp.split("\t")
45
+ dist[cannid(row[1], row[2])] = [row[3], row[5], row[6], 0, ['root:biota']]
46
+ $stderr.print(" Ln:#{mfh.lineno} \r") if !o[:q] and (mfh.lineno % 1_000) == 0
46
47
  end
48
+ $stderr.puts " Lines: #{mfh.lineno}" unless o[:q]
47
49
  mfh.close
48
50
 
49
51
  Dir.mktmpdir do |dir|
50
52
  if o[:index].nil?
51
- $stderr.puts "Loading datasets." unless o[:q]
53
+ $stderr.puts 'Loading datasets.' unless o[:q]
52
54
  ds = p.datasets
53
55
  ds.select!{ |d| not d.metadata[:tax].nil? }
54
56
  ds = filter_datasets!(ds, o)
55
57
 
56
- $stderr.puts "Indexing taxonomy." unless o[:q]
58
+ $stderr.puts 'Indexing taxonomy.' unless o[:q]
57
59
  tax_index = MiGA::TaxIndex.new
58
60
  ds.each { |d| tax_index << d }
59
- tab = File.expand_path("index.tab", dir)
60
- File.open(tab, "w") { |fh| fh.print tax_index.to_tab }
61
+ tab = File.expand_path('index.tab', dir)
62
+ File.open(tab, 'w') { |fh| fh.print tax_index.to_tab }
61
63
  else
62
64
  tab = o[:index]
63
65
  end
64
-
65
- $stderr.puts "Traversing taxonomy." unless o[:q]
66
+
67
+ $stderr.puts 'Traversing taxonomy.' unless o[:q]
66
68
  rank_i = 0
67
69
  MiGA::Taxonomy.KNOWN_RANKS.each do |rank|
68
70
  $stderr.print "o #{rank}: " unless o[:q]
@@ -70,13 +72,13 @@ Dir.mktmpdir do |dir|
70
72
  rank_i += 1
71
73
  in_rank = nil
72
74
  ds_name = []
73
- File.open(tab, "r") do |fh|
75
+ File.open(tab, 'r') do |fh|
74
76
  fh.each_line do |ln|
75
77
  if ln =~ /^ {#{(rank_i-1)*2}}\S+:\S+:/
76
78
  in_rank = nil
77
79
  ds_name = []
78
80
  elsif ln =~ /^ {#{rank_i*2}}(#{rank}:(\S+)):/
79
- in_rank = $2=="?" ? nil : $1
81
+ in_rank = $2 == '?' ? nil : $1
80
82
  ds_name = []
81
83
  elsif ln =~ /^ *# (\S+)/ and not in_rank.nil?
82
84
  ds_i = $1
@@ -85,8 +87,8 @@ Dir.mktmpdir do |dir|
85
87
  k = cannid(ds_i, ds_j)
86
88
  next if dist[k].nil?
87
89
  rank_n += 1
88
- dist[k][1] = rank_i
89
- dist[k][2].unshift in_rank
90
+ dist[k][3] = rank_i
91
+ dist[k][4].unshift in_rank
90
92
  end
91
93
  end
92
94
  end
@@ -95,6 +97,10 @@ Dir.mktmpdir do |dir|
95
97
  end
96
98
  end
97
99
 
100
+ $stderr.puts 'Generating report.' unless o[:q]
98
101
  dist.keys.each do |k|
99
- puts (k.split("-") + dist[k].flatten).join("\t")
102
+ dist[k][5] = dist[k][4].reverse.join(' ')
103
+ dist[k][4] = dist[k][4].first
104
+ puts (k.split('-') + dist[k]).join("\t")
100
105
  end
106
+
data/bin/miga CHANGED
@@ -3,43 +3,43 @@
3
3
  # @package MiGA
4
4
  # @license Artistic-2.0
5
5
 
6
- $:.push File.expand_path("../lib", File.dirname(__FILE__))
6
+ $:.push File.expand_path('../../lib', __FILE__)
7
7
 
8
- require "optparse"
9
- require "miga"
8
+ require 'optparse'
9
+ require 'miga'
10
10
 
11
11
  ##=> Global variables <=
12
12
 
13
13
  $task_desc = {
14
14
  # Projects
15
- new: "Creates an empty MiGA project.",
16
- about: "Displays information about a MiGA project.",
17
- plugins: "Lists or (un)installs plugins in a MiGA project.",
18
- doctor: "Performs consistency checks on a MiGA project.",
15
+ new: 'Creates an empty MiGA project',
16
+ about: 'Displays information about a MiGA project',
17
+ plugins: 'Lists or (un)installs plugins in a MiGA project',
18
+ doctor: 'Performs consistency checks on a MiGA project',
19
19
  # Datasets
20
- add: "Creates an empty dataset in a pre-existing MiGA project.",
21
- get: "Downloads a dataset from public databases into a MiGA project.",
22
- ncbi_get: "Downloads all genomes in a taxon from NCBI into a MiGA project.",
23
- rm: "Removes a dataset from an MiGA project.",
24
- find: "Finds unregistered datasets based on result files.",
25
- ln: "Link datasets (including results) from one project to another.",
26
- ls: "Lists all registered datasets in an MiGA project.",
20
+ add: 'Creates an empty dataset in a pre-existing MiGA project',
21
+ get: 'Downloads a dataset from public databases into a MiGA project',
22
+ ncbi_get: 'Downloads all genomes in a taxon from NCBI into a MiGA project',
23
+ rm: 'Removes a dataset from an MiGA project',
24
+ find: 'Finds unregistered datasets based on result files',
25
+ ln: 'Link datasets (including results) from one project to another',
26
+ ls: 'Lists all registered datasets in an MiGA project',
27
27
  # Results
28
- add_result: "Registers a result.",
29
- stats: "Extracts statistics for the given result.",
30
- files: "Lists all registered files from the results of a dataset or project.",
31
- run: "Executes locally one step analysis producing the given result.",
32
- summary: "Generates a summary table for the statistics of all datasets.",
28
+ add_result: 'Registers a result',
29
+ stats: 'Extracts statistics for the given result',
30
+ files: 'Lists registered files from the results of a dataset or project',
31
+ run: 'Executes locally one step analysis producing the given result',
32
+ summary: 'Generates a summary table for the statistics of all datasets',
33
33
  # System
34
- init: "Initialize MiGA to process new projects.",
35
- daemon: "Controls the daemon of a MiGA project.",
36
- date: "Returns the current date in standard MiGA format.",
37
- console: "Opens an IRB console with MiGA.",
34
+ init: 'Initialize MiGA to process new projects',
35
+ daemon: 'Controls the daemon of a MiGA project',
36
+ date: 'Returns the current date in standard MiGA format',
37
+ console: 'Opens an IRB console with MiGA',
38
38
  # Taxonomy
39
- tax_set: "Registers taxonomic information for datasets.",
40
- tax_test: "Returns test of taxonomic distributions for query datasets.",
41
- tax_index: "Creates a taxonomy-indexed list of the datasets.",
42
- tax_dist: "Estimates distributions of distance by taxonomy.",
39
+ tax_set: 'Registers taxonomic information for datasets',
40
+ tax_test: 'Returns test of taxonomic distributions for query datasets',
41
+ tax_index: 'Creates a taxonomy-indexed list of the datasets',
42
+ tax_dist: 'Estimates distributions of distance by taxonomy',
43
43
  }
44
44
 
45
45
  $task_alias = {
@@ -178,14 +178,14 @@ def filter_datasets!(ds, o)
178
178
  end
179
179
 
180
180
  def add_metadata(o, obj)
181
- o[:metadata].split(",").each do |pair|
182
- (k,v) = pair.split("=")
181
+ o[:metadata].split(',').each do |pair|
182
+ (k,v) = pair.split('=')
183
183
  case v
184
184
  when 'true'; v = true
185
185
  when 'false'; v = false
186
186
  when 'nil'; v = nil
187
187
  end
188
- if k=='_step'
188
+ if k == '_step'
189
189
  obj.metadata["_try_#{v}"] ||= 0
190
190
  obj.metadata["_try_#{v}"] += 1
191
191
  end
@@ -205,20 +205,20 @@ ARGV[0] = $task_alias[ARGV[0].to_sym] unless
205
205
  ARGV[0].nil? or $task_alias[ARGV[0].to_sym].nil?
206
206
 
207
207
  case ARGV[0].to_s
208
- when "-v", "--version"
208
+ when '-v', '--version'
209
209
  puts MiGA::MiGA.VERSION
210
- when "-V", "--long-version"
210
+ when '-V', '--long-version'
211
211
  puts MiGA::MiGA.LONG_VERSION
212
- when "-C", "--citation"
212
+ when '-C', '--citation'
213
213
  puts MiGA::MiGA.CITATION
214
- when "console"
215
- require "irb"
216
- require "irb/completion"
214
+ when 'console'
215
+ require 'irb'
216
+ require 'irb/completion'
217
217
  ARGV.shift
218
218
  IRB.start
219
219
  when *execs
220
220
  $task = ARGV.shift.to_sym
221
- ARGV << "-h" if ARGV.empty? and not [:date, :init].include? $task
221
+ ARGV << '-h' if ARGV.empty? and not [:date, :init].include? $task
222
222
  begin
223
223
  load File.expand_path("../actions/#{$task}.rb", File.dirname(__FILE__))
224
224
  rescue => err
@@ -233,7 +233,7 @@ Microbial Genomes Atlas.
233
233
 
234
234
  Usage: #{$0} {action} [options]
235
235
 
236
- #{ MiGA::MiGA.tabulate([:action, :description], $task_desc.to_a).join("\n")}
236
+ #{ MiGA::MiGA.tabulate([:action, :description], $task_desc.to_a).join("\n") }
237
237
 
238
238
  generic options:
239
239
  -h, --help Display this screen.
data/lib/miga/daemon.rb CHANGED
@@ -99,11 +99,17 @@ class MiGA::Daemon < MiGA::MiGA
99
99
  status = JSON.parse(File.read(f_path), symbolize_names: true)
100
100
  status.keys.each do |i|
101
101
  status[i].map! do |j|
102
- j.tap { |k| k[:ds] = project.dataset(k[:ds_name]) unless k[:ds].nil? }
102
+ j.tap do |k|
103
+ unless k[:ds].nil? or k[:ds_name] == 'miga-project'
104
+ k[:ds] = project.dataset(k[:ds_name])
105
+ end
106
+ k[:job] = k[:job].to_sym unless k[:job].nil?
107
+ end
103
108
  end
104
109
  end
105
110
  @jobs_running = status[:jobs_running]
106
111
  @jobs_to_run = status[:jobs_to_run]
112
+ say "- jobs left running: #{@jobs_running.size}"
107
113
  purge!
108
114
  say "- jobs running: #{@jobs_running.size}"
109
115
  say "- jobs to run: #{@jobs_to_run.size}"
@@ -171,12 +177,12 @@ class MiGA::Daemon < MiGA::MiGA
171
177
  ##
172
178
  # Get the task with key symbol +job+ in dataset +ds+. For project-wide tasks
173
179
  # let +ds+ be nil.
174
- def get_job(job, ds=nil)
180
+ def get_job(job, ds = nil)
175
181
  (jobs_to_run + jobs_running).find do |j|
176
- if ds==nil
177
- j[:ds].nil? and j[:job]==job
182
+ if ds.nil?
183
+ j[:ds].nil? and j[:job] == job
178
184
  else
179
- (! j[:ds].nil?) and j[:ds].name==ds.name and j[:job]==job
185
+ (! j[:ds].nil?) and j[:ds].name == ds.name and j[:job] == job
180
186
  end
181
187
  end
182
188
  end
data/lib/miga/version.rb CHANGED
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.3, 6, 3]
13
+ VERSION = [0.3, 7, 0]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
data/scripts/ogs.bash CHANGED
@@ -11,7 +11,9 @@ cd "$PROJECT/data/10.clades/03.ogs"
11
11
  # Initialize
12
12
  miga date > "miga-project.start"
13
13
 
14
- DS=$(miga list_datasets -P "$PROJECT" --ref --no-multi)
14
+ DS=$(miga ls -P "$PROJECT" --ref --no-multi)
15
+ MIN_ID=$(miga about -P "$PROJECT" -m ogs_identity)
16
+ [[ $MIN_ID == "?" ]] && MIN_ID=80
15
17
  if [[ ! -s miga-project.ogs ]] ; then
16
18
  # Extract RBMs
17
19
  if [[ ! -s miga-project.abc ]] ; then
@@ -19,7 +21,7 @@ if [[ ! -s miga-project.ogs ]] ; then
19
21
  for i in $DS ; do
20
22
  file="miga-project.tmp/$i.abc"
21
23
  [[ -s "$file" ]] && continue
22
- echo "SELECT seq1,id1,seq2,id2,bitscore from rbm;" \
24
+ echo "SELECT seq1,id1,seq2,id2,bitscore from rbm where id >= $MIN_ID;" \
23
25
  | sqlite3 "../../09.distances/02.aai/$i.db" | tr "\\|" " " \
24
26
  | awk '{ print $1">"$2"'"\\t"'"$3">"$4"'"\\t"'"$5 }' \
25
27
  > "$file.tmp"
@@ -6,18 +6,19 @@ require 'miga'
6
6
  ARGV[1] or abort "Usage: #{$0} path/to/project threads"
7
7
 
8
8
  $stderr.puts "Cleaning databases..."
9
- ds_list = MiGA::Project.load(ARGV[0]).datasets.
10
- select(&:is_ref?).select(&:is_active?)
11
-
9
+ p = MiGA::Project.load(ARGV[0])
10
+ ds_names = p.dataset_names
12
11
  thr = ARGV[1].to_i
13
12
 
14
13
  (0 .. thr-1).each do |t|
15
14
  fork do
16
15
  k = -1
17
- ds_list.each do |i|
16
+ ds_names.each do |i|
18
17
  k = (k+1) % thr
19
18
  next unless k == t
20
- i.cleanup_distances!
19
+ d = p.dataset(i)
20
+ next unless d.is_ref? and d.is_active?
21
+ d.cleanup_distances!
21
22
  end
22
23
  end
23
24
  end
@@ -21,6 +21,7 @@ module MiGA::DistanceRunner::Commands
21
21
  ##
22
22
  # Estimates AAI against +target+ using hAAI
23
23
  def haai(target)
24
+ return nil if opts[:haai_p] == 'no'
24
25
  haai = aai_cmd(tmp_file('ess_genes.fa'),
25
26
  target.result(:essential_genes).file_path(:ess_genes),
26
27
  dataset.name, target.name, tmp_dbs[:haai],
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @update: Oct 07 2015
5
+ # @license: artistic license 2.0
6
+ #
7
+ use strict;
8
+ use warnings;
9
+ use List::Util qw/sum min max/;
10
+
11
+ my ($seqs, $minlen, $n__) = @ARGV;
12
+ $seqs or die "
13
+ Description:
14
+ Calculates the N50 value of a set of sequences. Alternatively, it
15
+ can calculate other N** values. It also calculates the total number
16
+ of sequences and the total added length.
17
+
18
+ Usage:
19
+ $0 seqs.fa[ minlen[ **]]
20
+
21
+ seqs.fa A FastA file containing the sequences.
22
+ minlen (optional) The minimum length to take into consideration.
23
+ By default: 0.
24
+ ** Value N** to calculate. By default: 50 (N50).
25
+ ";
26
+ $minlen ||= 0;
27
+ $n__ ||= 50;
28
+
29
+ my @len = ();
30
+ open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
31
+ while(<SEQ>){
32
+ if(/^>/){
33
+ push @len, 0;
34
+ }else{
35
+ next if /^;/;
36
+ chomp;
37
+ s/\W//g;
38
+ $len[-1]+=length $_;
39
+ }
40
+ }
41
+ close SEQ;
42
+ @len = sort { $a <=> $b } map { $_>=$minlen?$_:() } @len;
43
+ my $tot = (sum(@len) || 0);
44
+
45
+ my $thr = $n__*$tot/100;
46
+ my $pos = 0;
47
+ for(@len){
48
+ $pos+= $_;
49
+ if($pos>=$thr){
50
+ print "N$n__: $_\n";
51
+ last;
52
+ }
53
+ }
54
+ print "Sequences: ".scalar(@len)."\n";
55
+ print "Total length: $tot\n";
56
+
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author Luis M. Rodriguez-R
4
+ # @update Oct-07-2015
5
+ # @license artistic license 2.0
6
+ #
7
+
8
+ use warnings;
9
+ use strict;
10
+
11
+ my($file, $content, $stretch) = @ARGV;
12
+ $file or die <<HELP
13
+
14
+ Description:
15
+ Filter sequences by N-content and presence of long homopolymers.
16
+ Usage:
17
+ $0 sequences.fa [content [stretch]] > filtered.fa
18
+ Where:
19
+ sequences.fa Input file in FastA format
20
+ content A number between 0 and 1 indicating the maximum proportion of Ns
21
+ (1 to turn off, 0.5 by default)
22
+ stretch A number indicating the maximum number of consecutive identical
23
+ nucleotides allowed (0 to turn off, 100 by default)
24
+ filtered.fa Filtered set of sequences.
25
+
26
+ HELP
27
+ ;
28
+ ($content ||= 0.5)+=0;
29
+ ($stretch ||= 100)+=0;
30
+
31
+ my $good = 0;
32
+ my $N = 0;
33
+
34
+ FASTA: {
35
+ local $/ = "\n>";
36
+ open FILE, "<", $file or die "I can not open the file: $file: $!\n";
37
+ SEQ: while(<FILE>){
38
+ $N++;
39
+ s/^;.*//gm;
40
+ s/>//g;
41
+ my($n,$s) = split /\n/, $_, 2;
42
+ (my $clean = $s) =~ s/[^ACTGN]//g;
43
+ if($content < 1){
44
+ (my $Ns = $clean) =~ s/[^N]//g;
45
+ next SEQ if length($Ns)>length($clean)*$content;
46
+ }
47
+ if($stretch > 0){
48
+ for my $nuc (qw(A C T G N)){
49
+ next SEQ if $clean =~ m/[$nuc]{$stretch}/;
50
+ }
51
+ }
52
+ print ">$n\n$s\n";
53
+ $good++;
54
+ }
55
+ close FILE;
56
+ print STDERR "Total sequences: $N\nAfter filtering: $good\n";
57
+ }
58
+
59
+
60
+