miga-base 0.3.6.0 → 0.3.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/actions/stats.rb +0 -2
- data/lib/miga/version.rb +3 -3
- data/scripts/clade_finding.bash +3 -0
- data/utils/cleanup-databases.rb +25 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
- data/utils/subclade/pipeline.rb +6 -4
- data/utils/subclades.R +2 -2
- metadata +4 -2
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -56
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -60
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -38
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -55
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c629b49cedd42f76fd8c466ecbc561e915dcaeef9dbbc2140f66300ac21c4e86
|
4
|
+
data.tar.gz: 2174cd7e010340ea865b7ec251a9d8b2823a059bbcec782924052a5da0c0a247
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f20e4c7312402beec67de7a458356f76bd932edbeadffdee83061d040c8eaaddf31ada6304873638237ca299b806054e7de9656ecfebaeaa7e8e5ddb83710a93
|
7
|
+
data.tar.gz: 5dcae9006b7b84d75ce05019f9ac3f6defe4305c52f574a7c3426a3f8ee098ee4cd0ecf0968d5669066886fbc77185cd477ea265df5509eaf66733e2d4dfb421
|
data/actions/stats.rb
CHANGED
data/lib/miga/version.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
require
|
2
|
+
require 'date'
|
3
3
|
|
4
4
|
##
|
5
5
|
# High-level minimal requirements for the MiGA::MiGA class.
|
@@ -10,11 +10,11 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3, 6, 0]
|
13
|
+
VERSION = [0.3, 6, 1]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
17
|
-
VERSION_NAME =
|
17
|
+
VERSION_NAME = 'tinge'
|
18
18
|
|
19
19
|
##
|
20
20
|
# Date of the current gem release.
|
data/scripts/clade_finding.bash
CHANGED
@@ -11,6 +11,9 @@ cd "$PROJECT/data/10.clades/01.find"
|
|
11
11
|
# Initialize
|
12
12
|
miga date > "miga-project.start"
|
13
13
|
|
14
|
+
# Cleanup databases
|
15
|
+
ruby -I "$MIGA/lib" "$MIGA/utils/cleanup-databases.rb" "$PROJECT" "$CORES"
|
16
|
+
|
14
17
|
# Run
|
15
18
|
ruby -I "$MIGA/lib" "$MIGA/utils/subclades.rb" "$PROJECT" "$SCRIPT"
|
16
19
|
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'thread'
|
4
|
+
require 'miga'
|
5
|
+
|
6
|
+
ARGV[1] or abort "Usage: #{$0} path/to/project threads"
|
7
|
+
|
8
|
+
$stderr.puts "Cleaning databases..."
|
9
|
+
ds_list = MiGA::Project.load(ARGV[0]).datasets.
|
10
|
+
select(&:is_ref?).select(&:is_active?)
|
11
|
+
|
12
|
+
thr = ARGV[1].to_i
|
13
|
+
|
14
|
+
(0 .. thr-1).each do |t|
|
15
|
+
fork do
|
16
|
+
k = -1
|
17
|
+
ds_list.each do |i|
|
18
|
+
k = (k+1) % thr
|
19
|
+
next unless k == t
|
20
|
+
i.cleanup_distances!
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
Process.waitall
|
25
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.N50.pl
|
@@ -0,0 +1 @@
|
|
1
|
+
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.filterN.pl
|
@@ -0,0 +1 @@
|
|
1
|
+
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.length.pl
|
@@ -0,0 +1 @@
|
|
1
|
+
utils/enveomics/Pipelines/blast.pbs/../../Scripts/FastA.split.pl
|
@@ -0,0 +1 @@
|
|
1
|
+
utils/enveomics/Scripts/lib/../../enveomics.R
|
data/utils/subclade/pipeline.rb
CHANGED
@@ -26,16 +26,18 @@ module MiGA::SubcladeRunner::Pipeline
|
|
26
26
|
`ogs.mcl.rb -o '#{ogs_file}.tmp' --abc '#{abc_path}' -t '#{opts[:thr]}'`
|
27
27
|
File.open(ogs_file, 'w') do |fh|
|
28
28
|
File.foreach("#{ogs_file}.tmp").with_index do |ln, lno|
|
29
|
-
fh.puts ln if lno
|
29
|
+
fh.puts ln if lno > 0
|
30
30
|
end
|
31
31
|
end
|
32
32
|
File.unlink "#{ogs_file}.tmp"
|
33
33
|
end
|
34
34
|
|
35
35
|
# Find species medoids
|
36
|
-
|
37
|
-
|
38
|
-
miga-project.
|
36
|
+
if File.size? 'miga-project.dist.rdata'
|
37
|
+
src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
|
38
|
+
`Rscript '#{src}' miga-project.dist.rdata \
|
39
|
+
miga-project.ani95-medoids miga-project.ani95-clades`
|
40
|
+
end
|
39
41
|
|
40
42
|
# Propose clades
|
41
43
|
ofh = File.open('miga-project.proposed-clades', 'w')
|
data/utils/subclades.R
CHANGED
@@ -22,11 +22,11 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
|
22
22
|
if(length(ani.d) == 0 && !file.exists(dist_rdata)){
|
23
23
|
# Read from ani_file
|
24
24
|
a <- read.table(gzfile(ani_file), sep = '\t', header = TRUE, as.is = TRUE)
|
25
|
-
if(nrow(a)==0){
|
25
|
+
if(nrow(a) == 0){
|
26
26
|
generate_empty_files(out_base)
|
27
27
|
return(NULL)
|
28
28
|
}
|
29
|
-
if(!is.na(sel)
|
29
|
+
if(!is.na(sel) && file.exists(sel)){
|
30
30
|
say('Filter selection')
|
31
31
|
lab <- read.table(sel, sep='\t', head=FALSE, as.is=TRUE)[,1]
|
32
32
|
a <- a[a$a %in% lab & a$b %in% lab, ]
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.6.0
|
4
|
+
version: 0.3.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
@@ -188,6 +188,7 @@ files:
|
|
188
188
|
- test/test_helper.rb
|
189
189
|
- utils/adapters.fa
|
190
190
|
- utils/arch-ess-genes.rb
|
191
|
+
- utils/cleanup-databases.rb
|
191
192
|
- utils/core-pan-plot.R
|
192
193
|
- utils/distance/base.rb
|
193
194
|
- utils/distance/commands.rb
|
@@ -341,6 +342,7 @@ files:
|
|
341
342
|
- utils/enveomics/Scripts/gi2tax.rb
|
342
343
|
- utils/enveomics/Scripts/in_silico_GA_GI.pl
|
343
344
|
- utils/enveomics/Scripts/lib/data/essential.hmm.gz
|
345
|
+
- utils/enveomics/Scripts/lib/enveomics.R
|
344
346
|
- utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb
|
345
347
|
- utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb
|
346
348
|
- utils/enveomics/Scripts/lib/enveomics_rb/og.rb
|
@@ -495,7 +497,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
495
497
|
version: '0'
|
496
498
|
requirements: []
|
497
499
|
rubyforge_project:
|
498
|
-
rubygems_version: 2.
|
500
|
+
rubygems_version: 2.7.7
|
499
501
|
signing_key:
|
500
502
|
specification_version: 4
|
501
503
|
summary: MiGA
|
@@ -1,56 +0,0 @@
|
|
1
|
-
#!/usr/bin/env perl
|
2
|
-
#
|
3
|
-
# @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
-
# @update: Oct 07 2015
|
5
|
-
# @license: artistic license 2.0
|
6
|
-
#
|
7
|
-
use strict;
|
8
|
-
use warnings;
|
9
|
-
use List::Util qw/sum min max/;
|
10
|
-
|
11
|
-
my ($seqs, $minlen, $n__) = @ARGV;
|
12
|
-
$seqs or die "
|
13
|
-
Description:
|
14
|
-
Calculates the N50 value of a set of sequences. Alternatively, it
|
15
|
-
can calculate other N** values. It also calculates the total number
|
16
|
-
of sequences and the total added length.
|
17
|
-
|
18
|
-
Usage:
|
19
|
-
$0 seqs.fa[ minlen[ **]]
|
20
|
-
|
21
|
-
seqs.fa A FastA file containing the sequences.
|
22
|
-
minlen (optional) The minimum length to take into consideration.
|
23
|
-
By default: 0.
|
24
|
-
** Value N** to calculate. By default: 50 (N50).
|
25
|
-
";
|
26
|
-
$minlen ||= 0;
|
27
|
-
$n__ ||= 50;
|
28
|
-
|
29
|
-
my @len = ();
|
30
|
-
open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
|
31
|
-
while(<SEQ>){
|
32
|
-
if(/^>/){
|
33
|
-
push @len, 0;
|
34
|
-
}else{
|
35
|
-
next if /^;/;
|
36
|
-
chomp;
|
37
|
-
s/\W//g;
|
38
|
-
$len[-1]+=length $_;
|
39
|
-
}
|
40
|
-
}
|
41
|
-
close SEQ;
|
42
|
-
@len = sort { $a <=> $b } map { $_>=$minlen?$_:() } @len;
|
43
|
-
my $tot = (sum(@len) || 0);
|
44
|
-
|
45
|
-
my $thr = $n__*$tot/100;
|
46
|
-
my $pos = 0;
|
47
|
-
for(@len){
|
48
|
-
$pos+= $_;
|
49
|
-
if($pos>=$thr){
|
50
|
-
print "N$n__: $_\n";
|
51
|
-
last;
|
52
|
-
}
|
53
|
-
}
|
54
|
-
print "Sequences: ".scalar(@len)."\n";
|
55
|
-
print "Total length: $tot\n";
|
56
|
-
|
@@ -1,60 +0,0 @@
|
|
1
|
-
#!/usr/bin/env perl
|
2
|
-
#
|
3
|
-
# @author Luis M. Rodriguez-R
|
4
|
-
# @update Oct-07-2015
|
5
|
-
# @license artistic license 2.0
|
6
|
-
#
|
7
|
-
|
8
|
-
use warnings;
|
9
|
-
use strict;
|
10
|
-
|
11
|
-
my($file, $content, $stretch) = @ARGV;
|
12
|
-
$file or die <<HELP
|
13
|
-
|
14
|
-
Description:
|
15
|
-
Filter sequences by N-content and presence of long homopolymers.
|
16
|
-
Usage:
|
17
|
-
$0 sequences.fa [content [stretch]] > filtered.fa
|
18
|
-
Where:
|
19
|
-
sequences.fa Input file in FastA format
|
20
|
-
content A number between 0 and 1 indicating the maximum proportion of Ns
|
21
|
-
(1 to turn off, 0.5 by default)
|
22
|
-
stretch A number indicating the maximum number of consecutive identical
|
23
|
-
nucleotides allowed (0 to turn off, 100 by default)
|
24
|
-
filtered.fa Filtered set of sequences.
|
25
|
-
|
26
|
-
HELP
|
27
|
-
;
|
28
|
-
($content ||= 0.5)+=0;
|
29
|
-
($stretch ||= 100)+=0;
|
30
|
-
|
31
|
-
my $good = 0;
|
32
|
-
my $N = 0;
|
33
|
-
|
34
|
-
FASTA: {
|
35
|
-
local $/ = "\n>";
|
36
|
-
open FILE, "<", $file or die "I can not open the file: $file: $!\n";
|
37
|
-
SEQ: while(<FILE>){
|
38
|
-
$N++;
|
39
|
-
s/^;.*//gm;
|
40
|
-
s/>//g;
|
41
|
-
my($n,$s) = split /\n/, $_, 2;
|
42
|
-
(my $clean = $s) =~ s/[^ACTGN]//g;
|
43
|
-
if($content < 1){
|
44
|
-
(my $Ns = $clean) =~ s/[^N]//g;
|
45
|
-
next SEQ if length($Ns)>length($clean)*$content;
|
46
|
-
}
|
47
|
-
if($stretch > 0){
|
48
|
-
for my $nuc (qw(A C T G N)){
|
49
|
-
next SEQ if $clean =~ m/[$nuc]{$stretch}/;
|
50
|
-
}
|
51
|
-
}
|
52
|
-
print ">$n\n$s\n";
|
53
|
-
$good++;
|
54
|
-
}
|
55
|
-
close FILE;
|
56
|
-
print STDERR "Total sequences: $N\nAfter filtering: $good\n";
|
57
|
-
}
|
58
|
-
|
59
|
-
|
60
|
-
|
@@ -1,38 +0,0 @@
|
|
1
|
-
#!/usr/bin/env perl
|
2
|
-
#
|
3
|
-
# @author Luis M Rodriguez-R
|
4
|
-
# @update Oct-07-2015
|
5
|
-
# @license artistic license 2.0
|
6
|
-
#
|
7
|
-
|
8
|
-
use warnings;
|
9
|
-
use strict;
|
10
|
-
|
11
|
-
$#ARGV>=0 or die "
|
12
|
-
Usage:
|
13
|
-
$0 seqs.fa... > length.txt
|
14
|
-
|
15
|
-
seqs.fa One or more FastA files.
|
16
|
-
length.txt A table with the lengths of the sequences.
|
17
|
-
|
18
|
-
";
|
19
|
-
|
20
|
-
for my $fa (@ARGV){
|
21
|
-
open FA, "<", $fa or die "Cannot open file: $fa: $!\n";
|
22
|
-
my $def = '';
|
23
|
-
my $len = 0;
|
24
|
-
while(<FA>){
|
25
|
-
next if /^;/;
|
26
|
-
if(m/^>(\S+)\s?/){
|
27
|
-
print "$def\t$len\n" if $def;
|
28
|
-
$def = $1;
|
29
|
-
$len = 0;
|
30
|
-
}else{
|
31
|
-
s/[^A-Za-z]//g;
|
32
|
-
$len+= length $_;
|
33
|
-
}
|
34
|
-
}
|
35
|
-
print "$def\t$len\n" if $def;
|
36
|
-
close FA;
|
37
|
-
}
|
38
|
-
|
@@ -1,55 +0,0 @@
|
|
1
|
-
#!/usr/bin/env perl
|
2
|
-
#
|
3
|
-
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
-
# @update Oct-13-2015
|
5
|
-
# @license artistic license 2.0
|
6
|
-
#
|
7
|
-
|
8
|
-
use warnings;
|
9
|
-
use strict;
|
10
|
-
use Symbol;
|
11
|
-
|
12
|
-
my ($file, $base, $outN) = @ARGV;
|
13
|
-
|
14
|
-
$outN ||= 12;
|
15
|
-
($file and $base) or die "
|
16
|
-
Usage
|
17
|
-
$0 in_file.fa out_base[ no_files]
|
18
|
-
|
19
|
-
in_file.fa Input file in FastA format.
|
20
|
-
out_base Prefix for the name of the output files. It will
|
21
|
-
be appended with .<i>.fa, where <i> is a consecutive
|
22
|
-
number starting in 1.
|
23
|
-
no_files Number of files to generate. By default: 12.
|
24
|
-
|
25
|
-
";
|
26
|
-
|
27
|
-
|
28
|
-
my @outSym = ();
|
29
|
-
for my $i (1 .. $outN){
|
30
|
-
$outSym[$i-1] = gensym;
|
31
|
-
open $outSym[$i-1], ">", "$base.$i.fa" or
|
32
|
-
die "I can not create the file: $base.$i.fa: $!\n";
|
33
|
-
}
|
34
|
-
|
35
|
-
|
36
|
-
my($i, $seq) = (-1, '');
|
37
|
-
open FILE, "<", $file or die "I can not read the file: $file: $!\n";
|
38
|
-
while(my $ln=<FILE>){
|
39
|
-
next if $ln=~/^;/;
|
40
|
-
if($ln =~ m/^>/){
|
41
|
-
print { $outSym[$i % $outN] } $seq if $seq;
|
42
|
-
$i++;
|
43
|
-
$seq = '';
|
44
|
-
}
|
45
|
-
$seq.=$ln;
|
46
|
-
}
|
47
|
-
print { $outSym[$i % $outN] } $seq if $seq;
|
48
|
-
close FILE;
|
49
|
-
|
50
|
-
for(my $j=0; $j<$outN; $j++){
|
51
|
-
close $outSym[$j];
|
52
|
-
}
|
53
|
-
|
54
|
-
print STDERR "Sequences: ".($i+1)."\nFiles: $outN\n";
|
55
|
-
|