miga-base 0.3.6.0 → 0.3.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/actions/stats.rb +0 -2
- data/lib/miga/version.rb +3 -3
- data/scripts/clade_finding.bash +3 -0
- data/utils/cleanup-databases.rb +25 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
- data/utils/subclade/pipeline.rb +6 -4
- data/utils/subclades.R +2 -2
- metadata +4 -2
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -56
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -60
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -38
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -55
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c629b49cedd42f76fd8c466ecbc561e915dcaeef9dbbc2140f66300ac21c4e86
|
4
|
+
data.tar.gz: 2174cd7e010340ea865b7ec251a9d8b2823a059bbcec782924052a5da0c0a247
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f20e4c7312402beec67de7a458356f76bd932edbeadffdee83061d040c8eaaddf31ada6304873638237ca299b806054e7de9656ecfebaeaa7e8e5ddb83710a93
|
7
|
+
data.tar.gz: 5dcae9006b7b84d75ce05019f9ac3f6defe4305c52f574a7c3426a3f8ee098ee4cd0ecf0968d5669066886fbc77185cd477ea265df5509eaf66733e2d4dfb421
|
data/actions/stats.rb
CHANGED
data/lib/miga/version.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
require
|
2
|
+
require 'date'
|
3
3
|
|
4
4
|
##
|
5
5
|
# High-level minimal requirements for the MiGA::MiGA class.
|
@@ -10,11 +10,11 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3, 6, 0]
|
13
|
+
VERSION = [0.3, 6, 1]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
17
|
-
VERSION_NAME =
|
17
|
+
VERSION_NAME = 'tinge'
|
18
18
|
|
19
19
|
##
|
20
20
|
# Date of the current gem release.
|
data/scripts/clade_finding.bash
CHANGED
@@ -11,6 +11,9 @@ cd "$PROJECT/data/10.clades/01.find"
|
|
11
11
|
# Initialize
|
12
12
|
miga date > "miga-project.start"
|
13
13
|
|
14
|
+
# Cleanup databases
|
15
|
+
ruby -I "$MIGA/lib" "$MIGA/utils/cleanup-databases.rb" "$PROJECT" "$CORES"
|
16
|
+
|
14
17
|
# Run
|
15
18
|
ruby -I "$MIGA/lib" "$MIGA/utils/subclades.rb" "$PROJECT" "$SCRIPT"
|
16
19
|
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'thread'
|
4
|
+
require 'miga'
|
5
|
+
|
6
|
+
ARGV[1] or abort "Usage: #{$0} path/to/project threads"
|
7
|
+
|
8
|
+
$stderr.puts "Cleaning databases..."
|
9
|
+
ds_list = MiGA::Project.load(ARGV[0]).datasets.
|
10
|
+
select(&:is_ref?).select(&:is_active?)
|
11
|
+
|
12
|
+
thr = ARGV[1].to_i
|
13
|
+
|
14
|
+
(0 .. thr-1).each do |t|
|
15
|
+
fork do
|
16
|
+
k = -1
|
17
|
+
ds_list.each do |i|
|
18
|
+
k = (k+1) % thr
|
19
|
+
next unless k == t
|
20
|
+
i.cleanup_distances!
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
Process.waitall
|
25
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.N50.pl
|
@@ -0,0 +1 @@
|
|
1
|
+
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.filterN.pl
|
@@ -0,0 +1 @@
|
|
1
|
+
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.length.pl
|
@@ -0,0 +1 @@
|
|
1
|
+
utils/enveomics/Pipelines/blast.pbs/../../Scripts/FastA.split.pl
|
@@ -0,0 +1 @@
|
|
1
|
+
utils/enveomics/Scripts/lib/../../enveomics.R
|
data/utils/subclade/pipeline.rb
CHANGED
@@ -26,16 +26,18 @@ module MiGA::SubcladeRunner::Pipeline
|
|
26
26
|
`ogs.mcl.rb -o '#{ogs_file}.tmp' --abc '#{abc_path}' -t '#{opts[:thr]}'`
|
27
27
|
File.open(ogs_file, 'w') do |fh|
|
28
28
|
File.foreach("#{ogs_file}.tmp").with_index do |ln, lno|
|
29
|
-
fh.puts ln if lno
|
29
|
+
fh.puts ln if lno > 0
|
30
30
|
end
|
31
31
|
end
|
32
32
|
File.unlink "#{ogs_file}.tmp"
|
33
33
|
end
|
34
34
|
|
35
35
|
# Find species medoids
|
36
|
-
|
37
|
-
|
38
|
-
miga-project.
|
36
|
+
if File.size? 'miga-project.dist.rdata'
|
37
|
+
src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
|
38
|
+
`Rscript '#{src}' miga-project.dist.rdata \
|
39
|
+
miga-project.ani95-medoids miga-project.ani95-clades`
|
40
|
+
end
|
39
41
|
|
40
42
|
# Propose clades
|
41
43
|
ofh = File.open('miga-project.proposed-clades', 'w')
|
data/utils/subclades.R
CHANGED
@@ -22,11 +22,11 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
|
22
22
|
if(length(ani.d) == 0 && !file.exists(dist_rdata)){
|
23
23
|
# Read from ani_file
|
24
24
|
a <- read.table(gzfile(ani_file), sep = '\t', header = TRUE, as.is = TRUE)
|
25
|
-
if(nrow(a)==0){
|
25
|
+
if(nrow(a) == 0){
|
26
26
|
generate_empty_files(out_base)
|
27
27
|
return(NULL)
|
28
28
|
}
|
29
|
-
if(!is.na(sel)
|
29
|
+
if(!is.na(sel) && file.exists(sel)){
|
30
30
|
say('Filter selection')
|
31
31
|
lab <- read.table(sel, sep='\t', head=FALSE, as.is=TRUE)[,1]
|
32
32
|
a <- a[a$a %in% lab & a$b %in% lab, ]
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.6.0
|
4
|
+
version: 0.3.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
@@ -188,6 +188,7 @@ files:
|
|
188
188
|
- test/test_helper.rb
|
189
189
|
- utils/adapters.fa
|
190
190
|
- utils/arch-ess-genes.rb
|
191
|
+
- utils/cleanup-databases.rb
|
191
192
|
- utils/core-pan-plot.R
|
192
193
|
- utils/distance/base.rb
|
193
194
|
- utils/distance/commands.rb
|
@@ -341,6 +342,7 @@ files:
|
|
341
342
|
- utils/enveomics/Scripts/gi2tax.rb
|
342
343
|
- utils/enveomics/Scripts/in_silico_GA_GI.pl
|
343
344
|
- utils/enveomics/Scripts/lib/data/essential.hmm.gz
|
345
|
+
- utils/enveomics/Scripts/lib/enveomics.R
|
344
346
|
- utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb
|
345
347
|
- utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb
|
346
348
|
- utils/enveomics/Scripts/lib/enveomics_rb/og.rb
|
@@ -495,7 +497,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
495
497
|
version: '0'
|
496
498
|
requirements: []
|
497
499
|
rubyforge_project:
|
498
|
-
rubygems_version: 2.
|
500
|
+
rubygems_version: 2.7.7
|
499
501
|
signing_key:
|
500
502
|
specification_version: 4
|
501
503
|
summary: MiGA
|
@@ -1,56 +0,0 @@
|
|
1
|
-
#!/usr/bin/env perl
|
2
|
-
#
|
3
|
-
# @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
-
# @update: Oct 07 2015
|
5
|
-
# @license: artistic license 2.0
|
6
|
-
#
|
7
|
-
use strict;
|
8
|
-
use warnings;
|
9
|
-
use List::Util qw/sum min max/;
|
10
|
-
|
11
|
-
my ($seqs, $minlen, $n__) = @ARGV;
|
12
|
-
$seqs or die "
|
13
|
-
Description:
|
14
|
-
Calculates the N50 value of a set of sequences. Alternatively, it
|
15
|
-
can calculate other N** values. It also calculates the total number
|
16
|
-
of sequences and the total added length.
|
17
|
-
|
18
|
-
Usage:
|
19
|
-
$0 seqs.fa[ minlen[ **]]
|
20
|
-
|
21
|
-
seqs.fa A FastA file containing the sequences.
|
22
|
-
minlen (optional) The minimum length to take into consideration.
|
23
|
-
By default: 0.
|
24
|
-
** Value N** to calculate. By default: 50 (N50).
|
25
|
-
";
|
26
|
-
$minlen ||= 0;
|
27
|
-
$n__ ||= 50;
|
28
|
-
|
29
|
-
my @len = ();
|
30
|
-
open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
|
31
|
-
while(<SEQ>){
|
32
|
-
if(/^>/){
|
33
|
-
push @len, 0;
|
34
|
-
}else{
|
35
|
-
next if /^;/;
|
36
|
-
chomp;
|
37
|
-
s/\W//g;
|
38
|
-
$len[-1]+=length $_;
|
39
|
-
}
|
40
|
-
}
|
41
|
-
close SEQ;
|
42
|
-
@len = sort { $a <=> $b } map { $_>=$minlen?$_:() } @len;
|
43
|
-
my $tot = (sum(@len) || 0);
|
44
|
-
|
45
|
-
my $thr = $n__*$tot/100;
|
46
|
-
my $pos = 0;
|
47
|
-
for(@len){
|
48
|
-
$pos+= $_;
|
49
|
-
if($pos>=$thr){
|
50
|
-
print "N$n__: $_\n";
|
51
|
-
last;
|
52
|
-
}
|
53
|
-
}
|
54
|
-
print "Sequences: ".scalar(@len)."\n";
|
55
|
-
print "Total length: $tot\n";
|
56
|
-
|
@@ -1,60 +0,0 @@
|
|
1
|
-
#!/usr/bin/env perl
|
2
|
-
#
|
3
|
-
# @author Luis M. Rodriguez-R
|
4
|
-
# @update Oct-07-2015
|
5
|
-
# @license artistic license 2.0
|
6
|
-
#
|
7
|
-
|
8
|
-
use warnings;
|
9
|
-
use strict;
|
10
|
-
|
11
|
-
my($file, $content, $stretch) = @ARGV;
|
12
|
-
$file or die <<HELP
|
13
|
-
|
14
|
-
Description:
|
15
|
-
Filter sequences by N-content and presence of long homopolymers.
|
16
|
-
Usage:
|
17
|
-
$0 sequences.fa [content [stretch]] > filtered.fa
|
18
|
-
Where:
|
19
|
-
sequences.fa Input file in FastA format
|
20
|
-
content A number between 0 and 1 indicating the maximum proportion of Ns
|
21
|
-
(1 to turn off, 0.5 by default)
|
22
|
-
stretch A number indicating the maximum number of consecutive identical
|
23
|
-
nucleotides allowed (0 to turn off, 100 by default)
|
24
|
-
filtered.fa Filtered set of sequences.
|
25
|
-
|
26
|
-
HELP
|
27
|
-
;
|
28
|
-
($content ||= 0.5)+=0;
|
29
|
-
($stretch ||= 100)+=0;
|
30
|
-
|
31
|
-
my $good = 0;
|
32
|
-
my $N = 0;
|
33
|
-
|
34
|
-
FASTA: {
|
35
|
-
local $/ = "\n>";
|
36
|
-
open FILE, "<", $file or die "I can not open the file: $file: $!\n";
|
37
|
-
SEQ: while(<FILE>){
|
38
|
-
$N++;
|
39
|
-
s/^;.*//gm;
|
40
|
-
s/>//g;
|
41
|
-
my($n,$s) = split /\n/, $_, 2;
|
42
|
-
(my $clean = $s) =~ s/[^ACTGN]//g;
|
43
|
-
if($content < 1){
|
44
|
-
(my $Ns = $clean) =~ s/[^N]//g;
|
45
|
-
next SEQ if length($Ns)>length($clean)*$content;
|
46
|
-
}
|
47
|
-
if($stretch > 0){
|
48
|
-
for my $nuc (qw(A C T G N)){
|
49
|
-
next SEQ if $clean =~ m/[$nuc]{$stretch}/;
|
50
|
-
}
|
51
|
-
}
|
52
|
-
print ">$n\n$s\n";
|
53
|
-
$good++;
|
54
|
-
}
|
55
|
-
close FILE;
|
56
|
-
print STDERR "Total sequences: $N\nAfter filtering: $good\n";
|
57
|
-
}
|
58
|
-
|
59
|
-
|
60
|
-
|
@@ -1,38 +0,0 @@
|
|
1
|
-
#!/usr/bin/env perl
|
2
|
-
#
|
3
|
-
# @author Luis M Rodriguez-R
|
4
|
-
# @update Oct-07-2015
|
5
|
-
# @license artistic license 2.0
|
6
|
-
#
|
7
|
-
|
8
|
-
use warnings;
|
9
|
-
use strict;
|
10
|
-
|
11
|
-
$#ARGV>=0 or die "
|
12
|
-
Usage:
|
13
|
-
$0 seqs.fa... > length.txt
|
14
|
-
|
15
|
-
seqs.fa One or more FastA files.
|
16
|
-
length.txt A table with the lengths of the sequences.
|
17
|
-
|
18
|
-
";
|
19
|
-
|
20
|
-
for my $fa (@ARGV){
|
21
|
-
open FA, "<", $fa or die "Cannot open file: $fa: $!\n";
|
22
|
-
my $def = '';
|
23
|
-
my $len = 0;
|
24
|
-
while(<FA>){
|
25
|
-
next if /^;/;
|
26
|
-
if(m/^>(\S+)\s?/){
|
27
|
-
print "$def\t$len\n" if $def;
|
28
|
-
$def = $1;
|
29
|
-
$len = 0;
|
30
|
-
}else{
|
31
|
-
s/[^A-Za-z]//g;
|
32
|
-
$len+= length $_;
|
33
|
-
}
|
34
|
-
}
|
35
|
-
print "$def\t$len\n" if $def;
|
36
|
-
close FA;
|
37
|
-
}
|
38
|
-
|
@@ -1,55 +0,0 @@
|
|
1
|
-
#!/usr/bin/env perl
|
2
|
-
#
|
3
|
-
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
-
# @update Oct-13-2015
|
5
|
-
# @license artistic license 2.0
|
6
|
-
#
|
7
|
-
|
8
|
-
use warnings;
|
9
|
-
use strict;
|
10
|
-
use Symbol;
|
11
|
-
|
12
|
-
my ($file, $base, $outN) = @ARGV;
|
13
|
-
|
14
|
-
$outN ||= 12;
|
15
|
-
($file and $base) or die "
|
16
|
-
Usage
|
17
|
-
$0 in_file.fa out_base[ no_files]
|
18
|
-
|
19
|
-
in_file.fa Input file in FastA format.
|
20
|
-
out_base Prefix for the name of the output files. It will
|
21
|
-
be appended with .<i>.fa, where <i> is a consecutive
|
22
|
-
number starting in 1.
|
23
|
-
no_files Number of files to generate. By default: 12.
|
24
|
-
|
25
|
-
";
|
26
|
-
|
27
|
-
|
28
|
-
my @outSym = ();
|
29
|
-
for my $i (1 .. $outN){
|
30
|
-
$outSym[$i-1] = gensym;
|
31
|
-
open $outSym[$i-1], ">", "$base.$i.fa" or
|
32
|
-
die "I can not create the file: $base.$i.fa: $!\n";
|
33
|
-
}
|
34
|
-
|
35
|
-
|
36
|
-
my($i, $seq) = (-1, '');
|
37
|
-
open FILE, "<", $file or die "I can not read the file: $file: $!\n";
|
38
|
-
while(my $ln=<FILE>){
|
39
|
-
next if $ln=~/^;/;
|
40
|
-
if($ln =~ m/^>/){
|
41
|
-
print { $outSym[$i % $outN] } $seq if $seq;
|
42
|
-
$i++;
|
43
|
-
$seq = '';
|
44
|
-
}
|
45
|
-
$seq.=$ln;
|
46
|
-
}
|
47
|
-
print { $outSym[$i % $outN] } $seq if $seq;
|
48
|
-
close FILE;
|
49
|
-
|
50
|
-
for(my $j=0; $j<$outN; $j++){
|
51
|
-
close $outSym[$j];
|
52
|
-
}
|
53
|
-
|
54
|
-
print STDERR "Sequences: ".($i+1)."\nFiles: $outN\n";
|
55
|
-
|