miga-base 0.2.1.4 → 0.2.1.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1dd8e19d8780562edc9d8bf94bb84e5f60487e54
4
- data.tar.gz: dd86f3de40811fb2862adef1ef5eddb128a9d4d0
3
+ metadata.gz: 5145851e5906de7cafb98f5c701c69caca6404cc
4
+ data.tar.gz: 745cb4dcc63e62c3d054cae99a37a22a6dbdb80e
5
5
  SHA512:
6
- metadata.gz: f93f2e32564193007777e83fb26583a96081eb607881167ba179ba25187ed4ee466897a188d5a81157d6287bcfff73d0690cb834842ebae182533049cc3d89f3
7
- data.tar.gz: a4fcfb6ef19264145afedf204ff9b412db4f9d32549a060d4dc8fab21eca6a0e2c13f8646307c8fff56ef902404246a226668a3045f128c2a494edfd57e46d36
6
+ metadata.gz: 71308be2992a9f78776f618afb2d8e8f3f4055d77d2cada696772130b2570468f7031fc3cee04371ebe721c8fd3e598d302cd20dda85fe4d43ddb850d1bdc213
7
+ data.tar.gz: 0ff19642a0dcd5a679adb9562988a731d631c811551400c02884b8a47ba39072f23efc1f5a1a651f87cdfd01d2c962464e2821965dc0a06b81202df2ee2f9d5b
@@ -16,6 +16,10 @@ OptionParser.new do |opt|
16
16
  "Owner of the dataset."){ |v| o[:user]=v }
17
17
  opt.on("-c", "--comments STRING",
18
18
  "Comments on the dataset."){ |v| o[:comments]=v }
19
+ opt.on("-m", "--metadata STRING",
20
+ "Metadata as key-value pairs separated by = and delimited by comma.",
21
+ "Values are saved as strings except for booleans (true / false) or nil."
22
+ ){ |v| o[:metadata]=v }
19
23
  opt.on("--update",
20
24
  "Updates the dataset if it already exists."){ o[:update]=true }
21
25
  opt_common(opt, o)
@@ -34,6 +38,20 @@ $stderr.puts "Loading dataset." unless o[:q]
34
38
  d = o[:update] ? p.dataset(o[:dataset]) :
35
39
  MiGA::Dataset.new(p, o[:dataset], o[:ref], {})
36
40
  raise "Dataset does not exist." if d.nil?
41
+ unless o[:metadata].nil?
42
+ o[:metadata].split(",").each do |pair|
43
+ (k,v) = pair.split("=")
44
+ case v
45
+ when "true"
46
+ v = true
47
+ when "false"
48
+ v = false
49
+ when "nil"
50
+ v = nil
51
+ end
52
+ d.metadata[k] = v
53
+ end
54
+ end
37
55
  [:type, :description, :user, :comments].each do |k|
38
56
  d.metadata[k]=o[k] unless o[k].nil?
39
57
  end
@@ -191,7 +191,9 @@ class MiGA::Dataset < MiGA::MiGA
191
191
  # execution order). This typically corresponds to the result used as the
192
192
  # initial input. Passes +save+ to #add_result.
193
193
  def first_preprocessing(save=false)
194
- @@PREPROCESSING_TASKS.find{ |t| not add_result(t, save).nil? }
194
+ @@PREPROCESSING_TASKS.find do |t|
195
+ not ignore_task?(t) and not add_result(t, save).nil?
196
+ end
195
197
  end
196
198
 
197
199
  ##
@@ -212,6 +214,7 @@ class MiGA::Dataset < MiGA::MiGA
212
214
  ##
213
215
  # Should I ignore +task+ for this dataset?
214
216
  def ignore_task?(task)
217
+ return !metadata["run_#{task}"] unless metadata["run_#{task}"].nil?
215
218
  ( (@@EXCLUDE_NOREF_TASKS.include?(task) and not is_ref?) or
216
219
  (@@ONLY_MULTI_TASKS.include?(task) and not is_multi?) or
217
220
  (@@ONLY_NONMULTI_TASKS.include?(task) and not is_nonmulti?))
@@ -107,8 +107,8 @@ class MiGA::Metadata < MiGA::MiGA
107
107
  v=v.miga_name if k==:name
108
108
  # Symbolize the special field :type
109
109
  v=v.to_sym if k==:type
110
- # Register and return
111
- @data[k]=v
110
+ # Delete if nil, register, and return
111
+ v.nil? ? @data.delete(k) : (@data[k]=v)
112
112
  end
113
113
 
114
114
  ##
@@ -130,7 +130,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
130
130
  else
131
131
  download("#{base}.LargeContigs.fna")
132
132
  end
133
- File.symlink("#{base}.LargeContigs.fna", "#{base}.AllContigs.fna")
133
+ File.symlink(
134
+ File.basename("#{base}.LargeContigs.fna"), "#{base}.AllContigs.fna")
134
135
  File.open("#{base}.done", "w") { |ofh| ofh.puts Time.now.to_s }
135
136
  else
136
137
  raise "Unexpected error: Unsupported result for database #{db}."
@@ -53,9 +53,9 @@ module MiGA::TaxDist
53
53
  out = {}
54
54
  meaning.each do |phrase, thresholds|
55
55
  lwr, upr = thresholds
56
- min = pv.values.select{ |v| v >= lwr }.min
56
+ min = pv.values.select{ |v| v < upr }.max
57
57
  return out if min.nil?
58
- if min < upr
58
+ if min >= lwr
59
59
  v = pv.select{ |_,v| v==min }
60
60
  out[phrase] = (test==:intax ? v.reverse_each : v).first
61
61
  end
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.2, 1, 4]
13
+ VERSION = [0.2, 1, 5]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
@@ -62,6 +62,19 @@ function haai {
62
62
  fi
63
63
  }
64
64
 
65
+ function haai_or_aai {
66
+ local FH1=$1
67
+ local FH2=$2
68
+ local DBH=$3
69
+ local F1=$4
70
+ local F2=$5
71
+ local DB=$6
72
+ local TH=$7
73
+ AAI=$(haai $FH1 $FH2 $TH $DBH $DB)
74
+ [[ "${AAI%.*}" -le 0 ]] && AAI=$(aai $F1 $F2 $TH $DB)
75
+ echo $AAI
76
+ }
77
+
65
78
  function val_from_db {
66
79
  local N1=$1
67
80
  local N2=$2
@@ -24,91 +24,73 @@ function checkpoint_n {
24
24
  fi
25
25
  }
26
26
 
27
+ function noref_haai_or_aai {
28
+ local Q=$1
29
+ local S=$2
30
+ haai_or_aai $ESS/$Q.ess.faa $ESS/$S.ess.faa $TMPDIR/$Q.haai.db \
31
+ ../06.cds/$Q.faa ../06.cds/$S.faa $TMPDIR/$Q.aai.db $CORES
32
+ }
33
+
34
+ function noref_ani {
35
+ local Q=$1
36
+ local S=$2
37
+ ani ../05.assembly/$Q.LargeContigs.fna ../05.assembly/$S.LargeContigs.fna \
38
+ $CORES $TMPDIR/$Q.ani.db
39
+ }
40
+
27
41
  ESS="../07.annotation/01.function/01.essential"
28
42
  if [[ $(miga project_info -P "$PROJECT" -m type) != "clade" ]] ; then
29
43
  # Classify aai-clade (if project type is not clade)
30
44
  CLADES="../10.clades/01.find"
31
- CLASSIF="."
32
- [[ -e "$DATASET.aai-medoids.tsv" ]] && rm "$DATASET.aai-medoids.tsv"
33
- while [[ -e "$CLADES/$CLASSIF/miga-project.medoids" ]] ; do
34
- MAX_AAI=0
35
- AAI_MED=""
36
- AAI_CLS=""
37
- i_n=0
38
- for i in $(cat "$CLADES/$CLASSIF/miga-project.medoids") ; do
39
- let i_n=$i_n+1
40
- AAI=$(haai $ESS/$DATASET.ess.faa $ESS/$i.ess.faa $CORES \
41
- $TMPDIR/$DATASET.haai.db $TMPDIR/$DATASET.aai.db)
42
- [[ "${AAI%.*}" -le 0 ]] \
43
- && AAI=$(aai ../06.cds/$DATASET.faa ../06.cds/$i.faa $CORES \
44
- $TMPDIR/$DATASET.aai.db)
45
- checkpoint_n
46
- if [[ $(perl -e "print 1 if '$AAI' >= '$MAX_AAI'") == "1" ]] ; then
47
- MAX_AAI=$AAI
48
- AAI_MED=$i
49
- AAI_CLS=$i_n
50
- echo "[$CLASSIF] New max: $AAI_MED ($AAI_CLS): $MAX_AAI"
51
- fi
52
- done
53
- CLASSIF="$CLASSIF/miga-project.sc-$AAI_CLS"
54
- echo "$AAI_CLS $AAI_MED $MAX_AAI $CLASSIF" \
55
- >> "$DATASET.aai-medoids.tsv"
56
- done
57
-
58
- # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
59
- if [[ "$CLASSIF" != "." ]] ; then
60
- PAR=$(dirname "$CLADES/$CLASSIF")/miga-project.classif
61
- if [[ -s "$PAR" ]] ; then
62
- for i in $(cat "$PAR" | awk "\$2==$AAI_CLS{print \$1}") ; do
63
- AAI=$(aai ../06.cds/$DATASET.faa ../06.cds/$i.faa $CORES \
64
- $TMPDIR/$DATASET.aai.db)
65
- if [[ $(perl -e "print 1 if '$AAI' >= 90") == "1" ]] ; then
66
- ani ../05.assembly/$DATASET.LargeContigs.fna \
67
- ../05.assembly/$i.LargeContigs.fna \
68
- $TMPDIR/$DATASET.ani.db >/dev/null
69
- fi
70
- checkpoint_n
71
- done
72
- fi
73
- fi
45
+ METRIC="aai"
74
46
  else
75
47
  # Classify ani-clade (if project type is clade)
76
48
  CLADES="../10.clades/02.ani"
77
- CLASSIF="."
78
- [[ -e "$DATASET.ani-medoids.tsv" ]] && rm "$DATASET.ani-medoids.tsv"
79
- while [[ -e "$CLADES/$CLASSIF/miga-project.medoids" ]] ; do
80
- MAX_ANI=0
81
- ANI_MED=""
82
- ANI_CLS=""
83
- i_n=0
84
- for i in $(cat "$CLADES/$CLASSIF/miga-project.medoids") ; do
85
- let i_n=$i_n+1
86
- ANI=$(ani ../05.assembly/$DATASET.LargeContigs.fna \
87
- ../05.assembly/$i.LargeContigs.fna $CORES $TMPDIR/$DATASET.ani.db)
88
- checkpoint_n
89
- if [[ $(perl -e "print 1 if '$ANI' >= '$MAX_ANI'") == "1" ]] ; then
90
- MAX_ANI=$ANI
91
- ANI_MED=$i
92
- ANI_CLS=$i_n
93
- echo "[$CLASSIF] New max: $ANI_MED ($ANI_CLS): $MAX_ANI"
94
- fi
95
- done
96
- CLASSIF="$CLASSIF/miga-project.sc-$ANI_CLS"
97
- echo "$ANI_CLS $ANI_MED $MAX_ANI $CLASSIF" \
98
- >> "$DATASET.ani-medoids.tsv"
99
- done
49
+ METRIC="ani"
50
+ fi
100
51
 
101
- # Calculate all the ANIs against the lowest subclade (if classified in-clade)
102
- if [[ "$CLASSIF" != "." ]] ; then
103
- PAR=$(dirname "$CLADES/$CLASSIF")/miga-project.classif
104
- if [[ -s "$CLADES/$CLASSIF/miga-project.all" ]] ; then
105
- for i in $(cat "$PAR" | awk "\$2==$ANI_CLS{print \$1}") ; do
106
- ani ../05.assembly/$DATASET.LargeContigs.fna \
107
- ../05.assembly/$i.LargeContigs.fna $CORES \
108
- $TMPDIR/$DATASET.ani.db > /dev/null
109
- checkpoint_n
110
- done
52
+ CLASSIF="."
53
+ [[ -e "$DATASET.$METRIC-medoids.tsv" ]] && rm "$DATASET.$METRIC-medoids.tsv"
54
+ while [[ -e "$CLADES/$CLASSIF/miga-project.medoids" ]] ; do
55
+ MAX_VAL=0
56
+ VAL_MED=""
57
+ VAL_CLS=""
58
+ i_n=0
59
+ for i in $(cat "$CLADES/$CLASSIF/miga-project.medoids") ; do
60
+ let i_n=$i_n+1
61
+ if [[ $METRIC == "aai" ]] ; then
62
+ VAL=$(noref_haai_or_aai $DATASET $i)
63
+ else
64
+ VAL=$(noref_ani $DATASET $i)
65
+ fi
66
+ checkpoint_n
67
+ if [[ $(perl -e "print 1 if '$VAL' >= '$MAX_VAL'") == "1" ]] ; then
68
+ MAX_VAL=$VAL
69
+ VAL_MED=$i
70
+ VAL_CLS=$i_n
71
+ echo "[$CLASSIF] New max: $VAL_MED ($VAL_CLS): $MAX_VAL"
111
72
  fi
73
+ done
74
+ CLASSIF="$CLASSIF/miga-project.sc-$VAL_CLS"
75
+ echo "$VAL_CLS $VAL_MED $MAX_VAL $CLASSIF" \
76
+ >> "$DATASET.$METRIC-medoids.tsv"
77
+ done
78
+
79
+ # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
80
+ if [[ "$CLASSIF" != "." ]] ; then
81
+ PAR=$(dirname "$CLADES/$CLASSIF")/miga-project.classif
82
+ if [[ -s "$PAR" ]] ; then
83
+ for i in $(cat "$PAR" | awk "\$2==$VAL_CLS{print \$1}") ; do
84
+ if [[ $METRIC == "aai" ]] ; then
85
+ AAI=$(noref_haai_or_aai $DATASET $i)
86
+ else
87
+ AAI=100
88
+ fi
89
+ if [[ $(perl -e "print 1 if '$AAI' >= 90") == "1" ]] ; then
90
+ noref_ani $DATASET $i
91
+ fi
92
+ checkpoint_n
93
+ done
112
94
  fi
113
95
  fi
114
96
 
@@ -21,15 +21,21 @@ for i in $DS ; do
21
21
  done
22
22
 
23
23
  # R-ify
24
- echo "
25
- haai <- read.table('miga-project.txt', sep='\\t', h=T);
26
- save(haai, file='miga-project.Rdata');
27
- h <- hist(haai[,'value'], breaks=100, plot=FALSE);
28
- write.table(
29
- cbind(h[['breaks']][-length(h[['breaks']])],h[['breaks']][-1],h[['counts']]),
30
- file='miga-project.hist', quote=FALSE, sep='\\t',
31
- col.names=FALSE, row.names=FALSE);
32
- " | R --vanilla
24
+ if true ; then
25
+ echo "
26
+ haai <- read.table('miga-project.txt', sep='\\t', h=T);
27
+ save(haai, file='miga-project.Rdata');"
28
+ if [[ $(cat miga-project.txt | wc -l) -gt 1 ]] ; then
29
+ echo "
30
+ h <- hist(haai[,'value'], breaks=100, plot=FALSE);
31
+ write.table(
32
+ cbind(h[['breaks']][-length(h[['breaks']])],
33
+ h[['breaks']][-1],h[['counts']]),
34
+ file='miga-project.hist', quote=FALSE, sep='\\t',
35
+ col.names=FALSE, row.names=FALSE);
36
+ "
37
+ fi
38
+ fi | R --vanilla
33
39
 
34
40
  # Gzip
35
41
  gzip -9 -f miga-project.txt
@@ -31,6 +31,9 @@ done
31
31
  ogs.mcl.rb -o miga-project.ogs -d miga-project.rbm -t $CORES
32
32
  ogs.stats.rb -o miga-project.ogs -j miga-project.stats
33
33
 
34
+ # Clean RBMs
35
+ rm -rf miga-project.rbm
36
+
34
37
  # Finalize
35
38
  date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
36
39
  miga add_result -P "$PROJECT" -r ogs
@@ -41,15 +41,16 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
41
41
  cl <- makeCluster(thr)
42
42
  s <- parSapply(cl, k, function(x) {
43
43
  library(cluster)
44
- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo$avg.width
44
+ s <- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
45
+ c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
45
46
  })
46
47
  stopCluster(cl)
47
- ds <- (s[-c(1,length(s))]-pmax(s[-length(s)+c(0,1)],s[-c(1,2)]))
48
- top.n <- head(k[order(c(-Inf,ds,-Inf), decreasing=T)], n=1)
48
+ ds <- s[1,]/s[2,]
49
+ top.n <- k[which.max(ds)]
49
50
 
50
51
  # Classify genomes
51
52
  say("Classify")
52
- ani.cl <- pam(ani.d, top.n)
53
+ ani.cl <- pam(ani.d, top.n, pamonce=1)
53
54
  ani.types <- ani.cl$clustering
54
55
  ani.medoids <- ani.cl$medoids
55
56
 
@@ -58,7 +59,7 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
58
59
  pdf(paste(out_base, ".pdf", sep=""), 7, 12)
59
60
  layout(1:4)
60
61
  plot_distances(ani.d)
61
- plot_silhouette(k, s, ds, top.n)
62
+ plot_silhouette(k, s[1,], s[2,], top.n)
62
63
  plot_clustering(ani.cl, ani.d, ani.types)
63
64
  plot_tree(ani.ph, ani.types, ani.medoids)
64
65
  dev.off()
@@ -67,6 +68,7 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
67
68
  say("Text report")
68
69
  write.table(ani.medoids, paste(out_base, "medoids", sep="."),
69
70
  quote=FALSE, col.names=FALSE, row.names=FALSE)
71
+ save(ani.d, file=paste(out_base, "dist.rdata", sep="."))
70
72
  classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
71
73
  for(j in 1:nrow(classif)){
72
74
  classif[j,4] <- 100 - as.matrix(ani.d)[classif[j,1], classif[j,3]]
@@ -75,6 +77,7 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
75
77
  quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
76
78
 
77
79
  # Recursive search
80
+ say("Recursive search")
78
81
  for(i in 1:top.n){
79
82
  medoid <- ani.medoids[i]
80
83
  ds_f <- names(ani.types)[ ani.types==i ]
@@ -109,13 +112,13 @@ plot_silhouette <- function(k, s, ds, top.n) {
109
112
  ylim=range(s), bty="n", xaxs="i", yaxt="n")
110
113
  polygon(c(k[1], k, k[length(k)]), c(0,s,0), border=NA, col="grey80")
111
114
  axis(2, fg="grey60", col.axis="grey60")
112
- mtext("Average silhouette", side=2, line=3, col="grey60")
115
+ mtext("Mean silhouette", side=2, line=3, col="grey60")
113
116
  par(new=TRUE)
114
117
  plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
115
118
  ylim=range(ds), bty="n", xaxs="i")
116
- points(k[-c(1,length(k))], ds, type="o", pch=16, col=rgb(1/2,0,0,3/4))
119
+ points(k, ds, type="o", pch=16, col=rgb(1/2,0,0,3/4))
117
120
  axis(4, fg="darkred", col.axis="darkred")
118
- mtext("Silhouette gain", side=4, line=3, col="darkred")
121
+ mtext("Negative silhouette area", side=4, line=3, col="darkred")
119
122
  abline(v=top.n, lty=2)
120
123
  }
121
124
 
@@ -129,7 +132,11 @@ plot_clustering <- function(cl, dist, types) {
129
132
  top.n <- length(cl$medoids)
130
133
  col <- ggplotColours(top.n)
131
134
  plot(silhouette(cl), col=col)
132
- clusplot(cl, dist=dist, main="", col.p=col[types], lines=0)
135
+ if(length(labels(dist))<=15){
136
+ plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
137
+ }else{
138
+ clusplot(cl, dist=dist, main="", col.p=col[types], lines=0)
139
+ }
133
140
  }
134
141
 
135
142
  plot_tree <- function(phy, types, medoids){
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1.4
4
+ version: 0.2.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R