miga-base 0.2.1.4 → 0.2.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1dd8e19d8780562edc9d8bf94bb84e5f60487e54
4
- data.tar.gz: dd86f3de40811fb2862adef1ef5eddb128a9d4d0
3
+ metadata.gz: 5145851e5906de7cafb98f5c701c69caca6404cc
4
+ data.tar.gz: 745cb4dcc63e62c3d054cae99a37a22a6dbdb80e
5
5
  SHA512:
6
- metadata.gz: f93f2e32564193007777e83fb26583a96081eb607881167ba179ba25187ed4ee466897a188d5a81157d6287bcfff73d0690cb834842ebae182533049cc3d89f3
7
- data.tar.gz: a4fcfb6ef19264145afedf204ff9b412db4f9d32549a060d4dc8fab21eca6a0e2c13f8646307c8fff56ef902404246a226668a3045f128c2a494edfd57e46d36
6
+ metadata.gz: 71308be2992a9f78776f618afb2d8e8f3f4055d77d2cada696772130b2570468f7031fc3cee04371ebe721c8fd3e598d302cd20dda85fe4d43ddb850d1bdc213
7
+ data.tar.gz: 0ff19642a0dcd5a679adb9562988a731d631c811551400c02884b8a47ba39072f23efc1f5a1a651f87cdfd01d2c962464e2821965dc0a06b81202df2ee2f9d5b
@@ -16,6 +16,10 @@ OptionParser.new do |opt|
16
16
  "Owner of the dataset."){ |v| o[:user]=v }
17
17
  opt.on("-c", "--comments STRING",
18
18
  "Comments on the dataset."){ |v| o[:comments]=v }
19
+ opt.on("-m", "--metadata STRING",
20
+ "Metadata as key-value pairs separated by = and delimited by comma.",
21
+ "Values are saved as strings except for booleans (true / false) or nil."
22
+ ){ |v| o[:metadata]=v }
19
23
  opt.on("--update",
20
24
  "Updates the dataset if it already exists."){ o[:update]=true }
21
25
  opt_common(opt, o)
@@ -34,6 +38,20 @@ $stderr.puts "Loading dataset." unless o[:q]
34
38
  d = o[:update] ? p.dataset(o[:dataset]) :
35
39
  MiGA::Dataset.new(p, o[:dataset], o[:ref], {})
36
40
  raise "Dataset does not exist." if d.nil?
41
+ unless o[:metadata].nil?
42
+ o[:metadata].split(",").each do |pair|
43
+ (k,v) = pair.split("=")
44
+ case v
45
+ when "true"
46
+ v = true
47
+ when "false"
48
+ v = false
49
+ when "nil"
50
+ v = nil
51
+ end
52
+ d.metadata[k] = v
53
+ end
54
+ end
37
55
  [:type, :description, :user, :comments].each do |k|
38
56
  d.metadata[k]=o[k] unless o[k].nil?
39
57
  end
@@ -191,7 +191,9 @@ class MiGA::Dataset < MiGA::MiGA
191
191
  # execution order). This typically corresponds to the result used as the
192
192
  # initial input. Passes +save+ to #add_result.
193
193
  def first_preprocessing(save=false)
194
- @@PREPROCESSING_TASKS.find{ |t| not add_result(t, save).nil? }
194
+ @@PREPROCESSING_TASKS.find do |t|
195
+ not ignore_task?(t) and not add_result(t, save).nil?
196
+ end
195
197
  end
196
198
 
197
199
  ##
@@ -212,6 +214,7 @@ class MiGA::Dataset < MiGA::MiGA
212
214
  ##
213
215
  # Should I ignore +task+ for this dataset?
214
216
  def ignore_task?(task)
217
+ return !metadata["run_#{task}"] unless metadata["run_#{task}"].nil?
215
218
  ( (@@EXCLUDE_NOREF_TASKS.include?(task) and not is_ref?) or
216
219
  (@@ONLY_MULTI_TASKS.include?(task) and not is_multi?) or
217
220
  (@@ONLY_NONMULTI_TASKS.include?(task) and not is_nonmulti?))
@@ -107,8 +107,8 @@ class MiGA::Metadata < MiGA::MiGA
107
107
  v=v.miga_name if k==:name
108
108
  # Symbolize the special field :type
109
109
  v=v.to_sym if k==:type
110
- # Register and return
111
- @data[k]=v
110
+ # Delete if nil, register, and return
111
+ v.nil? ? @data.delete(k) : (@data[k]=v)
112
112
  end
113
113
 
114
114
  ##
@@ -130,7 +130,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
130
130
  else
131
131
  download("#{base}.LargeContigs.fna")
132
132
  end
133
- File.symlink("#{base}.LargeContigs.fna", "#{base}.AllContigs.fna")
133
+ File.symlink(
134
+ File.basename("#{base}.LargeContigs.fna"), "#{base}.AllContigs.fna")
134
135
  File.open("#{base}.done", "w") { |ofh| ofh.puts Time.now.to_s }
135
136
  else
136
137
  raise "Unexpected error: Unsupported result for database #{db}."
@@ -53,9 +53,9 @@ module MiGA::TaxDist
53
53
  out = {}
54
54
  meaning.each do |phrase, thresholds|
55
55
  lwr, upr = thresholds
56
- min = pv.values.select{ |v| v >= lwr }.min
56
+ min = pv.values.select{ |v| v < upr }.max
57
57
  return out if min.nil?
58
- if min < upr
58
+ if min >= lwr
59
59
  v = pv.select{ |_,v| v==min }
60
60
  out[phrase] = (test==:intax ? v.reverse_each : v).first
61
61
  end
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.2, 1, 4]
13
+ VERSION = [0.2, 1, 5]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
@@ -62,6 +62,19 @@ function haai {
62
62
  fi
63
63
  }
64
64
 
65
+ function haai_or_aai {
66
+ local FH1=$1
67
+ local FH2=$2
68
+ local DBH=$3
69
+ local F1=$4
70
+ local F2=$5
71
+ local DB=$6
72
+ local TH=$7
73
+ AAI=$(haai $FH1 $FH2 $TH $DBH $DB)
74
+ [[ "${AAI%.*}" -le 0 ]] && AAI=$(aai $F1 $F2 $TH $DB)
75
+ echo $AAI
76
+ }
77
+
65
78
  function val_from_db {
66
79
  local N1=$1
67
80
  local N2=$2
@@ -24,91 +24,73 @@ function checkpoint_n {
24
24
  fi
25
25
  }
26
26
 
27
+ function noref_haai_or_aai {
28
+ local Q=$1
29
+ local S=$2
30
+ haai_or_aai $ESS/$Q.ess.faa $ESS/$S.ess.faa $TMPDIR/$Q.haai.db \
31
+ ../06.cds/$Q.faa ../06.cds/$S.faa $TMPDIR/$Q.aai.db $CORES
32
+ }
33
+
34
+ function noref_ani {
35
+ local Q=$1
36
+ local S=$2
37
+ ani ../05.assembly/$Q.LargeContigs.fna ../05.assembly/$S.LargeContigs.fna \
38
+ $CORES $TMPDIR/$Q.ani.db
39
+ }
40
+
27
41
  ESS="../07.annotation/01.function/01.essential"
28
42
  if [[ $(miga project_info -P "$PROJECT" -m type) != "clade" ]] ; then
29
43
  # Classify aai-clade (if project type is not clade)
30
44
  CLADES="../10.clades/01.find"
31
- CLASSIF="."
32
- [[ -e "$DATASET.aai-medoids.tsv" ]] && rm "$DATASET.aai-medoids.tsv"
33
- while [[ -e "$CLADES/$CLASSIF/miga-project.medoids" ]] ; do
34
- MAX_AAI=0
35
- AAI_MED=""
36
- AAI_CLS=""
37
- i_n=0
38
- for i in $(cat "$CLADES/$CLASSIF/miga-project.medoids") ; do
39
- let i_n=$i_n+1
40
- AAI=$(haai $ESS/$DATASET.ess.faa $ESS/$i.ess.faa $CORES \
41
- $TMPDIR/$DATASET.haai.db $TMPDIR/$DATASET.aai.db)
42
- [[ "${AAI%.*}" -le 0 ]] \
43
- && AAI=$(aai ../06.cds/$DATASET.faa ../06.cds/$i.faa $CORES \
44
- $TMPDIR/$DATASET.aai.db)
45
- checkpoint_n
46
- if [[ $(perl -e "print 1 if '$AAI' >= '$MAX_AAI'") == "1" ]] ; then
47
- MAX_AAI=$AAI
48
- AAI_MED=$i
49
- AAI_CLS=$i_n
50
- echo "[$CLASSIF] New max: $AAI_MED ($AAI_CLS): $MAX_AAI"
51
- fi
52
- done
53
- CLASSIF="$CLASSIF/miga-project.sc-$AAI_CLS"
54
- echo "$AAI_CLS $AAI_MED $MAX_AAI $CLASSIF" \
55
- >> "$DATASET.aai-medoids.tsv"
56
- done
57
-
58
- # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
59
- if [[ "$CLASSIF" != "." ]] ; then
60
- PAR=$(dirname "$CLADES/$CLASSIF")/miga-project.classif
61
- if [[ -s "$PAR" ]] ; then
62
- for i in $(cat "$PAR" | awk "\$2==$AAI_CLS{print \$1}") ; do
63
- AAI=$(aai ../06.cds/$DATASET.faa ../06.cds/$i.faa $CORES \
64
- $TMPDIR/$DATASET.aai.db)
65
- if [[ $(perl -e "print 1 if '$AAI' >= 90") == "1" ]] ; then
66
- ani ../05.assembly/$DATASET.LargeContigs.fna \
67
- ../05.assembly/$i.LargeContigs.fna \
68
- $TMPDIR/$DATASET.ani.db >/dev/null
69
- fi
70
- checkpoint_n
71
- done
72
- fi
73
- fi
45
+ METRIC="aai"
74
46
  else
75
47
  # Classify ani-clade (if project type is clade)
76
48
  CLADES="../10.clades/02.ani"
77
- CLASSIF="."
78
- [[ -e "$DATASET.ani-medoids.tsv" ]] && rm "$DATASET.ani-medoids.tsv"
79
- while [[ -e "$CLADES/$CLASSIF/miga-project.medoids" ]] ; do
80
- MAX_ANI=0
81
- ANI_MED=""
82
- ANI_CLS=""
83
- i_n=0
84
- for i in $(cat "$CLADES/$CLASSIF/miga-project.medoids") ; do
85
- let i_n=$i_n+1
86
- ANI=$(ani ../05.assembly/$DATASET.LargeContigs.fna \
87
- ../05.assembly/$i.LargeContigs.fna $CORES $TMPDIR/$DATASET.ani.db)
88
- checkpoint_n
89
- if [[ $(perl -e "print 1 if '$ANI' >= '$MAX_ANI'") == "1" ]] ; then
90
- MAX_ANI=$ANI
91
- ANI_MED=$i
92
- ANI_CLS=$i_n
93
- echo "[$CLASSIF] New max: $ANI_MED ($ANI_CLS): $MAX_ANI"
94
- fi
95
- done
96
- CLASSIF="$CLASSIF/miga-project.sc-$ANI_CLS"
97
- echo "$ANI_CLS $ANI_MED $MAX_ANI $CLASSIF" \
98
- >> "$DATASET.ani-medoids.tsv"
99
- done
49
+ METRIC="ani"
50
+ fi
100
51
 
101
- # Calculate all the ANIs against the lowest subclade (if classified in-clade)
102
- if [[ "$CLASSIF" != "." ]] ; then
103
- PAR=$(dirname "$CLADES/$CLASSIF")/miga-project.classif
104
- if [[ -s "$CLADES/$CLASSIF/miga-project.all" ]] ; then
105
- for i in $(cat "$PAR" | awk "\$2==$ANI_CLS{print \$1}") ; do
106
- ani ../05.assembly/$DATASET.LargeContigs.fna \
107
- ../05.assembly/$i.LargeContigs.fna $CORES \
108
- $TMPDIR/$DATASET.ani.db > /dev/null
109
- checkpoint_n
110
- done
52
+ CLASSIF="."
53
+ [[ -e "$DATASET.$METRIC-medoids.tsv" ]] && rm "$DATASET.$METRIC-medoids.tsv"
54
+ while [[ -e "$CLADES/$CLASSIF/miga-project.medoids" ]] ; do
55
+ MAX_VAL=0
56
+ VAL_MED=""
57
+ VAL_CLS=""
58
+ i_n=0
59
+ for i in $(cat "$CLADES/$CLASSIF/miga-project.medoids") ; do
60
+ let i_n=$i_n+1
61
+ if [[ $METRIC == "aai" ]] ; then
62
+ VAL=$(noref_haai_or_aai $DATASET $i)
63
+ else
64
+ VAL=$(noref_ani $DATASET $i)
65
+ fi
66
+ checkpoint_n
67
+ if [[ $(perl -e "print 1 if '$VAL' >= '$MAX_VAL'") == "1" ]] ; then
68
+ MAX_VAL=$VAL
69
+ VAL_MED=$i
70
+ VAL_CLS=$i_n
71
+ echo "[$CLASSIF] New max: $VAL_MED ($VAL_CLS): $MAX_VAL"
111
72
  fi
73
+ done
74
+ CLASSIF="$CLASSIF/miga-project.sc-$VAL_CLS"
75
+ echo "$VAL_CLS $VAL_MED $MAX_VAL $CLASSIF" \
76
+ >> "$DATASET.$METRIC-medoids.tsv"
77
+ done
78
+
79
+ # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
80
+ if [[ "$CLASSIF" != "." ]] ; then
81
+ PAR=$(dirname "$CLADES/$CLASSIF")/miga-project.classif
82
+ if [[ -s "$PAR" ]] ; then
83
+ for i in $(cat "$PAR" | awk "\$2==$VAL_CLS{print \$1}") ; do
84
+ if [[ $METRIC == "aai" ]] ; then
85
+ AAI=$(noref_haai_or_aai $DATASET $i)
86
+ else
87
+ AAI=100
88
+ fi
89
+ if [[ $(perl -e "print 1 if '$AAI' >= 90") == "1" ]] ; then
90
+ noref_ani $DATASET $i
91
+ fi
92
+ checkpoint_n
93
+ done
112
94
  fi
113
95
  fi
114
96
 
@@ -21,15 +21,21 @@ for i in $DS ; do
21
21
  done
22
22
 
23
23
  # R-ify
24
- echo "
25
- haai <- read.table('miga-project.txt', sep='\\t', h=T);
26
- save(haai, file='miga-project.Rdata');
27
- h <- hist(haai[,'value'], breaks=100, plot=FALSE);
28
- write.table(
29
- cbind(h[['breaks']][-length(h[['breaks']])],h[['breaks']][-1],h[['counts']]),
30
- file='miga-project.hist', quote=FALSE, sep='\\t',
31
- col.names=FALSE, row.names=FALSE);
32
- " | R --vanilla
24
+ if true ; then
25
+ echo "
26
+ haai <- read.table('miga-project.txt', sep='\\t', h=T);
27
+ save(haai, file='miga-project.Rdata');"
28
+ if [[ $(cat miga-project.txt | wc -l) -gt 1 ]] ; then
29
+ echo "
30
+ h <- hist(haai[,'value'], breaks=100, plot=FALSE);
31
+ write.table(
32
+ cbind(h[['breaks']][-length(h[['breaks']])],
33
+ h[['breaks']][-1],h[['counts']]),
34
+ file='miga-project.hist', quote=FALSE, sep='\\t',
35
+ col.names=FALSE, row.names=FALSE);
36
+ "
37
+ fi
38
+ fi | R --vanilla
33
39
 
34
40
  # Gzip
35
41
  gzip -9 -f miga-project.txt
@@ -31,6 +31,9 @@ done
31
31
  ogs.mcl.rb -o miga-project.ogs -d miga-project.rbm -t $CORES
32
32
  ogs.stats.rb -o miga-project.ogs -j miga-project.stats
33
33
 
34
+ # Clean RBMs
35
+ rm -rf miga-project.rbm
36
+
34
37
  # Finalize
35
38
  date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
36
39
  miga add_result -P "$PROJECT" -r ogs
@@ -41,15 +41,16 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
41
41
  cl <- makeCluster(thr)
42
42
  s <- parSapply(cl, k, function(x) {
43
43
  library(cluster)
44
- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo$avg.width
44
+ s <- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
45
+ c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
45
46
  })
46
47
  stopCluster(cl)
47
- ds <- (s[-c(1,length(s))]-pmax(s[-length(s)+c(0,1)],s[-c(1,2)]))
48
- top.n <- head(k[order(c(-Inf,ds,-Inf), decreasing=T)], n=1)
48
+ ds <- s[1,]/s[2,]
49
+ top.n <- k[which.max(ds)]
49
50
 
50
51
  # Classify genomes
51
52
  say("Classify")
52
- ani.cl <- pam(ani.d, top.n)
53
+ ani.cl <- pam(ani.d, top.n, pamonce=1)
53
54
  ani.types <- ani.cl$clustering
54
55
  ani.medoids <- ani.cl$medoids
55
56
 
@@ -58,7 +59,7 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
58
59
  pdf(paste(out_base, ".pdf", sep=""), 7, 12)
59
60
  layout(1:4)
60
61
  plot_distances(ani.d)
61
- plot_silhouette(k, s, ds, top.n)
62
+ plot_silhouette(k, s[1,], s[2,], top.n)
62
63
  plot_clustering(ani.cl, ani.d, ani.types)
63
64
  plot_tree(ani.ph, ani.types, ani.medoids)
64
65
  dev.off()
@@ -67,6 +68,7 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
67
68
  say("Text report")
68
69
  write.table(ani.medoids, paste(out_base, "medoids", sep="."),
69
70
  quote=FALSE, col.names=FALSE, row.names=FALSE)
71
+ save(ani.d, file=paste(out_base, "dist.rdata", sep="."))
70
72
  classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
71
73
  for(j in 1:nrow(classif)){
72
74
  classif[j,4] <- 100 - as.matrix(ani.d)[classif[j,1], classif[j,3]]
@@ -75,6 +77,7 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
75
77
  quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
76
78
 
77
79
  # Recursive search
80
+ say("Recursive search")
78
81
  for(i in 1:top.n){
79
82
  medoid <- ani.medoids[i]
80
83
  ds_f <- names(ani.types)[ ani.types==i ]
@@ -109,13 +112,13 @@ plot_silhouette <- function(k, s, ds, top.n) {
109
112
  ylim=range(s), bty="n", xaxs="i", yaxt="n")
110
113
  polygon(c(k[1], k, k[length(k)]), c(0,s,0), border=NA, col="grey80")
111
114
  axis(2, fg="grey60", col.axis="grey60")
112
- mtext("Average silhouette", side=2, line=3, col="grey60")
115
+ mtext("Mean silhouette", side=2, line=3, col="grey60")
113
116
  par(new=TRUE)
114
117
  plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
115
118
  ylim=range(ds), bty="n", xaxs="i")
116
- points(k[-c(1,length(k))], ds, type="o", pch=16, col=rgb(1/2,0,0,3/4))
119
+ points(k, ds, type="o", pch=16, col=rgb(1/2,0,0,3/4))
117
120
  axis(4, fg="darkred", col.axis="darkred")
118
- mtext("Silhouette gain", side=4, line=3, col="darkred")
121
+ mtext("Negative silhouette area", side=4, line=3, col="darkred")
119
122
  abline(v=top.n, lty=2)
120
123
  }
121
124
 
@@ -129,7 +132,11 @@ plot_clustering <- function(cl, dist, types) {
129
132
  top.n <- length(cl$medoids)
130
133
  col <- ggplotColours(top.n)
131
134
  plot(silhouette(cl), col=col)
132
- clusplot(cl, dist=dist, main="", col.p=col[types], lines=0)
135
+ if(length(labels(dist))<=15){
136
+ plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
137
+ }else{
138
+ clusplot(cl, dist=dist, main="", col.p=col[types], lines=0)
139
+ }
133
140
  }
134
141
 
135
142
  plot_tree <- function(phy, types, medoids){
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1.4
4
+ version: 0.2.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R