rbbt-study 0.2.19 → 0.2.20
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/lib/rbbt/entity/study/genotypes.rb +28 -0
- data/lib/rbbt/entity/study.rb +1 -0
- data/share/R/data.R +76 -0
- data/share/R/plots.R +122 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MmU0YmUxZDUxOTZhMmU1MjJlMmI3MWUwYzU4NzYxODY1NWZhNjk0NQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NGI4ODVjZThmOTk5ODI2NTRhYzRlN2RlNDA0NTc0ZWIyNDk4NGYzOA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MjI1Yzc5YmZjM2ZhY2NmNzI0Zjk3MTZhNTFlMzM0YWZkZWZkZjEwNDVkZGI0
|
10
|
+
NjVkM2Q4ZTY5OGQwZjkyMDRkYWM4Yzg0YmU3MWVjNzY4OGNjNWNiMDM2M2U4
|
11
|
+
YTQxYmZiMDQwM2VjNzI3MTNjOTgzNjdmNzg5ZTQ2NDM4OTJmYTM=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MTk5MjBlZTc1NDQ4OTZiNjE0M2VlZGVkYWE0YWZiM2VhYzJkNjk0MjhjYTI2
|
14
|
+
OTc2YzhiZTAxMTBmOWQwZGI0OGNiZjM5NmZiZjk3ODMxMjYwNzYzNDJmZGRj
|
15
|
+
ZTMyY2EzZjg1Y2I2MDkwNzM0OGRhOGJkYTllMmVmNTEwNzQzODY=
|
@@ -13,6 +13,34 @@ module StudyWorkflow
|
|
13
13
|
study.metadata[:organism]
|
14
14
|
end
|
15
15
|
|
16
|
+
# Scores every gene found in the :mutation_genes index with a one-sided
# ('greater') binomial test computed through RSRuby.
#
# Returns a TSV keyed by "Ensembl Gene ID" with the fields "Matches", "Bases",
# "Frequency" and "p.value". The background probability handed to the test is
# the number of distinct mutations in the index divided by the value that
# Gene.gene_list_exon_bases returns for all the indexed genes. Genes with no
# entries in :sample_mutations, or with an exon size of 0, are left out.
task :binomial_significance => :tsv do

  result = TSV.setup({}, :key_field => "Ensembl Gene ID", :fields => ["Matches", "Bases", "Frequency", "p.value"], :namespace => organism)

  # Each index key is split around its first "~": the head is treated as the
  # mutation and the tail as the gene.
  index_pairs = study.knowledge_base.get_index(:mutation_genes).keys.map { |key| key.partition("~") }
  genes = index_pairs.map { |pair| pair.last }.uniq
  all_mutations = index_pairs.map { |pair| pair.first }.uniq

  total_bases = Gene.gene_list_exon_bases(genes)
  global_frequency = all_mutations.length.to_f / total_bases

  # Per-gene exon size, looked up one gene at a time.
  gene2exon_size = Misc.process_to_hash(genes) do |gene_list|
    gene_list.map { |single_gene| Gene.gene_list_exon_bases([single_gene]) }
  end

  genes.each do |gene|
    gene_mutations = study.knowledge_base.parents(:mutation_genes, gene).target
    observed = study.knowledge_base.subset(:sample_mutations, "Genomic Mutation" => gene_mutations).source
    next if observed.empty?

    exon_bases = gene2exon_size[gene]
    next if exon_bases == 0

    match_count = observed.length
    frequency = match_count.to_f / exon_bases
    pvalue = RSRuby.instance.binom_test(match_count, exon_bases, global_frequency, 'greater')["p.value"]

    result[gene] = [match_count, exon_bases, frequency, pvalue]
  end

  result
end
|
43
|
+
|
16
44
|
task :genotype_overview => :tsv do
|
17
45
|
gene_overview = TSV.setup({},
|
18
46
|
:key_field => "Ensembl Gene ID",
|
data/lib/rbbt/entity/study.rb
CHANGED
data/share/R/data.R
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
# Builds the sample-by-gene mutation table for a study.
#
# Runs the embedded Ruby script through rbbt.ruby.substitutions, with the
# STUDY placeholder replaced by `study`. The script fills one row per entry of
# study.cohort (keyed by its jobname) and one field per relevant gene, holding
# 'TRUE' when the gene is among that entry's mutated genes and 'FALSE'
# otherwise.
#
# study: study name substituted into the script.
# Returns (invisibly) whatever rbbt.ruby.substitutions produces for the TSV.
rbbt.SE.sample.mutated.genes <- function(study){
    ruby.script <- "
require 'rbbt/workflow'
Workflow.require_workflow 'StudyExplorer'

YAML::ENGINE.yamler = 'syck' if defined? YAML::ENGINE and YAML::ENGINE.respond_to? :yamler

Log.severity = 0

study = Study.setup('STUDY')

relevant_genes = study.job(:relevant_genes, study).run.uniq

tsv = TSV.setup({}, :key_field => 'Sample', :fields => relevant_genes.name, :type => :single)

study.cohort.each do |genotype|
sample = genotype.jobname
mutated_genes = genotype.genes.compact.flatten.uniq
tsv[sample] = relevant_genes.collect{|gene| mutated_genes.include?(gene)? 'TRUE' : 'FALSE' }
end

tsv
"

    # The assignment is the last expression, so the value is returned invisibly.
    mutation.table <- rbbt.ruby.substitutions(ruby.script, substitutions = list(STUDY=study))
}
|
26
|
+
|
27
|
+
# Builds a gene-by-KEGG-pathway membership table.
#
# `genes` is serialised with rbbt.a.to.string and substituted for GENE_STR in
# the embedded Ruby script, which sets the genes up as 'Associated Gene Name'
# entities for 'Hsa/jun2011' and marks, for every pathway found for any of the
# genes, 1 when the gene belongs to it and 0 otherwise.
#
# genes: gene names to look up.
# Returns the table from rbbt.ruby.substitutions with an extra `Gene` column
# copied from its row names.
rbbt.SE.gene.kegg.pathway <- function(genes){
    gene_str <- rbbt.a.to.string(genes)

    ruby.script <- "
require 'rbbt/entity/gene'
require 'rbbt/sources/kegg'

YAML::ENGINE.yamler = 'syck' if defined? YAML::ENGINE and YAML::ENGINE.respond_to? :yamler

Log.severity=0

genes = [GENE_STR];

Gene.setup(genes, 'Associated Gene Name', 'Hsa/jun2011')

pathways = genes.kegg_pathways.compact.flatten.uniq

gene_pathways = {}
genes.each do |gene|
gene_pathway_list = gene.kegg_pathways || []
gene_pathways[gene] = pathways.collect{|p| gene_pathway_list.include?(p) ? 1 : 0 }
end

tsv = TSV.setup(gene_pathways, :key_field => 'Associated Gene Name', :fields => [pathways], :type => :flat)
"

    membership <- rbbt.ruby.substitutions(ruby.script, substitutions = list(GENE_STR=gene_str))

    # Expose the row names as a regular column.
    membership$Gene <- rownames(membership)

    return(membership)
}
|
57
|
+
|
58
|
+
# Fetches the samples of a study.
#
# Runs the embedded Ruby script through rbbt.ruby.substitutions with the STUDY
# placeholder replaced by `study`; the script evaluates to `study.samples`.
#
# study: study name substituted into the script.
# Returns whatever rbbt.ruby.substitutions produces for that value.
rbbt.SE.study.samples <- function(study){
    ruby.script <- "
require 'rbbt/workflow'
Workflow.require_workflow 'StudyExplorer'

YAML::ENGINE.yamler = 'syck' if defined? YAML::ENGINE and YAML::ENGINE.respond_to? :yamler

Log.severity = 0

study = Study.setup('STUDY')

study.samples
"

    return(rbbt.ruby.substitutions(ruby.script, substitutions = list(STUDY=study)))
}
|
75
|
+
|
76
|
+
|
data/share/R/plots.R
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
# Reorders the `field` and `Sample` factors of a plot's data.
#
# The levels of `field` are ordered by the per-level sum of `Mutated`. Each
# sample then receives, as `sample.best.gene.pos`, the smallest position (in
# the reversed level order) among the levels where it is mutated, and `Sample`
# is reordered by that value.
#
# plot:  object whose `$data` has `Sample`, `Mutated` and `field` columns.
# field: name of the column to rank.
# Returns `plot` with the reordered `$data`.
rbbt.SE.plot.sort.by.field <- function(plot, field){
    plot.data <- plot$data

    plot.data[[field]] <- reorder(plot.data[[field]], plot.data$Mutated, sum)

    # reorder() sorts ascending, so the reversed levels start at the largest sum.
    ranked.levels <- rev(levels(plot.data[[field]]))

    best.position <- function(sample.rows){
        mutated.levels <- subset(sample.rows, Mutated==TRUE)[[field]]
        min(match(mutated.levels, ranked.levels), na.rm=TRUE)
    }

    sample.rank <- ddply(plot.data, "Sample", best.position)
    names(sample.rank) <- c("Sample", "sample.best.gene.pos")

    # Drop any previous score so the merge does not try to match on it.
    plot.data$sample.best.gene.pos <- NULL
    plot.data <- merge(plot.data, sample.rank, all.x=TRUE)

    plot.data$Sample <- reorder(plot.data$Sample, plot.data$sample.best.gene.pos)

    plot$data <- plot.data

    return(plot)
}
|
19
|
+
|
20
|
+
# Reorders the `Gene` and `Sample` factors of a plot's data by mutation load.
#
# Genes are ordered by the sum of `Mutated`. Each sample is scored with
# 1/sum(2^(num.elems - position)) over the genes where it is mutated, where
# position 1 is the gene with the largest sum; `Sample` is reordered by that
# score, stored as `sample.best.gene.pos`.
#
# plot: object whose `$data` has `Sample`, `Gene` and `Mutated` columns.
# Returns `plot` with the reordered `$data`.
rbbt.SE.plot.sort.by.mutations <- function(plot){
    plot.data <- plot$data

    plot.data$Gene <- reorder(plot.data$Gene, plot.data$Mutated, sum)

    # reorder() sorts ascending, so the reversed levels start at the largest sum.
    ranked.genes <- rev(levels(plot.data$Gene))
    num.elems <- length(ranked.genes)

    #sample.best.gene.pos.df = ddply(d, "Sample", function(x){ 1/mean(1/match(subset(x, Mutated==TRUE)$Gene, rev(levels(d$Gene)))^2)})
    sample.score <- function(sample.rows){
        positions <- match(subset(sample.rows, Mutated==TRUE)$Gene, ranked.genes)
        1/sum(2^(num.elems - positions))
    }

    sample.rank <- ddply(plot.data, "Sample", sample.score)
    names(sample.rank) <- c("Sample", "sample.best.gene.pos")

    # Drop any previous score so the merge does not try to match on it.
    plot.data$sample.best.gene.pos <- NULL
    plot.data <- merge(plot.data, sample.rank, all.x=TRUE)

    plot.data$Sample <- reorder(plot.data$Sample, plot.data$sample.best.gene.pos)

    plot$data <- plot.data

    return(plot)
}
|
40
|
+
|
41
|
+
# Reorders the `Pathway` and `Sample` factors of a plot's data by mutation load.
#
# Pathways are ordered by the sum of `Mutated`. Each sample is scored with
# 1/sum(2^(num.elems - position)) over the pathways where it is mutated, where
# position 1 is the pathway with the largest sum; `Sample` is reordered by that
# score, stored as `sample.best.gene.pos`.
#
# plot: object whose `$data` has `Sample`, `Pathway` and `Mutated` columns.
# Returns `plot` with the reordered `$data`.
rbbt.SE.plot.sort.by.pathway.mutations <- function(plot){
    plot.data <- plot$data

    plot.data$Pathway <- reorder(plot.data$Pathway, plot.data$Mutated, sum)

    # reorder() sorts ascending, so the reversed levels start at the largest sum.
    ranked.pathways <- rev(levels(plot.data$Pathway))
    num.elems <- length(ranked.pathways)

    #sample.best.gene.pos.df = ddply(d, "Sample", function(x){ min(match(subset(x, Mutated==TRUE)$Pathway, rev(levels(d$Pathway))), na.rm=T)})
    #sample.best.gene.pos.df = ddply(d, "Sample", function(x){ 1/mean(1/match(subset(x, Mutated==TRUE)$Pathway, rev(levels(d$Pathway)))^2)})
    sample.score <- function(sample.rows){
        positions <- match(subset(sample.rows, Mutated==TRUE)$Pathway, ranked.pathways)
        1/sum(2^(num.elems - positions))
    }

    sample.rank <- ddply(plot.data, "Sample", sample.score)
    names(sample.rank) <- c("Sample", "sample.best.gene.pos")

    # Drop any previous score so the merge does not try to match on it.
    plot.data$sample.best.gene.pos <- NULL
    plot.data <- merge(plot.data, sample.rank, all.x=TRUE)

    plot.data$Sample <- reorder(plot.data$Sample, plot.data$sample.best.gene.pos)

    plot$data <- plot.data

    return(plot)
}
|
62
|
+
|
63
|
+
# Builds the tile layer showing which recurrently mutated genes are mutated in
# which samples.
#
# study:       study name, passed to rbbt.SE.sample.mutated.genes.
# cutoff:      minimum number of mutated samples for a gene to be kept.
# sample.info: optional data frame merged into the molten data (all.x=TRUE);
#              skipped when NULL.
# Returns the geom_tile layer (x=Sample, y=Gene, alpha=Mutated) after sorting
# it with rbbt.SE.plot.sort.by.mutations.
rbbt.SE.plot.mutations <- function(study, cutoff = 3, sample.info = NULL){
    sample.mutated.genes = rbbt.SE.sample.mutated.genes(study);

    # na.rm keeps a missing value from turning a gene's count into NA, which
    # would otherwise produce an NA column name below.
    gene.mutation.counts = apply(sample.mutated.genes, 2, function(x){sum(x==TRUE, na.rm=TRUE)})

    recurrent.genes = names(gene.mutation.counts[gene.mutation.counts >= cutoff])

    # drop=FALSE keeps a data frame even when exactly one gene passes the
    # cutoff; without it the selection collapses to a plain vector.
    d.recurrent = sample.mutated.genes[, recurrent.genes, drop=FALSE]
    d.recurrent$Sample = rownames(d.recurrent)

    d.recurrent.m = melt(d.recurrent, "Sample")

    names(d.recurrent.m) <- c("Sample", "Gene", "Mutated")

    if (is.null(sample.info)){
        d = d.recurrent.m
    }else{
        d = merge(d.recurrent.m, sample.info, all.x=TRUE)
    }

    layer.mutations = geom_tile(data=d,aes(x=Sample, y=Gene, alpha=Mutated))

    # Keep the returned object instead of relying on the layer being modified
    # in place by the sorting helper.
    layer.mutations = rbbt.SE.plot.sort.by.mutations(layer.mutations);

    return(layer.mutations);
}
|
89
|
+
|
90
|
+
# Builds a point layer that overlays gene expression on an existing plot.
#
# Expression values for the genes in `plot$data` are fetched with
# rbbt.SE.expression (defined elsewhere), molten to Gene/Sample/Expression
# rows, and summarised per gene: `Mean` is a 10%-trimmed mean of the raw
# values, `SD` is the mad of the values after clamping them to the overall
# interquartile range. The result is merged into `plot$data`.
#
# plot:  object whose `$data` has `Gene` and `Sample` columns.
# study: study name, passed to rbbt.SE.expression.
# ...:   further arguments forwarded to rbbt.SE.expression.
# Returns a geom_point layer sized by |(Expression - Mean) / SD| and coloured
# by (Expression - Mean) / SD.
rbbt.SE.plot.add.expression <- function(plot, study, ...){

    genes = unique(plot$data$Gene);
    gene.expression <- rbbt.SE.expression(study, genes, ...);

    gene.expression.m <- melt(gene.expression);
    names(gene.expression.m) <- c("Gene", "Sample", "Expression");

    # quantile() gives the same quartiles summary() reports, without the
    # rounding to significant digits that older R versions apply in summary().
    quartiles = quantile(gene.expression.m$Expression, c(0.25, 0.75), na.rm=TRUE, names=FALSE)
    gene.expression.1st = quartiles[1]
    gene.expression.3rd = quartiles[2]

    # Normalise numeric-looking sample names (e.g. "007" becomes "7") and keep
    # the others unchanged. Going through as.character first matters: calling
    # as.numeric directly on a factor yields its level codes, not its labels.
    sample.names = as.character(gene.expression.m$Sample);
    unpadd = suppressWarnings(as.character(as.numeric(sample.names)));
    unpadd[is.na(unpadd)] = sample.names[is.na(unpadd)];
    gene.expression.m$Sample = unpadd;

    mean.gene.expression <- aggregate(Expression ~ Gene, gene.expression.m, mean, trim=0.1, na.rm=TRUE);
    names(mean.gene.expression) <- c("Gene", "Mean");

    # Clamp to the interquartile range; pmin/pmax leave missing values as NA
    # instead of failing on an NA logical index.
    gene.expression.m$Expression = pmin(pmax(gene.expression.m$Expression, gene.expression.1st), gene.expression.3rd)

    sd.gene.expression <- aggregate(Expression ~ Gene, gene.expression.m, mad, na.rm=TRUE);
    names(sd.gene.expression) <- c("Gene", "SD");

    gene.expression.m <- merge(gene.expression.m, mean.gene.expression);
    gene.expression.m <- merge(gene.expression.m, sd.gene.expression);

    #d = merge(d, gene.expression.m, by=c("Sample", "Gene"), all.x=TRUE);
    plot$data = merge(plot$data, gene.expression.m, all.x=TRUE);

    layer.expression = geom_point(data=plot$data, aes(x=Sample, y=Gene, size=abs((Expression - Mean) / SD), color=((Expression - Mean) / SD)));

    return(layer.expression)
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-study
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.19
|
4
|
+
version: 0.2.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-12-
|
11
|
+
date: 2013-12-16 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: This gem adds the study entity with support for NGS, Microarray and other
|
14
14
|
types of data
|
@@ -43,6 +43,8 @@ files:
|
|
43
43
|
- lib/rbbt/entity/study/plots.rb
|
44
44
|
- lib/rbbt/entity/study/samples.rb
|
45
45
|
- lib/rbbt/entity/study/snp.rb
|
46
|
+
- share/R/data.R
|
47
|
+
- share/R/plots.R
|
46
48
|
homepage: http://github.com/mikisvaz/rbbt-study
|
47
49
|
licenses:
|
48
50
|
- MIT
|