rbbt-study 0.2.19 → 0.2.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ODg5YzNlNzAzMDA4NjUzNzNlYTc0MTIxMzI3MmZiNzI4N2VhZjg0Yw==
4
+ MmU0YmUxZDUxOTZhMmU1MjJlMmI3MWUwYzU4NzYxODY1NWZhNjk0NQ==
5
5
  data.tar.gz: !binary |-
6
- ZWNmM2VjZjJhY2FkMGYzM2JlOWMxYjBiYTEwNjA1YjVlYzM2N2Y0Yw==
6
+ NGI4ODVjZThmOTk5ODI2NTRhYzRlN2RlNDA0NTc0ZWIyNDk4NGYzOA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MmU0NjNlZGY0YzM3ZTBkYjQ1YWEyNjNkNzE0ODdkYTE4OWE1MzUxZjg4Y2Ji
10
- NzRjOGZmODI5MTUxMzJlYzMyOWZlOTA0MjBmNmQyZDM1MDhiMzA4MTc1ZDgx
11
- ZmJjZTYwMzM4ZTU3NzQzMzQ5ZWNiNTIwZTk1NjhjNzM0ZDYxMWQ=
9
+ MjI1Yzc5YmZjM2ZhY2NmNzI0Zjk3MTZhNTFlMzM0YWZkZWZkZjEwNDVkZGI0
10
+ NjVkM2Q4ZTY5OGQwZjkyMDRkYWM4Yzg0YmU3MWVjNzY4OGNjNWNiMDM2M2U4
11
+ YTQxYmZiMDQwM2VjNzI3MTNjOTgzNjdmNzg5ZTQ2NDM4OTJmYTM=
12
12
  data.tar.gz: !binary |-
13
- MzUwNzIzYWM0MDE1MjNhNTFjMGZkYTdjNzU5ZjVhNDcwZGNiY2NlZGUwYTBj
14
- ODY5ODZhMzMxMDUzOGFhM2M4ZDU4YzM2MzVhMTZkN2QyN2I1YzEwOTg1ZDJk
15
- ZDNiYmM0OTExOWZhMjJmOGYxMmRlMDJmZTYxYTMyMDRkMWMzOTA=
13
+ MTk5MjBlZTc1NDQ4OTZiNjE0M2VlZGVkYWE0YWZiM2VhYzJkNjk0MjhjYTI2
14
+ OTc2YzhiZTAxMTBmOWQwZGI0OGNiZjM5NmZiZjk3ODMxMjYwNzYzNDJmZGRj
15
+ ZTMyY2EzZjg1Y2I2MDkwNzM0OGRhOGJkYTllMmVmNTEwNzQzODY=
@@ -13,6 +13,34 @@ module StudyWorkflow
13
13
  study.metadata[:organism]
14
14
  end
15
15
 
16
# Per-gene mutation enrichment table.
#
# For every gene found in the study's :mutation_genes index, compares the
# number of matches recorded for that gene against the gene's exon size using
# a one-sided ('greater') binomial test run through RSRuby. The background
# probability is the number of distinct mutations in the index divided by the
# exon bases of all indexed genes together.
#
# Returns a TSV keyed by "Ensembl Gene ID" with the fields
# "Matches", "Bases", "Frequency" and "p.value". Genes with no matches or
# with an exon size of 0 are left out.
task :binomial_significance => :tsv do
  result = TSV.setup({},
                     :key_field => "Ensembl Gene ID",
                     :fields => ["Matches", "Bases", "Frequency", "p.value"],
                     :namespace => organism)

  # Each index key is split on its first "~": the leading part is collected
  # as a mutation and the trailing part as a gene.
  index_keys = study.knowledge_base.get_index(:mutation_genes).keys
  genes = index_keys.map { |key| key.partition("~").last }.uniq
  all_mutations = index_keys.map { |key| key.partition("~").first }.uniq

  total_bases = Gene.gene_list_exon_bases(genes)
  global_frequency = all_mutations.length.to_f / total_bases

  # Exon size of every gene on its own, looked up by gene below.
  gene2exon_size = Misc.process_to_hash(genes) do |gene_list|
    gene_list.map { |gene| Gene.gene_list_exon_bases([gene]) }
  end

  genes.each do |gene|
    gene_mutations = study.knowledge_base.parents(:mutation_genes, gene).target
    hits = study.knowledge_base.subset(:sample_mutations, "Genomic Mutation" => gene_mutations).source
    next if hits.empty?

    match_count = hits.length
    exon_bases = gene2exon_size[gene]
    next if exon_bases == 0

    frequency = match_count.to_f / exon_bases
    pvalue = RSRuby.instance.binom_test(match_count, exon_bases, global_frequency, 'greater')["p.value"]
    result[gene] = [match_count, exon_bases, frequency, pvalue]
  end

  result
end
43
+
16
44
  task :genotype_overview => :tsv do
17
45
  gene_overview = TSV.setup({},
18
46
  :key_field => "Ensembl Gene ID",
@@ -10,6 +10,7 @@ Workflow.require_workflow "Genomics"
10
10
  require 'rbbt/entity/study'
11
11
  require 'rbbt/entity/study/knowledge_base'
12
12
  require 'rbbt/entity/study/samples'
13
+
13
14
  require 'rbbt/expression/matrix'
14
15
 
15
16
  module StudyWorkflow
data/share/R/data.R ADDED
@@ -0,0 +1,76 @@
1
# rbbt.SE.sample.mutated.genes(study)
#
# Runs the embedded Ruby script below through rbbt.ruby.substitutions and
# returns its result. The script loads the StudyExplorer workflow, sets up the
# study, runs its :relevant_genes job, and for every genotype in study.cohort
# stores one row keyed by the genotype's jobname holding a 'TRUE'/'FALSE'
# string per relevant gene (whether that gene is among the genotype's genes).
#
# study: value handed over as the STUDY substitution (presumably replaces the
#        literal STUDY in the script text -- confirm against rbbt.ruby.substitutions).
#
# The last expression of the function is an assignment, so the result is
# returned invisibly.
# NOTE(review): the TSV is declared with :type => :single although every row
# is assigned an array of values -- confirm this is the intended TSV type.
+ rbbt.SE.sample.mutated.genes <- function(study){
2
+ sample.mutated.genes <- rbbt.ruby.substitutions(
3
+ "
4
+ require 'rbbt/workflow'
5
+ Workflow.require_workflow 'StudyExplorer'
6
+
7
+ YAML::ENGINE.yamler = 'syck' if defined? YAML::ENGINE and YAML::ENGINE.respond_to? :yamler
8
+
9
+ Log.severity = 0
10
+
11
+ study = Study.setup('STUDY')
12
+
13
+ relevant_genes = study.job(:relevant_genes, study).run.uniq
14
+
15
+ tsv = TSV.setup({}, :key_field => 'Sample', :fields => relevant_genes.name, :type => :single)
16
+
17
+ study.cohort.each do |genotype|
18
+ sample = genotype.jobname
19
+ mutated_genes = genotype.genes.compact.flatten.uniq
20
+ tsv[sample] = relevant_genes.collect{|gene| mutated_genes.include?(gene)? 'TRUE' : 'FALSE' }
21
+ end
22
+
23
+ tsv
24
+ ", substitutions = list(STUDY=study))
25
+ }
26
+
27
# rbbt.SE.gene.kegg.pathway(genes)
#
# Builds a gene x KEGG-pathway membership table by running the embedded Ruby
# script below through rbbt.ruby.substitutions. The script sets the genes up
# as 'Associated Gene Name' entities for 'Hsa/jun2011', collects the distinct
# pathways of all genes, and stores for each gene a 1/0 value per pathway
# (1 when the pathway is in that gene's own pathway list).
#
# genes: gene names; converted with rbbt.a.to.string and handed over as the
#        GENE_STR substitution (presumably spliced into `[GENE_STR]` in the
#        script -- confirm against rbbt.ruby.substitutions).
#
# Returns the resulting table with an extra `Gene` column copied from its
# row names.
# NOTE(review): the script passes :fields => [pathways], i.e. the pathway list
# wrapped in another array -- confirm that this nesting is intended.
+ rbbt.SE.gene.kegg.pathway <- function(genes){
28
+ gene_str = rbbt.a.to.string(genes);
29
+ gene.pathways = rbbt.ruby.substitutions(
30
+ "
31
+ require 'rbbt/entity/gene'
32
+ require 'rbbt/sources/kegg'
33
+
34
+ YAML::ENGINE.yamler = 'syck' if defined? YAML::ENGINE and YAML::ENGINE.respond_to? :yamler
35
+
36
+ Log.severity=0
37
+
38
+ genes = [GENE_STR];
39
+
40
+ Gene.setup(genes, 'Associated Gene Name', 'Hsa/jun2011')
41
+
42
+ pathways = genes.kegg_pathways.compact.flatten.uniq
43
+
44
+ gene_pathways = {}
45
+ genes.each do |gene|
46
+ gene_pathway_list = gene.kegg_pathways || []
47
+ gene_pathways[gene] = pathways.collect{|p| gene_pathway_list.include?(p) ? 1 : 0 }
48
+ end
49
+
50
+ tsv = TSV.setup(gene_pathways, :key_field => 'Associated Gene Name', :fields => [pathways], :type => :flat)
51
+ ", substitutions = list(GENE_STR=gene_str));
52
+
53
# Copy the row names into a regular column so they survive later reshaping.
+ gene.pathways$Gene = rownames(gene.pathways)
54
+
55
+ return(gene.pathways)
56
+ }
57
+
58
# rbbt.SE.study.samples(study)
#
# Returns the result of evaluating `study.samples` in the embedded Ruby
# script below, run through rbbt.ruby.substitutions after loading the
# StudyExplorer workflow and setting up the study.
#
# study: value handed over as the STUDY substitution (presumably replaces the
#        literal STUDY in the script text -- confirm against rbbt.ruby.substitutions).
+ rbbt.SE.study.samples <- function(study){
59
+ samples <- rbbt.ruby.substitutions(
60
+ "
61
+ require 'rbbt/workflow'
62
+ Workflow.require_workflow 'StudyExplorer'
63
+
64
+ YAML::ENGINE.yamler = 'syck' if defined? YAML::ENGINE and YAML::ENGINE.respond_to? :yamler
65
+
66
+ Log.severity = 0
67
+
68
+ study = Study.setup('STUDY')
69
+
70
+ study.samples
71
+ ", substitutions = list(STUDY=study));
72
+
73
+ return(samples);
74
+ }
75
+
76
+
data/share/R/plots.R ADDED
@@ -0,0 +1,122 @@
1
# Reorders the data of a plot by an arbitrary grouping column.
#
# plot:  object whose `$data` has the columns Sample, Mutated and `field`.
# field: name of the column to rank by.
#
# The levels of `field` are reordered by the sum of Mutated, and each sample
# is then ranked by the best (lowest) position, in the reversed level order,
# of any `field` value it has mutated. Sample levels are reordered by that
# position, stored in the `sample.best.gene.pos` column.
#
# Returns `plot` with the reordered data assigned back to `plot$data`.
rbbt.SE.plot.sort.by.field <- function(plot, field){
    plot.data <- plot$data

    plot.data[[field]] <- reorder(plot.data[[field]], plot.data$Mutated, sum)

    # reorder() sorts levels by increasing sum, so the reversed levels start
    # with the most frequently mutated value.
    ranked.levels <- rev(levels(plot.data[[field]]))

    best.position <- function(sample.rows){
        mutated.rows <- subset(sample.rows, Mutated == TRUE)
        min(match(mutated.rows[[field]], ranked.levels), na.rm=TRUE)
    }

    sample.position <- ddply(plot.data, "Sample", best.position)
    names(sample.position) <- c("Sample", "sample.best.gene.pos")

    # Drop any position column left by a previous sort before merging the new one.
    plot.data$sample.best.gene.pos <- NULL
    plot.data <- merge(plot.data, sample.position, all.x=TRUE)

    plot.data$Sample <- reorder(plot.data$Sample, plot.data$sample.best.gene.pos)

    plot$data <- plot.data
    plot
}
19
+
20
# Reorders the genes and samples of a mutation plot.
#
# plot: object whose `$data` has the columns Sample, Gene and Mutated.
#
# Gene levels are reordered by the sum of Mutated. Every sample then receives
# the score 1 / sum(2^(n - rank)), where n is the number of gene levels and
# rank is the position of each of its mutated genes in the reversed level
# order; sample levels are reordered by that score, stored in the
# `sample.best.gene.pos` column.
#
# Returns `plot` with the reordered data assigned back to `plot$data`.
rbbt.SE.plot.sort.by.mutations <- function(plot){
    plot.data <- plot$data

    plot.data$Gene <- reorder(plot.data$Gene, plot.data$Mutated, sum)

    # reorder() sorts levels by increasing sum, so the reversed levels start
    # with the most frequently mutated gene.
    gene.ranking <- rev(levels(plot.data$Gene))
    num.genes <- length(gene.ranking)

    sample.score <- function(sample.rows){
        mutated.genes <- subset(sample.rows, Mutated == TRUE)$Gene
        1 / sum(2^(num.genes - match(mutated.genes, gene.ranking)))
    }

    sample.position <- ddply(plot.data, "Sample", sample.score)
    names(sample.position) <- c("Sample", "sample.best.gene.pos")

    # Drop any position column left by a previous sort before merging the new one.
    plot.data$sample.best.gene.pos <- NULL
    plot.data <- merge(plot.data, sample.position, all.x=TRUE)

    plot.data$Sample <- reorder(plot.data$Sample, plot.data$sample.best.gene.pos)

    plot$data <- plot.data
    plot
}
40
+
41
# Reorders the pathways and samples of a pathway-level mutation plot.
#
# plot: object whose `$data` has the columns Sample, Pathway and Mutated.
#
# Pathway levels are reordered by the sum of Mutated. Every sample then
# receives the score 1 / sum(2^(n - rank)), where n is the number of pathway
# levels and rank is the position of each of its mutated pathways in the
# reversed level order; sample levels are reordered by that score, stored in
# the `sample.best.gene.pos` column.
#
# Returns `plot` with the reordered data assigned back to `plot$data`.
rbbt.SE.plot.sort.by.pathway.mutations <- function(plot){
    plot.data <- plot$data

    plot.data$Pathway <- reorder(plot.data$Pathway, plot.data$Mutated, sum)

    # reorder() sorts levels by increasing sum, so the reversed levels start
    # with the most frequently mutated pathway.
    pathway.ranking <- rev(levels(plot.data$Pathway))
    num.pathways <- length(pathway.ranking)

    sample.score <- function(sample.rows){
        mutated.pathways <- subset(sample.rows, Mutated == TRUE)$Pathway
        1 / sum(2^(num.pathways - match(mutated.pathways, pathway.ranking)))
    }

    sample.position <- ddply(plot.data, "Sample", sample.score)
    names(sample.position) <- c("Sample", "sample.best.gene.pos")

    # Drop any position column left by a previous sort before merging the new one.
    plot.data$sample.best.gene.pos <- NULL
    plot.data <- merge(plot.data, sample.position, all.x=TRUE)

    plot.data$Sample <- reorder(plot.data$Sample, plot.data$sample.best.gene.pos)

    plot$data <- plot.data
    plot
}
62
+
63
# Builds the mutation tile layer for a study: one tile per (Sample, Gene)
# pair, restricted to genes mutated in at least `cutoff` samples.
#
# study:       handed to rbbt.SE.sample.mutated.genes to obtain the
#              sample x gene mutation table.
# cutoff:      minimum number of samples in which a gene must be mutated for
#              it to be kept (default 3).
# sample.info: optional data frame merged onto the melted table, keeping all
#              mutation rows (all.x=TRUE); NULL skips the merge.
#
# Returns the geom_tile layer (x=Sample, y=Gene, alpha=Mutated) after sorting
# it with rbbt.SE.plot.sort.by.mutations.
rbbt.SE.plot.mutations <- function(study, cutoff = 3, sample.info = NULL){
    sample.mutated.genes = rbbt.SE.sample.mutated.genes(study);

    # Number of samples in which each gene (column) is flagged as mutated.
    gene.mutation.counts = apply(sample.mutated.genes, 2, function(x){sum(x==TRUE)})

    recurrent.genes = names(gene.mutation.counts[gene.mutation.counts >= cutoff])

    # drop=FALSE keeps the result a data frame (with its row names) even when
    # exactly one gene passes the cutoff; without it the selection collapses
    # to a bare vector and the Sample column / melt below break.
    d.recurrent = sample.mutated.genes[, recurrent.genes, drop=FALSE]
    d.recurrent$Sample = rownames(d.recurrent)

    d.recurrent.m = melt(d.recurrent, "Sample")

    names(d.recurrent.m) <- c("Sample", "Gene", "Mutated")

    if (is.null(sample.info)){
        d = d.recurrent.m
    }else{
        d = merge(d.recurrent.m, sample.info, all.x=TRUE)
    }

    layer.mutations = geom_tile(data=d,aes(x=Sample, y=Gene, alpha=Mutated))

    # Keep the object returned by the sort so the ordering is applied whether
    # or not the layer object is modified in place by `plot$data = ...`.
    layer.mutations = rbbt.SE.plot.sort.by.mutations(layer.mutations);

    return(layer.mutations);
}
89
+
90
# Adds expression information to a mutation plot and builds a point layer
# from it.
#
# plot:  object whose `$data` has (at least) the columns Sample and Gene.
# study: handed to rbbt.SE.expression together with the plot's genes.
# ...:   further arguments forwarded to rbbt.SE.expression.
#
# Steps, in order:
#   1. fetch the expression values for the plot's genes and melt them into
#      Gene / Sample / Expression rows;
#   2. rewrite sample names that parse as numbers through as.numeric (which
#      removes zero padding), leaving other names as they are;
#   3. compute a 10%-trimmed mean per gene on the raw values;
#   4. clamp all values to the overall 1st / 3rd quartile range;
#   5. compute the per-gene mad on the clamped values (column `SD`);
#   6. merge Expression, Mean and SD into `plot$data`, keeping all of its rows.
#
# Returns a geom_point layer over the merged data whose size is
# |Expression - Mean| / SD and whose colour is (Expression - Mean) / SD.
rbbt.SE.plot.add.expression <- function(plot, study, ...){

    plot.genes <- unique(plot$data$Gene)
    expression.values <- rbbt.SE.expression(study, plot.genes, ...)

    expression.long <- melt(expression.values)
    names(expression.long) <- c("Gene", "Sample", "Expression")

    expression.summary <- summary(expression.long$Expression)
    upper.quartile <- expression.summary[["3rd Qu."]]
    lower.quartile <- expression.summary[["1st Qu."]]

    # Names that do not parse as numbers come back as NA and are restored
    # from the original column.
    sample.names <- as.character(as.numeric(expression.long$Sample))
    not.numeric <- is.na(sample.names)
    sample.names[not.numeric] <- expression.long$Sample[not.numeric]
    expression.long$Sample <- sample.names

    # The mean is taken before clamping, the spread after it.
    gene.mean <- aggregate(Expression ~ Gene, expression.long, mean, trim=0.1, na.rm=TRUE)
    names(gene.mean) <- c("Gene", "Mean")

    above.upper <- expression.long[, "Expression"] > upper.quartile
    expression.long[above.upper, "Expression"] <- upper.quartile
    below.lower <- expression.long[, "Expression"] < lower.quartile
    expression.long[below.lower, "Expression"] <- lower.quartile

    gene.spread <- aggregate(Expression ~ Gene, expression.long, mad, na.rm=TRUE)
    names(gene.spread) <- c("Gene", "SD")

    expression.long <- merge(expression.long, gene.mean)
    expression.long <- merge(expression.long, gene.spread)

    plot$data <- merge(plot$data, expression.long, all.x=TRUE)

    layer.expression <- geom_point(data=plot$data, aes(x=Sample, y=Gene, size=abs((Expression - Mean) / SD), color=((Expression - Mean) / SD)))

    layer.expression
}
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-study
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.19
4
+ version: 0.2.20
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-05 00:00:00.000000000 Z
11
+ date: 2013-12-16 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: This gem add the study entity with suport for NGS, Microarray and other
14
14
  types of data
@@ -43,6 +43,8 @@ files:
43
43
  - lib/rbbt/entity/study/plots.rb
44
44
  - lib/rbbt/entity/study/samples.rb
45
45
  - lib/rbbt/entity/study/snp.rb
46
+ - share/R/data.R
47
+ - share/R/plots.R
46
48
  homepage: http://github.com/mikisvaz/rbbt-study
47
49
  licenses:
48
50
  - MIT