rbbt-study 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,103 @@
1
# Gene-level views over the genotype cohort of a study.
#
# All properties are declared with the `property` DSL and read the study's
# `cohort`, `relevant_mutations` / `damaging_mutations`, `samples` and
# `organism`.
module Study
  # Unique genes reported by `genes` for the cohort metagenotype.
  property :genes_with_overlapping_mutations => :single do
    cohort.metagenotype.genes.compact.flatten.uniq
  end

  # Mutated isoforms of the relevant mutations whose consequence is not
  # "SYNONYMOUS". Returns [] when there are no mutated isoforms at all.
  property :altered_isoforms => :single do
    isoforms = cohort.metagenotype.subset(relevant_mutations).mutated_isoforms.compact.flatten.uniq
    return [] if isoforms.empty?
    isoforms.select_by(:consequence) { |consequence| consequence != "SYNONYMOUS" }
  end

  # Unique genes of the transcripts of the altered isoforms.
  property :genes_with_altered_isoform_sequence => :single do
    isoforms = altered_isoforms
    return [] if isoforms.empty?
    isoforms.transcript.compact.gene.uniq
  end

  # Altered isoforms selected by `damaged?`; extra arguments are forwarded
  # to `damaged?`.
  property :damaged_isoforms => :single do |*args|
    isoforms = altered_isoforms
    return [] if isoforms.empty?
    isoforms.select_by(:damaged?, *args)
  end

  # Unique genes of the transcripts of the damaged isoforms; arguments are
  # forwarded to `damaged_isoforms`.
  property :genes_with_damaged_isoforms => :single do |*args|
    isoforms = damaged_isoforms(*args)
    return [] if isoforms.empty?
    isoforms.transcript.gene.uniq
  end

  # Unique genes of the transcripts with affected splicing among the
  # relevant mutations.
  property :genes_with_affected_splicing_sites => :single do
    cohort.metagenotype.subset(relevant_mutations).transcripts_with_affected_splicing.compact.flatten.uniq.gene.compact.uniq
  end

  # Genes with an altered isoform sequence or an affected splicing site,
  # set up as "Ensembl Gene ID" genes.
  property :affected_genes => :single do
    Gene.setup(genes_with_altered_isoform_sequence + genes_with_affected_splicing_sites, "Ensembl Gene ID", organism).uniq
  end

  # Genes with a damaged isoform or an affected splicing site, set up as
  # "Ensembl Gene ID" genes; arguments are forwarded to
  # `genes_with_damaged_isoforms`.
  property :damaged_genes => :single do |*args|
    Gene.setup((genes_with_damaged_isoforms(*args) + genes_with_affected_splicing_sites).uniq, "Ensembl Gene ID", organism)
  end

  # Hash from gene to the job names of the genotypes in which a damaging
  # mutation damages an isoform of the gene or affects its splicing.
  #
  # Genes are gathered per genotype and de-duplicated before the genotype is
  # recorded, so a genotype with several mutations in the same gene is
  # listed only once for that gene.
  property :samples_with_gene_damaged => :single do
    damaging = damaging_mutations

    samples_by_gene = {}
    cohort.each do |genotype|
      genotype_genes = []
      genotype.each do |mutation|
        next unless damaging.include? mutation
        isoforms = mutation.mutated_isoforms
        genotype_genes.concat isoforms.select_by(:damaged?).transcript.gene unless isoforms.nil? || isoforms.empty?
        genotype_genes.concat mutation.transcripts_with_affected_splicing.gene
      end
      genotype_genes.uniq.each { |gene| (samples_by_gene[gene] ||= []) << genotype.jobname }
    end
    samples_by_gene
  end

  # Hash from gene to the job names of the genotypes in which a relevant
  # mutation yields a non-"SYNONYMOUS" isoform of the gene or affects its
  # splicing.
  #
  # As above, each genotype is listed at most once per gene; `recurrent_genes`
  # relies on the list length being a count of distinct genotypes.
  property :samples_with_gene_affected => :single do
    relevant = relevant_mutations

    samples_by_gene = {}
    cohort.each do |genotype|
      genotype_genes = []
      genotype.each do |mutation|
        next unless relevant.include? mutation
        isoforms = mutation.mutated_isoforms
        unless isoforms.nil? || isoforms.empty?
          genotype_genes.concat isoforms.select_by(:consequence) { |consequence| consequence != "SYNONYMOUS" }.transcript.gene
        end
        genotype_genes.concat mutation.transcripts_with_affected_splicing.gene
      end
      genotype_genes.uniq.each { |gene| (samples_by_gene[gene] ||= []) << genotype.jobname }
    end
    samples_by_gene
  end

  # TSV keyed by "Ensembl Gene ID" with one column per genotyped sample;
  # cells are "TRUE" when the gene is among the sample's `affected_genes`
  # and "FALSE" otherwise. Genes affected in no sample get no row.
  property :gene_sample_matrix => :single do
    genotyped_samples = samples.select { |sample| sample.has_genotype? }.sort.uniq

    matrix = TSV.setup({}, :key_field => "Ensembl Gene ID", :namespace => organism, :type => :list, :fields => genotyped_samples)

    sample_count = genotyped_samples.length
    genotyped_samples.each_with_index do |sample, column|
      genes = sample.affected_genes
      next if genes.empty?
      genes.clean_annotations.each do |gene|
        matrix[gene] ||= ["FALSE"] * sample_count
        matrix[gene][column] = "TRUE"
      end
    end

    matrix.fields = genotyped_samples

    matrix
  end

  # Genes affected in at least `min` genotypes (first argument, default 2),
  # set up as "Ensembl Gene ID" genes.
  property :recurrent_genes => :single do |*args|
    min = args.first
    min = 2 if min.nil?

    recurrent = samples_with_gene_affected.select { |_gene, gene_samples| gene_samples.length >= min }.collect { |gene, _gene_samples| gene }
    Gene.setup(recurrent, "Ensembl Gene ID", organism)
  end
end
@@ -0,0 +1,39 @@
1
# Registers the genotype-derived databases in the study registry. Each entry
# is a Proc called with the study and the database name; it returns a TSV.
module Study

  # "Genomic Mutation" => genes reported by `genes` for that mutation.
  self.study_registry[:mutation_genes] = Proc.new do |study, _database|
    mutation_genes = TSV.setup({}, :key_field => "Genomic Mutation", :fields => ["Ensembl Gene ID"], :type => :flat, :namespace => study.organism)
    study.cohort.metagenotype.uniq.each do |mutation|
      mutation_genes[mutation] = mutation.genes
    end
    mutation_genes
  end

  # "Sample" => mutations of each sample that has a genotype.
  self.study_registry[:sample_mutations] = Proc.new do |study, _database|
    sample_mutations = TSV.setup({}, :key_field => "Sample", :fields => ["Genomic Mutation"], :type => :flat, :namespace => study.organism)
    genotyped = study.samples.select_by(:has_genotype?)
    genotyped.each do |sample|
      sample_mutations[sample] = sample.mutations
    end
    sample_mutations
  end

  # "Sample" => affected genes of each sample that has a genotype.
  self.study_registry[:sample_genes] = Proc.new do |study, _database|
    sample_genes = TSV.setup({}, :key_field => "Sample", :fields => ["Ensembl Gene ID"], :type => :flat, :namespace => study.organism)
    genotyped = study.samples.select_by(:has_genotype?)
    genotyped.each do |sample|
      sample_genes[sample] = sample.affected_genes
    end
    sample_genes
  end

  # "Sample" => affected genes paired with the entries that the
  # :mutation_genes database (looked up by "Ensembl Gene ID") holds for each
  # gene, joined with ";;".
  self.study_registry[:sample_genes2] = Proc.new do |study, _database|
    sample_genes = TSV.setup({}, :key_field => "Sample", :fields => ["Ensembl Gene ID", "Genomic Mutation"], :type => :double, :namespace => study.organism)
    gene_mutations = study.knowledge_base.get_database(:mutation_genes, :source => "Ensembl Gene ID")
    genotyped = study.samples.select_by(:has_genotype?)
    genotyped.each do |sample|
      pairs = sample.affected_genes.collect { |gene| [gene, gene_mutations[gene] * ";;"] }
      sample_genes[sample] = Misc.zip_fields pairs
    end
    sample_genes
  end

end
@@ -0,0 +1,34 @@
1
# Mutation-level views over the cohort metagenotype. The resulting lists are
# given a descriptive `jobname` where the original code assigns one.
module Study
  # The cohort metagenotype, labelled "All mutations in <study>".
  property :all_mutations do
    mutations = cohort.metagenotype
    mutations.jobname = "All mutations in #{ self }"
    mutations
  end

  # Mutations selected by `relevant?`.
  property :relevant_mutations do
    relevant = all_mutations.select_by(:relevant?)
    relevant.jobname = "Relevant mutations in #{ self }"
    relevant
  end

  # Relevant mutations selected by `damaging?`; arguments are forwarded.
  property :damaging_mutations do |*args|
    damaging = relevant_mutations.select_by(:damaging?, *args)
    damaging.jobname = "Damaging mutations in #{ self }"
    damaging
  end

  # Relevant mutations with at least one mutated isoform whose consequence
  # is not "SYNONYMOUS".
  property :mutations_altering_isoform_sequence do
    altering = relevant_mutations.select do |mutation|
      isoforms = mutation.mutated_isoforms
      !isoforms.nil? && isoforms.any? { |isoform| isoform.consequence != "SYNONYMOUS" }
    end
    altering.jobname = "Mutations altering isoform sequence in #{ self }"
    altering
  end

  # Relevant mutations that have any transcript with affected splicing.
  property :mutations_affecting_splicing_sites do
    splicing = relevant_mutations.select_by(:transcripts_with_affected_splicing) { |transcripts| transcripts.any? }
    splicing.jobname = "Mutations affecting splicing sites in #{ self }"
    splicing
  end

  # Mutations whose `genes` include the given gene.
  property :mutations_over_gene do |gene|
    all_mutations.select_by(:genes) { |genes| genes && genes.include?(gene) }
  end

  # Mutations whose `genes` share at least one element with the given list.
  property :mutations_over_gene_list do |list|
    all_mutations.select_by(:genes) { |genes| genes && (genes & list).any? }
  end
end
@@ -0,0 +1,28 @@
1
# Genotype-related properties of a sample, all read from the cohort of the
# sample's study.
module Sample
  # For each sample, whether the study cohort has an entry for it.
  property :has_genotype? => :array2single do
    genotypes = study.cohort.values_at(*self)
    genotypes.collect { |genotype| !genotype.nil? }
  end

  # The cohort entry for this sample.
  property :mutations do
    Study.setup(study)
    study.cohort[self]
  end

  # Mutations of the sample selected by `relevant?`.
  property :relevant_mutations do
    mutations.select_by(:relevant?)
  end

  # Mutations of the sample selected by `damaging?`; arguments are forwarded.
  property :damaging_mutations do |*args|
    mutations.select_by(:damaging?, *args)
  end

  # Unique, non-nil affected genes over all mutations of the sample.
  property :affected_genes do
    genes_per_mutation = mutations.affected_genes
    genes_per_mutation.compact.flatten.uniq
  end

  # Unique, non-nil damaged genes over all mutations of the sample;
  # arguments are forwarded to `damaged_genes`.
  property :damaged_genes do |*args|
    genes_per_mutation = mutations.damaged_genes(*args)
    genes_per_mutation.compact.flatten.uniq
  end
end
27
+
28
+
@@ -0,0 +1,110 @@
1
+ require 'rbbt/entity/genotype'
2
+
3
+ require 'rbbt/entity/study/genotypes/samples'
4
+ require 'rbbt/entity/study/genotypes/mutations'
5
+ require 'rbbt/entity/study/genotypes/genes'
6
+ require 'rbbt/entity/study/genotypes/enrichment'
7
+ require 'rbbt/entity/study/genotypes/knowledge_base'
8
+
9
+ Workflow.require_workflow "NKIWorkflow"
10
+ Workflow.require_workflow "TSVWorkflow"
11
+
12
module StudyWorkflow
  # Organism recorded in the study metadata.
  helper :organism do
    study.metadata[:organism]
  end

  # Builds a :double TSV keyed by "Ensembl Gene ID" listing, for every gene
  # overlapped by a mutation, the genotyped samples in which the gene is
  # mutated, affected and damaged, plus the gene's mutation significance.
  #
  # Damaged genes are only computed when the study has fewer than 5000
  # mutations; otherwise that column stays empty. The significance column
  # holds the "p.value" from the NKIWorkflow :significantly_mutated job
  # (threshold 0.1), or "> 0.1" for genes absent from that result.
  # Returns the empty TSV when the study has no mutations.
  task :genotype_overview => :tsv do
    gene_overview = TSV.setup({},
      :key_field => "Ensembl Gene ID",
      :fields => ["Samples with gene mutated", "Samples with gene affected", "Samples with gene damaged", "Mutation significance"],
      :type => :double
    )
    genotyped_samples = study.samples.select_by(:has_genotype?)
    all_mutations = study.all_mutations

    if all_mutations.empty?
      gene_overview
    else
      log :affected_genes, "Computing how genes are affected by mutations"
      mutation_genes = Misc.process_to_hash(all_mutations) { |mutations| mutations.genes }
      mutation_affected_genes = Misc.process_to_hash(all_mutations) { |mutations| mutations.affected_genes }
      if all_mutations.length < 5000
        log :damaged_genes, "Computing genes damaged genes"
        mutation_damaged_genes = Misc.process_to_hash(all_mutations) { |mutations| mutations.damaged_genes }
      else
        # Too many mutations: skip the damage prediction and record nil instead.
        mutation_damaged_genes = Misc.process_to_hash(all_mutations) { |mutations| [nil] * mutations.length }
      end
      log :significance, "Computing mutation significance"
      mutation_significance = NKIWorkflow.job(:significantly_mutated, study, :study => study, :threshold => 0.1).run
      log :significance, "Reordering mutation significance file"

      #TSVWorkflow.job(:change_id, study, :format => "Ensembl Gene ID", :tsv => mutation_significance).run
      mutation_significance.identifiers = Organism.identifiers(study.organism)
      mutation_significance = mutation_significance.change_key "Ensembl Gene ID"

      log :samples, "Gathering affected samples"
      # Position of each map matches the flag it sets: 0 mutated, 1 affected, 2 damaged.
      status_sources = [mutation_genes, mutation_affected_genes, mutation_damaged_genes]
      samples_gene_status = {}
      genotyped_samples.each do |sample|
        gene_status = samples_gene_status[sample] = {}
        sample_mutations = sample.mutations

        status_sources.each_with_index do |genes_by_mutation, flag|
          genes_by_mutation.values_at(*sample_mutations).each do |genes|
            # A mutation may have no gene list in any of the three maps.
            next if genes.nil?
            genes.each do |gene|
              (gene_status[gene] ||= [false, false, false])[flag] = true
            end
          end
        end
      end

      log :compiling, "Compiling result"
      mutation_genes.values.compact.flatten.uniq.each do |gene|
        sample_lists = (0..2).collect do |flag|
          samples_gene_status.
            select { |_sample, gene_status| gene_status.include?(gene) && gene_status[gene][flag] }.
            collect { |sample, _gene_status| sample }
        end
        significance = mutation_significance.include?(gene) ? mutation_significance[gene]["p.value"] : "> 0.1"
        gene_overview[gene] = sample_lists << [significance]
      end

      gene_overview
    end
  end
end
87
+
88
# Access to the genotype files of a study and the cohort built from them.
module Study
  # Whether the study directory has a `genotypes` entry.
  def has_genotypes?
    dir.genotypes.exists?
  end

  # Only the writer is generated: the reader below falls back to the study
  # metadata, and declaring it twice triggers a "method redefined" warning.
  attr_writer :watson

  # The `watson` setting; taken from `metadata[:watson]` until a non-nil
  # value has been assigned or found.
  def watson
    @watson = metadata[:watson] if @watson.nil?
    @watson
  end

  # Everything found directly under the study's `genotypes` directory.
  def genotype_files
    dir.genotypes.glob("*")
  end

  # One GenomicMutation list per genotype file, named after the file, with
  # the file's lines sorted. The resulting array is extended with
  # Genotype::Cohort and memoized.
  def cohort
    @cohort ||= begin
      genotypes = genotype_files.collect do |file|
        name = File.basename(file)
        genomic_mutations = Open.read(file).split("\n").sort
        GenomicMutation.setup(genomic_mutations, name, organism, watson)
      end
      genotypes.extend Genotype::Cohort
      genotypes
    end
  end
end
@@ -0,0 +1,36 @@
1
# Knowledge bases: one shared at the Study module level, plus one per study
# that inherits every database listed in `Study.study_registry`.
module Study

  class << self
    # Readers are defined explicitly below (lazy defaults), so only the
    # writers are generated here.
    attr_writer :knowledge_base, :study_registry

    # Module-wide knowledge base, created on first use.
    def knowledge_base
      @knowledge_base ||= KnowledgeBase.new Rbbt.var.knowledge_base.Study
    end

    # Hash of database name => file or Proc; starts empty.
    def study_registry
      @study_registry ||= {}
    end
  end

  attr_writer :knowledge_base

  # Knowledge base of this study, created on first use under the study's
  # `var.knowledge_base` directory. Registry entries that are Procs are
  # registered as blocks that call the Proc with this study and the database
  # name; any other entry is registered as a file.
  def knowledge_base
    @knowledge_base ||= begin
      # Captured in a local so the registered blocks keep referring to this
      # study even if they are later evaluated with a different `self`.
      study = self

      kb = KnowledgeBase.new(dir.var.knowledge_base, organism)
      kb.format["Gene"] = "Ensembl Gene ID"
      kb.entity_options["Sample"] = {"Study" => study}

      Study.study_registry.each do |database, file|
        Log.debug("Inheriting #{ database } from registry: #{Misc.fingerprint file}")
        if Proc === file
          block = Proc.new { file.call(study, database) }
          # The block exposes a `filename` of the form "<database>@<study>".
          block.define_singleton_method(:filename) do [database, study] * "@" end
          kb.register database, nil, {}, &block
        else
          kb.register database, file
        end
      end

      kb
    end
  end
end
36
+
@@ -0,0 +1,31 @@
1
# Methylation-related properties of a sample, read from the methylation
# cohort of the sample's study.
module Sample
  # Returns the study's methylation cohort as-is.
  property :methylation => :array2single do
    study.methylation_cohort
  end

  # For each sample, whether the methylation cohort has an entry for it.
  property :has_methylation? => :array2single do
    entries = study.methylation_cohort.values_at(*self)
    entries.collect { |entry| !entry.nil? }
  end

  # Methylation entries selected by `methylated?`, or [] when there are none.
  property :methylated => :single do
    if methylation.empty?
      []
    else
      methylation.select_by(:methylated?)
    end
  end

  # Methylation entries selected by `unmethylated?`, or [] when there are none.
  property :unmethylated => :single do
    if methylation.empty?
      []
    else
      methylation.select_by(:unmethylated?)
    end
  end

  # Unique genes of the methylated entries as "Ensembl Gene ID" genes, or []
  # when nothing is methylated.
  property :methylated_genes => :single do
    if methylated.empty?
      []
    else
      gene_ids = methylated.genes.flatten.uniq
      Gene.setup(gene_ids, "Ensembl Gene ID", organism)
    end
  end

  # Unique genes of the unmethylated entries as "Ensembl Gene ID" genes, or
  # [] when nothing is unmethylated.
  property :unmethylated_genes => :single do
    if unmethylated.empty?
      []
    else
      gene_ids = unmethylated.genes.flatten.uniq
      Gene.setup(gene_ids, "Ensembl Gene ID", organism)
    end
  end
end
30
+
31
+
@@ -0,0 +1,90 @@
1
+ require 'rbbt/entity/methylation'
2
+
3
+ require 'rbbt/entity/study/methylation/samples'
4
+
5
module StudyWorkflow
  # Organism recorded in the study metadata.
  helper(:organism) { study.metadata[:organism] }
end
10
+
11
# Access to the methylation files of a study and the cohort built from them.
module Study
  # Whether the study directory has a `methylation` entry.
  def has_methylation?
    dir.methylation.exists?
  end

  # Everything found directly under the located `methylation` directory.
  def methylation_files
    dir.methylation.find.glob("*")
  end

  # Hash from sample name (the file's basename) to the sorted lines of its
  # methylation file, set up as Methylation for the study organism. Built on
  # first use and memoized.
  def methylation_cohort
    return @methylation_cohort unless @methylation_cohort.nil?

    @methylation_cohort = {}
    methylation_files.each do |path|
      sample_name = File.basename(path)
      Sample.setup(sample_name, self)

      entries = Open.read(path).split("\n").sort
      Methylation.setup(entries, organism)

      @methylation_cohort[sample_name] = entries
    end
    @methylation_cohort
  end
end
34
+
35
# Gene-level views over the methylation data of a study.
module Study
  # Genes found in the `lost_genes` of at least `threshold` samples that have
  # methylation data, set up as "Ensembl Gene ID" genes.
  property :recurrently_lost_genes => :single do |threshold|
    counts = Hash.new(0)
    samples.each do |sample|
      next unless sample.has_methylation?

      sample.lost_genes.clean_annotations.each { |gene| counts[gene] += 1 }
    end

    recurrent = counts.select { |_gene, count| count >= threshold }.collect { |gene, _count| gene }
    Gene.setup(recurrent, "Ensembl Gene ID", organism)
  end

  # Genes found in the `gained_genes` of at least `threshold` samples that
  # have methylation data, set up as "Ensembl Gene ID" genes.
  property :recurrently_gained_genes => :single do |threshold|
    counts = Hash.new(0)
    samples.each do |sample|
      next unless sample.has_methylation?

      sample.gained_genes.clean_annotations.each { |gene| counts[gene] += 1 }
    end

    recurrent = counts.select { |_gene, count| count >= threshold }.collect { |gene, _count| gene }
    Gene.setup(recurrent, "Ensembl Gene ID", organism)
  end

  # TSV keyed by "Ensembl Gene ID" with one column per sample of the
  # methylation cohort; cells are "TRUE" when the gene appears in the
  # sample's methylation `genes` and "FALSE" otherwise.
  property :gene_sample_methylation_matrix => :single do
    tsv = TSV.setup({}, :key_field => "Ensembl Gene ID", :namespace => organism, :type => :list)

    # Rows must be as wide as the methylation cohort, which provides the
    # columns; the genotype cohort may have a different number of samples.
    num_samples = methylation_cohort.length

    sample_names = []
    methylation_cohort.each do |sample, methylation|
      column = sample_names.length
      methylation.genes.compact.flatten.uniq.each do |gene|
        tsv[gene] ||= ["FALSE"] * num_samples
        tsv[gene][column] = "TRUE"
      end
      sample_names << sample
    end

    tsv.fields = sample_names

    tsv
  end
end