rbbt-study 0.2.30 → 0.2.31

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,24 +0,0 @@
1
# Gene-expression tasks of the study workflow.
module StudyWorkflow
  # Organism code as recorded in the study metadata.
  helper(:organism) { study.metadata[:organism] }

  # Dumps the study's "gene_expression" matrix, keyed by "Ensembl Gene ID",
  # through `matrix_file(path)`. The block itself yields nil.
  task :matrix => :tsv do
    study.matrix("gene_expression", "Ensembl Gene ID", organism).matrix_file(path)
    nil
  end

  # Produces a barcode of the "gene_expression" matrix through
  # `barcode(path, factor)`. The factor is the first block argument and
  # falls back to 2 when none is given. The block itself yields nil.
  task :expression_barcode => :tsv do |*args|
    barcode_factor = args.first || 2
    expression = study.matrix("gene_expression", "Ensembl Gene ID", organism)
    expression.barcode(path, barcode_factor)
    nil
  end
end
19
-
20
# Expression-related helpers for Study entities.
module Study
  # Whether the "gene_expression" entry under the study's matrices
  # directory exists.
  def has_expression?
    expression_entry = dir.matrices["gene_expression"]
    expression_entry.exists?
  end
end
@@ -1,17 +0,0 @@
1
# Builds a per-sample table flagging which of the requested genes are among
# each sample's mutated genes.
#
# list - the gene list input. Column names come from `list.name`, so the
#        list presumably arrives as an annotated Gene list -- TODO confirm.
#
# Result: a TSV keyed by "Sample" with one "<name>_mut" column per gene,
# holding 1 when the gene is listed for that sample by the
# :mutated_genes_per_sample dependency and 0 otherwise.
dep :mutated_genes_per_sample
input :list, :array, "Gene list in Ensembl Gene ID"
task :gene_features => :tsv do |list|
  genes_by_sample = step(:mutated_genes_per_sample).load

  cohort_samples = study.cohort.fields
  column_names = list.name.collect { |gene_name| gene_name + "_mut" }
  features = TSV.setup({}, :key_field => "Sample", :fields => column_names)

  cohort_samples.each do |sample|
    # Samples absent from the dependency result are treated as having no
    # mutated genes.
    mutated = genes_by_sample[sample] || []
    features[sample] = list.collect do |gene|
      mutated.include?(gene) ? 1 : 0
    end
  end

  features
end
@@ -1,104 +0,0 @@
1
# Genes reported by every genotype of the cohort, concatenated.
# NON UNIQ: the flattened list is not de-duplicated.
returns "Ensembl Gene ID"
task :affected_genes => :annotations do
  genes_per_genotype = study.cohort.collect do |genotype|
    genotype.genes.compact
  end
  Gene.setup(genes_per_genotype.flatten, "Ensembl Gene ID", organism)
end
6
-
7
# Genes touched by the relevant mutations, either through splicing (when the
# mutation is in an exon junction) or through its mutated isoforms.
# NON UNIQ: genes are unique per mutation, but the concatenation across
# mutations is not de-duplicated.
dep :relevant_mutations
returns "Ensembl Gene ID"
task :relevant_genes => :annotations do
  mutations = step(:relevant_mutations).load

  per_mutation = mutations.collect do |mutation|
    splicing_genes =
      if mutation.in_exon_junction?
        mutation.transcripts_with_affected_splicing.gene
      else
        []
      end

    isoforms = mutation.mutated_isoforms
    protein_genes = isoforms.nil? ? [] : isoforms.protein.gene.compact.uniq

    (splicing_genes + protein_genes).uniq
  end

  Gene.setup(per_mutation.compact.flatten, "Ensembl Gene ID", organism)
end
19
-
20
# Genes damaged by the relevant mutations.
# NON UNIQ: genes are unique per mutation, but the concatenation across
# mutations is not de-duplicated.
#
# methods           - damage prediction methods handed to `damaged?`.
# add_exon_junction - when true, mutations in an exon junction whose type is
#                     not 'none' also contribute the genes of the transcripts
#                     with affected splicing.
dep :relevant_mutations
returns "Ensembl Gene ID"
input :methods, :array, "Damage prediction methods", [:sift, :mutation_assessor]
input :add_exon_junction, :boolean, "Add exon junction mutations", true
task :damaged_genes => :annotations do |methods, add_exon_junction|
  mutations = step(:relevant_mutations).load

  # Evaluate damage once over every isoform, then look results up per mutation.
  every_isoform = mutations.mutated_isoforms.compact.flatten
  damaged_by_isoform = Misc.process_to_hash(every_isoform) { |isoforms| isoforms.damaged?(methods) }

  per_mutation = mutations.collect do |mutation|
    hit = []

    if add_exon_junction && mutation.in_exon_junction? && mutation.type != 'none'
      hit.concat mutation.transcripts_with_affected_splicing.gene
    end

    isoforms = mutation.mutated_isoforms
    unless isoforms.nil?
      hit.concat isoforms.select { |isoform| damaged_by_isoform[isoform] }.protein.gene.compact.uniq
    end

    hit.uniq
  end

  Gene.setup(per_mutation.compact.flatten, "Ensembl Gene ID", organism)
end
43
-
44
# Passes the :relevant_genes result (annotations stripped) to `Misc.counts`;
# presumably a gene => occurrences tally -- TODO confirm. Yields an empty
# hash when there are no relevant genes.
dep :relevant_genes
task :gene_mutation_count => :yaml do
  loaded_genes = step(:relevant_genes).load
  loaded_genes.any? ? Misc.counts(loaded_genes.clean_annotations) : {}
end
53
-
54
# Genes whose count in :gene_mutation_count is above the recurrence cut-off.
# (Original marker: NON UNIQ.)
#
# percentage - percentage of the cohort size used as threshold. A gene is
#              kept only when its count is strictly greater than both 1 and
#              that threshold.
dep :gene_mutation_count
input :percentage, :float, "Minimum percentage of samples with the mutation", 0
returns "Ensembl Gene ID"
task :recurrent_genes => :annotations do |percentage|
  counts = step(:gene_mutation_count).load
  threshold = (study.cohort.length.to_f * percentage.to_f) / 100.0

  recurrent = counts.select do |_gene, count|
    count > 1 && count > threshold
  end

  Gene.setup(recurrent.collect { |gene, _count| gene }, "Ensembl Gene ID", organism)
end
70
-
71
# Union (de-duplicated) of the damaged and the recurrent genes.
dep :damaged_genes
dep :recurrent_genes
returns "Ensembl Gene ID"
task :suspect_genes => :annotations do
  damaged = step(:damaged_genes).load
  recurrent = step(:recurrent_genes).load

  combined = (damaged + recurrent).flatten.uniq
  Gene.setup(combined, "Ensembl Gene ID", organism)
end
80
-
81
# Relevant mutations that overlap at least one recurrent gene. Mutations
# whose `genes` is nil are dropped.
dep :relevant_mutations
dep :recurrent_genes
task :mutations_over_recurrent_genes => :annotations do
  mutations = step(:relevant_mutations).load
  recurrent = step(:recurrent_genes).load

  mutations.select do |mutation|
    mutation.genes && (mutation.genes & recurrent).any?
  end
end
89
-
90
# Relevant mutations that overlap at least one suspect gene. Mutations whose
# `genes` is nil are dropped.
dep :relevant_mutations
dep :suspect_genes
task :mutations_over_suspect_genes => :annotations do
  mutations = step(:relevant_mutations).load
  suspects = step(:suspect_genes).load

  mutations.select do |mutation|
    mutation.genes && (mutation.genes & suspects).any?
  end
end
98
-
99
require 'rbbt/mutation/oncodriveFM'

# Runs OncodriveFM over the study cohort and tags the resulting TSV with the
# study organism as its namespace.
task :oncodriveFM => :tsv do
  OncodriveFM.process_cohort(study.cohort).tap do |result|
    result.namespace = organism
  end
end
@@ -1,134 +0,0 @@
1
- require 'rbbt/entity/genotype'
2
-
3
- require 'rbbt/entity/study/genotypes/samples'
4
- require 'rbbt/entity/study/genotypes/mutations'
5
- require 'rbbt/entity/study/genotypes/genes'
6
- require 'rbbt/entity/study/genotypes/enrichment'
7
- require 'rbbt/entity/study/genotypes/knowledge_base'
8
-
9
# Genotype-related tasks of the study workflow.
module StudyWorkflow
  # Organism code as recorded in the study metadata.
  helper :organism do
    study.metadata[:organism]
  end

  # Disabled task, kept for reference.
  #task :binomial_significance => :tsv do
  #  tsv = TSV.setup({}, :key_field => "Ensembl Gene ID", :fields => ["Matches", "Bases", "Frequency", "p.value"], :namespace => organism)
  #  matches = study.knowledge_base.get_index(:mutation_genes).keys
  #  genes = matches.collect{|m| m.partition("~").last}.uniq
  #  all_mutations = matches.collect{|m| m.partition("~").first}.uniq
  #  total_bases = Gene.gene_list_exon_bases(genes)
  #  global_frequency = all_mutations.length.to_f / total_bases
  #  gene2exon_size = Misc.process_to_hash(genes){|genes| genes.collect{|gene| Gene.gene_list_exon_bases([gene]) }}
  #  genes.each do |gene|
  #    mutations = study.knowledge_base.parents(:mutation_genes, gene).target
  #    mutations = study.knowledge_base.subset(:sample_mutations, "Genomic Mutation" => mutations, "Sample" => :all).source
  #    next if mutations.empty?
  #    matches = mutations.length
  #    exon_bases = gene2exon_size[gene]
  #    next if exon_bases == 0
  #    frequency = matches.to_f / exon_bases
  #    pvalue = RSRuby.instance.binom_test(matches, exon_bases, global_frequency, 'greater')["p.value"]
  #    tsv[gene] = [matches, exon_bases, frequency, pvalue]
  #  end
  #  tsv
  #end

  # Per-gene overview of the genotyped samples.
  #
  # Result: a :double TSV keyed by "Ensembl Gene ID" listing, for every gene
  # hit by some mutation, the samples in which it is mutated, affected and
  # damaged, plus its "p.value" from NKIWorkflow's :significantly_mutated
  # job ("> 0.1" when the gene is not in that result). The empty table is
  # returned when the study has no mutations.
  task :genotype_overview => :tsv do
    gene_overview = TSV.setup({},
                              :key_field => "Ensembl Gene ID",
                              :fields => ["Samples with gene mutated", "Samples with gene affected", "Samples with gene damaged", "Mutation significance"],
                              :type => :double)
    genotyped_samples = study.samples.select_by(:has_genotype?)
    all_mutations = study.all_mutations

    if all_mutations.empty?
      gene_overview
    else
      log :affected_genes, "Computing how genes are affected by mutations"
      mutation_genes = Misc.process_to_hash(all_mutations){|mutations| mutations.genes}
      mutation_affected_genes = Misc.process_to_hash(all_mutations){|mutations| mutations.affected_genes}
      if all_mutations.length < 5000
        log :damaged_genes, "Computing damaged genes"
        mutation_damaged_genes = Misc.process_to_hash(all_mutations){|mutations| mutations.damaged_genes}
      else
        # With 5000 or more mutations the damage computation is skipped and
        # every mutation maps to nil.
        mutation_damaged_genes = Misc.process_to_hash(all_mutations){|mutations| [nil] * mutations.length}
      end

      log :significance, "Computing mutation significance"
      mutation_significance = NKIWorkflow.job(:significantly_mutated, study, :study => study, :threshold => 0.1).run
      log :significance, "Reordering mutation significance file"

      mutation_significance.identifiers = Organism.identifiers(study.organism)
      mutation_significance = mutation_significance.change_key "Ensembl Gene ID"

      log :samples, "Gathering affected samples"
      # Slot order matches the first three output fields:
      # 0 = mutated, 1 = affected, 2 = damaged.
      status_sources = [mutation_genes, mutation_affected_genes, mutation_damaged_genes]
      samples_gene_status = {}
      genotyped_samples.each do |sample|
        gene_status = samples_gene_status[sample] = {}
        sample_mutations = sample.mutations

        status_sources.each_with_index do |genes_by_mutation, slot|
          genes_by_mutation.values_at(*sample_mutations).each do |genes|
            # values_at yields nil for mutations without an entry (and for
            # the skipped damage computation); those contribute nothing.
            next if genes.nil?
            genes.each do |gene|
              gene_status[gene] ||= [false, false, false]
              gene_status[gene][slot] = true
            end
          end
        end
      end

      log :compiling, "Compiling result"
      mutation_genes.values.compact.flatten.uniq.each do |gene|
        row = (0..2).collect do |slot|
          samples_gene_status.select{|sample, gene_status| gene_status.include?(gene) && gene_status[gene][slot]}.collect{|sample, gene_status| sample}
        end
        row << [mutation_significance.include?(gene) ? mutation_significance[gene]["p.value"] : "> 0.1"]
        gene_overview[gene] = row
      end

      gene_overview
    end
  end
end
111
-
112
# Genotype-related helpers for Study entities.
module Study
  # Whether the study directory has a genotypes entry.
  def has_genotypes?
    dir.genotypes.exists?
  end

  # Only the writer is generated: the reader is hand-written below, and
  # generating it too would merely be redefined (with a Ruby warning).
  attr_writer :watson

  # Explicitly assigned value, or metadata[:watson] on first use. The nil
  # check (rather than ||=) keeps an explicit `false` from being overwritten.
  def watson
    @watson = metadata[:watson] if @watson.nil?
    @watson
  end

  # Every entry directly under the study's genotypes directory.
  def genotype_files
    dir.genotypes.glob("*")
  end

  # One GenomicMutation list per genotype file (memoized). Each list holds
  # the file's lines sorted, is set up with the file's basename, the study
  # organism and the watson flag, and the enclosing array is extended with
  # Genotype::Cohort.
  def cohort
    @cohort ||= genotype_files.collect do |f|
      name = File.basename(f)
      genomic_mutations = Open.read(f).split("\n").sort
      GenomicMutation.setup(genomic_mutations, name, organism, watson)
    end.tap{|cohort| cohort.extend Genotype::Cohort}
  end
end
@@ -1,56 +0,0 @@
1
- require 'rbbt/workflow'
2
-
3
- Workflow.require_workflow "MutationEnrichment"
4
# Pathway-enrichment tasks of the study workflow; both delegate to the
# MutationEnrichment workflow and copy its coverage info onto this job.
module StudyWorkflow

  #{{{ SAMPLE ENRICHMENT
  # Sample-based enrichment: builds a "Genomic Mutation" => samples table
  # restricted to the chosen mutation subset and runs
  # MutationEnrichment's :sample_pathway_enrichment on it.
  input :database, :string
  input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
  input :baseline, :select, "Type of baseline to use", :pathway_base_counts, :select_options => [:pathway_base_counts, :pathway_gene_counts]
  input :permutations, :integer, "Number of permutations in test", 10000
  input :fdr, :boolean, "BH FDR corrections", true
  input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
  task :sample_pathway_enrichment => :tsv do |database,mutation_subset,baseline,permutations,fdr,masked_genes|

    # NOTE(review): `send` dispatches on an input-supplied method name and no
    # :select_options restrict it -- consider whitelisting the valid subsets.
    mutations = study.send(mutation_subset)

    mutation_tsv = TSV.setup({}, :key_field => "Genomic Mutation", :fields => ["Sample"], :type => :flat)

    study.cohort.each do |genotype|
      sample = genotype.jobname
      genotype.each do |mutation|
        next unless mutations.include? mutation
        mutation_tsv[mutation] ||= []
        mutation_tsv[mutation] << sample
      end
    end

    job = MutationEnrichment.job(:sample_pathway_enrichment, study,
                                 :mutations => mutation_tsv, :database => database, :baseline => baseline, :fdr => fdr,
                                 :masked_genes => masked_genes, :organism => study.organism, :permutations => permutations)

    res = job.run
    set_info :total_covered, job.info[:total_covered]
    set_info :covered_mutations, job.info[:covered_mutations]
    res
  end

  #{{{ METAGENOTYPE ENRICHMENT
  # Mutation-based enrichment: hands the chosen mutation subset directly to
  # MutationEnrichment's :mutation_pathway_enrichment.
  input :database, :string
  input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
  input :baseline, :select, "Type of baseline to use", :pathway_base_counts, :select_options => [:pathway_base_counts, :pathway_gene_counts]
  input :fdr, :boolean, "BH FDR corrections", true
  input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
  # The block takes exactly the five declared inputs; the organism is read
  # from the study itself.
  task :mutation_pathway_enrichment => :tsv do |database,mutation_subset,baseline,fdr,masked_genes|

    # NOTE(review): same unrestricted `send` as in :sample_pathway_enrichment.
    mutations = study.send(mutation_subset)

    job = MutationEnrichment.job(:mutation_pathway_enrichment, study,
                                 :mutations => mutations, :database => database, :baseline => baseline, :fdr => fdr,
                                 :masked_genes => masked_genes, :organism => study.organism)
    res = job.run
    set_info :total_covered, job.info[:total_covered]
    set_info :covered_mutations, job.info[:covered_mutations]
    res
  end
end
@@ -1,104 +0,0 @@
1
# Gene-level properties of a Study, derived from its cohort of genotypes.
module Study
  # Unique genes reported for the cohort's metagenotype.
  property :genes_with_overlapping_mutations => :single do
    metagenotype = cohort.metagenotype
    metagenotype.genes.compact.flatten.uniq
  end

  # Mutated isoforms of the relevant mutations whose consequence is not
  # "SYNONYMOUS"; a plain empty array when there are no mutated isoforms.
  property :altered_isoforms => :single do
    candidates = cohort.metagenotype.subset(relevant_mutations).mutated_isoforms.compact.flatten.uniq
    if candidates.empty?
      []
    else
      candidates.select_by(:consequence) { |consequence| consequence != "SYNONYMOUS" }
    end
  end

  # Unique genes of the transcripts behind the altered isoforms.
  property :genes_with_altered_isoform_sequence => :single do
    isoforms = altered_isoforms
    if isoforms.empty?
      []
    else
      isoforms.transcript.compact.gene.uniq
    end
  end

  # Altered isoforms selected by `damaged?`; extra arguments are forwarded
  # to that selection.
  property :damaged_isoforms => :single do |*args|
    isoforms = altered_isoforms
    if isoforms.empty?
      []
    else
      isoforms.select_by(:damaged?, *args)
    end
  end

  # Unique genes of the transcripts behind the damaged isoforms.
  property :genes_with_damaged_isoforms => :single do |*args|
    damaged = damaged_isoforms(*args)
    if damaged.empty?
      []
    else
      damaged.transcript.gene.uniq
    end
  end

  # Unique genes of the transcripts whose splicing is affected by the
  # relevant mutations.
  property :genes_with_affected_splicing_sites => :single do
    relevant_subset = cohort.metagenotype.subset(relevant_mutations)
    relevant_subset.transcripts_with_affected_splicing.compact.flatten.uniq.gene.compact.uniq
  end

  # Genes with an altered isoform sequence or an affected splicing site.
  property :affected_genes => :single do
    combined = genes_with_altered_isoform_sequence + genes_with_affected_splicing_sites
    Gene.setup(combined, "Ensembl Gene ID", organism).uniq
  end

  # Genes with a damaged isoform or an affected splicing site.
  property :damaged_genes => :single do |*args|
    combined = (genes_with_damaged_isoforms(*args) + genes_with_affected_splicing_sites).uniq
    Gene.setup(combined, "Ensembl Gene ID", organism)
  end

  # Hash of gene => names of the genotypes holding a damaging mutation that
  # damages an isoform of the gene or affects its splicing.
  property :samples_with_gene_damaged => :single do
    damaging = damaging_mutations

    samples_by_gene = {}
    cohort.each do |genotype|
      genotype.each do |mutation|
        next unless damaging.include? mutation

        hit_genes = []
        isoforms = mutation.mutated_isoforms
        unless isoforms.nil? || isoforms.empty?
          hit_genes.concat isoforms.select_by(:damaged?).transcript.gene
        end
        hit_genes.concat mutation.transcripts_with_affected_splicing.gene

        hit_genes.uniq.each do |gene|
          (samples_by_gene[gene] ||= []) << genotype.jobname
        end
      end
    end
    samples_by_gene
  end

  # Hash of gene => names of the genotypes holding a relevant mutation with
  # a non-synonymous isoform of the gene or an effect on its splicing.
  property :samples_with_gene_affected => :single do
    relevant = relevant_mutations

    samples_by_gene = {}
    cohort.each do |genotype|
      genotype.each do |mutation|
        next if mutation.nil?
        next unless relevant.include? mutation

        hit_genes = []
        isoforms = mutation.mutated_isoforms
        unless isoforms.nil? || isoforms.empty?
          hit_genes.concat isoforms.select_by(:non_synonymous).transcript.gene
        end
        hit_genes.concat mutation.transcripts_with_affected_splicing.gene

        hit_genes.uniq.each do |gene|
          (samples_by_gene[gene] ||= []) << genotype.jobname
        end
      end
    end
    samples_by_gene
  end

  # :list TSV keyed by "Ensembl Gene ID" with one column per genotyped
  # sample, holding "TRUE" where the gene is among the sample's affected
  # genes and "FALSE" elsewhere. Genes affected in no sample get no row.
  property :gene_sample_matrix => :single do
    genotyped = samples.select { |sample| sample.has_genotype? }.sort.uniq

    matrix = TSV.setup({}, :key_field => "Ensembl Gene ID", :namespace => organism, :type => :list, :fields => genotyped)

    sample_count = genotyped.length
    genotyped.each_with_index do |sample, column|
      sample_genes = sample.affected_genes
      next if sample_genes.empty?

      sample_genes.clean_annotations.each do |gene|
        matrix[gene] ||= ["FALSE"] * sample_count
        matrix[gene][column] = "TRUE"
      end
    end

    matrix.fields = genotyped

    matrix
  end

  # Genes affected in at least `min` samples; `min` is the first argument
  # and defaults to 2 when that argument is nil or missing.
  property :recurrent_genes => :single do |*args|
    min = args.first.nil? ? 2 : args.first

    frequent = samples_with_gene_affected.select { |gene, samples| samples.length >= min }
    Gene.setup(frequent.collect { |gene, samples| gene }, "Ensembl Gene ID", organism)
  end
end
@@ -1,81 +0,0 @@
1
- require 'rbbt/workflow'
2
- Workflow.require_workflow "Genomics"
3
- require 'rbbt/entity/gene'
4
- require 'rbbt/entity/genomic_mutation'
5
-
6
# Knowledge-base registries for Study: each entry is a Proc that receives
# (study, database) and builds a TSV on demand.
module Study

  # The three mutation => genes registries differ only in the mutation
  # property they read, so they are generated from this table.
  {
    :mutation_genes          => :genes,
    :mutation_affected_genes => :affected_genes,
    :mutation_damaged_genes  => :damaged_genes,
  }.each do |registry_name, gene_property|
    self.study_registry[registry_name] = Proc.new do |study, database|
      index = TSV.setup({}, :key_field => "Genomic Mutation", :fields => ["Ensembl Gene ID"], :type => :flat, :namespace => study.organism)
      study.cohort.metagenotype.uniq.each do |mutation|
        index[mutation] = mutation.public_send(gene_property)
      end
      index
    end
  end

  # "Sample" => mutations, for every sample that has a genotype.
  self.study_registry[:sample_mutations] = Proc.new do |study, database|
    index = TSV.setup({}, :key_field => "Sample", :fields => ["Genomic Mutation"], :type => :flat, :namespace => study.organism)

    study.samples.select_by(:has_genotype?).each do |sample|
      index[sample] = sample.mutations
    end

    index
  end

  # "Sample" => per-gene detail for each of the sample's affected genes: the
  # sample's mutations over the gene (joined with ";;") and whether any of
  # them alters an isoform, damages an isoform or lies in an exon junction.
  self.study_registry[:sample_genes] = Proc.new do |study, database|
    tsv = TSV.setup({}, :key_field => "Sample", :fields => ["Ensembl Gene ID", "Genomic Mutation", "Affected isoform", "Damaged isoform", "Exon Junction"], :type => :double, :namespace => study.organism)

    sample_mutations = study.knowledge_base.get_database(:sample_mutations, :source => "Sample")
    all_mutations = study.all_mutations
    mutations2mutated_isoforms = Misc.process_to_hash(all_mutations) { |list| list.any? ? list.mutated_isoforms : [] }
    mutations2exon_junction = Misc.process_to_hash(all_mutations) { |list| list.any? ? list.in_exon_junction? : [] }

    # Builds a fresh list of every distinct mutated isoform on each call.
    isoform_universe = lambda do
      MutatedIsoform.setup(mutations2mutated_isoforms.values.flatten.compact.uniq, study.organism)
    end
    mi2damaged = Misc.process_to_hash(isoform_universe.call) { |isoforms| isoforms.any? ? isoforms.damaged? : [] }
    #mi2damaged = Misc.process_to_hash(MutatedIsoform.setup(mutations2mutated_isoforms.values.flatten.compact.uniq, study.organism)){|mis| [false] * mis.length }
    mi2consequence = Misc.process_to_hash(isoform_universe.call) { |isoforms| isoforms.any? ? isoforms.consequence : [] }

    gene_mutations = study.knowledge_base.get_database(:mutation_genes, :source => "Ensembl Gene ID")
    gene_mutations.unnamed = true
    gene_mutations.entity_options["Genomic Mutation"] = {:watson => study.watson, :organism => study.organism}

    study.samples.select_by(:has_genotype?).each do |sample|
      rows = sample.affected_genes.collect do |gene|
        # The gene's mutations that are also present in this sample.
        shared = gene_mutations[gene] & (sample_mutations[sample] || [])

        if shared && shared.any?
          GenomicMutation.setup(shared, "Mutations in #{ sample } over #{ gene }", study.organism, study.watson)
          in_junction = shared.select { |mutation| mutations2exon_junction[mutation] }.any?

          isoforms = Annotated.flatten mutations2mutated_isoforms.values_at(*shared).compact

          is_affected = isoforms.any? && isoforms.select { |isoform| consequence = mi2consequence[isoform]; !%w(UTR SYNONYMOUS).include?(consequence) }.any?
          is_damaged = isoforms.any? && isoforms.select { |isoform| mi2damaged[isoform] }.any?

          [gene, shared * ";;", is_affected, is_damaged, in_junction]
        else
          [gene, "", false, false, false]
        end
      end

      tsv[sample] = Misc.zip_fields rows
    end

    tsv
  end

end