rbbt-study 0.2.30 → 0.2.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,24 +0,0 @@
1
module StudyWorkflow
  # Organism stored under the :organism key of the study metadata.
  helper :organism do
    study.metadata[:organism]
  end

  # Looks up the study's "gene_expression" matrix keyed by "Ensembl Gene ID"
  # and calls its matrix_file method with `path`. The block itself yields nil.
  task :matrix => :tsv do
    expression = study.matrix("gene_expression", "Ensembl Gene ID", organism)
    expression.matrix_file(path)
    nil
  end

  # Looks up the same matrix as :matrix and calls its barcode method with
  # `path` and a factor. The factor is the first block argument, replaced by 2
  # when that argument is nil or false. The block itself yields nil.
  # NOTE(review): no `input` is declared for this task, so it is unclear how a
  # factor could ever be supplied -- confirm against the workflow DSL.
  task :expression_barcode => :tsv do |*args|
    factor = args.first
    factor = 2 unless factor

    expression = study.matrix("gene_expression", "Ensembl Gene ID", organism)
    expression.barcode(path, factor)
    nil
  end
end
19
-
20
module Study
  # Reports whether the "gene_expression" entry under dir.matrices answers
  # true to exists?.
  def has_expression?
    expression_entry = dir.matrices["gene_expression"]
    expression_entry.exists?
  end
end
@@ -1,17 +0,0 @@
1
# Builds a Sample x gene table of 0/1 flags: for every field of the study
# cohort, each gene of `list` gets 1 when it is included in that sample's
# entry of the :mutated_genes_per_sample result and 0 otherwise (samples
# without an entry get all zeros). Column names are the gene names suffixed
# with "_mut".
# NOTE(review): `list.name` requires `list` to respond to `name`; a plain
# Array does not -- confirm how the :array input is set up before it gets here.
dep :mutated_genes_per_sample
input :list, :array, "Gene list in Ensembl Gene ID"
task :gene_features => :tsv do |list|
  genes_by_sample = step(:mutated_genes_per_sample).load

  samples = study.cohort.fields
  column_names = list.name.collect{|gene_name| gene_name + "_mut" }
  features = TSV.setup({}, :key_field => "Sample", :fields => column_names)

  samples.each do |sample|
    mutated = genes_by_sample[sample] || []
    features[sample] = list.collect do |gene|
      mutated.include?(gene) ? 1 : 0
    end
  end

  features
end
@@ -1,104 +0,0 @@
1
# NON UNIQ: collects the non-nil genes of every genotype in the cohort and
# flattens them into one list without removing duplicates.
returns "Ensembl Gene ID"
task :affected_genes => :annotations do
  genes_per_genotype = study.cohort.collect do |genotype|
    genotype.genes.compact
  end

  Gene.setup(genes_per_genotype.flatten, "Ensembl Gene ID", organism)
end
6
-
7
# NON UNIQ: for each relevant mutation, gathers the genes of transcripts with
# affected splicing (only when the mutation is in an exon junction) plus the
# genes of its mutated isoforms' proteins. Genes are unique per mutation but
# not across mutations.
dep :relevant_mutations
returns "Ensembl Gene ID"
task :relevant_genes => :annotations do
  mutations = step(:relevant_mutations).load

  genes_per_mutation = mutations.collect do |mutation|
    splicing_genes = if mutation.in_exon_junction?
                       mutation.transcripts_with_affected_splicing.gene
                     else
                       []
                     end

    isoforms = mutation.mutated_isoforms
    protein_genes = isoforms.nil? ? [] : isoforms.protein.gene.compact.uniq

    (splicing_genes + protein_genes).uniq
  end

  Gene.setup(genes_per_mutation.compact.flatten, "Ensembl Gene ID", organism)
end
19
-
20
# NON UNIQ: for each relevant mutation, gathers the genes whose isoforms are
# flagged by `damaged?(methods)` and, when add_exon_junction is set, the genes
# of transcripts with affected splicing for exon-junction mutations whose type
# is not 'none'. Genes are unique per mutation but not across mutations.
dep :relevant_mutations
returns "Ensembl Gene ID"
input :methods, :array, "Damage prediction methods", [:sift, :mutation_assessor]
input :add_exon_junction, :boolean, "Add exon junction mutations", true
task :damaged_genes => :annotations do |methods, add_exon_junction|
  mutations = step(:relevant_mutations).load

  # Damage status is computed once for all isoforms and looked up per mutation.
  isoforms = mutations.mutated_isoforms.compact.flatten
  damaged_by_isoform = Misc.process_to_hash(isoforms){|list| list.damaged?(methods) }

  genes_per_mutation = mutations.collect do |mutation|
    found = []

    if add_exon_junction && mutation.in_exon_junction? && mutation.type != 'none'
      found.concat mutation.transcripts_with_affected_splicing.gene
    end

    mutation_isoforms = mutation.mutated_isoforms
    unless mutation_isoforms.nil?
      found.concat mutation_isoforms.select{|mi| damaged_by_isoform[mi] }.protein.gene.compact.uniq
    end

    found.uniq
  end

  Gene.setup(genes_per_mutation.compact.flatten, "Ensembl Gene ID", organism)
end
43
-
44
# Counts the entries of the :relevant_genes result with Misc.counts (after
# clean_annotations); yields an empty hash when there are no relevant genes.
dep :relevant_genes
task :gene_mutation_count => :yaml do
  genes = step(:relevant_genes).load
  genes.any? ? Misc.counts(genes.clean_annotations) : {}
end
53
-
54
# NON UNIQ: keeps the genes from :gene_mutation_count whose count is above 1
# and above `percentage` percent of the cohort length.
# NOTE(review): the input is described as a *minimum* percentage but the
# comparison is strict (`>`), so a gene sitting exactly on the threshold is
# dropped -- confirm that this is intended.
dep :gene_mutation_count
input :percentage, :float, "Minimum percentage of samples with the mutation", 0
returns "Ensembl Gene ID"
task :recurrent_genes => :annotations do |percentage|
  counts = step(:gene_mutation_count).load

  cohort_size = study.cohort.length.to_f
  minimum = (cohort_size * percentage.to_f) / 100.0

  recurrent = counts.select do |_gene, count|
    count > 1 && count > minimum
  end

  Gene.setup(recurrent.collect{|gene, _count| gene }, "Ensembl Gene ID", organism)
end
70
-
71
# Union of the :damaged_genes and :recurrent_genes results, flattened and
# deduplicated.
dep :damaged_genes
dep :recurrent_genes
returns "Ensembl Gene ID"
task :suspect_genes => :annotations do
  damaged = step(:damaged_genes).load
  recurrent = step(:recurrent_genes).load

  combined = (damaged + recurrent).flatten.uniq
  Gene.setup(combined, "Ensembl Gene ID", organism)
end
80
-
81
# Relevant mutations that have genes and share at least one of them with the
# :recurrent_genes result.
dep :relevant_mutations
dep :recurrent_genes
task :mutations_over_recurrent_genes => :annotations do
  mutations = step(:relevant_mutations).load
  recurrent = step(:recurrent_genes).load

  mutations.select do |mutation|
    mutation.genes && (mutation.genes & recurrent).any?
  end
end
89
-
90
# Relevant mutations that have genes and share at least one of them with the
# :suspect_genes result.
dep :relevant_mutations
dep :suspect_genes
task :mutations_over_suspect_genes => :annotations do
  mutations = step(:relevant_mutations).load
  suspects = step(:suspect_genes).load

  mutations.select do |mutation|
    mutation.genes && (mutation.genes & suspects).any?
  end
end
98
-
99
- require 'rbbt/mutation/oncodriveFM'
100
# Runs OncodriveFM.process_cohort over the study cohort and sets the
# organism as the namespace of the resulting TSV before returning it.
task :oncodriveFM => :tsv do
  OncodriveFM.process_cohort(study.cohort).tap do |result|
    result.namespace = organism
  end
end
@@ -1,134 +0,0 @@
1
- require 'rbbt/entity/genotype'
2
-
3
- require 'rbbt/entity/study/genotypes/samples'
4
- require 'rbbt/entity/study/genotypes/mutations'
5
- require 'rbbt/entity/study/genotypes/genes'
6
- require 'rbbt/entity/study/genotypes/enrichment'
7
- require 'rbbt/entity/study/genotypes/knowledge_base'
8
-
9
module StudyWorkflow
  # Organism stored under the :organism key of the study metadata.
  helper :organism do
    study.metadata[:organism]
  end

  # Disabled binomial-significance task, left commented out as in the original.
  #task :binomial_significance => :tsv do
  #  tsv = TSV.setup({}, :key_field => "Ensembl Gene ID", :fields => ["Matches", "Bases", "Frequency", "p.value"], :namespace => organism)
  #  matches = study.knowledge_base.get_index(:mutation_genes).keys
  #  genes = matches.collect{|m| m.partition("~").last}.uniq
  #  all_mutations = matches.collect{|m| m.partition("~").first}.uniq
  #  total_bases = Gene.gene_list_exon_bases(genes)
  #  global_frequency = all_mutations.length.to_f / total_bases
  #  gene2exon_size = Misc.process_to_hash(genes){|genes| genes.collect{|gene| Gene.gene_list_exon_bases([gene]) }}
  #  genes.each do |gene|
  #    mutations = study.knowledge_base.parents(:mutation_genes, gene).target
  #    mutations = study.knowledge_base.subset(:sample_mutations, "Genomic Mutation" => mutations, "Sample" => :all).source
  #    next if mutations.empty?
  #    matches = mutations.length
  #    exon_bases = gene2exon_size[gene]
  #    next if exon_bases == 0
  #    frequency = matches.to_f / exon_bases
  #    pvalue = RSRuby.instance.binom_test(matches, exon_bases, global_frequency, 'greater')["p.value"]
  #    tsv[gene] = [matches, exon_bases, frequency, pvalue]
  #  end
  #  tsv
  #end

  # Builds a per-gene overview keyed by "Ensembl Gene ID" with, for each gene,
  # the genotyped samples where it is mutated, affected and damaged, plus its
  # "p.value" from the NKIWorkflow :significantly_mutated job ("> 0.1" when
  # the gene is absent from that result). When the study has no mutations the
  # empty table is returned. Damaged genes are only computed for studies with
  # fewer than 5000 mutations; above that the damaged column stays empty.
  task :genotype_overview => :tsv do
    overview_fields = [
      "Samples with gene mutated",
      "Samples with gene affected",
      "Samples with gene damaged",
      "Mutation significance"
    ]
    gene_overview = TSV.setup({}, :key_field => "Ensembl Gene ID", :fields => overview_fields, :type => :double)

    genotyped_samples = study.samples.select_by(:has_genotype?)
    all_mutations = study.all_mutations

    unless all_mutations.empty?
      log :affected_genes, "Computing how genes are affected by mutations"
      mutation_genes = Misc.process_to_hash(all_mutations){|list| list.genes }
      mutation_affected_genes = Misc.process_to_hash(all_mutations){|list| list.affected_genes }

      mutation_damaged_genes =
        if all_mutations.length < 5000
          log :damaged_genes, "Computing damaged genes"
          Misc.process_to_hash(all_mutations){|list| list.damaged_genes }
        else
          # Too many mutations: every mutation maps to nil, skipped further down.
          Misc.process_to_hash(all_mutations){|list| [nil] * list.length }
        end

      log :significance, "Computing mutation significance"
      mutation_significance = NKIWorkflow.job(:significantly_mutated, study, :study => study, :threshold => 0.1).run
      log :significance, "Reordering mutation significance file"

      mutation_significance.identifiers = Organism.identifiers(study.organism)
      mutation_significance = mutation_significance.change_key "Ensembl Gene ID"

      log :samples, "Gathering affected samples"

      # Sets one of the three [mutated, affected, damaged] flags for each gene.
      flag_genes = lambda do |status, genes, slot|
        genes.each do |gene|
          status[gene] ||= [false, false, false]
          status[gene][slot] = true
        end
      end

      samples_gene_status = {}
      genotyped_samples.each do |sample|
        status = samples_gene_status[sample] = {}

        mutation_genes.values_at(*sample.mutations).each do |genes|
          flag_genes.call(status, genes, 0)
        end

        mutation_affected_genes.values_at(*sample.mutations).each do |genes|
          flag_genes.call(status, genes, 1)
        end

        mutation_damaged_genes.values_at(*sample.mutations).each do |genes|
          flag_genes.call(status, genes, 2) unless genes.nil?
        end
      end

      log :compiling, "Compiling result"
      mutation_genes.values.compact.flatten.uniq.each do |gene|
        gene_overview[gene] = []

        # One column per status flag, in the order mutated, affected, damaged.
        (0..2).each do |slot|
          flagged = samples_gene_status.select do |_sample, gene_status|
            gene_status.include?(gene) && gene_status[gene][slot]
          end
          gene_overview[gene] << flagged.collect{|sample, _gene_status| sample }
        end

        gene_overview[gene] << [mutation_significance.include?(gene) ? mutation_significance[gene]["p.value"] : "> 0.1"]
      end
    end

    gene_overview
  end
end
111
-
112
module Study
  # Writer for an explicit watson value. Only the writer is generated: the
  # reader is hand-written below, and declaring attr_accessor as well would
  # define the reader twice and trigger Ruby's "method redefined" warning.
  attr_writer :watson

  # Reports whether dir.genotypes answers true to exists?.
  def has_genotypes?
    dir.genotypes.exists?
  end

  # Returns the watson value, reading metadata[:watson] whenever no non-nil
  # value has been stored yet. A stored false is kept; a nil metadata value
  # is looked up again on the next call.
  def watson
    @watson = metadata[:watson] if @watson.nil?
    @watson
  end

  # Entries matching "*" under dir.genotypes.
  def genotype_files
    dir.genotypes.glob("*")
  end

  # Memoized list with one entry per genotype file: the file's lines, sorted,
  # set up through GenomicMutation.setup under the file's basename with the
  # study organism and watson value. The list is extended with
  # Genotype::Cohort.
  def cohort
    @cohort ||= begin
      genotypes = genotype_files.collect do |file|
        name = File.basename(file)
        mutations = Open.read(file).split("\n").sort
        GenomicMutation.setup(mutations, name, organism, watson)
      end
      genotypes.extend Genotype::Cohort
      genotypes
    end
  end
end
@@ -1,56 +0,0 @@
1
- require 'rbbt/workflow'
2
-
3
- Workflow.require_workflow "MutationEnrichment"
4
module StudyWorkflow

  #{{{ SAMPLE ENRICHMENT
  # Builds a flat TSV that maps every mutation of the chosen subset to the
  # jobnames of the cohort genotypes containing it, then delegates to the
  # MutationEnrichment :sample_pathway_enrichment job. The job's
  # :total_covered and :covered_mutations info entries are copied onto this
  # task and the job result is returned.
  input :database, :string
  input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
  input :baseline, :select, "Type of baseline to use", :pathway_base_counts, :select_options => [:pathway_base_counts, :pathway_gene_counts]
  input :permutations, :integer, "Number of permutations in test", 10000
  input :fdr, :boolean, "BH FDR corrections", true
  input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
  task :sample_pathway_enrichment => :tsv do |database,mutation_subset,baseline,permutations,fdr,masked_genes|

    # NOTE(review): mutation_subset is a task input dispatched through `send`,
    # so any method of the study can be invoked by name -- consider checking
    # it against a list of allowed subsets if inputs can be untrusted.
    mutations = study.send(mutation_subset)

    mutation_tsv = TSV.setup({}, :key_field => "Genomic Mutation", :fields => ["Sample"], :type => :flat)

    study.cohort.each do |genotype|
      sample = genotype.jobname
      genotype.each do |mutation|
        next unless mutations.include? mutation
        mutation_tsv[mutation] ||= []
        mutation_tsv[mutation] << sample
      end
    end

    job = MutationEnrichment.job(:sample_pathway_enrichment, study,
                                 :mutations => mutation_tsv, :database => database, :baseline => baseline, :fdr => fdr,
                                 :masked_genes => masked_genes, :organism => study.organism, :permutations => permutations)

    res = job.run
    set_info :total_covered, job.info[:total_covered]
    set_info :covered_mutations, job.info[:covered_mutations]
    res
  end

  #{{{ METAGENOTYPE ENRICHMENT
  # Delegates the chosen mutation subset to the MutationEnrichment
  # :mutation_pathway_enrichment job, copies the job's :total_covered and
  # :covered_mutations info entries onto this task and returns the job result.
  #
  # The block takes exactly one parameter per declared input. A previous
  # sixth parameter named `organism` had no matching input and only shadowed
  # the `organism` helper; the body reads study.organism directly.
  input :database, :string
  input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
  input :baseline, :select, "Type of baseline to use", :pathway_base_counts, :select_options => [:pathway_base_counts, :pathway_gene_counts]
  input :fdr, :boolean, "BH FDR corrections", true
  input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
  task :mutation_pathway_enrichment => :tsv do |database,mutation_subset,baseline,fdr,masked_genes|

    # NOTE(review): same `send` on a task input as in :sample_pathway_enrichment.
    mutations = study.send(mutation_subset)

    job = MutationEnrichment.job(:mutation_pathway_enrichment, study,
                                 :mutations => mutations, :database => database, :baseline => baseline, :fdr => fdr,
                                 :masked_genes => masked_genes, :organism => study.organism)
    res = job.run
    set_info :total_covered, job.info[:total_covered]
    set_info :covered_mutations, job.info[:covered_mutations]
    res
  end
end
@@ -1,104 +0,0 @@
1
module Study
  # Unique, non-nil genes of the cohort metagenotype.
  property :genes_with_overlapping_mutations => :single do
    cohort.metagenotype.genes.compact.flatten.uniq
  end

  # Unique mutated isoforms of the relevant mutations whose consequence is
  # not "SYNONYMOUS"; an empty array when there are no mutated isoforms.
  property :altered_isoforms => :single do
    isoforms = cohort.metagenotype.subset(relevant_mutations).mutated_isoforms.compact.flatten.uniq
    if isoforms.empty?
      []
    else
      isoforms.select_by(:consequence){|consequence| consequence != "SYNONYMOUS" }
    end
  end

  # Unique genes of the transcripts of the altered isoforms.
  property :genes_with_altered_isoform_sequence => :single do
    isoforms = altered_isoforms
    if isoforms.empty?
      []
    else
      isoforms.transcript.compact.gene.uniq
    end
  end

  # Altered isoforms selected by `damaged?`; extra arguments are forwarded
  # to select_by.
  property :damaged_isoforms => :single do |*args|
    isoforms = altered_isoforms
    if isoforms.empty?
      []
    else
      isoforms.select_by(:damaged?, *args)
    end
  end

  # Unique genes of the transcripts of the damaged isoforms.
  property :genes_with_damaged_isoforms => :single do |*args|
    isoforms = damaged_isoforms(*args)
    if isoforms.empty?
      []
    else
      isoforms.transcript.gene.uniq
    end
  end

  # Unique genes of the transcripts whose splicing is affected by the
  # relevant mutations.
  property :genes_with_affected_splicing_sites => :single do
    relevant = cohort.metagenotype.subset(relevant_mutations)
    transcripts = relevant.transcripts_with_affected_splicing.compact.flatten.uniq
    transcripts.gene.compact.uniq
  end

  # Genes with an altered isoform sequence plus genes with affected splicing
  # sites, deduplicated after being set up as genes.
  property :affected_genes => :single do
    combined = genes_with_altered_isoform_sequence + genes_with_affected_splicing_sites
    Gene.setup(combined, "Ensembl Gene ID", organism).uniq
  end

  # Genes with damaged isoforms plus genes with affected splicing sites,
  # deduplicated before being set up as genes.
  property :damaged_genes => :single do |*args|
    combined = genes_with_damaged_isoforms(*args) + genes_with_affected_splicing_sites
    Gene.setup(combined.uniq, "Ensembl Gene ID", organism)
  end

  # Hash from gene to the jobnames of the genotypes in which one of the
  # damaging mutations hits it, through a damaged isoform or a transcript
  # with affected splicing.
  # NOTE(review): a jobname is appended once per matching mutation, so the
  # same sample can appear several times for one gene.
  property :samples_with_gene_damaged => :single do
    damaging = damaging_mutations

    samples_by_gene = {}
    cohort.each do |genotype|
      genotype.each do |mutation|
        next unless damaging.include? mutation

        hit_genes = []
        isoforms = mutation.mutated_isoforms
        unless isoforms.nil? || isoforms.empty?
          hit_genes.concat isoforms.select_by(:damaged?).transcript.gene
        end
        hit_genes.concat mutation.transcripts_with_affected_splicing.gene

        hit_genes.uniq.each do |gene|
          samples_by_gene[gene] ||= []
          samples_by_gene[gene] << genotype.jobname
        end
      end
    end

    samples_by_gene
  end

  # Hash from gene to the jobnames of the genotypes in which one of the
  # relevant mutations hits it, through an isoform selected by
  # :non_synonymous or a transcript with affected splicing.
  # NOTE(review): a jobname is appended once per matching mutation, so the
  # same sample can appear several times for one gene; recurrent_genes below
  # counts these entries as they are -- confirm that this is intended.
  property :samples_with_gene_affected => :single do
    relevant = relevant_mutations

    samples_by_gene = {}
    cohort.each do |genotype|
      genotype.each do |mutation|
        next if mutation.nil?
        next unless relevant.include? mutation

        hit_genes = []
        isoforms = mutation.mutated_isoforms
        unless isoforms.nil? || isoforms.empty?
          hit_genes.concat isoforms.select_by(:non_synonymous).transcript.gene
        end
        hit_genes.concat mutation.transcripts_with_affected_splicing.gene

        hit_genes.uniq.each do |gene|
          samples_by_gene[gene] ||= []
          samples_by_gene[gene] << genotype.jobname
        end
      end
    end

    samples_by_gene
  end

  # TSV keyed by "Ensembl Gene ID" with one column per genotyped sample
  # (sorted, unique); a cell holds "TRUE" when the gene is among that
  # sample's affected genes and "FALSE" otherwise.
  property :gene_sample_matrix => :single do
    genotyped = samples.select{|sample| sample.has_genotype? }.sort.uniq

    matrix = TSV.setup({}, :key_field => "Ensembl Gene ID", :namespace => organism, :type => :list, :fields => genotyped)

    column_count = genotyped.length
    genotyped.each_with_index do |sample, column|
      sample_genes = sample.affected_genes
      unless sample_genes.empty?
        sample_genes.clean_annotations.each do |gene|
          matrix[gene] ||= ["FALSE"] * column_count
          matrix[gene][column] = "TRUE"
        end
      end
    end

    matrix.fields = genotyped

    matrix
  end

  # Genes whose entry in samples_with_gene_affected has at least `min`
  # elements; `min` is the first argument and defaults to 2 when nil.
  property :recurrent_genes => :single do |*args|
    min = args.first.nil? ? 2 : args.first

    recurrent = samples_with_gene_affected.select do |_gene, sample_names|
      sample_names.length >= min
    end

    Gene.setup(recurrent.collect{|gene, _sample_names| gene }, "Ensembl Gene ID", organism)
  end
end
@@ -1,81 +0,0 @@
1
- require 'rbbt/workflow'
2
- Workflow.require_workflow "Genomics"
3
- require 'rbbt/entity/gene'
4
- require 'rbbt/entity/genomic_mutation'
5
-
6
module Study

  # Three registry entries share one shape: a flat TSV from each unique
  # mutation of the cohort metagenotype to the result of one gene-listing
  # method of that mutation. Each pair is [registry key, mutation method].
  [
    [:mutation_genes, :genes],
    [:mutation_affected_genes, :affected_genes],
    [:mutation_damaged_genes, :damaged_genes]
  ].each do |registry_key, gene_method|
    self.study_registry[registry_key] = Proc.new{|study,database|
      tsv = TSV.setup({}, :key_field => "Genomic Mutation", :fields => ["Ensembl Gene ID"], :type => :flat, :namespace => study.organism)
      study.cohort.metagenotype.uniq.each do |mutation|
        tsv[mutation] = mutation.send(gene_method)
      end
      tsv
    }
  end

  # Flat TSV from each genotyped sample to its mutations.
  self.study_registry[:sample_mutations] = Proc.new{|study,database|
    tsv = TSV.setup({}, :key_field => "Sample", :fields => ["Genomic Mutation"], :type => :flat, :namespace => study.organism)

    genotyped = study.samples.select_by(:has_genotype?)
    genotyped.each do |sample|
      tsv[sample] = sample.mutations
    end

    tsv
  }

  # Double TSV from each genotyped sample to, for every affected gene of the
  # sample: the gene, the sample's mutations over it joined with ";;", and
  # three flags telling whether any of those mutations has a mutated isoform
  # with a consequence other than UTR/SYNONYMOUS, has a damaged isoform, or
  # lies in an exon junction. Genes without matching mutations get an empty
  # mutation string and all flags false.
  self.study_registry[:sample_genes] = Proc.new{|study,database|
    sample_gene_fields = ["Ensembl Gene ID", "Genomic Mutation", "Affected isoform", "Damaged isoform", "Exon Junction"]
    tsv = TSV.setup({}, :key_field => "Sample", :fields => sample_gene_fields, :type => :double, :namespace => study.organism)

    sample_mutations = study.knowledge_base.get_database(:sample_mutations, :source => "Sample")
    all_mutations = study.all_mutations

    # Per-mutation and per-isoform lookups are computed in bulk up front.
    mutations2mutated_isoforms = Misc.process_to_hash(all_mutations){|list| list.any? ? list.mutated_isoforms : [] }
    mutations2exon_junction = Misc.process_to_hash(all_mutations){|list| list.any? ? list.in_exon_junction? : [] }

    # Builds a fresh, set-up list of every unique mutated isoform on each call.
    isoform_list = lambda do
      MutatedIsoform.setup(mutations2mutated_isoforms.values.flatten.compact.uniq, study.organism)
    end
    mi2damaged = Misc.process_to_hash(isoform_list.call){|list| list.any? ? list.damaged? : [] }
    # Alternative kept from the original, disabling damage prediction:
    #mi2damaged = Misc.process_to_hash(MutatedIsoform.setup(mutations2mutated_isoforms.values.flatten.compact.uniq, study.organism)){|mis| [false] * mis.length }
    mi2consequence = Misc.process_to_hash(isoform_list.call){|list| list.any? ? list.consequence : [] }

    gene_mutations = study.knowledge_base.get_database(:mutation_genes, :source => "Ensembl Gene ID")
    gene_mutations.unnamed = true
    gene_mutations.entity_options["Genomic Mutation"] = {:watson => study.watson, :organism => study.organism}

    study.samples.select_by(:has_genotype?).each do |sample|
      rows = sample.affected_genes.collect do |gene|
        # `&` against a nil gene entry yields false, which the guard below covers.
        shared = gene_mutations[gene] & (sample_mutations[sample] || [])

        next [gene, "", false, false, false] unless shared && shared.any?

        GenomicMutation.setup(shared, "Mutations in #{ sample } over #{ gene }", study.organism, study.watson)
        in_junction = shared.select{|mutation| mutations2exon_junction[mutation] }.any?

        isoforms = Annotated.flatten mutations2mutated_isoforms.values_at(*shared).compact

        has_affected = (isoforms.any? && isoforms.select{|mi| !%w(UTR SYNONYMOUS).include?(mi2consequence[mi]) }.any?)
        has_damaged = (isoforms.any? && isoforms.select{|mi| mi2damaged[mi] }.any?)

        [gene, shared * ";;", has_affected, has_damaged, in_junction]
      end

      tsv[sample] = Misc.zip_fields rows
    end

    tsv
  }

end