rbbt-study 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,418 @@
1
#{{{ SAMPLE ENRICHMENT

# Sample-level pathway enrichment.
#
# Loads the mutation list produced by the `mutation_subset` dependency,
# builds a flat "Genomic Mutation" => [samples] table from the study cohort
# restricted to those mutations, and delegates the statistics to the
# MutationEnrichment workflow's :sample_pathway_enrichment task. The
# :total_covered and :covered_mutations info entries of the delegated job
# are copied onto this step, and the delegated job's result is returned.
dep do |jobname, inputs| job(inputs[:mutation_subset] || :relevant_mutations, jobname, inputs) end
input :database, :string, "Database code"
input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
# The default has to be one of :select_options. The previous default (:bases)
# was not, unlike the :mutation_pathway_enrichment task, which defaults to
# :pathway_base_counts.
input :baseline, :select, "Type of baseline to use", :pathway_base_counts, :select_options => [:pathway_base_counts, :pathway_gene_counts]
input :permutations, :integer, "Number of permutations in test", 10000
input :fdr, :boolean, "BH FDR corrections", true
input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
input :organism, :string, "Organism code", metadata[:organism]
task :sample_pathway_enrichment => :tsv do |database,mutation_subset,baseline,permutations,fdr,masked_genes,organism|
  mutations = step(mutation_subset).load

  mutation_tsv = TSV.setup({}, :key_field => "Genomic Mutation", :fields => ["Sample"], :type => :flat)

  # Record, for every selected mutation, the samples (genotype job names)
  # in which it appears.
  study.cohort.each do |genotype|
    sample = genotype.jobname
    genotype.each do |mutation|
      next unless mutations.include? mutation
      mutation_tsv[mutation] ||= []
      mutation_tsv[mutation] << sample
    end
  end

  job = MutationEnrichment.job(:sample_pathway_enrichment, study,
                               :mutations => mutation_tsv, :database => database, :baseline => baseline, :fdr => fdr,
                               :masked_genes => masked_genes, :organism => organism, :permutations => permutations)

  res = job.run
  # Surface the coverage statistics of the delegated job on this step.
  set_info :total_covered, job.info[:total_covered]
  set_info :covered_mutations, job.info[:covered_mutations]
  res
end
34
+
35
#{{{ METAGENOTYPE ENRICHMENT

# Cohort-wide pathway enrichment.
#
# Loads the mutation list from the `mutation_subset` dependency and passes it,
# together with the remaining inputs, to the MutationEnrichment workflow's
# :mutation_pathway_enrichment task. The delegated job's :total_covered and
# :covered_mutations info entries are copied onto this step; its result is
# returned unchanged.
dep do |jobname, inputs|
  job(inputs[:mutation_subset] || :relevant_mutations, jobname, inputs)
end
input :database, :string, "Database code"
input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
input :baseline, :select, "Type of baseline to use", :pathway_base_counts, :select_options => [:pathway_base_counts, :pathway_gene_counts]
input :fdr, :boolean, "BH FDR corrections", true
input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
input :organism, :string, "Organism code", metadata[:organism]
task :mutation_pathway_enrichment => :tsv do |database,mutation_subset,baseline,fdr,masked_genes,organism|
  subset_mutations = step(mutation_subset).load

  enrichment = MutationEnrichment.job(
    :mutation_pathway_enrichment, study,
    :mutations => subset_mutations,
    :database => database,
    :baseline => baseline,
    :fdr => fdr,
    :masked_genes => masked_genes,
    :organism => organism
  )

  result = enrichment.run
  set_info :total_covered, enrichment.info[:total_covered]
  set_info :covered_mutations, enrichment.info[:covered_mutations]
  result
end
55
+
56
+
57
+
58
+
59
+
60
+
61
+ ################################################################
62
+ #{{{ OLD
63
+ ################################################################
64
+
65
+ #{{{ BASE AND GENE COUNTS
66
+
67
+ #input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
68
+ #input :organism, :string, "Organism code", metadata[:organism]
69
+ #task :pathway_base_counts => :tsv do |masked_genes, organism|
70
+ # database = clean_name
71
+ # log :loading_genes, "Loading genes from #{ database } #{ organism }"
72
+ # case database
73
+ # when 'kegg'
74
+ # tsv = KEGG.gene_pathway.tsv :key_field => "KEGG Pathway ID", :fields => ["KEGG Gene ID"], :type => :flat, :persist => true, :merge => true
75
+ # total_genes = Gene.setup(tsv.values.compact.flatten.uniq, "KEGG Gene ID", organism).ensembl.compact
76
+ # when 'go', 'go_bp'
77
+ # tsv = Organism.gene_go_bp(organism).tsv :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :persist => true, :merge => true
78
+ # total_genes = Gene.setup(tsv.values.compact.flatten.uniq, "Ensembl Gene ID", organism).ensembl.compact
79
+ # when 'pfam'
80
+ # tsv = Organism.gene_pfam(organism).tsv :key_field => "Pfam Domain", :fields => ["Ensembl Gene ID"], :type => :flat, :persist => true, :merge => true
81
+ # total_genes = Gene.setup(tsv.values.compact.flatten.uniq, "Ensembl Gene ID", organism).ensembl.compact
82
+ # end
83
+ #
84
+ # tsv.namespace = organism
85
+ #
86
+ # counts = TSV.setup({}, :key_field => tsv.key_field, :fields => ["Bases"], :type => :single, :cast => :to_i, :namespace => organism)
87
+ #
88
+ # log :processing_database, "Processing database #{database}"
89
+ # tsv.with_monitor do
90
+ # tsv.through do |pathway, genes|
91
+ # next if genes.nil? or genes.empty?
92
+ # size = Gene.gene_list_exon_bases(genes.ensembl.compact.remove(masked_genes))
93
+ # counts[pathway] = size
94
+ # end
95
+ # end
96
+ #
97
+ # log :computing_exome_size, "Computing number of exome bases covered by pathway annotations"
98
+ # total_size = Gene.gene_list_exon_bases(total_genes.remove(masked_genes))
99
+ #
100
+ # set_info :total_size, total_size
101
+ # set_info :total_gene_list, total_genes.remove(masked_genes)
102
+ #
103
+ # counts
104
+ #end
105
+ #
106
+ #input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
107
+ #input :organism, :string, "Organism code", metadata[:organism]
108
+ #task :pathway_gene_counts => :tsv do |masked_genes,organism|
109
+ # database = clean_name
110
+ # case database.to_s
111
+ # when 'kegg'
112
+ # tsv = KEGG.gene_pathway.tsv :key_field => "KEGG Pathway ID", :fields => ["KEGG Gene ID"], :type => :flat, :persist => true, :merge => true
113
+ # total_genes = Gene.setup(tsv.values.compact.flatten.uniq, "KEGG Gene ID", organism).ensembl.compact
114
+ # when 'go', 'go_bp'
115
+ # tsv = Organism.gene_go_bp(organism).tsv :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :persist => true, :merge => true
116
+ # total_genes = Gene.setup(tsv.values.compact.flatten.uniq, "Ensembl Gene ID", organism).ensembl.compact
117
+ # when 'pfam'
118
+ # tsv = Organism.gene_pfam(organism).tsv :key_field => "Pfam Domain", :fields => ["Ensembl Gene ID"], :type => :flat, :persist => true, :merge => true
119
+ # total_genes = Gene.setup(tsv.values.compact.flatten.uniq, "Ensembl Gene ID", organism).ensembl.compact
120
+ # end
121
+ #
122
+ # counts = TSV.setup({}, :key_field => tsv.key_field, :fields => ["Genes"], :type => :single, :cast => :to_i, :namespace => organism)
123
+ #
124
+ # tsv.through do |pathway, genes|
125
+ # next if genes.nil? or genes.empty?
126
+ # genes = genes.ensembl.remove(masked_genes)
127
+ # num = genes.length
128
+ # counts[pathway] = num
129
+ # end
130
+ #
131
+ # set_info :total_genes, total_genes.remove(masked_genes).length
132
+ # set_info :total_gene_list, total_genes.remove(masked_genes)
133
+ #
134
+ # counts
135
+ #end
136
+
137
+
138
+ #
139
+ #
140
+ #dep do |jobname, inputs| job(inputs[:baseline], inputs[:database].to_s, inputs) end
141
+ #dep do |jobname, inputs| job(inputs[:mutation_subset] || :relevant_mutations, jobname, inputs) end
142
+ #dep :affected_samples_per_pathway
143
+ #input :database, :string
144
+ #input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
145
+ #input :baseline, :select, "Type of baseline to use", :bases, :select_options => [:pathway_base_counts, :pathway_gene_counts]
146
+ #input :permutations, :integer, "Number of permutations in test", 10000
147
+ #input :fdr, :boolean, "BH FDR corrections", true
148
+ #input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
149
+ #input :organism, :string, "Organism code", metadata[:organism]
150
+ #task :sample_pathway_enrichment_old => :tsv do |database,mutation_subset,baseline,permutations,fdr,masked_genes,organism|
151
+ # pathway_counts = step(baseline).load
152
+ # pathway_counts.unnamed = true
153
+ # total_covered = step(baseline).info[:total_size] || step(baseline).info[:total_genes]
154
+ # total_pathway_genes_list = step(baseline).info[:total_gene_list]
155
+ # mutations = step(mutation_subset).load
156
+ # affected_samples_per_pathway = step(:affected_samples_per_pathway).load
157
+ # affected_samples_per_pathway.namespace = organism
158
+ #
159
+ # affected_genes = mutations.genes.compact.flatten.uniq
160
+ #
161
+ # case database.to_s
162
+ # when 'kegg'
163
+ # database_tsv = KEGG.gene_pathway.tsv :key_field => 'KEGG Pathway ID', :fields => ["KEGG Gene ID"], :type => :flat, :persist => true, :unnamed => false, :merge => true
164
+ # when 'go', 'go_bp'
165
+ # database_tsv = Organism.gene_go_bp(organism).tsv :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :persist => true, :unnamed => false, :merge => true
166
+ # when 'pfam'
167
+ # database_tsv = Organism.gene_pfam(organism).tsv :key_field => "Pfam Domain", :fields => ["Ensembl Gene ID"], :type => :flat, :persist => true, :unnamed => false, :merge => true
168
+ # end
169
+ #
170
+ # covered_genes_per_samples = {}
171
+ # samples = []
172
+ # study.cohort.each do |genotype|
173
+ # samples << genotype.jobname
174
+ # covered_genes_per_samples[genotype.jobname] = genotype.subset(mutations).genes.compact.flatten.subset(total_pathway_genes_list)
175
+ # end
176
+ #
177
+ # sample_mutation_tokens = []
178
+ # samples.collect{|sample| study.cohort[sample].subset(mutations).genes.select{|l| not l.nil? and (l & total_pathway_genes_list).any? }.length.times{ sample_mutation_tokens << sample } }
179
+ #
180
+ # mutation_genes = Misc.process_to_hash(mutations){|list| list.genes}
181
+ # covered_mutations = mutations.select{|mutation|(mutation_genes[mutation] & total_pathway_genes_list).any? }.length
182
+ #
183
+ # pathways = pathway_counts.keys
184
+ #
185
+ # pathway_expected_counts = {}
186
+ # pathway_counts.with_monitor :desc => "Pathway gene counts" do
187
+ # pathway_counts.through do |pathway, count|
188
+ # next unless affected_samples_per_pathway.include?(pathway) and affected_samples_per_pathway[pathway].any?
189
+ # ratio = count.to_f / total_covered
190
+ # num_token_list = RSRuby.instance.rbinom(permutations, sample_mutation_tokens.length, ratio)
191
+ # pathway_expected_counts[pathway] = num_token_list.collect{|num_tokens|
192
+ # Misc.sample(sample_mutation_tokens, num_tokens.to_i).uniq.length
193
+ # }
194
+ # end
195
+ # end
196
+ #
197
+ # tsv = TSV.setup({}, :key_field => affected_samples_per_pathway.key_field, :fields => ["Sample", "Matches", "Expected", "Ratio", "Pathway total", "p-value", "Ensembl Gene ID"], :namespace => organism, :type => :double)
198
+ # affected_samples_per_pathway.through do |pathway, samples|
199
+ # next unless samples.any?
200
+ # next unless pathway_expected_counts.include? pathway
201
+ # pathway_genes = database_tsv[pathway].ensembl
202
+ # samples = samples.uniq.select{|sample| (covered_genes_per_samples[sample] & pathway_genes).any?}
203
+ # count = samples.length
204
+ # expected = Misc.mean(pathway_expected_counts[pathway]).floor
205
+ # pvalue = pathway_expected_counts[pathway].select{|exp_c| exp_c > count}.length.to_f / permutations
206
+ # tsv[pathway] = [samples.sort, [count], [expected], [count.to_f / expected], [pathway_counts[pathway]], [pvalue], pathway_genes.subset(affected_genes)]
207
+ # end
208
+ #
209
+ # FDR.adjust_hash! tsv, 5 if fdr
210
+ #
211
+ # set_info :covered_mutations, covered_mutations
212
+ # set_info :total_covered, total_covered
213
+ #
214
+ # tsv
215
+ #end
216
+ #
217
+ #
218
+ #
219
+ #
220
+ #dep do |jobname, inputs| job(inputs[:baseline], inputs[:database].to_s, inputs) end
221
+ #dep do |jobname, inputs| job(inputs[:mutation_subset] || :relevant_mutations, jobname, inputs) end
222
+ #input :database, :string
223
+ #input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
224
+ #input :baseline, :select, "Type of baseline to use", :bases, :select_options => [:pathway_base_counts, :pathway_gene_counts]
225
+ #input :fdr, :boolean, "BH FDR corrections", true
226
+ #input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
227
+ #input :organism, :string, "Organism code", metadata[:organism]
228
+ #task :mutation_pathway_enrichment_old => :tsv do |database,mutation_subset,baseline,fdr,masked_genes,organism|
229
+ # counts = step(baseline).load
230
+ # total_covered = step(baseline).info[:total_size] || step(baseline).info[:total_genes]
231
+ # mutations = step(mutation_subset).load
232
+ #
233
+ # affected_genes = mutations.genes.compact.flatten.uniq
234
+ #
235
+ # # Get database tsv and native ids
236
+ #
237
+ # case database
238
+ # when 'kegg'
239
+ # database_tsv = KEGG.gene_pathway.tsv :key_field => 'KEGG Gene ID', :fields => ["KEGG Pathway ID"], :type => :flat, :persist => true, :unnamed => true, :merge => true
240
+ # affected_genes_db = affected_genes.to_kegg
241
+ # all_db_genes = Gene.setup(database_tsv.keys, "KEGG Gene ID", organism).ensembl.uniq
242
+ # when 'go', 'go_bp'
243
+ # database_tsv = Organism.gene_go_bp(organism).tsv :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :persist => true, :unnamed => true, :merge => true
244
+ # affected_genes_db = affected_genes
245
+ # all_db_genes = Gene.setup(database_tsv.keys, "KEGG Gene ID", organism).uniq
246
+ # when 'pfam'
247
+ # database_tsv = Organism.gene_pfam(organism).tsv :key_field => "Ensembl Gene ID", :fields => ["Pfam Domain"], :type => :flat, :persist => true, :unnamed => true, :merge => true
248
+ # affected_genes_db = affected_genes
249
+ # all_db_genes = Gene.setup(database_tsv.keys, "KEGG Gene ID", organism).uniq
250
+ # end
251
+ #
252
+ # affected_genes = affected_genes.remove(masked_genes)
253
+ # all_db_genes = all_db_genes.remove(masked_genes)
254
+ #
255
+ # # Annotate each pathway with the affected genes that are involved in it
256
+ #
257
+ # affected_genes_per_pathway = {}
258
+ # affected_genes_db.zip(affected_genes).each do |gene_db,gene|
259
+ # next if gene_db.nil?
260
+ # pathways = database_tsv[gene_db]
261
+ # next if pathways.nil?
262
+ # pathways.uniq.each do |pathway|
263
+ # affected_genes_per_pathway[pathway] ||= []
264
+ # affected_genes_per_pathway[pathway] << gene
265
+ # end
266
+ # end
267
+ #
268
+ # pvalues = TSV.setup({}, :key_field => database_tsv.fields.first, :fields => ["Matches", "Pathway total", "p-value", "Ensembl Gene ID"], :namespace => organism, :type => :double)
269
+ # mutation_genes = Misc.process_to_hash(mutations){|list| list.genes}
270
+ # covered_mutations = mutations.select{|mutation|(mutation_genes[mutation] & all_db_genes).any? }.length
271
+ #
272
+ # affected_genes_per_pathway.each do |pathway, genes|
273
+ # pathway_total = counts[pathway]
274
+ # matches = mutations.select{|mutation| (mutation_genes[mutation] & genes).any? }.length
275
+ # pvalue = RSRuby.instance.binom_test(matches, covered_mutations, pathway_total.to_f / total_covered.to_f, "greater")["p.value"]
276
+ #
277
+ # pvalues[pathway] = [[matches], [pathway_total], [pvalue], affected_genes.subset(genes).uniq.sort_by{|g| g.name || g}]
278
+ # end
279
+ #
280
+ # FDR.adjust_hash! pvalues, 2 if fdr
281
+ #
282
+ # set_info :covered_mutations, covered_mutations
283
+ # set_info :total_covered, total_covered
284
+ #
285
+ # pvalues
286
+ #end
287
+ #
288
+ #
289
+ #dep do |jobname, inputs| job(:pathway_base_counts, inputs[:database].to_s, inputs) end
290
+ #dep do |jobname, inputs| job(inputs[:mutation_subset] || :relevant_mutations, jobname, inputs) end
291
+ #dep :affected_genes
292
+ #input :database, :string
293
+ #input :fdr, :boolean, "BH FDR corrections", true
294
+ #input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
295
+ #
296
+ ##{{{ RATIOS
297
+ #
298
+ #dep :affected_samples_per_pathway
299
+ #dep :affected_genes
300
+ #dep do |name, inputs|
301
+ # database = inputs[:database]
302
+ # if inputs[:type] == :genes
303
+ # job(:pathway_base_counts, database, inputs)
304
+ # else
305
+ # job(:pathway_gene_counts, database, inputs)
306
+ # end
307
+ #end
308
+ #task :pathway_sample_ratios => :tsv do
309
+ # num_samples = study.cohort.length
310
+ #
311
+ # affected_samples_per_pathway = step(:affected_samples_per_pathway).load
312
+ # affected_genes = step(:affected_genes).load
313
+ #
314
+ # ratios = TSV.setup({}, :key_field => sample_pathway_probability.key_field, :fields => ["Num Samples", "Expected", "Ratio", "Ensembl Gene ID"], :namespace => organism)
315
+ #
316
+ # pathways.through do |pathway, probability|
317
+ # next unless affected_samples_per_pathway.include?(pathway) and affected_samples_per_pathway[pathway].length > 1
318
+ # affected_samples = (affected_samples_per_pathway[pathway] || []).length
319
+ # ratios[pathway] = [affected_samples, probability * num_samples, affected_samples.to_f / (probability * num_samples), pathway.genes.ensembl.subset(affected_genes)]
320
+ # end
321
+ #
322
+ # ratios.namespace = organism
323
+ #
324
+ # ratios
325
+ #end
326
+ #
327
+
328
+ #input :database, :string
329
+ #dep :damaging_mutations
330
+ #dep :affected_genes
331
+ #task :pathway_mutation_ratios => :tsv do |database|
332
+ # damaging_mutations = step(:damaging_mutations).load
333
+ #
334
+ # affected_genes = step(:affected_genes).load
335
+ # affected_genes.organism = organism
336
+ #
337
+ # pathways = case database
338
+ # when 'go'
339
+ # affected_genes.go_terms
340
+ # when 'go_bp'
341
+ # affected_genes.go_bp_terms
342
+ # when 'go_cc'
343
+ # affected_genes.go_cc_terms
344
+ # when 'go_mf'
345
+ # affected_genes.go_mf_terms
346
+ # when 'pfam'
347
+ # affected_genes.pfam_domains
348
+ # else
349
+ # affected_genes.send(database + '_pathways')
350
+ # end.compact.flatten.uniq
351
+ #
352
+ # key_field = nil
353
+ # pathway_genes = Misc.process_to_hash(pathways) do |pathways|
354
+ # pathways.uniq.collect do |pathway|
355
+ # case database
356
+ # when 'kegg'
357
+ # KeggPathway.setup(pathway, organism)
358
+ # key_field = "KEGG Pathway ID"
359
+ # Gene.setup(pathway.genes, "KEGG Gene ID", organism).ensembl
360
+ # when 'go', 'go_bp', 'go_mf', 'go_cc'
361
+ # GOTerm.setup(pathway, organism)
362
+ # key_field = "GO ID"
363
+ # Gene.setup(pathway.genes, "Ensembl Gene ID", organism).ensembl
364
+ # when 'pfam'
365
+ # PfamDomain.setup(pathway, organism)
366
+ # key_field = "Pfam Domain"
367
+ # Gene.setup(pathway.genes, "Ensembl Gene ID", organism).ensembl
368
+ # end
369
+ # end
370
+ # end
371
+ #
372
+ # pathway_mutation_ratios = TSV.setup({}, :key_field => key_field,
373
+ # :fields => ["Mutations per MB", "# Mutations", "# Damaging Mut.", "# Genes", "# Bases", "Ensembl Gene ID"], :type => :double)
374
+ #
375
+ #
376
+ # pathway_sizes = Misc.process_to_hash(pathways) do |pathways|
377
+ # pathways.collect{|pathway| Gene.gene_list_exon_bases(pathway_genes[pathway])}
378
+ # end
379
+ #
380
+ # pathways_for_mutations = Misc.process_to_hash(study.cohort.flatten){|all_mutations| Gene.setup(study.cohort.collect{|genotype| genotype.genes }.flatten(1), "Ensembl Gene ID", organism).collect{|genes|
381
+ # genes.nil? ? [] : genes.collect{|gene|
382
+ # case database
383
+ # when 'go'
384
+ # gene.go_terms
385
+ # when 'go_bp'
386
+ # gene.go_bp_terms
387
+ # when 'go_cc'
388
+ # gene.go_cc_terms
389
+ # when 'go_mf'
390
+ # gene.go_mf_terms
391
+ # when 'pfam'
392
+ # gene.pfam_domains
393
+ # else
394
+ # affected_genes.send(database + '_pathways')
395
+ # end
396
+ # }.flatten}
397
+ # }
398
+ #
399
+ # mutations_in_pathway = Misc.process_to_hash(pathways) do |pathways|
400
+ # pathways.collect do |pathway|
401
+ # GenomicMutation.setup(pathways_for_mutations.select{|mut,pths| pths and pths.include? pathway}.collect{|mut, pths| mut}, "Pathway mutations #{pathway} in #{study}", organism, watson)
402
+ # end
403
+ # end
404
+ #
405
+ # pathways.each do |pathway|
406
+ # pathway_mutations = mutations_in_pathway[pathway]
407
+ # next if pathway_mutations.one?
408
+ # pathway_score = pathway_mutations.length.to_f / pathway_sizes[pathway]
409
+ # genes = pathway_mutations.genes.compact.flatten.subset(affected_genes).subset(pathway_genes[pathway])
410
+ #
411
+ # pathway_mutation_ratios[pathway] = [["%.5g" % (pathway_score * 1_000_000)], [pathway_mutations.length], [pathway_mutations.subset(damaging_mutations).length], [pathway_genes[pathway].length], [pathway_sizes[pathway]], genes]
412
+ # end
413
+ #
414
+ # pathway_mutation_ratios.namespace = organism
415
+ #
416
+ # pathway_mutation_ratios
417
+ #end
418
+ #
@@ -0,0 +1,19 @@
1
module StudyWorkflow
  # Organism code, read from the study metadata.
  helper :organism do
    study.metadata[:organism]
  end

  # Asks the study's "gene_expression" matrix (keyed by "Ensembl Gene ID")
  # to produce its matrix file at this step's path. The block returns nil.
  task :matrix => :tsv do
    expression = study.matrix("gene_expression", "Ensembl Gene ID", organism)
    expression.matrix_file(path)
    nil
  end

  # Asks the same expression matrix to produce a barcode at this step's path.
  # The factor is the first block argument when one is given, otherwise 2.
  # The block returns nil.
  task :expression_barcode => :tsv do |*args|
    barcode_factor = args.first || 2
    expression = study.matrix("gene_expression", "Ensembl Gene ID", organism)
    expression.barcode(path, barcode_factor)
    nil
  end
end
19
+
@@ -0,0 +1,17 @@
1
# Builds a "Sample"-keyed table with one "<gene name>_mut" column per gene in
# `list`; each cell is 1 when the gene is among the sample's mutated genes
# (from the :mutated_genes_per_sample dependency) and 0 otherwise.
dep :mutated_genes_per_sample
input :list, :array, "Gene list in Ensembl Gene ID"
task :gene_features => :tsv do |list|
  genes_by_sample = step(:mutated_genes_per_sample).load

  sample_names = study.cohort.fields
  # NOTE(review): `list.name` presumably relies on `list` being set up as a
  # gene entity list; verify that the array input arrives annotated.
  column_names = list.name.collect { |gene_name| gene_name + "_mut" }
  feature_table = TSV.setup({}, :key_field => "Sample", :fields => column_names)

  sample_names.each do |sample|
    sample_genes = genes_by_sample[sample] || []
    feature_table[sample] = list.collect { |gene|
      if sample_genes.include?(gene)
        1
      else
        0
      end
    }
  end

  feature_table
end
@@ -0,0 +1,104 @@
1
# NON UNIQ
# Genes of every genotype in the cohort, concatenated without deduplication
# (nil entries are dropped per genotype).
returns "Ensembl Gene ID"
task :affected_genes => :annotations do
  genes_per_genotype = study.cohort.collect { |genotype| genotype.genes.compact }
  Gene.setup(genes_per_genotype.flatten, "Ensembl Gene ID", organism)
end
6
+
7
# NON UNIQ
# Genes touched by the relevant mutations. For each mutation this combines
# the genes of transcripts with affected splicing (exon-junction mutations
# only) with the genes of its mutated isoforms' proteins. Genes are unique
# within a mutation but may repeat across mutations.
dep :relevant_mutations
returns "Ensembl Gene ID"
task :relevant_genes => :annotations do
  relevant_mutations = step(:relevant_mutations).load

  genes_per_mutation = relevant_mutations.collect do |mutation|
    splicing_genes =
      if mutation.in_exon_junction?
        mutation.transcripts_with_affected_splicing.gene
      else
        []
      end

    isoforms = mutation.mutated_isoforms
    protein_genes = isoforms.nil? ? [] : isoforms.protein.gene.compact.uniq

    (splicing_genes + protein_genes).uniq
  end

  Gene.setup(genes_per_mutation.compact.flatten, "Ensembl Gene ID", organism)
end
19
+
20
# NON UNIQ
# Genes damaged by the relevant mutations. A gene is collected for a mutation
# when one of its mutated isoforms is reported damaged by `methods`, or, if
# `add_exon_junction` is set, when the mutation lies in an exon junction and
# its type is not 'none'. Genes are unique within a mutation but may repeat
# across mutations.
dep :relevant_mutations
returns "Ensembl Gene ID"
input :methods, :array, "Damage prediction methods", [:sift, :mutation_assessor]
input :add_exon_junction, :boolean, "Add exon junction mutations", true
task :damaged_genes => :annotations do |methods, add_exon_junction|
  relevant_mutations = step(:relevant_mutations).load

  # Evaluate damage once over the whole isoform pool and look it up per isoform.
  isoform_pool = relevant_mutations.mutated_isoforms.compact.flatten
  damaged_by_isoform = Misc.process_to_hash(isoform_pool) { |isoforms| isoforms.damaged?(methods) }

  genes_per_mutation = relevant_mutations.collect do |mutation|
    hit_genes = []

    if add_exon_junction && mutation.in_exon_junction? && mutation.type != 'none'
      hit_genes.concat mutation.transcripts_with_affected_splicing.gene
    end

    isoforms = mutation.mutated_isoforms
    unless isoforms.nil?
      damaged_isoforms = isoforms.select { |isoform| damaged_by_isoform[isoform] }
      hit_genes.concat damaged_isoforms.protein.gene.compact.uniq
    end

    hit_genes.uniq
  end

  Gene.setup(genes_per_mutation.compact.flatten, "Ensembl Gene ID", organism)
end
43
+
44
# Occurrence count of each gene in the (non-unique) :relevant_genes list;
# an empty hash when that list is empty.
dep :relevant_genes
task :gene_mutation_count => :yaml do
  relevant_genes = step(:relevant_genes).load
  relevant_genes.any? ? Misc.counts(relevant_genes.clean_annotations) : {}
end
53
+
54
# NON UNIQ
# Genes whose count in :gene_mutation_count is greater than 1 and also
# strictly greater than `percentage` percent of the cohort size.
dep :gene_mutation_count
input :percentage, :float, "Minimum percentage of samples with the mutation", 0
returns "Ensembl Gene ID"
task :recurrent_genes => :annotations do |percentage|
  counts_by_gene = step(:gene_mutation_count).load
  threshold = (study.cohort.length.to_f * percentage.to_f) / 100.0

  recurrent = counts_by_gene.select do |_gene, count|
    count > 1 && count > threshold
  end
  recurrent_ids = recurrent.collect { |gene, _count| gene }

  Gene.setup(recurrent_ids, "Ensembl Gene ID", organism)
end
70
+
71
# Deduplicated union of the :damaged_genes and :recurrent_genes results.
dep :damaged_genes
dep :recurrent_genes
returns "Ensembl Gene ID"
task :suspect_genes => :annotations do
  damaged = step(:damaged_genes).load
  recurrent = step(:recurrent_genes).load

  combined = (damaged + recurrent).flatten.uniq
  Gene.setup(combined, "Ensembl Gene ID", organism)
end
80
+
81
# Relevant mutations that have genes and share at least one of them with the
# :recurrent_genes result.
dep :relevant_mutations
dep :recurrent_genes
task :mutations_over_recurrent_genes => :annotations do
  candidates = step(:relevant_mutations).load
  recurrent = step(:recurrent_genes).load

  candidates.select do |mutation|
    mutation.genes && (mutation.genes & recurrent).any?
  end
end
89
+
90
# Relevant mutations that have genes and share at least one of them with the
# :suspect_genes result.
dep :relevant_mutations
dep :suspect_genes
task :mutations_over_suspect_genes => :annotations do
  candidates = step(:relevant_mutations).load
  suspects = step(:suspect_genes).load

  candidates.select do |mutation|
    mutation.genes && (mutation.genes & suspects).any?
  end
end
98
+
99
+ require 'rbbt/mutation/oncodriveFM'
100
# Runs OncodriveFM.process_cohort over the study cohort and sets the
# resulting table's namespace to the study organism.
task :oncodriveFM => :tsv do
  cohort_result = OncodriveFM.process_cohort(study.cohort)
  cohort_result.namespace = organism
  cohort_result
end
@@ -0,0 +1,56 @@
1
+ require 'rbbt/workflow'
2
+
3
+ Workflow.require_workflow "MutationEnrichment"
4
module StudyWorkflow

  #{{{ SAMPLE ENRICHMENT

  # Sample-level pathway enrichment.
  #
  # Obtains the mutation list by calling the method named by
  # `mutation_subset` on the study, builds a flat
  # "Genomic Mutation" => [samples] table from the cohort restricted to those
  # mutations, and delegates to the MutationEnrichment workflow's
  # :sample_pathway_enrichment task using the study's organism. The delegated
  # job's :total_covered and :covered_mutations info entries are copied onto
  # this step, and its result is returned.
  input :database, :string
  input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
  input :baseline, :select, "Type of baseline to use", :pathway_base_counts, :select_options => [:pathway_base_counts, :pathway_gene_counts]
  input :permutations, :integer, "Number of permutations in test", 10000
  input :fdr, :boolean, "BH FDR corrections", true
  input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
  task :sample_pathway_enrichment => :tsv do |database,mutation_subset,baseline,permutations,fdr,masked_genes|

    # NOTE(review): `mutation_subset` is a caller-supplied input dispatched
    # dynamically through `send`; confirm it is restricted to the intended
    # subset names before exposing this task to untrusted callers.
    mutations = study.send(mutation_subset)

    mutation_tsv = TSV.setup({}, :key_field => "Genomic Mutation", :fields => ["Sample"], :type => :flat)

    # Record, for every selected mutation, the samples (genotype job names)
    # in which it appears.
    study.cohort.each do |genotype|
      sample = genotype.jobname
      genotype.each do |mutation|
        next unless mutations.include? mutation
        mutation_tsv[mutation] ||= []
        mutation_tsv[mutation] << sample
      end
    end

    job = MutationEnrichment.job(:sample_pathway_enrichment, study,
                                 :mutations => mutation_tsv, :database => database, :baseline => baseline, :fdr => fdr,
                                 :masked_genes => masked_genes, :organism => study.organism, :permutations => permutations)

    res = job.run
    set_info :total_covered, job.info[:total_covered]
    set_info :covered_mutations, job.info[:covered_mutations]
    res
  end

  #{{{ METAGENOTYPE ENRICHMENT

  # Cohort-wide pathway enrichment.
  #
  # Obtains the mutation list by calling the method named by
  # `mutation_subset` on the study and delegates to the MutationEnrichment
  # workflow's :mutation_pathway_enrichment task using the study's organism.
  # Info entries and result are handled as in :sample_pathway_enrichment.
  input :database, :string
  input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
  input :baseline, :select, "Type of baseline to use", :pathway_base_counts, :select_options => [:pathway_base_counts, :pathway_gene_counts]
  input :fdr, :boolean, "BH FDR corrections", true
  input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
  # The block takes exactly the five declared inputs. A stale sixth
  # `organism` parameter (no matching input, never read) was dropped; the
  # organism comes from `study.organism`.
  task :mutation_pathway_enrichment => :tsv do |database,mutation_subset,baseline,fdr,masked_genes|

    # NOTE(review): same dynamic `send` on a caller-supplied input as in
    # :sample_pathway_enrichment; confirm the accepted values.
    mutations = study.send(mutation_subset)

    job = MutationEnrichment.job(:mutation_pathway_enrichment, study,
                                 :mutations => mutations, :database => database, :baseline => baseline, :fdr => fdr,
                                 :masked_genes => masked_genes, :organism => study.organism)
    res = job.run
    set_info :total_covered, job.info[:total_covered]
    set_info :covered_mutations, job.info[:covered_mutations]
    res
  end
end