rbbt-study 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,418 @@
1
+ #{{{ SAMPLE ENRICHMENT
2
# Sample-level pathway enrichment.
#
# Depends on the task named by the :mutation_subset input (falling back to
# :relevant_mutations when it is not given). The task body collects, for every
# mutation of that subset, the cohort samples (genotype job names) carrying it,
# and delegates the statistics to the :sample_pathway_enrichment task of the
# MutationEnrichment workflow. The :total_covered and :covered_mutations info
# entries of that job are copied onto this step, and its result is returned.
dep do |jobname, inputs| job(inputs[:mutation_subset] || :relevant_mutations, jobname, inputs) end
input :database, :string, "Database code"
input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
# The default has to be one of the :select_options; it used to be :bases,
# which is not a selectable value.
input :baseline, :select, "Type of baseline to use", :pathway_base_counts, :select_options => [:pathway_base_counts, :pathway_gene_counts]
input :permutations, :integer, "Number of permutations in test", 10000
input :fdr, :boolean, "BH FDR corrections", true
input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
input :organism, :string, "Organism code", metadata[:organism]
task :sample_pathway_enrichment => :tsv do |database,mutation_subset,baseline,permutations,fdr,masked_genes,organism|

  mutations = step(mutation_subset).load

  # Flat table: mutation => list of samples in which it was observed
  mutation_tsv = TSV.setup({}, :key_field => "Genomic Mutation", :fields => ["Sample"], :type => :flat)

  study.cohort.each do |genotype|
    sample = genotype.jobname
    genotype.each do |mutation|
      # Only mutations that belong to the selected subset are considered
      next unless mutations.include? mutation
      mutation_tsv[mutation] ||= []
      mutation_tsv[mutation] << sample
    end
  end

  job = MutationEnrichment.job(:sample_pathway_enrichment, study,
                               :mutations => mutation_tsv, :database => database, :baseline => baseline, :fdr => fdr,
                               :masked_genes => masked_genes, :organism => organism, :permutations => permutations)

  res = job.run
  # Expose the coverage figures of the delegated job on this step as well
  set_info :total_covered, job.info[:total_covered]
  set_info :covered_mutations, job.info[:covered_mutations]
  res
end
34
+
35
+ #{{{ METAGENOTYPE ENRICHMENT
36
# Cohort-wide ("metagenotype") pathway enrichment.
#
# Loads the mutations produced by the dependency named by :mutation_subset and
# forwards them, together with the remaining inputs, to the
# :mutation_pathway_enrichment task of the MutationEnrichment workflow. The
# delegated job's :total_covered and :covered_mutations info entries are copied
# onto this step before its result is returned.
dep { |jobname, inputs| job(inputs[:mutation_subset] || :relevant_mutations, jobname, inputs) }
input :database, :string, "Database code"
input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
input :baseline, :select, "Type of baseline to use", :pathway_base_counts, :select_options => [:pathway_base_counts, :pathway_gene_counts]
input :fdr, :boolean, "BH FDR corrections", true
input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
input :organism, :string, "Organism code", metadata[:organism]
task :mutation_pathway_enrichment => :tsv do |database,mutation_subset,baseline,fdr,masked_genes,organism|
  enrichment_inputs = {
    :mutations => step(mutation_subset).load,
    :database => database,
    :baseline => baseline,
    :fdr => fdr,
    :masked_genes => masked_genes,
    :organism => organism
  }

  enrichment = MutationEnrichment.job(:mutation_pathway_enrichment, study, enrichment_inputs)
  result = enrichment.run

  # Mirror the coverage figures reported by the delegated job
  [:total_covered, :covered_mutations].each do |key|
    set_info key, enrichment.info[key]
  end

  result
end
55
+
56
+
57
+
58
+
59
+
60
+
61
+ ################################################################
62
+ #{{{ OLD
63
+ ################################################################
64
+
65
+ #{{{ BASE AND GENE COUNTS
66
+
67
+ #input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
68
+ #input :organism, :string, "Organism code", metadata[:organism]
69
+ #task :pathway_base_counts => :tsv do |masked_genes, organism|
70
+ # database = clean_name
71
+ # log :loading_genes, "Loading genes from #{ database } #{ organism }"
72
+ # case database
73
+ # when 'kegg'
74
+ # tsv = KEGG.gene_pathway.tsv :key_field => "KEGG Pathway ID", :fields => ["KEGG Gene ID"], :type => :flat, :persist => true, :merge => true
75
+ # total_genes = Gene.setup(tsv.values.compact.flatten.uniq, "KEGG Gene ID", organism).ensembl.compact
76
+ # when 'go', 'go_bp'
77
+ # tsv = Organism.gene_go_bp(organism).tsv :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :persist => true, :merge => true
78
+ # total_genes = Gene.setup(tsv.values.compact.flatten.uniq, "Ensembl Gene ID", organism).ensembl.compact
79
+ # when 'pfam'
80
+ # tsv = Organism.gene_pfam(organism).tsv :key_field => "Pfam Domain", :fields => ["Ensembl Gene ID"], :type => :flat, :persist => true, :merge => true
81
+ # total_genes = Gene.setup(tsv.values.compact.flatten.uniq, "Ensembl Gene ID", organism).ensembl.compact
82
+ # end
83
+ #
84
+ # tsv.namespace = organism
85
+ #
86
+ # counts = TSV.setup({}, :key_field => tsv.key_field, :fields => ["Bases"], :type => :single, :cast => :to_i, :namespace => organism)
87
+ #
88
+ # log :processing_database, "Processing database #{database}"
89
+ # tsv.with_monitor do
90
+ # tsv.through do |pathway, genes|
91
+ # next if genes.nil? or genes.empty?
92
+ # size = Gene.gene_list_exon_bases(genes.ensembl.compact.remove(masked_genes))
93
+ # counts[pathway] = size
94
+ # end
95
+ # end
96
+ #
97
+ # log :computing_exome_size, "Computing number of exome bases covered by pathway annotations"
98
+ # total_size = Gene.gene_list_exon_bases(total_genes.remove(masked_genes))
99
+ #
100
+ # set_info :total_size, total_size
101
+ # set_info :total_gene_list, total_genes.remove(masked_genes)
102
+ #
103
+ # counts
104
+ #end
105
+ #
106
+ #input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
107
+ #input :organism, :string, "Organism code", metadata[:organism]
108
+ #task :pathway_gene_counts => :tsv do |masked_genes,organism|
109
+ # database = clean_name
110
+ # case database.to_s
111
+ # when 'kegg'
112
+ # tsv = KEGG.gene_pathway.tsv :key_field => "KEGG Pathway ID", :fields => ["KEGG Gene ID"], :type => :flat, :persist => true, :merge => true
113
+ # total_genes = Gene.setup(tsv.values.compact.flatten.uniq, "KEGG Gene ID", organism).ensembl.compact
114
+ # when 'go', 'go_bp'
115
+ # tsv = Organism.gene_go_bp(organism).tsv :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :persist => true, :merge => true
116
+ # total_genes = Gene.setup(tsv.values.compact.flatten.uniq, "Ensembl Gene ID", organism).ensembl.compact
117
+ # when 'pfam'
118
+ # tsv = Organism.gene_pfam(organism).tsv :key_field => "Pfam Domain", :fields => ["Ensembl Gene ID"], :type => :flat, :persist => true, :merge => true
119
+ # total_genes = Gene.setup(tsv.values.compact.flatten.uniq, "Ensembl Gene ID", organism).ensembl.compact
120
+ # end
121
+ #
122
+ # counts = TSV.setup({}, :key_field => tsv.key_field, :fields => ["Genes"], :type => :single, :cast => :to_i, :namespace => organism)
123
+ #
124
+ # tsv.through do |pathway, genes|
125
+ # next if genes.nil? or genes.empty?
126
+ # genes = genes.ensembl.remove(masked_genes)
127
+ # num = genes.length
128
+ # counts[pathway] = num
129
+ # end
130
+ #
131
+ # set_info :total_genes, total_genes.remove(masked_genes).length
132
+ # set_info :total_gene_list, total_genes.remove(masked_genes)
133
+ #
134
+ # counts
135
+ #end
136
+
137
+
138
+ #
139
+ #
140
+ #dep do |jobname, inputs| job(inputs[:baseline], inputs[:database].to_s, inputs) end
141
+ #dep do |jobname, inputs| job(inputs[:mutation_subset] || :relevant_mutations, jobname, inputs) end
142
+ #dep :affected_samples_per_pathway
143
+ #input :database, :string
144
+ #input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
145
+ #input :baseline, :select, "Type of baseline to use", :bases, :select_options => [:pathway_base_counts, :pathway_gene_counts]
146
+ #input :permutations, :integer, "Number of permutations in test", 10000
147
+ #input :fdr, :boolean, "BH FDR corrections", true
148
+ #input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
149
+ #input :organism, :string, "Organism code", metadata[:organism]
150
+ #task :sample_pathway_enrichment_old => :tsv do |database,mutation_subset,baseline,permutations,fdr,masked_genes,organism|
151
+ # pathway_counts = step(baseline).load
152
+ # pathway_counts.unnamed = true
153
+ # total_covered = step(baseline).info[:total_size] || step(baseline).info[:total_genes]
154
+ # total_pathway_genes_list = step(baseline).info[:total_gene_list]
155
+ # mutations = step(mutation_subset).load
156
+ # affected_samples_per_pathway = step(:affected_samples_per_pathway).load
157
+ # affected_samples_per_pathway.namespace = organism
158
+ #
159
+ # affected_genes = mutations.genes.compact.flatten.uniq
160
+ #
161
+ # case database.to_s
162
+ # when 'kegg'
163
+ # database_tsv = KEGG.gene_pathway.tsv :key_field => 'KEGG Pathway ID', :fields => ["KEGG Gene ID"], :type => :flat, :persist => true, :unnamed => false, :merge => true
164
+ # when 'go', 'go_bp'
165
+ # database_tsv = Organism.gene_go_bp(organism).tsv :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :persist => true, :unnamed => false, :merge => true
166
+ # when 'pfam'
167
+ # database_tsv = Organism.gene_pfam(organism).tsv :key_field => "Pfam Domain", :fields => ["Ensembl Gene ID"], :type => :flat, :persist => true, :unnamed => false, :merge => true
168
+ # end
169
+ #
170
+ # covered_genes_per_samples = {}
171
+ # samples = []
172
+ # study.cohort.each do |genotype|
173
+ # samples << genotype.jobname
174
+ # covered_genes_per_samples[genotype.jobname] = genotype.subset(mutations).genes.compact.flatten.subset(total_pathway_genes_list)
175
+ # end
176
+ #
177
+ # sample_mutation_tokens = []
178
+ # samples.collect{|sample| study.cohort[sample].subset(mutations).genes.select{|l| not l.nil? and (l & total_pathway_genes_list).any? }.length.times{ sample_mutation_tokens << sample } }
179
+ #
180
+ # mutation_genes = Misc.process_to_hash(mutations){|list| list.genes}
181
+ # covered_mutations = mutations.select{|mutation|(mutation_genes[mutation] & total_pathway_genes_list).any? }.length
182
+ #
183
+ # pathways = pathway_counts.keys
184
+ #
185
+ # pathway_expected_counts = {}
186
+ # pathway_counts.with_monitor :desc => "Pathway gene counts" do
187
+ # pathway_counts.through do |pathway, count|
188
+ # next unless affected_samples_per_pathway.include?(pathway) and affected_samples_per_pathway[pathway].any?
189
+ # ratio = count.to_f / total_covered
190
+ # num_token_list = RSRuby.instance.rbinom(permutations, sample_mutation_tokens.length, ratio)
191
+ # pathway_expected_counts[pathway] = num_token_list.collect{|num_tokens|
192
+ # Misc.sample(sample_mutation_tokens, num_tokens.to_i).uniq.length
193
+ # }
194
+ # end
195
+ # end
196
+ #
197
+ # tsv = TSV.setup({}, :key_field => affected_samples_per_pathway.key_field, :fields => ["Sample", "Matches", "Expected", "Ratio", "Pathway total", "p-value", "Ensembl Gene ID"], :namespace => organism, :type => :double)
198
+ # affected_samples_per_pathway.through do |pathway, samples|
199
+ # next unless samples.any?
200
+ # next unless pathway_expected_counts.include? pathway
201
+ # pathway_genes = database_tsv[pathway].ensembl
202
+ # samples = samples.uniq.select{|sample| (covered_genes_per_samples[sample] & pathway_genes).any?}
203
+ # count = samples.length
204
+ # expected = Misc.mean(pathway_expected_counts[pathway]).floor
205
+ # pvalue = pathway_expected_counts[pathway].select{|exp_c| exp_c > count}.length.to_f / permutations
206
+ # tsv[pathway] = [samples.sort, [count], [expected], [count.to_f / expected], [pathway_counts[pathway]], [pvalue], pathway_genes.subset(affected_genes)]
207
+ # end
208
+ #
209
+ # FDR.adjust_hash! tsv, 5 if fdr
210
+ #
211
+ # set_info :covered_mutations, covered_mutations
212
+ # set_info :total_covered, total_covered
213
+ #
214
+ # tsv
215
+ #end
216
+ #
217
+ #
218
+ #
219
+ #
220
+ #dep do |jobname, inputs| job(inputs[:baseline], inputs[:database].to_s, inputs) end
221
+ #dep do |jobname, inputs| job(inputs[:mutation_subset] || :relevant_mutations, jobname, inputs) end
222
+ #input :database, :string
223
+ #input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
224
+ #input :baseline, :select, "Type of baseline to use", :bases, :select_options => [:pathway_base_counts, :pathway_gene_counts]
225
+ #input :fdr, :boolean, "BH FDR corrections", true
226
+ #input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
227
+ #input :organism, :string, "Organism code", metadata[:organism]
228
+ #task :mutation_pathway_enrichment_old => :tsv do |database,mutation_subset,baseline,fdr,masked_genes,organism|
229
+ # counts = step(baseline).load
230
+ # total_covered = step(baseline).info[:total_size] || step(baseline).info[:total_genes]
231
+ # mutations = step(mutation_subset).load
232
+ #
233
+ # affected_genes = mutations.genes.compact.flatten.uniq
234
+ #
235
+ # # Get database tsv and native ids
236
+ #
237
+ # case database
238
+ # when 'kegg'
239
+ # database_tsv = KEGG.gene_pathway.tsv :key_field => 'KEGG Gene ID', :fields => ["KEGG Pathway ID"], :type => :flat, :persist => true, :unnamed => true, :merge => true
240
+ # affected_genes_db = affected_genes.to_kegg
241
+ # all_db_genes = Gene.setup(database_tsv.keys, "KEGG Gene ID", organism).ensembl.uniq
242
+ # when 'go', 'go_bp'
243
+ # database_tsv = Organism.gene_go_bp(organism).tsv :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :persist => true, :unnamed => true, :merge => true
244
+ # affected_genes_db = affected_genes
245
+ # all_db_genes = Gene.setup(database_tsv.keys, "KEGG Gene ID", organism).uniq
246
+ # when 'pfam'
247
+ # database_tsv = Organism.gene_pfam(organism).tsv :key_field => "Ensembl Gene ID", :fields => ["Pfam Domain"], :type => :flat, :persist => true, :unnamed => true, :merge => true
248
+ # affected_genes_db = affected_genes
249
+ # all_db_genes = Gene.setup(database_tsv.keys, "KEGG Gene ID", organism).uniq
250
+ # end
251
+ #
252
+ # affected_genes = affected_genes.remove(masked_genes)
253
+ # all_db_genes = all_db_genes.remove(masked_genes)
254
+ #
255
+ # # Annotate each pathway with the affected genes that are involved in it
256
+ #
257
+ # affected_genes_per_pathway = {}
258
+ # affected_genes_db.zip(affected_genes).each do |gene_db,gene|
259
+ # next if gene_db.nil?
260
+ # pathways = database_tsv[gene_db]
261
+ # next if pathways.nil?
262
+ # pathways.uniq.each do |pathway|
263
+ # affected_genes_per_pathway[pathway] ||= []
264
+ # affected_genes_per_pathway[pathway] << gene
265
+ # end
266
+ # end
267
+ #
268
+ # pvalues = TSV.setup({}, :key_field => database_tsv.fields.first, :fields => ["Matches", "Pathway total", "p-value", "Ensembl Gene ID"], :namespace => organism, :type => :double)
269
+ # mutation_genes = Misc.process_to_hash(mutations){|list| list.genes}
270
+ # covered_mutations = mutations.select{|mutation|(mutation_genes[mutation] & all_db_genes).any? }.length
271
+ #
272
+ # affected_genes_per_pathway.each do |pathway, genes|
273
+ # pathway_total = counts[pathway]
274
+ # matches = mutations.select{|mutation| (mutation_genes[mutation] & genes).any? }.length
275
+ # pvalue = RSRuby.instance.binom_test(matches, covered_mutations, pathway_total.to_f / total_covered.to_f, "greater")["p.value"]
276
+ #
277
+ # pvalues[pathway] = [[matches], [pathway_total], [pvalue], affected_genes.subset(genes).uniq.sort_by{|g| g.name || g}]
278
+ # end
279
+ #
280
+ # FDR.adjust_hash! pvalues, 2 if fdr
281
+ #
282
+ # set_info :covered_mutations, covered_mutations
283
+ # set_info :total_covered, total_covered
284
+ #
285
+ # pvalues
286
+ #end
287
+ #
288
+ #
289
+ #dep do |jobname, inputs| job(:pathway_base_counts, inputs[:database].to_s, inputs) end
290
+ #dep do |jobname, inputs| job(inputs[:mutation_subset] || :relevant_mutations, jobname, inputs) end
291
+ #dep :affected_genes
292
+ #input :database, :string
293
+ #input :fdr, :boolean, "BH FDR corrections", true
294
+ #input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
295
+ #
296
+ ##{{{ RATIOS
297
+ #
298
+ #dep :affected_samples_per_pathway
299
+ #dep :affected_genes
300
+ #dep do |name, inputs|
301
+ # database = inputs[:database]
302
+ # if inputs[:type] == :genes
303
+ # job(:pathway_base_counts, database, inputs)
304
+ # else
305
+ # job(:pathway_gene_counts, database, inputs)
306
+ # end
307
+ #end
308
+ #task :pathway_sample_ratios => :tsv do
309
+ # num_samples = study.cohort.length
310
+ #
311
+ # affected_samples_per_pathway = step(:affected_samples_per_pathway).load
312
+ # affected_genes = step(:affected_genes).load
313
+ #
314
+ # ratios = TSV.setup({}, :key_field => sample_pathway_probability.key_field, :fields => ["Num Samples", "Expected", "Ratio", "Ensembl Gene ID"], :namespace => organism)
315
+ #
316
+ # pathways.through do |pathway, probability|
317
+ # next unless affected_samples_per_pathway.include?(pathway) and affected_samples_per_pathway[pathway].length > 1
318
+ # affected_samples = (affected_samples_per_pathway[pathway] || []).length
319
+ # ratios[pathway] = [affected_samples, probability * num_samples, affected_samples.to_f / (probability * num_samples), pathway.genes.ensembl.subset(affected_genes)]
320
+ # end
321
+ #
322
+ # ratios.namespace = organism
323
+ #
324
+ # ratios
325
+ #end
326
+ #
327
+
328
+ #input :database, :string
329
+ #dep :damaging_mutations
330
+ #dep :affected_genes
331
+ #task :pathway_mutation_ratios => :tsv do |database|
332
+ # damaging_mutations = step(:damaging_mutations).load
333
+ #
334
+ # affected_genes = step(:affected_genes).load
335
+ # affected_genes.organism = organism
336
+ #
337
+ # pathways = case database
338
+ # when 'go'
339
+ # affected_genes.go_terms
340
+ # when 'go_bp'
341
+ # affected_genes.go_bp_terms
342
+ # when 'go_cc'
343
+ # affected_genes.go_cc_terms
344
+ # when 'go_mf'
345
+ # affected_genes.go_mf_terms
346
+ # when 'pfam'
347
+ # affected_genes.pfam_domains
348
+ # else
349
+ # affected_genes.send(database + '_pathways')
350
+ # end.compact.flatten.uniq
351
+ #
352
+ # key_field = nil
353
+ # pathway_genes = Misc.process_to_hash(pathways) do |pathways|
354
+ # pathways.uniq.collect do |pathway|
355
+ # case database
356
+ # when 'kegg'
357
+ # KeggPathway.setup(pathway, organism)
358
+ # key_field = "KEGG Pathway ID"
359
+ # Gene.setup(pathway.genes, "KEGG Gene ID", organism).ensembl
360
+ # when 'go', 'go_bp', 'go_mf', 'go_cc'
361
+ # GOTerm.setup(pathway, organism)
362
+ # key_field = "GO ID"
363
+ # Gene.setup(pathway.genes, "Ensembl Gene ID", organism).ensembl
364
+ # when 'pfam'
365
+ # PfamDomain.setup(pathway, organism)
366
+ # key_field = "Pfam Domain"
367
+ # Gene.setup(pathway.genes, "Ensembl Gene ID", organism).ensembl
368
+ # end
369
+ # end
370
+ # end
371
+ #
372
+ # pathway_mutation_ratios = TSV.setup({}, :key_field => key_field,
373
+ # :fields => ["Mutations per MB", "# Mutations", "# Damaging Mut.", "# Genes", "# Bases", "Ensembl Gene ID"], :type => :double)
374
+ #
375
+ #
376
+ # pathway_sizes = Misc.process_to_hash(pathways) do |pathways|
377
+ # pathways.collect{|pathway| Gene.gene_list_exon_bases(pathway_genes[pathway])}
378
+ # end
379
+ #
380
+ # pathways_for_mutations = Misc.process_to_hash(study.cohort.flatten){|all_mutations| Gene.setup(study.cohort.collect{|genotype| genotype.genes }.flatten(1), "Ensembl Gene ID", organism).collect{|genes|
381
+ # genes.nil? ? [] : genes.collect{|gene|
382
+ # case database
383
+ # when 'go'
384
+ # gene.go_terms
385
+ # when 'go_bp'
386
+ # gene.go_bp_terms
387
+ # when 'go_cc'
388
+ # gene.go_cc_terms
389
+ # when 'go_mf'
390
+ # gene.go_mf_terms
391
+ # when 'pfam'
392
+ # gene.pfam_domains
393
+ # else
394
+ # affected_genes.send(database + '_pathways')
395
+ # end
396
+ # }.flatten}
397
+ # }
398
+ #
399
+ # mutations_in_pathway = Misc.process_to_hash(pathways) do |pathways|
400
+ # pathways.collect do |pathway|
401
+ # GenomicMutation.setup(pathways_for_mutations.select{|mut,pths| pths and pths.include? pathway}.collect{|mut, pths| mut}, "Pathway mutations #{pathway} in #{study}", organism, watson)
402
+ # end
403
+ # end
404
+ #
405
+ # pathways.each do |pathway|
406
+ # pathway_mutations = mutations_in_pathway[pathway]
407
+ # next if pathway_mutations.one?
408
+ # pathway_score = pathway_mutations.length.to_f / pathway_sizes[pathway]
409
+ # genes = pathway_mutations.genes.compact.flatten.subset(affected_genes).subset(pathway_genes[pathway])
410
+ #
411
+ # pathway_mutation_ratios[pathway] = [["%.5g" % (pathway_score * 1_000_000)], [pathway_mutations.length], [pathway_mutations.subset(damaging_mutations).length], [pathway_genes[pathway].length], [pathway_sizes[pathway]], genes]
412
+ # end
413
+ #
414
+ # pathway_mutation_ratios.namespace = organism
415
+ #
416
+ # pathway_mutation_ratios
417
+ #end
418
+ #
@@ -0,0 +1,19 @@
1
# Expression-related tasks of the study workflow.
module StudyWorkflow
  # Organism code as recorded in the study metadata.
  helper(:organism) { study.metadata[:organism] }

  # Obtains the study's "gene_expression" matrix keyed by "Ensembl Gene ID"
  # and calls matrix_file on it with this step's path. The block itself
  # returns nil.
  task :matrix => :tsv do
    expression = study.matrix("gene_expression", "Ensembl Gene ID", organism)
    expression.matrix_file(path)
    nil
  end

  # Calls barcode on the same expression matrix with this step's path and a
  # factor. The factor is the first block argument when one is given and
  # truthy, otherwise 2. The block itself returns nil.
  task :expression_barcode => :tsv do |*args|
    factor = args[0] || 2
    expression = study.matrix("gene_expression", "Ensembl Gene ID", organism)
    expression.barcode(path, factor)
    nil
  end
end
19
+
@@ -0,0 +1,17 @@
1
# Builds a "Sample" keyed table with one 0/1 column per gene of the input
# list: 1 when the gene is among the sample's entry in the
# :mutated_genes_per_sample dependency, 0 otherwise. Columns are named after
# the gene name followed by "_mut".
dep :mutated_genes_per_sample
input :list, :array, "Gene list in Ensembl Gene ID"
task :gene_features => :tsv do |list|
  mutated_genes_per_sample = step(:mutated_genes_per_sample).load

  # `name` is called on the list below, so make sure it is annotated as a Gene
  # list. A copy is annotated so the input array itself is left untouched.
  # NOTE(review): presumably the :array input arrives as plain strings - confirm.
  genes = Gene.setup(list.dup, "Ensembl Gene ID", organism)

  samples = study.cohort.fields

  # Fall back to the gene identifier when no name is available, instead of
  # failing on nil + "_mut"
  fields = genes.name.zip(genes).collect{|name, gene| (name || gene) + "_mut"}
  table = TSV.setup({}, :key_field => "Sample", :fields => fields)

  samples.each do |sample|
    # Samples missing from the dependency are treated as having no mutated genes
    affected_genes = mutated_genes_per_sample[sample] || []
    table[sample] = genes.collect{|gene| affected_genes.include?(gene) ? 1 : 0}
  end

  table
end
@@ -0,0 +1,104 @@
1
# NON UNIQ
# All genes reported by the genotypes of the cohort (nil entries dropped),
# flattened into a single "Ensembl Gene ID" list. Repeated genes are kept.
returns "Ensembl Gene ID"
task :affected_genes => :annotations do
  genes_by_genotype = study.cohort.collect { |genotype| genotype.genes.compact }
  Gene.setup(genes_by_genotype.flatten, "Ensembl Gene ID", organism)
end
6
+
7
# NON UNIQ
# Genes hit by the relevant mutations. For each mutation this gathers the genes
# of its transcripts with affected splicing (only when the mutation lies in an
# exon junction) plus the genes of the proteins of its mutated isoforms. Genes
# are unique within a mutation but may repeat across mutations.
dep :relevant_mutations
returns "Ensembl Gene ID"
task :relevant_genes => :annotations do
  mutations = step(:relevant_mutations).load

  genes_by_mutation = mutations.collect do |mutation|
    splicing_genes =
      if mutation.in_exon_junction?
        mutation.transcripts_with_affected_splicing.gene
      else
        []
      end

    isoforms = mutation.mutated_isoforms
    protein_genes = isoforms.nil? ? [] : isoforms.protein.gene.compact.uniq

    (splicing_genes + protein_genes).uniq
  end

  Gene.setup(genes_by_mutation.compact.flatten, "Ensembl Gene ID", organism)
end
19
+
20
# NON UNIQ
# Genes damaged by the relevant mutations. A gene is collected for a mutation
# when one of the mutation's isoforms is flagged by `damaged?` for the given
# methods, or, if :add_exon_junction is set, when the mutation lies in an exon
# junction, has a type other than 'none' and affects the splicing of one of
# the gene's transcripts. Genes are unique within a mutation but may repeat
# across mutations.
dep :relevant_mutations
returns "Ensembl Gene ID"
input :methods, :array, "Damage prediction methods", [:sift, :mutation_assessor]
input :add_exon_junction, :boolean, "Add exon junction mutations", true
task :damaged_genes => :annotations do |methods, add_exon_junction|
  mutations = step(:relevant_mutations).load

  # Evaluate the damage flag once for the whole isoform list and index it by isoform
  isoforms = mutations.mutated_isoforms.compact.flatten
  damaged_by_isoform = Misc.process_to_hash(isoforms) { |mis| mis.damaged?(methods) }

  genes_by_mutation = mutations.collect do |mutation|
    found = []

    if add_exon_junction && mutation.in_exon_junction? && mutation.type != 'none'
      found.concat(mutation.transcripts_with_affected_splicing.gene)
    end

    mutation_isoforms = mutation.mutated_isoforms
    unless mutation_isoforms.nil?
      found.concat(mutation_isoforms.select { |mi| damaged_by_isoform[mi] }.protein.gene.compact.uniq)
    end

    found.uniq
  end

  Gene.setup(genes_by_mutation.compact.flatten, "Ensembl Gene ID", organism)
end
43
+
44
# Counts the occurrences of each gene in the (non-unique) relevant gene list
# using Misc.counts. Yields an empty hash when there are no relevant genes.
dep :relevant_genes
task :gene_mutation_count => :yaml do
  genes = step(:relevant_genes).load
  genes.any? ? Misc.counts(genes.clean_annotations) : {}
end
53
+
54
# NON UNIQ
# Genes whose count in :gene_mutation_count is greater than 1 and also
# strictly greater than the given percentage of the cohort size.
dep :gene_mutation_count
input :percentage, :float, "Minimum percentage of samples with the mutation", 0
returns "Ensembl Gene ID"
task :recurrent_genes => :annotations do |percentage|
  counts = step(:gene_mutation_count).load

  # Percentage of the cohort size expressed as an absolute count
  threshold = (study.cohort.length.to_f * percentage.to_f) / 100.0

  recurrent = counts.select { |_gene, count| count > 1 && count > threshold }
  gene_ids = recurrent.collect { |gene, _count| gene }

  Gene.setup(gene_ids, "Ensembl Gene ID", organism)
end
70
+
71
# Union, without duplicates, of the damaged and the recurrent genes.
dep :damaged_genes
dep :recurrent_genes
returns "Ensembl Gene ID"
task :suspect_genes => :annotations do
  damaged = step(:damaged_genes).load
  recurrent = step(:recurrent_genes).load

  combined = (damaged + recurrent).flatten.uniq
  Gene.setup(combined, "Ensembl Gene ID", organism)
end
80
+
81
# Relevant mutations that overlap at least one recurrent gene. Mutations
# without gene information are left out.
dep :relevant_mutations
dep :recurrent_genes
task :mutations_over_recurrent_genes => :annotations do
  mutations = step(:relevant_mutations).load
  recurrent = step(:recurrent_genes).load

  mutations.select do |mutation|
    mutation_genes = mutation.genes
    mutation_genes && (mutation_genes & recurrent).any?
  end
end
89
+
90
# Relevant mutations that overlap at least one suspect gene. Mutations without
# gene information are left out.
dep :relevant_mutations
dep :suspect_genes
task :mutations_over_suspect_genes => :annotations do
  mutations = step(:relevant_mutations).load
  suspects = step(:suspect_genes).load

  mutations.select do |mutation|
    mutation_genes = mutation.genes
    mutation_genes && (mutation_genes & suspects).any?
  end
end
98
+
99
+ require 'rbbt/mutation/oncodriveFM'
100
# Runs OncodriveFM.process_cohort over the study cohort and returns the
# resulting table with its namespace set to the study organism.
task :oncodriveFM => :tsv do
  OncodriveFM.process_cohort(study.cohort).tap do |result|
    result.namespace = organism
  end
end
+ end
@@ -0,0 +1,56 @@
1
+ require 'rbbt/workflow'
2
+
3
+ Workflow.require_workflow "MutationEnrichment"
4
# Pathway enrichment tasks of the study workflow. Both tasks obtain the
# mutation subset from the study and delegate the statistics to the
# MutationEnrichment workflow.
module StudyWorkflow

  #{{{ SAMPLE ENRICHMENT

  # Sample-level enrichment: collects, for every mutation of the chosen subset,
  # the cohort samples (genotype job names) carrying it, and runs the
  # :sample_pathway_enrichment task of MutationEnrichment on that table. The
  # :total_covered and :covered_mutations info entries of the delegated job are
  # copied onto this step and its result is returned.
  input :database, :string
  input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
  input :baseline, :select, "Type of baseline to use", :pathway_base_counts, :select_options => [:pathway_base_counts, :pathway_gene_counts]
  input :permutations, :integer, "Number of permutations in test", 10000
  input :fdr, :boolean, "BH FDR corrections", true
  input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
  task :sample_pathway_enrichment => :tsv do |database,mutation_subset,baseline,permutations,fdr,masked_genes|

    # NOTE(review): mutation_subset comes straight from an input with no
    # :select_options and is dispatched with `send`; consider restricting it
    # to the known subset names.
    mutations = study.send(mutation_subset)

    # Flat table: mutation => list of samples in which it was observed
    mutation_tsv = TSV.setup({}, :key_field => "Genomic Mutation", :fields => ["Sample"], :type => :flat)

    study.cohort.each do |genotype|
      sample = genotype.jobname
      genotype.each do |mutation|
        # Only mutations that belong to the selected subset are considered
        next unless mutations.include? mutation
        mutation_tsv[mutation] ||= []
        mutation_tsv[mutation] << sample
      end
    end

    job = MutationEnrichment.job(:sample_pathway_enrichment, study,
                                 :mutations => mutation_tsv, :database => database, :baseline => baseline, :fdr => fdr,
                                 :masked_genes => masked_genes, :organism => study.organism, :permutations => permutations)

    res = job.run
    set_info :total_covered, job.info[:total_covered]
    set_info :covered_mutations, job.info[:covered_mutations]
    res
  end

  #{{{ METAGENOTYPE ENRICHMENT

  # Cohort-wide enrichment: passes the chosen mutation subset to the
  # :mutation_pathway_enrichment task of MutationEnrichment. The organism is
  # taken from the study. The block takes exactly one parameter per declared
  # input; a stale sixth `organism` parameter, which had no matching input and
  # was never read, has been removed.
  input :database, :string
  input :mutation_subset, :select, "Mutation subset to use", :relevant_mutations
  input :baseline, :select, "Type of baseline to use", :pathway_base_counts, :select_options => [:pathway_base_counts, :pathway_gene_counts]
  input :fdr, :boolean, "BH FDR corrections", true
  input :masked_genes, :array, "Ensembl Gene ID list of genes to mask", []
  task :mutation_pathway_enrichment => :tsv do |database,mutation_subset,baseline,fdr,masked_genes|

    # NOTE(review): same unrestricted `send` on an input value as above.
    mutations = study.send(mutation_subset)

    job = MutationEnrichment.job(:mutation_pathway_enrichment, study,
                                 :mutations => mutations, :database => database, :baseline => baseline, :fdr => fdr,
                                 :masked_genes => masked_genes, :organism => study.organism)
    res = job.run
    set_info :total_covered, job.info[:total_covered]
    set_info :covered_mutations, job.info[:covered_mutations]
    res
  end
end
56
+ end