germ 0.1 → 0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/mutation.rb ADDED
@@ -0,0 +1,41 @@
1
+ require 'genomic_locus'
2
+ require 'fasta'
3
# A generic description of a mutation: a reference allele replaced by an
# alternate allele at a position on a named sequence, optionally carrying the
# read counts that support each allele.
class Mutation
  include GenomicLocus

  # An allele is one or more nucleotide letters (either case). \A and \z
  # anchor the whole string; ^ and $ only anchor lines, which would accept a
  # multi-line string as long as one of its lines was valid.
  VALID_ALLELE = /\A[ATGCNatgcn]+\z/

  attr_reader :seqname, :pos, :ref, :alt, :ref_count, :alt_count
  alias_method :start, :pos

  # seqname   - name of the sequence the mutation lies on
  # pos       - position of the first reference base
  # ref       - reference allele
  # alt       - alternate allele
  # ref_count - optional number of reads supporting the reference allele
  # alt_count - optional number of reads supporting the alternate allele
  def initialize seqname, pos, ref, alt, ref_count=nil, alt_count=nil
    @seqname = seqname
    @pos = pos
    @ref = ref
    @alt = alt
    @ref_count = ref_count
    @alt_count = alt_count
  end

  # Position of the last reference base covered by this mutation.
  def stop
    start + ref.size - 1
  end

  # Renders the mutation as "<range>:<ref>-<alt>".
  # NOTE(review): `range` is not defined in this class; presumably it is
  # provided by GenomicLocus - confirm.
  def to_s
    range.to_s + ":#{ref}-#{alt}"
  end

  # Truthy when both alleles consist solely of nucleotide letters, nil
  # otherwise.
  def is_valid?
    ref =~ VALID_ALLELE && alt =~ VALID_ALLELE
  end

  # Reference base at the locus +loc+ (anything responding to +pos+), or nil
  # when this mutation does not contain +loc+.
  # NOTE(review): `contains?` is presumably provided by GenomicLocus.
  def ref_at loc
    return nil unless contains? loc
    # Offset of loc from the start of the allele. The operands used to be
    # reversed, giving a zero or negative index that read from the end.
    ref[loc.pos - pos]
  end

  # Fraction of reads supporting the alternate allele,
  # alt_count / (ref_count + alt_count), as a Float.
  # Returns nil when either count is missing or when there are no reads.
  def var_freq
    return nil unless ref_count && alt_count
    depth = ref_count + alt_count
    return nil if depth == 0
    alt_count.to_f / depth
  end

  # Alternate base at the locus +loc+, or nil when this mutation does not
  # contain +loc+ or the alternate allele is shorter than the offset.
  def alt_at loc
    return nil unless contains? loc
    alt[loc.pos - pos]
  end
end
data/lib/mutation_set.rb CHANGED
@@ -1,69 +1,14 @@
1
- require 'oncotator'
2
1
  require 'yaml'
3
2
  require 'intervals'
3
+ require 'hash_table'
4
+ require 'mutation'
4
5
 
5
- module MutationSet
6
- class Line
7
- include IntervalList::Interval
8
- attr_reader :sample
9
- attr_accessor :invalid
10
-
11
- def self.alias_key sym1, sym2
12
- define_method sym1 do
13
- send sym2
14
- end
15
- define_method "#{sym1}=" do |v|
16
- send "#{sym2}=", v
17
- end
18
- end
19
-
20
- def copy
21
- self.class.new @mutation.clone, sample
22
- end
23
-
24
- def invalid?
25
- invalid
26
- end
27
-
28
- def invalidate!
29
- @invalid = true
30
- end
31
-
32
-
33
- def initialize(fields, sample)
34
- if fields.is_a? Hash
35
- @mutation = fields
36
- else
37
- @mutation = Hash[sample.clean_headers.zip(fields)]
38
- end
39
- @sample = sample
40
- end
41
-
42
- def key
43
- "#{chrom}:#{start}:#{stop}"
44
- end
45
-
46
- def long_chrom
47
- @long_chrom ||= "chr#{short_chrom}"
48
- end
49
-
50
- def short_chrom
51
- @short_chrom ||= chrom.sub(/^chr/,'')
52
- end
53
-
54
- def to_s
55
- sample.clean_headers.map{ |h| @mutation[h] }.join("\t")
56
- end
57
-
58
- def to_hash
59
- @mutation
60
- #Hash[@mutation.map do |k,v| [ k, v ? v.clone : v ]; end]
61
- end
62
-
6
+ class Mutation
7
+ module Filtering
63
8
  def criteria_failed? obj, name
64
- return nil if !sample.mutation_config
9
+ return nil if !@table.mutation_config
65
10
  name = [ name ] if !name.is_a? Array
66
- crit = name.reduce(sample.mutation_config) do |h,n|
11
+ crit = name.reduce(@table.mutation_config) do |h,n|
67
12
  h.is_a?(Hash) ? h[n] : nil
68
13
  end
69
14
  return nil if !crit
@@ -112,10 +57,10 @@ module MutationSet
112
57
  end
113
58
  return v
114
59
  when /^whitelisted/
115
- whitelist = sample.whitelist value
60
+ whitelist = @table.whitelist value
116
61
  return whitelist.intersect(self)
117
62
  when /^blacklisted/
118
- blacklist = sample.blacklist value
63
+ blacklist = @table.blacklist value
119
64
  return !blacklist.intersect(self)
120
65
  else
121
66
  # send it
@@ -128,129 +73,45 @@ module MutationSet
128
73
  end
129
74
  true
130
75
  end
76
+ end
77
+ end
131
78
 
132
- def onco
133
- raise ArgumentError, @onco_error unless valid_onco_input?
134
- @onco ||= Oncotator.new :key => self.to_ot
135
- end
136
-
137
- def discard_onco
138
- @onco = nil
139
- end
140
-
141
- def skip_oncotator? criteria=nil
142
- return true if !onco || onco.empty? || criteria_failed?(onco, criteria || :oncotator)
143
- end
144
-
145
- def inspect
146
- "#<#{self.class.name}:#{object_id} @mutation=#{@mutation}>"
147
- end
148
-
149
- def in_cosmic
150
- onco.Cosmic_overlapping_mutations ? "YES" : "NO"
151
- end
152
-
153
- def to_ot
154
- [ short_chrom, start, stop, ref_allele, alt_allele ].join("_")
155
- end
156
-
157
- def method_missing(meth,*args,&block)
158
- if meth.to_s =~ /(.*)=/
159
- @mutation[$1.to_sym] = args.first
160
- else
161
- @mutation.has_key?(meth.to_sym) ? @mutation[meth.to_sym] : super
162
- end
79
+ class Mutation
80
+ class Record < HashTable::HashLine
81
+ include GenomicLocus
82
+ include Mutation::Filtering
83
+ def copy
84
+ self.class.new @hash.clone, @table
163
85
  end
164
86
 
165
- def respond_to? method
166
- !@mutation[method.to_sym].nil? || super
87
+ attr_reader :muts
88
+ def initialize h, table
89
+ super h, table
90
+ @muts = []
167
91
  end
168
92
 
169
- private
170
- CHROM_POS=/^[0-9]+$/
171
- ALLELE_SEQ=/^([A-Z]+|-)$/
172
- def valid_onco_input?
173
- @onco_error = []
174
- @onco_error.push 'Malformed start position' unless start.to_s =~ MutationSet::Line::CHROM_POS
175
- @onco_error.push 'Malformed stop position' unless stop.to_s =~ MutationSet::Line::CHROM_POS
176
- @onco_error.push 'Malformed reference allele' unless ref_allele =~ MutationSet::Line::ALLELE_SEQ
177
- @onco_error.push 'Malformed alt allele' unless alt_allele =~ MutationSet::Line::ALLELE_SEQ
178
- @onco_error.empty?
93
+ def mut
94
+ @muts.first
179
95
  end
180
96
  end
181
97
 
182
- class Sample
183
- include Enumerable
184
- attr_reader :samples, :mutation_config, :lines, :preamble_lines
185
- attr_accessor :headers
98
+ class Collection < HashTable
99
+ attr_reader :mutation_config
186
100
  class << self
187
- attr_reader :required, :comment
188
- def requires *terms
101
+ attr_reader :required, :optional
102
+ def requires terms
189
103
  @required = terms
190
104
  end
191
105
 
192
- def comments c
193
- @comment = c
194
- end
195
-
196
- def read(filename,mutation_config=nil)
197
- set = new mutation_config, true
198
-
199
- set.load_file filename
200
-
201
- return set
202
- end
203
- end
204
-
205
- def load_file filename
206
- File.foreach(filename) do |l|
207
- fields = l.chomp.split(/\t/,-1)
208
- if !headers
209
- if fields.first.downcase == required.first.downcase
210
- enforce_headers fields
211
- else
212
- preamble_lines.push l
213
- end
214
- next
215
- end
216
- add_line fields
106
+ def might_have terms
107
+ @optional = terms
217
108
  end
218
109
 
219
- post_read_hook
220
- end
221
-
222
- def preamble
223
- preamble_lines.join("")
224
- end
225
-
226
- def write file
227
- File.open(file,"w") do |f|
228
- output f
229
- end
230
- end
231
-
232
- def print f=nil
233
- if f
234
- write f
235
- else
236
- output STDOUT
237
- end
238
- end
239
-
240
- def output f
241
- f.puts preamble
242
- f.puts headers.join("\t")
243
- @lines.each do |l|
244
- l = yield l if block_given?
245
- next if !l || l.invalid?
246
- f.puts format_line(l)
110
+ def comments c
111
+ @comment = c
247
112
  end
248
113
  end
249
114
 
250
- def format_line l
251
- l.to_s
252
- end
253
-
254
115
  def clean_header s
255
116
  s.to_s.gsub(/\s+/,"_").gsub(/[^\w]+/,"").downcase.to_sym
256
117
  end
@@ -259,92 +120,52 @@ module MutationSet
259
120
  @headers.map {|h| clean_header h}
260
121
  end
261
122
 
262
- def add_line fields
263
- @lines.push self.class.const_get(:Line).new(clean_fields(fields), self)
264
-
265
- index_line @lines.last
266
- end
267
-
268
- def clean_fields fields
269
- fields.is_a?(Array) ? fields.map{|f| f == "NA" ? "" : f } : fields
270
- end
271
-
272
- def index_line line
273
- @index[ line.key ] = line
274
- end
275
-
276
- def find_mutation line
277
- @index[ line.key ]
278
- end
279
-
280
123
  def required
281
- self.class.required
124
+ self.class.required.keys
282
125
  end
283
126
 
284
- def enforce_headers array
285
- raise "File lacks required headers: #{(required.map(&:downcase)-array.map(&:downcase)).join(", ")}" if !(required.map(&:downcase) - array.map(&:downcase)).empty?
286
- @headers = array
127
+ def initialize(obj=nil,opts={})
128
+ # get types from required
129
+ opts[:types] = types_from_required opts
130
+ super obj, opts
131
+ @header = required unless @header
132
+ @mutation_config = YAML.load_file(opts[:mutation_config]) if opts[:mutation_config]
287
133
  end
288
134
 
289
- def initialize(mutation_config=nil,suppress_headers=nil)
290
- @lines = []
291
-
292
- @mutation_config = YAML.load_file(mutation_config) if mutation_config
293
-
294
- @headers = required.map(&:to_sym) unless suppress_headers
295
-
296
- @preamble_lines = []
297
-
298
- @index = {}
135
+ protected
136
+ def types_from_required opts
137
+ types = self.class.required ? self.class.required.clone : {}
138
+ types = types.merge(self.class.optional.clone) if self.class.optional
139
+ types.merge(opts[:types] || {})
299
140
  end
300
141
 
301
- def whitelist file
302
- case file
303
- when /.gtf$/
304
- require 'gtf'
305
- @whitelist ||= GTF.new(file).to_interval_list
306
- when /.vcf$/
307
- require 'vcf'
308
- @whitelist ||= VCF.read(file).to_interval_list
309
- end
310
- @whitelist
142
+ def enforce_header
143
+ create_sleeve
144
+ raise "File lacks required headers: #{missing_required.join(", ")}" unless missing_required.empty?
311
145
  end
312
146
 
313
- def blacklist file
314
- case file
315
- when /.gtf$/
316
- require 'gtf'
317
- @blacklist ||= GTF.new(file).to_interval_list
318
- when /.vcf$/
319
- require 'vcf'
320
- @blacklist ||= VCF.read(file).to_interval_list
147
+ def create_sleeve
148
+ @sleeve = @header
149
+ @header = @header.map do |h|
150
+ clean_header h
321
151
  end
322
- @blacklist
152
+ raise "Headers are not unique: #{duplicate_headers.join(", ")}" unless duplicate_headers.empty?
153
+ @sleeve = Hash[@header.zip @sleeve]
323
154
  end
324
155
 
325
- def to_interval_list
326
- IntervalList.new self.map{|g| [ g.chrom, g.start, g.stop, g ] }
156
+ def duplicate_headers
157
+ @header.inject(Hash.new(0)) do |count,h|
158
+ count[h] += 1
159
+ count
160
+ end.select do |h,count|
161
+ count > 1
162
+ end.keys
327
163
  end
328
164
 
329
- def inspect
330
- to_s
165
+ def missing_required
166
+ required - @header.map(&:downcase)
331
167
  end
332
168
 
333
- def [](key)
334
- @lines[key]
335
- end
336
-
337
- def sort_by! &block
338
- @lines.sort_by! &block
339
- end
340
-
341
- def each
342
- @lines.each do |l|
343
- yield l
344
- end
345
- end
346
-
347
- protected
348
169
  def post_read_hook
349
170
  end
350
171
  end
data/lib/mutect.rb CHANGED
@@ -2,22 +2,25 @@ require 'oncotator'
2
2
  require 'yaml'
3
3
  require 'mutation_set'
4
4
 
5
- class MuTect < MutationSet::Sample
6
- requires "contig", "position", "context", "ref_allele", "alt_allele",
7
- "tumor_name", "normal_name", "score", "dbsnp_site", "covered", "power",
8
- "tumor_power", "normal_power", "total_pairs", "improper_pairs",
9
- "map_q0_reads", "t_lod_fstar", "tumor_f", "contaminant_fraction",
10
- "contaminant_lod", "t_ref_count", "t_alt_count", "t_ref_sum", "t_alt_sum",
11
- "t_ref_max_mapq", "t_alt_max_mapq", "t_ins_count", "t_del_count",
12
- "normal_best_gt", "init_n_lod", "n_ref_count", "n_alt_count", "n_ref_sum",
13
- "n_alt_sum", "judgement"
5
+ class MuTect < Mutation::Collection
6
+ header_on
7
+ requires :contig => :str, :position => :int, :context => :str, :ref_allele => :str, :alt_allele => :str,
8
+ :tumor_name => :str, :normal_name => :str, :score => :float, :dbsnp_site => :str, :covered => :str, :power => :float,
9
+ :tumor_power => :float, :normal_power => :float, :total_pairs => :int, :improper_pairs => :int,
10
+ :map_q0_reads => :int, :t_lod_fstar => :float, :tumor_f => :float, :contaminant_fraction => :float,
11
+ :contaminant_lod => :float, :t_ref_count => :int, :t_alt_count => :int, :t_ref_sum => :int, :t_alt_sum => :int,
12
+ :t_ref_max_mapq => :int, :t_alt_max_mapq => :int, :t_ins_count => :int, :t_del_count => :int,
13
+ :normal_best_gt => :str, :init_n_lod => :float, :n_ref_count => :int, :n_alt_count => :int, :n_ref_sum => :int,
14
+ :n_alt_sum => :int, :judgement => :str
14
15
  comments "##"
15
16
 
16
- class Line < MutationSet::Line
17
- alias_key :chrom, :contig
17
+ class Line < Mutation::Record
18
+ alias_key :seqname, :contig
19
+ alias_key :pos, :position
18
20
  alias_key :start, :position
19
- def stop; @stop || end_position; end
20
- def stop= nc; @stop = nc; end
21
+ alias_key :stop, :default_stop
22
+ alias_key :ref, :ref_allele
23
+ alias_key :alt, :alt_allele
21
24
  def keep_somatic?
22
25
  !criteria_failed?(self, [ :mutect, :somatic ])
23
26
  end
@@ -25,10 +28,6 @@ class MuTect < MutationSet::Sample
25
28
  !criteria_failed?(self, [ :mutect, :germline ])
26
29
  end
27
30
 
28
- def end_position
29
- position.to_i + ref_allele.length-1
30
- end
31
-
32
31
  def q0_ratio
33
32
  map_q0_reads.to_f / (t_alt_count.to_i + n_alt_count.to_i)
34
33
  end
@@ -39,5 +38,11 @@ class MuTect < MutationSet::Sample
39
38
  def n_var_freq; n_alt_count.to_f / n_depth end
40
39
  def t_depth; t_alt_count.to_i + t_ref_count.to_i end
41
40
  def n_depth; n_alt_count.to_i + n_ref_count.to_i end
41
+
42
+ def initialize h, table
43
+ super h, table
44
+ @muts.push Mutation.new(seqname,pos,ref,alt,t_ref_count,t_alt_count)
45
+ end
42
46
  end
47
+ line_class MuTect::Line
43
48
  end
data/lib/oncotator.rb CHANGED
@@ -16,7 +16,7 @@ class Oncotator
16
16
  end
17
17
 
18
18
  def self.db_opts
19
- @db_opts ||= TaylorlibConfig.get_conf :oncotator
19
+ @db_opts ||= GermConfig.get_conf :oncotator
20
20
  end
21
21
 
22
22
  def self.db_cache
@@ -142,3 +142,45 @@ class Oncotator
142
142
  end
143
143
  end
144
144
  end
145
# Oncotator support for mutations: reopens Mutation to add annotation lookup
# (Mutation::Oncotate) and an annotation-based filter on Mutation::Record.
class Mutation
  class Record < HashTable::HashLine
    # Returns true when the record's first mutation has no Oncotator
    # annotation, an empty one, or one that fails the given criteria
    # (default :oncotator); returns nil otherwise.
    def skip_oncotator? criteria=nil
      # Look the annotation up once; every call to mut.onco re-validates.
      annotation = mut.onco
      return true if !annotation || annotation.empty? || criteria_failed?(annotation, criteria || :oncotator)
    end
  end

  module Oncotate
    # Whole-string patterns. \A and \z are used because ^ and $ only anchor
    # lines and would let multi-line input through validation.
    CHROM_POS = /\A[0-9]+\z/
    ALLELE_SEQ = /\A([A-Z]+|-)\z/

    # Oncotator annotation for this mutation, created on first use and cached.
    # Raises ArgumentError listing the malformed fields when the alleles do
    # not pass validation.
    def onco
      raise ArgumentError, @onco_error.join(", ") unless valid_onco_input?
      @onco ||= Oncotator.new :key => to_ot
    end

    # Drops the cached annotation so the next call to onco builds a new one.
    def discard_onco
      @onco = nil
    end

    # NOTE(review): nothing visible here assigns @mutation on Mutation, so
    # this likely prints an empty value - confirm the intended contents.
    def inspect
      "#<#{self.class.name}:#{object_id} @mutation=#{@mutation}>"
    end

    # "YES" when the annotation reports overlapping COSMIC mutations, else "NO".
    def in_cosmic
      onco.Cosmic_overlapping_mutations ? "YES" : "NO"
    end

    # Key handed to Oncotator: chromosome, start, stop, ref and alt joined
    # with underscores.
    # NOTE(review): short_chrom is not defined in the visible code; presumably
    # it comes from GenomicLocus - confirm.
    def to_ot
      [ short_chrom, start, stop, ref, alt ].join("_")
    end

    private

    # Collects validation failures in @onco_error and returns true when there
    # are none. Only the alleles are checked; the position checks are disabled.
    def valid_onco_input?
      @onco_error = []
      #@onco_error.push 'Malformed start position' unless start.to_s =~ Mutation::Oncotate::CHROM_POS
      #@onco_error.push 'Malformed stop position' unless stop.to_s =~ Mutation::Oncotate::CHROM_POS
      @onco_error.push 'Malformed reference allele' unless ref =~ Mutation::Oncotate::ALLELE_SEQ
      @onco_error.push 'Malformed alt allele' unless alt =~ Mutation::Oncotate::ALLELE_SEQ
      @onco_error.empty?
    end
  end
  include Oncotate
end
data/lib/sdrf.rb ADDED
@@ -0,0 +1,14 @@
1
+ require 'hash_table'
2
+
3
# Table view of a tab-separated SDRF file. The first two columns are named
# :extract_name and :tcga_barcode; every further column is named by position
# (:header0, :header1, ...). The file's own header line is skipped.
class SDRF < HashTable
  # file - path to the SDRF file
  # opts - accepted for interface compatibility; not forwarded to HashTable
  def initialize file, opts={}
    # Block form closes the handle as soon as the header line has been read.
    header_line = File.open(file) { |io| io.gets }
    # chomp keeps the line terminator out of the last column name, and the -1
    # limit keeps trailing empty columns so the count matches the data rows.
    # An empty file has no columns at all.
    @sdrf_header = header_line ? header_line.chomp.split(/\t/, -1) : []
    super file, :header => [ :extract_name, :tcga_barcode ] + other_headers, :skip_header => true
  end

  # Positional names for every column after the first two; empty when the
  # header has two columns or fewer.
  def other_headers
    extra = @sdrf_header[2..-1] || []
    extra.each_index.map do |i|
      "header#{i}".to_sym
    end
  end
end
data/lib/tcga.rb ADDED
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'hash_table'
4
+ require 'sdrf'
5
+ require 'germ/config'
6
+
7
# Helpers for locating and loading TCGA data files for a sample.
module TCGA
  # A TCGA sample, identified by its barcode and cancer type.
  class Sample
    attr_reader :barcode, :cancer_type

    def initialize barcode, cancer_type
      @barcode = barcode
      @cancer_type = cancer_type
    end
  end

  # RSEM gene-expression data for one sample, located through the SDRF
  # configured for the sample's cancer type.
  class RsemExpression
    # sample - object responding to barcode and cancer_type
    def initialize sample
      @sample = sample
    end

    # Path of the sample's RSEM gene file, or nil when no SDRF is configured
    # or the SDRF has no matching row.
    def gene_file
      table = sdrf
      # Without a configured SDRF there is nothing to search.
      return nil unless table
      # header21 and header19 are the positional column names SDRF assigns;
      # they are compared against 'RSEM_genes' and used as the file name.
      row = table.find do |entry|
        entry.tcga_barcode == @sample.barcode && entry.header21 == 'RSEM_genes'
      end
      return nil unless row
      File.join(GermConfig.get_conf(:tcga, :cancer_types, @sample.cancer_type, :rsem_exp_raw), row.header19)
    end

    # HashTable of the gene file indexed by :gene_id, loaded once and cached;
    # nil when the gene file cannot be located.
    def gene_exp
      return @gene_exp if @gene_exp
      # Resolve the path once; each gene_file call scans the SDRF.
      path = gene_file
      return nil unless path
      @gene_exp = HashTable.new path, :header => { :gene_id => :str,
                                                   :raw_count => :float,
                                                   :scaled_estimate => :float,
                                                   :transcript_id => :str },
                                      :skip_header => true,
                                      :idx => :gene_id
    end

    # SDRF table for the sample's cancer type, cached after the first load;
    # nil when no SDRF path is configured.
    def sdrf
      return @sdrf if @sdrf
      sdrf_file = GermConfig.get_conf(:tcga, :cancer_types, @sample.cancer_type, :rsem_exp_sdrf)
      @sdrf = SDRF.new sdrf_file if sdrf_file
    end
  end
end