germ 0.1 → 0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/intervals.rb CHANGED
@@ -1,142 +1,27 @@
1
1
  #!/usr/bin/env ruby
2
-
3
- class IntervalList
4
- include Enumerable
5
- class OrderedList
6
- include Enumerable
7
- def initialize ints
8
- @track = ints
9
- end
10
-
11
- def each
12
- @track.each do |t|
13
- yield t
14
- end
15
- end
16
-
17
- def intersect interval
18
- ovs = overlap interval
19
- return nil if !ovs
20
- ovs.map{|s| s.strict_overlap interval }
21
- end
22
-
23
- def overlap interval
24
- # first, find the lowest interval that is not below the given interval
25
- low = (0...@track.size).bsearch do |i|
26
- !@track[i].below? interval
27
- end
28
- # if low is nil, all of the intervals are below the search
29
- # otherwise, low might be the first interval
30
- return nil if !low || (low == 0 && @track[low].above?(interval))
31
-
32
- # now you have a real value on the low end!
33
- # get the first guy who is above the interval
34
- high = (0...@track.size).bsearch do |i|
35
- @track[i].above? interval
36
- end
37
- # if nil, all of these guys are not above the interval
38
- high = high ? high - 1 : @track.size-1
39
- o = @track[ low..high ]
40
- o.empty? ? nil : o
41
- end
42
-
43
- def nearest interval
44
- # find the first guy who is above the interval
45
- low = (0...@track.size).bsearch do |i|
46
- !@track[i].below? interval
47
- end
48
-
49
- return @track.last if !low
50
- return @track[low] if low == 0
51
- prev = @track[ low - 1]
52
- @track[low].dist(interval) > prev.dist(interval) ? prev : @track[low]
53
- end
54
- end
55
- class BinaryTree
56
- attr_reader :max
57
- def self.create intervals
58
- new intervals.sort_by(&:start)
59
- end
60
- def initialize intervals
61
- # assume they are sorted by start
62
- low, high = intervals.each_slice((intervals.size/2.0).round).to_a
63
- @node = low.pop
64
- @left = BinaryTree.new low unless low.empty?
65
- @right = BinaryTree.new high unless high.nil?
66
- update_max
67
- end
68
-
69
- def update_max
70
- # set your max to the max of your children
71
- @max = @node.stop
72
- @max = @left.max if @left && @left.max > @max
73
- @max = @right.max if @right && @right.max > @max
74
- end
75
-
76
- def nearest interval
77
- #
78
- end
79
-
80
- def overlap interval
81
- ols = []
82
- return ols if interval.start > @max
83
- ols.concat @left.overlap(interval) if @left
84
- ols.push @node if @node.overlaps? interval
85
- ols.concat @right.overlap(interval) if @right && !@node.above?(interval)
86
- ols
87
- end
88
- end
89
- class Tree
90
- def self.create intervals
91
- new intervals.sort_by(&:start), intervals.sort_by(&:stop)
92
- end
93
- def initialize starts, stops
94
- # find the midpoint
95
- midp = (starts.first.start + stops.last.stop) / 2
96
- @mid = starts.clone :pos => midp
97
-
98
- l = left_tree starts, stops
99
- r = right_tree starts, stops
100
- @left = IntervalList::Tree.new *l unless l.first.empty?
101
- @right = IntervalList::Tree.new *r unless r.first.empty?
102
- @center_start = starts - l.first - r.first
103
- @center_stop = stops - l.last - r.last
104
- end
105
-
106
- private
107
- def left_tree starts, stops
108
- low = (0...stops.size).bsearch do |i|
109
- !stops[i].below? @mid
110
- end
111
- left_stops = (low == 0 ? [] : stops[0..low-1])
112
- return [ [], [] ] if left_stops.empty?
113
- left_starts = starts & left_stops
114
- [ left_stops, left_starts ]
115
- end
116
-
117
- def right_tree starts, stops
118
- low = (0...starts.size).bsearch do |i|
119
- starts[i].above? @mid
120
- end
121
- right_starts = (!low ? [] : starts[low..-1])
122
- return [ [], [] ] if right_starts.empty?
123
- right_stops = stops & right_starts
124
- [ right_starts, right_stops ]
125
- end
126
- end
2
+ #
3
+ #
4
+ # Operations that can be defined on a pair of intervals, yielding a new set of intervals
5
+ #
6
+ # - a.intersect(b) = intersection between a and b, nil if no overlap
7
+ # - a.union(b) = union between a and b, nil if no overlap
8
+ # - a.diff(b) = set of regions of a that are not covered by b, { a } if no overlap
9
+ #
10
+ # Operations that can be defined on interval b and set a
11
+ # - a.overlap(b) - set of intervals in a that overlap b
12
+ #
13
+ # Operations on a set of intervals a
14
+ # - flatten - collapses overlapping intervals
15
+ # -
16
+
17
+ module IntervalList
127
18
  module Interval
128
- # this interface needs to implement :chrom, :start, :stop, and :clone
129
- def clone opts={}
19
+ # classes that include this module need to implement :seqname, :start, :stop, and :copy
20
+ def clone
130
21
  c = copy
131
- c.chrom = opts[:chrom] if opts[:chrom]
132
- c.start = opts[:start] if opts[:start]
133
- c.stop = opts[:stop] if opts[:stop]
134
- c.start = opts[:pos] if opts[:pos]
135
- c.stop = opts[:pos] if opts[:pos]
22
+ yield c if block_given?
136
23
  return c
137
24
  end
138
- #def start= ns; @start = ns; end
139
- #def stop= ns; @stop = ns; end
140
25
 
141
26
  def below? interval
142
27
  stop < interval.start
@@ -147,53 +32,49 @@ class IntervalList
147
32
  end
148
33
 
149
34
  def overlaps? interval
150
- chrom == interval.chrom && !below?(interval) && !above?(interval)
35
+ seqname == interval.seqname && !below?(interval) && !above?(interval)
151
36
  end
152
37
 
153
38
  def contains? interval
154
39
  if interval.is_a? Numeric
155
40
  start <= interval && stop >= interval
156
41
  else
157
- chrom == interval.chrom && start <= interval.start && stop >= interval.stop
42
+ seqname == interval.seqname && start <= interval.start && stop >= interval.stop
158
43
  end
159
44
  end
160
45
 
161
- def strict_overlap interval
46
+ def intersect interval
162
47
  return nil if !overlaps? interval
163
48
 
164
- clone chrom, [ interval.start, start ].max, [ interval.stop, stop ].min
49
+ clone do |c|
50
+ c.seqname = seqname,
51
+ c.start = [ interval.start, start ].max
52
+ c.stop = [ interval.stop, stop ].min
53
+ end
165
54
  end
166
55
 
167
- def strict_diff interval
168
- ol = strict_overlap interval
169
- return IntervalList.new [ self ] if !ol
56
+ def diff interval
57
+ ol = overlap interval
58
+ if !ol
59
+ return yield([ self ])
60
+ end
61
+
170
62
  ints = []
171
63
  if ol.start > start
172
- ints.push clone( :start => start, :stop => ol.start-1 )
64
+ ints.push(clone { |c| c.start = start; c.stop = ol.start-1 })
173
65
  end
174
66
  if ol.stop < stop
175
- ints.push clone(:start => ol.stop+1, :stop => stop)
176
- end
177
- if !ints.empty?
178
- return IntervalList.new ints
67
+ ints.push(clone { |c| c.start = ol.stop+1; c.stop = stop })
179
68
  end
69
+ return yield(ints)
180
70
  end
181
71
 
182
- def strict_union interval
72
+ def union interval
183
73
  return nil unless interval && overlaps?(interval)
184
- clone :start => [ interval.start, start ].min, :stop => [ interval.stop, stop ].max
185
- end
186
-
187
- def overlap interval_list
188
- interval_list.overlap self
189
- end
190
-
191
- def nearest interval_list
192
- interval_list.nearest self
193
- end
194
-
195
- def intersect interval_list
196
- interval_list.intersect self
74
+ clone do |c|
75
+ c.start = [ interval.start, start ].min
76
+ c.stop = [ interval.stop, stop ].max
77
+ end
197
78
  end
198
79
 
199
80
  def size
@@ -207,131 +88,225 @@ class IntervalList
207
88
  def dist interval
208
89
  (center-interval.center).abs
209
90
  end
91
+ end
92
+ end
210
93
 
211
- def intersection_size interval_list
212
- return 0 if !inters = intersect(interval_list)
213
- inters.inject(0) {|sum,int| sum += int.size}
214
- end
94
+ module IntervalList
95
+ def overlap interval
96
+ return present([]) unless interval_set[interval.seqname]
97
+ present(interval_set[interval.seqname].overlap interval)
215
98
  end
216
- class BasicInterval
217
- include Interval
218
99
 
219
- attr_accessor :chrom, :start, :stop, :data
100
+ def nearest interval
101
+ return nil unless interval_set[interval.seqname]
102
+ interval_set[interval.seqname].nearest interval
103
+ end
220
104
 
221
- def initialize opts
222
- @chrom = opts[:chrom]
223
- @start = opts[:start]
224
- @stop = opts[:stop]
225
- @stop = @start = opts[:pos] if opts[:pos]
226
- @data = opts[:data]
227
- end
228
- def copy
229
- self.class.new :chrom => @chrom, :start => @start, :stop => @stop, :data => @data
105
+ def flatten
106
+ current_span = nil
107
+ flat = []
108
+ each do |interval|
109
+ if current_span && current_span.overlaps?(interval)
110
+ current_span.stop = interval.stop
111
+ else
112
+ # you reached a new span
113
+ if current_span
114
+ yield current_span if block_given?
115
+ flat.push current_span
116
+ end
117
+ current_span = interval.clone
118
+ end
230
119
  end
231
- def inspect
232
- "#<#{self.class}:0x#{'%x' % (object_id << 1)} @chrom=#{@chrom} @start=#{@start} @stop=#{@stop}>"
120
+ if current_span
121
+ yield current_span if block_given?
122
+ flat.push current_span
233
123
  end
124
+ present flat
234
125
  end
235
126
 
236
- def each
237
- @intervals.each do |int|
238
- yield int
127
+ def present obj
128
+ if respond_to? :wrap
129
+ wrap obj
130
+ else
131
+ obj
239
132
  end
240
133
  end
241
134
 
242
- def overlap interval
243
- track = @ints_chrom[interval.chrom]
244
- return nil if !track
245
- track.overlap interval
135
+ def add_interval int
136
+ # don't bother if the tree hasn't been built yet
137
+ @interval_set << int if @interval_set
246
138
  end
247
139
 
248
- def nearest interval
249
- track = @ints_chrom[interval.chrom]
250
- return nil if !track
251
- track.nearest interval
140
+ def interval_set
141
+ # create a new set of intervals
142
+ @interval_set ||= IntervalList::Set.new self.to_a
252
143
  end
144
+ end
253
145
 
254
- def intersect interval
255
- track = @ints_chrom[interval.chrom]
256
- return nil if !track
257
- track.intersect interval
258
- end
146
+ module IntervalList
147
+ class TreeNode
148
+ attr_reader :max
149
+
150
+ def initialize intervals
151
+ # assume they are sorted by start
152
+
153
+ low, high = intervals.each_slice((intervals.size/2.0).round).to_a
259
154
 
260
- # subtract this set of intervals from the given interval_list
261
- def diff interval_list
262
- interval_list.map do |int|
263
- ols = overlap(int)
264
- # if there are no overlaps, return int
265
- unless ols
266
- int
155
+ @node = low.pop
156
+ @left = TreeNode.new low unless low.empty?
157
+ @right = TreeNode.new high unless high.nil?
158
+
159
+ update_max
160
+ end
161
+
162
+ def add interval
163
+ if interval.start < @node.start
164
+ if @left
165
+ @left.add interval
166
+ else
167
+ @left = TreeNode.new [interval]
168
+ end
267
169
  else
268
- int = ols.each do |ol|
269
- int.strict_diff(ol).to_a
270
- end.flatten
170
+ if @right
171
+ @right.add interval
172
+ else
173
+ @right = TreeNode.new [interval]
174
+ end
271
175
  end
176
+ update_max
272
177
  end
273
- end
274
178
 
275
- def initialize array, opts = {}
276
- @intervals = []
277
- @ints_chrom = {}
278
- array.each do |item|
279
- if item.is_a? IntervalList::Interval
280
- int = item
179
+ def update_max
180
+ # set your max to the max of your children
181
+ @max = @node
182
+ @max = @left.max if @left && @left.max.stop > @max.stop
183
+ @max = @right.max if @right && @right.max.stop > @max.stop
184
+ end
185
+
186
+ def is_max?
187
+ @max == @node
188
+ end
189
+
190
+
191
+ def breadth_traverse &block
192
+ @left.breadth_traverse(&block) if @left
193
+ yield @node
194
+ @right.breadth_traverse(&block) if @right
195
+ end
196
+
197
+ def depth_traverse &block
198
+ yield @node
199
+ @left.breadth_traverse(&block) if @left
200
+ @right.breadth_traverse(&block) if @right
201
+ end
202
+
203
+ def nearest interval
204
+ # if there are overlaps, pick the one with the closest distance
205
+
206
+ ol = overlap(interval)
207
+ if !ol.empty?
208
+ return ol.min do |a,b|
209
+ interval.dist(a) <=> interval.dist(b)
210
+ end
211
+ end
212
+
213
+ # there are no overlaps. Find the highest stop that is less than interval.start
214
+ [ nearest_stop(interval),
215
+ nearest_start(interval) ].compact.min do |a,b|
216
+ interval.dist(a) <=> interval.dist(b)
281
217
  end
282
- @intervals.push int
283
- @ints_chrom[int.chrom] ||= []
284
- @ints_chrom[int.chrom].push int
285
218
  end
286
219
 
287
- sort_ints_chrom opts[:type]
220
+ def overlap interval
221
+ ols = []
222
+ return ols if interval.start > @max.stop
223
+ ols.concat @left.overlap(interval) if @left
224
+ ols.push @node if @node.overlaps? interval
225
+ ols.concat @right.overlap(interval) if @right && !@node.above?(interval)
226
+ ols
227
+ end
288
228
  end
229
+ end
289
230
 
290
- def inspect
291
- "#<#{self.class}:0x#{'%x' % (object_id << 1)} @intervals=#{@intervals.size}>"
231
+ module IntervalList
232
+ class Set
233
+ def initialize array
234
+ @seqs = {}
235
+ array.each do |item|
236
+ self << item
237
+ end
238
+ end
239
+
240
+ def << item
241
+ @seqs[item.seqname] ||= IntervalList::Tree.new
242
+ @seqs[item.seqname] << item
243
+ end
244
+ def [] ind
245
+ @seqs[ind]
246
+ end
247
+ def each
248
+ @seqs.each do |seq|
249
+ yield seq
250
+ end
251
+ end
252
+ def inspect
253
+ "#<#{self.class}:0x#{'%x' % (object_id << 1)} @seqs=#{@seqs.keys}>"
254
+ end
292
255
  end
293
256
 
294
- attr_reader :ints_chrom
257
+ class Tree
258
+ def initialize
259
+ @intervals = []
260
+ end
295
261
 
296
- def collapse!
297
- # collapse this set of intervals down to a shorter one
298
- @ints_chrom.each do |chrom,list|
299
- @ints_chrom[chrom] = collapsed_list list
262
+ def << int
263
+ @intervals << int
300
264
  end
301
265
 
302
- @intervals = @ints_chrom.map(&:last).flatten
303
- self
304
- end
266
+ def build_tree
267
+ IntervalList::TreeNode.new intervals_start
268
+ end
305
269
 
306
- private
307
- def collapsed_list intervals
308
- new_list = []
309
- cache_interval = nil
310
- intervals.each do |interval|
311
- # it should be sorted already
312
- if cache_interval
313
- if !un = cache_interval.strict_union(interval)
314
- new_list.push cache_interval
315
- cache_interval = interval
316
- else
317
- cache_interval = un
270
+ def intervals_start
271
+ @intervals_start ||= @intervals.sort_by &:start
272
+ end
273
+
274
+ def intervals_stop
275
+ @intervals_stop ||= @intervals.sort_by { |i| -1 * i.stop }
276
+ end
277
+
278
+ def nearest interval
279
+ # first see if you have an overlap
280
+ ols = overlap(interval)
281
+
282
+ unless ols.empty?
283
+ return ols.min do |int|
284
+ int.dist(interval)
318
285
  end
319
- else
320
- cache_interval = interval
321
286
  end
322
- end
323
- new_list.push cache_interval if cache_interval
324
- new_list
325
- end
326
287
 
327
- def sort_ints_chrom type
328
- @ints_chrom.each do |chrom,list|
329
- case type
330
- when nil, :btree
331
- @ints_chrom[chrom] = IntervalList::BinaryTree.new list.sort_by{ |int| int.start }
332
- when :flat
333
- @ints_chrom[chrom] = IntervalList::OrderedList.new list.sort_by{ |int| int.start }
288
+ # you can just use the sorted intervals to do this
289
+ lowest_start = intervals_start.bsearch do |i|
290
+ i.above? interval
291
+ end
292
+ highest_stop = intervals_stop.bsearch do |i|
293
+ i.below? interval
294
+ end
295
+ [ lowest_start, highest_stop ].compact.min do |i|
296
+ i.dist(interval)
334
297
  end
335
298
  end
299
+
300
+ def tree
301
+ @tree ||= build_tree
302
+ end
303
+
304
+ def respond_to_missing? sym, include_all = false
305
+ tree.respond_to?(sym) || super
306
+ end
307
+
308
+ def method_missing sym, *args, &block
309
+ tree.send(sym, *args, &block)
310
+ end
336
311
  end
337
312
  end
data/lib/maf.rb CHANGED
@@ -1,78 +1,69 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'oncotator'
4
3
  require 'yaml'
5
4
  require 'mutation_set'
6
5
 
7
- class Maf < MutationSet::Sample
8
- requires "Hugo_Symbol", "Entrez_Gene_Id", "Center",
9
- "NCBI_Build", "Chromosome",
10
- "Start_Position", "End_Position", "Strand",
11
- "Variant_Classification", "Variant_Type",
12
- "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2",
13
- "dbSNP_RS", "dbSNP_Val_Status",
14
- "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode",
15
- "Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2",
16
- "Tumor_Validation_Allele1", "Tumor_Validation_Allele2",
17
- "Match_Norm_Validation_Allele1", "Match_Norm_Validation_Allele2",
18
- "verification_Status", "Validation_Status",
19
- "Mutation_Status", "Sequencing_Phase", "Sequence_Source",
20
- "Validation_Method", "Score" #, "BAM_File", "Sequencer"
6
+ class Maf < Mutation::Collection
7
+ header_on
8
+ requires :hugo_symbol => :str, :entrez_gene_id => :str, :center => :str,
9
+ :ncbi_build => :str, :chromosome => :str,
10
+ :start_position => :int, :end_position => :int, :strand => :str,
11
+ :variant_classification => :str, :variant_type => :str,
12
+ :reference_allele => :str, :tumor_seq_allele1 => :str, :tumor_seq_allele2 => :str,
13
+ :dbsnp_rs => :str, :dbsnp_val_status => :str,
14
+ :tumor_sample_barcode => :str, :matched_norm_sample_barcode => :str,
15
+ :match_norm_seq_allele1 => :str, :match_norm_seq_allele2 => :str,
16
+ :tumor_validation_allele1 => :str, :tumor_validation_allele2 => :str,
17
+ :match_norm_validation_allele1 => :str, :match_norm_validation_allele2 => :str,
18
+ :verification_status => :str, :validation_status => :str,
19
+ :mutation_status => :str, :sequencing_phase => :str, :sequence_source => :str,
20
+ :validation_method => :str, :score => :str
21
+ might_have :tumor_var_freq => :float,
22
+ :tumor_ref_count => :int, :t_ref_count => :int,
23
+ :normal_ref_count => :int, :n_ref_count => :int,
24
+ :tumor_alt_count => :int, :t_alt_count => :int,
25
+ :normal_alt_count => :int, :n_alt_count => :int
21
26
  comments "#"
22
27
 
23
28
  def preamble
24
29
  "#version 2.2"
25
30
  end
26
31
 
27
- class Line < MutationSet::Line
28
- alias_key :chrom, :chromosome
32
+ class Line < Mutation::Record
33
+ alias_key :seqname, :chromosome
34
+ alias_key :pos, :start_position
29
35
  alias_key :start, :start_position
30
36
  alias_key :stop, :end_position
31
- alias_key :ref_allele, :reference_allele
32
-
33
- def skip_maf?
34
- criteria_failed?(self, :maf)
37
+ alias_key :ref, :reference_allele
38
+ def alt
39
+ tumor_seq_allele1 == reference_allele ? tumor_seq_allele2 : tumor_seq_allele1
35
40
  end
36
41
 
37
- def key
38
- [ tumor_sample_barcode, chrom, start, stop ].join(":")
42
+ def initialize h, table
43
+ super h, table
44
+ @muts.push Mutation.new(seqname, pos, ref, alt, ref_count, alt_count)
39
45
  end
40
46
 
41
- def alt_allele
42
- tumor_seq_allele1 == reference_allele ? tumor_seq_allele2 : tumor_seq_allele1
47
+ def respond_to_missing? sym, include_all = false
48
+ [ :ref_count, :alt_count ].include?(sym) || super
43
49
  end
44
50
 
45
- def _ref_count
46
- [ :t_ref_count, :tumor_ref_count, :ref_count ].each do |s|
47
- if respond_to? s
48
- return send(s)
51
+ def method_missing sym, *args, &block
52
+ if sym == :ref_count
53
+ [ :t_ref_count, :tumor_ref_count ].each do |s|
54
+ return send(s) if respond_to? s
49
55
  end
50
- end
51
- nil
52
- end
53
-
54
- def _alt_count
55
- [ :t_alt_count, :tumor_alt_count, :alt_count ].each do |s|
56
- if respond_to? s
57
- return send(s)
56
+ nil
57
+ elsif sym == :alt_count
58
+ [ :t_alt_count, :tumor_alt_count ].each do |s|
59
+ return send(s) if respond_to? s
58
60
  end
59
- end
60
- nil
61
- end
62
-
63
- def chrom_name
64
- # properly format the name
65
- if chromosome =~ /chr/
66
- chromosome
61
+ nil
67
62
  else
68
- "chr#{chromosome}"
63
+ super
69
64
  end
70
65
  end
71
66
 
72
- def is_coding?
73
- variant_classification =~ /(Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Missense_Mutation|Nonsense_Mutation|Splice_Site|Translation_Start_Site)/
74
- end
75
-
76
67
  def gene_name
77
68
  if !hugo_symbol || hugo_symbol.size == 0
78
69
  onco.txp_gene
@@ -80,13 +71,6 @@ class Maf < MutationSet::Sample
80
71
  hugo_symbol
81
72
  end
82
73
  end
83
-
84
- def var_freq
85
- if !_ref_count.empty? && !_alt_count.empty?
86
- _ref_count.to_f / (_ref_count.to_i + _alt_count.to_i)
87
- else
88
- nil
89
- end
90
- end
91
74
  end
75
+ line_class Maf::Line
92
76
  end