bio-gngm 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "bio-gngm"
8
- s.version = "0.1.0"
8
+ s.version = "0.2.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Dan MacLean"]
12
- s.date = "2012-03-19"
12
+ s.date = "2012-11-15"
13
13
  s.description = "Identify causative mutations in a model genome from NGS reads using the NGM method."
14
14
  s.email = "maclean.daniel@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -89,6 +89,7 @@ Gem::Specification.new do |s|
89
89
  "examples/use_indels.rb",
90
90
  "lib/bio-gngm.rb",
91
91
  "lib/bio/util/bio-gngm.rb",
92
+ "lib/bio/util/mutation_effects.rb",
92
93
  "scripts/get_subseq.rb",
93
94
  "scripts/make_histograms_laerfyve.rb",
94
95
  "scripts/make_histograms_laerfyve_stitched.rb",
@@ -14,27 +14,53 @@ $LOAD_PATH.unshift(File.dirname(__FILE__))
14
14
  require 'bio-gngm'
15
15
  require 'bio-samtools'
16
16
 
17
+ #def make_snp_array(id, file)
18
+ # a = []
19
+ # File.open(file, "r").each do |line|
20
+ # arr = line.split(/\t/)
21
+ # next if arr[0] !~ /#{id}/
22
+ # a << [a]
23
+ # end
24
+ #end
17
25
 
26
+ sequences = Bio::DB::FastaLengthDB.new(:file => ARGV[0])
27
+ $stderr.puts "Loaded sequences..."
18
28
 
19
29
 
20
- #open the BAM file and specify the region of interest
21
- g = Bio::Util::Gngm.new(:file => "aln.bam",
22
- :format => :bam,
23
- :fasta => "reference.fasta",
24
- :samtools => {:r => "Chr1:1-6000000",
30
+ sequences.each do |id, length|
31
+ g = Bio::Util::Gngm.new(:file => ARGV[1],
32
+ #:format => :pileup,
33
+ :format => :bam,
34
+ :fasta => ARGV[0],
35
+ :chromosome => id,
36
+ :start => 1,
37
+ :stop => length,
38
+ :samtools => {
25
39
  :q => 20,
26
- :Q => 50
27
- }
28
- )
29
- #retrieve the SNPs from the BAM file
30
- g.snp_positions
31
-
32
- #plot a frequency histogram for different bin sizes
33
- [100000, 250000, 500000].each do |bin_width|
34
- file_name = "#{bin_width}.png"
40
+ :Q => 20
41
+ },
42
+ :variant_call => {
43
+ :indels => false,
44
+ :deletions_only => false,
45
+ :insertions_only => false,
46
+ :min_depth => 6,
47
+ :max_depth => 250,
48
+ :mapping_quality => 20.0,
49
+ :min_non_ref_count => 2,
50
+ :ignore_reference_n => true,
51
+ :min_snp_quality => 20,
52
+ :min_consensus_quality => 20
53
+ }
54
+ )
55
+ $stderr.puts "getting #{id}.."
56
+
57
+ g.snp_positions
58
+ puts "Found #{g.snp_positions.length} SNPs"
59
+ [100000, 250000, 500000].each do |bin_width|
60
+ $stderr.puts "working on #{bin_width} windows"
61
+ file_name = "test_#{id}_#{bin_width}.png"
35
62
  g.frequency_histogram("#{file_name}",bin_width)
36
- end
37
-
38
- #close the BAM file
63
+ end
39
64
  g.close
65
+ end
40
66
 
@@ -10,3 +10,4 @@
10
10
  # was ever to get merged into the main bioruby tree.
11
11
 
12
12
  require 'bio/util/bio-gngm'
13
+ require 'bio/util/mutation_effects'
@@ -5,13 +5,36 @@
5
5
  # Created by Dan MacLean (TSL) on 2011-12-07.
6
6
  # Copyright (c) . All rights reserved.
7
7
  ###################################################
8
-
8
+ require 'rubygems'
9
9
  require 'rinruby'
10
10
  require 'bio-samtools'
11
11
  require 'bio/db/pileup'
12
12
  require 'bio/db/vcf'
13
13
  require 'pp'
14
14
 
15
+ =begin
16
+ Simple class representing a file of FASTA-format sequences and the length of each one
17
+ =end
18
# Maps every sequence id in a FASTA file to the length of its sequence.
class Bio::DB::FastaLengthDB
  require 'bio'

  # args[:file] is the path of the FASTA file to index. The whole file is
  # read once here and only the id => length pairs are kept.
  def initialize(args)
    @file = args[:file]
    @seqs = {}
    fasta = Bio::FastaFormat.open(@file)
    begin
      fasta.each do |entry|
        @seqs[entry.entry_id] = entry.length
      end
    ensure
      # the handle is only needed while the lengths are collected
      fasta.close
    end
  end

  # Yields each sequence id and its length, in sorted id order.
  # Defined at class level so it exists independently of #initialize.
  def each
    @seqs.keys.sort.each do |id|
      yield id, @seqs[id]
    end
  end
end
36
+
37
+
15
38
  =begin
16
39
  Extends the methods of the Bio::DB::Pileup class in bio-samtools. A pileup object represents the SAMtools pileup format at
17
40
  http://samtools.sourceforge.net/pileup.shtml. These extension methods are used by the Bio::Util::Gngm object internally and
@@ -53,11 +76,10 @@ class Bio::DB::Pileup
53
76
  # pileup.is_snp?(:min_depth => 5, :min_non_ref_count => 2)
54
77
  # pileup.is_snp?(:min_depth => 5, :min_non_ref_count => 1, :ignore_reference_n => true)
55
78
  def is_snp?(opts)
56
- if opts[:ignore_reference_n] and self.ref_base == "N" or self.ref_base == "n"
57
- return false
58
- elsif self.coverage >= opts[:min_depth] and self.non_ref_count >= opts[:min_non_ref_count]
59
- return true
60
- end
79
+ return false if self.ref_base == '*'
80
+ #return false unless is_ct
81
+ return false if opts[:ignore_reference_n] and self.ref_base == "N" or self.ref_base == "n"
82
+ return true if self.coverage >= opts[:min_depth] and self.non_ref_count >= opts[:min_non_ref_count]
61
83
  false
62
84
  end
63
85
  end
@@ -131,9 +153,9 @@ class Bio::DB::Vcf
131
153
  end
132
154
 
133
155
  #Returns true if only one variant allele is recorded. Loci with more than one allele are too complicated for now, so are discarded...
134
- def has_just_one_variant?
135
- self.alternatives.length == 1 and self.variant?
136
- end
156
+ #def has_just_one_variant?
157
+ # self.alternatives.length == 1 and self.variant?
158
+ #end
137
159
 
138
160
  #Returns true if the position passes criteria
139
161
  #
@@ -143,38 +165,27 @@ class Bio::DB::Vcf
143
165
  #- :mapping_quality => 10
144
166
  #
145
167
  #Example
146
- # vcf.pass_quality?(:min_depth => 5, :min_non_ref_count => 2, :mapping_quality => 25)
168
+ # vcf.pass_quality?(:min_depth => 5, :min_non_ref_count => 2, :mapping_quality => 25, :min_snp_quality => 20)
147
169
  def pass_quality?(options)
148
- (self.used_depth >= options[:min_depth] and self.mq >= options[:mapping_quality] and self.non_ref_allele_count >= options[:min_non_ref_count])
170
+ (self.used_depth >= options[:min_depth] and self.mq >= options[:mapping_quality] and self.non_ref_allele_count >= options[:min_non_ref_count] and self.qual >= options[:min_snp_quality])
149
171
  end
150
172
 
151
- #Returns true if the length of the alt column is less than that of the ref column in the Vcf and if Vcf.pass_quality? is true.
152
- #Looks only at the positions that are predicted simple deletions, any positions where the alt alleles includes more than one deletion or a SNP or an insertion also is ignored.
153
- def is_deletion?(options)
154
- case
155
- when (not self.has_just_one_variant?) then false
156
- when ( self.alt.length < self.ref.length and self.pass_quality?(options) ) then true
157
- else false
158
- end
159
- rescue ## if something goes wrong, skip the postion,
173
+ #returns true if ref col has same length as all alternatives and position variant passes quality
174
+ def is_mnp?(options)
175
+ return true if self.alternatives.all? {|x| x.length == self.ref.length} and self.pass_quality?(options)
160
176
  false
161
177
  end
162
178
 
163
- #Returns true if the length of the alt column is greater than that of the ref column in the Vcf and if Vcf.pass_quality? is true.
164
- #Looks only at the positions that are predicted simple deletions, any positions where the alt alleles includes more than one deletion or a SNP or an insertion also is ignored.
165
- def is_insertion?(options)
166
- case
167
- when (not self.has_just_one_variant?) then false
168
- when ( self.alt.length > self.ref.length and self.pass_quality?(options) ) then true
169
- else false
170
- end
171
- rescue ## if something goes wrong, skip the postion,
172
- false
179
+ ##returns true if ref col has length of 1 and is_mnp?
180
+ def is_snp?(options)
181
+ return true if self.is_mnp?(options) and self.ref.length == 1
182
+ false
173
183
  end
174
184
 
175
- #Returns true if either Vcf.is_insertion? or Vcf.is_deletion? is true
176
- def is_indel?(opts)
177
- self.is_insertion?(opts) || self.is_deletion?(opts)
185
+ #Returns true if ref col is different in length from any of the entries in alt column
186
+ def is_indel?(options)
187
+ return true if self.variant? and self.alternatives.any? {|x| x.length != self.ref.length} and self.pass_quality?(options)
188
+ false
178
189
  end
179
190
 
180
191
 
@@ -321,19 +332,33 @@ class Gngm
321
332
  #
322
333
  # g = Bio::Util::Gngm.new(:file => "aln.sort.bam",
323
334
  # :format => :bam,
324
- # :samtools => {:q => 20, :Q => 50, :r => "Chr1:1-100000"},
335
+ # :samtools => {:q => 20, :Q => 50},
325
336
  # :fasta => "reference.fa"
326
- #
337
+ # :start => 100,
338
+ # :stop => 200
327
339
  # )
328
340
  #
329
341
  #Required parameters and defaults:
330
342
  #- <tt>:file => nil</tt> -the path to the bam file containing the alignments, a .bai index must be present
331
- #- <tt>:format => :bam</tt> -always bam
343
+ #- <tt>:format => :bam</tt> -one of :bam, :pileup (expected to be the 10-column format from samtools pileup -vcf) or :text (tab-delimited: chr, pos, ref, alt, freq)
344
+ #- <tt>:chromosome => nil</tt> -sequence id to look at
345
+ #- <tt>:start => nil</tt> -start position on that sequence
346
+ #- <tt>:stop => nil</tt> -stop position on that sequence
332
347
  #- <tt>:fasta => nil</tt> -the path to the FASTA formatted reference sequence
333
- #- <tt>:samtools => {:q => 20, :Q => 50, :r => "Chr1:100-1100"}</tt> -options for samtools, see bio-samtools documentation for further details. The :r option is required to specify the region of interest
348
+ #- <tt>:samtools => {:q => 20, :Q => 50}</tt> -options for samtools, see bio-samtools documentation for further details. The :r region option is filled in automatically from :chromosome, :start and :stop
334
349
  #Optional parameters and defaults:
350
+
335
351
  #Most of these are parameters for specific methods and can be over-ridden when particular methods are called
336
- #- <tt>:variant_call => {:indels => false, :deletions_only => false, :insertions_only => false, :min_depth => 2, :max_depth => 10000000, :mapping_quality => 10.0, :min_non_ref_count => 2, :ignore_reference_n => true}</tt> -for SNP/Indel calling only one of <tt>:indels, :deletions_only, :insertions_only</tt> should be used.
352
+ #- <tt>:variant_call => {:indels => false,
353
+ # :min_depth => 2,
354
+ # :max_depth => 10000000,
355
+ # :min_snp_quality => 20,
356
+ # :mapping_quality => 10.0,
357
+ # :min_non_ref_count => 2,
358
+ # :ignore_reference_n => true,
359
+ # :min_consensus_quality => 20,
360
+ # :min_snp_quality => 20 }</tt>.
361
+ # For Pileup files from old samtools pileup -vcf <tt>:min_consensus_quality</tt> can be applied
337
362
  #- <tt>:threads => {:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 }</tt> -options for thread windows
338
363
  #- <tt>:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150}</tt> -options for insert size calculations
339
364
  #- <tt>:histo_bin_width => 250000</tt> -bin width for histograms of SNP frequency
@@ -353,20 +378,30 @@ class Gngm
353
378
  @density_max_y = nil #the maximum y value needed to plot the entire set density plots of threads and maintain a consistent scale for plots
354
379
  @colours = %w[#A6CEE3 #1F78B4 #B2DF8A #33A02C #FB9A99 #E31A1C #FDBF6F #FF7F00 #CAB2D6 #6A3D9A #FFFF99 #B15928]
355
380
  @thread_colours = {}
381
+ @known_variants = nil #a list of variants to keep track of
356
382
  @opts = {
357
383
  :file => nil,
358
384
  :format => :bam,
359
385
  :fasta => nil,
360
386
  :samtools => {:q => 20, :Q => 50},
361
- ##indels = call any and only indels.. :deletions_only :insertions_only = only one tyoe
387
+ :indels => false,
388
+ ##:indels = call indels too, causes return of vcf
389
+ :insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150},
390
+ :variant_call => { :min_depth => 2,
391
+ :max_depth => 10000000,
392
+ :mapping_quality => 10.0,
393
+ :min_non_ref_count => 2,
394
+ :ignore_reference_n => true,
395
+ :shore_map => false,
396
+ :snp_file => :false,
397
+ :min_consensus_quality => 20,
398
+ :min_snp_quality => 20},
362
399
  ## some options are designed to be equivalent to vcfutils.pl options from bcftools when using vcf
363
400
  ##:min_depth (-d)
364
401
  ##:max_depth (-D)
365
402
  ##:mapping_quality (-Q) minimum RMS mapping quality for SNPs (mq in info fields)
366
403
  ##:min_non_ref_count (-a) minimum num of alt bases ... the sum of the last two numbers in DP4 in info fields
367
404
  ##doesnt do anything with window filtering or pv values...
368
- :insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150},
369
- :variant_call => {:indels => false, :deletions_only => false, :insertions_only => false, :min_depth => 2, :max_depth => 10000000, :mapping_quality => 10.0, :min_non_ref_count => 2, :ignore_reference_n => true},
370
405
  :histo_bin_width => 250000,
371
406
  :graphics => {:width => 1000, :height => 500, :draw_legend => false, :add_boxes => nil},
372
407
  :adjust => 1,
@@ -376,6 +411,7 @@ class Gngm
376
411
  :peaks => {:sigma => 3.0, :threshold => 10.0, :background => false, :iterations => 13, :markov => false, :window => 3, :range => 10000} ##range is the width of the box to draw on the peak plot
377
412
  }
378
413
  @opts.merge!(options)
414
+ @opts[:samtools][:r] = "#{options[:chromosome]}:#{options[:start]}-#{options[:stop]}"
379
415
  open_file
380
416
  end
381
417
 
@@ -384,6 +420,7 @@ class Gngm
384
420
  def open_file
385
421
  case @opts[:format]
386
422
  when :bam then open_bam
423
+ when :pileup, :text then open_text
387
424
  end
388
425
  end
389
426
 
@@ -394,6 +431,10 @@ class Gngm
394
431
  @file.open
395
432
  end
396
433
 
434
+ def open_text
435
+ @file = File.open(@opts[:file], "r")
436
+ end
437
+
397
438
  public
398
439
  #for BAM files calls Bio::DB::Sam#close to close the connections to input files safely
399
440
  def close
@@ -415,16 +456,31 @@ class Gngm
415
456
  #- <tt>:mapping_quality => 10.0</tt> -minimum mapping quality required for a read to be used in depth calculation
416
457
  #- <tt>:min_non_ref_count => 2</tt> -minimum number of reads not matching the reference for SNP to be called
417
458
  #- <tt>:ignore_reference_n => true</tt> -ignore positions where the reference is N or n
418
- #
459
+ #- <tt>:shore_map => false</tt> -use SHOREmap INTERVAL calculations as described in Anderson et al., 2009, Nature Methods 6. 8. Requires a file of known SNPs between the mapping line (eg Ler in Andersen et al.,) and a reference line (eg Col in Andersen et al).
460
+ #- <tt>:snp_file</tt> -file of known SNPs in format "reference sequence id \t position \t mapping line nucleotide identity \t reference line nucleotide identity". Only used when +:shore_map+ is set to true. Only SNPs listed in this file will be considered.
419
461
  #Set <tt>:indels => true</tt> to call indels; when it is +false+, SNPs are called.
420
462
  #
421
- #Sets the instance variable @snp_positions. Only gets positions the first time it is called, in subsequent calls pre-computed positions and statistics are returned, so changing parameters has no effect.
463
+ #calculates or returns the value of the instance variable @snp_positions. Only gets positions the first time it is called, in subsequent calls pre-computed positions and statistics are returned, so changing parameters has no effect.
422
464
  def snp_positions(optsa={})
423
465
  opts = @opts[:variant_call].merge(optsa)
424
466
  return @snp_positions if @snp_positions
425
- case
426
- when @file.instance_of?(Bio::DB::Sam) then get_snp_positions_from_bam(opts)
467
+ case @opts[:format]
468
+ when :bam then get_snp_positions_from_bam(opts)
469
+ when :text then get_snp_positions_from_text(opts)
470
+ when :pileup then get_snp_positions_from_pileup(opts)
471
+ end
472
+ end
473
+
474
+ ##allows the user to assign SNP positions
475
+ def snp_positions=(arr)
476
+ @snp_positions = arr
477
+ end
478
+
479
+ def is_allowed_substitution?(ref,alt,opts)
480
+ if opts[:substitutions].instance_of?(Array)
481
+ return false unless opts[:substitutions].include?("#{ref}:#{alt}")
427
482
  end
483
+ true
428
484
  end
429
485
 
430
486
  private
@@ -432,18 +488,14 @@ class Gngm
432
488
  #Sets @snp_positions
433
489
  def get_snp_positions_from_bam(options={})
434
490
  opts = @opts[:variant_call].merge(options)
435
- if opts[:indels] and (opts[:deletions_only] or opts[:insertions_only])
436
- raise "Cant have indels and deletions only or insertions only, need to specify ':indels => true' to get both"
437
- end
438
491
  arr = []
439
- ##when we are calling mpileup_plus we need to add :g to the samtools options
440
- if opts[:indels] or opts[:deletions_only] or opts[:insertions_only]
441
- @opts[:samtools][:g] = true
442
- end
492
+ ##when we are calling mpileup_plus we need to add :g to the samtools options
493
+ @opts[:samtools][:g] = true if opts[:indels]
494
+
443
495
 
444
496
  if not @opts[:samtools][:g]
445
497
  @file.mpileup(@opts[:samtools]) do |pileup|
446
- arr << [pileup.pos, pileup.discordant_chastity] if pileup.is_snp?(opts)
498
+ arr << [pileup.pos, pileup.discordant_chastity] if pileup.is_snp?(opts) and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
447
499
  end
448
500
  else
449
501
  @file.mpileup_plus(@opts[:samtools]) do |vcf|
@@ -451,19 +503,56 @@ class Gngm
451
503
  next if (opts[:ignore_reference_n] and vcf.ref =~ /N/i)
452
504
  ##indel use returns the vcf allele_frequency, not the ChDs (because calculating it is a mess... )
453
505
  if opts[:indels]
454
- arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts)
455
- elsif opts[:deletions_only]
456
- arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_deletion?(opts)
457
- elsif opts[:insertions_only]
458
- arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_insertion?(opts)
506
+ arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
507
+ else
508
+ arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_snp?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
459
509
  end
460
510
  end
461
511
  end
462
512
 
463
513
  @snp_positions = arr
514
+
464
515
  arr
465
516
  end
466
517
 
518
+ private
519
+ def get_snp_positions_from_map(options={})
520
+ arr = []
521
+ opts = @opts[:variant_call].merge(options)
522
+ end
523
+
524
+ #this does not filter snps, other than to check they are in the right region and are allowed substitutions.. no qual control, assumed to be done prior
525
+ #text file is of format chr\tpos\tref\talt\tfreq\n
526
+ def get_snp_positions_from_text(options={})
527
+ arr = []
528
+ opts = @opts[:variant_call].merge(options)
529
+ @file.each do |line|
530
+ chr,pos,ref,alt,freq = line.chomp.split("\t")
531
+ pos = pos.to_i
532
+ freq = freq.to_f
533
+ next unless chr == @opts[:chromosome] and pos >= @opts[:start] and pos <= @opts[:stop] and is_allowed_substitution?(ref,alt,opts)
534
+ arr << [pos, freq]
535
+ end
536
+ @snp_positions = arr
537
+ end
538
+
539
+ private
540
+ def get_snp_positions_from_pileup(options={})
541
+ arr = []
542
+ opts = @opts[:variant_call].merge(options)
543
+ @file.each do |line|
544
+ pileup = Bio::DB::Pileup.new(line)
545
+ if pileup.ref_name != @opts[:chromosome] or pileup.pos < @opts[:start] or pileup.pos > @opts[:stop]
546
+ next
547
+ end
548
+ #old fashioned 10 col pileup format has extra fields we can use if needed
549
+ if pileup.is_snp?(opts) and not pileup.consensus_quality.nil? and not pileup.snp_quality.nil?
550
+ arr << [pileup.pos, pileup.discordant_chastity] if pileup.consensus_quality > opts[:min_consensus_quality] and pileup.snp_quality > opts[:min_snp_quality] and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
551
+ end
552
+ end
553
+ @snp_positions = arr
554
+ end
555
+
467
556
  private
468
557
  #Gets the insert size for each alignment in the BAM positions from a BAM file according to quality criteria passed by Bio::Util::Gngm#get_insert_size_frequency.
469
558
  def get_insert_size_frequency_from_bam(opts={})
@@ -815,7 +904,7 @@ class Gngm
815
904
  @peak_indices = nil #needs resetting as we are working with new cluster
816
905
  @peak_y_values = nil #needs resetting as we are working with new cluster
817
906
  self.calculate_densities(options[:adjust])
818
- @clusters = Array.new (@densities.length) {|x| 1 + x}
907
+ @clusters = Array.new(@densities.length) {|x| 1 + x}
819
908
  ##now set the cluster colours..
820
909
  colours = %w[#A6CEE3 #1F78B4 #B2DF8A #33A02C #FB9A99 #E31A1C #FDBF6F #FF7F00 #CAB2D6 #6A3D9A #FFFF99 #B15928]
821
910
  ci = 0
@@ -916,7 +1005,7 @@ class Gngm
916
1005
  r.quit
917
1006
  end
918
1007
 
919
- private
1008
+ #private
920
1009
  #Calculates the position of peaks in the signal curve
921
1010
  def get_peaks(opts=@opts[:peaks])
922
1011
  opts[:background] = opts[:background].to_s.upcase
@@ -1023,7 +1112,29 @@ class Gngm
1023
1112
  return r
1024
1113
  end
1025
1114
 
1115
+ private
1116
+ #returns an array of arrays of known variants
1117
+ #file:
1118
+ #chr1 500 A G
1119
+ #chr2 1000 ATG TTA
1120
+ #chr3 1500 . TTGGA
1121
+ # returns [["chr1", "500", "A", "G"], ["chr2", "1000", "ATG", "TTA"], ["chr3", "1500", ".", "TTGGA"]]
1122
# Reads +file+ and returns one array per line, made by chomping the line and
# splitting it on tabs. All fields are returned as strings.
def parse_known_variants(file)
  # File.readlines opens and closes the file itself, so no handle is left open
  File.readlines(file).map { |line| line.chomp.split("\t") }
end
1026
1125
 
1126
+ public
1127
+ #Deletes everything from self.snp_positions not mentioned by position in self.known_variants. Directly modifies self.snp_positions
1128
# Removes from the SNP list every position that is not listed in the known
# variants, modifying the SNP array in place and returning it.
#
# file -optional path to a tab-delimited known-variant file (chr, pos, ...);
#       only read when @known_variants has not been set yet.
#
# Raises RuntimeError when neither @known_variants nor +file+ is available.
#
# NOTE(review): known variants are matched on position, and on sequence id
# when @opts[:chromosome] is set - confirm this is the intended matching rule.
def keep_known_variants(file=nil)
  raise "file of known variants not provided and @known_variants is nil" if @known_variants.nil? and file.nil?
  @known_variants = parse_known_variants(file) if @known_variants.nil? and file

  chromosome = @opts[:chromosome]
  known_positions = {}
  @known_variants.each do |chr, pos, *_rest|
    next if chromosome && chr != chromosome
    # positions arrive as strings from the file, SNP positions are integers
    known_positions[pos.to_i] = true
  end

  # reuse positions already computed, otherwise compute them first
  positions = @snp_positions || snp_positions
  positions.keep_if { |pos, _value| known_positions.key?(pos) }
end
1134
+
1027
1135
  end
1028
1136
  end
1029
1137
  end
1138
+
1139
+
1140
+
@@ -0,0 +1,39 @@
1
+ require 'bio'
2
+
3
+
4
module Bio
  class Util
    # Parses the GFF3 file at +fn+ and sorts its records by feature type:
    # - 'gene' records contribute their id
    # - 'mRNA' records are grouped under their single 'Parent' attribute
    # - 'exon' records are grouped under each of their 'Parent' attributes
    # - 'transposable_element' records are not handled yet
    #
    # NOTE(review): the collections built here are not returned. The method
    # still returns the result of iterating the records, as before. Decide
    # what callers need before changing the return value.
    def self.read_gff3(fn)
      gene_ids = []
      mrnas_by_parent = Hash.new { |hash, parent| hash[parent] = [] }
      exons_by_parent = Hash.new { |hash, parent| hash[parent] = [] }

      gff3 = Bio::GFF::GFF3.new(File.read(fn))
      gff3.records.each do |record|
        case record.feature_type
        when 'gene'
          gene_ids << record.id
        when 'mRNA'
          mrnas_by_parent[record.get_attribute('Parent')] << record
        when 'transposable_element'
          #--- not yet implemented
        when 'exon'
          record.get_attributes('Parent').each do |parent|
            exons_by_parent[parent] << record
          end
        end
      end
    end
  end
end
@@ -14,36 +14,48 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
14
  $LOAD_PATH.unshift(File.dirname(__FILE__))
15
15
  require 'bio-gngm'
16
16
  require 'bio'
17
+ require 'pp'
17
18
  length = 0
18
19
  chr_name = ""
19
- file = Bio::FastaFormat.open("/Users/macleand/Desktop/deletion_simulation/NC_000962.fna")
20
- file.each do |entry|
21
- length = entry.length
22
- chr_name = entry.entry_id
23
- end
20
+ #file = Bio::FastaFormat.open("/Users/macleand/Desktop/deletion_simulation/NC_000962.fna")
21
+ #file.each do |entry|
22
+ # length = entry.length
23
+ # chr_name = entry.entry_id
24
+ #end
24
25
 
25
26
 
26
- region = "gi|57116681|ref|NC_000962.2|:1-#{length}"
27
+ region = "1:2000000-3000000"
27
28
 
28
29
  puts "analyzing - #{region}"
29
30
 
30
- g = Bio::Util::Gngm.new(:file => "/Users/macleand/Desktop/deletion_simulation/aln.sort.bam",
31
+ g = Bio::Util::Gngm.new(:file => "/Users/macleand/Desktop/insertion_finding/athal/aln.sort.bam",
31
32
  :format => :bam,
32
- :fasta => "/Users/macleand/Desktop/deletion_simulation/NC_000962.fna",
33
- :samtools => {:q => 20, :Q => 50, :r => region
33
+ :fasta => "/Users/macleand/Desktop/insertion_finding/athal/chr1.fa",
34
+ :samtools => {:q => 50, :Q => 13, :r => region
34
35
  }
35
36
  )
36
37
 
37
- g.get_unmapped_mate_frequency(:ref_window_size => 76, :ref_window_slide => 76)
38
- g.collect_threads(:start => 0.0, :stop => 0.5, :slide => 0.1, :size => 0.1)
39
- puts g.threads
38
+ g.get_unmapped_mate_frequency(:ref_window_size => 152, :ref_window_slide => 10)
39
+
40
+
41
+
42
+ g.collect_threads(:start => 0.0, :stop => 1.0, :slide => 0.1, :size => 0.1)
43
+ pp g.threads
44
+ g.threads.delete_if {|x| x.last.length <= 3 }
40
45
 
41
46
  begin
42
- g.calculate_clusters(:pseudo => true)
43
- filename = "sim_2_#{region}_all_threads.png"
44
- g.draw_threads(filename, :draw_legend => "sim_#{region}_legend.png")
47
+ #g.calculate_clusters(:pseudo => true)
48
+ g.calculate_clusters(:k => 4, :adjust => 0.5, :control_chd => 0.0, :expected_chd => 0.3, :pseudo => false)
49
+ filename = "deletion_real_data#{region}_all_threads.png"
50
+ g.draw_threads(filename, :draw_legend => "deletion_real_data#{region}_legend.png")
45
51
  ##no bands or signal to draw without clustering...
46
- filename = "sim_#{region}_hits.png"
52
+ filename = "deletion_real_data#{region}_bands.png"
53
+ g.draw_bands(filename)
54
+ filename = "deletion_real_data#{region}_signal.png"
55
+ g.draw_signal(filename)
56
+
57
+
58
+ filename = "deletion_real_data_#{region}_hits.png"
47
59
  g.draw_hit_count(filename)
48
60
  rescue Exception => e
49
61
  puts e.message, e.backtrace
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-gngm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-19 00:00:00.000000000 Z
12
+ date: 2012-11-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bio
16
- requirement: &70226478302620 !ruby/object:Gem::Requirement
16
+ requirement: &70148237802860 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 1.4.2
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70226478302620
24
+ version_requirements: *70148237802860
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: bio-samtools
27
- requirement: &70226478299960 !ruby/object:Gem::Requirement
27
+ requirement: &70148237802000 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.5.0
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70226478299960
35
+ version_requirements: *70148237802000
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: rinruby
38
- requirement: &70226478297240 !ruby/object:Gem::Requirement
38
+ requirement: &70148237801440 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 2.0.2
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70226478297240
46
+ version_requirements: *70148237801440
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: shoulda
49
- requirement: &70226478383720 !ruby/object:Gem::Requirement
49
+ requirement: &70148237800620 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *70226478383720
57
+ version_requirements: *70148237800620
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: bundler
60
- requirement: &70226478382700 !ruby/object:Gem::Requirement
60
+ requirement: &70148237793360 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: 1.0.0
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *70226478382700
68
+ version_requirements: *70148237793360
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: jeweler
71
- requirement: &70226478381080 !ruby/object:Gem::Requirement
71
+ requirement: &70148237791880 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *70226478381080
79
+ version_requirements: *70148237791880
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: rcov
82
- requirement: &70226478380320 !ruby/object:Gem::Requirement
82
+ requirement: &70148237789480 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :development
89
89
  prerelease: false
90
- version_requirements: *70226478380320
90
+ version_requirements: *70148237789480
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: bio
93
- requirement: &70226478379680 !ruby/object:Gem::Requirement
93
+ requirement: &70148237788820 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: 1.4.2
99
99
  type: :development
100
100
  prerelease: false
101
- version_requirements: *70226478379680
101
+ version_requirements: *70148237788820
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: bio-samtools
104
- requirement: &70226478379180 !ruby/object:Gem::Requirement
104
+ requirement: &70148237788260 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: 0.5.0
110
110
  type: :development
111
111
  prerelease: false
112
- version_requirements: *70226478379180
112
+ version_requirements: *70148237788260
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: rinruby
115
- requirement: &70226478378620 !ruby/object:Gem::Requirement
115
+ requirement: &70148237787540 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: 2.0.2
121
121
  type: :development
122
122
  prerelease: false
123
- version_requirements: *70226478378620
123
+ version_requirements: *70148237787540
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: rdoc
126
- requirement: &70226478378060 !ruby/object:Gem::Requirement
126
+ requirement: &70148237787000 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
131
131
  version: '0'
132
132
  type: :development
133
133
  prerelease: false
134
- version_requirements: *70226478378060
134
+ version_requirements: *70148237787000
135
135
  description: Identify causative mutations in a model genome from NGS reads using the
136
136
  NGM method.
137
137
  email: maclean.daniel@gmail.com
@@ -213,6 +213,7 @@ files:
213
213
  - examples/use_indels.rb
214
214
  - lib/bio-gngm.rb
215
215
  - lib/bio/util/bio-gngm.rb
216
+ - lib/bio/util/mutation_effects.rb
216
217
  - scripts/get_subseq.rb
217
218
  - scripts/make_histograms_laerfyve.rb
218
219
  - scripts/make_histograms_laerfyve_stitched.rb
@@ -260,7 +261,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
260
261
  version: '0'
261
262
  segments:
262
263
  - 0
263
- hash: 3948590901607660961
264
+ hash: 729394422036910323
264
265
  required_rubygems_version: !ruby/object:Gem::Requirement
265
266
  none: false
266
267
  requirements: