bio-gngm 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "bio-gngm"
8
- s.version = "0.1.0"
8
+ s.version = "0.2.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Dan MacLean"]
12
- s.date = "2012-03-19"
12
+ s.date = "2012-11-15"
13
13
  s.description = "Identify causative mutations in a model genome from NGS reads using the NGM method."
14
14
  s.email = "maclean.daniel@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -89,6 +89,7 @@ Gem::Specification.new do |s|
89
89
  "examples/use_indels.rb",
90
90
  "lib/bio-gngm.rb",
91
91
  "lib/bio/util/bio-gngm.rb",
92
+ "lib/bio/util/mutation_effects.rb",
92
93
  "scripts/get_subseq.rb",
93
94
  "scripts/make_histograms_laerfyve.rb",
94
95
  "scripts/make_histograms_laerfyve_stitched.rb",
@@ -14,27 +14,53 @@ $LOAD_PATH.unshift(File.dirname(__FILE__))
14
14
  require 'bio-gngm'
15
15
  require 'bio-samtools'
16
16
 
17
+ #def make_snp_array(id, file)
18
+ # a = []
19
+ # File.open(file, "r").each do |line|
20
+ # arr = line.split(/\t/)
21
+ # next if arr[0] !~ /#{id}/
22
+ # a << [a]
23
+ # end
24
+ #end
17
25
 
26
+ sequences = Bio::DB::FastaLengthDB.new(:file => ARGV[0])
27
+ $stderr.puts "Loaded sequences..."
18
28
 
19
29
 
20
- #open the BAM file and specify the region of interest
21
- g = Bio::Util::Gngm.new(:file => "aln.bam",
22
- :format => :bam,
23
- :fasta => "reference.fasta",
24
- :samtools => {:r => "Chr1:1-6000000",
30
+ sequences.each do |id, length|
31
+ g = Bio::Util::Gngm.new(:file => ARGV[1],
32
+ #:format => :pileup,
33
+ :format => :bam,
34
+ :fasta => ARGV[0],
35
+ :chromosome => id,
36
+ :start => 1,
37
+ :stop => length,
38
+ :samtools => {
25
39
  :q => 20,
26
- :Q => 50
27
- }
28
- )
29
- #retrieve the SNPs from the BAM file
30
- g.snp_positions
31
-
32
- #plot a frequency histogram for different bin sizes
33
- [100000, 250000, 500000].each do |bin_width|
34
- file_name = "#{bin_width}.png"
40
+ :Q => 20
41
+ },
42
+ :variant_call => {
43
+ :indels => false,
44
+ :deletions_only => false,
45
+ :insertions_only => false,
46
+ :min_depth => 6,
47
+ :max_depth => 250,
48
+ :mapping_quality => 20.0,
49
+ :min_non_ref_count => 2,
50
+ :ignore_reference_n => true,
51
+ :min_snp_quality => 20,
52
+ :min_consensus_quality => 20
53
+ }
54
+ )
55
+ $stderr.puts "getting #{id}.."
56
+
57
+ g.snp_positions
58
+ puts "Found #{g.snp_positions.length} SNPs"
59
+ [100000, 250000, 500000].each do |bin_width|
60
+ $stderr.puts "working on #{bin_width} windows"
61
+ file_name = "test_#{id}_#{bin_width}.png"
35
62
  g.frequency_histogram("#{file_name}",bin_width)
36
- end
37
-
38
- #close the BAM file
63
+ end
39
64
  g.close
65
+ end
40
66
 
@@ -10,3 +10,4 @@
10
10
  # was ever to get merged into the main bioruby tree.
11
11
 
12
12
  require 'bio/util/bio-gngm'
13
+ require 'bio/util/mutation_effects'
@@ -5,13 +5,36 @@
5
5
  # Created by Dan MacLean (TSL) on 2011-12-07.
6
6
  # Copyright (c) . All rights reserved.
7
7
  ###################################################
8
-
8
+ require 'rubygems'
9
9
  require 'rinruby'
10
10
  require 'bio-samtools'
11
11
  require 'bio/db/pileup'
12
12
  require 'bio/db/vcf'
13
13
  require 'pp'
14
14
 
15
+ =begin
16
+ Simple class representing a file of FASTA format sequences and the length of each one
17
+ =end
18
+ class Bio::DB::FastaLengthDB
19
+ require 'bio'
20
+ def initialize(args)
21
+ @file = args[:file]
22
+ @seqs = {}
23
+ file = Bio::FastaFormat.open(@file)
24
+ file.each do |entry|
25
+ @seqs[entry.entry_id] = entry.length
26
+ end
27
+
28
+ def each
29
+ @seqs.keys.sort.each do |k|
30
+ yield k, @seqs[k]
31
+ end
32
+ end
33
+
34
+ end
35
+ end
36
+
37
+
15
38
  =begin
16
39
  Extends the methods of the Bio::DB::Pileup class in bio-samtools. A pileup object represents the SAMtools pileup format at
17
40
  http://samtools.sourceforge.net/pileup.shtml. These extension methods are used by the Bio::Util::Gngm object internally and
@@ -53,11 +76,10 @@ class Bio::DB::Pileup
53
76
  # pileup.is_snp?(:min_depth => 5, :min_non_ref_count => 2)
54
77
  # pileup.is_snp?(:min_depth => 5, :min_non_ref_count => 1, :ignore_reference_n => true)
55
78
  def is_snp?(opts)
56
- if opts[:ignore_reference_n] and self.ref_base == "N" or self.ref_base == "n"
57
- return false
58
- elsif self.coverage >= opts[:min_depth] and self.non_ref_count >= opts[:min_non_ref_count]
59
- return true
60
- end
79
+ return false if self.ref_base == '*'
80
+ #return false unless is_ct
81
+ return false if opts[:ignore_reference_n] and self.ref_base == "N" or self.ref_base == "n"
82
+ return true if self.coverage >= opts[:min_depth] and self.non_ref_count >= opts[:min_non_ref_count]
61
83
  false
62
84
  end
63
85
  end
@@ -131,9 +153,9 @@ class Bio::DB::Vcf
131
153
  end
132
154
 
133
155
  #Returns true if only one variant allele is recorded. Loci with more than one allele are too complicated for now, so are discarded...
134
- def has_just_one_variant?
135
- self.alternatives.length == 1 and self.variant?
136
- end
156
+ #def has_just_one_variant?
157
+ # self.alternatives.length == 1 and self.variant?
158
+ #end
137
159
 
138
160
  #Returns true if the position passes criteria
139
161
  #
@@ -143,38 +165,27 @@ class Bio::DB::Vcf
143
165
  #- :mapping_quality => 10
144
166
  #
145
167
  #Example
146
- # vcf.pass_quality?(:min_depth => 5, :min_non_ref_count => 2, :mapping_quality => 25)
168
+ # vcf.pass_quality?(:min_depth => 5, :min_non_ref_count => 2, :mapping_quality => 25, :min_snp_quality => 20)
147
169
  def pass_quality?(options)
148
- (self.used_depth >= options[:min_depth] and self.mq >= options[:mapping_quality] and self.non_ref_allele_count >= options[:min_non_ref_count])
170
+ (self.used_depth >= options[:min_depth] and self.mq >= options[:mapping_quality] and self.non_ref_allele_count >= options[:min_non_ref_count] and self.qual >= options[:min_snp_quality])
149
171
  end
150
172
 
151
- #Returns true if the length of the alt column is less than that of the ref column in the Vcf and if Vcf.pass_quality? is true.
152
- #Looks only at the positions that are predicted simple deletions, any positions where the alt alleles includes more than one deletion or a SNP or an insertion also is ignored.
153
- def is_deletion?(options)
154
- case
155
- when (not self.has_just_one_variant?) then false
156
- when ( self.alt.length < self.ref.length and self.pass_quality?(options) ) then true
157
- else false
158
- end
159
- rescue ## if something goes wrong, skip the postion,
173
+ #returns true if ref col has same length as all alternatives and position variant passes quality
174
+ def is_mnp?(options)
175
+ return true if self.alternatives.all? {|x| x.length == self.ref.length} and self.pass_quality?(options)
160
176
  false
161
177
  end
162
178
 
163
- #Returns true if the length of the alt column is greater than that of the ref column in the Vcf and if Vcf.pass_quality? is true.
164
- #Looks only at the positions that are predicted simple deletions, any positions where the alt alleles includes more than one deletion or a SNP or an insertion also is ignored.
165
- def is_insertion?(options)
166
- case
167
- when (not self.has_just_one_variant?) then false
168
- when ( self.alt.length > self.ref.length and self.pass_quality?(options) ) then true
169
- else false
170
- end
171
- rescue ## if something goes wrong, skip the postion,
172
- false
179
+ ##returns true if ref col has length of 1 and is_mnp?
180
+ def is_snp?(options)
181
+ return true if self.is_mnp?(options) and self.ref.length == 1
182
+ false
173
183
  end
174
184
 
175
- #Returns true if either Vcf.is_insertion? or Vcf.is_deletion? is true
176
- def is_indel?(opts)
177
- self.is_insertion?(opts) || self.is_deletion?(opts)
185
+ #Returns true if ref col is different in length from any of the entries in alt column
186
+ def is_indel?(options)
187
+ return true if self.variant? and self.alternatives.any? {|x| x.length != self.ref.length} and self.pass_quality?(options)
188
+ false
178
189
  end
179
190
 
180
191
 
@@ -321,19 +332,33 @@ class Gngm
321
332
  #
322
333
  # g = Bio::Util::Gngm.new(:file => "aln.sort.bam",
323
334
  # :format => :bam,
324
- # :samtools => {:q => 20, :Q => 50, :r => "Chr1:1-100000"},
335
+ # :samtools => {:q => 20, :Q => 50},
325
336
  # :fasta => "reference.fa"
326
- #
337
+ # :start => 100,
338
+ # :stop => 200
327
339
  # )
328
340
  #
329
341
  #Required parameters and defaults:
330
342
  #- <tt>:file => nil</tt> -the path to the bam file containing the alignments, a .bai index must be present
331
- #- <tt>:format => :bam</tt> -always bam
343
+ #- <tt>:format => :bam</tt> -one of :bam, :text or :pileup (:pileup is expected to be the 10 column format from samtools pileup -vcf; :text is a tab-delimited file of chr, pos, ref, alt, freq)
344
+ #- <tt>:chromosome => nil</tt> -id of the sequence to look at
345
+ #- <tt>:start => nil</tt> -start position on that sequence
346
+ #- <tt>:stop => nil</tt> -stop position on that sequence
332
347
  #- <tt>:fasta => nil</tt> -the path to the FASTA formatted reference sequence
333
- #- <tt>:samtools => {:q => 20, :Q => 50, :r => "Chr1:100-1100"}</tt> -options for samtools, see bio-samtools documentation for further details. The :r option is required to specify the region of interest
348
+ #- <tt>:samtools => {:q => 20, :Q => 50}</tt> -options for samtools, see bio-samtools documentation for further details. The region option :r is set automatically from :chromosome, :start and :stop, so it does not need to be given here
334
349
  #Optional parameters and defaults:
350
+
335
351
  #Most of these are parameters for specific methods and can be over-ridden when particular methods are called
336
- #- <tt>:variant_call => {:indels => false, :deletions_only => false, :insertions_only => false, :min_depth => 2, :max_depth => 10000000, :mapping_quality => 10.0, :min_non_ref_count => 2, :ignore_reference_n => true}</tt> -for SNP/Indel calling only one of <tt>:indels, :deletions_only, :insertions_only</tt> should be used.
352
+ #- <tt>:variant_call => {:indels => false,
353
+ # :min_depth => 2,
354
+ # :max_depth => 10000000,
355
+ # :min_snp_quality => 20,
356
+ # :mapping_quality => 10.0,
357
+ # :min_non_ref_count => 2,
358
+ # :ignore_reference_n => true,
359
+ # :min_consensus_quality => 20,
360
+ # :min_snp_quality => 20 }</tt>.
361
+ # For Pileup files from old samtools pileup -vcf <tt>:min_consensus_quality</tt> can be applied
337
362
  #- <tt>:threads => {:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 }</tt> -options for thread windows
338
363
  #- <tt>:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150}</tt> -options for insert size calculations
339
364
  #- <tt>:histo_bin_width => 250000</tt> -bin width for histograms of SNP frequency
@@ -353,20 +378,30 @@ class Gngm
353
378
  @density_max_y = nil #the maximum y value needed to plot the entire set density plots of threads and maintain a consistent scale for plots
354
379
  @colours = %w[#A6CEE3 #1F78B4 #B2DF8A #33A02C #FB9A99 #E31A1C #FDBF6F #FF7F00 #CAB2D6 #6A3D9A #FFFF99 #B15928]
355
380
  @thread_colours = {}
381
+ @known_variants = nil #a list of variants to keep track of
356
382
  @opts = {
357
383
  :file => nil,
358
384
  :format => :bam,
359
385
  :fasta => nil,
360
386
  :samtools => {:q => 20, :Q => 50},
361
- ##indels = call any and only indels.. :deletions_only :insertions_only = only one tyoe
387
+ :indels => false,
388
+ ##:indels = call indels too, causes return of vcf
389
+ :insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150},
390
+ :variant_call => { :min_depth => 2,
391
+ :max_depth => 10000000,
392
+ :mapping_quality => 10.0,
393
+ :min_non_ref_count => 2,
394
+ :ignore_reference_n => true,
395
+ :shore_map => false,
396
+ :snp_file => :false,
397
+ :min_consensus_quality => 20,
398
+ :min_snp_quality => 20},
362
399
  ## some options are designed to be equivalent to the vcfutils.pl options from bcftools when using vcf
363
400
  ##:min_depth (-d)
364
401
  ##:max_depth (-D)
365
402
  ##:mapping_quality (-Q) minimum RMS mapping quality for SNPs (mq in info fields)
366
403
  ##:min_non_ref_count (-a) minimum num of alt bases ... the sum of the last two numbers in DP4 in info fields
367
404
  ##doesnt do anything with window filtering or pv values...
368
- :insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150},
369
- :variant_call => {:indels => false, :deletions_only => false, :insertions_only => false, :min_depth => 2, :max_depth => 10000000, :mapping_quality => 10.0, :min_non_ref_count => 2, :ignore_reference_n => true},
370
405
  :histo_bin_width => 250000,
371
406
  :graphics => {:width => 1000, :height => 500, :draw_legend => false, :add_boxes => nil},
372
407
  :adjust => 1,
@@ -376,6 +411,7 @@ class Gngm
376
411
  :peaks => {:sigma => 3.0, :threshold => 10.0, :background => false, :iterations => 13, :markov => false, :window => 3, :range => 10000} ##range is the width of the box to draw on the peak plot
377
412
  }
378
413
  @opts.merge!(options)
414
+ @opts[:samtools][:r] = "#{options[:chromosome]}:#{options[:start]}-#{options[:stop]}"
379
415
  open_file
380
416
  end
381
417
 
@@ -384,6 +420,7 @@ class Gngm
384
420
  def open_file
385
421
  case @opts[:format]
386
422
  when :bam then open_bam
423
+ when :pileup, :text then open_text
387
424
  end
388
425
  end
389
426
 
@@ -394,6 +431,10 @@ class Gngm
394
431
  @file.open
395
432
  end
396
433
 
434
+ def open_text
435
+ @file = File.open(@opts[:file], "r")
436
+ end
437
+
397
438
  public
398
439
  #for BAM files calls Bio::DB::Sam#close to close the connections to input files safely
399
440
  def close
@@ -415,16 +456,31 @@ class Gngm
415
456
  #- <tt>:mapping_quality => 10.0</tt> -minimum mapping quality required for a read to be used in depth calculation
416
457
  #- <tt>:min_non_ref_count => 2</tt> -minimum number of reads not matching the reference for SNP to be called
417
458
  #- <tt>:ignore_reference_n => true</tt> -ignore positions where the reference is N or n
418
- #
459
+ #- <tt>:shore_map => false</tt> -use SHOREmap INTERVAL calculations as described in Andersen et al., 2009, Nature Methods 6(8). Requires a file of known SNPs between the mapping line (e.g. Ler in Andersen et al.) and a reference line (e.g. Col in Andersen et al.).
460
+ #- <tt>:snp_file => :false</tt> -file of known SNPs in the format "reference sequence id \t position \t mapping line nucleotide identity \t reference line nucleotide identity". Only used when +:shore_map+ is set to true. Only SNPs listed in this file will be considered.
419
461
  #Set <tt>:indels => true</tt> to call INDELs; if it is +false+, SNPs are called.
420
462
  #
421
- #Sets the instance variable @snp_positions. Only gets positions the first time it is called, in subsequent calls pre-computed positions and statistics are returned, so changing parameters has no effect.
463
+ #calculates or returns the value of the instance variable @snp_positions. Only gets positions the first time it is called, in subsequent calls pre-computed positions and statistics are returned, so changing parameters has no effect.
422
464
  def snp_positions(optsa={})
423
465
  opts = @opts[:variant_call].merge(optsa)
424
466
  return @snp_positions if @snp_positions
425
- case
426
- when @file.instance_of?(Bio::DB::Sam) then get_snp_positions_from_bam(opts)
467
+ case @opts[:format]
468
+ when :bam then get_snp_positions_from_bam(opts)
469
+ when :text then get_snp_positions_from_text(opts)
470
+ when :pileup then get_snp_positions_from_pileup(opts)
471
+ end
472
+ end
473
+
474
+ ##allows the user to assign SNP positions
475
+ def snp_positions=(arr)
476
+ @snp_positions = arr
477
+ end
478
+
479
+ def is_allowed_substitution?(ref,alt,opts)
480
+ if opts[:substitutions].instance_of?(Array)
481
+ return false unless opts[:substitutions].include?("#{ref}:#{alt}")
427
482
  end
483
+ true
428
484
  end
429
485
 
430
486
  private
@@ -432,18 +488,14 @@ class Gngm
432
488
  #Sets @snp_positions
433
489
  def get_snp_positions_from_bam(options={})
434
490
  opts = @opts[:variant_call].merge(options)
435
- if opts[:indels] and (opts[:deletions_only] or opts[:insertions_only])
436
- raise "Cant have indels and deletions only or insertions only, need to specify ':indels => true' to get both"
437
- end
438
491
  arr = []
439
- ##when we are calling mpileup_plus we need to add :g to the samtools options
440
- if opts[:indels] or opts[:deletions_only] or opts[:insertions_only]
441
- @opts[:samtools][:g] = true
442
- end
492
+ ##when we are calling mpileup_plus we need to add :g to the samtools options
493
+ @opts[:samtools][:g] = true if opts[:indels]
494
+
443
495
 
444
496
  if not @opts[:samtools][:g]
445
497
  @file.mpileup(@opts[:samtools]) do |pileup|
446
- arr << [pileup.pos, pileup.discordant_chastity] if pileup.is_snp?(opts)
498
+ arr << [pileup.pos, pileup.discordant_chastity] if pileup.is_snp?(opts) and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
447
499
  end
448
500
  else
449
501
  @file.mpileup_plus(@opts[:samtools]) do |vcf|
@@ -451,19 +503,56 @@ class Gngm
451
503
  next if (opts[:ignore_reference_n] and vcf.ref =~ /N/i)
452
504
  ##indel use returns the vcf allele_frequency, not the ChDs (because calculating it is a mess... )
453
505
  if opts[:indels]
454
- arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts)
455
- elsif opts[:deletions_only]
456
- arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_deletion?(opts)
457
- elsif opts[:insertions_only]
458
- arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_insertion?(opts)
506
+ arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_indel?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
507
+ else
508
+ arr << [vcf.pos, vcf.non_ref_allele_freq] if vcf.is_snp?(opts) and is_allowed_substitution?(vcf.ref, vcf.alt,opts)
459
509
  end
460
510
  end
461
511
  end
462
512
 
463
513
  @snp_positions = arr
514
+
464
515
  arr
465
516
  end
466
517
 
518
+ private
519
+ def get_snp_positions_from_map(options={})
520
+ arr = []
521
+ opts = @opts[:variant_call].merge(options)
522
+ end
523
+
524
+ #This does not filter SNPs other than checking that they are in the right region and are allowed substitutions. No quality control is applied; it is assumed to have been done beforehand.
525
+ #text file is of format chr\tpos\tref\talt\tfreq\n
526
+ def get_snp_positions_from_text(options={})
527
+ arr = []
528
+ opts = @opts[:variant_call].merge(options)
529
+ @file.each do |line|
530
+ chr,pos,ref,alt,freq = line.chomp.split("\t")
531
+ pos = pos.to_i
532
+ freq = freq.to_f
533
+ next unless chr == @opts[:chromosome] and pos >= @opts[:start] and pos <= @opts[:stop] and is_allowed_substitution?(ref,alt,opts)
534
+ arr << [pos, freq]
535
+ end
536
+ @snp_positions = arr
537
+ end
538
+
539
+ private
540
+ def get_snp_positions_from_pileup(options={})
541
+ arr = []
542
+ opts = @opts[:variant_call].merge(options)
543
+ @file.each do |line|
544
+ pileup = Bio::DB::Pileup.new(line)
545
+ if pileup.ref_name != @opts[:chromosome] or pileup.pos < @opts[:start] or pileup.pos > @opts[:stop]
546
+ next
547
+ end
548
+ #old fashioned 10 col pileup format has extra fields we can use if needed
549
+ if pileup.is_snp?(opts) and not pileup.consensus_quality.nil? and not pileup.snp_quality.nil?
550
+ arr << [pileup.pos, pileup.discordant_chastity] if pileup.consensus_quality > opts[:min_consensus_quality] and pileup.snp_quality > opts[:min_snp_quality] and is_allowed_substitution?(pileup.ref_base, pileup.consensus,opts)
551
+ end
552
+ end
553
+ @snp_positions = arr
554
+ end
555
+
467
556
  private
468
557
  #Gets the insert size for each alignment in the BAM positions from a BAM file according to quality criteria passed by Bio::Util::Gngm#get_insert_size_frequency.
469
558
  def get_insert_size_frequency_from_bam(opts={})
@@ -815,7 +904,7 @@ class Gngm
815
904
  @peak_indices = nil #needs resetting as we are working with new cluster
816
905
  @peak_y_values = nil #needs resetting as we are working with new cluster
817
906
  self.calculate_densities(options[:adjust])
818
- @clusters = Array.new (@densities.length) {|x| 1 + x}
907
+ @clusters = Array.new(@densities.length) {|x| 1 + x}
819
908
  ##now set the cluster colours..
820
909
  colours = %w[#A6CEE3 #1F78B4 #B2DF8A #33A02C #FB9A99 #E31A1C #FDBF6F #FF7F00 #CAB2D6 #6A3D9A #FFFF99 #B15928]
821
910
  ci = 0
@@ -916,7 +1005,7 @@ class Gngm
916
1005
  r.quit
917
1006
  end
918
1007
 
919
- private
1008
+ #private
920
1009
  #Calculates the position of peaks in the signal curve
921
1010
  def get_peaks(opts=@opts[:peaks])
922
1011
  opts[:background] = opts[:background].to_s.upcase
@@ -1023,7 +1112,29 @@ class Gngm
1023
1112
  return r
1024
1113
  end
1025
1114
 
1115
+ private
1116
+ #returns an array of arrays of known variants
1117
+ #file:
1118
+ #chr1 500 A G
1119
+ #chr2 1000 ATG TTA
1120
+ #chr3 1500 . TTGGA
1121
+ # returns [["chr1", "500", "A", "G"], ["chr2", "1000", "ATG", "TTA"], ["chr3", "1500", ".", "TTGGA"]]
1122
+ def parse_known_variants(file)
1123
+ File.open(file, "r").readlines.collect {|x| x.chomp.split("\t")}
1124
+ end
1026
1125
 
1126
+ public
1127
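+ #Intended to delete everything from self.snp_positions not mentioned by position in self.known_variants, directly modifying self.snp_positions. NOTE: the filtering step is not implemented yet; at present the method only loads the known variants from the file when none are set.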
1128
+ def keep_known_variants(file=nil)
1129
+ raise "file of known variants not provided and @known_variants is nil" if @known_variants.nil? and file.nil?
1130
+ @known_variants = parse_known_variants(file) if @known_variants.nil? and file
1131
+ @snp_positions.each do |snp|
1132
+ end
1133
+ end
1134
+
1027
1135
  end
1028
1136
  end
1029
1137
  end
1138
+
1139
+
1140
+
@@ -0,0 +1,39 @@
1
+ require 'bio'
2
+
3
+
4
+ module Bio
5
+ class Util
6
+ def self.read_gff3(fn)
7
+ genearray=Array.new
8
+ mRNAhash=Hash.new
9
+ exonhash=Hash.new
10
+ tehash=Hash.new
11
+ lastid = ''
12
+ lastrecord = nil
13
+ gff3 = Bio::GFF::GFF3.new(File.read(fn))
14
+ gff3.records.each do | record |
15
+ feature_type = record.feature_type
16
+ if(feature_type == 'gene')
17
+ genearray << record.id
18
+ elsif(feature_type == 'mRNA')
19
+ parent = record.get_attribute('Parent')
20
+ if mRNAhash[parent] == nil
21
+ mRNAhash[parent] = [record]
22
+ else
23
+ mRNAhash[parent] << record
24
+ end
25
+ elsif(feature_type == 'transposable_element')
26
+ #--- not yet implemented
27
+ elsif(feature_type == 'exon')
28
+ parents = record.get_attributes('Parent')
29
+ parents.each do |parent|
30
+ if exonhash[parent] == nil
31
+ exonhash[parent] = Array.new
32
+ end
33
+ exonhash[parent] << record
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
@@ -14,36 +14,48 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
14
  $LOAD_PATH.unshift(File.dirname(__FILE__))
15
15
  require 'bio-gngm'
16
16
  require 'bio'
17
+ require 'pp'
17
18
  length = 0
18
19
  chr_name = ""
19
- file = Bio::FastaFormat.open("/Users/macleand/Desktop/deletion_simulation/NC_000962.fna")
20
- file.each do |entry|
21
- length = entry.length
22
- chr_name = entry.entry_id
23
- end
20
+ #file = Bio::FastaFormat.open("/Users/macleand/Desktop/deletion_simulation/NC_000962.fna")
21
+ #file.each do |entry|
22
+ # length = entry.length
23
+ # chr_name = entry.entry_id
24
+ #end
24
25
 
25
26
 
26
- region = "gi|57116681|ref|NC_000962.2|:1-#{length}"
27
+ region = "1:2000000-3000000"
27
28
 
28
29
  puts "analyzing - #{region}"
29
30
 
30
- g = Bio::Util::Gngm.new(:file => "/Users/macleand/Desktop/deletion_simulation/aln.sort.bam",
31
+ g = Bio::Util::Gngm.new(:file => "/Users/macleand/Desktop/insertion_finding/athal/aln.sort.bam",
31
32
  :format => :bam,
32
- :fasta => "/Users/macleand/Desktop/deletion_simulation/NC_000962.fna",
33
- :samtools => {:q => 20, :Q => 50, :r => region
33
+ :fasta => "/Users/macleand/Desktop/insertion_finding/athal/chr1.fa",
34
+ :samtools => {:q => 50, :Q => 13, :r => region
34
35
  }
35
36
  )
36
37
 
37
- g.get_unmapped_mate_frequency(:ref_window_size => 76, :ref_window_slide => 76)
38
- g.collect_threads(:start => 0.0, :stop => 0.5, :slide => 0.1, :size => 0.1)
39
- puts g.threads
38
+ g.get_unmapped_mate_frequency(:ref_window_size => 152, :ref_window_slide => 10)
39
+
40
+
41
+
42
+ g.collect_threads(:start => 0.0, :stop => 1.0, :slide => 0.1, :size => 0.1)
43
+ pp g.threads
44
+ g.threads.delete_if {|x| x.last.length <= 3 }
40
45
 
41
46
  begin
42
- g.calculate_clusters(:pseudo => true)
43
- filename = "sim_2_#{region}_all_threads.png"
44
- g.draw_threads(filename, :draw_legend => "sim_#{region}_legend.png")
47
+ #g.calculate_clusters(:pseudo => true)
48
+ g.calculate_clusters(:k => 4, :adjust => 0.5, :control_chd => 0.0, :expected_chd => 0.3, :pseudo => false)
49
+ filename = "deletion_real_data#{region}_all_threads.png"
50
+ g.draw_threads(filename, :draw_legend => "deletion_real_data#{region}_legend.png")
45
51
  ##no bands or signal to draw without clustering...
46
- filename = "sim_#{region}_hits.png"
52
+ filename = "deletion_real_data#{region}_bands.png"
53
+ g.draw_bands(filename)
54
+ filename = "deletion_real_data#{region}_signal.png"
55
+ g.draw_signal(filename)
56
+
57
+
58
+ filename = "deletion_real_data_#{region}_hits.png"
47
59
  g.draw_hit_count(filename)
48
60
  rescue Exception => e
49
61
  puts e.message, e.backtrace
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-gngm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-19 00:00:00.000000000 Z
12
+ date: 2012-11-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bio
16
- requirement: &70226478302620 !ruby/object:Gem::Requirement
16
+ requirement: &70148237802860 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 1.4.2
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70226478302620
24
+ version_requirements: *70148237802860
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: bio-samtools
27
- requirement: &70226478299960 !ruby/object:Gem::Requirement
27
+ requirement: &70148237802000 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.5.0
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70226478299960
35
+ version_requirements: *70148237802000
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: rinruby
38
- requirement: &70226478297240 !ruby/object:Gem::Requirement
38
+ requirement: &70148237801440 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 2.0.2
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70226478297240
46
+ version_requirements: *70148237801440
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: shoulda
49
- requirement: &70226478383720 !ruby/object:Gem::Requirement
49
+ requirement: &70148237800620 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *70226478383720
57
+ version_requirements: *70148237800620
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: bundler
60
- requirement: &70226478382700 !ruby/object:Gem::Requirement
60
+ requirement: &70148237793360 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: 1.0.0
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *70226478382700
68
+ version_requirements: *70148237793360
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: jeweler
71
- requirement: &70226478381080 !ruby/object:Gem::Requirement
71
+ requirement: &70148237791880 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *70226478381080
79
+ version_requirements: *70148237791880
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: rcov
82
- requirement: &70226478380320 !ruby/object:Gem::Requirement
82
+ requirement: &70148237789480 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :development
89
89
  prerelease: false
90
- version_requirements: *70226478380320
90
+ version_requirements: *70148237789480
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: bio
93
- requirement: &70226478379680 !ruby/object:Gem::Requirement
93
+ requirement: &70148237788820 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: 1.4.2
99
99
  type: :development
100
100
  prerelease: false
101
- version_requirements: *70226478379680
101
+ version_requirements: *70148237788820
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: bio-samtools
104
- requirement: &70226478379180 !ruby/object:Gem::Requirement
104
+ requirement: &70148237788260 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: 0.5.0
110
110
  type: :development
111
111
  prerelease: false
112
- version_requirements: *70226478379180
112
+ version_requirements: *70148237788260
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: rinruby
115
- requirement: &70226478378620 !ruby/object:Gem::Requirement
115
+ requirement: &70148237787540 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: 2.0.2
121
121
  type: :development
122
122
  prerelease: false
123
- version_requirements: *70226478378620
123
+ version_requirements: *70148237787540
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: rdoc
126
- requirement: &70226478378060 !ruby/object:Gem::Requirement
126
+ requirement: &70148237787000 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
131
131
  version: '0'
132
132
  type: :development
133
133
  prerelease: false
134
- version_requirements: *70226478378060
134
+ version_requirements: *70148237787000
135
135
  description: Identify causative mutations in a model genome from NGS reads using the
136
136
  NGM method.
137
137
  email: maclean.daniel@gmail.com
@@ -213,6 +213,7 @@ files:
213
213
  - examples/use_indels.rb
214
214
  - lib/bio-gngm.rb
215
215
  - lib/bio/util/bio-gngm.rb
216
+ - lib/bio/util/mutation_effects.rb
216
217
  - scripts/get_subseq.rb
217
218
  - scripts/make_histograms_laerfyve.rb
218
219
  - scripts/make_histograms_laerfyve_stitched.rb
@@ -260,7 +261,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
260
261
  version: '0'
261
262
  segments:
262
263
  - 0
263
- hash: 3948590901607660961
264
+ hash: 729394422036910323
264
265
  required_rubygems_version: !ruby/object:Gem::Requirement
265
266
  none: false
266
267
  requirements: