bio-gngm 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105) hide show
  1. data/.document +5 -0
  2. data/Gemfile +20 -0
  3. data/Gemfile.lock +33 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +53 -0
  7. data/VERSION +1 -0
  8. data/bio-gngm.gemspec +173 -0
  9. data/doc/Bio.html +129 -0
  10. data/doc/Bio/DB.html +128 -0
  11. data/doc/Bio/DB/Pileup.html +316 -0
  12. data/doc/Bio/DB/Vcf.html +683 -0
  13. data/doc/Bio/Util.html +135 -0
  14. data/doc/Bio/Util/Gngm.html +1655 -0
  15. data/doc/LICENSE_txt.html +111 -0
  16. data/doc/_index.html +169 -0
  17. data/doc/class_list.html +47 -0
  18. data/doc/created.rid +4 -0
  19. data/doc/css/common.css +1 -0
  20. data/doc/css/full_list.css +55 -0
  21. data/doc/css/style.css +322 -0
  22. data/doc/doc/created.rid +0 -0
  23. data/doc/file_list.html +52 -0
  24. data/doc/frames.html +13 -0
  25. data/doc/images/add.png +0 -0
  26. data/doc/images/bands.png +0 -0
  27. data/doc/images/brick.png +0 -0
  28. data/doc/images/brick_link.png +0 -0
  29. data/doc/images/bug.png +0 -0
  30. data/doc/images/bullet_black.png +0 -0
  31. data/doc/images/bullet_toggle_minus.png +0 -0
  32. data/doc/images/bullet_toggle_plus.png +0 -0
  33. data/doc/images/date.png +0 -0
  34. data/doc/images/delete.png +0 -0
  35. data/doc/images/find.png +0 -0
  36. data/doc/images/loadingAnimation.gif +0 -0
  37. data/doc/images/macFFBgHack.png +0 -0
  38. data/doc/images/package.png +0 -0
  39. data/doc/images/page_green.png +0 -0
  40. data/doc/images/page_white_text.png +0 -0
  41. data/doc/images/page_white_width.png +0 -0
  42. data/doc/images/plugin.png +0 -0
  43. data/doc/images/ruby.png +0 -0
  44. data/doc/images/signal.png +0 -0
  45. data/doc/images/tag_blue.png +0 -0
  46. data/doc/images/tag_green.png +0 -0
  47. data/doc/images/threads.png +0 -0
  48. data/doc/images/transparent.png +0 -0
  49. data/doc/images/wrench.png +0 -0
  50. data/doc/images/wrench_orange.png +0 -0
  51. data/doc/images/zoom.png +0 -0
  52. data/doc/index.html +88 -0
  53. data/doc/js/app.js +205 -0
  54. data/doc/js/darkfish.js +153 -0
  55. data/doc/js/full_list.js +167 -0
  56. data/doc/js/jquery.js +18 -0
  57. data/doc/js/navigation.js +142 -0
  58. data/doc/js/search.js +94 -0
  59. data/doc/js/search_index.js +1 -0
  60. data/doc/js/searcher.js +228 -0
  61. data/doc/lib/bio-gngm_rb.html +103 -0
  62. data/doc/lib/bio/util/bio-gngm_rb.html +96 -0
  63. data/doc/method_list.html +382 -0
  64. data/doc/rdoc.css +543 -0
  65. data/doc/table_of_contents.html +161 -0
  66. data/examples/.DS_Store +0 -0
  67. data/examples/make_histograms.rb +40 -0
  68. data/examples/make_threads.rb +42 -0
  69. data/examples/make_threads_isize.rb +41 -0
  70. data/examples/use_indels.rb +36 -0
  71. data/lib/bio-gngm.rb +12 -0
  72. data/lib/bio/util/bio-gngm.rb +1029 -0
  73. data/scripts/get_subseq.rb +16 -0
  74. data/scripts/make_histograms_laerfyve.rb +83 -0
  75. data/scripts/make_histograms_laerfyve_stitched.rb +59 -0
  76. data/scripts/make_threads_isize_laerfyfe.rb +52 -0
  77. data/scripts/make_threads_unmapped_laerfyfe.rb +72 -0
  78. data/scripts/make_threads_unmapped_laerfyfe_pseudo.rb +56 -0
  79. data/scripts/make_threads_unmapped_simulation.rb +54 -0
  80. data/scripts/make_threads_unmapped_simulation_immediate_region.rb +59 -0
  81. data/scripts/optimise_freq_window_size.rb +82 -0
  82. data/stitched_contigs.zip +0 -0
  83. data/test/data/ids2.txt +1 -0
  84. data/test/data/sorted.bam +0 -0
  85. data/test/data/test +0 -0
  86. data/test/data/test.bam +0 -0
  87. data/test/data/test.fa +20 -0
  88. data/test/data/test.fai +0 -0
  89. data/test/data/test.sai +0 -0
  90. data/test/data/test.tam +10 -0
  91. data/test/data/test_chr.fasta +1000 -0
  92. data/test/data/test_chr.fasta.amb +2 -0
  93. data/test/data/test_chr.fasta.ann +3 -0
  94. data/test/data/test_chr.fasta.bwt +0 -0
  95. data/test/data/test_chr.fasta.fai +1 -0
  96. data/test/data/test_chr.fasta.pac +0 -0
  97. data/test/data/test_chr.fasta.rbwt +0 -0
  98. data/test/data/test_chr.fasta.rpac +0 -0
  99. data/test/data/test_chr.fasta.rsa +0 -0
  100. data/test/data/test_chr.fasta.sa +0 -0
  101. data/test/data/testu.bam +0 -0
  102. data/test/data/testu.bam.bai +0 -0
  103. data/test/helper.rb +18 -0
  104. data/test/test_bio-gngm.rb +126 -0
  105. metadata +276 -0
@@ -0,0 +1,161 @@
1
+ <!DOCTYPE html>
2
+
3
+ <html>
4
+ <head>
5
+ <meta content="text/html; charset=UTF-8" http-equiv="Content-Type">
6
+
7
+ <title>Table of Contents</title>
8
+
9
+ <link type="text/css" media="screen" href="./rdoc.css" rel="stylesheet">
10
+
11
+ <script type="text/javascript">
12
+ var rdoc_rel_prefix = "./";
13
+ </script>
14
+
15
+ <script type="text/javascript" charset="utf-8" src="./js/jquery.js"></script>
16
+ <script type="text/javascript" charset="utf-8" src="./js/navigation.js"></script>
17
+ <script type="text/javascript" charset="utf-8" src="./js/search_index.js"></script>
18
+ <script type="text/javascript" charset="utf-8" src="./js/search.js"></script>
19
+ <script type="text/javascript" charset="utf-8" src="./js/searcher.js"></script>
20
+ <script type="text/javascript" charset="utf-8" src="./js/darkfish.js"></script>
21
+
22
+
23
+ <body class="indexpage">
24
+ <h1>Table of Contents</h1>
25
+
26
+ <h2>Pages</h2>
27
+ <ul>
28
+ <li class="file">
29
+ <a href="LICENSE_txt.html">LICENSE</a>
30
+ </li>
31
+
32
+ </ul>
33
+
34
+ <h2 id="classes">Classes/Modules</h2>
35
+ <ul>
36
+ <li class="module">
37
+ <a href="Bio.html">Bio</a>
38
+ </li>
39
+ <li class="module">
40
+ <a href="Bio/DB.html">Bio::DB</a>
41
+ </li>
42
+ <li class="class">
43
+ <a href="Bio/DB/Pileup.html">Bio::DB::Pileup</a>
44
+ </li>
45
+ <li class="class">
46
+ <a href="Bio/DB/Vcf.html">Bio::DB::Vcf</a>
47
+ </li>
48
+ <li class="class">
49
+ <a href="Bio/Util.html">Bio::Util</a>
50
+ </li>
51
+ <li class="class">
52
+ <a href="Bio/Util/Gngm.html">Bio::Util::Gngm</a>
53
+
54
+ <img class="toc-toggle" src="images/transparent.png" alt="" title="toggle headings">
55
+ <ul class="initially-hidden">
56
+ <li><a href="Bio/Util/Gngm.html#label-Background">Background</a>
57
+ <li><a href="Bio/Util/Gngm.html#label-Example">Example</a>
58
+ <li><a href="Bio/Util/Gngm.html#label-Polymorphisms+and+statistics">Polymorphisms and statistics</a>
59
+ <li><a href="Bio/Util/Gngm.html#label-SNPs">SNPs</a>
60
+ <li><a href="Bio/Util/Gngm.html#label-Short+INDELS">Short INDELS</a>
61
+ <li><a href="Bio/Util/Gngm.html#label-Insertion+Size">Insertion Size</a>
62
+ <li><a href="Bio/Util/Gngm.html#label-Unmapped+Mate+Pairs+%2F+Paired+Ends.">Unmapped Mate Pairs / Paired Ends.</a>
63
+ <li><a href="Bio/Util/Gngm.html#label-Input+types">Input types</a>
64
+ <li><a href="Bio/Util/Gngm.html#label-Workflow">Workflow</a>
65
+ <li><a href="Bio/Util/Gngm.html#label-Prerequisites">Prerequisites</a>
66
+ <li><a href="Bio/Util/Gngm.html#label-Acknowledgements">Acknowledgements</a>
67
+ <li><a href="Bio/Util/Gngm.html#label-Using+bio-gngm">Using bio-gngm</a>
68
+ <li><a href="Bio/Util/Gngm.html#label-API">API</a>
69
+ </ul>
70
+ </li>
71
+
72
+ </ul>
73
+
74
+ <h2 id="methods">Methods</h2>
75
+ <ul>
76
+
77
+ <li class="method"><a href="Bio/Util/Gngm.html#method-c-new">::new &mdash; Bio::Util::Gngm</a>
78
+
79
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-alternatives">#alternatives &mdash; Bio::DB::Vcf</a>
80
+
81
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-calculate_clusters">#calculate_clusters &mdash; Bio::Util::Gngm</a>
82
+
83
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-calculate_densities">#calculate_densities &mdash; Bio::Util::Gngm</a>
84
+
85
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-calculate_signal">#calculate_signal &mdash; Bio::Util::Gngm</a>
86
+
87
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-close">#close &mdash; Bio::Util::Gngm</a>
88
+
89
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-clusters">#clusters &mdash; Bio::Util::Gngm</a>
90
+
91
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-collect_threads">#collect_threads &mdash; Bio::Util::Gngm</a>
92
+
93
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-densities">#densities &mdash; Bio::Util::Gngm</a>
94
+
95
+ <li class="method"><a href="Bio/DB/Pileup.html#method-i-discordant_chastity">#discordant_chastity &mdash; Bio::DB::Pileup</a>
96
+
97
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-draw_bands">#draw_bands &mdash; Bio::Util::Gngm</a>
98
+
99
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-draw_hit_count">#draw_hit_count &mdash; Bio::Util::Gngm</a>
100
+
101
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-draw_peaks">#draw_peaks &mdash; Bio::Util::Gngm</a>
102
+
103
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-draw_signal">#draw_signal &mdash; Bio::Util::Gngm</a>
104
+
105
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-draw_threads">#draw_threads &mdash; Bio::Util::Gngm</a>
106
+
107
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-frequency_histogram">#frequency_histogram &mdash; Bio::Util::Gngm</a>
108
+
109
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-get_band">#get_band &mdash; Bio::Util::Gngm</a>
110
+
111
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-get_insert_size_frequency">#get_insert_size_frequency &mdash; Bio::Util::Gngm</a>
112
+
113
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-get_unmapped_mate_frequency">#get_unmapped_mate_frequency &mdash; Bio::Util::Gngm</a>
114
+
115
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-gq">#gq &mdash; Bio::DB::Vcf</a>
116
+
117
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-has_just_one_variant-3F">#has_just_one_variant? &mdash; Bio::DB::Vcf</a>
118
+
119
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-hit_count">#hit_count &mdash; Bio::Util::Gngm</a>
120
+
121
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-is_deletion-3F">#is_deletion? &mdash; Bio::DB::Vcf</a>
122
+
123
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-is_indel-3F">#is_indel? &mdash; Bio::DB::Vcf</a>
124
+
125
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-is_insertion-3F">#is_insertion? &mdash; Bio::DB::Vcf</a>
126
+
127
+ <li class="method"><a href="Bio/DB/Pileup.html#method-i-is_snp-3F">#is_snp? &mdash; Bio::DB::Pileup</a>
128
+
129
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-mq">#mq &mdash; Bio::DB::Vcf</a>
130
+
131
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-non_ref_allele_count">#non_ref_allele_count &mdash; Bio::DB::Vcf</a>
132
+
133
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-non_ref_allele_freq">#non_ref_allele_freq &mdash; Bio::DB::Vcf</a>
134
+
135
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-pass_quality-3F">#pass_quality? &mdash; Bio::DB::Vcf</a>
136
+
137
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-peaks">#peaks &mdash; Bio::Util::Gngm</a>
138
+
139
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-pl">#pl &mdash; Bio::DB::Vcf</a>
140
+
141
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-signal">#signal &mdash; Bio::Util::Gngm</a>
142
+
143
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-snp_positions">#snp_positions &mdash; Bio::Util::Gngm</a>
144
+
145
+ <li class="method"><a href="Bio/Util/Gngm.html#method-i-threads">#threads &mdash; Bio::Util::Gngm</a>
146
+
147
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-to_s">#to_s &mdash; Bio::DB::Vcf</a>
148
+
149
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-used_depth">#used_depth &mdash; Bio::DB::Vcf</a>
150
+
151
+ <li class="method"><a href="Bio/DB/Vcf.html#method-i-variant-3F">#variant? &mdash; Bio::DB::Vcf</a>
152
+
153
+ </ul>
154
+
155
+
156
+ <footer id="validator-badges">
157
+ <p><a href="http://validator.w3.org/check/referer">[Validate]</a>
158
+ <p>Generated by <a href="https://github.com/rdoc/rdoc">RDoc</a> 3.11.
159
+ <p>Generated with the <a href="http://deveiate.org/projects/Darkfish-Rdoc/">Darkfish Rdoc Generator</a> 3.
160
+ </footer>
161
+
Binary file
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # make_histograms
4
+ #
5
+ # Created by Dan MacLean (TSL) on 2012-01-17.
6
+ # Copyright (c) . All rights reserved.
7
+ ###################################################
8
+
9
+ ### An example script to get SNP positions and make histograms
10
+ ### of the frequency of discordant SNPs. Generates plots for each.
11
+
12
# Make the gem's lib directory and this script's directory loadable.
lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')
$LOAD_PATH.unshift(lib_dir)
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'bio-gngm'
require 'bio-samtools'

# Region of interest and quality settings handed to SAMtools.
samtools_opts = { :r => "Chr1:1-6000000", :q => 20, :Q => 50 }

# Open the BAM file for that region.
g = Bio::Util::Gngm.new(
  :file     => "aln.bam",
  :format   => :bam,
  :fasta    => "reference.fasta",
  :samtools => samtools_opts
)

# Retrieve the SNPs from the BAM file.
g.snp_positions

# Plot one frequency histogram per bin size; each plot is named after its bin width.
[100000, 250000, 500000].each { |bin_width| g.frequency_histogram("#{bin_width}.png", bin_width) }

# Close the BAM file.
g.close
40
+
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # make_threads
4
+ #
5
+ # Created by Dan MacLean (TSL) on 2012-01-17.
6
+ # Copyright (c) . All rights reserved.
7
+ ###################################################
8
+
9
+ ### An example script to specify a region, get SNP positions, make density threads for different kernels,
10
+ ### cluster for different values of k and then draw the threads, bands and signal. Generates plots for each new set of parameters
11
+
12
+
13
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'bio-gngm'

# Input alignment and reference sequence; edit these to point at your own data.
bam_file   = "/Users/macleand/Desktop/ngm/fli.sorted.bam"
fasta_file = "/Users/macleand/Desktop/ngm/TAIR9_chr_all.fas"

# Prefix used in every output file name, derived from the BAM file ("fli" here).
name = File.basename(bam_file, ".sorted.bam")

# Open the BAM file and specify the region of interest.
g = Bio::Util::Gngm.new(:file => bam_file,
                        :format => :bam,
                        :fasta => fasta_file,
                        :samtools => {:r => "Chr1:1000000-2000000",
                                      :q => 20,
                                      :Q => 50
                                     }
                       )
begin
  g.snp_positions(:min_depth => 10, :mapping_quality => 40.0, :min_non_ref_count => 5)
  g.collect_threads(:start => 0.3, :stop => 0.8, :slide => 0.01, :size => 0.2)

  [0.25, 0.5, 1.0].each do |kernel_adjust|   # loop through different kernel values
    [4, 9, 11].each do |k|                   # loop through different cluster numbers
      g.calculate_clusters(:k => k, :adjust => kernel_adjust, :control_chd => 1.0, :expected_chd => 0.5)

      filename = "#{name}_#{k}_#{kernel_adjust}_all_threads.png"
      g.draw_threads(filename)

      filename = "#{name}_#{k}_#{kernel_adjust}_clustered_bands.png"
      g.draw_bands(filename)

      filename = "#{name}_#{k}_#{kernel_adjust}_signal.png"
      g.draw_signal(filename)
    end
  end
ensure
  # Always release the BAM file, even if a calculation or plot fails.
  g.close
end
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # make_threads_isize
4
+ #
5
+ # Created by Dan MacLean (TSL) on 2012-01-17.
6
+ # Copyright (c) . All rights reserved.
7
+ ###################################################
8
+
9
+ ### An example script to use the insert size between paired-end reads to infer large deletions. The script makes density threads based on the proportion of reads with insert size above 0.5 for different kernels,
10
+ ### cluster for different values of k and then draw the threads, bands and signal. Generates plots for each new set of parameters. Contains a rescue clause in case something goes wrong, these sorts of calculations
11
+ ### can be very data-sparse and density curves can be hard to make and cluster when lots of windows are empty
12
+
13
require 'bio-gngm'

# Open the BAM file and specify the region of interest.
g = Bio::Util::Gngm.new(:file => "aln.sort.bam",
                        :format => :bam,
                        :fasta => "reference.fasta",
                        :samtools => {:r => "1:1000000-10000000",
                                      :q => 20,
                                      :Q => 50
                                     }
                       )
begin
  g.get_insert_size_frequency(:ref_window_size => 152, :ref_window_slide => 152, :isize => 150)
  g.collect_threads

  [0.25, 0.5, 1.0].each do |kernel_adjust|
    [4, 9, 11].each do |k|
      begin
        g.calculate_clusters(:k => k, :adjust => kernel_adjust, :control_chd => 0.5, :expected_chd => 0.9)
        filename = "isize_#{k}_#{kernel_adjust}_all_threads.png"
        g.draw_threads(filename, :draw_legend => "isize_#{k}_#{kernel_adjust}_legend.png")
        filename = "isize_#{k}_#{kernel_adjust}_bands.png"
        g.draw_bands(filename)
        filename = "isize_#{k}_#{kernel_adjust}_signal.png"
        g.draw_signal(filename)
      rescue StandardError => e
        # Sparse windows can make density estimation or clustering fail for one
        # parameter set; report it and carry on with the next. StandardError is
        # rescued (not Exception) so Ctrl-C and exit still stop the script.
        puts "failed on #{k} #{kernel_adjust}"
        puts e.message, e.backtrace
      end
    end
  end
ensure
  # Release the BAM file whatever happened above.
  g.close
end
@@ -0,0 +1,36 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # use_indels
4
+ #
5
+ # Created by Dan MacLean (TSL) on 2012-01-17.
6
+ # Copyright (c) . All rights reserved.
7
+ ###################################################
8
+
9
+ ### An example script to specify a region, get small deletion positions, make density threads for different kernels,
10
+ ### cluster for different values of k and then draw the threads, bands and signal. Generates plots for each new set of parameters
11
+
12
require 'bio-gngm'

# Region of interest and quality settings handed to SAMtools.
region_opts = { :r => "Chr1:1-3000000", :q => 20, :Q => 50 }

g = Bio::Util::Gngm.new(
  :file     => "aln.sorted.bam",
  :format   => :bam,
  :fasta    => "reference.fasta",
  :samtools => region_opts
)

# Find small deletions only, build the density threads and cluster them.
g.snp_positions(:deletions_only => true)
g.collect_threads
g.calculate_clusters(:k => 9, :adjust => 0.5, :control_chd => 0.5, :expected_chd => 1.0)

# Draw each plot in turn: threads, bands, signal, then peaks.
[
  [:draw_threads, "indels_all_threads.png"],
  [:draw_bands,   "indels_clustered_bands.png"],
  [:draw_signal,  "indels_signal.png"],
  [:draw_peaks,   "indels_peaks.png"]
].each do |plot_method, filename|
  g.public_send(plot_method, filename)
end

g.close
36
+
@@ -0,0 +1,12 @@
1
+ # Please require your code below, respecting the bioruby directory tree.
2
+ # For instance, perhaps the only uncommented line in this file might
3
+ # be something like this:
4
+ #
5
+ # require 'bio/sequence/awesome_sequence_plugin_thingy'
6
+ #
7
+ # and then create the ruby file 'lib/bio/sequence/awesome_sequence_plugin_thingy.rb'
8
+ # and put your plugin's code there. It is bad practice to write other code
9
+ # directly into this file, because doing so causes confusion if this biogem
10
+ # was ever to get merged into the main bioruby tree.
11
+
12
+ require 'bio/util/bio-gngm'
@@ -0,0 +1,1029 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # bio-gngm
4
+ #
5
+ # Created by Dan MacLean (TSL) on 2011-12-07.
6
+ # Copyright (c) . All rights reserved.
7
+ ###################################################
8
+
9
+ require 'rinruby'
10
+ require 'bio-samtools'
11
+ require 'bio/db/pileup'
12
+ require 'bio/db/vcf'
13
+ require 'pp'
14
+
15
+ =begin
16
+ Extends the methods of the Bio::DB::Pileup class in bio-samtools. A pileup object represents the SAMtools pileup format at
17
+ http://samtools.sourceforge.net/pileup.shtml. These extension methods are used by the Bio::Util::Gngm object internally and
18
+ are not exposed to the user of the Bio::Util::Gngm object through that.
19
+ =end
20
+ class Bio::DB::Pileup
21
+
22
+ #attributes set by call to Bio::DB::Pileup#discordant_chastity
23
+ attr_accessor :top_non_ref_count, :second_non_ref_count, :third_non_ref_count
24
+
25
+ #calculates the discordant chastity statistic as defined in Austin et al (2011) http://bar.utoronto.ca/ngm/description.html and http://onlinelibrary.wiley.com/doi/10.1111/j.1365-313X.2011.04619.x/abstract;jsessionid=F73E2DA628523B26205297CEE95526DA.d02t04
26
+ #Austin _et_ _al_ (2011) *Next-generation* *mapping* *of* *Arabidopsis* *genes* _Plant_ _Journal_ *67*(4):7125-725
27
+ #
28
+ #Briefly,
29
+ # The statistic measures the degree of difference between the SNP and the expected reference base.
30
+ # Using the mapping information comprising a SNP, the most frequent base that is not the reference base
31
+ # is compared to the next most common base after it.
32
+ # (from http://bar.utoronto.ca/ngm/description.html )
33
+ def discordant_chastity
34
+ arr = self.non_refs.to_a.sort {|a,b| b.last <=> a.last }
35
+ @top_non_ref_count, @second_non_ref_count, @third_non_ref_count = arr.collect {|c| c.last}
36
+ case
37
+ when self.non_ref_count == 0 then 0.0
38
+ when @top_non_ref_count == @coverage then 1.0
39
+ when @second_non_ref_count > 0 then @top_non_ref_count.to_f / (@top_non_ref_count + @second_non_ref_count).to_f
40
+ else @top_non_ref_count.to_f / @coverage.to_f
41
+ end
42
+ end
43
+
44
+ #returns true if self is a SNP with minimum coverage depth of +:min_depth+ and minimum non-reference bases of +:min_non_ref_count+
45
+ #returns false for every position where the reference base is N or n if +:ignore_reference_n+ is set to true
46
+ #
47
+ #Options and Defaults:
48
+ #- :min_depth => 2
49
+ #- :min_non_ref_count => 2
50
+ #- :ignore_reference_n => false
51
+ #
52
+ #Example
53
+ # pileup.is_snp?(:min_depth => 5, :min_non_ref_count => 2)
54
+ # pileup.is_snp?(:min_depth => 5, :min_non_ref_count => 1, :ignore_reference_n => true)
55
+ def is_snp?(opts)
56
+ if opts[:ignore_reference_n] and self.ref_base == "N" or self.ref_base == "n"
57
+ return false
58
+ elsif self.coverage >= opts[:min_depth] and self.non_ref_count >= opts[:min_non_ref_count]
59
+ return true
60
+ end
61
+ false
62
+ end
63
+ end
64
+
65
+
66
+
67
+ #Extends the methods of the Bio::DB::Vcf class in bio-samtools. A Vcf object represents the VCF format described at
68
+ #http://www.1000genomes.org/node/101 . The Bio::DB::Vcf object returns all information in the VCF line, but the implementation here acts as if
69
+ #there is only possibly one variant at each position and ignores positions at which there may be multiple variants. Vcf format is only used when
70
+ #the Bio::Util::Gngm object requests information about indels using SAMtools mpileup method.
71
+ class Bio::DB::Vcf
72
+ #returns true if the +alt+ column of the Vcf is not *.*
73
+ #
74
+ #Examples
75
+ #
76
+ #vcf record = 20 14370 rs6054257 G A 29 PASS ...
77
+ # vcf.variant? #=> true
78
+ #vcf record = 20 1230237 . T . 47 PASS ...
79
+ # vcf.variant? #=> false
80
+ def variant?
81
+ not self.alt == "." rescue false
82
+ end
83
+
84
+ #Return a short string representing chromosome, position, reference sequence, alt sequence(s) and the info string of the Vcf object.
85
+ def to_s
86
+ "#{self.chrom} #{self.pos} #{self.ref} #{self.alt} #{self.info}"
87
+ end
88
+
89
+ #The depth of reads actually used in the genotype call by Vcftools. The sum of the DP4 attribute. Returns 0.0 if no value is calculated.
90
+ def used_depth
91
+ self.info["DP4"].split(",").inject {|sum,n| sum.to_f + n.to_f} rescue 0.0
92
+ end
93
+
94
+ #List of alternate alleles at this locus, obtained by splitting the vcf.alt attribute string on commas
95
+ #
96
+ #Example
97
+ #vcf.alt = "ACT,TCA"
98
+ # vcf.alternatives = ["ACT", "TCA"]
99
+ #vcf.alt = "T"
100
+ # vcf.alternatives = ["T"]
101
+ def alternatives
102
+ self.alt.split(",") rescue []
103
+ end
104
+
105
+ ##Returns the depth of reads containing the non reference allele. IE the sum of the last two figures in the DP4 attribute.
106
+ def non_ref_allele_count
107
+ self.info["DP4"].split(",")[2..3].inject {|sum,n| sum.to_f + n.to_f } rescue 0.0
108
+ end
109
+
110
+ #Returns the non-reference allele frequency based on depth of reads used for the genotype call,
111
+ #
112
+ #IE
113
+ # vcf.non_ref_allele_count / vcf.used_depth
114
+ def non_ref_allele_freq
115
+ self.non_ref_allele_count / self.used_depth
116
+ end
117
+
118
+ #Returns the mean Mapping Quality from the reads over this position as defined by the Vcf MQ attribute.
119
+ def mq
120
+ self.info["MQ"].to_f rescue 0.0
121
+ end
122
+
123
+ ##Returns the genotype quality score from the sample data (as defined by the Vcf GQ attribute) for the first sample in the Vcf only.
124
+ def gq
125
+ self.samples["1"]["GQ"].to_f rescue 0.0
126
+ end
127
+
128
+ #Returns the phred scaled likelihood of the first non-reference allele (as defined by the Vcf PL attribute) for the first sample in the Vcf only.
129
+ def pl
130
+ self.samples["1"]["PL"].split(",")[1].to_f rescue 0.0
131
+ end
132
+
133
+ #Returns true if only one variant allele is recorded. Loci with more than one allele are too complicated for now, so are discarded...
134
+ def has_just_one_variant?
135
+ self.alternatives.length == 1 and self.variant?
136
+ end
137
+
138
+ #Returns true if the position passes criteria
139
+ #
140
+ #Options and Defaults:
141
+ #- :min_depth => 2
142
+ #- :min_non_ref_count => 2
143
+ #- :mapping_quality => 10
144
+ #
145
+ #Example
146
+ # vcf.pass_quality?(:min_depth => 5, :min_non_ref_count => 2, :mapping_quality => 25)
147
+ def pass_quality?(options)
148
+ (self.used_depth >= options[:min_depth] and self.mq >= options[:mapping_quality] and self.non_ref_allele_count >= options[:min_non_ref_count])
149
+ end
150
+
151
+ #Returns true if the length of the alt column is less than that of the ref column in the Vcf and if Vcf.pass_quality? is true.
152
+ #Looks only at the positions that are predicted simple deletions, any positions where the alt alleles includes more than one deletion or a SNP or an insertion also is ignored.
153
+ def is_deletion?(options)
154
+ case
155
+ when (not self.has_just_one_variant?) then false
156
+ when ( self.alt.length < self.ref.length and self.pass_quality?(options) ) then true
157
+ else false
158
+ end
159
+ rescue ## if something goes wrong, skip the postion,
160
+ false
161
+ end
162
+
163
+ #Returns true if the length of the alt column is greater than that of the ref column in the Vcf and if Vcf.pass_quality? is true.
164
+ #Looks only at the positions that are predicted simple deletions, any positions where the alt alleles includes more than one deletion or a SNP or an insertion also is ignored.
165
+ def is_insertion?(options)
166
+ case
167
+ when (not self.has_just_one_variant?) then false
168
+ when ( self.alt.length > self.ref.length and self.pass_quality?(options) ) then true
169
+ else false
170
+ end
171
+ rescue ## if something goes wrong, skip the postion,
172
+ false
173
+ end
174
+
175
+ #Returns true if either Vcf.is_insertion? or Vcf.is_deletion? is true
176
+ def is_indel?(opts)
177
+ self.is_insertion?(opts) || self.is_deletion?(opts)
178
+ end
179
+
180
+
181
+ end
182
+
183
+
184
+ module Bio
185
+ class Util
186
+ =begin
187
+ A Bio::Util::Gngm object represents a single region on a reference genome that is to be examined using the NGM technique described in Austin et al (2011) http://bar.utoronto.ca/ngm/description.html and http://onlinelibrary.wiley.com/doi/10.1111/j.1365-313X.2011.04619.x/abstract;jsessionid=F73E2DA628523B26205297CEE95526DA.d02t04
188
+ Austin _et_ _al_ (2011) *Next-generation* *mapping* *of* *Arabidopsis* *genes* _Plant_ _Journal_ *67*(4):715-725 .
189
+
190
+ Bio::Util::Gngm provides methods for finding SNPs, small INDELS and larger INDELS, creating histograms of polymorphism frequency,
191
+ creating and clustering density curves, creating signal plots and finding peaks. The ratio of reference-agreeing and reference-differing reads can be specified.
192
+
193
+ == Background
194
+ The basic concept of the technique is that density curves of polymorphism frequency across the region of interest are plotted and analysed. Each curve is called a thread, as it represents a polymorphism that
195
+ was called with a statistic within a certain user-specified range, eg if a SNP was called with 50% non-reference bases from sequence reads (say all A), and 50% reference reads (all T) then a discordant
196
+ chastity statistic (ChD) of 0.5 would be calculated and assigned to that SNP. Depending on the width and slide of the windows the user had specified, the frequency of SNPs with ChD in the specified range
197
+ would be drawn in the same density curve. In the figure below each different coloured curve represents the frequency of SNPs with similar ChD.
198
+
199
+ link:images/threads.png
200
+
201
+ Each of these density curves is called a thread. Threads are clustered into groups called bands and the bands containing the expected and control polymorphisms extracted. In the figure below, the control band is 0.5, the expected mutation is 1.0.
202
+ Typically and in the Austin et al (2011) description of NGM the control band is the heterophasic band that represents natural variation, the thing taken to be the baseline. For a simple SNP, numerically the discordant chastity is expected to be 0.5.
203
+ Conversely the expected band is the homophasic band that represents the selected for SNP region. Normally the discordant chastity is expected to be 1.0.
204
+
205
+ link:images/bands.png
206
+
207
+ The points where the signal from the control and expected band converge most is a likely candidate region for the causative mutation, so here at about the 1.6 millionth nucleotide.
208
+
209
+ link:images/signal.png
210
+
211
+ == Example
212
+ require 'bio-gngm'
213
+
214
+
215
+
216
+
217
+ g = Bio::Util::Gngm.new(:file => "aln.sorted.bam",
218
+ :format => :bam,
219
+ :fasta => "reference.fasta",
220
+ :samtools => {:r => "chr1:1-100000",
221
+ :q => 20,
222
+ :Q => 50
223
+ },
224
+ :min_non_ref_freq => 0.5,
225
+ :min_non_ref => 3
226
+ )
227
+ g.snp_positions
228
+ g.collect_threads(:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 )
229
+ [0.25, 0.5, 1.0].each do |kernel_adjust| # loop through different kernel values
230
+ [4, 9, 11].each do | k | # loop through different cluster numbers
231
+
232
+ #cluster
233
+ g.calculate_clusters(:k => k, :adjust => kernel_adjust, :control_chd => 0.7, :expected_chd => 0.5)
234
+ #draw thread and bands
235
+ filename = "#{name}_#{k}_#{kernel_adjust}_all_threads.png"
236
+ g.draw_threads(filename)
237
+
238
+ filename = "#{name}_#{k}_#{kernel_adjust}_clustered_bands.png"
239
+ g.draw_bands(filename, :add_lines => [100,30000,675432])
240
+
241
+ #draw signal
242
+ filename = "#{name}_#{k}_#{kernel_adjust}_signal.png"
243
+ g.draw_signal(filename)
244
+
245
+ #auto-guess peaks
246
+ filename = "#{name}_#{k}_#{kernel_adjust}_peaks.png"
247
+ g.draw_peaks(filename)
248
+ end
249
+ end
250
+ g.close #close BAM file
251
+
252
+ == Polymorphisms and statistics
253
+ Bio::Util::Gngm will allow you to look for polymorphisms that are SNPs, INDELS (as insertions uniquely, deletions uniquely or both) and longer insertions or deletions based on the insert size on paired-end read alignments.
254
+ Each has a different statistic attached to it.
255
+
256
+ === SNPs
257
+ Simple Single Nucleotide Polymorphisms are called and their ChD statistics calculated as described in Austin et al (2011).
258
+
259
+ === Short INDELS
260
+ These are called via SAMtools/BCFtools so are limited to the INDELs that can be called that way. The implementation at the moment only considers positions with one INDEL, sites with
261
+ more than one potential INDEL (ie multiple alleles) are disregarded as a position at all. See the Bio::DB::Vcf extensions in this package for a description of what constitutes an INDEL.
262
+ The Vcf attribute Bio::DB::Vcf#non_ref_allele_freq is used as the statistic in this case.
263
+
264
+ === Insertion Size
265
+ Paired-end alignments have an expected distance between the paired reads (called insert size, or isize). Groups of reads in one position with larger or smaller than expected isize can indicate large deletions or insertions.
266
+ Due to the details of read preparation the actual isize varies around a mean value with an expected proportion of 50% of reads having isize above the mean, and 50% below. To create density curves of insertion size frequency a
267
+ window of user-specified size is moved along the reference genome in user-specified steps and all alignments in that window are examined. The Bio::DB::Sam#isize attribute is inspected for all alignments passing
268
+ user-specified quality and the proportion of reads in that window that have an insert size > the expected insert size is used as the statistic in this case. Proportions approaching 1 indicate that the sequenced organism has a deletion in that section relative to the reference.
269
+ Proportions approaching 0 indicate an insertion in that section relative to the reference. Proportions around 0.5 indicate random variation of insert size, IE no INDEL. Seems to be a good idea to keep the window size similar to the read + isize. Useful in conjunction with assessing unmapped mates.
270
+
271
+ === Unmapped Mate Pairs / Paired Ends.
272
+ Paired-end alignments where one mate finds a mapping but the other doesn't can indicate an insertion/deletion larger than the insert size of the reads used (IE one read disappeared into the deleted section). This method uses a statistic based on proportion of
273
+ mapped/unmapped reads in a window. Proportions of reads that are mapped but the mate is unmapped should be about 0.5 in a window over an insertion/deletion (since the reads can go in either direction..). With no insertion deletion, the
274
+ proportion should be closer to 0.
275
+
276
+ == Input types
277
+ A sorted BAM file is used as the source of alignments. Pileup is not used nor likely to be as it is a deprecated function within SAMtools. With the BAM file you will need the reference FASTA and the BAM index (.bai).
278
+
279
+ == Workflow
280
+ 1. Create Bio::Util::Gngm object for a specific region in the reference genome
281
+ 2. Polymorphisms are found
282
+ 3. Density curves (threads) are calculated
283
+ 4. Clustering density threads into bands is done
284
+ 5. Signal is compared between band of interest and control
285
+ 6. Figures are printed
286
+
287
+ == Prerequisites
288
+ - Ruby 1.9.3 or greater (if you have an earlier version, try RVM for installing different versions of Ruby alongside your system install and switching nicely between them)
289
+ - R 2.11.1 or greater
290
+
291
+ The following ruby-gems are required
292
+ - rinruby >= 2.0.2
293
+ - bio-samtools >= 0.5.0
294
+
295
+ The following R packages are required
296
+ - ggplot2
297
+ - peaks
298
+
299
+ == Acknowledgements
300
+ Thanks very much indeed to Ryan Austin, who invented NGM in the first place and was very forthcoming with R code, around which this implementation is based.
301
+
302
+ == Using bio-gngm
303
+ The package is not yet released; a gem will be prepared soon. Until then, scripts run fine when saved in the package scripts directory, with the preamble below at the top of the script. Run scripts from the root of the package directory.
304
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
305
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
306
+ require 'bio-samtools'
307
+ require 'bio-gngm'
308
+
309
+ == API
310
+ =end
311
+ class Gngm
312
+
313
+ attr_accessor :file
314
+
315
+ #Ruby 1.9.3 has a rounding error in the Range#step function such that some decimal places are rounded off to 0.00000000000000...1 above their place. So this constant is used to identify
316
+ #windows within a short distance and prevent any rounding errors. Hopefully I should be able to remove this in later versions.
317
+ ERROR_MARGIN = 0.000001
318
+
319
+ public
320
+ #Returns a new Bio::Util::Gngm object.
321
+ #
322
+ # g = Bio::Util::Gngm.new(:file => "aln.sort.bam",
323
+ # :format => :bam,
324
+ # :samtools => {:q => 20, :Q => 50, :r => "Chr1:1-100000"},
325
+ # :fasta => "reference.fa"
326
+ #
327
+ # )
328
+ #
329
+ #Required parameters and defaults:
330
+ #- <tt>:file => nil</tt> -the path to the bam file containing the alignments, a .bai index must be present
331
+ #- <tt>:format => :bam</tt> -always bam
332
+ #- <tt>:fasta => nil</tt> -the path to the FASTA formatted reference sequence
333
+ #- <tt>:samtools => {:q => 20, :Q => 50, :r => "Chr1:100-1100"}</tt> -options for samtools, see bio-samtools documentation for further details. The :r option is required to specify the region of interest
334
+ #Optional parameters and defaults:
335
+ #Most of these are parameters for specific methods and can be over-ridden when particular methods are called
336
+ #- <tt>:variant_call => {:indels => false, :deletions_only => false, :insertions_only => false, :min_depth => 2, :max_depth => 10000000, :mapping_quality => 10.0, :min_non_ref_count => 2, :ignore_reference_n => true}</tt> -for SNP/Indel calling only one of <tt>:indels, :deletions_only, :insertions_only</tt> should be used.
337
+ #- <tt>:threads => {:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 }</tt> -options for thread windows
338
+ #- <tt>:insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150}</tt> -options for insert size calculations
339
+ #- <tt>:histo_bin_width => 250000</tt> -bin width for histograms of SNP frequency
340
+ #- <tt>:graphics => {:width => 1000, :height => 500, :draw_legend => false, :add_boxes => nil}</tt> -graphics output options, +:draw_legend+ draws a legend plot for band figures only
341
+ #- <tt>:peaks => {:sigma => 3.0, :threshold => 10.0, :background => false, :iterations => 13, :markov => false, :window => 3, :range => 10000}</tt> -parameters for automated peak calling, parameters relate to R package Peaks. +:range+ is the width of the box to draw on the peak plot
342
def initialize(options)
  # Results that are computed on demand by the analysis methods start out empty.
  @file = nil
  @snp_positions = nil
  @threads = nil
  @densities = nil
  @clusters = nil
  @control_band = nil
  @expected_band = nil
  @signal = nil
  @peak_indices = nil
  @peak_y_values = nil
  @density_max_y = nil #the maximum y value needed to plot the entire set of density plots of threads and maintain a consistent scale for plots
  @colours = %w[#A6CEE3 #1F78B4 #B2DF8A #33A02C #FB9A99 #E31A1C #FDBF6F #FF7F00 #CAB2D6 #6A3D9A #FFFF99 #B15928]
  @thread_colours = {}

  ## Notes on :variant_call
  ## :indels = call any and only indels.. :deletions_only / :insertions_only = only one type
  ## some options are designed to be equivalent to the vcfutils.pl options from bcftools when using vcf
  ## :min_depth (-d)
  ## :max_depth (-D)
  ## :mapping_quality (-Q) minimum RMS mapping quality for SNPs (mq in info fields)
  ## :min_non_ref_count (-a) minimum num of alt bases ... the sum of the last two numbers in DP4 in info fields
  ## doesnt do anything with window filtering or pv values...
  defaults = {
    :file => nil,
    :format => :bam,
    :fasta => nil,
    :samtools => {:q => 20, :Q => 50},
    :insert_size_opts => {:ref_window_size => 200, :ref_window_slide => 50, :isize => 150},
    :variant_call => {:indels => false, :deletions_only => false, :insertions_only => false, :min_depth => 2, :max_depth => 10000000, :mapping_quality => 10.0, :min_non_ref_count => 2, :ignore_reference_n => true},
    :histo_bin_width => 250000,
    :graphics => {:width => 1000, :height => 500, :draw_legend => false, :add_boxes => nil},
    :adjust => 1,
    :control_chd => 0.5,
    :expected_chd => 1.0,
    :threads => {:start => 0.2, :stop => 1.0, :slide => 0.01, :size => 0.1 },
    :peaks => {:sigma => 3.0, :threshold => 10.0, :background => false, :iterations => 13, :markov => false, :window => 3, :range => 10000} ##range is the width of the box to draw on the peak plot
  }

  # Merge one level deep: when both the default and the supplied value are hashes, the supplied
  # keys override individual defaults instead of replacing the whole nested hash. A plain merge
  # would drop e.g. the :q and :Q defaults as soon as the caller passed :samtools => {:r => ...}.
  # The nested merge also yields a fresh hash, so the caller's own hash is never stored.
  @opts = defaults.merge(options) do |_key, default, given|
    (default.is_a?(Hash) && given.is_a?(Hash)) ? default.merge(given) : given
  end
  open_file
end
381
+
382
+ private
383
+ #opens the file
384
def open_file
  # Only :bam is handled; for any other :format nothing is opened and nil is returned.
  open_bam if @opts[:format] == :bam
end
389
+
390
+ private
391
+ #calls Bio::DB::Sam.open
392
def open_bam
  # Build the Bio::DB::Sam handle from the :file and :fasta options, keep it in @file, then open it.
  sam_args = {:bam => @opts[:file], :fasta => @opts[:fasta]}
  @file = Bio::DB::Sam.new(sam_args)
  @file.open
end
396
+
397
+ public
398
+ #for BAM files calls Bio::DB::Sam#close to close the connections to input files safely
399
def close
  # Only a :bam source has a handle to close; other formats are a no-op returning nil.
  @file.close if @opts[:format] == :bam
end
404
+
405
+ public
406
+ #Returns array of arrays <tt>[[position, statistic]]</tt> for polymorphisms passing filters in +optsa+
407
+ #Default options are those in the +:variant_call+ global options hash which can be over ridden in the method call
408
+ #
409
+ #Options and defaults:
410
+ #- <tt>:indels => false</tt> -call small insertions AND deletions instead of simple SNPs
411
+ #- <tt>:deletions_only => false</tt> -call just deletions instead of simple SNPs
412
+ #- <tt>:insertions_only => false</tt> -call small insertions instead of simple SNPs
413
+ #- <tt>:min_depth => 2</tt> -minimum quality passing depth of coverage at a position for a SNP call
414
+ #- <tt>:max_depth => 10000000</tt> -maximum quality passing depth of coverage at a position for a SNP call
415
+ #- <tt>:mapping_quality => 10.0</tt> -minimum mapping quality required for a read to be used in depth calculation
416
+ #- <tt>:min_non_ref_count => 2</tt> -minimum number of reads not matching the reference for SNP to be called
417
+ #- <tt>:ignore_reference_n => true</tt> -ignore positions where the reference is N or n
418
+ #
419
+ #When INDEL calling only one of <tt>:indels, :deletions_only, :insertions_only</tt> should be used. If all are +false+, SNPs are called.
420
+ #
421
+ #Sets the instance variable @snp_positions. Only gets positions the first time it is called, in subsequent calls pre-computed positions and statistics are returned, so changing parameters has no effect.
422
def snp_positions(optsa={})
  call_opts = @opts[:variant_call].merge(optsa)
  # Positions are computed once; later calls return the cached array whatever options are given.
  return @snp_positions if @snp_positions
  return nil unless @file.instance_of?(Bio::DB::Sam)
  get_snp_positions_from_bam(call_opts)
end
429
+
430
+ private
431
+ #Calls SNP/short INDEL positions from a BAM file and the appropriate statistic according to quality criteria passed by Bio::Util::Gngm#snp_positions.
432
+ #Sets @snp_positions
433
def get_snp_positions_from_bam(options={})
  opts = @opts[:variant_call].merge(options)
  if opts[:indels] and (opts[:deletions_only] or opts[:insertions_only])
    raise "Cant have indels and deletions only or insertions only, need to specify ':indels => true' to get both"
  end

  # mpileup_plus needs :g in the samtools options. Set it on a copy: @opts[:samtools] is shared
  # by other methods (and may be the very hash the caller passed in), so it must not be altered.
  samtools_opts = @opts[:samtools].dup
  samtools_opts[:g] = true if opts[:indels] || opts[:deletions_only] || opts[:insertions_only]

  arr = []
  if samtools_opts[:g]
    @file.mpileup_plus(samtools_opts) do |vcf|
      next unless vcf.variant? ##we dont care about the calls for reference agreeing positions
      next if opts[:ignore_reference_n] && vcf.ref =~ /N/i
      ##indel use returns the vcf allele_frequency, not the ChDs (because calculating it is a mess... )
      wanted = if opts[:indels]
                 vcf.is_indel?(opts)
               elsif opts[:deletions_only]
                 vcf.is_deletion?(opts)
               elsif opts[:insertions_only]
                 vcf.is_insertion?(opts)
               end
      arr << [vcf.pos, vcf.non_ref_allele_freq] if wanted
    end
  else
    # Plain SNP calling: the statistic is the pileup's discordant chastity.
    @file.mpileup(samtools_opts) do |pileup|
      arr << [pileup.pos, pileup.discordant_chastity] if pileup.is_snp?(opts)
    end
  end

  @snp_positions = arr
end
466
+
467
+ private
468
+ #Gets the insert size for each alignment in the BAM positions from a BAM file according to quality criteria passed by Bio::Util::Gngm#get_insert_size_frequency.
469
def get_insert_size_frequency_from_bam(opts={})
  window_size, window_slide = opts[:ref_window_size], opts[:ref_window_slide]

  # The region of interest comes from the samtools :r option ("chr:start-stop"). Fail loudly when
  # it is missing or malformed rather than silently scanning chromosome "" from 0 to 0.
  region = @opts[:samtools][:r].to_s.match(/(.*):(.*)-(.*)/)
  raise ArgumentError, "samtools :r option must look like 'Chr1:1-1000', got #{@opts[:samtools][:r].inspect}" unless region
  chr, rstart, rstop = region[1], region[2].to_i, region[3].to_i

  arr = []
  (rstart..rstop).step(window_slide) do |win_start|
    win_tot = 0.0
    win_over_isize = 0.0
    @file.fetch(chr, win_start, win_start + window_size).each do |alignment|
      next unless alignment_passes(alignment)
      win_tot += 1
      win_over_isize += 1 if alignment.isize.abs > opts[:isize]
    end
    # Float division: a window with no passing alignments gives 0.0/0.0, i.e. NaN.
    arr << [win_start, win_over_isize / win_tot]
  end
  @snp_positions = arr
end
487
+ #Gets the proportion of reads with unmapped mates in a window
488
def get_unmapped_mate_frequency_from_bam(opts={})
  window_size, window_slide = opts[:ref_window_size], opts[:ref_window_slide]

  # The region of interest comes from the samtools :r option ("chr:start-stop"). Fail loudly when
  # it is missing or malformed rather than silently scanning chromosome "" from 0 to 0.
  region = @opts[:samtools][:r].to_s.match(/(.*):(.*)-(.*)/)
  raise ArgumentError, "samtools :r option must look like 'Chr1:1-1000', got #{@opts[:samtools][:r].inspect}" unless region
  chr, rstart, rstop = region[1], region[2].to_i, region[3].to_i

  arr = []
  (rstart..rstop).step(window_slide) do |win_start|
    win_tot = 0.0
    win_mates_unmapped = 0.0
    @file.fetch(chr, win_start, win_start + window_size).each do |alignment|
      # Only the failed-quality flag is filtered here; mapq and pairing are not checked.
      next if alignment.failed_quality
      win_tot += 1
      win_mates_unmapped += 1 if alignment.mate_unmapped
    end
    # Float division: a window with no counted alignments gives 0.0/0.0, i.e. NaN.
    arr << [win_start, win_mates_unmapped / win_tot]
  end
  @snp_positions = arr
end
511
+
512
+ private
513
+ #Returns true if the passed Bio::DB::Sam passes the quality criteria
514
def alignment_passes(aln)
  # An alignment passes only when ALL of these hold: it is not flagged as failing quality, it is
  # paired, its mate is mapped, and its mapping quality reaches the samtools :q threshold.
  # (The previous one-liner mixed `not`, `&&` and `and`; because of their differing precedence the
  # mapq and pairing tests were negated as a group and so never rejected a read on their own.)
  return false if aln.failed_quality
  return false unless aln.is_paired
  return false if aln.mate_unmapped
  min_mapq = @opts[:samtools][:q] || 0 # no :q given means no mapping-quality floor
  aln.mapq >= min_mapq
end
517
+
518
+ public
519
+ #Returns array of arrays <tt>[[window start position, proportion of alignments > insert size]]</tt>.
520
+ #Does this by taking successive windows across reference and collects the proportion of the reads in that window
521
+ #that have an insert size > the expected insert size. Proportions approaching 1 indicate that the sequenced organism has a deletion in that section, proportions approaching 0 indicate an insertion in that section, proportions around 0.5 indicate random variation of insert size, IE no indel.
522
+ #
523
+ #Each section should be approximately the size of the insertion you expect to find and should increment in as small steps as possible.
524
+ #
525
+ #Options and defaults:
526
+ #- <tt>:ref_window_size => 200</tt> width of window in which to calculate proportions
527
+ #- <tt>:ref_window_slide => 50</tt> number of bases to move window in each step
528
+ #- <tt>:isize => 150</tt> expected insert size
529
+ #
530
+ #Sets the instance variable @snp_positions. Only gets positions the first time it is called, in subsequent calls pre-computed positions and statistics are returned, so changing parameters has no effect
531
def get_insert_size_frequency(options={})
  window_opts = @opts[:insert_size_opts].merge(options)
  # Cached result wins: once @snp_positions is set the options are ignored.
  return @snp_positions if @snp_positions
  return nil unless @file.instance_of?(Bio::DB::Sam)
  get_insert_size_frequency_from_bam(window_opts)
end
538
+
539
+ #Returns array of arrays <tt>[[window start position, proportion of reads with unmapped mates]]</tt>.
540
+ #Does this by taking successive windows across reference and counting the reads with unmapped mates
541
+ #Proportions approaching 0.5 indicate that the sequenced organism has an insertion in that section, proportions approaching 0 indicate nothing different in that section.
542
+ #
543
+ #Each section should be approximately the size of the insertion you expect to find and should increment in as small steps as possible.
544
+ #
545
+ #Options and defaults:
546
+ #- <tt>:ref_window_size => 200</tt> width of window in which to calculate proportions
547
+ #- <tt>:ref_window_slide => 50</tt> number of bases to move window in each step
548
+ #
549
+ #Sets the instance variable @snp_positions. Only gets positions the first time it is called, in subsequent calls pre-computed positions and statistics are returned, so changing parameters has no effect
550
def get_unmapped_mate_frequency(options={})
  window_opts = @opts[:insert_size_opts].merge(options)
  # Cached result wins: once @snp_positions is set the options are ignored.
  return @snp_positions if @snp_positions
  return nil unless @file.instance_of?(Bio::DB::Sam)
  get_unmapped_mate_frequency_from_bam(window_opts)
end
557
+
558
+
559
+
560
+ public
561
+ #Draws a histogram of polymorphism frequencies across the reference genome section defined in Bio::Util::Gngm#initialize with bin width +bin_width+ and writes it to a PNG file +file+
562
def frequency_histogram(file="myfile.png", bin_width=@opts[:histo_bin_width], opts=@opts[:graphics])
  # Only the positions (first element of each [position, statistic] pair) are plotted.
  positions = self.snp_positions.map { |pair| pair.first }
  r = new_r
  r.eval "suppressMessages ( library(ggplot2) )" #setup R environment...
  r.posns = positions
  [
    "data = data.frame(position=posns)",
    "png('#{file}', width=#{opts[:width]}, height=#{opts[:height]})",
    "qplot(position,data=data, geom='histogram', binwidth = #{bin_width}, alpha=I(1/3), main='#{file}', color='red')",
    "dev.off()"
  ].each { |command| r.eval(command) }
  r.quit
end
574
+
575
+ #Returns contents of @threads, an array of arrays <tt>[[window 1, snp position 1, snp position 2 ... snp position n],[window 2, snp position 1, snp position 2 ... snp position n] ]</tt>.
576
+ #If @threads is nil (because snps have not yet been gathered into threads) the Bio::Util::Gngm#collect_threads method is called and @threads is set before returning
577
+ #
578
+ #Options and defaults:
579
+ #- <tt>:start => 0.2</tt> -first window
580
+ #- <tt>:stop => 1.0</tt> -last window
581
+ #- <tt>:slide => 0.01</tt> -distance between windows
582
+ #- <tt>:size => 0.1</tt> -window width
583
+ public
584
def threads(opts=@opts[:threads])
  # Return the cached threads when present; otherwise build and cache them.
  return @threads if @threads
  @threads = collect_threads(opts)
end
587
+
588
+ public
589
+ #Resets contents of instance variable @threads and returns an array of arrays <tt>[[window 1, snp position 1, snp position 2 ... snp position n],[window 2, snp position 1, snp position 2 ... snp position n] ]</tt>.
590
+ #Always sets @threads regardless of whether it contains anything or not so is useful for trying out different window sizes etc
591
+ #
592
+ #Options and defaults:
593
+ #- <tt>:start => 0.2</tt> -first window
594
+ #- <tt>:stop => 1.0</tt> -last window
595
+ #- <tt>:slide => 0.01</tt> -distance between windows
596
+ #- <tt>:size => 0.1</tt> -window width
597
def collect_threads(options={})
  settings = @opts[:threads].merge(options)
  settings[:slide] = 0.000001 if settings[:slide] < 0.000001 ##to allow for the rounding error in the step function...
  raise RuntimeError, "snp positions have not been calculated yet" if not @snp_positions
  first_win = settings[:start].to_f
  last_win = settings[:stop].to_f
  slide = settings[:slide].to_f
  width = settings[:size].to_f
  # One entry per window: [window, positions whose statistic lies in window <= stat < window + width]
  @threads = (first_win..last_win).step(slide).map do |win|
    members = @snp_positions.select { |snp| snp.last >= win && snp.last < win + width }
    [win, members.map { |snp| snp.first }]
  end
end
608
+
609
+ private
610
#Returns the value of @density_max_y or, if nil, calls Bio::Util::Gngm#calculate_density_max_y to work out the maximum y axis value for plots
def density_max_y
  # The previous body called get_density_max_y, which is not defined; calculate_density_max_y
  # is the method that computes (and stores) the value.
  @density_max_y ||= calculate_density_max_y
end
615
+
616
+ private
617
def calculate_density_max_y
  # Largest y value over all density curves (element 2 of each entry), never below 0.0.
  @density_max_y = self.densities.inject(0.0) do |highest, density|
    peak = density[2].max
    peak > highest ? peak : highest
  end
end
625
+
626
+ public
627
+ #Draws the threads in a single PNG file +file+
628
+ #
629
+ #Options and defaults
630
+ #- <tt>:draw_legend => nil</tt> -if a filename is provided a legend will be drawn in a second plot
631
+ #- <tt>:width => 1000</tt> -width of the PNG in pixels
632
+ #- <tt>:height => 500</tt> -height of the PNG in pixels
633
def draw_threads(file="myfile.png", options={})
  opts = @opts[:graphics].merge(options)
  #uses R's standard plot functions.. needed because ggplot can die unexpectedly...
  raise RuntimeError, "Can't draw threads until clustering is done" unless @clusters
  r = new_r
  begin
    r.eval "png('#{file}', width=#{opts[:width]}, height=#{opts[:height]})"
    plot_open = false
    self.densities.each do |t|
      r.dx = t[1]
      r.dy = t[2]
      colour = @thread_colours[t.first]
      if plot_open
        # later threads are added as lines to the plot opened by the first thread
        r.eval "lines(dx,dy, col=\"#{colour}\", xlab='position', ylab='density')"
      else
        r.eval "plot(dx,dy, type=\"l\", col=\"#{colour}\",ylim=c(0,#{density_max_y}), main='#{file}',xlab='position', ylab='density')"
        plot_open = true
      end
    end
    r.eval "dev.off()"
    if opts[:draw_legend]
      # :draw_legend holds the filename of a second PNG that receives only the legend
      r.eval "png('#{opts[:draw_legend]}', width=#{opts[:width]}, height=#{opts[:height]})"
      sorted = @thread_colours.sort
      colours = sorted.collect {|x| x.last}.join("','")
      names = sorted.collect {|x| x.first}.join("','")
      # Single quotes for the empty R strings: with bare "" Ruby concatenated adjacent literals
      # and sent plot(1,xlab=,ylab=,axes=FALSE) to R.
      r.eval "plot(1,xlab='',ylab='',axes=FALSE)"
      r.eval "legend('top', c('#{names}'), lty=c(1),lwd=c(1),col=c('#{colours}'), ncol=4)"
      r.eval "dev.off()"
    end
  ensure
    r.quit # always shut the R session down, even when a command raises
  end
end
662
+
663
+
664
+ public
665
+ #Returns the instance variable @densities array of arrays <tt>[window, [density curve x values], [density curve y values] ]</tt>. The R function +density()+ is used to calculate the values. If @densities is nil when called this method will run the Bio::Util::Gngm#calculate_densities method and set @densities
666
+ #With this method you cannot recalculate the densities after they have been done once.
667
+ #
668
+ #Options and defaults
669
+ #- <tt>adjust = 1</tt>, -the kernel adjustment parameter for the R +density+ function
670
def densities(adjust=1)
  # Cached densities are returned as-is; +adjust+ only matters on the first calculation.
  return @densities if @densities
  @densities = calculate_densities(adjust)
end
673
+
674
+ public
675
+ #Sets and returns the array of arrays <tt>[window, [density curve x values], [density curve y values] ]</tt> Calculates the density curve using the R function +density()+ Always sets @densities regardless of whether it contains anything or not so is useful for trying out adjustment values.
676
+ #Ignores threads with fewer than 2 polymorphisms since density can't be computed with so few polymorphisms.
677
+ #
678
+ #Options and defaults
679
+ #- <tt>adjust = 1</tt>, -the kernel adjustment parameter for the R +density+ function
680
def calculate_densities(adjust=1)
  r = new_r
  ##length of density array is smaller or == threads, since too small windows are ignored...
  usable = self.threads.reject { |thread| thread.last.length < 2 }
  curves = usable.map do |thread|
    r.curr_win = thread.last
    # every curve is evaluated over the same x range: first to last polymorphism position
    r.eval "d = density(curr_win,n=240,kernel=\"gaussian\", from=#{@snp_positions.first[0]}, to=#{@snp_positions.last[0]}, adjust=#{adjust})"
    [thread.first, r.pull("d$x"), r.pull("d$y")]
  end
  r.quit
  @densities = curves
  calculate_density_max_y ##need to re-do every time we get new densities
  curves
end
694
+
695
+
696
+ public
697
+ #Draws the clustered bands that correspond to the expected and control window in a single PNG file +file+
698
+ #
699
+ #Options and defaults
700
+ #- <tt>:add_lines => nil</tt> -if an array of positions is provided eg +[100,345] , vertical lines will be drawn at these positions. Useful for indicating feature positions on the plot
701
+ #- <tt>:width => 1000</tt> -width of the PNG in pixels
702
+ #- <tt>:height => 500</tt> -height of the PNG in pixels
703
def draw_bands(file="myfile.png", optsa={})
  opts = @opts[:graphics].merge(optsa)
  raise RuntimeError, "Can't draw bands until clustering is done" unless @clusters
  # Both bands are needed for the filtering and the legend below; they are nil when clustering
  # was done in a way that does not set them (put_threads_into_individual_clusters leaves them nil).
  unless @control_band && @expected_band
    raise RuntimeError, "Can't draw bands: the control and expected bands have not been set"
  end
  #uses R's standard plot functions.
  ##same as draw_threads, but skips threads that aren't on the bands lists
  r = new_r
  begin
    r.eval "png('#{file}', width=#{opts[:width]}, height=#{opts[:height]})"
    plot_open = false
    self.densities.each do |t|
      next unless @control_band.include?(t[0]) || @expected_band.include?(t[0])
      r.dx = t[1]
      r.dy = t[2]
      colour = @thread_colours[t.first]
      if plot_open
        r.eval "lines(dx, dy, col=\"#{colour}\")"
      else
        r.eval "plot(dx, dy, type=\"l\", col=\"#{colour}\",ylim=c(0,#{density_max_y}), main='#{file}',xlab='position', ylab='density')"
        plot_open = true
      end
    end
    label1 = "Control band: " + @control_band.min.to_s + " < ChD < " + @control_band.max.to_s
    label2 = "Expected band: " + @expected_band.min.to_s + " < ChD < " + @expected_band.max.to_s
    r.eval "legend('top', c('#{label1}','#{label2}'), lty=c(1,1),lwd=c(2.5,2.5),col=c('#{@thread_colours[@control_band.first]}','#{@thread_colours[@expected_band.first]}'))"
    if opts[:add_lines] and opts[:add_lines].instance_of?(Array)
      # optional vertical marker lines, e.g. at known feature positions
      opts[:add_lines].each do |pos|
        r.eval "abline(v=#{pos})"
      end
    end
    r.eval "dev.off()"
  ensure
    r.quit # always shut the R session down, even when a command raises
  end
end
738
+
739
+ public
740
+ #Returns the array instance variable @clusters. The R function +kmeans()+ is used to calculate the clusters based on a correlation matrix of the density curves. If @clusters is nil when called this method will run the Bio::Util::Gngm#calculate_clusters method and set @clusters
741
+ #With this method you cannot recalculate the clusters after they have been done once.
742
+ #
743
+ #Options and defaults
744
+ #- <tt>:k => 9</tt>, -the number of clusters for the R +kmeans+ function
745
+ #- <tt>:seed => false</tt> -set this to a number to make the randomized clustering reproducible
746
+ #- <tt>:control_chd => 0.5</tt> -the value of the control thread/window
747
+ #- <tt>:expected_chd => 1.0</tt> -the value of the expected thread/window
748
+ #- <tt>:adjust => 1.0</tt> -the kernel adjustment parameter for the R +density+ function
749
def clusters(opts={})
  # calculate_clusters stores its result in @clusters itself and its return value is not the
  # cluster array, so call it for its side effect only and then hand back @clusters.
  # (Previously the call was written calculate_clusters(opts={}), which reset the caller's
  # options to {} and then overwrote @clusters with that method's return value.)
  calculate_clusters(opts) unless @clusters
  @clusters
end
752
+
753
+ public
754
+ #Calculates the k-means clusters of density curves (groups threads into bands) using the R function +kmeans()+. Recalculates @densities as it does with Bio::Util::Gngm#calculate_densities, so clustering can be done without having to explicitly call Bio::Util::Gngm#calculate_densities.
755
+ #Clusters are recalculated every time, regardless of whether clustering has been done before, so is useful for trying out different values for the parameters. When clusters are calculated the expected and control
756
+ #bands are compared with the Bio::Util::Gngm#calculate_signal method and the @signal array populated. Resets the instance variables @control_band, @expected_band, @signal, @peak_indices, @peak_y_values and @clusters
757
+ #
758
+ #Options and defaults
759
+ #- <tt>:k => 9</tt>, -the number of clusters for the R +kmeans+ function
760
+ #- <tt>:seed => false</tt> -set this to a number to make the randomized clustering reproducible
761
+ #- <tt>:control_chd => 0.5</tt> -the value of the control thread/window
762
+ #- <tt>:expected_chd => 1.0</tt> -the value of the expected thread/window
763
+ #- <tt>:adjust => 1.0</tt> -the kernel adjustment parameter for the R +density+ function
764
+ #- <tt>:pseudo => false</tt> - force the densities into a single thread cluster when the number of distinct threads with SNPs is < the value of k. This is only useful in a situation where the spread of the statistic is very limited. EG for using mapped/unmapped mate pairs then almost all windows will have proportion 1.0 but a tiny number will be close to 0.5 with few other values considered.
765
def calculate_clusters( opts={} )
  settings = {:k => 9, :seed => false, :adjust => 1, :control_chd => 0.5, :expected_chd => 1.0, :pseudo => false}.merge(opts)
  if settings[:pseudo]
    put_threads_into_individual_clusters(settings)
    return nil
  end
  r = new_r
  # everything derived from a previous clustering is stale once we re-cluster
  @control_band = nil
  @expected_band = nil
  @signal = nil
  @peak_indices = nil
  @peak_y_values = nil
  ##although windows go in in numeric order, r wont allow numbers as names in data frames so we need a proxy
  column = "a"
  columns = []
  self.calculate_densities(settings[:adjust]).each do |density|
    r.assign column, density.last
    columns << "#{column}=#{column}"
    column = column.succ
  end
  r.eval "data = data.frame(" + columns.join(",") + ")"
  r.eval "set.seed(#{settings[:seed]})" if settings[:seed]
  r.eval "k = kmeans(cor(data),#{settings[:k]},nstart=1000)"
  @clusters = r.pull "k$cluster" ##clusters are returned in the order in densities
  r.quit
  ##now set the cluster colours.. each newly seen cluster takes the next palette entry, wrapping round
  palette = %w[#A6CEE3 #1F78B4 #B2DF8A #33A02C #FB9A99 #E31A1C #FDBF6F #FF7F00 #CAB2D6 #6A3D9A #FFFF99 #B15928]
  cluster_colours = {} ##hash of cluster numbers and colours
  @clusters.each_with_index do |cluster, i|
    cluster_colours[cluster] ||= palette[cluster_colours.size % palette.size]
    @thread_colours[self.densities[i].first] = cluster_colours[cluster]
  end
  @control_band = get_band(settings[:control_chd])
  @expected_band = get_band(settings[:expected_chd])
  calculate_signal
end
808
+
809
+ private
810
+ ##gives each window/thread a seperate and arbitrary cluster, used when you suspect the statistic will not spread across all possible windows very well. Wont specifiy @control_band or @expected_band and therefore wont directly calculate the signal
811
def put_threads_into_individual_clusters(options)
  # everything derived from a previous clustering is stale once we re-cluster
  @control_band = @expected_band = @signal = @peak_indices = @peak_y_values = nil
  self.calculate_densities(options[:adjust])
  # cluster numbers are simply 1..n, one per density curve
  @clusters = (1..@densities.length).to_a
  ##now set the cluster colours.. every cluster is distinct, so colours just cycle through the palette
  palette = %w[#A6CEE3 #1F78B4 #B2DF8A #33A02C #FB9A99 #E31A1C #FDBF6F #FF7F00 #CAB2D6 #6A3D9A #FFFF99 #B15928]
  @clusters.each_index do |i|
    @thread_colours[self.densities[i].first] = palette[i % palette.length]
  end
  #@control_band, @expected_band and the signal are deliberately left unset here
end
835
+
836
+ ##returns an array of the names of the window threads in the control (heterophasic) band
837
+ #def control_band(control=0.5)
838
+ # puts "in control band with control = #{control}"
839
+ # @control_band ||= get_band(control)
840
+ #end
841
+
842
+ ##returns an array of the names of the window threads in the expected (homophasic) band
843
+ #def expected_band(expected=1.0)
844
+ # @expected_band ||= get_band(expected)
845
+ #end
846
+
847
+ ##gets an array of windows that cluster with a given window
848
+ public
849
def get_band(window=1.0)
  ##because of the weird step rounding error the user-facing window value has to be mapped to the
  ##internal window name; several may match, only the first is used
  matches = find_window(window)
  raise RuntimeError, "Couldnt find window #{window}, or window has no data to calculate: \n windows are #{self.densities.collect {|d| d.first} }" if matches.empty?
  # cluster that the requested window belongs to
  target_cluster = self.clusters[find_index(matches.first)]
  ##the band is every window whose cluster equals that one
  member_indices = self.clusters.each_index.select { |i| self.clusters[i] == target_cluster }
  member_indices.map { |i| self.densities[i].first }
end
866
+
867
+ public
868
+ #Draws the contents of the @signal instance variable in a single PNG file +file+
869
# Plots the signal curve as a line graph and writes it to a PNG file.
#
# +file+:: path of the PNG to create; also used as the plot title.
# +opts+:: hash providing +:width+ and +:height+ for the PNG device
#          (defaults to the +:graphics+ entry of @opts).
#
# The x axis values are the second element of the first densities entry and
# the y axis values come from #signal.
#
# NOTE(review): +file+ is interpolated into R source inside single quotes, so
# a path containing a single quote will break the generated R command.
def draw_signal(file="myfile.png", opts=@opts[:graphics])
  r = new_r
  r.eval "png('#{file}', width=#{opts[:width]}, height=#{opts[:height]})"
  r.x_vals = densities[0][1]
  r.signal = signal
  r.eval "plot(x_vals,signal, type=\"l\", xlab='position', ylab='ratio of signals (expected / control ~ homo / hetero)', main='#{file}' )"
  r.eval "dev.off()"
ensure
  # Always shut the R session down, even when plotting fails, so that R
  # sessions do not pile up.
  r.quit if r
end
878
+
879
+ private
880
# Placeholder with an empty body: does no work and returns nil.
def print_signal; end
882
+
883
+ public
884
+ #Returns the positions of the peaks in the signal curve calculated by Bio::Util::Gngm#get_peaks as an array
885
# Returns the positions of the peaks found in the signal curve.
#
# Each stored peak index is looked up in the x values (the second element of
# the first densities entry) and the value found is floored to an Integer.
# When no peak indices are stored yet, get_peaks is run first with its
# default options, so calling this before any peak calculation no longer
# fails on nil.
#
# NOTE(review): the indices come from R (spec$pos) and are used here as Ruby
# array indices; confirm whether they are 0- or 1-based.
def peaks
  get_peaks if @peak_indices.nil?
  x_vals = densities[0][1]
  @peak_indices.map { |i| x_vals[i].to_f.floor }
end
888
+
889
+ public
890
+ #Draws the peaks calculated from the signal curve by the R function +SpectrumSearch+ (package +Peaks+) in Bio::Util::Gngm#get_peaks. Adds boxes of width +:range+ to each peak and annotates the limits. Options are set in the global options hash +:peaks+
891
+ #and relate to the Peaks function in R
892
# Calculates the peaks of the signal curve and draws them to a PNG file.
#
# +file+:: path of the PNG to create; also used as the plot title.
# +opts+:: options laid over the +:peaks+ entry of @opts. The combined hash
#          must provide +:width+, +:height+ and +:range+, plus whatever
#          get_peaks reads.
#
# The peak y values are plotted as a line. For every peak index a translucent
# box of total width +:range+ is centred on the peak and its left and right
# limits are written above the curve.
#
# The combined options are built with a non-destructive merge, so the
# +:peaks+ entry of @opts is left untouched.
def draw_peaks(file="myfile.png",opts=@opts[:graphics])
  opts = @opts[:peaks].merge(opts)
  # get_peaks is private, so it is called without an explicit receiver.
  get_peaks(opts)
  half_range = opts[:range] / 2

  r = new_r
  r.x_vals = densities[0][1]
  r.eval "png('#{file}', width=#{opts[:width]}, height=#{opts[:height]})"
  r.y = @peak_y_values
  r.eval "plot(x_vals,y, type=\"l\", xlab='position', ylab='Peaks', main='#{file}' )"
  @peak_indices.each do |peak|
    r.eval "rect(x_vals[#{peak}]-(#{half_range}), 0, x_vals[#{peak}]+#{half_range}, max(y), col=rgb(r=0,g=1,b=0, alpha=0.3) )"
    r.eval "text(x_vals[#{peak}]-(#{half_range}),max(y) + 0.05, floor(x_vals[#{peak}]-(#{half_range})) )"
    r.eval "text(x_vals[#{peak}]+(#{half_range}), max(y) + 0.05, floor(x_vals[#{peak}]+(#{half_range})) )"
  end
  r.eval "dev.off()"
ensure
  # Shut the R session down even when drawing fails part-way through.
  r.quit if r
end
918
+
919
+ private
920
+ #Calculates the position of peaks in the signal curve
921
# Finds the peaks of the signal curve with the SpectrumSearch function of the
# R package 'Peaks' and stores the result.
#
# +opts+:: hash providing +:sigma+, +:threshold+, +:background+,
#          +:iterations+, +:markov+ and +:window+, all passed straight to
#          SpectrumSearch (defaults to the +:peaks+ entry of @opts).
#          +:background+ and +:markov+ are upper-cased into R logical
#          literals (TRUE / FALSE). The hash itself is not modified.
#
# Sets @peak_indices (always an Array, taken from spec$pos) and
# @peak_y_values (spec$y).
def get_peaks(opts=@opts[:peaks])
  background = opts[:background].to_s.upcase
  markov = opts[:markov].to_s.upcase

  r = new_r
  r.eval "suppressMessages ( library('Peaks') )"
  r.signal = signal
  r.x_vals = densities[0][1]
  r.eval "spec = SpectrumSearch(signal,#{opts[:sigma]},threshold=#{opts[:threshold]},background=#{background},iterations=#{opts[:iterations]},markov=#{markov},window=#{opts[:window]})"
  # A single peak may come back as a bare number instead of an Array; Array()
  # wraps it regardless of its numeric class, and turns nil into [].
  @peak_indices = Array(r.pull("spec$pos"))
  @peak_y_values = r.pull "spec$y"
ensure
  # Shut the R session down even when the search fails.
  r.quit if r
end
936
+
937
+ public
938
+ #Returns an array with the number of polymorphisms in each thread/window: <tt>[[window, polymorphism count]]</tt>. Useful for sparse polymorphism counts or over small regions, where small polymorphism counts can cause artificially large peaks in density curves.
939
# Returns one <tt>[window, count]</tt> pair per thread, where +window+ is the
# first element of the thread and +count+ is the length of its last element.
def hit_count
  threads.map { |thread| [thread.first, thread.last.length] }
end
946
+
947
+ public
948
+ #Draws a barplot of the number of polymorphisms in each thread/window in a single PNG file +file+
949
# Draws a log-scale barplot of the number of hits in each thread/window and
# writes it to a PNG file.
#
# +file+:: path of the PNG to create; also appended to the plot title.
# +opts+:: hash providing +:width+ and +:height+ for the PNG device
#          (defaults to the +:graphics+ entry of @opts).
def draw_hit_count(file="myfile.png",opts=@opts[:graphics])
  windows = threads
  wins = windows.map { |thread| thread.first }
  # An empty thread is given the pseudo-value 0.01: the y axis is
  # logarithmic, so a true zero would break the barplot.
  hits = windows.map { |thread| thread.last.empty? ? 0.01 : thread.last.length }

  r = new_r
  r.wins = wins
  r.hits = hits
  r.eval "png('#{file}', width=#{opts[:width]}, height=#{opts[:height]})"
  r.eval "barplot(hits, names.arg=c(wins), xlab='window', log='y', ylab='number of hits', main='Number of Polymorphisms #{file}', col=rgb(r=0,g=1,b=1, alpha=0.3), na.rm = TRUE)"
  r.eval "dev.off()"
ensure
  # Always shut the R session down so that R sessions do not pile up.
  r.quit if r
end
967
+
968
+ public
969
+ #Returns an array of values representing the ratio of average of the expected threads/windows to the control threads/windows. Sets @signal, the signal curve.
970
# Calculates the signal curve: at each position, the mean density of the
# windows in the expected band divided by the mean density of the windows in
# the control band. The arithmetic is done in R.
#
# Stores the result in @signal and returns it.
#
# Raises RuntimeError when @control_band or @expected_band is unset, or when
# no density entry belongs to one of the two bands (a ratio cannot be formed).
def calculate_signal
  if @control_band.nil? || @expected_band.nil?
    raise RuntimeError, "control and expected bands must be set before the signal can be calculated"
  end

  r = new_r
  control_names = []
  expected_names = []
  densities.each_with_index do |d, i|
    column_list = if @control_band.include?(d.first)
                    control_names
                  elsif @expected_band.include?(d.first)
                    expected_names
                  end
    next unless column_list

    # R does not accept bare numbers as data frame column names, so each
    # window gets a proxy name. "w<index>" can never collide with an R
    # reserved word such as "if" or "in".
    name = "w#{i}"
    r.assign name, d.last
    column_list << "#{name}=#{name}"
  end

  if control_names.empty? || expected_names.empty?
    raise RuntimeError, "no windows with density data found in the control or expected band"
  end

  r.eval "control = data.frame(" + control_names.join(",") + ")"
  r.eval "control_mean = apply(control, 1, function(ecks) mean((as.numeric(ecks))) )"
  r.eval "expected = data.frame(" + expected_names.join(",") + ")"
  r.eval "expected_mean = apply(expected, 1, function(ecks) mean((as.numeric(ecks))) )"
  r.eval "signal = expected_mean / control_mean"
  @signal = r.pull "signal"
ensure
  # Shut the R session down even when the calculation fails.
  r.quit if r
end
998
+
999
+ public
1000
# Returns the signal curve. The first call runs calculate_signal and caches
# the result in @signal; later calls return the cached value.
def signal
  return @signal if @signal
  @signal = calculate_signal
end
1003
+
1004
+
1005
+
1006
+ ##finds the index of a window in the densities array
1007
+ private
1008
# Returns the position in the densities array of the first entry whose first
# element equals +window+, or nil when there is no such entry.
def find_index(window)
  densities.each_with_index do |entry, position|
    return position if entry.first == window
  end
  nil
end
1011
+
1012
+ private
1013
+ #finds the windows internal name, taking into account the Ruby rounding error
1014
# Returns the stored window names that match +number+: either exactly equal
# to it or within ERROR_MARGIN of it on either side (bounds inclusive).
# Window names are the first element of each densities entry; the result is
# an Array in densities order and may be empty.
def find_window(number)
  lower = number - ERROR_MARGIN
  upper = number + ERROR_MARGIN
  densities.map { |d| d.first }.select do |name|
    name == number || (name >= lower && name <= upper)
  end
end
1017
+
1018
+ private
1019
+ #Returns a new rinruby object
1020
# Creates and returns a fresh RinRuby session with R warnings switched off
# (options(warn=-1)).
#
# The two positional arguments passed to RinRuby.new are both false; going by
# the names previously written at this call site they stand for +echo+ and
# +interactive+.
# NOTE(review): confirm the argument order against the installed RinRuby.
def new_r
  r = RinRuby.new(false, false)
  r.eval "options(warn=-1)"
  r
end
1025
+
1026
+
1027
+ end
1028
+ end
1029
+ end