BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,282 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # rubocop:disable LineLength
29
+ module BioDSL
30
+ # == Plot matches from the stream as a dotplot.
31
+ #
32
+ # +plot_matches+ is used to create dotplots of matches in the stream.
33
+ # plot_matches uses Q_BEG, Q_END, S_BEG, S_END from the stream. If strand
34
+ # information is available either by a STRAND key with the value '+' or '-',
35
+ # or by a DIRECTION key with the value 'forward' or 'reverse' then forward
36
+ # matches will be output in green and reverse matches in red (in all
37
+ # terminals except +dumb+).
38
+ #
39
+ # Default graphics are crufty ASCII and you probably want high resolution
40
+ # postscript or SVG output instead, which is easy using the +terminal+ option.
41
+ # Plotting is done using GNUplot which allows for different types of output.
42
+ #
43
+ # GNUplot must be installed for plot_matches to work. Read more here:
44
+ #
45
+ # http://www.gnuplot.info/
46
+ #
47
+ # == Usage
48
+ #
49
+ # plot_matches([direction: <string>[, output: <file>[, force: <bool>
50
+ # [, terminal: <string>[, title: <string>[, xlabel: <string>
51
+ # [, ylabel: <string>[, test: <bool>]]]]]]]])
52
+ #
53
+ # === Options
54
+ #
55
+ # * direction: <string> - Plot matches from forward|reverse|both direction(s)
56
+ # (default=both).
57
+ # * output: <file> - Output file.
58
+ # * force: <bool> - Force overwrite existing output file.
59
+ # * terminal: <string> - Terminal for output: dumb|post|svg|x11|aqua|png|pdf
60
+ # (default=dumb).
61
+ # * title: <string> - Plot title (default="Matches").
62
+ # * xlabel: <string> - X-axis label (default="x").
63
+ # * ylabel: <string> - Y-axis label (default="y").
64
+ # * test: <bool> - Output Gnuplot script instead of plot.
65
+ #
66
+ # == Examples
67
+ #
68
+ # Here we plot two matches from a table. The vector records are shown in the
69
+ # +dump+ output:
70
+ #
71
+ # BP.new.read_table(input: "test.tab").dump.plot_matches.run
72
+ #
73
+ # {:Q_BEG=>0, :Q_END=>10, :S_BEG=>0, :S_END=>10, :STRAND=>"+"}
74
+ # {:Q_BEG=>0, :Q_END=>10, :S_BEG=>0, :S_END=>10, :STRAND=>"-"}
75
+ #
76
+ # Matches
77
+ # + + + + + +
78
+ # 10 +>>>-----------+-------------+------------+-------------+----------->>>+
79
+ # | >>>> : : : : >>>> |
80
+ # | >>>> : : : : >>>> |
81
+ # 8 ++..........>>>>>......................................>>>>>..........++
82
+ # | : >>>> : : >>>> : |
83
+ # | : >>>> : : >>>> : |
84
+ # 6 ++.......................>>>>>............>>>>>.......................++
85
+ # | : :>>>> >>>>: : |
86
+ # | : : >>>> : : |
87
+ # | : :>>>> >>>>: : |
88
+ # 4 ++.......................>>>>>............>>>>>.......................++
89
+ # | : >>>> : : >>>> : |
90
+ # | : >>>> : : >>>> : |
91
+ # 2 ++..........>>>>>......................................>>>>>..........++
92
+ # | >>>> : : : : >>>> |
93
+ # | >>>> : : : : >>>> |
94
+ # 0 +>>>-----------+-------------+------------+-------------+----------->>>+
95
+ # + + + + + +
96
+ # 0 2 4 6 8 10
97
+ # x
98
+ #
99
+ # To render X11 output (i.e. instant view) use the +terminal+ option:
100
+ #
101
+ # plot_matches(terminal: :x11).run
102
+ #
103
+ # To generate a PNG image and save to file:
104
+ #
105
+ # plot_matches(terminal: :png, output: "plot.png").run
106
+ #
107
+ # rubocop:disable ClassLength
108
+ # rubocop:enable LineLength
109
+ class PlotMatches
110
+ require 'gnuplotter'
111
+ require 'BioDSL/helpers/aux_helper'
112
+
113
+ include AuxHelper
114
+
115
+ STATS = %i(records_in records_out matches_in)
116
+
117
+ # Constructor for PlotMatches.
118
+ #
119
+ # @param options [Hash] Options hash.
120
+ # @option options [Symbol] :direction
121
+ # @option options [String] :output
122
+ # @option options [Boolean] :force
123
+ # @option options [Symbol] :terminal
124
+ # @option options [String] :title
125
+ # @option options [String] :xlabel
126
+ # @option options [String] :ylabel
127
+ # @option options [Boolean] :test
128
+ #
129
+ # @return [PlotMatches] Class instance.
130
+ def initialize(options)
131
+ @options = options
132
+ @gp = nil
133
+ @style1 = {using: '1:2:3:4', with: 'vectors nohead ls 1'}
134
+ @style2 = {using: '1:2:3:4', with: 'vectors nohead ls 2'}
135
+
136
+ aux_exist('gnuplot')
137
+ check_options
138
+ defaults
139
+ end
140
+
141
+ # Return lambda for command plot_matches.
142
+ #
143
+ # @return [Proc] Command lambda.
144
+ def lmb
145
+ lambda do |input, output, status|
146
+ status_init(status, STATS)
147
+
148
+ @gp = GnuPlotter.new
149
+ plot_defaults
150
+
151
+ @gp.add_dataset(@style1) do |forward|
152
+ @gp.add_dataset(@style2) do |reverse|
153
+ input.each do |record|
154
+ @status[:records_in] += 1
155
+
156
+ plot_match(forward, reverse, record)
157
+
158
+ process_output(output, record)
159
+ end
160
+ end
161
+ end
162
+
163
+ plot_output
164
+ end
165
+ end
166
+
167
+ private
168
+
169
+ # Check options.
170
+ def check_options
171
+ options_allowed(@options, :direction, :output, :force, :terminal, :title,
172
+ :xlabel, :ylabel, :test)
173
+ options_allowed_values(@options, direction: [:forward, :reverse, :both])
174
+ options_allowed_values(@options, terminal: [:dumb, :post, :svg, :x11,
175
+ :aqua, :png, :pdf])
176
+ options_allowed_values(@options, test: [nil, true, false])
177
+ options_files_exist_force(@options, :output)
178
+ end
179
+
180
+ # Set default options.
181
+ def defaults
182
+ @options[:direction] ||= :both
183
+ @options[:terminal] ||= :dumb
184
+ @options[:title] ||= 'Matches'
185
+ @options[:xlabel] ||= 'x'
186
+ @options[:ylabel] ||= 'y'
187
+ end
188
+
189
+ # Set plot default attributes.
190
+ def plot_defaults
191
+ @gp.set terminal: @options[:terminal].to_s
192
+ @gp.set title: @options[:title]
193
+ @gp.set xlabel: @options[:xlabel]
194
+ @gp.set ylabel: @options[:ylabel]
195
+ @gp.set autoscale: 'xfix'
196
+ @gp.set autoscale: 'yfix'
197
+ @gp.set style: 'fill solid 0.5 border'
198
+ @gp.set xtics: 'border out'
199
+ @gp.set ytics: 'border out'
200
+ @gp.set grid: :true
201
+ @gp.set nokey: :true
202
+ @gp.set style: 'line 1 linetype 1 linecolor rgb "green" linewidth ' \
203
+ '2 pointtype 6 pointsize default'
204
+ @gp.set style: 'line 2 linetype 1 linecolor rgb "red" linewidth ' \
205
+ '2 pointtype 6 pointsize default'
206
+ end
207
+
208
+ # Add match data to forward or reverse dataset.
209
+ #
210
+ # @param forward [GnuPlotter::DataSet] Forward matches.
211
+ # @param reverse [GnuPlotter::DataSet] Reverse matches.
212
+ # @param record [Hash] BioDSL record.
213
+ def plot_match(forward, reverse, record)
214
+ return unless record[:Q_BEG] && record[:Q_END] &&
215
+ record[:S_BEG] && record[:S_END]
216
+ @status[:matches_in] += 1
217
+
218
+ q_len = record[:Q_END] - record[:Q_BEG]
219
+ s_len = record[:S_END] - record[:S_BEG]
220
+
221
+ plot_match_strand(forward, reverse, record, q_len, s_len)
222
+ plot_match_direction(forward, reverse, record, q_len, s_len)
223
+ end
224
+
225
+ # Add match data to forward or reverse dataset depending on match strand.
226
+ #
227
+ # @param forward [GnuPlotter::DataSet] Forward matches.
228
+ # @param reverse [GnuPlotter::DataSet] Reverse matches.
229
+ # @param record [Hash] BioDSL record.
230
+ # @param q_len [Integer] Length of query match.
231
+ # @param s_len [Integer] Length of subject match.
232
+ def plot_match_strand(forward, reverse, record, q_len, s_len)
233
+ return unless record[:STRAND]
234
+
235
+ if record[:STRAND] == '+'
236
+ forward << [record[:Q_BEG], record[:S_BEG], q_len, s_len]
237
+ else
238
+ reverse << [record[:Q_END], record[:S_BEG], -1 * q_len, s_len]
239
+ end
240
+ end
241
+
242
+ # Add match data to forward or reverse dataset depending on match direction.
243
+ #
244
+ # @param forward [GnuPlotter::DataSet] Forward matches.
245
+ # @param reverse [GnuPlotter::DataSet] Reverse matches.
246
+ # @param record [Hash] BioDSL record.
247
+ # @param q_len [Integer] Length of query match.
248
+ # @param s_len [Integer] Length of subject match.
249
+ def plot_match_direction(forward, reverse, record, q_len, s_len)
250
+ return unless record[:DIRECTION]
251
+
252
+ if record[:DIRECTION] == 'forward'
253
+ forward << [record[:Q_BEG], record[:S_BEG], q_len, s_len]
254
+ else
255
+ reverse << [record[:Q_END], record[:S_BEG], -1 * q_len, s_len]
256
+ end
257
+ end
258
+
259
+ # Output plot data
260
+ def plot_output
261
+ @gp.set output: @options[:output] if @options[:output]
262
+
263
+ if @options[:test]
264
+ $stderr.puts @gp.to_gp
265
+ elsif @options[:terminal] == :dumb
266
+ puts @gp.plot
267
+ else
268
+ @gp.plot
269
+ end
270
+ end
271
+
272
+ # Emit record to output stream if defined.
273
+ #
274
+ # @param output [Enumerator::Yielder] Output stream.
275
+ # @param record [Hash] BioDSL record.
276
+ def process_output(output, record)
277
+ return unless output
278
+ output << record
279
+ @status[:records_out] += 1
280
+ end
281
+ end
282
+ end
@@ -0,0 +1,278 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Plot the residue distribution of sequences in the stream.
30
+ #
31
+ # +plot_residue_distribution+ creates a residue distribution plot per sequence
32
+ # position of sequences in the stream. Plotting is done using GNUplot which
33
+ # allows for different types of output, the default one being crufty ASCII
34
+ # graphics.
35
+ #
36
+ # If plotting distributions from sequences of variable length you can use the
37
+ # +count+ option to co-plot the relative count at each base position. This
38
+ # allows you to explain areas with a skewed distribution.
39
+ #
40
+ # GNUplot must be installed for +plot_residue_distribution+ to work. Read more
41
+ # here:
42
+ #
43
+ # http://www.gnuplot.info/
44
+ #
45
+ # == Usage
46
+ #
47
+ # plot_residue_distribution([count: <bool>[, output: <file>
48
+ # [, force: <bool> [, terminal: <string>
49
+ # [, title: <string>[, xlabel: <string>
50
+ # [, ylabel: <string>[, test: <bool>]]]]]]]])
51
+ #
52
+ # === Options
53
+ #
54
+ # * count: <bool> - Plot relative count (default=false).
55
+ # * output: <file> - Output file.
56
+ # * force: <bool> - Force overwrite existing output file.
57
+ # * terminal: <string> - Terminal for output: dumb|post|svg|x11|aqua|png|pdf
58
+ # (default=dumb).
59
+ # * title: <string> - Plot title (default="Residue Distribution").
60
+ # * xlabel: <string> - X-axis label (default="Sequence position").
61
+ # * ylabel: <string> - Y-axis label (default="%").
62
+ # * test: <bool> - Output Gnuplot script instead of plot.
63
+ #
64
+ # == Examples
65
+ #
66
+ # Here we plot a residue distribution of a FASTA file:
67
+ #
68
+ # BP.new.read_fasta(input: "test.fna").plot_residue_distribution.run
69
+ #
70
+ # rubocop: disable ClassLength
71
+ class PlotResidueDistribution
72
+ require 'gnuplotter'
73
+ require 'set'
74
+ require 'BioDSL/helpers/aux_helper'
75
+
76
+ include AuxHelper
77
+
78
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
79
+ residues_out)
80
+
81
+ # Constructor for PlotResidueDistribution.
82
+ #
83
+ # @param options [Hash] Options hash.
84
+ # @option options [Boolean] :count
85
+ # @option options [String] :output
86
+ # @option options [Boolean] :force
87
+ # @option options [Symbol] :terminal
88
+ # @option options [String] :title
89
+ # @option options [String] :xlabel
90
+ # @option options [String] :ylabel
91
+ # @option options [Boolean] :test
92
+ #
93
+ # @return [PlotResidueDistribution] Class instance.
94
+ def initialize(options)
95
+ @options = options
96
+ @counts = Hash.new { |h, k| h[k] = Hash.new(0) }
97
+ @total = Hash.new(0)
98
+ @residues = Set.new
99
+ @gp = nil
100
+ @offset = Set.new # Hackery thing to offset datasets 1 postion.
101
+
102
+ aux_exist('gnuplot')
103
+ check_options
104
+ defaults
105
+ end
106
+
107
+ # Return command lambda for PlotResidueDistribution.
108
+ #
109
+ # @return [Proc] Command lambda.
110
+ def lmb
111
+ lambda do |input, output, status|
112
+ status_init(status, STATS)
113
+
114
+ input.each do |record|
115
+ @status[:records_in] += 1
116
+
117
+ count_residues(record) if record.key? :SEQ
118
+
119
+ next unless output
120
+ output << record
121
+ @status[:records_out] += 1
122
+
123
+ if record.key? :SEQ
124
+ @status[:sequences_out] += 1
125
+ @status[:residues_out] += record[:SEQ].length
126
+ end
127
+ end
128
+
129
+ plot_create
130
+ plot_output
131
+ end
132
+ end
133
+
134
+ private
135
+
136
+ # Check options.
137
+ def check_options
138
+ options_allowed(@options, :count, :output, :force, :terminal, :title,
139
+ :xlabel, :ylabel, :test)
140
+ options_allowed_values(@options, terminal: [:dumb, :post, :svg, :x11,
141
+ :aqua, :png, :pdf])
142
+ options_allowed_values(@options, count: [nil, true, false])
143
+ options_allowed_values(@options, test: [nil, true, false])
144
+ options_files_exist_force(@options, :output)
145
+ end
146
+
147
+ # Set default options.
148
+ def defaults
149
+ @options[:terminal] ||= :dumb
150
+ @options[:title] ||= 'Residue Distribution'
151
+ @options[:xlabel] ||= 'Sequence position'
152
+ @options[:ylabel] ||= '%'
153
+ end
154
+
155
+ # Given a record with a sequence count its residues.
156
+ #
157
+ # @param record [Hash] BioDSL record
158
+ def count_residues(record)
159
+ @status[:sequences_in] += 1
160
+ @status[:residues_in] += record[:SEQ].length
161
+
162
+ record[:SEQ].upcase.chars.each_with_index do |char, i|
163
+ c = char.to_sym
164
+ @counts[i][c] += 1
165
+ @total[i] += 1
166
+ @residues.add(c)
167
+ end
168
+ end
169
+
170
+ # Create plot.
171
+ def plot_create
172
+ @gp = GnuPlotter.new
173
+ plot_defaults
174
+
175
+ @residues.sort.reverse.each_with_index do |residue, i|
176
+ plot_residue(residue, i)
177
+ end
178
+
179
+ plot_count if @options[:count]
180
+ end
181
+
182
+ # Plot residue data.
183
+ def plot_residue(residue, i)
184
+ @gp.add_dataset(using: 1, with: "histogram lt #{i + 1}",
185
+ title: "\"#{residue}\"") do |plotter|
186
+ @counts.each do |pos, dist|
187
+ plotter << 0.0 unless @offset.include? residue
188
+ plotter << 100 * dist[residue].to_f / @total[pos]
189
+ @offset << residue
190
+ end
191
+ end
192
+ end
193
+
194
+ # Plot count data.
195
+ def plot_count
196
+ max = @total.values.max
197
+ style = {using: '1:2', with: 'lines lw 2 lt rgb "black"',
198
+ title: '"count"'}
199
+
200
+ @gp.add_dataset(style) do |plotter|
201
+ @counts.each_key do |pos|
202
+ plotter << [0, 0.0] unless @offset.include? :count
203
+ plotter << [pos, 100 * @total[pos].to_f / max]
204
+ @offset << :count
205
+ end
206
+ end
207
+ end
208
+
209
+ # Set plot defaults
210
+ #
211
+ # rubocop: disable MethodLength
212
+ def plot_defaults
213
+ @gp.set terminal: @options[:terminal].to_s
214
+ @gp.set title: @options[:title]
215
+ @gp.set xlabel: @options[:xlabel]
216
+ @gp.set ylabel: @options[:ylabel]
217
+ @gp.set output: @options[:output] if @options[:output]
218
+ @gp.set xtics: 'out'
219
+ @gp.set ytics: 'out'
220
+ @gp.set yrange: '[0:100]'
221
+ @gp.set xrange: "[0:#{@counts.size}]"
222
+ @gp.set auto: 'fix'
223
+ @gp.set offsets: '1'
224
+ @gp.set key: 'outside right top vertical Left reverse noenhanced ' \
225
+ 'autotitles columnhead nobox'
226
+ @gp.set key: 'invert samplen 4 spacing 1 width 0 height 0'
227
+ @gp.set style: 'fill solid 0.5 border'
228
+ @gp.set style: 'histogram rowstacked'
229
+ @gp.set style: 'data histograms'
230
+ @gp.set boxwidth: '0.75 absolute'
231
+
232
+ plot_colors
233
+ end
234
+
235
+ # Set plot line colors
236
+ # color scheme: http://en.wikipedia.org/wiki/Help:Distinguishable_colors
237
+ def plot_colors
238
+ @gp.set linetype: '1 lc rgb "#FF0010"' # Red
239
+ @gp.set linetype: '2 lc rgb "#191919"' # Ebony
240
+ @gp.set linetype: '3 lc rgb "#0075DC"' # Blue
241
+ @gp.set linetype: '4 lc rgb "#2BCE48"' # Green
242
+ @gp.set linetype: '5 lc rgb "#FFFF00"' # Yellow
243
+ @gp.set linetype: '6 lc rgb "#4C005C"' # Damson
244
+ @gp.set linetype: '7 lc rgb "#993F00"' # Caramel
245
+ @gp.set linetype: '8 lc rgb "#FFCC99"' # Honeydew
246
+ @gp.set linetype: '9 lc rgb "#808080"' # Iron
247
+ @gp.set linetype: '10 lc rgb "#94FFB5"' # Jade
248
+ @gp.set linetype: '11 lc rgb "#8F7C00"' # Khaki
249
+ @gp.set linetype: '12 lc rgb "#9DCC00"' # Lime
250
+ @gp.set linetype: '13 lc rgb "#C20088"' # Mallow
251
+ @gp.set linetype: '14 lc rgb "#003380"' # Navy
252
+ @gp.set linetype: '15 lc rgb "#FFA405"' # Orpiment
253
+ @gp.set linetype: '16 lc rgb "#FFA8BB"' # Pink
254
+ @gp.set linetype: '17 lc rgb "#426600"' # Quagmire
255
+ @gp.set linetype: '18 lc rgb "#F0A3FF"' # Amethyst
256
+ @gp.set linetype: '19 lc rgb "#5EF1F2"' # Sky
257
+ @gp.set linetype: '20 lc rgb "#00998F"' # Turquoise
258
+ @gp.set linetype: '21 lc rgb "#E0FF66"' # Uranium
259
+ @gp.set linetype: '22 lc rgb "#740AFF"' # Violet
260
+ @gp.set linetype: '23 lc rgb "#990000"' # Wine
261
+ @gp.set linetype: '24 lc rgb "#FFFF80"' # Xanthin
262
+ @gp.set linetype: '25 lc rgb "#005C31"' # Forest
263
+ @gp.set linetype: '26 lc rgb "#FF5005"' # Zinnia
264
+ @gp.set linetype: 'cycle 26'
265
+ end
266
+
267
+ # Output plot data.
268
+ def plot_output
269
+ if @options[:test]
270
+ $stderr.puts @gp.to_gp
271
+ elsif @options[:terminal] == :dumb
272
+ puts @gp.plot
273
+ else
274
+ @gp.plot
275
+ end
276
+ end
277
+ end
278
+ end