BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,419 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # rubocop: disable ClassLength
30
+
31
+ # == Write tabular output from the stream.
32
+ #
33
+ # Description
34
+ #
35
+ # +write_table+ writes tabular output from the stream.
36
+ #
37
+ # == Usage
38
+ # write_table([keys: <string> | skip: <string>][, output: <file>[, force:
39
+ # <bool>[, header: <bool>[, pretty: <bool>[, commify: <bool>
40
+ # [, delimiter: <string>[, first: <uint> | last: <uint>
41
+ # [, gzip: <bool>, [bzip2: <bool>]]]]]]]]]
42
+ #
43
+ # === Options
44
+ # * keys <string> - Comma separated list of keys to print in that order.
45
+ # * skip <string> - Comma separated list of keys to skip printing.
46
+ # * output <file> - Output file.
47
+ # * force <bool> - Force overwrite existing output file.
48
+ # * header <bool> - Output header.
49
+ # * pretty <bool> - Pretty print table.
50
+ # * commify <bool> - Commify numbers when pretty printing.
51
+ # * delimiter <string> - Specify delimiter (default="\t").
52
+ # * first <uint> - Only output +first+ number of rows.
53
+ # * last <uint> - Only output +last+ number of rows.
54
+ # * gzip <bool> - Write gzipped output file.
55
+ # * bzip2 <bool> - Write bzipped output file.
56
+ #
57
+ # == Examples
58
+ #
59
+ # Consider the following records in the stream:
60
+ #
61
+ # {ORGANISM: Human
62
+ # COUNT: 23524
63
+ # SEQ: ATACGTCAG},
64
+ # {ORGANISM: Dog
65
+ # COUNT: 2442
66
+ # SEQ: AGCATGAC},
67
+ # {ORGANISM: Mouse
68
+ # COUNT: 234
69
+ # SEQ: GACTG},
70
+ # {ORGANISM: Cat
71
+ # COUNT: 2342
72
+ # SEQ: AAATGCA}
73
+ #
74
+ # To write all records from the stream as a table, do:
75
+ #
76
+ # write_table()
77
+ #
78
+ # Human 23524 ATACGTCAG
79
+ # Dog 2442 AGCATGAC
80
+ # Mouse 234 GACTG
81
+ # Cat 2342 AAATGCA
82
+ #
83
+ # If you supply the +header+ option, then the first row in the table will be a
84
+ # 'header' line prefixed with a '#':
85
+ #
86
+ # write_table(header: true)
87
+ #
88
+ # #ORGANISM COUNT SEQ
89
+ # Human 23524 ATACGTCAG
90
+ # Dog 2442 AGCATGAC
91
+ # Mouse 234 GACTG
92
+ # Cat 2342 AAATGCA
93
+ #
94
+ # You can also change the delimiter from the default (tab) to e.g. ',':
95
+ #
96
+ # write_table(delimiter: ',')
97
+ #
98
+ # Human,23524,ATACGTCAG
99
+ # Dog,2442,AGCATGAC
100
+ # Mouse,234,GACTG
101
+ # Cat,2342,AAATGCA
102
+ #
103
+ # If you want the values output in a specific order you have to supply a comma
104
+ # separated list using the +keys+ option that will print only those keys in
105
+ # that order:
106
+ #
107
+ # write_table(keys: [:SEQ, :COUNT])
108
+ #
109
+ # ATACGTCAG 23524
110
+ # AGCATGAC 2442
111
+ # GACTG 234
112
+ # AAATGCA 2342
113
+ #
114
+ # Keys in the format V0, V1, V2 ... Vn are automagically sorted numerically.
115
+ #
116
+ # Alternatively, if you have some keys that you don't want in the tabular
117
+ # output, use the +skip+ option. So to print all keys except SEQ
118
+ # do:
119
+ #
120
+ # write_table(skip: [:SEQ])
121
+ #
122
+ # Human 23524
123
+ # Dog 2442
124
+ # Mouse 234
125
+ # Cat 2342
126
+ #
127
+ # And if you want a pretty printed table use the +pretty+ option and throw in
128
+ # the +commify+ option if you want commified numbers:
129
+ #
130
+ # write_table(pretty: true, header: true, commify: true)
131
+ #
132
+ # +----------+--------+-----------+
133
+ # | ORGANISM | COUNT | SEQ |
134
+ # +----------+--------+-----------+
135
+ # | Human | 23,524 | ATACGTCAG |
136
+ # | Dog | 2,442 | AGCATGAC |
137
+ # | Mouse | 234 | GACTG |
138
+ # | Cat | 2,342 | AAATGCA |
139
+ # +----------+--------+-----------+
140
+ #
141
+ # To write a table to a file 'test.tab':
142
+ #
143
+ # write_table(output: "test.tab")
144
+ #
145
+ # To write a table to a file 'test.tab' with only the first 3 rows:
146
+ #
147
+ # write_table(output: "test.tab", first: 3)
148
+ #
149
+ # To write a table to a file 'test.tab' with only the last 3 rows:
150
+ #
151
+ # write_table(output: "test.tab", last: 3)
152
+ #
153
+ # To overwrite output file if this exists use the +force+ option:
154
+ #
155
+ # write_table(output: "test.tab", force: true)
156
+ #
157
+ # To write gzipped output to a file 'test.tab.gz':
158
+ #
159
+ # write_table(output: "test.tab.gz", gzip: true)
160
+ #
161
+ # To write bzipped output to a file 'test.tab.bz2':
162
+ #
163
+ # write_table(output: "test.tab.bz2", bzip2: true)
164
+ class WriteTable
165
+ require 'set'
166
+ require 'terminal-table'
167
+
168
+ STATS = %i(records_in records_out)
169
+
170
+ # Constructor for WriteTable.
171
+ #
172
+ # @param options [Hash] Options hash.
173
+ # @option options [Array] :keys
174
+ # @option options [Array] :skip
175
+ # @option options [String] :output
176
+ # @option options [Boolean] :force
177
+ # @option options [Boolean] :header
178
+ # @option options [Boolean] :pretty
179
+ # @option options [Boolean] :commify
180
+ # @option options [String] :delimiter
181
+ # @option options [Fixnum] :first
182
+ # @option options [Fixnum] :last
183
+ # @option options [Boolean] :gzip
184
+ # @option options [Boolean] :bzip2
185
+ #
186
+ # @return [WriteTable] Class instance.
187
+ def initialize(options)
188
+ @options = options
189
+ check_options
190
+ @options[:delimiter] ||= "\t"
191
+ @compress = choose_compression
192
+ @headings = nil
193
+ @header = @options[:header] ? true : false
194
+ @last = []
195
+ @rows = []
196
+ end
197
+
198
+ # Return command lambda for write_table.
199
+ #
200
+ # @return [Proc] Command lambda.
201
+ def lmb
202
+ lambda do |input, output, status|
203
+ status_init(status, STATS)
204
+
205
+ if @options[:output]
206
+ Filesys.open(@options[:output], 'w', compress: @compress) do |tab_out|
207
+ write_table(input, output, tab_out)
208
+ end
209
+ else
210
+ write_table(input, output, $stdout)
211
+ end
212
+ end
213
+ end
214
+
215
+ private
216
+
217
+ # Check options.
218
+ def check_options
219
+ options_allowed(@options, :keys, :skip, :output, :force, :header, :pretty,
220
+ :commify, :delimiter, :first, :last, :gzip, :bzip2)
221
+ options_unique(@options, :keys, :skip)
222
+ options_unique(@options, :first, :last)
223
+ options_unique(@options, :gzip, :bzip2)
224
+ options_allowed_values(@options, force: [nil, true, false])
225
+ options_allowed_values(@options, header: [nil, true, false])
226
+ options_tie(@options, commify: :pretty)
227
+ options_conflict(@options, delimiter: :pretty)
228
+ options_allowed_values(@options, pretty: [nil, true, false],
229
+ commify: [nil, true, false],
230
+ gzip: [nil, true, false],
231
+ bzip2: [nil, true, false])
232
+ options_tie(@options, gzip: :output, bzip2: :output)
233
+ options_files_exist_force(@options, :output)
234
+ end
235
+
236
+ # Choose compression to use which can either be gzip or bzip2 or no
237
+ # compression.
238
+ #
239
+ # @return [Symbol,nil] Compression.
240
+ def choose_compression
241
+ if @options[:gzip]
242
+ :gzip
243
+ elsif @options[:bzip2]
244
+ :bzip2
245
+ end
246
+ end
247
+
248
+ # Write table from records read from the input stream and emit records
249
+ # to the output stream and table rows to the tab_out IO.
250
+ #
251
+ # @param input [Enumerator] Input stream.
252
+ # @param output [Enumerator::Yielder] Output stream.
253
+ # @param tab_out [IO,STDOUT] Output to file or stdout.
254
+ def write_table(input, output, tab_out)
255
+ input.each_with_index do |record, i|
256
+ @status[:records_in] += 1
257
+
258
+ compile_headings(record) unless @headings
259
+
260
+ row = record.values_at(*@headings)
261
+
262
+ if @options[:pretty]
263
+ @rows << row
264
+ else
265
+ output_row(tab_out, row, i)
266
+ end
267
+
268
+ if output
269
+ output << record
270
+ @status[:records_out] += 1
271
+ end
272
+ end
273
+
274
+ @options[:pretty] ? output_pretty(tab_out) : output_last(tab_out)
275
+ end
276
+
277
+ # Compile a list of headings to be used with the output table.
278
+ #
279
+ # @param record [Hash] BioDSL record.
280
+ def compile_headings(record)
281
+ @headings = if @options[:keys]
282
+ @options[:keys].map(&:to_sym)
283
+ elsif record.keys.first =~ /^V\d+$/
284
+ sort_keys(record)
285
+ else
286
+ record.keys
287
+ end
288
+
289
+ skip_headings if @options[:skip]
290
+ end
291
+
292
+ # Sort keys in the form V[0-9]+ on the numerical part in ascending order.
293
+ def sort_keys(record)
294
+ record.keys.sort do |a, b|
295
+ a.to_s[1..a.to_s.size].to_i <=> b.to_s[1..a.to_s.size].to_i
296
+ end
297
+ end
298
+
299
+ # Output row.
300
+ #
301
+ # @param tab_out [IO,STDOUT] Table output IO.
302
+ # @param row [Array] Row to output
303
+ # @param i [Fixnum] Row number
304
+ def output_row(tab_out, row, i)
305
+ output_header(tab_out) if @header
306
+
307
+ return if row.compact.empty?
308
+
309
+ if @options[:first]
310
+ process_first(tab_out, row, i)
311
+ elsif @options[:last]
312
+ process_last(row)
313
+ else
314
+ tab_out.puts row.join(@options[:delimiter])
315
+ end
316
+ end
317
+
318
+ # Output header to the given IO if the +header+ flag is set.
319
+ #
320
+ # @param tab_out [IO,STDOUT] Table output IO.
321
+ def output_header(tab_out)
322
+ unless @headings.compact.empty?
323
+ tab_out.puts '#' + @headings.join(@options[:delimiter])
324
+ end
325
+
326
+ @header = false
327
+ end
328
+
329
+ # Output row to IO if row is among the first number requested.
330
+ #
331
+ # @param tab_out [IO,STDOUT] Table output IO.
332
+ # @param row [Array] Row with table data.
333
+ # @param i [Integer] Row number.
334
+ def process_first(tab_out, row, i)
335
+ return unless i < @options[:first]
336
+ tab_out.puts row.join(@options[:delimiter])
337
+ end
338
+
339
+ # Add row to last buffer and adjust the size of the buffer to the number of
340
+ # rows requested.
341
+ #
342
+ # @param row [Array] Row with table data.
343
+ def process_last(row)
344
+ @last << row
345
+ @last.shift if @last.size > @options[:last]
346
+ end
347
+
348
+ # Skip headings according to the specified options.
349
+ def skip_headings
350
+ skip = @options[:skip].each_with_object(Set.new) { |e, a| a << e.to_sym }
351
+ @headings.reject! { |r| skip.include? r }
352
+ end
353
+
354
+ # Output data rows as pretty printed table.
355
+ #
356
+ # @param tab_out [IO,STDOUT] Table output IO.
357
+ def output_pretty(tab_out)
358
+ return unless @options[:pretty]
359
+
360
+ table = Terminal::Table.new
361
+
362
+ unless @rows.empty?
363
+ table.headings = @headings if @options[:header]
364
+ commify if @options[:commify]
365
+ fill_table(table)
366
+ align_columns(table)
367
+ end
368
+
369
+ tab_out.puts table
370
+ end
371
+
372
+ # Insert commas in large numbers for readability.
373
+ def commify
374
+ @rows.each do |row|
375
+ row.each_with_index do |cell, i|
376
+ if cell.is_a? Integer
377
+ row[i] = cell.to_i.commify
378
+ elsif cell.is_a? Float
379
+ row[i] = cell.to_f.commify
380
+ end
381
+ end
382
+ end
383
+ end
384
+
385
+ # Fill terminal table with data.
386
+ #
387
+ # @param table [Terminal::Table] Table to be pretty printed.
388
+ def fill_table(table)
389
+ table.rows = if @options[:first]
390
+ @rows.first(@options[:first])
391
+ elsif @options[:last]
392
+ @rows.last(@options[:last])
393
+ else
394
+ @rows
395
+ end
396
+ end
397
+
398
+ # Iterate over the first row in the given table to be pretty printed and
399
+ # determine the alignment of each column.
400
+ #
401
+ # @param table [Terminal::Table] Table to be pretty printed.
402
+ def align_columns(table)
403
+ @rows.first.each_with_index do |cell, i|
404
+ next unless cell.is_a?(Fixnum) ||
405
+ cell.is_a?(Float) ||
406
+ cell.delete(',') =~ /^[0-9]+$/
407
+
408
+ table.align_column(i, :right)
409
+ end
410
+ end
411
+
412
+ # Output last table rows.
413
+ #
414
+ # @param tab_out [IO,STDOUT] Table output IO.
415
+ def output_last(tab_out)
416
+ @last.each { |row| tab_out.puts(row.join(@options[:delimiter])) }
417
+ end
418
+ end
419
+ end
@@ -0,0 +1,167 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Write aligned sequences from stream as a tree.
30
+ #
31
+ # Description
32
+ #
33
+ # +write_tree+ takes aligned sequences from the stream and uses FastTree to to
34
+ # create a distance tree between the sequences. The tree is in Newick format.
35
+ # FastTree must be installed.
36
+ #
37
+ # For more about the FastTree here:
38
+ #
39
+ # http://www.microbesonline.org/fasttree/
40
+ #
41
+ # == Usage
42
+ # write_tree([output: <file>[, force: <bool>[, type: <string>]]])
43
+ #
44
+ # === Options
45
+ # * output <file> - Output file.
46
+ # * force <bool> - Force overwrite existing output file.
47
+ # * type <string> - Sequence type :dna|:rna|:protein (default=:dna).
48
+ #
49
+ # == Examples
50
+ #
51
+ # To create a tree from aligned FASTA sequences in the file `align.fna` do:
52
+ #
53
+ # BP.new.
54
+ # read_fasta(input: "align.fna").
55
+ # write_tree(output: "align.tree").
56
+ # run
57
+ class WriteTree
58
+ require 'open3'
59
+ require 'BioDSL/helpers/aux_helper'
60
+
61
+ include AuxHelper
62
+
63
+ STATS = %i(records_in records_out sequences_in residues_in)
64
+
65
+ # Constructor for WriteTree.
66
+ #
67
+ # @param options [Hash] Options hash.
68
+ # @option options [String] :output
69
+ # @option options [Boolean] :force
70
+ # @option options [Symbol] :type
71
+ #
72
+ # @return [WriteTree] Class instance.
73
+ def initialize(options)
74
+ @options = options
75
+
76
+ aux_exist('FastTree')
77
+ check_options
78
+
79
+ @cmd = compile_command
80
+ end
81
+
82
+ # rubocop: disable Metrics/AbcSize
83
+ # rubocop: disable MethodLength
84
+
85
+ # Return command lambda for write_tree.
86
+ #
87
+ # @return [Proc] Command lambda.
88
+ def lmb
89
+ lambda do |input, output, status|
90
+ status_init(status, STATS)
91
+
92
+ Open3.popen3(@cmd) do |stdin, stdout, stderr, wait_thr|
93
+ input.each_with_index do |record, i|
94
+ @status[:records_in] += 1
95
+
96
+ write_seq(stdin, record, i) if record[:SEQ]
97
+
98
+ output << record && @status[:records_out] += 1 if output
99
+ end
100
+
101
+ stdin.close
102
+
103
+ tree_data = stdout.read.chomp
104
+
105
+ stdout.close
106
+
107
+ exit_status = wait_thr.value
108
+
109
+ fail stderr.read unless exit_status.success?
110
+
111
+ write_tree(tree_data)
112
+ end
113
+ end
114
+ end
115
+
116
+ # rubocop: enable Metrics/AbcSize
117
+ # rubocop: enable MethodLength
118
+
119
+ private
120
+
121
+ # Check options.
122
+ def check_options
123
+ options_allowed(@options, :force, :output, :type)
124
+ options_allowed_values(@options, type: [:dna, :rna, :protein])
125
+ options_files_exist_force(@options, :output)
126
+ end
127
+
128
+ # Compile command for running FastTree.
129
+ #
130
+ # @return [String] FastTree command.
131
+ def compile_command
132
+ cmd = []
133
+ cmd << 'FastTree'
134
+ cmd << '-nt' unless @options[:type] == :protein
135
+ cmd << '-quiet' unless BioDSL.verbose
136
+ cmd.join(' ')
137
+ end
138
+
139
+ # Write a record with sequence to stdin.
140
+ #
141
+ # @param stdin [IO] Open3 IO.
142
+ # @param record [Hash] BioDSL record.
143
+ # @param i [Integer] Record index.
144
+ def write_seq(stdin, record, i)
145
+ entry = BioDSL::Seq.new_bp(record)
146
+ entry.seq_name ||= i
147
+
148
+ @status[:sequences_in] += 1
149
+ @status[:residues_in] += entry.length
150
+
151
+ stdin.puts entry.to_fasta
152
+ end
153
+
154
+ # Write tree data to file or stdout.
155
+ #
156
+ # @param tree_data [String] Tree data in Newick format.
157
+ def write_tree(tree_data)
158
+ if @options[:output]
159
+ File.open(@options[:output], 'w') do |ios|
160
+ ios.puts tree_data
161
+ end
162
+ else
163
+ puts tree_data
164
+ end
165
+ end
166
+ end
167
+ end
@@ -0,0 +1,31 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # http://www.gnu.org/copyleft/gpl.html #
16
+ # #
17
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
18
+ # #
19
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+
23
+ module BioDSL
24
+ # Module that requires all files in the BioDSL/commands/ directory
25
+ module Commands
26
+ Dir[File.join(File.dirname(__FILE__), 'commands', '*')].each do |file|
27
+ require file.split(File::SEPARATOR)[-3..-1].join(File::SEPARATOR).
28
+ chomp('.rb')
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,55 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # Module with Config constants.
30
+ module Config
31
+ require 'parallel'
32
+ require 'BioDSL/helpers/options_helper'
33
+
34
+ extend OptionsHelper
35
+
36
+ HISTORY_FILE = File.join(ENV['HOME'], '.BioDSL_history')
37
+ LOG_FILE = File.join(ENV['HOME'], '.BioDSL_log')
38
+ RC_FILE = File.join(ENV['HOME'], '.BioDSLrc')
39
+ STATUS_PROGRESS_INTERVAL = 0.1 # update progress every n second.
40
+
41
+ options = options_load_rc({}, :pipeline)
42
+
43
+ TMP_DIR = if options && !options[:tmp_dir].empty?
44
+ options[:tmp_dir].first
45
+ else
46
+ Dir.tmpdir
47
+ end
48
+
49
+ CORES_MAX = if options && !options[:processor_count].empty?
50
+ options[:processor_count].first.to_i
51
+ else
52
+ Parallel.processor_count
53
+ end
54
+ end
55
+ end