BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,345 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # rubocop:disable ClassLength
30
+
31
+ # == Assemble sequences in the stream using Ray.
32
+ #
33
+ # +assemble_seq_ray+ is a wrapper around the deBruijn graph assembler Ray:
34
+ #
35
+ # http://denovoassembler.sourceforge.net/
36
+ #
37
+ # Any records containing sequence information will be included in the
38
+ # assembly, but only the assembled contig sequences will be output to the
39
+ # stream.
40
+ #
41
+ # The sequence records may contain quality scores, and if the sequence
42
+ # names indicate that the sequences are interleaved paired-end reads, a
43
+ # paired-end assembly will be performed.
44
+ #
45
+ # Kmer values must be odd.
46
+ #
47
+ # == Usage
48
+ #
49
+ # assemble_seq_ray([kmer_min: <uint>[, kmer_max: <uint>
50
+ # [, contig_min: <uint>[, cpus: <uint>]]]])
51
+ #
52
+ # === Options
53
+ #
54
+ # * kmer_min: <uint> - Minimum k-mer value (default: 21).
55
+ # * kmer_max: <uint> - Maximum k-mer value (default: 49).
56
+ # * contig_min: <uint> - Minimum contig size (default: 500).
57
+ # * cpus: <uint> - Number of CPUs to use (default: 1).
58
+ #
59
+ # == Examples
60
+ #
61
+ # If you have two pair-end sequence files with the Illumina data then you
62
+ # can assemble these using +assemble_seq_ray+ like this:
63
+ #
64
+ # BP.new.
65
+ # read_fastq(input: "file1.fq", input2: "file2.fq").
66
+ # assemble_seq_ray.
67
+ # write_fasta(output: "contigs.fna").
68
+ # run
69
+ class AssembleSeqRay
70
+ require 'English'
71
+ require 'BioDSL/helpers/aux_helper'
72
+
73
+ include AuxHelper
74
+
75
# Names of the status counters handed to status_init by this command.
STATS = %i(
  records_in records_out
  sequences_in sequences_out
  residues_in residues_out
  n50 contig_min contig_max kmer
)
77
+
78
# Constructor for the AssembleSeqRay class.
#
# Calls aux_exist for the external programs +Ray+ and +mpiexec+
# (presumably an availability check - see AuxHelper), then fills in the
# default option values and validates the options.
#
# @param options [Hash] Options hash.
# @option options [Integer] :kmer_min   Minimum kmer value.
# @option options [Integer] :kmer_max   Maximum kmer value.
# @option options [Integer] :contig_min Minimum contig size.
# @option options [Integer] :cpus       CPUs to use.
#
# @return [AssembleSeqRay] Returns an instance of the class.
def initialize(options)
  @options, @lengths, @paired = options, [], nil

  %w(Ray mpiexec).each { |program| aux_exist(program) }

  defaults
  check_options
end
96
+
97
# Return a lambda for the AssembleSeqRay command.
#
# The lambda writes the sequence records of the input stream to a
# temporary FASTA file, runs one Ray assembly per kmer and emits the
# scaffolds of the assembly with the highest n50 to the output stream.
#
# @return [Proc] Returns the command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    TmpDir.create('reads.fa') do |fa_in, tmp_dir|
      process_input(input, output, fa_in)
      @paired = paired?(fa_in)

      n50s = run_assemblies(fa_in, tmp_dir)

      # An assembly without any contig of at least :contig_min has a nil
      # n50. Rank it as 0 so nil is never compared with an Integer, which
      # made the former sort_by(&:n50) raise ArgumentError.
      best_kmer = n50s.max_by { |result| result.n50 || 0 }.kmer

      process_output(output, tmp_dir, best_kmer)
    end
  end
end
116
+
117
+ private
118
+
119
# Run one assembly for every second kmer from :kmer_min up to :kmer_max
# and collect the parsed result of each run.
#
# Each assembly is written to its own sub-directory of tmp_dir named
# after the kmer.
#
# @param fa_in   [String] Path to input FASTA file.
# @param tmp_dir [String] Temporary directory path.
#
# @return [Array] List of N50 objects, one per kmer.
def run_assemblies(fa_in, tmp_dir)
  @options[:kmer_min].step(@options[:kmer_max], 2).map do |kmer|
    result_dir = File.join(tmp_dir, kmer.to_s)

    execute_ray(fa_in, result_dir, kmer)
    parse_result(result_dir, kmer)
  end
end
137
+
138
# Check the options.
#
# Besides the per-option range checks, both kmer bounds must be odd and
# :kmer_min must not exceed :kmer_max. Without the latter check the kmer
# range is empty, no assembly is run and the command fails later with an
# unrelated error.
#
# @raise [RuntimeError] if a kmer bound is even or kmer_min > kmer_max.
def check_options
  options_allowed(@options, :kmer_min, :kmer_max, :contig_min, :cpus)
  options_assert(@options, ':kmer_min >= 21')
  options_assert(@options, ':kmer_min <= 255')
  options_assert(@options, ':kmer_max >= 21')
  options_assert(@options, ':kmer_max <= 255')
  options_assert(@options, ':contig_min > 0')
  options_assert(@options, ':cpus >= 1')
  options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")

  assert_uneven(@options, :kmer_min)
  assert_uneven(@options, :kmer_max)

  kmer_min = @options[:kmer_min]
  kmer_max = @options[:kmer_max]

  return if kmer_min <= kmer_max

  fail "kmer_min must not exceed kmer_max - got #{kmer_min} > #{kmer_max}"
end
152
+
153
# Assert that the value stored under a given key of an options hash is
# an odd number.
#
# @param options [Hash]   Options hash.
# @param key     [Symbol] Hash key whose value to check.
#
# @raise [RuntimeError] if the value is even.
def assert_uneven(options, key)
  value = options[key]

  fail "#{key} must be an odd number - not #{value}" if value.even?
end
164
+
165
# Set the default option values for every option that is unset (nil or
# false): kmer_min 21, kmer_max 49, contig_min 500 and cpus 1.
def defaults
  { kmer_min: 21, kmer_max: 49, contig_min: 500 }.each do |key, value|
    @options[key] ||= value
  end

  @options[:cpus] ||= 1
end
172
+
173
# Read all records from input. Records without a :SEQ key are passed on
# to the output stream, records with a :SEQ key are written as FASTA
# entries to a temporary file. The status counters are updated along the
# way.
#
# @param input  [Enumerator]          Input stream.
# @param output [Enumerator::Yielder] Output stream.
# @param fa_in  [String]              Path to temporary FASTA file.
def process_input(input, output, fa_in)
  BioDSL::Fasta.open(fa_in, 'w') do |fasta_io|
    input.each do |record|
      @status[:records_in] += 1

      unless record.key?(:SEQ)
        @status[:records_out] += 1
        output.puts record
        next
      end

      entry = BioDSL::Seq.new_bp(record)

      @status[:sequences_in] += 1
      @status[:residues_in]  += entry.length

      fasta_io.puts entry.to_fasta
    end
  end
end
198
+
199
# Check if the reads in a given FASTA file are paired by inspecting the
# sequence names of the first two entries.
#
# The entries count as paired when BioDSL::Seq.check_name_pair does not
# raise a SeqError for them. A file with fewer than two entries cannot
# hold a pair and is reported as unpaired without consulting
# check_name_pair (assumes next_entry returns nil once the file is
# exhausted - TODO confirm against BioDSL::Fasta).
#
# @param file [String] Path to FASTA file.
#
# @return [Boolean] True if paired else false.
def paired?(file)
  BioDSL::Fasta.open(file, 'r') do |ios|
    entry1 = ios.next_entry
    entry2 = ios.next_entry

    return false unless entry1 && entry2

    begin
      BioDSL::Seq.check_name_pair(entry1, entry2)

      return true
    rescue SeqError
      return false
    end
  end
end
220
+
221
# Execute Ray through a system call.
#
# The command line is echoed to stderr first when BioDSL.verbose is set.
#
# @param fa_in   [String]  Path to input FASTA file.
# @param tmp_dir [String]  Output directory for this run.
# @param kmer    [Integer] Kmer size.
#
# @raise [RuntimeError] with the command line as message if the command
#   does not exit successfully.
def execute_ray(fa_in, tmp_dir, kmer)
  cmd_line = compile_cmd_line(fa_in, tmp_dir, kmer)

  $stderr.puts "Running: #{cmd_line}" if BioDSL.verbose

  system(cmd_line)

  return if $CHILD_STATUS.success?

  fail cmd_line
end
235
+
236
# Compile the command and options for executing Ray under mpiexec.
#
# The reads are passed with -i when @paired is set and with -s
# otherwise, e.g.
#
#   mpiexec -n 6 Ray -k 31 -i interleaved -o output_dir
#   mpiexec -n 6 Ray -k 31 -s single -o output_dir
#
# Unless BioDSL.verbose is set, all output of the command is redirected
# to /dev/null.
#
# @param fa_in   [String]  Path to input FASTA file.
# @param out_dir [String]  Output directory path.
# @param kmer    [Integer] Kmer size.
#
# @return [String] The command line for the Ray system call.
def compile_cmd_line(fa_in, out_dir, kmer)
  reads_flag = @paired ? '-i' : '-s'

  parts = [
    'mpiexec',
    "-n #{@options[:cpus]}",
    'Ray',
    "-k #{kmer}",
    "#{reads_flag} #{fa_in}",
    "-o #{out_dir}"
  ]

  parts << '> /dev/null 2>&1' unless BioDSL.verbose

  parts.join(' ')
end
263
+
264
# Read Scaffolds.fasta from an assembly directory and return an N50
# object for it. Only scaffolds of at least :contig_min residues are
# taken into account.
#
# @param dir  [String]  Path to output dir.
# @param kmer [Integer] Kmer size.
#
# @return [N50] Result object holding the kmer and the computed n50.
def parse_result(dir, kmer)
  scaffolds = File.join(dir, 'Scaffolds.fasta')
  lengths   = []

  BioDSL::Fasta.open(scaffolds) do |ios|
    ios.each do |entry|
      next if entry.length < @options[:contig_min]

      lengths << entry.length
    end
  end

  N50.new(kmer, calc_n50(lengths))
end
281
+
282
# Calculate the n50: the length of the contig at which the running sum
# of the lengths, taken from longest to shortest, first reaches half of
# the total length.
#
# {http://en.wikipedia.org/wiki/N50_statistic}
#
# The given list is left untouched; sorting happens on a copy.
#
# @param lengths [Array] List of contig lengths.
#
# @return [Integer, nil] The n50, or nil for an empty list.
def calc_n50(lengths)
  half    = lengths.inject(0, :+) * 0.50
  running = 0

  # find returns nil when the list is empty.
  lengths.sort.reverse.find { |length| (running += length) >= half }
end
302
+
303
# Read the scaffolds of the best assembly and emit those of at least
# :contig_min residues to the output stream, updating the status
# counters and finally the summary statistics.
#
# @param output [Enumerator::Yielder] Output stream.
# @param dir    [String]              Path to tmp_dir.
# @param kmer   [Integer]             Highest n50 scoring kmer.
def process_output(output, dir, kmer)
  scaffolds = File.join(dir, kmer.to_s, 'Scaffolds.fasta')
  lengths   = []

  BioDSL::Fasta.open(scaffolds, 'r') do |ios|
    ios.each do |entry|
      next if entry.length < @options[:contig_min]

      lengths << entry.length
      output << entry.to_bp

      @status[:records_out]   += 1
      @status[:sequences_out] += 1
      @status[:residues_out]  += entry.length
    end
  end

  add_stats(kmer, lengths)
end
327
+
328
# Add the summary values to the status hash: the chosen kmer, whether
# the reads were paired and - when any contig was emitted - the minimum
# and maximum contig length and the n50.
#
# @param kmer    [Integer] Highest n50 scoring kmer.
# @param lengths [Array]   List of contig lengths.
def add_stats(kmer, lengths)
  @status[:kmer]   = kmer
  @status[:paired] = @paired

  return if lengths.empty?

  @status[:contig_min], @status[:contig_max] = lengths.minmax
  @status[:n50] = calc_n50(lengths)
end
342
+
343
# Result record pairing a kmer size with the n50 of its assembly.
N50 = Struct.new(:kmer, :n50)
344
+ end
345
+ end
@@ -0,0 +1,252 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Assemble sequences in the stream using SPAdes.
30
+ #
31
+ # +assemble_seq_spades+ is a wrapper around the single prokaryotic genome
32
+ # assembler SPAdes:
33
+ #
34
+ # http://bioinf.spbau.ru/spades
35
+ #
36
+ # Any records containing sequence information will be included in the
37
+ # assembly, but only the assembled contig sequences will be output to the
38
+ # stream.
39
+ #
40
+ # The sequence records may contain quality scores, and if the sequence
40
+ # names indicate that the sequences are interleaved paired-end reads, a
41
+ # paired-end assembly will be performed.
43
+ #
44
+ # == Usage
45
+ #
46
+ # assemble_seq_spades([careful: <bool>[, cpus: <uint>[, kmers: <list>]]])
47
+ #
48
+ # === Options
49
+ #
50
+ # * careful: <bool> - Run SPAdes with the careful flag set.
51
+ # * cpus: <uint> - Number of CPUs to use (default: 1).
52
+ # * kmers: <list> - List of kmers to use (default: auto).
53
+ #
54
+ # == Examples
55
+ #
56
+ # If you have two pair-end sequence files with the Illumina data then you
57
+ # can assemble these using assemble_seq_spades like this:
58
+ #
59
+ # BP.new.
60
+ # read_fastq(input: "file1.fq", input2: "file2.fq").
61
+ # assemble_seq_spades(kmers: [55,77,99,127]).
62
+ # write_fasta(output: "contigs.fna").
63
+ # run
64
+ # rubocop:disable ClassLength
65
+ class AssembleSeqSpades
66
+ require 'English'
67
+ require 'BioDSL/helpers/aux_helper'
68
+
69
+ include AuxHelper
70
+
71
# Names of the status counters handed to status_init by this command.
#
# :residues_out has to be listed: process_output increments it and
# calc_n50 reads it. The list previously named :records_out twice and
# lacked :residues_out.
STATS = %i(records_in records_out sequences_in sequences_out residues_in
           residues_out assembled)
73
+
74
# Constructor for the AssembleSeqSpades class.
#
# Calls aux_exist for the external program +spades.py+ (presumably an
# availability check - see AuxHelper), validates the options and then
# fills in the defaults.
#
# @param options [Hash] Options hash.
#
# @option options [Boolean] :careful
#   Flag indicating use of careful assembly.
#
# @option options [Array] :kmers
#   List of kmers to use.
#
# @option options [Integer] :cpus
#   CPUs to use.
#
# @return [AssembleSeqSpades] Returns an instance of the class.
def initialize(options)
  @options, @lengths, @type = options, [], nil

  aux_exist('spades.py')

  check_options
  defaults
end
97
+
98
# Return a lambda for the AssembleSeqSpades command.
#
# The lambda writes the sequence records of the input stream to
# temporary files, runs SPAdes on the FASTQ file if any record carried
# quality scores (else on the FASTA file), emits the resulting scaffolds
# and finally adds the contig statistics to the status.
#
# @return [Proc] Returns the command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    TmpDir.create('reads.fq', 'reads.fa') do |in_fq, in_fa, tmp_dir|
      process_input(in_fq, in_fa, input, output)

      reads = @type == :fastq ? in_fq : in_fa

      execute_spades(reads, tmp_dir)

      scaffolds = File.join(tmp_dir, 'scaffolds.fasta')

      process_output(output, scaffolds)
    end

    calc_n50(status)
  end
end
115
+
116
+ private
117
+
118
# Check the options: only :careful, :cpus and :kmers are allowed,
# :careful must be true, false or nil, and :cpus must lie between 1 and
# BioDSL::Config::CORES_MAX.
def check_options
  options_allowed(@options, :careful, :cpus, :kmers)
  options_allowed_values(@options, careful: [true, false, nil])

  cpus_max = BioDSL::Config::CORES_MAX

  options_assert(@options, ':cpus >= 1')
  options_assert(@options, ":cpus <= #{cpus_max}")
end
125
+
126
# Set default options: :cpus becomes 1 when it is unset (nil or false).
def defaults
  @options[:cpus] || (@options[:cpus] = 1)
end
130
+
131
# Process the input stream: records with a :SEQ key are written to one
# of the two temporary files through write_sequence, all other records
# are passed on to the output stream.
#
# @param in_fq  [String]              Path to FASTQ temp file.
# @param in_fa  [String]              Path to FASTA temp file.
# @param input  [Enumerator]          Input stream.
# @param output [Enumerator::Yielder] Output stream.
def process_input(in_fq, in_fa, input, output)
  BioDSL::Fastq.open(in_fq, 'w') do |io_fq|
    BioDSL::Fasta.open(in_fa, 'w') do |io_fa|
      input.each do |record|
        @status[:records_in] += 1

        unless record.key?(:SEQ)
          @status[:records_out] += 1
          output.puts record
          next
        end

        write_sequence(io_fq, io_fa, record)
      end
    end
  end
end
153
+
154
# Write a sequence record to one of the temporary files: entries with
# quality scores go to the FASTQ stream (and mark the input type as
# :fastq), entries without go to the FASTA stream.
#
# @param io_fq  [BioDSL::Fastq::IO] FASTQ IO stream.
# @param io_fa  [BioDSL::Fasta::IO] FASTA IO stream.
# @param record [Hash]              BioPiece record with sequence.
def write_sequence(io_fq, io_fa, record)
  entry = BioDSL::Seq.new_bp(record)

  @status[:sequences_in] += 1
  @status[:residues_in]  += entry.length

  return io_fa.puts(entry.to_fasta) unless entry.qual

  @type = :fastq
  io_fq.puts entry.to_fastq
end
172
+
173
# Execute spades using a system call.
#
# With BioDSL.verbose set the command line is echoed to stderr and the
# output of the command is left alone; otherwise all output is
# redirected to /dev/null.
#
# @param input_file [String] Path to input file.
# @param tmp_dir    [String] Path to temp dir.
#
# @raise [RuntimeError] if the command does not exit successfully.
def execute_spades(input_file, tmp_dir)
  cmd_line = compile_command(input_file, tmp_dir)
  verbose  = BioDSL.verbose

  $stderr.puts cmd_line if verbose

  system(verbose ? cmd_line : cmd_line + ' > /dev/null 2>&1')

  return if $CHILD_STATUS.success?

  fail "Command failed: #{cmd_line}"
end
191
+
192
# Compile the spades command.
#
# The input file is always passed with --12 and --only-assembler is
# always set; --careful and -k are added only when the :careful and
# :kmers options are given.
#
# @param input_file [String] Path to input file.
# @param tmp_dir    [String] Path to temp dir.
#
# @return [String] A command string for executing Spades.
def compile_command(input_file, tmp_dir)
  kmers = @options[:kmers]

  [
    'spades.py',
    "--12 #{input_file}",
    '--only-assembler',
    ('--careful' if @options[:careful]),
    ("-k #{kmers.join(',')}" if kmers),
    "-t #{@options[:cpus]}",
    "-o #{tmp_dir}"
  ].compact.join(' ')
end
210
+
211
# Process the spades output: emit every entry of the given FASTA file to
# the output stream, update the status counters and remember the entry
# lengths for the n50 calculation.
#
# @param output      [Enumerator::Yielder] Output stream
# @param output_file [String] Path to output FASTA file with contigs.
def process_output(output, output_file)
  BioDSL::Fasta.open(output_file) do |ios|
    ios.each do |entry|
      output << entry.to_bp

      size = entry.length

      @status[:records_out]   += 1
      @status[:sequences_out] += 1
      @status[:residues_out]  += size

      @lengths << size
    end
  end
end
227
+
228
# Calculate the n50 and add it to the status together with the longest
# and the shortest contig length.
#
# {http://en.wikipedia.org/wiki/N50_statistic}
#
# The collected lengths are sorted in place, longest first. The n50 is
# the length at which the running sum first reaches half of
# status[:residues_out]; it is stored under :contig_n50.
#
# @param status [Hash] Status hash.
def calc_n50(status)
  @lengths.sort! { |a, b| b <=> a }

  status[:contig_max], status[:contig_min] = @lengths.first, @lengths.last

  running = 0

  @lengths.each do |length|
    running += length

    next if running < status[:residues_out] * 0.50

    status[:contig_n50] = length
    break
  end
end
251
+ end
252
+ end