BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,239 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Filter rRNA sequences from the stream.
30
+ #
31
+ # Description
32
+ #
33
+ # +filter_rrna+ utilizes +sortmerna+ to identify and filter ribosomal RNA
34
+ # sequences from the stream. The +sortmerna+ and +indexdb_rna+ executables
35
+ # must be installed for +filter_rrna+ to work.
36
+ #
37
+ # Indexed reference files are produced using +indexdb_rna+.
38
+ #
39
+ # For more about sortmerna, look here:
40
+ #
41
+ # http://bioinfo.lifl.fr/RNA/sortmerna/
42
+ #
43
+ # == Usage
44
+ # filter_rrna(ref_fasta: <file(s)>, ref_index: <file(s)>)
45
+ #
46
+ # === Options
47
+ # * ref_fasta <file(s)> - One or more reference FASTA files.
48
+ # * ref_index <file(s)> - One or more index reference files.
49
+ #
50
+ # == Examples
51
+ #
52
+ # To filter all reads matching the SILVA archaea 23S rRNA do:
53
+ #
54
+ # BP.new.
55
+ # read_fastq(input: "reads.fq").
56
+ # filter_rrna(ref_fasta: ["silva-arc-23s-id98.fasta"],
57
+ # ref_index: ["silva-arc-23s-id98.fasta.idx*"]).
58
+ # write_fastq(output: "clean.fq").
59
+ # run
60
+ #
61
+ # rubocop:disable ClassLength
62
+ class FilterRrna
63
+ require 'English'
64
+ require 'set'
65
+ require 'BioDSL/helpers/aux_helper'
66
+
67
+ include AuxHelper
68
+
69
# Names of the status counters handed to +status_init+ by the command lambda.
# Frozen so the shared constant cannot be modified by accident.
STATS = %i(records_in records_out sequences_in sequences_out residues_in
           residues_out).freeze
71
+
72
# Constructor for the FilterRrna class.
#
# Keeps a reference to the options hash, prepares the empty set that will
# later hold the indices of matched sequences, calls +aux_exist+ for the
# 'sortmerna' executable (presumably verifying that it is available) and
# finally validates the options.
#
# @param options [Hash] Options hash.
# @option options [String, Array] :ref_fasta Path(s) to reference FASTA
#   files.
# @option options [String, Array] :ref_index Path(s) to reference index
#   files.
#
# @return [FilterRrna] Class instance of FilterRrna.
def initialize(options)
  @filter  = Set.new
  @options = options

  aux_exist('sortmerna')
  check_options
end
86
+
87
# Build the command lambda for filter_rrna.
#
# When called, the lambda initializes the status counters and then, with
# three temporary paths yielded by TmpDir.create, writes the input to disk,
# runs sortmerna, collects the indices of the matched sequences and emits
# the records through +process_output+.
#
# @return [Proc] Command lambda taking (input, output, status).
def lmb
  ->(input, output, status) do
    status_init(status, STATS)

    TmpDir.create('tmp', 'seq', 'out') do |records_path, reads_path, hits_path|
      refs = process_ref_files

      process_input(input, records_path, reads_path)
      execute_sortmerna(refs, reads_path, hits_path)
      parse_sortme_output(hits_path)
      process_output(output, records_path)
    end
  end
end
103
+
104
+ private
105
+
106
# Check the options hash.
#
# Passes the two supported keys, :ref_fasta and :ref_index, to
# +options_allowed+ and +options_files_exist+ (presumably rejecting unknown
# keys and missing files - confirm against the options helper).
def check_options
  ref_keys = %i(ref_fasta ref_index)

  options_allowed(@options, *ref_keys)
  options_files_exist(@options, *ref_keys)
end
111
+
112
# Given reference index and fasta files in the options hash, process these
# into a string of the format read by 'sortmerna': fasta1,id1:fasta2,id2:...
#
# Each option may be an Array of paths or a String holding one path or
# several comma-separated paths. A trailing '*' is stripped from every index
# path. The strings stored in the options hash are never modified, so frozen
# strings are accepted.
#
# @return [String] Reference file string for sortmerna.
#
# @raise [ArgumentError] if no reference FASTA or index file is given.
def process_ref_files
  to_list = ->(value) { value.is_a?(String) ? value.split(',') : Array(value) }

  fastas  = to_list.call(@options[:ref_fasta])
  # Non-destructive sub: leave the caller's option strings untouched.
  indexes = to_list.call(@options[:ref_index]).map { |f| f.sub(/\*$/, '') }

  if fastas.empty? || indexes.empty?
    fail ArgumentError, 'both ref_fasta and ref_index must be specified'
  end

  # Pair the files positionally: fasta1,index1:fasta2,index2:...
  fastas.zip(indexes).map { |pair| pair.join(',') }.join(':')
end
131
+
132
# Execute 'sortmerna'.
#
# The command is started with the argument-vector form of Kernel#system, so
# no shell is involved and paths containing whitespace or shell
# metacharacters are handed to sortmerna verbatim.
#
# @param ref_files [String] Reference file string for sortmerna.
# @param seq_file [String] Path to input file with reads.
# @param out_file [String] Path to output file.
#
# @raise [RuntimeError] if execution of 'sortmerna' fails.
def execute_sortmerna(ref_files, seq_file, out_file)
  cmd = ['sortmerna',
         '--ref', ref_files.to_s,
         '--reads', seq_file.to_s,
         '--aligned', out_file.to_s,
         '--fastx']
  cmd << '-v' if BioDSL.verbose

  # Joined form is only used for messages; execution uses the argv array.
  cmd_line = cmd.join(' ')

  $stderr.puts "Running command: #{cmd_line}" if BioDSL.verbose

  # system returns true on exit status 0, false on a non-zero status and nil
  # when the command could not be executed at all.
  fail "command failed: #{cmd_line}" unless system(*cmd)
end
155
+
156
# Read the FASTA file written by 'sortmerna' ("<out_file>.fasta") and add
# the integer value of every sequence name to the filter set.
#
# @param out_file [String] Path to output file (without the .fasta suffix).
def parse_sortme_output(out_file)
  hits_path = "#{out_file}.fasta"

  BioDSL::Fasta.open(hits_path, 'r') do |hits|
    hits.each { |hit| @filter << hit.seq_name.to_i }
  end
end
167
+
168
# Walk the input stream once: every record is counted and serialized to
# tmp_file, and every record holding a :SEQ key is additionally written as a
# FASTA entry, named by its position in the stream, to seq_file.
#
# @param input [Enumerator] Input stream.
# @param tmp_file [String] Path to tmp file for serialized records.
# @param seq_file [String] Path to tmp FASTA sequence file.
def process_input(input, tmp_file, seq_file)
  BioDSL::Fasta.open(seq_file, 'w') do |fasta_out|
    File.open(tmp_file, 'wb') do |serial_out|
      BioDSL::Serializer.new(serial_out) do |serializer|
        input.each_with_index do |record, index|
          @status[:records_in] += 1
          serializer << record

          next unless record.key?(:SEQ)

          # FIXME: need << method
          fasta_out.puts record2entry(record, index).to_fasta
        end
      end
    end
  end
end
189
+
190
# Build a sequence entry from a BioDSL record, using the given index as the
# sequence name, and update the sequences_in and residues_in counters.
#
# @param record [Hash] BioDSL record
# @param i [Integer] Index.
#
# @return [BioDSL::Seq] Sequence entry.
def record2entry(record, i)
  BioDSL::Seq.new(seq_name: i, seq: record[:SEQ]).tap do |seq_entry|
    @status[:sequences_in] += 1
    @status[:residues_in] += seq_entry.length
  end
end
203
+
204
# Read the serialized records back from tmp_file and hand each one, together
# with its position in the stream, to +output_record+, which decides whether
# the record is emitted.
#
# @param output [Enumerator::Yielder] Output stream.
# @param tmp_file [String] Path to tmp file with serialized records.
def process_output(output, tmp_file)
  File.open(tmp_file, 'rb') do |serial_in|
    BioDSL::Serializer.new(serial_in) do |serializer|
      serializer.each_with_index do |record, index|
        output_record(output, record, index)
      end
    end
  end
end
218
+
219
# Emit a record to the output stream and update the status counters.
#
# Records without a :SEQ key are always emitted. Records with a :SEQ key are
# emitted only when their index is not in the filter set.
#
# @param output [Enumerator::Yielder] Output stream.
# @param record [Hash] BioDSL record.
# @param i [Integer] Index.
def output_record(output, record, i)
  unless record.key?(:SEQ)
    output << record
    return(@status[:records_out] += 1)
  end

  # Sequence matched by sortmerna: drop it.
  return if @filter.include?(i)

  output << record
  @status[:records_out] += 1
  @status[:sequences_out] += 1
  @status[:residues_out] += record[:SEQ].length
end
238
+ end
239
+ end
@@ -0,0 +1,237 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Genecall sequences in the stream.
30
+ #
31
+ # +Genecall+ predicts genes in prokaryotic single genomes or metagenomes using
32
+ # Prodigal 2.6 which must be installed:
33
+ #
34
+ # http://prodigal.ornl.gov/
35
+ #
36
+ # The records produced are of the type:
37
+ #
38
+ # {:RECORD_TYPE=>"genecall",
39
+ # :S_BEG=>2, :S_END=>109,
40
+ # :S_LEN=>108,
41
+ # :STRAND=>"-",
42
+ # :SEQ_NAME=>"contig1",
43
+ # :SEQ=>"MGKVIGIDLGTTNSCVAVMDGKTAKVIENAEGMRTT",
44
+ # :SEQ_LEN=>36, :SEQ_TYPE=>"protein"}
45
+ #
46
+ # == Usage
47
+ #
48
+ # genecall([type: <string>[, procedure: <string>[, closed_ends: <bool>
49
+ # [, masked: <bool>]]]])
50
+ #
51
+ # === Options
52
+ #
53
+ # * type: <string> - Output dna or protein sequence (default: dna).
54
+ # * procedure: <string> - Single or meta (default: single).
55
+ # * closed_ends: <bool> - Don't allow truncated gene at ends.
56
+ # * masked: <bool> - Ignore stretch of Ns.
57
+ #
58
+ # == Examples
59
+ #
60
+ # To genecall a genome do:
61
+ #
62
+ # BP.new.
63
+ # read_fasta(input: "contigs.fna").
64
+ # genecall.
65
+ # grab(select: "genecall", key: :type, exact: true).
66
+ # write_fasta(output: "genes.fna").
67
+ # run
68
+ #
69
+ # To add genecall data to the sequence name use +merge_values+:
70
+ #
71
+ # BP.new.
72
+ # read_fasta(input: "contigs.fna").
73
+ # genecall(type: "protein").
74
+ # grab(select: "genecall", key: :type, exact: true).
75
+ # merge_values(keys: [:SEQ_NAME, :S_BEG, :S_END, :S_LEN, :STRAND]).
76
+ # write_fasta(output: "genes.faa").
77
+ # run
78
+ class Genecall
79
+ require 'English'
80
+ require 'BioDSL/helpers/aux_helper'
81
+
82
+ include AuxHelper
83
+
84
# Names of the status counters handed to +status_init+ by the command lambda.
# Frozen so the shared constant cannot be modified by accident.
STATS = %i(records_in records_out sequences_in sequences_out residues_in
           residues_out).freeze
86
+
87
# Constructor for the Genecall class.
#
# Keeps a reference to the options hash, prepares the index-to-name lookup
# table, calls +aux_exist+ for the 'prodigal' executable (presumably
# verifying that it is available), fills in default option values, validates
# the options and caches the requested output type as a Symbol.
#
# @param [Hash] options Options hash.
# @option options [Symbol, String] :type of output (dna or protein).
# @option options [Symbol, String] :procedure used for genecalling (single
#   or meta).
# @option options [Boolean] :closed_ends disallow truncated genes at ends.
# @option options [Boolean] :masked ignore stretch of Ns.
#
# @return [Genecall] Returns an instance of the class.
def initialize(options)
  @names   = {}
  @options = options

  aux_exist('prodigal')
  defaults
  check_options

  @type = @options[:type].to_sym
end
106
+
107
# Build the lambda for the genecall command.
#
# When called, the lambda initializes the status counters and then, with
# three temporary paths yielded by TmpDir.create, writes the input sequences
# to a FASTA file, runs Prodigal on it and emits the predicted genes.
#
# @return [Proc] Command lambda taking (input, output, status).
def lmb
  ->(input, output, status) do
    status_init(status, STATS)

    TmpDir.create('i.fa', 'o.fna', 'o.faa') do |fa_path, fna_path, faa_path|
      process_input(input, output, fa_path)
      run_prodigal(fa_path, fna_path, faa_path)
      process_output(output, fna_path, faa_path)
    end
  end
end
121
+
122
+ private
123
+
124
# Run Prodigal on the input file.
#
# The command is started with the argument-vector form of Kernel#system, so
# no shell is involved and paths containing whitespace or shell
# metacharacters are handed to Prodigal verbatim. Unless BioDSL.verbose is
# set, Prodigal is run with -q and its stdout and stderr are discarded.
#
# @param tmp_in [String] Path to input FASTA file.
# @param tmp_fna [String] Path to output FASTA DNA file.
# @param tmp_faa [String] Path to output FASTA Protein file.
#
# @raise [RuntimeError] if execution of 'prodigal' fails; the message is the
#   command line.
def run_prodigal(tmp_in, tmp_fna, tmp_faa)
  quiet = !BioDSL.verbose

  cmd = %w(prodigal -f gff)
  cmd << '-c' if @options[:closed_ends]
  cmd << '-m' if @options[:masked]
  cmd.push('-p', @options[:procedure].to_s)
  cmd.push('-i', tmp_in.to_s)
  cmd.push('-d', tmp_fna.to_s)
  cmd.push('-a', tmp_faa.to_s)
  cmd << '-q' if quiet

  # Human-readable form for messages only; it keeps the historical shell
  # redirect suffix so existing error messages stay the same.
  cmd_line = (quiet ? cmd + ['> /dev/null 2>&1'] : cmd).join(' ')

  $stderr.puts "Running: #{cmd_line}" if BioDSL.verbose

  # system returns true on exit status 0, false on a non-zero status and nil
  # when the command could not be executed at all.
  ok = if quiet
         system(*cmd, [:out, :err] => File::NULL)
       else
         system(*cmd)
       end

  fail cmd_line unless ok
end
149
+
150
# Check the options hash.
#
# Passes the supported keys to +options_allowed+ and the accepted values of
# each key to +options_allowed_values+ (presumably raising on anything else -
# confirm against the options helper). :type and :procedure are accepted as
# either Symbol or String.
def check_options
  types      = %i(dna protein) + %w(dna protein)
  procedures = %w(single meta) + %i(single meta)

  options_allowed(@options, :type, :procedure, :closed_ends, :masked)
  options_allowed_values(@options, type: types)
  options_allowed_values(@options, procedure: procedures)
  options_allowed_values(@options, closed_ends: [nil, true, false])
  options_allowed_values(@options, masked: [nil, true, false])
end
160
+
161
# Fill in default option values: :type falls back to :dna and :procedure to
# :single when they are unset (nil or false).
def defaults
  { type: :dna, procedure: :single }.each do |option, fallback|
    @options[option] ||= fallback
  end

  @options[:procedure]
end
166
+
167
# Walk the input stream once. Every record is counted and passed on to the
# output stream. Records holding a :SEQ key are additionally written to a
# temporary FASTA file, named by their position in the stream; the original
# :SEQ_NAME (or the position when there is none) is remembered in @names.
#
# @param input [Enumerator] input stream.
# @param output [Enumerator::Yielder] Output stream.
# @param fa_in [String] Path to temporary FASTA file.
def process_input(input, output, fa_in)
  BioDSL::Fasta.open(fa_in, 'w') do |fasta_out|
    input.each_with_index do |record, index|
      @status[:records_in] += 1

      if record.key?(:SEQ)
        seq_entry = BioDSL::Seq.new(seq_name: index, seq: record[:SEQ])
        residues  = seq_entry.length

        @names[index] = record[:SEQ_NAME] || index

        @status[:sequences_in]  += 1
        @status[:sequences_out] += 1
        @status[:residues_in]   += residues
        @status[:residues_out]  += residues

        fasta_out.puts seq_entry.to_fasta
      end

      @status[:records_out] += 1
      output << record
    end
  end
end
195
+
196
# Read the Prodigal FASTA output matching the requested type (the DNA file
# for :dna, the protein file otherwise), emit one record per entry to the
# output stream and update the output counters.
#
# @param output [Enumerator::Yielder] Output stream.
# @param tmp_fna [String] Path to output FASTA DNA file.
# @param tmp_faa [String] Path to output FASTA Protein file.
def process_output(output, tmp_fna, tmp_faa)
  path = if @type == :dna
           tmp_fna
         else
           tmp_faa
         end

  BioDSL::Fasta.open(path, 'r') do |fasta_in|
    fasta_in.each do |gene|
      output << parse_entry(gene)

      @status[:records_out] += 1
      @status[:sequences_out] += 1
      @status[:residues_out] += gene.length
    end
  end
end
214
+
215
# Build a genecall record from a Prodigal FASTA entry.
#
# The sequence name is split on ' # '. The first field is expected to start
# with the input sequence index followed by '_', which is used to look up
# the original sequence name in @names. Fields two and three are the begin
# and end positions, stored decremented by one. A fourth field of '1' means
# the '+' strand; anything else is reported as '-'.
#
# @param entry [BioDSL::Seq] Sequence object.
#
# @return [Hash] BioDSL record.
def parse_entry(entry)
  gene_id, first, last, direction = entry.seq_name.split(' # ')

  s_beg = first.to_i - 1
  s_end = last.to_i - 1
  strand = direction == '1' ? '+' : '-'
  source_index = gene_id.split('_').first.to_i

  {
    RECORD_TYPE: 'genecall',
    S_BEG: s_beg,
    S_END: s_end,
    S_LEN: s_end - s_beg + 1,
    STRAND: strand,
    SEQ_NAME: @names[source_index],
    SEQ: entry.seq,
    SEQ_LEN: entry.length,
    SEQ_TYPE: @type.to_s
  }
end
236
+ end
237
+ end