BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,171 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Run usearch_local on sequences in the stream.
30
+ #
31
+ # This is a wrapper for the +usearch+ tool to run the program usearch_local.
32
+ # Basically sequence type records are searched against a reference database
33
+ # and records with hit information are output.
34
+ #
35
+ # Please refer to the manual:
36
+ #
37
+ # http://drive5.com/usearch/manual/cmd_usearch_local.html
38
+ #
39
+ # Usearch 7.0 must be installed for +usearch+ to work. Read more here:
40
+ #
41
+ # http://www.drive5.com/usearch/
42
+ #
43
+ # == Usage
44
+ #
45
+ # usearch_local(database: <file>, identity: <float>
46
+ # [, strand: "plus|both"][, cpus: <uint>])
47
+ #
48
+ # === Options
49
+ #
50
+ # * database: <file> - Database to search (in FASTA format).
51
+ # * identity: <float> - Similarity for matching as a fraction greater than
52
+ # 0.0 and at most 1.0.
53
+ # * strand: <string> - For nucleotide search report hits from plus or both
54
+ # strands.
55
+ # * cpus: <uint> - Number of CPU cores to use (default=1).
56
+ #
57
+ # == Examples
58
+ #
59
+ class UsearchLocal
60
+ require 'BioDSL/helpers/aux_helper'
61
+
62
+ include AuxHelper
63
+
64
+ STATS = %i(records_in records_out sequences_in hits_out)
65
+
66
+ # Constructor for UsearchLocal.
67
+ #
68
+ # @param options [Hash] Options hash.
69
+ # @option options [String] :database
70
+ # @option options [Float] :identity
71
+ # @option options [String,Symbol] :strand
72
+ # @option options [Integer] :cpus
73
+ #
74
+ # @return [UsearchLocal] Class instance.
75
+ def initialize(options)
76
+ @options = options
77
+ @options[:cpus] ||= 1
78
+
79
+ aux_exist('usearch')
80
+ check_options
81
+ end
82
+
83
+ # Return command lambda for usearch_local.
84
+ #
85
+ # @return [Proc] Command lambda.
86
+ def lmb
87
+ lambda do |input, output, status|
88
+ status_init(status, STATS)
89
+
90
+ TmpDir.create('in', 'out') do |tmp_in, tmp_out|
91
+ process_input(input, output, tmp_in)
92
+ run_usearch_local(tmp_in, tmp_out)
93
+ process_output(output, tmp_out)
94
+ end
95
+ end
96
+ end
97
+
98
+ private
99
+
100
+ # Check options.
101
+ def check_options
102
+ options_allowed(@options, :database, :identity, :strand, :cpus)
103
+ options_required(@options, :database, :identity)
104
+ options_allowed_values(@options, strand: ['plus', 'both', :plus, :both])
105
+ options_files_exist(@options, :database)
106
+ options_assert(@options, ':identity > 0.0')
107
+ options_assert(@options, ':identity <= 1.0')
108
+ options_assert(@options, ':cpus >= 1')
109
+ options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")
110
+ end
111
+
112
+ # Process input and emit to the output stream while saving all records
113
+ # containing sequences to a temporary FASTA file.
114
+ #
115
+ # @param input [Enumerator] Input stream.
116
+ # @param output [Enumerator::Yielder] Output stream.
117
+ # @param tmp_in [String] Path to temporary file.
118
+ def process_input(input, output, tmp_in)
119
+ BioDSL::Fasta.open(tmp_in, 'w') do |ios|
120
+ input.each_with_index do |record, i|
121
+ @status[:records_in] += 1
122
+
123
+ output << record
124
+
125
+ @status[:records_out] += 1
126
+
127
+ next unless record[:SEQ]
128
+
129
+ @status[:sequences_in] += 1
130
+ seq_name = record[:SEQ_NAME] || i.to_s
131
+
132
+ entry = BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ])
133
+
134
+ ios.puts entry.to_fasta
135
+ end
136
+ end
137
+ end
138
+
139
+ # Run usearch local on the input file and save results in the output file.
140
+ def run_usearch_local(tmp_in, tmp_out)
141
+ run_opts = {
142
+ input: tmp_in,
143
+ output: tmp_out,
144
+ database: @options[:database],
145
+ strand: @options[:strand],
146
+ identity: @options[:identity],
147
+ cpus: @options[:cpus],
148
+ verbose: @options[:verbose]
149
+ }
150
+
151
+ BioDSL::Usearch.usearch_local(run_opts)
152
+ rescue BioDSL::UsearchError => e
153
+ raise unless e.message =~ /Empty input file/
154
+ end
155
+
156
+ # Parse usearch output file and emit records to the output stream.
157
+ #
158
+ # @param output [Enumerator::Yielder] Output stream.
159
+ # @param tmp_out [String] Path to output file.
160
+ def process_output(output, tmp_out)
161
+ BioDSL::Usearch.open(tmp_out) do |ios|
162
+ ios.each(:uc) do |record|
163
+ record[:RECORD_TYPE] = 'usearch'
164
+ output << record
165
+ @status[:hits_out] += 1
166
+ @status[:records_out] += 1
167
+ end
168
+ end
169
+ end
170
+ end
171
+ end
@@ -0,0 +1,207 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Write sequences from stream in FASTA format.
30
+ #
31
+ # Description
32
+ #
33
+ # +write_fasta+ writes sequences from the data stream in FASTA format.
34
+ # However, a FASTA entry will only be written if both a SEQ key and a
35
+ # SEQ_NAME key are present. An example FASTA entry:
36
+ #
37
+ # >test1
38
+ # TATGACGCGCATCGACAGCAGCACGAGCATGCATCGACTG
39
+ # TGCACTGACTACGAGCATCACTATATCATCATCATAATCT
40
+ # TACGACATCTAGGGACTAC
41
+ #
42
+ # For more about the FASTA format:
43
+ #
44
+ # http://en.wikipedia.org/wiki/FASTA_format
45
+ #
46
+ # == Usage
47
+ # write_fasta([wrap: <uint>[, output: <file>[, force: <bool>
48
+ # [, gzip: <bool> | bzip2: <bool>]]]])
49
+ #
50
+ # === Options
51
+ # * output <file> - Output file.
52
+ # * force <bool> - Force overwrite existing output file.
53
+ # * wrap <uint> - Wrap sequence into lines of wrap length.
54
+ # * gzip <bool> - Write gzipped output file.
55
+ # * bzip2 <bool> - Write bzipped output file.
56
+ #
57
+ # == Examples
58
+ #
59
+ # To write FASTA entries to STDOUT.
60
+ #
61
+ # write_fasta
62
+ #
63
+ # To write FASTA entries wrapped in lines of length of 80 to STDOUT.
64
+ #
65
+ # write_fasta(wrap: 80)
66
+ #
67
+ # To write FASTA entries to a file 'test.fna'.
68
+ #
69
+ # write_fasta(output: "test.fna")
70
+ #
71
+ # To overwrite output file if this exists use the force option:
72
+ #
73
+ # write_fasta(output: "test.fna", force: true)
74
+ #
75
+ # To write gzipped FASTA entries to file 'test.fna.gz'.
76
+ #
77
+ # write_fasta(output: "test.fna.gz", gzip: true)
78
+ #
79
+ # To write bzipped FASTA entries to file 'test.fna.bz2'.
80
+ #
81
+ # write_fasta(output: "test.fna.bz2", bzip2: true)
82
+ class WriteFasta
83
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
84
+ residues_out)
85
+
86
+ # Constructor for the WriteFasta class.
87
+ #
88
+ # @param [Hash] options Options hash.
89
+ # @option options [Bool] :force Flag allowing overwriting files.
90
+ # @option options [String] :output Output file path.
91
+ # @option options [Integer] :wrap Wrap sequences at this length (default no
92
+ # wrap)
93
+ # @option options [Bool] :gzip Output will be gzip'ed.
94
+ # @option options [Bool] :bzip2 Output will be bzip2'ed.
95
+ #
96
+ # @return [WriteFasta] Returns an instance of the class.
97
+ def initialize(options)
98
+ @options = options
99
+ check_options
100
+ @options[:output] ||= $stdout
101
+ end
102
+
103
+ # Return a lambda for the write_fasta command.
104
+ #
105
+ # @return [Proc] Returns the write_fasta command lambda.
106
+ def lmb
107
+ lambda do |input, output, status|
108
+ status_init(status, STATS)
109
+
110
+ if @options[:output] == $stdout
111
+ write_stdout(input, output)
112
+ else
113
+ write_file(input, output)
114
+ end
115
+ end
116
+ end
117
+
118
+ private
119
+
120
+ # Check the options.
121
+ def check_options
122
+ options_allowed(@options, :force, :output, :wrap, :gzip, :bzip2)
123
+ options_unique(@options, :gzip, :bzip2)
124
+ options_tie(@options, gzip: :output, bzip2: :output)
125
+ options_files_exist_force(@options, :output)
126
+ end
127
+
128
+ # Write all sequence entries to stdout.
129
+ #
130
+ # @param input [Enumerator] The input stream.
131
+ # @param output [Enumerator::Yielder] The output stream.
132
+ def write_stdout(input, output)
133
+ wrap = @options[:wrap]
134
+
135
+ input.each do |record|
136
+ @status[:records_in] += 1
137
+
138
+ if (entry = record2entry(record))
139
+ $stdout.puts entry.to_fasta(wrap)
140
+ @status[:sequences_in] += 1
141
+ @status[:sequences_out] += 1
142
+ @status[:residues_in] += entry.length
143
+ @status[:residues_out] += entry.length
144
+ end
145
+
146
+ write_output(output, record)
147
+ end
148
+ end
149
+
150
+ # rubocop: disable Metrics/AbcSize
151
+
152
+ # Write all sequence entries to a specified file.
153
+ #
154
+ # @param input [Enumerator] The input stream.
155
+ # @param output [Enumerator::Yielder] The output stream.
156
+ def write_file(input, output)
157
+ Fasta.open(@options[:output], 'w', compress: compress) do |ios|
158
+ input.each do |record|
159
+ @status[:records_in] += 1
160
+
161
+ if (entry = record2entry(record))
162
+ ios.puts entry.to_fasta(@options[:wrap])
163
+ @status[:sequences_in] += 1
164
+ @status[:sequences_out] += 1
165
+ @status[:residues_in] += entry.length
166
+ @status[:residues_out] += entry.length
167
+ end
168
+
169
+ write_output(output, record)
170
+ end
171
+ end
172
+ end
173
+
174
+ # rubocop: enable Metrics/AbcSize
175
+
176
+ # Write a given record to the output stream, if an output stream exists.
177
+ #
178
+ # @param output [Enumerator::Yielder, nil] Output stream.
179
+ # @param record [Hash] Biopieces record to write.
180
+ def write_output(output, record)
181
+ return unless output
182
+
183
+ output << record
184
+ @status[:records_out] += 1
185
+ end
186
+
187
+ # Creates a Seq object from a given record if SEQ_NAME and SEQ are present.
188
+ #
189
+ # @param record [Hash] Biopieces record to convert.
190
+ #
191
+ # @return [BioDSL::Seq, nil] Sequence entry, or nil if a key is missing.
192
+ def record2entry(record)
193
+ return unless record.key? :SEQ_NAME
194
+ return unless record.key? :SEQ
195
+
196
+ BioDSL::Seq.new_bp(record)
197
+ end
198
+
199
+ # Determine what compression should be used for output.
200
+ #
201
+ # @return [Symbol, nil] Compression flag or nil if no compression.
202
+ def compress
203
+ return :gzip if @options[:gzip]
204
+ return :bzip2 if @options[:bzip2]
205
+ end
206
+ end
207
+ end
@@ -0,0 +1,191 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Write sequences from stream in FASTQ format.
30
+ #
31
+ # Description
32
+ #
33
+ # +write_fastq+ writes sequences from the data stream in FASTQ format.
34
+ # However, a FASTQ entry will only be written if the SEQ, SEQ_NAME and
35
+ # SCORES keys are all present. An example FASTQ entry:
36
+ #
37
+ # @test1
38
+ # TATGACGCGCATCGACAGCAGCACGAGCATGCATCGACTG
39
+ # +
40
+ # IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
41
+ #
42
+ # For more about the FASTQ format:
43
+ #
44
+ # http://en.wikipedia.org/wiki/FASTQ_format
45
+ #
46
+ # == Usage
47
+ # write_fastq([encoding: <:base_33|:base_64>[, output: <file>
48
+ # [, force: <bool>[, gzip: <bool> | bzip2: <bool>]]]])
49
+ #
50
+ # === Options
51
+ # * encoding <base> - Encoding quality scores using :base_33 (default) or
52
+ # :base_64.
53
+ # * output <file> - Output file.
54
+ # * force <bool> - Force overwrite existing output file.
55
+ # * gzip <bool> - Write gzipped output file.
56
+ # * bzip2 <bool> - Write bzipped output file.
57
+ #
58
+ # == Examples
59
+ #
60
+ # To write FASTQ entries to STDOUT.
61
+ #
62
+ # write_fastq
63
+ #
64
+ # To write FASTQ entries to a file 'test.fq'.
65
+ #
66
+ # write_fastq(output: "test.fq")
67
+ #
68
+ # To overwrite output file if this exists use the force option:
69
+ #
70
+ # write_fastq(output: "test.fq", force: true)
71
+ #
72
+ # To write gzipped FASTQ entries to file 'test.fq.gz'.
73
+ #
74
+ # write_fastq(output: "test.fq.gz", gzip: true)
75
+ #
76
+ # To write bzipped FASTQ entries to file 'test.fq.bz2'.
77
+ #
78
+ # write_fastq(output: "test.fq.bz2", bzip2: true)
79
+ class WriteFastq
80
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
81
+ residues_out)
82
+
83
+ # Constructor for WriteFastq.
84
+ #
85
+ # @param options [Hash] Options hash.
86
+ # @option options [String,Symbol] :encoding
87
+ # @option options [Boolean] :force
88
+ # @option options [String] :output
89
+ # @option options [Boolean] :gzip
90
+ # @option options [Boolean] :bzip2
91
+ #
92
+ # @return [WriteFastq] Class instance.
93
+ def initialize(options)
94
+ @options = options
95
+ check_options
96
+ @options[:output] ||= $stdout
97
+ @compress = choose_compression
98
+ @encoding = choose_encoding
99
+ end
100
+
101
+ # Return command lambda for write_fastq.
102
+ #
103
+ # @return [Proc] Command lambda.
104
+ def lmb
105
+ lambda do |input, output, status|
106
+ status_init(status, STATS)
107
+
108
+ if @options[:output] == $stdout
109
+ process_input(input, output, $stdout)
110
+ else
111
+ Fastq.open(@options[:output], 'w', compress: @compress) do |ios|
112
+ process_input(input, output, ios)
113
+ end
114
+ end
115
+ end
116
+ end
117
+
118
+ private
119
+
120
+ # Check options.
121
+ def check_options
122
+ options_allowed(@options, :encoding, :force, :output, :gzip, :bzip2)
123
+ options_allowed_values(@options, encoding: [:base_33, :base_64, 'base_33',
124
+ 'base_64'])
125
+ options_unique(@options, :gzip, :bzip2)
126
+ options_tie(@options, gzip: :output, bzip2: :output)
127
+ options_files_exist_force(@options, :output)
128
+ end
129
+
130
+ # Process all records in the input stream and output FASTQ data to the given
131
+ # ios, and finally emit all records to the output stream if specified.
132
+ #
133
+ # @param input [Enumerator] Input stream.
134
+ # @param output [Enumerator::Yielder, nil] Output stream.
135
+ # @param ios [BioDSL::Fastq::IO,STDOUT] Output IO.
136
+ def process_input(input, output, ios)
137
+ input.each do |record|
138
+ @status[:records_in] += 1
139
+
140
+ if record[:SEQ]
141
+ @status[:sequences_in] += 1
142
+ @status[:residues_in] += record[:SEQ].length
143
+
144
+ write_fastq(record, ios) if record[:SEQ_NAME] && record[:SCORES]
145
+ end
146
+
147
+ if output
148
+ output << record
149
+ @status[:records_out] += 1
150
+ end
151
+ end
152
+ end
153
+
154
+ # Given a Biopieces record, convert it to a sequence entry and output it in
155
+ # FASTQ format to the specified IO.
156
+ #
157
+ # @param record [Hash] BioDSL record.
158
+ # @param ios [BioDSL::Fastq::IO,STDOUT] Output IO.
159
+ def write_fastq(record, ios)
160
+ entry = BioDSL::Seq.new_bp(record)
161
+ entry.qual_convert!(:base_33, @encoding)
162
+
163
+ ios.puts entry.to_fastq
164
+ @status[:sequences_out] += 1
165
+ @status[:residues_out] += entry.length
166
+ end
167
+
168
+ # Choose compression to use which can either be gzip or bzip2 or no
169
+ # compression.
170
+ #
171
+ # @return [Symbol,nil] Compression.
172
+ def choose_compression
173
+ if @options[:gzip]
174
+ :gzip
175
+ elsif @options[:bzip2]
176
+ :bzip2
177
+ end
178
+ end
179
+
180
+ # Choose the quality score encoding.
181
+ #
182
+ # @return [Symbol] Encoding (defaults to :base_33).
183
+ def choose_encoding
184
+ if @options[:encoding]
185
+ @options[:encoding].to_sym
186
+ else
187
+ :base_33
188
+ end
189
+ end
190
+ end
191
+ end