BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,223 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Sort records in the stream.
30
+ #
31
+ # +sort+ records in the stream given a specific key. Sorting on multiple keys
32
+ # is currently not supported.
33
+ #
34
+ # == Usage
35
+ #
36
+ # sort(key: <value>[, reverse: <bool>[, block_size: <uint>]])
37
+ #
38
+ # === Options
39
+ #
40
+ # * key: <value> - Sort records on the value for key.
41
+ # * reverse: <bool> - Reverse sort.
42
+ # * block_size: <uint> - Block size used for disk based sorting
43
+ # (default=250_000_000).
44
+ #
45
+ # == Examples
46
+ #
47
+ # Consider the following table in the file `test.tab`:
48
+ #
49
+ # #COUNT ORGANISM
50
+ # 4 Dog
51
+ # 3 Cat
52
+ # 1 Eel
53
+ #
54
+ # To sort this according to COUNT in ascending order do:
55
+ #
56
+ # BP.new.read_table(input: "test.tab").sort(key: :COUNT).dump.run
57
+ #
58
+ # {:COUNT=>1, :ORGANISM=>"Eel"}
59
+ # {:COUNT=>3, :ORGANISM=>"Cat"}
60
+ # {:COUNT=>4, :ORGANISM=>"Dog"}
61
+ #
62
+ # And in descending order:
63
+ #
64
+ # BP.new.
65
+ # read_table(input: "test.tab").
66
+ # sort(key: :COUNT, reverse: true).
67
+ # dump.
68
+ # run
69
+ #
70
+ # {:COUNT=>4, :ORGANISM=>"Dog"}
71
+ # {:COUNT=>3, :ORGANISM=>"Cat"}
72
+ # {:COUNT=>1, :ORGANISM=>"Eel"}
73
+ #
74
+ # The type of value determines the sorting, alphabetical order:
75
+ #
76
+ # BP.new.read_table(input: "test.tab").sort(key: :ORGANISM).dump.run
77
+ #
78
+ # {:COUNT=>3, :ORGANISM=>"Cat"}
79
+ # {:COUNT=>4, :ORGANISM=>"Dog"}
80
+ # {:COUNT=>1, :ORGANISM=>"Eel"}
81
+ #
82
+ # And reverse alphabetic order:
83
+ #
84
+ # BP.new.
85
+ # read_table(input: "test.tab").
86
+ # sort(key: :ORGANISM, reverse: true).
87
+ # dump.
88
+ # run
89
+ #
90
+ # {:COUNT=>1, :ORGANISM=>"Eel"}
91
+ # {:COUNT=>4, :ORGANISM=>"Dog"}
92
+ # {:COUNT=>3, :ORGANISM=>"Cat"}
93
class Sort
  require 'pqueue'

  STATS = %i(records_in records_out)
  SORT_BLOCK_SIZE = 250_000_000 # max bytes to hold in memory.

  # Constructor for Sort.
  #
  # The options are validated before any of them is dereferenced, so that a
  # missing :key is handled by the option checks rather than blowing up with
  # a NoMethodError on +nil.to_sym+.
  #
  # @param options [Hash] Options hash.
  #
  # @option options [String,Symbol] :key        Key whose value is sorted on.
  # @option options [Boolean]       :reverse    Reverse the sort order.
  # @option options [Integer]       :block_size Size threshold that triggers
  #   a block being written to disk (default SORT_BLOCK_SIZE).
  #
  # @return [Sort] Class instance.
  def initialize(options)
    @options = options

    check_options

    @block_size = options[:block_size] || SORT_BLOCK_SIZE
    @key        = options[:key].to_sym
    @files      = []
    @records    = []
    @size       = 0
    @pqueue     = pqueue_init
    @fds        = nil
  end

  # Return command lambda for Sort.
  #
  # The lambda collects records into blocks, sorts each block and writes it
  # to a temporary file, and finally merges the block files through a
  # priority queue. Block files are closed and removed as soon as the merge
  # has finished (or failed) instead of lingering until the process exits.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      begin
        input.each do |record|
          @status[:records_in] += 1
          @records << record
          # Size of the string representation is used as a cheap estimate of
          # the memory held by the record.
          @size += record.to_s.size
          save_block if @size > @block_size
        end

        save_block
        open_block_files
        fill_pqueue
        output_pqueue(output)
      ensure
        close_block_files
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :key, :reverse, :block_size)
    options_required(@options, :key)
    options_allowed_values(@options, reverse: [nil, true, false])
    options_assert(@options, ':block_size > 0')
  end

  # Initialize the priority queue used for merging the sorted blocks.
  #
  # The comparison block is evaluated lazily, so it is fine that it refers to
  # @key and @options[:reverse]. Queue elements are [record, block index]
  # tuples.
  #
  # @return [PQueue] Empty priority queue.
  def pqueue_init
    PQueue.new do |a, b|
      if @options[:reverse]
        a.first[@key] <=> b.first[@key]
      else
        b.first[@key] <=> a.first[@key]
      end
    end
  end

  # Sort the records collected so far, save them as one block on disk and
  # reset the in-memory buffer. Does nothing if no records are buffered.
  def save_block
    return if @records.empty?

    @records.sort_by! { |r| r[@key] }
    @records.reverse! if @options[:reverse]

    serialize_records

    @records = []
    @size    = 0
  end

  # Save the (already sorted) buffered records to a new temporary file and
  # remember that file for the merge step.
  def serialize_records
    file = Tempfile.new('sort')

    File.open(file, 'wb') do |ios|
      BioDSL::Serializer.new(ios) do |serializer|
        @records.each { |record| serializer << record }
      end
    end

    @files << file
  end

  # Open all sorted block files for reading.
  def open_block_files
    @fds = @files.map { |file| File.open(file, 'rb') }
  end

  # Close all open block file descriptors and delete the temporary block
  # files, leaving the instance without any on-disk state.
  def close_block_files
    @fds.each { |fd| fd.close unless fd.closed? } if @fds
    @fds = nil

    @files.each(&:close!)
    @files = []
  end

  # Fill the pqueue with the first record from each of the file descriptors.
  def fill_pqueue
    @fds.each_index { |i| enqueue_next(i) }
  end

  # Read the next record, if any, from the block file with the given index
  # and push it onto the pqueue together with that index.
  #
  # @param index [Integer] Index of block file descriptor in @fds.
  def enqueue_next(index)
    fd = @fds[index]

    BioDSL::Serializer.new(fd) do |serializer|
      @pqueue << [serializer.next_entry, index] unless fd.eof?
    end
  end

  # Output all records from the pqueue while refilling it with the next
  # record from the block file the emitted record came from.
  #
  # @param output [Enumerator::Yielder] Output stream.
  def output_pqueue(output)
    until @pqueue.empty?
      record, i = @pqueue.pop

      output << record
      @status[:records_out] += 1

      enqueue_next(i)
    end
  end
end
223
+ end
@@ -0,0 +1,220 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Splite pair-end sequences in the stream.
30
+ #
31
+ # split_pair_seq splits sequences in the stream previously merged with
32
+ # merge_pair_seq. Sequence names must be in either Illumina1.3/1.5 format
33
+ # trailing a /1 or /2 or Illumina1.8 containing 1: or 2:. A sequence split
34
+ # into two will be output as two records where the first will be named with 1
35
+ # and the second with 2.
36
+ #
37
+ # == Usage
38
+ #
39
+ # split_pair_seq
40
+ #
41
+ # === Options
42
+ #
43
+ # == Examples
44
+ #
45
+ # Consider the following records created with merge_pair_seq:
46
+ #
47
+ # {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14",
48
+ # :SEQ=>"TGGGGAATATTGGACAATGGCCTGTTTGCTACCCACGCTT",
49
+ # :SEQ_LEN=>40,
50
+ # :SCORES=>"<??????BDDDDDDDDGGGG?????BB<-<BDDDDDFEEF",
51
+ # :SEQ_LEN_LEFT=>20,
52
+ # :SEQ_LEN_RIGHT=>20}
53
+ # {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14",
54
+ # :SEQ=>"TAGGGAATCTTGCACAATGGACTCTTCGCTACCCATGCTT",
55
+ # :SEQ_LEN=>40,
56
+ # :SCORES=>"<???9?BBBDBDDBDDFFFF,5<??BB?DDABDBDDFFFF",
57
+ # :SEQ_LEN_LEFT=>20,
58
+ # :SEQ_LEN_RIGHT=>20}
59
+ # {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14",
60
+ # :SEQ=>"TAGGGAATCTTGCACAATGGCCTCTTCGCTACCCATGCTT",
61
+ # :SEQ_LEN=>40,
62
+ # :SCORES=>"?????BBBBBDDBDDBFFFF??,<??B?BB?BBBBBFF?F",
63
+ # :SEQ_LEN_LEFT=>20,
64
+ # :SEQ_LEN_RIGHT=>20}
65
+ #
66
+ # These can be split using split_pair_seq:
67
+ #
68
+ # BP.new.
69
+ # read_fastq(input: "test.fq", encoding: :base_33).
70
+ # merge_pair_seq.
71
+ # split_pair_seq.
72
+ # dump.
73
+ # run
74
+ #
75
+ # {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14",
76
+ # :SEQ=>"TGGGGAATATTGGACAATGG",
77
+ # :SEQ_LEN=>20,
78
+ # :SCORES=>"<??????BDDDDDDDDGGGG"}
79
+ # {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14862:1868 2:N:0:14",
80
+ # :SEQ=>"CCTGTTTGCTACCCACGCTT",
81
+ # :SEQ_LEN=>20,
82
+ # :SCORES=>"?????BB<-<BDDDDDFEEF"}
83
+ # {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14",
84
+ # :SEQ=>"TAGGGAATCTTGCACAATGG",
85
+ # :SEQ_LEN=>20,
86
+ # :SCORES=>"<???9?BBBDBDDBDDFFFF"}
87
+ # {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:13906:2139 2:N:0:14",
88
+ # :SEQ=>"ACTCTTCGCTACCCATGCTT",
89
+ # :SEQ_LEN=>20,
90
+ # :SCORES=>",5<??BB?DDABDBDDFFFF"}
91
+ # {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14",
92
+ # :SEQ=>"TAGGGAATCTTGCACAATGG",
93
+ # :SEQ_LEN=>20,
94
+ # :SCORES=>"?????BBBBBDDBDDBFFFF"}
95
+ # {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14865:2158 2:N:0:14",
96
+ # :SEQ=>"CCTCTTCGCTACCCATGCTT",
97
+ # :SEQ_LEN=>20,
98
+ # :SCORES=>"??,<??B?BB?BBBBBFF?F"}
99
class SplitPairSeq
  STATS = %i(records_in records_out sequences_in sequences_out residues_in
             residues_out)

  # Constructor for SplitPairSeq.
  #
  # @param options [Hash] Options hash (no options are accepted).
  #
  # @return [SplitPairSeq] Class instance.
  def initialize(options)
    @options = options

    check_options
  end

  # Return command lambda for split_pair_seq.
  #
  # Records carrying a sequence name, a sequence and both left and right
  # lengths are split into two records; all other records are passed through
  # unchanged.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      input.each do |record|
        @status[:records_in] += 1

        if merged_pair?(record)
          split_pair_seq(output, record)
        else
          output << record

          @status[:records_out] += 1
        end
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, nil)
  end

  # Tell if the given record holds everything needed for splitting.
  #
  # @param record [Hash] BioDSL record.
  #
  # @return [Object] Truthy if all required keys have truthy values.
  def merged_pair?(record)
    record[:SEQ_NAME] && record[:SEQ] && record[:SEQ_LEN_LEFT] &&
      record[:SEQ_LEN_RIGHT]
  end

  # Split the sequence of the given record at the position given by its
  # SEQ_LEN_LEFT value and emit the two halves as separate records.
  #
  # @param output [Enumerator::Yielder] Output stream.
  # @param record [Hash]                BioDSL record.
  def split_pair_seq(output, record)
    merged = BioDSL::Seq.new_bp(record)

    @status[:sequences_in] += 1
    @status[:residues_in]  += merged.length

    left, right = split_entry(merged, get_split_pos(record, merged))

    output << left.to_bp
    output << right.to_bp

    @status[:sequences_out] += 2
    @status[:residues_out]  += left.length + right.length
    @status[:records_out]   += 2
  end

  # Determine where the sequence of the given record must be split.
  #
  # @param record [Hash]        BioDSL record.
  # @param entry  [BioDSL::Seq] Sequence entry.
  #
  # @return [Integer] Sequence split position.
  #
  # @raise [BioDSL::SeqError]
  #   If the left and right lengths do not add up to the entry length.
  def get_split_pos(record, entry)
    left  = record[:SEQ_LEN_LEFT].to_i
    right = record[:SEQ_LEN_RIGHT].to_i

    return left if left + right == entry.length

    fail BioDSL::SeqError, 'SEQ_LEN_LEFT + SEQ_LEN_RIGHT != SEQ_LEN ' \
                           "#{left} + #{right} != #{entry.length}"
  end

  # Cut the given entry in two at the given position.
  #
  # @param entry [BioDSL::Seq] Sequence entry.
  # @param pos   [Integer]     Split position.
  #
  # @return [Array] Tuple with the two new entries.
  def split_entry(entry, pos)
    halves = [entry[0...pos], entry[pos..-1]]

    fix_seq_names(entry, halves.last)

    halves
  end

  # Rename the second entry so that its name marks it as read 2, using the
  # naming scheme recognized in the name of the first entry.
  #
  # @param entry1 [BioDSL::Seq] Sequence entry whose name is inspected.
  # @param entry2 [BioDSL::Seq] Sequence entry whose name is rewritten.
  #
  # @raise [RuntimeError] If the name matches neither naming scheme.
  def fix_seq_names(entry1, entry2)
    name = entry1.seq_name

    case name
    when /^[^ ]+ \d:/
      entry2.seq_name.sub!(/ \d:/, ' 2:')
    when /^.+\/\d$/
      entry2.seq_name[-1] = '2'
    else
      fail "Could not match sequence name: #{name}"
    end
  end
end
220
+ end
@@ -0,0 +1,165 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Split the values of a key into new key/value pairs.
30
+ #
31
+ # +split_values+ splits the value of a given key into multiple values that are
32
+ # added to the record. The keys used for the values are per default based on
33
+ # the given key with an added index, but using the +keys+ option allows
34
+ # specifying a list of keys to use instead.
35
+ #
36
+ # == Usage
37
+ #
38
+ # split_values(key: <string>[, delimiter: <string>[, keys: <list>]])
39
+ #
40
+ # === Options
41
+ #
42
+ # * key: <string> - Key whose value to split.
43
+ # * keys: <list> - List of keys to use with split values.
44
+ # * delimiter: <string> - Delimiter (default='_').
45
+ #
46
+ # == Examples
47
+ #
48
+ # Consider the following records:
49
+ #
50
+ # {ID: "FOO:count=10", SEQ: "gataag"}
51
+ # {ID: "FOO_10_20", SEQ: "gataag"}
52
+ #
53
+ # To split the value belonging to ID do:
54
+ #
55
+ # split_values(key: :ID)
56
+ #
57
+ # {:ID=>"FOO:count=10", :SEQ=>"gataag"}
58
+ # {:ID=>"FOO_10_20", :SEQ=>"gataag", :ID_0=>"FOO", :ID_1=>10, :ID_2=>20}
59
+ #
60
+ # Using a different delimiter:
61
+ #
62
+ # split_values(key: "ID", delimiter: ':count=')
63
+ #
64
+ # {:ID=>"FOO:count=10", :SEQ=>"gataag", :ID_0=>"FOO", :ID_1=>10}
65
+ # {:ID=>"FOO_10_20", :SEQ=>"gataag"}
66
+ #
67
+ # Using a different delimiter and a list of keys:
68
+ #
69
+ # split_values(key: "ID", keys: ["ID", :COUNT], delimiter: ':count=')
70
+ #
71
+ # {:ID=>"FOO", :SEQ=>"gataag", :COUNT=>10}
72
+ # {:ID=>"FOO_10_20", :SEQ=>"gataag"}
73
class SplitValues
  STATS = %i(records_in records_out)

  # Constructor for SplitValues.
  #
  # @param options [Hash] Options hash.
  # @option options [String,Symbol] :key       Key whose value is split.
  # @option options [Array]         :keys      Keys to store split values
  #   under instead of the default "<key>_<index>" keys.
  # @option options [String]        :delimiter Delimiter (default '_').
  #
  # @return [SplitValues] Class instance.
  def initialize(options)
    @options = options

    check_options

    @first     = true
    @convert   = []
    @keys      = @options[:keys]
    @key       = @options[:key].to_sym
    @delimiter = @options[:delimiter] || '_'
  end

  # Return command lambda for split_values.
  #
  # Every record is passed on to the output. Records whose value for the
  # configured key splits into more than one part are extended in place with
  # the split values first.
  #
  # @return [Proc] Command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)

      input.each do |record|
        @status[:records_in] += 1

        if (value = record[@key])
          values = value.split(@delimiter)

          if values.size > 1
            # Types are inferred once, from the first value that splits, and
            # reused for all following records.
            determine_types(values) if @first

            split_values(values, record)
          end
        end

        output << record

        @status[:records_out] += 1
      end
    end
  end

  private

  # Check options.
  def check_options
    options_allowed(@options, :key, :keys, :delimiter)
    options_required(@options, :key)
  end

  # Given an array of values determine which of them must be converted to
  # integers or floats and save the conversion method per value index in the
  # @convert instance variable.
  #
  # Integer is checked rather than Fixnum: Fixnum is gone from current Ruby
  # versions and never covered large (Bignum) integers.
  #
  # @param values [Array] List of values.
  def determine_types(values)
    values.each_with_index do |val, i|
      case val.to_num
      when Integer then @convert[i] = :to_i
      when Float   then @convert[i] = :to_f
      end
    end

    @first = false
  end

  # Convert values and add to record.
  #
  # @param values [Array] List of values.
  # @param record [Hash]  BioDSL record.
  def split_values(values, record)
    values.each_with_index do |val, i|
      val = val.send(@convert[i]) if @convert[i]

      if @keys
        record[@keys[i].to_sym] = val
      else
        record["#{@key}_#{i}".to_sym] = val
      end
    end
  end
end
165
+ end