BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
data/lib/BioDSL/csv.rb ADDED
@@ -0,0 +1,307 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
# Monkey patching Array to add the convert_types method.
class Array
  # Converts the elements of this array in place by sending each element the
  # conversion method named at the same index of +types+.
  #
  # Example: ["fish", "0.0", "1"].convert_types([:to_s, :to_f, :to_i])
  #          # => ["fish", 0.0, 1]
  #
  # types - Array of method names (e.g. :to_s, :to_f, :to_i), one per element.
  #         A nil entry means "no conversion" and leaves the element as is.
  #
  # Returns self (the same, now modified, array).
  # Raises ArgumentError if the array and +types+ differ in size.
  def convert_types(types)
    if size != types.size
      raise ArgumentError, "Array and types size mismatch: #{size} != " \
                           "#{types.size}"
    end

    types.each_with_index do |type, i|
      # A nil type cannot be sent as a message; keep the value untouched.
      self[i] = self[i].send(type) if type
    end

    self
  end
end
45
+
46
module BioDSL
  # Error class for all exceptions to do with CSV.
  class CSVError < StandardError; end

  # rubocop: disable ClassLength

  # Class for reading delimiter separated table data.
  # The data is read through CSV::IO (a Filesys subclass).
  # Field values are auto-converted to Integer, Float or String based on the
  # first data row.
  # Rows are returned as arrays or hashes.
  class CSV
    # Open a CSV stream. All arguments are forwarded to CSV::IO.open.
    #
    # With a block the new CSV object is yielded, the underlying stream is
    # closed when the block finishes (also on error) and the value of the block
    # is returned. Without a block the CSV object is returned and the stream is
    # left open.
    def self.open(*args)
      io = IO.open(*args)

      return new(io) unless block_given?

      begin
        yield new(io)
      ensure
        io.close
      end
    end

    # Method that reads all CSV data from a file into an array of arrays (array
    # of rows) which is returned. In the default mode all columns are read.
    # Using the select option subselects the columns based on a given Array or,
    # if a header line is present, a list of column names. Vice versa for the
    # reject option. Header lines are prefixed with '#' and are returned if the
    # include_header option is given.
    #
    # Options:
    #   * include_header
    #   * delimiter.
    #   * select.
    #   * reject.
    def self.read_array(file, options = {})
      data = []

      open(file) do |ios|
        ios.each_array(options) { |row| data << row }
      end

      data
    end

    # Method that reads all CSV data from a file into an array of hashes (array
    # of rows) which is returned. In the default mode all columns are read.
    # Using the select option subselects the columns based on a given Array or,
    # if a header line is present, a list of column names. Vice versa for the
    # reject option. Header lines are prefixed with '#'.
    #
    # Options:
    #   * delimiter.
    #   * select.
    #   * reject.
    def self.read_hash(file, options = {})
      data = []

      open(file) do |ios|
        ios.each_hash(options) { |row| data << row }
      end

      data
    end

    # Constructor method for CSV.
    #
    # io - stream object responding to +each+ and +gets+.
    def initialize(io)
      @io        = io
      @delimiter = "\s" # A single space: String#split then splits on runs of whitespace.
      @header    = nil  # Column names as symbols, set from the first non-empty line.
      @fields    = nil  # Column indexes to keep when :select or :reject is used.
      @types     = nil  # Conversion methods determined from the first data row.
    end

    # Method to skip a given number of non-empty lines.
    def skip(num)
      while num != 0 && (line = @io.gets)
        line.chomp!

        num -= 1 unless line.empty?
      end
    end

    # Method to iterate over a CSV IO object yielding arrays or an enumerator
    #   CSV.each_array(options={}) { |item| block } -> ary
    #   CSV.each_array(options={})                  -> Enumerator
    #
    # Empty lines, and lines that split into zero fields, are skipped. Lines
    # starting with '#' are header lines: they are only yielded (as an array of
    # strings) if :include_header is set.
    #
    # Options:
    #   * :include_header - also yield header lines.
    #   * :delimiter      - field delimiter passed to String#split.
    #   * :select         - list of column indexes, names or a range to select.
    #   * :reject         - list of column indexes, names or a range to reject.
    def each_array(options = {})
      # Pass the options on so the enumerator behaves like the block form.
      return to_enum(:each_array, options) unless block_given?

      delimiter = options[:delimiter] || @delimiter

      @io.each do |line|
        line.chomp!
        next if line.empty?

        fields = line.split(delimiter)
        # E.g. a whitespace-only line: nothing to derive a header or row from.
        next if fields.empty?

        get_header(fields, options) unless @header
        get_fields(fields, options) unless @fields

        if line[0] == '#'
          yield @header.map(&:to_s) if options[:include_header]
        else
          fields = fields.values_at(*@fields) if @fields

          # Types are determined once, from the first data row.
          determine_types(fields) unless @types

          yield fields.convert_types(@types)
        end
      end

      self
    end

    # Method to iterate over a CSV IO object yielding hashes or an enumerator
    #   CSV.each_hash(options={}) { |item| block } -> hash
    #   CSV.each_hash(options={})                  -> Enumerator
    #
    # Keys are the header symbols (or :V0, :V1, ... if the data has no header
    # line).
    #
    # Options:
    #   * :delimiter - field delimiter passed to String#split.
    #   * :select    - list of column indexes, names or a range to select.
    #   * :reject    - list of column indexes, names or a range to reject.
    def each_hash(options = {})
      return to_enum(:each_hash, options) unless block_given?

      each_array(options) do |array|
        hash = {}

        # Rows arrive type converted from each_array already.
        array.each_with_index do |field, i|
          hash[@header[i]] = field
        end

        yield hash
      end

      self
    end

    private

    # Method to set the @header given a list of fields (a row) and to validate
    # any :select or :reject option against it. If the row starts with '#' the
    # fields are used as column names, otherwise the names :V0, :V1, ... are
    # generated.
    #
    # Options:
    #   * :select - list of column indexes, names or a range to select.
    #   * :reject - list of column indexes, names or a range to reject.
    #
    # Raises CSVError if a selected or rejected column does not exist.
    def get_header(fields, options)
      if fields[0][0] == '#'
        fields[0] = fields[0][1..-1]
        @header = fields.map(&:to_sym)
      else
        @header = Array.new(fields.size) { |i| "V#{i}".to_sym }
      end

      if options[:select]
        if options[:select].first.is_a? Integer
          if options[:select].max >= @header.size
            bad = options[:select].select { |c| c >= @header.size }
            raise CSVError, "Selected columns out of bounds: #{bad}"
          end
        else
          options[:select].each do |value|
            unless @header.include? value.to_sym
              raise CSVError, "Selected value: #{value} not in header: " \
                              " #{@header}"
            end
          end
        end
      elsif options[:reject]
        if options[:reject].first.is_a? Integer
          if options[:reject].max >= @header.size
            # Report the offending (out of bounds) columns.
            bad = options[:reject].select { |c| c >= @header.size }
            raise CSVError, "Rejected columns out of bounds: #{bad}"
          end
        else
          options[:reject].each do |value|
            unless @header.include? value.to_sym
              raise CSVError, "Rejected value: #{value} not found in header: " \
                              "#{@header}"
            end
          end
        end
      end

      @header
    end

    # Method to determine the indexes of fields to be parsed and store these in
    # @fields. @header is reduced to the matching columns. Does nothing if
    # neither :select nor :reject is given.
    #
    # Options:
    #   * :select - list of column indexes, names or a range to select.
    #   * :reject - list of column indexes, names or a range to reject.
    def get_fields(fields, options)
      if options[:select]
        if options[:select].first.is_a? Integer
          @fields = options[:select]
        else
          raise CSVError, 'No header found' unless @header

          @fields = options[:select].map do |value|
            @header.index(value.to_sym)
          end
        end

        @header = @header.values_at(*@fields)
      elsif options[:reject]
        if options[:reject].first.is_a? Integer
          reject = if options[:reject].is_a?(Range)
                     options[:reject].to_a
                   else
                     options[:reject]
                   end
          @fields = (0...fields.size).to_a - reject
        else
          raise CSVError, 'No header found' unless @header

          reject = options[:reject].map(&:to_sym)

          # Map column name -> index, drop the rejected names, keep the indexes.
          @fields = @header.map.with_index.to_h.
                    delete_if { |k, _| reject.include? k }.values
        end

        @header = @header.values_at(*@fields)
      end
    end

    # Method that determines the data types used in an array of fields and
    # stores the matching conversion methods in @types.
    #
    # NOTE(review): relies on String#to_num which is defined elsewhere in
    # BioDSL; presumably it returns an Integer or Float for numeric strings and
    # the string itself otherwise - confirm.
    def determine_types(fields)
      @types = fields.map do |field|
        case field.to_num
        when Integer then :to_i
        when Float   then :to_f
        when String  then :to_s
        end # Anything else yields nil, i.e. no conversion.
      end
    end

    # IO class for CSV.
    class IO < Filesys
      def gets
        @io.gets
      end
    end
  end
end
data/lib/BioDSL/debug.rb ADDED
@@ -0,0 +1,42 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
# Namespace for BioDSL.
module BioDSL
  # Debug flag kept in a class variable so it is visible across the BioDSL
  # module scope. Off by default.
  @@debug = false

  class << self
    # Returns the current value of the debug flag.
    def debug
      @@debug
    end

    # Sets the debug flag to the given value.
    def debug=(x)
      @@debug = x
    end
  end
end
data/lib/BioDSL/fasta.rb ADDED
@@ -0,0 +1,133 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
module BioDSL
  # Error class for all exceptions to do with FASTA.
  class FastaError < StandardError; end

  # Class for parsing FASTA entries from an ios and returning them as Seq
  # objects.
  class Fasta
    # Open a FASTA stream. All arguments are forwarded to Fasta::IO.open.
    #
    # With a block the new Fasta object is yielded, the stream is closed when
    # the block finishes (also on error) and the value of the block is
    # returned. Without a block the Fasta object is returned and the stream is
    # left open.
    def self.open(*args)
      ios = IO.open(*args)

      return new(ios) unless block_given?

      begin
        yield new(ios)
      ensure
        ios.close
      end
    end

    # Read all entries from a FASTA file and return them as an array of Seq
    # objects. All arguments are forwarded to Fasta.open.
    def self.read(*args)
      entries = []

      Fasta.open(*args) do |ios|
        ios.each { |entry| entries << entry }
      end

      entries
    end

    attr_accessor :seq_name, :seq

    # Constructor method for Fasta.
    #
    # io - stream object responding to +each+ (yielding lines) and +puts+.
    def initialize(io)
      @io        = io
      @seq_name  = nil    # Name of the entry currently being collected.
      @seq       = ''.dup # Sequence collected so far (appended to in place).
      @got_first = nil    # Set once the first header line has been seen.
      @got_last  = nil    # Set once the final entry has been returned.
    end

    # Iterate over all entries yielding Seq objects, or return an Enumerator
    # if no block is given.
    def each
      return to_enum(:each) unless block_given?

      while (entry = next_entry)
        yield entry
      end
    end

    # Write the given arguments to the underlying stream using its +puts+.
    def puts(*args)
      @io.puts(*args)
    end

    # Method to get the next FASTA entry from an ios and return this
    # as a Seq object. If no entry is found or eof then nil is returned.
    #
    # Raises FastaError if sequence data is found before the first header, if
    # a header line has no content after '>', or if the stream holds sequence
    # data but no header at all.
    def next_entry
      @io.each do |line|
        line.chomp!

        next if line.empty?

        unless line[0] == '>'
          @seq << line
          next
        end

        if !@got_first && !@seq.empty?
          raise FastaError, "Bad FASTA format -> content before Fasta header: #{@seq}"
        end

        @got_first = true

        # A new header terminates the entry collected so far (if any).
        entry     = Seq.new(seq_name: @seq_name, seq: @seq) if @seq_name
        @seq_name = line[1..-1]
        @seq      = ''.dup if entry

        raise FastaError, "Bad FASTA format -> truncated Fasta header: no content after '>'" if @seq_name.empty?

        return entry if entry
      end

      # End of stream: flush the entry that was being collected.
      if @seq_name
        @got_last = true
        entry     = Seq.new(seq_name: @seq_name, seq: @seq)
        @seq_name = nil
        return entry
      end

      if !@got_last && !@seq.empty?
        raise FastaError, "Bad FASTA format -> content witout Fasta header: #{@seq}"
      end

      nil
    end

    # IO class for Fasta.
    class IO < Filesys
      # Yield the stream line by line until eof.
      def each
        yield @io.gets until @io.eof?
      end
    end
  end
end
data/lib/BioDSL/fastq.rb ADDED
@@ -0,0 +1,77 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
module BioDSL
  # Error class for all exceptions to do with FASTQ.
  class FastqError < StandardError; end

  # Class for parsing FASTQ entries from an ios and return as Seq objects.
  class Fastq < BioDSL::Filesys
    # Open a FASTQ stream. All arguments are forwarded to Fastq::IO.open.
    #
    # With a block the new Fastq object is yielded, the stream is closed when
    # the block finishes (also on error) and the value of the block is
    # returned. Without a block the Fastq object is returned and the stream is
    # left open.
    def self.open(*args)
      ios = IO.open(*args)

      return new(ios) unless block_given?

      begin
        yield new(ios)
      ensure
        ios.close
      end
    end

    # Constructor method for Fastq.
    #
    # io - stream object responding to +gets+ and +eof?+.
    def initialize(io)
      @io = io
    end

    # Iterate over all entries yielding Seq objects, or return an Enumerator
    # if no block is given.
    def each
      return to_enum(:each) unless block_given?

      while (entry = next_entry)
        yield entry
      end
    end

    # Method to get the next FASTQ entry from an ios and return this
    # as a Seq object. If no entry is found or eof then nil is returned.
    #
    # An entry is read as four consecutive lines: header, sequence, separator
    # and quality scores. The first character of the header line (the '@') is
    # dropped from the sequence name; the separator line is ignored.
    #
    # Raises FastqError if the stream ends in the middle of an entry.
    def next_entry
      return nil if @io.eof?

      header, seq, _separator, qual = Array.new(4) { @io.gets }

      # gets returns nil at eof, so a missing last line means a cut off entry.
      unless qual
        raise FastqError, 'Bad FASTQ format -> truncated entry: expected 4 lines'
      end

      # chomp (rather than cutting the last character) also copes with a final
      # line that lacks a newline and with CRLF line endings.
      seq_name = header.chomp[1..-1].to_s

      Seq.new(seq_name: seq_name, seq: seq.chomp, qual: qual.chomp)
    end

    # IO class for Fastq.
    class IO < Filesys
      # Yield the stream line by line until eof.
      def each
        yield @io.gets until @io.eof?
      end
    end
  end
end
data/lib/BioDSL/filesys.rb ADDED
@@ -0,0 +1,137 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
module BioDSL
  # Error class for all exceptions to do with Filesys.
  class FilesysError < StandardError; end

  # Wrapper around an IO stream. Files opened for reading are transparently
  # decompressed if they contain gzip or bzip2 data, and files opened for
  # writing can be compressed through the :compress option.
  class Filesys
    require 'open3'
    require 'tmpdir'

    include Enumerable

    # Leading bytes identifying gzip and bzip2 data.
    GZIP_MAGIC  = "\x1F\x8B".b.freeze
    BZIP2_MAGIC = 'BZh'.b.freeze
    private_constant :GZIP_MAGIC, :BZIP2_MAGIC

    # Cross-platform way of finding an executable in the $PATH.
    #
    #   which('ruby') #=> /usr/bin/ruby
    #
    # Returns the path of the first match or nil if none was found.
    def self.which(cmd)
      exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']

      ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).each do |path|
        exts.each do |ext|
          exe = File.join(path, "#{cmd}#{ext}")
          return exe if File.executable?(exe) && !File.directory?(exe)
        end
      end

      nil
    end

    # Class method that returns a path to a temporary file named from the
    # user, the current time and the process id. The file is not created.
    # If no directory is specified reverts to the systems tmp directory.
    def self.tmpfile(tmp_dir = ENV["TMPDIR"])
      tmp_dir = Dir.tmpdir if tmp_dir.nil? || tmp_dir.empty?

      time = Time.now.to_i
      user = ENV['USER']
      pid  = $$
      name = [user, time + pid, pid].join('_') + '.tmp'

      # File.join guarantees a separator between directory and file name.
      File.join(tmp_dir, name)
    end

    # Open a file and wrap the stream in a new object of this class.
    #
    #   Filesys.open(file, mode = nil, options = {}) { |ios| block } -> obj
    #   Filesys.open(file, mode = nil, options = {})                 -> Filesys
    #
    # file    - path to a file (or an object responding to +path+).
    # mode    - file mode. With 'w' the file is written, optionally compressed;
    #           with any other mode compressed data is detected and
    #           decompressed while reading.
    # options - :compress => :gzip, :bzip or :bzip2 compresses written data
    #           by piping it through gzip or bzip2. All other options are
    #           passed to File.open.
    #
    # With a block the wrapper is yielded, the stream is closed when the block
    # finishes (also on error) and the value of the block is returned.
    def self.open(*args)
      file    = args.shift
      mode    = args.shift
      options = args.shift || {}

      ios = if mode == 'w'
              open_write_stream(file, mode, options)
            else
              open_read_stream(file, mode, options)
            end

      return new(ios) unless block_given?

      begin
        yield new(ios)
      ensure
        ios.close
      end
    end

    # Open a stream for writing, compressing through an external gzip or
    # bzip2 process if requested with options[:compress].
    #
    # NOTE(review): the wait threads returned by Open3.pipeline_w are
    # discarded, so closing the stream does not wait for the compressor to
    # finish writing the file.
    def self.open_write_stream(file, mode, options)
      case options[:compress]
      when :gzip
        ios, = Open3.pipeline_w('gzip -f', out: file)
      when :bzip, :bzip2
        ios, = Open3.pipeline_w('bzip2 -c', out: file)
      else
        ios = File.open(file, mode, **file_options(options))
      end

      ios
    end

    # Open a stream for reading, decompressing through an external gzip or
    # bzcat process if the file holds compressed data. The commands are given
    # as argument arrays so the file name never passes through a shell.
    def self.open_read_stream(file, mode, options)
      path = file.respond_to?(:path) ? file.path : file

      case compression_of(path)
      when :gzip  then ::IO.popen(['gzip', '-cd', path.to_s])
      when :bzip2 then ::IO.popen(['bzcat', path.to_s])
      else             File.open(file, mode, **file_options(options))
      end
    end

    # Returns :gzip or :bzip2 if the regular file at +path+ starts with the
    # matching magic bytes, else nil. Only regular files are inspected so no
    # data is consumed from pipes and the like.
    def self.compression_of(path)
      return nil unless path && File.file?(path)

      # binread returns nil for an empty file.
      magic = File.binread(path, 3).to_s

      if magic.start_with?(GZIP_MAGIC)
        :gzip
      elsif magic.start_with?(BZIP2_MAGIC)
        :bzip2
      end
    end

    # Returns the options to hand on to File.open, i.e. all but :compress.
    def self.file_options(options)
      options.reject { |key, _| key == :compress }
    end

    private_class_method :open_write_stream, :open_read_stream,
                         :compression_of, :file_options

    attr_reader :io

    # Constructor method for Filesys.
    #
    # ios - the stream object to wrap.
    def initialize(ios)
      @io = ios
    end

    # The methods below delegate to the wrapped stream.

    def gets
      @io.gets
    end

    def puts(*args)
      @io.puts(*args)
    end

    def read
      @io.read
    end

    def write(arg)
      @io.write arg
    end

    def close
      @io.close
    end

    def eof?
      @io.eof?
    end

    # Iterator method for parsing entries. Yields what the wrapped stream's
    # +each+ yields, or returns an Enumerator if no block is given.
    def each
      return to_enum :each unless block_given?

      @io.each { |line| yield line }
    end
  end
end
137
+