BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,42 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
29
+ module BioDSL
30
+ # Class variable visible across the BioDSL module scope.
31
+ @@test = false
32
+
33
+ # Class variable getter method.
34
+ def self.test
35
+ @@test
36
+ end
37
+
38
+ # Class variable setter method.
39
+ def self.test=(x)
40
+ @@test = x
41
+ end
42
+ end
@@ -0,0 +1,68 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
3
+ # #
4
+ # This program is free software; you can redistribute it and/or #
5
+ # modify it under the terms of the GNU General Public License #
6
+ # as published by the Free Software Foundation; either version 2 #
7
+ # of the License, or (at your option) any later version. #
8
+ # #
9
+ # This program is distributed in the hope that it will be useful, #
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
12
+ # GNU General Public License for more details. #
13
+ # #
14
+ # You should have received a copy of the GNU General Public License #
15
+ # along with this program; if not, write to the Free Software #
16
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
17
+ # USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+ module BioDSL
27
+ # Module to provide a temporary directory.
28
+ module TmpDir
29
+ require 'tempfile'
30
+
31
+ # Create a temporary directory in block context. The directory and its
32
+ # contents are removed when the block returns (see Dir.mktmpdir).
33
+ # If called with a list of filenames, these are provided as block arguments
34
+ # such that each file's parent is the temporary directory. However, the last
35
+ # block argument is always the path to the temporary directory.
36
+ #
37
+ # @param files [Array] List of file names.
38
+ #
39
+ # @example
40
+ # BioDSL::TmpDir.create do |dir|
41
+ # puts dir
42
+ # # => "<tmp_dir>"
43
+ # end
44
+ #
45
+ # @example
46
+ # BioDSL::TmpDir.create("foo", "bar") do |foo, bar, dir|
47
+ # puts foo
48
+ # # => "<tmp_dir>/foo"
49
+ # puts bar
50
+ # => "<tmp_dir>/bar"
51
+ # puts dir
52
+ # # => "<tmp_dir>"
53
+ # end
54
+ def self.create(*files, &block)
55
+ fail 'no block given' unless block
56
+
57
+ Dir.mktmpdir(nil, BioDSL::Config::TMP_DIR) do |dir|
58
+ paths = files.each_with_object([]) { |e, a| a << File.join(dir, e) }
59
+
60
+ if paths.empty?
61
+ block.call(dir)
62
+ else
63
+ block.call(paths << dir)
64
+ end
65
+ end
66
+ end
67
+ end
68
+ end
@@ -0,0 +1,301 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
29
+ module BioDSL
30
+ # Error class for all exceptions to do with Usearch.
31
+ class UsearchError < StandardError; end
32
+
33
+ # rubocop: disable ClassLength
34
+
35
+ # Class with methods to execute Usearch and parse the results.
36
+ class Usearch
37
+ include Enumerable
38
+
39
+ # Execute cluster_smallmem.
40
+ #
41
+ # @param options [Hash] Options Hash
42
+ # @option options [String] :input
43
+ # @option options [String] :output
44
+ # @option options [String] :database
45
+ # @option options [Float] :identity
46
+ # @option options [Fixnum] :cpus
47
+ # @option options [String] :strand
48
+ def self.cluster_smallmem(options)
49
+ usearch = new(options)
50
+ usearch.cluster_smallmem
51
+ end
52
+
53
+ # Execute cluster_otus.
54
+ #
55
+ # @param options [Hash] Options Hash
56
+ # @option options [String] :input
57
+ # @option options [String] :output
58
+ # @option options [String] :database
59
+ # @option options [Float] :identity
60
+ # @option options [Fixnum] :cpus
61
+ # @option options [String] :strand
62
+ def self.cluster_otus(options)
63
+ usearch = new(options)
64
+ usearch.cluster_otus
65
+ end
66
+
67
+ # Execute uchime_ref.
68
+ #
69
+ # @param options [Hash] Options Hash
70
+ # @option options [String] :input
71
+ # @option options [String] :output
72
+ # @option options [String] :database
73
+ # @option options [Float] :identity
74
+ # @option options [Fixnum] :cpus
75
+ # @option options [String] :strand
76
+ def self.uchime_ref(options)
77
+ usearch = new(options)
78
+ usearch.uchime_ref
79
+ end
80
+
81
+ # Execute usearch_global.
82
+ #
83
+ # @param options [Hash] Options Hash
84
+ # @option options [String] :input
85
+ # @option options [String] :output
86
+ # @option options [String] :database
87
+ # @option options [Float] :identity
88
+ # @option options [Fixnum] :cpus
89
+ # @option options [String] :strand
90
+ def self.usearch_global(options)
91
+ usearch = new(options)
92
+ usearch.usearch_global
93
+ end
94
+
95
+ # Execute usearch_local.
96
+ #
97
+ # @param options [Hash] Options Hash
98
+ # @option options [String] :input
99
+ # @option options [String] :output
100
+ # @option options [String] :database
101
+ # @option options [Float] :identity
102
+ # @option options [Fixnum] :cpus
103
+ # @option options [String] :strand
104
+ def self.usearch_local(options)
105
+ usearch = new(options)
106
+ usearch.usearch_local
107
+ end
108
+
109
+ # Open a Usearch file.
110
+ #
111
+ # @param args [Array] List of open arguments.
112
+ #
113
+ # @yield [IO] stream.
114
+ # @return [IO] stream.
115
+ def self.open(*args)
116
+ ios = IO.open(*args)
117
+
118
+ if block_given?
119
+ yield ios
120
+ else
121
+ return ios
122
+ end
123
+ end
124
+
125
+ # Constructor for Usearch class.
126
+ #
127
+ # @param options [Hash] Options Hash
128
+ # @option options [String] :input
129
+ # @option options [String] :output
130
+ # @option options [String] :database
131
+ # @option options [Float] :identity
132
+ # @option options [Fixnum] :cpus
133
+ # @option options [String] :strand
134
+ #
135
+ # @return [Usearch] Class instance.
136
+ def initialize(options)
137
+ @options = options
138
+ @stderr = nil
139
+
140
+ return self unless File.size(@options[:input]) == 0
141
+
142
+ fail UsearchError, %(Empty input file -> "#{@options[:input]}")
143
+ end
144
+
145
+ # Compose a command list and execute cluster_smallmem with this.
146
+ #
147
+ # @return [self]
148
+ def cluster_smallmem
149
+ command = []
150
+ command << 'usearch'
151
+ command << "-cluster_smallmem #{@options[:input]}"
152
+ command << "-id #{@options[:identity]}"
153
+ command << "-threads #{@options[:cpus]}" if @options[:cpus]
154
+ command << "-strand #{@options[:strand]}"
155
+
156
+ if @options[:align]
157
+ command << "-msaout #{@options[:output]}"
158
+ else
159
+ command << "-uc #{@options[:output]}"
160
+ end
161
+
162
+ execute(command)
163
+
164
+ self
165
+ end
166
+
167
+ # Compose a command list and execute cluster_otus with this.
168
+ #
169
+ # @return [self]
170
+ def cluster_otus
171
+ command = []
172
+ command << 'usearch'
173
+ command << "-cluster_otus #{@options[:input]}"
174
+ command << "-otus #{@options[:output]}"
175
+ command << "-id #{@options[:identity]}"
176
+ command << "-threads #{@options[:cpus]}" if @options[:cpus]
177
+
178
+ execute(command)
179
+
180
+ self
181
+ end
182
+
183
+ # Compose a command list and execute uchime_ref with this.
184
+ #
185
+ # @return [self]
186
+ def uchime_ref
187
+ command = []
188
+ command << 'usearch'
189
+ command << "-uchime_ref #{@options[:input]}"
190
+ command << "-db #{@options[:database]}"
191
+ command << "-strand #{@options[:strand]}"
192
+ command << "-threads #{@options[:cpus]}" if @options[:cpus]
193
+ command << "-nonchimeras #{@options[:output]}"
194
+
195
+ execute(command)
196
+
197
+ self
198
+ end
199
+
200
+ # Compose a command list and execute usearch_global with this.
201
+ #
202
+ # @return [self]
203
+ def usearch_global
204
+ command = []
205
+ command << 'usearch'
206
+ command << '-notrunclabels'
207
+ command << "-usearch_global #{@options[:input]}"
208
+ command << "-db #{@options[:database]}"
209
+ command << "-strand #{@options[:strand]}" if @options[:strand]
210
+ command << "-threads #{@options[:cpus]}" if @options[:cpus]
211
+ command << "-id #{@options[:identity]}"
212
+ command << "-uc #{@options[:output]}"
213
+
214
+ execute(command)
215
+
216
+ self
217
+ end
218
+
219
+ # Compose a command list and execute usearch_local with this.
220
+ #
221
+ # @return [self]
222
+ def usearch_local
223
+ command = []
224
+ command << 'usearch'
225
+ command << '-notrunclabels'
226
+ command << "-usearch_local #{@options[:input]}"
227
+ command << "-db #{@options[:database]}"
228
+ command << "-strand #{@options[:strand]}" if @options[:strand]
229
+ command << "-threads #{@options[:cpus]}" if @options[:cpus]
230
+ command << "-id #{@options[:identity]}"
231
+ command << "-uc #{@options[:output]}"
232
+
233
+ execute(command)
234
+
235
+ self
236
+ end
237
+
238
+ private
239
+
240
+ # Execute Usearch on a given command.
241
+ #
242
+ # @param command [Array] Usearch command list.
243
+ def execute(command)
244
+ command << '--quiet' unless @options[:verbose]
245
+ command_str = command.join(' ')
246
+
247
+ $stderr.puts "Running command: #{command_str}" if @options[:verbose]
248
+
249
+ Open3.popen3(command_str) do |_stdin, _stdout, stderr, wait_thr|
250
+ @stderr = stderr.read.split $INPUT_RECORD_SEPARATOR
251
+ exit_status = wait_thr.value # Process::Status object returned.
252
+
253
+ unless exit_status.success?
254
+ # TODO: write error message to log.
255
+ fail UsearchError, "Command failed: #{command_str} + \
256
+ #{@stderr.join $INPUT_RECORD_SEPARATOR}"
257
+ end
258
+ end
259
+ end
260
+
261
+ # Class for Usearch IO.
262
+ class IO < Filesys
263
+ # Parse a given type of Uclust format and yield the result.
264
+ #
265
+ # @param format [Symbol] Format type to parse.
266
+ def each(format = :uc)
267
+ case format
268
+ when :uc then each_uc { |e| yield e }
269
+ else
270
+ fail UsearchError, "Unknown iterator format: #{format}"
271
+ end
272
+ end
273
+
274
+ # rubocop: disable Metrics/AbcSize
275
+
276
+ # Parse each UC type record and yield the result.
277
+ #
278
+ # @yield [Hash] BioDSL record with UC result.
279
+ def each_uc
280
+ @io.each do |line|
281
+ fields = line.chomp.split("\t")
282
+ record = {TYPE: fields[0],
283
+ CLUSTER: fields[1].to_i}
284
+
285
+ case fields[0]
286
+ when 'C' then record[:CLUSTER_SIZE] = fields[2].to_i
287
+ else record[:SEQ_LEN] = fields[2].to_i
288
+ end
289
+
290
+ record[:IDENT] = fields[3].to_f if fields[0] == 'H'
291
+ record[:STRAND] = fields[4]
292
+ record[:CIGAR] = fields[7]
293
+ record[:Q_ID] = fields[8]
294
+ record[:S_ID] = fields[9] if fields[0] == 'H'
295
+
296
+ yield record
297
+ end
298
+ end
299
+ end
300
+ end
301
+ end
@@ -0,0 +1,42 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
29
+ module BioDSL
30
+ # Class variable visible across the BioDSL module scope.
31
+ @@verbose = false
32
+
33
+ # Class variable getter method.
34
+ def self.verbose
35
+ @@verbose
36
+ end
37
+
38
+ # Class variable setter method.
39
+ def self.verbose=(x)
40
+ @@verbose = x
41
+ end
42
+ end
@@ -0,0 +1,31 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
29
+ module BioDSL
30
+ VERSION = '1.0.0'
31
+ end
data/lib/BioDSL.rb ADDED
@@ -0,0 +1,81 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ fail 'Ruby 2.0 or later required' if RUBY_VERSION < '2.0'
29
+
# Extension of Numeric with thousands-separator formatting.
class Numeric
  # Render the number as a String with a comma after every group of three
  # digits of the integer part, counted from the right.
  #
  # @return [String] Commified number.
  def commify
    group = /(^[-+]?\d+?(?=(?>(?:\d{3})+)(?!\d))|\G\d{3}(?=\d))/

    # The whole pattern is capture group 1, so appending a comma to the match
    # is the same as substituting '\1,'.
    to_s.gsub(group) { |digits| "#{digits}," }
  end
end
36
+
# Extension of String with a lenient numeric conversion.
class String
  # Convert the string to an Integer or a Float if applicable.
  #
  # The string is parsed as a base 10 integer first, so zero-padded values
  # such as "08" become the Integer 8 instead of failing an octal check and
  # falling through to Float. The value returned is the one that was parsed,
  # so the result never differs from what was validated.
  #
  # @return [Integer, Float, String] Integer if the string is a decimal
  #   integer, otherwise Float if Kernel#Float accepts it, otherwise self.
  def to_num
    Integer(self, 10)
  rescue ArgumentError
    begin
      Float(self)
    rescue ArgumentError
      self
    end
  end
end
51
+
52
+ # Namespace for BioDSL.
53
+ module BioDSL
54
+ require 'pp'
55
+ require 'BioDSL/cary'
56
+ require 'BioDSL/commands'
57
+ require 'BioDSL/debug'
58
+ require 'BioDSL/helpers'
59
+ require 'BioDSL/seq'
60
+ require 'BioDSL/config'
61
+ require 'BioDSL/hamming'
62
+ require 'BioDSL/version'
63
+ require 'BioDSL/filesys'
64
+ require 'BioDSL/csv'
65
+ require 'BioDSL/fork'
66
+ require 'BioDSL/html_report'
67
+ require 'BioDSL/pipeline'
68
+ require 'BioDSL/fasta'
69
+ require 'BioDSL/fastq'
70
+ require 'BioDSL/math'
71
+ require 'BioDSL/mummer'
72
+ require 'BioDSL/taxonomy'
73
+ require 'BioDSL/tmp_dir'
74
+ require 'BioDSL/serializer'
75
+ require 'BioDSL/stream'
76
+ require 'BioDSL/test'
77
+ require 'BioDSL/usearch'
78
+ require 'BioDSL/verbose'
79
+ end
80
+
81
+ BP = BioDSL::Pipeline # Module alias for irb short hand
@@ -0,0 +1,105 @@
1
+ #!/usr/bin/env ruby
2
+ $LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', '..', '..')
3
+
4
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
5
+ # #
6
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
7
+ # #
8
+ # This program is free software; you can redistribute it and/or #
9
+ # modify it under the terms of the GNU General Public License #
10
+ # as published by the Free Software Foundation; either version 2 #
11
+ # of the License, or (at your option) any later version. #
12
+ # #
13
+ # This program is distributed in the hope that it will be useful, #
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
16
+ # GNU General Public License for more details. #
17
+ # #
18
+ # You should have received a copy of the GNU General Public License #
19
+ # along with this program; if not, write to the Free Software #
20
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
21
+ # USA. #
22
+ # #
23
+ # http://www.gnu.org/copyleft/gpl.html #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+ # #
27
+ # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
28
+ # #
29
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
30
+
31
+ require 'test/helper'
32
+
33
+ # Test class for AddKey.
34
+ class TestAddKey < Test::Unit::TestCase
35
+ def setup
36
+ @input, @output = BioDSL::Stream.pipe
37
+ @input2, @output2 = BioDSL::Stream.pipe
38
+
39
+ @output.write(one: 1, two: 2, three: 3)
40
+ @output.write(SEQ_NAME: 'test1', SEQ: 'atcg', SEQ_LEN: 4)
41
+ @output.write(SEQ_NAME: 'test2', SEQ: 'gtac', SEQ_LEN: 4)
42
+ @output.close
43
+
44
+ @p = BioDSL::Pipeline.new
45
+ end
46
+
47
+ test 'BioDSL::Pipeline#add_key with disallowed option raises' do
48
+ assert_raise(BioDSL::OptionError) { @p.add_key(foo: 'bar') }
49
+ end
50
+
51
+ test 'BioDSL::Pipeline#add_key with value and prefix options raise' do
52
+ assert_raise(BioDSL::OptionError) do
53
+ @p.add_key(key: 'SEQ_NAME', value: 'foobar', prefix: 'foo')
54
+ end
55
+ end
56
+
57
+ test 'BioDSL::Pipeline#add_key with allowed options don\'t raise' do
58
+ assert_nothing_raised { @p.add_key(key: 'SEQ_NAME', value: 'fobar') }
59
+ end
60
+
61
+ test 'BioDSL::Pipeline#add_key status returns correctly' do
62
+ @p.add_key(key: 'SEQ_NAME', value: 'fobar').
63
+ run(input: @input, output: @output2)
64
+
65
+ assert_equal(3, @p.status.last[:records_in])
66
+ assert_equal(3, @p.status.last[:records_out])
67
+ end
68
+
69
+ test 'BioDSL::Pipeline#add_key with value returns correctly' do
70
+ @p.add_key(key: 'SEQ_NAME', value: 'fobar').
71
+ run(input: @input, output: @output2)
72
+
73
+ expected = <<-EXP.gsub(/^\s+\|/, '')
74
+ |{:one=>1, :two=>2, :three=>3, :SEQ_NAME=>"fobar"}
75
+ |{:SEQ_NAME=>"fobar", :SEQ=>"atcg", :SEQ_LEN=>4}
76
+ |{:SEQ_NAME=>"fobar", :SEQ=>"gtac", :SEQ_LEN=>4}
77
+ EXP
78
+
79
+ assert_equal(expected, collect_result)
80
+ end
81
+
82
+ test 'BioDSL::Pipeline#add_key with empty prefix returns correctly' do
83
+ @p.add_key(key: 'SEQ_NAME', prefix: '').run(input: @input, output: @output2)
84
+ expected = <<-EXP.gsub(/^\s+\|/, '')
85
+ |{:one=>1, :two=>2, :three=>3, :SEQ_NAME=>"0"}
86
+ |{:SEQ_NAME=>"1", :SEQ=>"atcg", :SEQ_LEN=>4}
87
+ |{:SEQ_NAME=>"2", :SEQ=>"gtac", :SEQ_LEN=>4}
88
+ EXP
89
+
90
+ assert_equal(expected, collect_result)
91
+ end
92
+
93
+ test 'BioDSL::Pipeline#add_key with prefix returns correctly' do
94
+ @p.add_key(key: 'SEQ_NAME', prefix: 'ID_').
95
+ run(input: @input, output: @output2)
96
+
97
+ expected = <<-EXP.gsub(/^\s+\|/, '')
98
+ |{:one=>1, :two=>2, :three=>3, :SEQ_NAME=>"ID_0"}
99
+ |{:SEQ_NAME=>"ID_1", :SEQ=>"atcg", :SEQ_LEN=>4}
100
+ |{:SEQ_NAME=>"ID_2", :SEQ=>"gtac", :SEQ_LEN=>4}
101
+ EXP
102
+
103
+ assert_equal(expected, collect_result)
104
+ end
105
+ end