BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,226 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Create taxonomy index from sequences in the stream.
30
+ #
31
+ # +index_taxonomy+ is used to create a taxonomy index to allow subsequent
32
+ # taxonomic classification with +classify_seq+. The records with taxonomic
33
+ # information must contain :SEQ_NAME and :SEQ keys where the :SEQ_NAME value
34
+ # must be formatted with an initial ID number followed by a space and then the
35
+ # taxonomy string progressing from kingdom to species level. Only the
36
+ # following levels are accepted:
37
+ #
38
+ # * K - kingdom
39
+ # * P - phylum
40
+ # * C - class
41
+ # * O - order
42
+ # * F - family
43
+ # * G - genus
44
+ # * S - species
45
+ #
46
+ # Truncated taxonomic strings are allowed, e.g. a string that stops at family
47
+ # level. Below is an example of a full taxonomic string:
48
+ #
49
+ # 32 K#Bacteria;P#Actinobacteria;C#Actinobacteria;O#Acidimicrobiales; \
50
+ # F#Acidimicrobiaceae;G#Ferrimicrobium;S#Ferrimicrobium acidiphilum
51
+ #
52
+ # The resulting index consists of the following files (here using the default
53
+ # "taxonomy" as prefix) which are saved to a specified +output_dir+:
54
+ #
55
+ # * taxonomy_tax_index.dat - return node for a given node id.
56
+ # * taxonomy_kmer_index.dat - return list of node ids for a given level and
57
+ # kmer.
58
+ #
59
+ # The index is constructed by breaking the sequences into kmers of a given
60
+ # kmer_size and using a given step_size:
61
+ #
62
+ # Example FASTA entry:
63
+ #
64
+ # >2 K#Bacteria;P#Proteobacteria;C#Gammaproteobacteria;O#Vibrionales; \
65
+ # F#Vibrionaceae;G#Vibrio;S#Vibrio
66
+ # UCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACAAUGGGCGCAAGCCUGAUGCAGCCAUGCCGCGUGUAUGA
67
+ #
68
+ # This sequence is broken down to a list of oligos using the default kmer_size
69
+ # and step_size of 8 and 1, respectively:
70
+ #
71
+ # UCCUACGG
72
+ # CCUACGGG
73
+ # CUACGGGA
74
+ # UACGGGAG
75
+ # ACGGGAGG
76
+ # ...
77
+ #
78
+ # Oligos containing ambiguity codes are skipped. Each oligo is encoded as a
79
+ # kmer (integer) by encoding two bits per nucleotide:
80
+ #
81
+ # * A = 00
82
+ # * U = 01
83
+ # * C = 10
84
+ # * G = 11
85
+ #
86
+ # E.g. UCCUACGG = 0110100100101111 = 26927
87
+ #
88
+ # For each node in the tree a vector is kept containing information of all
89
+ # observed oligos for that particular node. Thus all child nodes contain a
90
+ # subset of oligos compared to the parent node. Finally, the tree is saved to
91
+ # files.
92
+ #
93
+ # It should be noted that the speed and accuracy of the classification are
94
+ # strongly dependent on the size and quality of the taxonomic database used
95
+ # (RDP, GreenGenes or Silva) and for a particular amplicon it is strongly
96
+ # recommended to create a slice from the database alignment matching the
97
+ # amplicon.
98
+ #
99
+ # == Usage
100
+ #
101
+ # index_taxonomy(<output_dir: <dir>>[, kmer_size: <uint>
102
+ # [, step_size: <uint>[, prefix: <string>
103
+ # [, force: <bool>]]]])
104
+ #
105
+ # === Options
106
+ #
107
+ # * output_dir: <dir> - Output directory to contain index files.
108
+ # * kmer_size: <uint> - Size of kmer to use (default=8).
109
+ # * step_size: <uint> - Size of steps (default=1).
110
+ # * prefix: <string> - Prefix to use with index file names
111
+ # (default="taxonomy").
112
+ # * force: <bool> - Force overwrite existing index files.
113
+ #
114
+ # == Examples
115
+ #
116
+ # BP.new.
117
+ # read_fasta(input: "RDP_11_Bacteria.fna").
118
+ # index_taxonomy(output_dir: "RDP_11").
119
+ # run
120
+ class IndexTaxonomy
121
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
122
+ residues_out)
123
+
124
+ # Constructor for IndexTaxonomy.
125
+ #
126
+ # @param options [Hash] Options hash.
127
+ # @option options [String] :output_dir Path to output directory.
128
+ # @option options [String] :prefix Database file name prefix.
129
+ # @option options [Integer] :kmer_size Kmer size to use for indexing.
130
+ # @option options [Integer] :step_size Step size to use for indexing.
131
+ # @option options [Boolean] :force Flag for force-overwriting output files.
132
+ #
133
+ # @return [IndexTaxonomy] Instance of class.
134
+ def initialize(options)
135
+ @options = options
136
+
137
+ defaults
138
+ check_options
139
+ create_output_dir
140
+ check_output_files
141
+
142
+ @index = BioDSL::Taxonomy::Index.new(options)
143
+ end
144
+
145
+ # Return command lambda for index_taxonomy.
146
+ #
147
+ # @return [Proc] Command lambda.
148
+ def lmb
149
+ lambda do |input, output, status|
150
+ status_init(status, STATS)
151
+
152
+ input.each do |record|
153
+ @status[:records_in] += 1
154
+
155
+ add_to_index(record) if record[:SEQ_NAME] && record[:SEQ]
156
+
157
+ output << record
158
+ @status[:records_out] += 1
159
+ end
160
+
161
+ @index.save
162
+ end
163
+ end
164
+
165
+ private
166
+
167
+ # Check options.
168
+ def check_options
169
+ options_allowed(@options, :output_dir, :kmer_size, :step_size, :prefix,
170
+ :force)
171
+ options_required(@options, :output_dir)
172
+ options_allowed_values(@options, force: [nil, true, false])
173
+ options_files_exist_force(@options, :report)
174
+ options_assert(@options, ':kmer_size > 0')
175
+ options_assert(@options, ':kmer_size <= 12')
176
+ options_assert(@options, ':step_size > 0')
177
+ options_assert(@options, ':step_size <= 12')
178
+ end
179
+
180
+ # Set the default options hash values.
181
+ def defaults
182
+ @options[:prefix] ||= 'taxonomy'
183
+ @options[:kmer_size] ||= 8
184
+ @options[:step_size] ||= 1
185
+ end
186
+
187
+ # Create the output directory specified in the options hash if this does not
188
+ # already exist.
189
+ def create_output_dir
190
+ return if File.exist?(@options[:output_dir])
191
+
192
+ FileUtils.mkdir_p(@options[:output_dir])
193
+ end
194
+
195
+ # Check if the output files already exist and throw an exception if so and
196
+ # the force option is not used.
197
+ #
198
+ # @raise [BioDSL::OptionError] If file exists and force option not used.
199
+ def check_output_files
200
+ files = [
201
+ File.join(@options[:output_dir], "#{@options[:prefix]}_tax_index.dat"),
202
+ File.join(@options[:output_dir], "#{@options[:prefix]}_kmer_index.dat")
203
+ ]
204
+
205
+ files.each do |file|
206
+ next unless File.exist? file
207
+
208
+ unless @options[:force]
209
+ msg = "File exists: #{file} - use 'force: true' to overwrite"
210
+ fail BioDSL::OptionError, msg
211
+ end
212
+ end
213
+ end
214
+
215
+ # Add to the taxonomy index the sequence information from a given record.
216
+ #
217
+ # @param record [Hash] BioDSL record with sequence info.
218
+ def add_to_index(record)
219
+ @status[:sequences_in] += 1
220
+
221
+ _, seq_name = record[:SEQ_NAME].split(' ', 2)
222
+
223
+ @index.add(BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ]))
224
+ end
225
+ end
226
+ end
@@ -0,0 +1,175 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Mask sequences in the stream based on quality scores.
30
+ #
31
+ # +mask_seq+ masks sequences in the stream using either hard masking or
32
+ # soft masking (default). Hard masking is replacing residues with
33
+ # corresponding quality score below a specified +quality_min+ with an N,
34
+ # while soft is replacing such residues with lower case. The sequences are
35
+ # values to SEQ keys and the quality scores are values to SCORES keys. The
36
+ # SCORES are encoded as ranges of ASCII characters from '!' to 'I'
37
+ # indicating scores from 0 to 40.
38
+ #
39
+ # == Usage
40
+ #
41
+ # mask_seq([quality_min: <uint>[, mask: <:soft|:hard>]])
42
+ #
43
+ # === Options
44
+ #
45
+ # * quality_min: <uint> - Minimum quality (default=20).
46
+ # * mask: <string> - Soft or Hard mask (default=soft).
47
+ #
48
+ # == Examples
49
+ #
50
+ # Consider the following FASTQ entry in the file test.fq:
51
+ #
52
+ # @HWI-EAS157_20FFGAAXX:2:1:888:434
53
+ # TTGGTCGCTCGCTCCGCGACCTCAGATCAGACGTGGGCGAT
54
+ # +HWI-EAS157_20FFGAAXX:2:1:888:434
55
+ # !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
56
+ #
57
+ # We can read in this sequence using +read_fastq+ and then soft mask the
58
+ # sequence with mask_seq like this:
59
+ #
60
+ # BP.new.read_fastq(input: "test.fq").mask_seq.dump.run
61
+ #
62
+ # {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
63
+ # :SEQ=>"ttggtcgctcgctccgcgacCTCAGATCAGACGTGGGCGAT",
64
+ # :SEQ_LEN=>41,
65
+ # :SCORES=>"!\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI"}
66
+ #
67
+ # Using the +quality_min+ option we can change the cutoff:
68
+ #
69
+ # BP.new.read_fastq(input: "test.fq").mask_seq(quality_min: 25).dump.run
70
+ #
71
+ # {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
72
+ # :SEQ=>"ttggtcgctcgctccgcgacctcagATCAGACGTGGGCGAT",
73
+ # :SEQ_LEN=>41,
74
+ # :SCORES=>"!\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI"}
75
+ #
76
+ # Using the +mask+ option for hard masking:
77
+ #
78
+ # BP.new.read_fastq(input: "test.fq").mask_seq(mask: :hard).dump.run
79
+ #
80
+ # {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
81
+ # :SEQ=>"NNNNNNNNNNNNNNNNNNNNCTCAGATCAGACGTGGGCGAT",
82
+ # :SEQ_LEN=>41,
83
+ # :SCORES=>"!\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI"}
84
+ class MaskSeq
85
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
86
+ residues_out masked)
87
+
88
+ # Constructor for MaskSeq.
89
+ #
90
+ # @param options [Hash] Options hash.
91
+ # @option options [Integer] :quality_min Minimum quality score.
92
+ # @option options [Symbol,String] :mask Mask scheme.
93
+ #
94
+ # @return [MaskSeq] Instance of MaskSeq.
95
+ def initialize(options)
96
+ @options = options
97
+
98
+ check_options
99
+ defaults
100
+
101
+ @mask = options[:mask].to_sym
102
+ end
103
+
104
+ # Return command lambda for mask_seq.
105
+ #
106
+ # @return [Proc] command lambda.
107
+ def lmb
108
+ lambda do |input, output, status|
109
+ status_init(status, STATS)
110
+
111
+ input.each do |record|
112
+ @status[:records_in] += 1
113
+
114
+ mask_seq(record) if record[:SEQ] && record[:SCORES]
115
+
116
+ output << record
117
+
118
+ @status[:records_out] += 1
119
+ end
120
+
121
+ @status[:masked_percent] =
122
+ (100 * @status[:masked].to_f / @status[:residues_in]).round(2)
123
+ end
124
+ end
125
+
126
+ private
127
+
128
+ # Check options.
129
+ def check_options
130
+ options_allowed(@options, :quality_min, :mask)
131
+ options_allowed_values(@options, mask: [:soft, :hard, 'soft', 'hard'])
132
+ options_assert(@options, ':quality_min >= 0')
133
+ options_assert(@options, ':quality_min <= 40')
134
+ end
135
+
136
+ # Set default options.
137
+ def defaults
138
+ @options[:quality_min] ||= 20
139
+ @options[:mask] ||= :soft
140
+ end
141
+
142
+ # Mask sequence in given record.
143
+ #
144
+ # @param record [Hash] BioDSL record.
145
+ def mask_seq(record)
146
+ entry = BioDSL::Seq.new_bp(record)
147
+
148
+ @status[:sequences_in] += 1
149
+ @status[:residues_in] += entry.length
150
+
151
+ @mask == :soft ? mask_seq_soft(entry) : mask_seq_hard(entry)
152
+
153
+ @status[:sequences_out] += 1
154
+ @status[:residues_out] += entry.length
155
+
156
+ record.merge! entry.to_bp
157
+ end
158
+
159
+ # Soft mask sequences in given entry.
160
+ #
161
+ # @param entry [BioDSL::seq] sequences entry.
162
+ def mask_seq_soft(entry)
163
+ entry.mask_seq_soft!(@options[:quality_min])
164
+ @status[:masked] += entry.seq.count('a-z')
165
+ end
166
+
167
+ # Hard mask sequences in given entry.
168
+ #
169
+ # @param entry [BioDSL::seq] sequences entry.
170
+ def mask_seq_hard(entry)
171
+ entry.mask_seq_hard!(@options[:quality_min])
172
+ @status[:masked] += entry.seq.count('N')
173
+ end
174
+ end
175
+ end
@@ -0,0 +1,168 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Calculate the mean or local mean of quality SCORES in the stream.
30
+ #
31
+ # +mean_scores+ calculates either the global or local mean value of quality
32
+ # SCORES in the stream. The quality SCORES are encoded Phred style in
33
+ # character string.
34
+ #
35
+ # The global (default) behaviour calculates the SCORES_MEAN as the sum of all
36
+ # the scores over the length of the SCORES string.
37
+ #
38
+ # The local means SCORES_MEAN_LOCAL are calculated using means from a sliding
39
+ # window, where the smallest mean is returned.
40
+ #
41
+ # Thus, subquality records, with either an overall low mean quality or with
42
+ # local dip in quality, can be filtered using +grab+.
43
+ #
44
+ # == Usage
45
+ #
46
+ # mean_scores([local: <bool>[, window_size: <uint>]])
47
+ #
48
+ # === Options
49
+ #
50
+ # * local: <bool> - Calculate local mean score (default=false).
51
+ # * window_size: <uint> - Size of sliding window (default=5).
52
+ #
53
+ # == Examples
54
+ #
55
+ # Consider the following FASTQ entry in the file test.fq:
56
+ #
57
+ # @HWI-EAS157_20FFGAAXX:2:1:888:434
58
+ # TTGGTCGCTCGCTCGACCTCAGATCAGACGTGG
59
+ # +
60
+ # BCDEFGHIIIIIII,,,,,IFFIIIIIIIIIII
61
+ #
62
+ # The values of the scores in decimal are:
63
+ #
64
+ # SCORES: 33;34;35;36;37;38;39;40;40;40;40;40;40;40;11;11;11;11;11;40;37;
65
+ # 37;40;40;40;40;40;40;40;40;40;40;40;
66
+ #
67
+ # To calculate the mean score do:
68
+ #
69
+ # BP.new.read_fastq(input: "test.fq").mean_scores.dump.run
70
+ #
71
+ # {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
72
+ # :SEQ=>"TTGGTCGCTCGCTCGACCTCAGATCAGACGTGG",
73
+ # :SEQ_LEN=>33,
74
+ # :SCORES=>"BCDEFGHIIIIIII,,,,,IFFIIIIIIIIIII",
75
+ # :SCORES_MEAN=>34.58}
76
+ #
77
+ # To calculate local means for a sliding window, do:
78
+ #
79
+ # BP.new.read_fastq(input: "test.fq").mean_scores(local: true).dump.run
80
+ #
81
+ # {:SEQ_NAME=>"HWI-EAS157_20FFGAAXX:2:1:888:434",
82
+ # :SEQ=>"TTGGTCGCTCGCTCGACCTCAGATCAGACGTGG",
83
+ # :SEQ_LEN=>33,
84
+ # :SCORES=>"BCDEFGHIIIIIII,,,,,IFFIIIIIIIIIII",
85
+ # :SCORES_MEAN_LOCAL=>11.0}
86
+ #
87
+ # Which indicates a local minimum was located at the stretch of ,,,,, =
88
+ # (11+11+11+11+11) / 5 = 11.0
89
+ class MeanScores
90
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
91
+ residues_out min_mean max_mean mean_mean)
92
+
93
+ # Constructor for MeanScores.
94
+ #
95
+ # @param options [Hash] Options hash.
96
+ # @option options [Boolean] :local
97
+ # @option options [Fixnum] :window_size
98
+ #
99
+ # @return [MeanScores] Class instance.
100
+ def initialize(options)
101
+ @options = options
102
+ @min = Float::INFINITY
103
+ @max = 0
104
+ @sum = 0
105
+ @count = 0
106
+
107
+ check_options
108
+ defaults
109
+ end
110
+
111
+ # Return command lambda for mean_scores.
112
+ #
113
+ # @return [Proc] Command lambda.
114
+ def lmb
115
+ lambda do |input, output, status|
116
+ status_init(status, STATS)
117
+
118
+ input.each do |record|
119
+ @status[:records_in] += 1
120
+
121
+ calc_mean(record) if record[:SCORES] && record[:SCORES].length > 0
122
+
123
+ output << record
124
+
125
+ @status[:records_out] += 1
126
+ end
127
+
128
+ @status[:mean_mean] = (@sum.to_f / @count).round(2)
129
+ end
130
+ end
131
+
132
+ private
133
+
134
+ # Check options
135
+ def check_options
136
+ options_allowed(@options, :local, :window_size)
137
+ options_tie(@options, window_size: :local)
138
+ options_allowed_values(@options, local: [true, false])
139
+ options_assert(@options, ':window_size > 1')
140
+ end
141
+
142
+ # Set default options.
143
+ def defaults
144
+ @options[:window_size] ||= 5
145
+ end
146
+
147
+ # Calculate the mean score for a given record and record
148
+ # count, sum, min and max.
149
+ #
150
+ # @param record [Hash] BioDSL record.
151
+ def calc_mean(record)
152
+ entry = BioDSL::Seq.new_bp(record)
153
+
154
+ if @options[:local]
155
+ mean = entry.scores_mean_local(@options[:window_size]).round(2)
156
+ record[:SCORES_MEAN_LOCAL] = mean
157
+ else
158
+ mean = entry.scores_mean.round(2)
159
+ record[:SCORES_MEAN] = mean
160
+ end
161
+
162
+ @sum += mean
163
+ @status[:min_mean] = mean if mean < @status[:min_mean]
164
+ @status[:max_mean] = mean if mean > @status[:max_mean]
165
+ @count += 1
166
+ end
167
+ end
168
+ end