BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
data/lib/BioDSL/seq.rb ADDED
@@ -0,0 +1,742 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part of BioDSL (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
+ module BioDSL
28
+ require 'narray'
29
+ require 'BioDSL/seq/ambiguity'
30
+ require 'BioDSL/seq/assemble'
31
+ require 'BioDSL/seq/digest'
32
+ require 'BioDSL/seq/kmer'
33
+ require 'BioDSL/seq/translate'
34
+ require 'BioDSL/seq/trim'
35
+ require 'BioDSL/seq/backtrack'
36
+ require 'BioDSL/seq/dynamic'
37
+ require 'BioDSL/seq/homopolymer'
38
+ require 'BioDSL/seq/levenshtein'
39
+
40
+ # Error class for all exceptions to do with Seq.
41
+ class SeqError < StandardError; end
42
+
43
+ class Seq
44
+ # Residue alphabets
45
+ DNA = %w[a t c g]
46
+ RNA = %w[a u c g]
47
+ PROTEIN = %w[f l s y c w p h q r i m t n k v a d e g]
48
+ INDELS = %w[. - _ ~]
49
+
50
+ # Quality scores bases
51
+ SCORE_BASE = 33
52
+ SCORE_MIN = 0
53
+ SCORE_MAX = 40
54
+
55
+ include BioDSL::Digest
56
+ include BioDSL::Homopolymer
57
+ include BioDSL::Translate
58
+ include BioDSL::Trim
59
+ include BioDSL::Kmer
60
+ include BioDSL::BackTrack
61
+
62
+ attr_accessor :seq_name, :seq, :type, :qual
63
+
64
+ # Class method to instantiate a new Sequence object given
65
+ # a Biopiece record.
66
def self.new_bp(record)
  # Builds a Seq from a record hash by reading the :SEQ_NAME, :SEQ,
  # :SEQ_TYPE and :SCORES keys. :SEQ_TYPE is converted to a Symbol when
  # present; otherwise the type is left as nil.
  raw_type = record[:SEQ_TYPE]

  new(
    seq_name: record[:SEQ_NAME],
    seq:      record[:SEQ],
    type:     (raw_type ? raw_type.to_sym : nil),
    qual:     record[:SCORES]
  )
end
74
+
75
+ # Class method that generates all possible oligos of a specified length and type.
76
def self.generate_oligos(length, type)
  # Returns an Array with every oligo of the given length over the
  # alphabet selected by type (:dna, :rna or :protein).
  # Raises SeqError for a length <= 0 or an unknown type.
  raise SeqError, "Cannot generate oligos of zero or negative length: #{length}" if length <= 0

  alphabet =
    case type.downcase
    when :dna     then DNA
    when :rna     then RNA
    when :protein then PROTEIN
    else raise SeqError, "Unknown sequence type: #{type}"
    end

  # Each round extends every oligo built so far by one residue.
  (1 .. length).inject([""]) do |oligos, _round|
    oligos.flat_map do |oligo|
      alphabet.map { |residue| oligo + residue }
    end
  end
end
103
+
104
def self.check_name_pair(entry1, entry2)
  # Verifies that two entries carry the same base name. The base name is
  # the part before " <digit>:" or before a trailing "/<digit>".
  # Raises SeqError if a name matches neither pattern or the base names differ.
  base_name = lambda do |entry|
    case entry.seq_name
    when /^([^ ]+) \d:/ then $1
    when /^(.+)\/\d$/   then $1
    else raise SeqError, "Could not match sequence name: #{entry.seq_name}"
    end
  end

  name1 = base_name.call(entry1)
  name2 = base_name.call(entry2)

  raise SeqError, "Name mismatch: #{name1} != #{name2}" if name1 != name2
end
125
+
126
+ # Initialize a sequence object with the following options:
127
+ # - :seq_name Name of the sequence.
128
+ # - :seq The sequence.
129
+ # - :type The sequence type - DNA, RNA, or protein
130
+ # - :qual An Illumina type quality scores string.
131
def initialize(options = {})
  # Stores the :seq_name, :seq, :type and :qual options on the object.
  # Raises SeqError when both seq and qual are given with different lengths.
  @seq_name = options[:seq_name]
  @seq      = options[:seq]
  @type     = options[:type]
  @qual     = options[:qual]

  return unless @seq && @qual
  return if @seq.length == @qual.length

  raise SeqError, "Sequence length and score length mismatch:" \
                  "#{@seq.length} != #{@qual.length}"
end
142
+
143
+ # Method that guesses and returns the sequence type
144
+ # by inspecting the first 100 residues.
145
def type_guess
  # Looks at the first 100 residues (lowercased): any of f, l, p, q, i, e
  # means :protein, otherwise any u means :rna, otherwise :dna.
  # Raises SeqError when the sequence is nil.
  raise SeqError, "Guess failed: sequence is nil" if self.seq.nil?

  sample = self.seq[0 ... 100].downcase

  return :protein if sample =~ /[flpqie]/
  return :rna     if sample =~ /[u]/

  :dna
end
154
+
155
+ # Method that guesses and sets the sequence type
156
+ # by inspecting the first 100 residues.
157
def type_guess!
  # Stores the guessed type on this entry and returns the entry itself.
  guessed = type_guess
  self.type = guessed
  self
end
161
+
162
+ # Returns the length of a sequence.
163
def length
  # A nil sequence counts as length 0.
  return 0 if self.seq.nil?

  self.seq.length
end
166
+
167
+ alias :len :length
168
+
169
+ # Returns the number of indels in a sequence.
170
def indels
  # Counts every character of the sequence that is listed in INDELS.
  indel_chars = Regexp.escape(INDELS.join(""))

  self.seq.scan(/[#{indel_chars}]/).size
end
174
+
175
+ # Method to remove indels from seq and qual if qual.
176
def indels_remove
  # Removes all INDELS characters from the sequence and, when quality
  # scores are present, drops the scores at the same positions so seq and
  # qual stay aligned. Returns self.
  if self.qual.nil?
    # String#delete! takes a character-set spec rather than a regex.
    # NOTE(review): the backslashes added by Regexp.escape appear to act as
    # escapes in that spec (keeping "-" literal instead of forming a range) -
    # confirm against the String#delete documentation.
    self.seq.delete!(Regexp.escape(INDELS.join('')))
  else
    na_seq  = NArray.to_na(self.seq, "byte")
    na_qual = NArray.to_na(self.qual, "byte")
    mask    = NArray.byte(self.length)

    # Accumulate a per-position count of matches against each indel character.
    INDELS.each do |c|
      mask += na_seq.eq(c.ord)
    end

    # Invert: keep the positions that matched no indel character.
    mask = mask.eq(0)

    self.seq  = na_seq[mask].to_s
    self.qual = na_qual[mask].to_s
  end

  self
end
196
+
197
+ # Method that returns true if the sequence type is DNA.
198
def is_dna?
  # True only when the stored type is exactly :dna.
  type == :dna
end
201
+
202
+ # Method that returns true if the sequence type is RNA.
203
def is_rna?
  # True only when the stored type is exactly :rna.
  type == :rna
end
206
+
207
+ # Method that returns true if the sequence type is protein.
208
def is_protein?
  # True only when the stored type is exactly :protein.
  type == :protein
end
211
+
212
+ # Method to transcribe DNA to RNA.
213
def to_rna
  # Transcribes the sequence in place (T -> U, t -> u), sets the type to
  # :rna and returns the sequence string.
  # Raises SeqError for an empty sequence or when the type is not :dna.
  raise SeqError, "Cannot transcribe 0 length sequence" if self.length == 0
  raise SeqError, "Cannot transcribe sequence type: #{self.type}" unless self.is_dna?

  self.type = :rna

  # String#tr! returns nil when nothing was replaced, so return the
  # sequence explicitly instead of relying on tr!'s return value.
  self.seq.tr!('Tt', 'Uu')
  self.seq
end
219
+
220
+ # Method to reverse-transcribe RNA to DNA.
221
def to_dna
  # Reverse-transcribes the sequence in place (U -> T, u -> t), sets the
  # type to :dna and returns the sequence string.
  # Raises SeqError for an empty sequence or when the type is not :rna.
  raise SeqError, "Cannot reverse-transcribe 0 length sequence" if self.length == 0
  raise SeqError, "Cannot reverse-transcribe sequence type: #{self.type}" unless self.is_rna?

  self.type = :dna

  # String#tr! returns nil when nothing was replaced, so return the
  # sequence explicitly instead of relying on tr!'s return value.
  self.seq.tr!('Uu', 'Tt')
  self.seq
end
227
+
228
+ # Method that given a Seq entry returns a BioDSL record (a hash).
229
def to_bp
  # Builds a record hash containing only the fields that are set:
  # :SEQ_NAME, :SEQ plus :SEQ_LEN, and :SCORES.
  record = {}
  record[:SEQ_NAME] = seq_name if seq_name

  if seq
    record[:SEQ]     = seq
    record[:SEQ_LEN] = seq.length
  end

  record[:SCORES] = qual if qual
  record
end
237
+
238
+ # Method that given a Seq entry returns a FASTA entry (a string).
239
def to_fasta(wrap = nil)
  # Returns the entry as a FASTA formatted string. When wrap is given, the
  # sequence is broken into lines of that many residues.
  # Raises SeqError when seq_name or seq is nil or empty.
  raise SeqError, "Missing seq_name" if self.seq_name.nil? or self.seq_name == ''
  raise SeqError, "Missing seq" if self.seq.nil? or self.seq.empty?

  name     = self.seq_name.to_s
  residues = self.seq.to_s

  unless wrap.nil?
    # String#to_s returns the receiver itself, so wrapping must use the
    # non-destructive gsub/chomp; otherwise the line breaks would be
    # written into this entry's own sequence.
    residues = residues.gsub(/(.{#{wrap}})/) { |chunk| chunk + $/ }.chomp
  end

  ">#{name}#{$/}#{residues}#{$/}"
end
256
+
257
+ # Method that given a Seq entry returns a FASTQ entry (a string).
258
def to_fastq
  # Returns the entry as a four-line FASTQ formatted string.
  # Raises SeqError when seq_name, seq or qual is nil.
  raise SeqError, "Missing seq_name" if self.seq_name.nil?
  raise SeqError, "Missing seq"      if self.seq.nil?
  raise SeqError, "Missing qual"     if self.qual.nil?

  lines = ["@#{self.seq_name.to_s}", self.seq.to_s, "+", self.qual.to_s]

  lines.map { |line| "#{line}#{$/}" }.join("")
end
269
+
270
+ # Method that generates a unique key for a
271
+ # DNA sequence and returns this key as a Fixnum.
272
def to_key
  # Packs the upcased sequence into an integer, two bits per residue
  # (A=0, C=1, G=2, T=3). Raises SeqError on any other residue.
  codes = { 'A' => 0, 'C' => 1, 'G' => 2, 'T' => 3 }

  self.seq.upcase.each_char.inject(0) do |key, residue|
    code = codes[residue]
    raise SeqError, "Bad residue: #{residue}" if code.nil?

    (key << 2) | code
  end
end
289
+
290
+ # Method to reverse the sequence.
291
def reverse
  # Returns a new Seq with reversed sequence and, when present, reversed
  # quality scores; name and type are carried over.
  reversed_qual = self.qual && self.qual.reverse

  Seq.new(
    seq_name: self.seq_name,
    seq:      self.seq.reverse,
    type:     self.type,
    qual:     reversed_qual
  )
end
301
+
302
+ # Method to reverse the sequence in place.
303
def reverse!
  # Reverses the sequence and, when present, the quality scores in place.
  # Returns self.
  self.seq.reverse!

  scores = self.qual
  scores.reverse! if scores

  self
end
308
+
309
+ # Method that complements sequence including ambiguity codes.
310
def complement
  # Returns a new Seq holding the complemented sequence (ambiguity codes
  # included); name, type and qual are carried over.
  # Raises SeqError for an empty sequence or a type other than :dna/:rna.
  raise SeqError, "Cannot complement 0 length sequence" if self.length == 0

  complemented = Seq.new(
    seq_name: self.seq_name,
    type:     self.type,
    qual:     self.qual
  )

  source = 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn'

  target =
    if self.is_dna?
      'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn'
    elsif self.is_rna?
      'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn'
    else
      raise SeqError, "Cannot complement sequence type: #{self.type}"
    end

  complemented.seq = self.seq.tr(source, target)
  complemented
end
329
+
330
+ # Method that complements sequence in place including ambiguity codes.
331
def complement!
  # Complements the sequence in place (ambiguity codes included) and
  # returns self.
  # Raises SeqError for an empty sequence or a type other than :dna/:rna.
  raise SeqError, "Cannot complement 0 length sequence" if self.length == 0

  source = 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn'

  target =
    if self.is_dna?
      'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn'
    elsif self.is_rna?
      'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn'
    else
      raise SeqError, "Cannot complement sequence type: #{self.type}"
    end

  self.seq.tr!(source, target)
  self
end
344
+
345
+ # Method to determine the Hamming Distance between
346
+ # two Sequence objects (case insensitive).
347
def hamming_distance(entry, options = {})
  # Delegates to BioDSL::Hamming.distance. Both sequences are upcased
  # first unless options[:ambiguity] is set, in which case they are passed
  # through unchanged.
  own   = self.seq
  other = entry.seq

  unless options[:ambiguity]
    own   = own.upcase
    other = other.upcase
  end

  BioDSL::Hamming.distance(own, other, options)
end
354
+
355
+ # Method to determine the Edit Distance between
356
+ # two Sequence objects (case insensitive).
357
def edit_distance(entry)
  # Delegates to Levenshtein.distance with both sequences as they are.
  # NOTE(review): no case folding happens here; any case insensitivity
  # would have to come from Levenshtein.distance itself - confirm.
  other = entry.seq

  Levenshtein.distance(seq, other)
end
360
+
361
+ # Method that generates a random sequence of a given length and type.
362
def generate(length, type)
  # Fills this entry with a random sequence of the given length drawn from
  # the alphabet for type (:dna, :rna or :protein), sets the type and
  # returns the new sequence string.
  # Raises SeqError for a length <= 0 or an unknown type.
  raise SeqError, "Cannot generate sequence length < 1: #{length}" if length <= 0

  alphabet = { dna: DNA, rna: RNA, protein: PROTEIN }[type]
  raise SeqError, "Unknown sequence type: #{type}" if alphabet.nil?

  residues = Array.new(length) { alphabet[rand(alphabet.size)] }

  self.seq  = residues.join("")
  self.type = type
  self.seq
end
378
+
379
+ # Method to return a new Seq object with shuffled sequence.
380
def shuffle
  # Returns a new Seq whose residues are in random order; name, type and
  # qual are carried over unchanged.
  shuffled = self.seq.chars.shuffle.join

  Seq.new(
    seq_name: self.seq_name,
    seq:      shuffled,
    type:     self.type,
    qual:     self.qual
  )
end
388
+
389
+ # Method to shuffle a sequence randomly inline.
390
def shuffle!
  # Replaces the sequence with a randomly reordered copy and returns self.
  residues = self.seq.chars
  self.seq = residues.shuffle.join
  self
end
394
+
395
+ # Method to add two Seq objects.
396
def +(entry)
  # Returns a new Seq with the two sequences joined. The type is kept only
  # when both entries agree on it, and qual only when both entries have it.
  Seq.new.tap do |joined|
    joined.seq  = self.seq + entry.seq
    joined.type = self.type if self.type == entry.type

    if self.qual and entry.qual
      joined.qual = self.qual + entry.qual
    end
  end
end
403
+
404
+ # Method to concatenate sequence entries.
405
def <<(entry)
  # Appends entry's sequence (and qual, when present) to this entry in
  # place and returns self.
  # Raises SeqError when the types differ or the qual classes differ.
  raise SeqError, "sequences of different types" if self.type != entry.type
  raise SeqError, "qual is missing in one entry" if self.qual.class != entry.qual.class

  self.seq << entry.seq

  appended_qual = entry.qual
  self.qual << appended_qual unless appended_qual.nil?

  self
end
414
+
415
+ # Index method for Seq objects.
416
def [](*args)
  # Returns a new Seq for the slice selected by args, which are passed to
  # String#[] for both seq and qual. An out-of-range slice gives "".
  # The name is duplicated and the type is carried over.
  Seq.new.tap do |slice|
    slice.seq_name = self.seq_name.dup unless self.seq_name.nil?
    slice.seq      = self.seq[*args] || ""
    slice.type     = self.type

    unless self.qual.nil?
      slice.qual = self.qual[*args] || ""
    end
  end
end
425
+
426
+ # Index assignment method for Seq objects.
427
def []=(*args, entry)
  # Overwrites the slice selected by args in seq (and in qual, when this
  # entry has qual) with the same slice taken from entry. Returns self.
  # NOTE(review): the replacement is entry's slice at the same args, not
  # entry's whole sequence - confirm that this is intended.
  replacement_seq = entry.seq[*args]
  self.seq[*args] = replacement_seq

  unless self.qual.nil?
    replacement_qual = entry.qual[*args]
    self.qual[*args] = replacement_qual
  end

  self
end
433
+
434
+ # Method that returns the residue compositions of a sequence in
435
+ # a hash where the key is the residue and the value is the residue
436
+ # count.
437
def composition
  # Counts each upcased residue; residues not seen read as 0 because the
  # returned hash has a default value of 0.
  self.seq.upcase.each_char.each_with_object(Hash.new(0)) do |residue, counts|
    counts[residue] += 1
  end
end
446
+
447
+ # Method that returns the percentage of hard masked residues
448
+ # or N's in a sequence.
449
def hard_mask
  # Percentage (two decimals) of N/n residues relative to the number of
  # non-indel residues.
  masked   = self.seq.upcase.scan("N").size.to_f
  residues = (self.len - self.indels).to_f

  ((masked / residues) * 100).round(2)
end
452
+
453
+ # Method that returns the percentage of soft masked residues
454
+ # or lower cased residues in a sequence.
455
def soft_mask
  # Percentage (two decimals) of lowercase residues relative to the number
  # of non-indel residues.
  masked   = self.seq.scan(/[a-z]/).size.to_f
  residues = (self.len - self.indels).to_f

  ((masked / residues) * 100).round(2)
end
458
+
459
+ # Hard masks sequence residues where the corresponding quality score
460
+ # is below a given cutoff.
461
def mask_seq_hard!(cutoff)
  # Upcases the sequence and replaces with "N" every residue whose score
  # (byte value minus SCORE_BASE) is below cutoff; "-" positions are left
  # alone. Modifies the entry and returns self.
  # Raises SeqError when seq or qual is nil or cutoff lies outside
  # SCORE_MIN .. SCORE_MAX.
  raise SeqError, "seq is nil" if self.seq.nil?
  raise SeqError, "qual is nil" if self.qual.nil?

  unless (SCORE_MIN .. SCORE_MAX).include? cutoff
    raise SeqError, "cutoff value: #{cutoff} out of range #{SCORE_MIN} .. #{SCORE_MAX}"
  end

  na_seq  = NArray.to_na(self.seq.upcase, "byte")
  na_qual = NArray.to_na(self.qual, "byte")

  # 1 where the score is below the cutoff ...
  mask  = (na_qual - SCORE_BASE) < cutoff
  # ... and the residue is not a "-" gap.
  mask *= na_seq.ne("-".ord)

  na_seq[mask] = 'N'.ord

  self.seq = na_seq.to_s

  self
end
477
+
478
+ # Soft masks sequence residues where the corresponding quality score
479
+ # is below a given cutoff. Masked sequence will be lowercased and
480
+ # remaining will be uppercased.
481
def mask_seq_soft!(cutoff)
  # Upcases the sequence and then flips bit 0x20 (the ASCII case bit) of
  # every residue whose score (byte value minus SCORE_BASE) is below
  # cutoff, which lowercases letters; "-" positions are left alone.
  # Modifies the entry and returns self.
  # Raises SeqError when seq or qual is nil or cutoff lies outside
  # SCORE_MIN .. SCORE_MAX.
  raise SeqError, "seq is nil" if self.seq.nil?
  raise SeqError, "qual is nil" if self.qual.nil?

  unless (SCORE_MIN .. SCORE_MAX).include? cutoff
    raise SeqError, "cutoff value: #{cutoff} out of range #{SCORE_MIN} .. #{SCORE_MAX}"
  end

  na_seq  = NArray.to_na(self.seq.upcase, "byte")
  na_qual = NArray.to_na(self.qual, "byte")

  # 1 where the score is below the cutoff ...
  mask  = (na_qual - SCORE_BASE) < cutoff
  # ... and the residue is not a "-" gap.
  mask *= na_seq.ne("-".ord)

  # XOR with the space character (0x20) toggles the case of ASCII letters.
  na_seq[mask] ^= ' '.ord

  self.seq = na_seq.to_s

  self
end
497
+
498
+ # Method that determines if a quality score string can be
499
+ # absolutely identified as base 33.
500
def qual_base33?
  # True when at least one score character lies in the range "!" .. ":".
  !self.qual.match(/[!-:]/).nil?
end
503
+
504
+ # Method that determines if a quality score string may be base 64.
505
def qual_base64?
  # True when at least one score character lies in the range "K" .. "h".
  !self.qual.match(/[K-h]/).nil?
end
508
+
509
+ # Method to determine if a quality score is valid accepting only 0-40 range.
510
def qual_valid?(encoding)
  # Checks the score string against the character range allowed for the
  # given encoding: "!" .. "I" for :base_33, "@" .. "h" for :base_64.
  # Raises SeqError when qual is nil or the encoding is unknown.
  raise SeqError, "Missing qual" if self.qual.nil?

  pattern =
    case encoding
    when :base_33 then /^[!-I]*$/
    when :base_64 then /^[@-h]*$/
    else raise SeqError, "unknown quality score encoding: #{encoding}"
    end

  !self.qual.match(pattern).nil?
end
521
+
522
+ # Method to coerce quality scores to be within the 0-40 range.
523
def qual_coerce!(encoding)
  # Clamps every score byte into the range for the given encoding via
  # qual_coerce_C and returns self: 33 .. 73 ("!" .. "I") for :base_33,
  # 64 .. 104 ("@" .. "h") for :base_64.
  # Raises SeqError when qual is nil or the encoding is unknown.
  raise SeqError, "Missing qual" if self.qual.nil?

  bounds = { base_33: [33, 73], base_64: [64, 104] }

  lowest, highest = bounds.fetch(encoding) do
    raise SeqError, "unknown quality score encoding: #{encoding}"
  end

  qual_coerce_C(self.qual, self.qual.length, lowest, highest)

  self
end
535
+
536
+ # Method to convert quality scores.
537
def qual_convert!(from, to)
  # Converts the score string in place between :base_33 and :base_64 and
  # returns self. Nothing is changed when from and to are the same.
  # Raises SeqError for an unknown encoding or when qual is nil.
  raise SeqError, "unknown quality score encoding: #{from}" unless from == :base_33 or from == :base_64
  raise SeqError, "unknown quality score encoding: #{to}" unless to == :base_33 or to == :base_64
  # Fail with the class's own error, as qual_valid? and qual_coerce! do,
  # instead of a NoMethodError on nil further down.
  raise SeqError, "Missing qual" if self.qual.nil?

  if from == :base_33 and to == :base_64
    qual_convert_C(self.qual, self.qual.length, 31)     # += 64 - 33
  elsif from == :base_64 and to == :base_33
    # Handle negative Solexa values from -5 to -1 (set these to 0).
    qual_coerce_C(self.qual, self.qual.length, 64, 104)
    qual_convert_C(self.qual, self.qual.length, -31)    # -= 64 - 33
  end

  self
end
550
+
551
+ # Method to calculate and return the mean quality score.
552
def scores_mean
  # Mean of the score bytes after subtracting SCORE_BASE.
  # Raises SeqError when qual is nil.
  raise SeqError, "Missing qual in entry" if self.qual.nil?

  scores = NArray.to_na(self.qual, "byte") - SCORE_BASE

  scores.mean
end
560
+
561
+ # Method to calculate and return the min quality score.
562
def scores_min
  # Smallest score byte after subtracting SCORE_BASE.
  # Raises SeqError when qual is nil.
  raise SeqError, "Missing qual in entry" if self.qual.nil?

  scores = NArray.to_na(self.qual, "byte") - SCORE_BASE

  scores.min
end
570
+
571
+ # Method to calculate and return the max quality score.
572
def scores_max
  # Largest score byte after subtracting SCORE_BASE.
  # Raises SeqError when qual is nil.
  raise SeqError, "Missing qual in entry" if self.qual.nil?

  scores = NArray.to_na(self.qual, "byte") - SCORE_BASE

  scores.max
end
580
+
581
+ # Method to run a sliding window of a specified size across a Phred type
582
+ # scores string and calculate for each window the mean score and return
583
+ # the minimum mean score.
584
def scores_mean_local(window_size)
  # Returns the result of scores_mean_local_C for the score string, which
  # slides a window of window_size scores and reports the lowest window mean.
  # Raises SeqError when qual is nil or window_size is not an Integer in
  # 1 .. qual.length.
  raise SeqError, "Missing qual in entry" if self.qual.nil?

  # The C helper divides by window_size and reads window_size bytes up
  # front, so zero or oversized windows must be rejected before the call.
  unless window_size.is_a?(Integer) && window_size.between?(1, self.qual.length)
    raise SeqError, "window size: #{window_size} out of range 1 .. #{self.qual.length}"
  end

  scores_mean_local_C(self.qual, self.qual.length, SCORE_BASE, window_size)
end
589
+
590
+ # Method to find open reading frames (ORFs).
591
def each_orf(options = {})
  # Scans the sequence for open reading frames and either yields each Orf
  # to the given block or, without a block, returns them as an Array.
  #
  # Options:
  #   :size_min     - minimum ORF length in residues (default 0).
  #   :size_max     - maximum ORF length in residues (default sequence length).
  #   :start_codons - comma separated start codons (default "ATG,GTG,AUG,GUG").
  #   :stop_codons  - comma separated stop codons
  #                   (default "TAA,TGA,TAG,UAA,UGA,UAG").
  #   :pick_longest - when truthy, keep only one ORF per stop position.
  size_min     = options[:size_min]     || 0
  size_max     = options[:size_max]     || self.length
  start_codons = options[:start_codons] || "ATG,GTG,AUG,GUG"
  stop_codons  = options[:stop_codons]  || "TAA,TGA,TAG,UAA,UGA,UAG"
  pick_longest = options[:pick_longest]

  orfs    = []
  pos_beg = 0

  # Alternations of the codons; the second argument makes them case insensitive.
  regex_start = Regexp.new(start_codons.split(',').join('|'), true)
  regex_stop  = Regexp.new(stop_codons.split(',').join('|'), true)

  # pos_beg becomes nil (ending the loop) once no further start codon is found.
  while pos_beg and pos_beg < self.length - size_min
    if pos_beg = self.seq.index(regex_start, pos_beg)
      # First stop codon at or after the start, regardless of frame.
      if pos_end = self.seq.index(regex_stop, pos_beg)
        length = (pos_end - pos_beg) + 3

        # Only record the ORF when that stop codon is in frame with the start.
        if (length % 3) == 0
          if size_min <= length and length <= size_max
            subseq = self[pos_beg ... pos_beg + length]

            # The stop position is the last base of the stop codon.
            orfs << Orf.new(subseq, pos_beg, pos_end + 2)
          end
        end
      end

      # Continue the search one base after this start codon.
      pos_beg += 1
    end
  end

  if pick_longest
    orf_hash = {}

    # ORFs were collected in ascending start order, so the first one seen
    # for a stop position is the one with the earliest start.
    orfs.each { |orf| orf_hash[orf.stop] = orf unless orf_hash[orf.stop] }

    orfs = orf_hash.values
  end

  if block_given?
    orfs.each { |orf| yield orf }
  else
    return orfs
  end
end
636
+
637
class Orf
  # Read-only value holder for one open reading frame: the sub-sequence
  # entry plus its start and stop positions.
  attr_reader :entry, :start, :stop

  def initialize(entry, start, stop)
    @entry, @start, @stop = entry, start, stop
  end
end
646
+
647
+ private
648
+
649
+ inline do |builder|
650
+ builder.c %{
651
+ VALUE qual_coerce_C(
652
+ VALUE _qual,
653
+ VALUE _qual_len,
654
+ VALUE _min_value,
655
+ VALUE _max_value
656
+ )
657
+ {
658
+ unsigned char *qual = (unsigned char *) StringValuePtr(_qual);
659
+ unsigned int qual_len = FIX2UINT(_qual_len);
660
+ unsigned int min_value = FIX2UINT(_min_value);
661
+ unsigned int max_value = FIX2UINT(_max_value);
662
+ unsigned int i = 0;
663
+
664
+ for (i = 0; i < qual_len; i++)
665
+ {
666
+ if (qual[i] > max_value) {
667
+ qual[i] = max_value;
668
+ } else if (qual[i] < min_value) {
669
+ qual[i] = min_value;
670
+ }
671
+ }
672
+
673
+ return Qnil;
674
+ }
675
+ }
676
+
677
+ builder.c %{
678
+ VALUE qual_convert_C(
679
+ VALUE _qual,
680
+ VALUE _qual_len,
681
+ VALUE _value
682
+ )
683
+ {
684
+ unsigned char *qual = (unsigned char *) StringValuePtr(_qual);
685
+ unsigned int qual_len = FIX2UINT(_qual_len);
686
+ unsigned int value = FIX2UINT(_value);
687
+ unsigned int i = 0;
688
+
689
+ for (i = 0; i < qual_len; i++)
690
+ {
691
+ qual[i] += value;
692
+ }
693
+
694
+ return Qnil;
695
+ }
696
+ }
697
+
698
+ builder.c %{
699
+ VALUE scores_mean_local_C(
700
+ VALUE _qual,
701
+ VALUE _qual_len,
702
+ VALUE _score_base,
703
+ VALUE _window_size
704
+ )
705
+ {
706
+ unsigned char *qual = (unsigned char *) StringValuePtr(_qual);
707
+ unsigned int qual_len = FIX2UINT(_qual_len);
708
+ unsigned int score_base = FIX2UINT(_score_base);
709
+ unsigned int window_size = FIX2UINT(_window_size);
710
+ unsigned int sum = 0;
711
+ unsigned int i = 0;
712
+ float mean = 0.0;
713
+ float new_mean = 0.0;
714
+
715
+ // fill window
716
+ for (i = 0; i < window_size; i++)
717
+ sum += qual[i] - score_base;
718
+
719
+ mean = sum / window_size;
720
+
721
+ // run window across the rest of the scores
722
+ while (i < qual_len)
723
+ {
724
+ sum += qual[i] - score_base;
725
+ sum -= qual[i - window_size] - score_base;
726
+
727
+ new_mean = sum / window_size;
728
+
729
+ if (new_mean < mean)
730
+ mean = new_mean;
731
+
732
+ i++;
733
+ }
734
+
735
+ return rb_float_new(mean);
736
+ }
737
+ }
738
+ end
739
+ end
740
+ end
741
+
742
+ __END__