BioDSL 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
data/lib/BioDSL/seq.rb ADDED
@@ -0,0 +1,742 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
+ # #
19
+ # http://www.gnu.org/copyleft/gpl.html #
20
+ # #
21
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
+ # #
23
+ # This software is part BioDSL (www.BioDSL.org). #
24
+ # #
25
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
+
27
+ module BioDSL
28
+ require 'narray'
29
+ require 'BioDSL/seq/ambiguity'
30
+ require 'BioDSL/seq/assemble'
31
+ require 'BioDSL/seq/digest'
32
+ require 'BioDSL/seq/kmer'
33
+ require 'BioDSL/seq/translate'
34
+ require 'BioDSL/seq/trim'
35
+ require 'BioDSL/seq/backtrack'
36
+ require 'BioDSL/seq/dynamic'
37
+ require 'BioDSL/seq/homopolymer'
38
+ require 'BioDSL/seq/levenshtein'
39
+
40
+ # Exception type raised by Seq for every sequence-related failure; subclasses StandardError.
41
+ class SeqError < StandardError; end
42
+
43
+ class Seq
44
+ # Residue alphabets (lowercase) used when generating sequences, plus the characters counted and removed as indels.
45
+ DNA = %w[a t c g]
46
+ RNA = %w[a u c g]
47
+ PROTEIN = %w[f l s y c w p h q r i m t n k v a d e g]
48
+ INDELS = %w[. - _ ~]
49
+
50
+ # Quality score settings: SCORE_BASE is subtracted from raw qual bytes to get scores; SCORE_MIN..SCORE_MAX is the accepted masking cutoff range.
51
+ SCORE_BASE = 33
52
+ SCORE_MIN = 0
53
+ SCORE_MAX = 40
54
+
55
+ include BioDSL::Digest
56
+ include BioDSL::Homopolymer
57
+ include BioDSL::Translate
58
+ include BioDSL::Trim
59
+ include BioDSL::Kmer
60
+ include BioDSL::BackTrack
61
+
62
+ attr_accessor :seq_name, :seq, :type, :qual # public readers and writers for all four fields
63
+
64
# Build a new sequence object from a Biopiece-style record (a hash).
#
# Reads the :SEQ_NAME, :SEQ, :SEQ_TYPE and :SCORES keys. A present :SEQ_TYPE
# is converted to a Symbol; an absent one leaves the type nil.
def self.new_bp(record)
  raw_type = record[:SEQ_TYPE]

  new(
    seq_name: record[:SEQ_NAME],
    seq:      record[:SEQ],
    type:     (raw_type.to_sym if raw_type),
    qual:     record[:SCORES]
  )
end
74
+
75
# Return every possible oligo of the given length over the alphabet selected
# by type (:dna, :rna or :protein), as an Array of Strings.
#
# Raises SeqError when length is zero or negative, or when the type is unknown.
def self.generate_oligos(length, type)
  raise SeqError, "Cannot generate oligos of zero or negative length: #{length}" if length <= 0

  alphabet =
    case type.downcase
    when :dna     then DNA
    when :rna     then RNA
    when :protein then PROTEIN
    else raise SeqError, "Unknown sequence type: #{type}"
    end

  # Each round extends every oligo built so far by every residue, so the
  # list grows by a factor of alphabet.size per position.
  (1 .. length).inject([""]) do |oligos, _round|
    oligos.flat_map { |oligo| alphabet.map { |residue| oligo + residue } }
  end
end
103
+
104
# Check that two entries carry matching pair names.
#
# The shared part of each name is taken either as everything before
# " <digit>:" or as everything before a trailing "/<digit>". Raises SeqError
# when a name fits neither pattern or when the two shared parts differ;
# returns nil otherwise.
def self.check_name_pair(entry1, entry2)
  base_names = [entry1, entry2].map do |entry|
    case entry.seq_name
    when /^([^ ]+) \d:/, /^(.+)\/\d$/ then $1
    else raise SeqError, "Could not match sequence name: #{entry.seq_name}"
    end
  end

  name1, name2 = base_names

  return if name1 == name2

  raise SeqError, "Name mismatch: #{name1} != #{name2}"
end
125
+
126
# Create a sequence object. Recognised option keys:
# - :seq_name  Name of the sequence.
# - :seq       The sequence.
# - :type      The sequence type - DNA, RNA, or protein.
# - :qual      A quality scores string.
#
# Raises SeqError when both seq and qual are given and their lengths differ.
def initialize(options = {})
  @seq_name, @seq, @type, @qual = options.values_at(:seq_name, :seq, :type, :qual)

  return unless @seq && @qual
  return if @seq.length == @qual.length

  raise SeqError, "Sequence length and score length mismatch:" \
                  "#{@seq.length} != #{@qual.length}"
end
142
+
143
# Guess the sequence type from the first 100 residues and return it as
# :protein, :rna or :dna. Raises SeqError when the sequence is nil.
def type_guess
  raise SeqError, "Guess failed: sequence is nil" if seq.nil?

  head = seq[0 ... 100].downcase

  return :protein if head =~ /[flpqie]/
  return :rna     if head =~ /[u]/

  :dna
end
154
+
155
# Guess the sequence type (see type_guess), store it in type, and return self.
def type_guess!
  guessed   = type_guess
  self.type = guessed
  self
end
161
+
162
# Number of residues in the sequence; 0 when no sequence is set.
def length
  return 0 if seq.nil?

  seq.length
end

# len is a shorthand for length.
alias len length
168
+
169
# Return the number of indel characters (any of INDELS) in the sequence.
def indels
  indel_class = /[#{Regexp.escape(INDELS.join(""))}]/

  seq.scan(indel_class).size
end
174
+
175
# Remove all indel characters (INDELS) from the sequence in place and return
# self. When quality scores are present, the scores at the removed positions
# are dropped as well so that seq and qual stay aligned.
def indels_remove
  if qual.nil?
    seq.delete!(Regexp.escape(INDELS.join('')))
  else
    bases  = NArray.to_na(seq, "byte")
    scores = NArray.to_na(qual, "byte")

    # Count, per position, how many indel characters match; positions with
    # a zero count are the ones to keep.
    hits = INDELS.inject(NArray.byte(length)) do |acc, indel|
      acc + bases.eq(indel.ord)
    end

    keep = hits.eq(0)

    self.seq  = bases[keep].to_s
    self.qual = scores[keep].to_s
  end

  self
end
196
+
197
# True if the sequence type is :dna.
def is_dna?
  type == :dna
end
201
+
202
# True if the sequence type is :rna.
def is_rna?
  type == :rna
end
206
+
207
# True if the sequence type is :protein.
def is_protein?
  type == :protein
end
211
+
212
# Transcribe DNA to RNA in place: T/t become U/u and the type is set to :rna.
#
# Returns the transcribed sequence string.
# Raises SeqError for a zero-length sequence or when the type is not :dna.
def to_rna
  raise SeqError, "Cannot transcribe 0 length sequence" if length == 0
  raise SeqError, "Cannot transcribe sequence type: #{type}" unless is_dna?

  self.type = :rna

  # String#tr! returns nil when nothing was replaced, so its result cannot
  # be used as the return value for sequences without T.
  seq.tr!('Tt', 'Uu')
  seq
end
219
+
220
# Reverse-transcribe RNA to DNA in place: U/u become T/t and the type is set
# to :dna.
#
# Returns the reverse-transcribed sequence string.
# Raises SeqError for a zero-length sequence or when the type is not :rna.
def to_dna
  raise SeqError, "Cannot reverse-transcribe 0 length sequence" if length == 0
  raise SeqError, "Cannot reverse-transcribe sequence type: #{type}" unless is_rna?

  self.type = :dna

  # String#tr! returns nil when nothing was replaced, so its result cannot
  # be used as the return value for sequences without U.
  seq.tr!('Uu', 'Tt')
  seq
end
227
+
228
# Convert the entry to a BioDSL record (a hash). Only fields that are set are
# included: :SEQ_NAME, :SEQ together with :SEQ_LEN, and :SCORES.
def to_bp
  {}.tap do |record|
    record[:SEQ_NAME] = seq_name if seq_name

    if seq
      record[:SEQ]     = seq
      record[:SEQ_LEN] = seq.length
    end

    record[:SCORES] = qual if qual
  end
end
237
+
238
# Format the entry as a FASTA record and return it as a String.
#
# wrap - optional line width; when given, the sequence is broken into lines
#        of that many residues.
#
# The entry itself is left untouched. Raises SeqError when seq_name or seq is
# nil or empty.
def to_fasta(wrap = nil)
  raise SeqError, "Missing seq_name" if seq_name.nil? || seq_name == ''
  raise SeqError, "Missing seq" if seq.nil? || seq.empty?

  name = seq_name.to_s
  body = seq.to_s

  unless wrap.nil?
    # String#to_s returns the receiver itself, so the wrapping must build a
    # new string; an in-place gsub! would insert line breaks into the
    # entry's own sequence.
    body = body.gsub(/(.{#{wrap}})/) { |chunk| chunk + $/ }.chomp
  end

  ">#{name}#{$/}#{body}#{$/}"
end
256
+
257
# Format the entry as a FASTQ record (four lines) and return it as a String.
# Raises SeqError when seq_name, seq or qual is nil.
def to_fastq
  raise SeqError, "Missing seq_name" if seq_name.nil?
  raise SeqError, "Missing seq" if seq.nil?
  raise SeqError, "Missing qual" if qual.nil?

  lines = ["@#{seq_name}", seq.to_s, "+", qual.to_s]

  lines.map { |line| "#{line}#{$/}" }.join
end
269
+
270
# Encode the sequence as an Integer key using two bits per residue
# (A=0, C=1, G=2, T=3), case-insensitively, with the first residue in the
# most significant position. Raises SeqError on any other residue.
def to_key
  codes = { 'A' => 0, 'C' => 1, 'G' => 2, 'T' => 3 }

  seq.upcase.each_char.inject(0) do |key, residue|
    code = codes[residue]

    raise SeqError, "Bad residue: #{residue}" if code.nil?

    (key << 2) | code
  end
end
289
+
290
# Return a new Seq whose sequence, and quality scores when present, are
# reversed. Name and type are carried over.
def reverse
  Seq.new(
    seq_name: seq_name,
    seq:      seq.reverse,
    type:     type,
    qual:     qual && qual.reverse
  )
end
301
+
302
# Reverse the sequence, and the quality scores when present, in place.
# Returns self.
def reverse!
  seq.reverse!
  qual && qual.reverse!

  self
end
308
+
309
# Return a new Seq holding the complement of this sequence, including
# ambiguity codes. Name, type and quality scores are carried over.
#
# Raises SeqError for a zero-length sequence or when the type is neither DNA
# nor RNA.
def complement
  raise SeqError, "Cannot complement 0 length sequence" if length == 0

  # The new entry is created without a sequence, which is assigned afterwards.
  entry = Seq.new(seq_name: seq_name, type: type, qual: qual)

  residues = 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn'

  partners =
    if is_dna?
      'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn'
    elsif is_rna?
      'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn'
    else
      raise SeqError, "Cannot complement sequence type: #{type}"
    end

  entry.seq = seq.tr(residues, partners)
  entry
end
329
+
330
# Complement the sequence in place, including ambiguity codes. Returns self.
#
# Raises SeqError for a zero-length sequence or when the type is neither DNA
# nor RNA.
def complement!
  raise SeqError, "Cannot complement 0 length sequence" if length == 0

  residues = 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn'

  partners =
    if is_dna?
      'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn'
    elsif is_rna?
      'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn'
    else
      raise SeqError, "Cannot complement sequence type: #{type}"
    end

  seq.tr!(residues, partners)

  self
end
344
+
345
# Return the Hamming distance between this entry's sequence and another
# entry's sequence, as computed by BioDSL::Hamming.distance.
#
# Both sequences are upper-cased before comparison unless options[:ambiguity]
# is set, in which case they are passed on unchanged. The options hash is
# forwarded as given.
def hamming_distance(entry, options = {})
  own   = seq
  other = entry.seq

  unless options[:ambiguity]
    own   = own.upcase
    other = other.upcase
  end

  BioDSL::Hamming.distance(own, other, options)
end
354
+
355
# Return the edit distance between this entry's sequence and another entry's
# sequence, as computed by Levenshtein.distance.
# NOTE(review): the original comment says "case insensitive", but no case
# folding happens here - presumably Levenshtein.distance handles it; confirm.
def edit_distance(entry)
  own   = seq
  other = entry.seq

  Levenshtein.distance(own, other)
end
360
+
361
# Fill this entry with a random sequence of the given length drawn from the
# alphabet of the given type (:dna, :rna or :protein). Sets seq and type and
# returns the new sequence string.
#
# Raises SeqError when length is below 1 or the type is unknown.
def generate(length, type)
  raise SeqError, "Cannot generate sequence length < 1: #{length}" if length <= 0

  alphabet =
    case type
    when :dna     then DNA
    when :rna     then RNA
    when :protein then PROTEIN
    else raise SeqError, "Unknown sequence type: #{type}"
    end

  residues = Array.new(length) { alphabet[rand(alphabet.size)] }
  seq_new  = residues.join("")

  self.seq  = seq_new
  self.type = type

  seq_new
end
378
+
379
# Return a new Seq whose residues are a random permutation of this
# sequence. Name, type and quality scores are passed on unchanged.
def shuffle
  shuffled = seq.split('').shuffle!.join

  Seq.new(seq_name: seq_name, seq: shuffled, type: type, qual: qual)
end
388
+
389
# Replace the sequence with a random permutation of its residues.
# Returns self.
def shuffle!
  residues = seq.split('')
  residues.shuffle!

  self.seq = residues.join
  self
end
394
+
395
# Join two entries into a new Seq. The sequences are concatenated; the type
# is kept only when both entries share it, and quality scores are
# concatenated only when both entries have them. The new entry has no name.
def +(entry)
  Seq.new.tap do |joined|
    joined.seq  = seq + entry.seq
    joined.type = type if type == entry.type
    joined.qual = qual + entry.qual if qual && entry.qual
  end
end
403
+
404
# Append another entry's sequence, and quality scores when present, to this
# entry in place. Returns self.
#
# Raises SeqError when the types differ or when only one of the two entries
# has quality scores.
def <<(entry)
  same_type  = type == entry.type
  same_quals = qual.class == entry.qual.class

  raise SeqError, "sequences of different types" unless same_type
  raise SeqError, "qual is missing in one entry" unless same_quals

  seq << entry.seq
  qual << entry.qual unless entry.qual.nil?

  self
end
414
+
415
# Return a new Seq holding the slice of this entry selected by the given
# String#[] arguments. The name is duplicated and the type copied. An
# out-of-range slice yields an empty sequence, and empty quality scores when
# scores are present.
def [](*args)
  Seq.new.tap do |slice|
    slice.seq_name = seq_name.dup unless seq_name.nil?
    slice.seq      = seq[*args] || ""
    slice.type     = type
    slice.qual     = qual[*args] || "" unless qual.nil?
  end
end
425
+
426
# Overwrite the region selected by the given String#[]= arguments with the
# same region taken from another entry's sequence, and from its quality
# scores when this entry has scores. Returns self.
# NOTE(review): the replacement is entry.seq[*args], not the whole of
# entry.seq - confirm that this is the intended semantics.
def []=(*args, entry)
  seq[*args]  = entry.seq[*args]
  qual[*args] = entry.qual[*args] unless qual.nil?

  self
end
433
+
434
# Return the residue composition of the sequence as a hash that maps each
# upper-cased residue to its count. Residues that do not occur read as 0.
def composition
  seq.upcase.each_char.with_object(Hash.new(0)) do |residue, counts|
    counts[residue] += 1
  end
end
446
+
447
# Return the percentage of hard masked residues (N's, case-insensitive)
# relative to the number of non-indel positions, rounded to two decimals.
def hard_mask
  masked   = seq.upcase.scan("N").size
  residues = len - indels

  ((masked.to_f / residues.to_f) * 100).round(2)
end
452
+
453
# Return the percentage of soft masked (lower-case a-z) residues relative to
# the number of non-indel positions, rounded to two decimals.
def soft_mask
  masked   = seq.scan(/[a-z]/).size
  residues = len - indels

  ((masked.to_f / residues.to_f) * 100).round(2)
end
458
+
459
# Hard mask the sequence in place: every residue whose quality score
# (qual byte minus SCORE_BASE) is below cutoff is replaced with 'N'. The
# sequence is upper-cased and '-' positions are never masked. Returns self.
#
# Raises SeqError when seq or qual is nil, or when cutoff lies outside
# SCORE_MIN .. SCORE_MAX.
def mask_seq_hard!(cutoff)
  raise SeqError, "seq is nil" if seq.nil?
  raise SeqError, "qual is nil" if qual.nil?

  unless (SCORE_MIN .. SCORE_MAX).include? cutoff
    raise SeqError, "cutoff value: #{cutoff} out of range #{SCORE_MIN} .. #{SCORE_MAX}"
  end

  na_seq  = NArray.to_na(seq.upcase, "byte")
  na_qual = NArray.to_na(qual, "byte")

  # 1 where the score is below the cutoff and the position is not a '-' gap.
  mask  = (na_qual - SCORE_BASE) < cutoff
  mask *= na_seq.ne("-".ord)

  na_seq[mask] = 'N'.ord

  self.seq = na_seq.to_s

  self
end
477
+
478
# Soft mask the sequence in place: the sequence is upper-cased and every
# letter whose quality score (qual byte minus SCORE_BASE) is below cutoff is
# lower-cased. Non-letter characters (gaps and other symbols) are left
# untouched. Returns self.
#
# Raises SeqError when seq or qual is nil, or when cutoff lies outside
# SCORE_MIN .. SCORE_MAX.
def mask_seq_soft!(cutoff)
  raise SeqError, "seq is nil" if seq.nil?
  raise SeqError, "qual is nil" if qual.nil?

  unless (SCORE_MIN .. SCORE_MAX).include? cutoff
    raise SeqError, "cutoff value: #{cutoff} out of range #{SCORE_MIN} .. #{SCORE_MAX}"
  end

  na_seq  = NArray.to_na(seq.upcase, "byte")
  na_qual = NArray.to_na(qual, "byte")

  mask = (na_qual - SCORE_BASE) < cutoff

  # Flipping bit 0x20 only lower-cases A-Z; applied to any other byte it
  # yields an unrelated character ('.' would become a control character),
  # so the mask is restricted to letters. This also excludes '-'.
  mask *= na_seq.ge('A'.ord)
  mask *= na_seq.le('Z'.ord)

  na_seq[mask] ^= ' '.ord

  self.seq = na_seq.to_s

  self
end
497
+
498
# True when the quality string contains at least one character in the range
# '!' to ':', which identifies it as base 33 encoded.
def qual_base33?
  !qual.match(/[!-:]/).nil?
end
503
+
504
# True when the quality string contains at least one character in the range
# 'K' to 'h', meaning it may be base 64 encoded.
def qual_base64?
  !qual.match(/[K-h]/).nil?
end
508
+
509
# Check that every character of the quality string lies within the 0-40
# score range of the given encoding: '!'..'I' for :base_33 and '@'..'h' for
# :base_64. Returns true or false.
#
# Raises SeqError when qual is nil or the encoding is unknown.
def qual_valid?(encoding)
  raise SeqError, "Missing qual" if qual.nil?

  # \A and \z anchor to the whole string. ^ and $ anchor to lines, which
  # would accept a string containing a newline as soon as one of its lines
  # is valid.
  pattern =
    case encoding
    when :base_33 then /\A[!-I]*\z/
    when :base_64 then /\A[@-h]*\z/
    else raise SeqError, "unknown quality score encoding: #{encoding}"
    end

  qual.match(pattern) ? true : false
end
521
+
522
# Coerce the quality scores in place to lie within the 0-40 range of the
# given encoding by passing byte bounds to qual_coerce_C: 33..73 ('!'..'I')
# for :base_33 and 64..104 ('@'..'h') for :base_64. Returns self.
#
# Raises SeqError when qual is nil or the encoding is unknown.
def qual_coerce!(encoding)
  raise SeqError, "Missing qual" if qual.nil?

  lower, upper =
    case encoding
    when :base_33 then [33, 73]
    when :base_64 then [64, 104]
    else raise SeqError, "unknown quality score encoding: #{encoding}"
    end

  qual_coerce_C(qual, qual.length, lower, upper)

  self
end
535
+
536
# Convert the quality scores in place between the :base_33 and :base_64
# encodings. Converting an encoding to itself changes nothing. Returns self.
#
# Raises SeqError when either encoding is unknown.
def qual_convert!(from, to)
  raise SeqError, "unknown quality score encoding: #{from}" unless from == :base_33 || from == :base_64
  raise SeqError, "unknown quality score encoding: #{to}" unless to == :base_33 || to == :base_64

  offset = 64 - 33

  if from == :base_33 && to == :base_64
    qual_convert_C(qual, qual.length, offset)
  elsif from == :base_64 && to == :base_33
    # Bytes below 64 (negative Solexa values) are raised to 64 first, so
    # that they end up as score 0 after the shift.
    qual_coerce_C(qual, qual.length, 64, 104)
    qual_convert_C(qual, qual.length, -offset)
  end

  self
end
550
+
551
# Return the mean quality score, i.e. the mean of the qual bytes after
# subtracting SCORE_BASE. Raises SeqError when qual is nil.
def scores_mean
  raise SeqError, "Missing qual in entry" if qual.nil?

  scores = NArray.to_na(qual, "byte") - SCORE_BASE

  scores.mean
end
560
+
561
# Return the lowest quality score, i.e. the minimum of the qual bytes after
# subtracting SCORE_BASE. Raises SeqError when qual is nil.
def scores_min
  raise SeqError, "Missing qual in entry" if qual.nil?

  scores = NArray.to_na(qual, "byte") - SCORE_BASE

  scores.min
end
570
+
571
# Return the highest quality score, i.e. the maximum of the qual bytes after
# subtracting SCORE_BASE. Raises SeqError when qual is nil.
def scores_max
  raise SeqError, "Missing qual in entry" if qual.nil?

  scores = NArray.to_na(qual, "byte") - SCORE_BASE

  scores.max
end
580
+
581
# Slide a window of window_size positions across the quality scores and
# return the lowest window mean found. The computation is delegated to
# scores_mean_local_C. Raises SeqError when qual is nil.
def scores_mean_local(window_size)
  raise SeqError, "Missing qual in entry" if qual.nil?

  scores = qual

  scores_mean_local_C(scores, scores.length, SCORE_BASE, window_size)
end
589
+
590
# Find open reading frames (ORFs) in the sequence.
#
# Options:
# - :size_min      Minimum ORF length (default 0).
# - :size_max      Maximum ORF length (default: sequence length).
# - :start_codons  Comma separated start codons (default "ATG,GTG,AUG,GUG").
# - :stop_codons   Comma separated stop codons
#                  (default "TAA,TGA,TAG,UAA,UGA,UAG").
# - :pick_longest  Keep only the first ORF found for each stop position.
#
# Codons are matched case-insensitively. For each start codon the first stop
# codon at or after it is located, and the span is kept only when its length
# is a multiple of three and within the size limits. Each Orf is yielded
# when a block is given; the list of Orf objects is returned either way.
def each_orf(options = {})
  size_min     = options[:size_min] || 0
  size_max     = options[:size_max] || self.length
  start_codons = options[:start_codons] || "ATG,GTG,AUG,GUG"
  stop_codons  = options[:stop_codons] || "TAA,TGA,TAG,UAA,UGA,UAG"

  regex_start = Regexp.new(start_codons.split(',').join('|'), true)
  regex_stop  = Regexp.new(stop_codons.split(',').join('|'), true)

  orfs   = []
  cursor = 0

  while cursor && cursor < self.length - size_min
    cursor = seq.index(regex_start, cursor)

    break if cursor.nil?

    stop_at = seq.index(regex_stop, cursor)

    if stop_at
      orf_len = (stop_at - cursor) + 3

      if (orf_len % 3) == 0 && size_min <= orf_len && orf_len <= size_max
        orfs << Orf.new(self[cursor ... cursor + orf_len], cursor, stop_at + 2)
      end
    end

    # Resume the search one position past the current start codon.
    cursor += 1
  end

  if options[:pick_longest]
    # Starts are visited in ascending order, so the first ORF recorded for
    # a stop position is the longest one ending there.
    by_stop = {}

    orfs.each { |orf| by_stop[orf.stop] ||= orf }

    orfs = by_stop.values
  end

  orfs.each { |orf| yield orf } if block_given?

  orfs
end
636
+
637
# Value object describing one open reading frame: the sub-sequence entry
# plus the start and stop positions it was created with.
class Orf
  attr_reader :entry, :start, :stop

  # entry - the ORF sub-sequence.
  # start - position stored as the ORF start.
  # stop  - position stored as the ORF stop.
  def initialize(entry, start, stop)
    @entry, @start, @stop = entry, start, stop
  end
end
646
+
647
+ private
648
+
649
# C helpers compiled through RubyInline. Each takes the quality string and
# its length; the first two modify the string's bytes in place.
inline do |builder|
  # qual_coerce_C(qual, qual_len, min_value, max_value)
  # Clamp every byte of qual into min_value..max_value in place. Returns nil.
  builder.c %{
    VALUE qual_coerce_C(
      VALUE _qual,
      VALUE _qual_len,
      VALUE _min_value,
      VALUE _max_value
    )
    {
      unsigned char *qual      = (unsigned char *) StringValuePtr(_qual);
      unsigned int   qual_len  = FIX2UINT(_qual_len);
      unsigned int   min_value = FIX2UINT(_min_value);
      unsigned int   max_value = FIX2UINT(_max_value);
      unsigned int   i         = 0;

      for (i = 0; i < qual_len; i++)
      {
        if (qual[i] > max_value) {
          qual[i] = max_value;
        } else if (qual[i] < min_value) {
          qual[i] = min_value;
        }
      }

      return Qnil;
    }
  }

  # qual_convert_C(qual, qual_len, value)
  # Add value to every byte of qual in place. Returns nil.
  builder.c %{
    VALUE qual_convert_C(
      VALUE _qual,
      VALUE _qual_len,
      VALUE _value
    )
    {
      unsigned char *qual     = (unsigned char *) StringValuePtr(_qual);
      unsigned int   qual_len = FIX2UINT(_qual_len);
      unsigned int   value    = FIX2UINT(_value);
      unsigned int   i        = 0;

      for (i = 0; i < qual_len; i++)
      {
        qual[i] += value;
      }

      return Qnil;
    }
  }

  # scores_mean_local_C(qual, qual_len, score_base, window_size)
  # Slide a window across the scores (qual bytes minus score_base) and
  # return the lowest window mean as a Float. A window wider than the scores
  # is narrowed to cover all of them. Raises ArgumentError when there are no
  # scores or the window size is zero.
  builder.c %{
    VALUE scores_mean_local_C(
      VALUE _qual,
      VALUE _qual_len,
      VALUE _score_base,
      VALUE _window_size
    )
    {
      unsigned char *qual        = (unsigned char *) StringValuePtr(_qual);
      unsigned int   qual_len    = FIX2UINT(_qual_len);
      unsigned int   score_base  = FIX2UINT(_score_base);
      unsigned int   window_size = FIX2UINT(_window_size);
      unsigned int   sum         = 0;
      unsigned int   i           = 0;
      float          mean        = 0.0;
      float          new_mean    = 0.0;

      // nothing to average, and a zero window would divide by zero
      if (qual_len == 0 || window_size == 0)
        rb_raise(rb_eArgError, "scores and window size must not be empty");

      // a window wider than the scores would read past the end of qual
      if (window_size > qual_len)
        window_size = qual_len;

      // fill window
      for (i = 0; i < window_size; i++)
        sum += qual[i] - score_base;

      // divide as float so that window means are not truncated
      mean = (float) sum / window_size;

      // run window across the rest of the scores
      while (i < qual_len)
      {
        sum += qual[i] - score_base;
        sum -= qual[i - window_size] - score_base;

        new_mean = (float) sum / window_size;

        if (new_mean < mean)
          mean = new_mean;

        i++;
      }

      return rb_float_new(mean);
    }
  }
end
739
+ end
740
+ end
741
+
742
+ __END__