BioDSL 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
data/lib/BioDSL/seq.rb
ADDED
@@ -0,0 +1,742 @@
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
|
18
|
+
# #
|
19
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
20
|
+
# #
|
21
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
22
|
+
# #
|
23
|
+
# This software is part of BioDSL (www.BioDSL.org). #
|
24
|
+
# #
|
25
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
26
|
+
|
27
|
+
module BioDSL
|
28
|
+
require 'narray'
|
29
|
+
require 'BioDSL/seq/ambiguity'
|
30
|
+
require 'BioDSL/seq/assemble'
|
31
|
+
require 'BioDSL/seq/digest'
|
32
|
+
require 'BioDSL/seq/kmer'
|
33
|
+
require 'BioDSL/seq/translate'
|
34
|
+
require 'BioDSL/seq/trim'
|
35
|
+
require 'BioDSL/seq/backtrack'
|
36
|
+
require 'BioDSL/seq/dynamic'
|
37
|
+
require 'BioDSL/seq/homopolymer'
|
38
|
+
require 'BioDSL/seq/levenshtein'
|
39
|
+
|
40
|
+
# Exception type raised by Seq for every sequence related failure.
class SeqError < StandardError
end
|
42
|
+
|
43
|
+
class Seq
|
44
|
+
# Residue alphabets, one lowercase residue per element.
DNA     = %w(a t c g)
RNA     = %w(a u c g)
PROTEIN = %w(f l s y c w p h q r i m t n k v a d e g)

# Characters counted and removed as indels by #indels and #indels_remove.
INDELS  = %w(. - _ ~)

# Quality score encoding: ASCII offset and the accepted score range.
SCORE_BASE = 33
SCORE_MIN  = 0
SCORE_MAX  = 40

# Mixed-in functionality. The include order is kept as is because it
# determines method lookup order.
include BioDSL::Digest
include BioDSL::Homopolymer
include BioDSL::Translate
include BioDSL::Trim
include BioDSL::Kmer
include BioDSL::BackTrack

attr_accessor :seq_name, :seq, :type, :qual
|
63
|
+
|
64
|
+
# Class method that builds a new Seq object from a BioDSL record (a hash).
#
# The record keys :SEQ_NAME, :SEQ and :SCORES are passed on unchanged;
# :SEQ_TYPE is converted to a Symbol when present, otherwise the type is nil.
def self.new_bp(record)
  seq_type = record[:SEQ_TYPE]

  new(
    seq_name: record[:SEQ_NAME],
    seq:      record[:SEQ],
    type:     (seq_type ? seq_type.to_sym : nil),
    qual:     record[:SCORES]
  )
end
|
74
|
+
|
75
|
+
# Class method that generates all possible oligos of a specified length
# and type (:dna, :rna or :protein, compared after downcasing).
#
# Returns an Array of Strings ordered with the first residue varying
# slowest. Raises SeqError on a length below 1 or an unknown type.
def self.generate_oligos(length, type)
  if length <= 0
    raise SeqError, "Cannot generate oligos of zero or negative length: #{length}"
  end

  alph = case type.downcase
         when :dna     then DNA
         when :rna     then RNA
         when :protein then PROTEIN
         else
           raise SeqError, "Unknown sequence type: #{type}"
         end

  # Extend every oligo of the previous round by each residue in turn.
  (1 .. length).inject([""]) do |oligos, _|
    oligos.flat_map do |oligo|
      alph.map { |char| oligo + char }
    end
  end
end
|
103
|
+
|
104
|
+
# Class method that verifies that two entries carry the same read name.
#
# The shared name is whatever precedes " <digit>:" in the sequence name or,
# failing that, whatever precedes a trailing "/<digit>". Raises SeqError if
# a name matches neither pattern or if the two extracted names differ.
# Returns nil when the names agree.
def self.check_name_pair(entry1, entry2)
  names = [entry1, entry2].map do |entry|
    if entry.seq_name =~ /^([^ ]+) \d:/
      Regexp.last_match(1)
    elsif entry.seq_name =~ /^(.+)\/\d$/
      Regexp.last_match(1)
    else
      raise SeqError, "Could not match sequence name: #{entry.seq_name}"
    end
  end

  name1, name2 = names

  raise SeqError, "Name mismatch: #{name1} != #{name2}" if name1 != name2
end
|
125
|
+
|
126
|
+
# Initialize a sequence object from an options hash with the keys:
#   - :seq_name   Name of the sequence.
#   - :seq        The sequence.
#   - :type       The sequence type - DNA, RNA, or protein
#   - :qual       An Illumina type quality scores string.
#
# Raises SeqError if both :seq and :qual are given and differ in length.
def initialize(options = {})
  @seq_name = options[:seq_name]
  @seq      = options[:seq]
  @type     = options[:type]
  @qual     = options[:qual]

  # Nothing to validate unless both sequence and scores are present.
  return unless @seq and @qual
  return if @seq.length == @qual.length

  raise SeqError, "Sequence length and score length mismatch:" \
                  "#{@seq.length} != #{@qual.length}"
end
|
142
|
+
|
143
|
+
# Method that guesses and returns the sequence type by inspecting the
# first 100 residues (case insensitive).
#
# Any of the residues f, l, p, q, i or e yields :protein; otherwise a u
# yields :rna; everything else is reported as :dna.
# Raises SeqError if the sequence is nil.
def type_guess
  raise SeqError, "Guess failed: sequence is nil" if seq.nil?

  head = seq[0 ... 100].downcase

  return :protein if head =~ /[flpqie]/
  return :rna     if head =~ /[u]/

  :dna
end
|
154
|
+
|
155
|
+
# Method that guesses the sequence type by inspecting the first 100
# residues, stores the guess in type and returns self.
def type_guess!
  guessed   = type_guess
  self.type = guessed

  self
end
|
161
|
+
|
162
|
+
# Returns the length of the sequence, or 0 if the sequence is nil.
def length
  return 0 if seq.nil?

  seq.length
end

# len is a synonym for length.
alias_method :len, :length
|
168
|
+
|
169
|
+
# Returns the number of indel characters (see INDELS) in the sequence.
def indels
  indel_class = INDELS.map { |char| Regexp.escape(char) }.join

  seq.scan(/[#{indel_class}]/).size
end
|
174
|
+
|
175
|
+
# Method that removes indel characters (see INDELS) from seq and, when
# quality scores are present, the scores at the same positions.
# Returns self.
def indels_remove
  if qual.nil?
    # String#delete! takes a character set; the escaping keeps '-' literal.
    seq.delete!(Regexp.escape(INDELS.join('')))
  else
    na_seq  = NArray.to_na(seq, "byte")
    na_qual = NArray.to_na(qual, "byte")

    # Count indel hits per position, then keep the positions without any.
    hits = INDELS.inject(NArray.byte(length)) do |sum, char|
      sum + na_seq.eq(char.ord)
    end

    keep = hits.eq(0)

    self.seq  = na_seq[keep].to_s
    self.qual = na_qual[keep].to_s
  end

  self
end
|
196
|
+
|
197
|
+
# Method that returns true if the sequence type is :dna.
def is_dna?
  type == :dna
end
|
201
|
+
|
202
|
+
# Method that returns true if the sequence type is :rna.
def is_rna?
  type == :rna
end
|
206
|
+
|
207
|
+
# Method that returns true if the sequence type is :protein.
def is_protein?
  type == :protein
end
|
211
|
+
|
212
|
+
# Method to transcribe DNA to RNA in place.
#
# Sets the type to :rna, replaces T/t with U/u in the sequence and returns
# the sequence string. Raises SeqError if the sequence is empty or the
# type is not :dna.
def to_rna
  raise SeqError, "Cannot transcribe 0 length sequence" if length == 0
  raise SeqError, "Cannot transcribe sequence type: #{type}" unless is_dna?

  self.type = :rna

  # String#tr! returns nil when nothing was replaced, so the sequence is
  # returned explicitly to give callers a String in every case.
  seq.tr!('Tt', 'Uu')
  seq
end
|
219
|
+
|
220
|
+
# Method to reverse-transcribe RNA to DNA in place.
#
# Sets the type to :dna, replaces U/u with T/t in the sequence and returns
# the sequence string. Raises SeqError if the sequence is empty or the
# type is not :rna.
def to_dna
  raise SeqError, "Cannot reverse-transcribe 0 length sequence" if length == 0
  raise SeqError, "Cannot reverse-transcribe sequence type: #{type}" unless is_rna?

  self.type = :dna

  # String#tr! returns nil when nothing was replaced, so the sequence is
  # returned explicitly to give callers a String in every case.
  seq.tr!('Uu', 'Tt')
  seq
end
|
227
|
+
|
228
|
+
# Method that returns the entry as a BioDSL record (a hash).
#
# Only values that are set are emitted: :SEQ_NAME, :SEQ together with
# :SEQ_LEN, and :SCORES. The sequence type is not part of the record.
def to_bp
  record = {}

  record[:SEQ_NAME] = seq_name if seq_name

  if seq
    record[:SEQ]     = seq
    record[:SEQ_LEN] = seq.length
  end

  record[:SCORES] = qual if qual

  record
end
|
237
|
+
|
238
|
+
# Method that returns the entry as a FASTA entry (a string).
#
# wrap - optional line width; when given, a record separator ($/) is
#        inserted after every wrap residues.
#
# Raises SeqError if seq_name or seq is nil or empty. The entry itself is
# left unmodified.
def to_fasta(wrap = nil)
  raise SeqError, "Missing seq_name" if seq_name.nil? or seq_name == ''
  raise SeqError, "Missing seq"      if seq.nil? or seq.empty?

  name     = seq_name.to_s
  residues = seq.to_s

  unless wrap.nil?
    # String#to_s returns the receiver itself, so wrapping must not use the
    # mutating gsub!/chomp! - that would write line breaks into self.seq.
    residues = residues.gsub(/(.{#{wrap}})/) { |chunk| chunk + $/ }.chomp
  end

  ">#{name}#{$/}#{residues}#{$/}"
end
|
256
|
+
|
257
|
+
# Method that returns the entry as a FASTQ entry (a string) consisting of
# the four lines "@name", sequence, "+" and scores, each terminated by the
# record separator ($/).
#
# Raises SeqError if seq_name, seq or qual is nil.
def to_fastq
  raise SeqError, "Missing seq_name" if seq_name.nil?
  raise SeqError, "Missing seq"      if seq.nil?
  raise SeqError, "Missing qual"     if qual.nil?

  lines = ["@#{seq_name}", seq.to_s, "+", qual.to_s]

  lines.map { |line| "#{line}#{$/}" }.join
end
|
269
|
+
|
270
|
+
# Method that generates a unique key for a DNA sequence and returns this
# key as an Integer.
#
# Each residue (case insensitive) is packed into two bits, most significant
# first: A = 0, C = 1, G = 2, T = 3. Raises SeqError on any other residue.
def to_key
  codes = { 'A' => 0, 'C' => 1, 'G' => 2, 'T' => 3 }

  seq.upcase.each_char.inject(0) do |key, char|
    code = codes.fetch(char) { raise SeqError, "Bad residue: #{char}" }

    (key << 2) | code
  end
end
|
289
|
+
|
290
|
+
# Method that returns a new Seq object with the sequence - and the quality
# scores, if any - reversed. Name and type are carried over.
def reverse
  Seq.new(
    seq_name: seq_name,
    seq:      seq.reverse,
    type:     type,
    qual:     qual && qual.reverse
  )
end
|
301
|
+
|
302
|
+
# Method that reverses the sequence - and the quality scores, if any -
# in place. Returns self.
def reverse!
  seq.reverse!

  scores = qual
  scores.reverse! if scores

  self
end
|
308
|
+
|
309
|
+
# Method that returns a new Seq object holding the complemented sequence,
# including ambiguity codes. Name, type and quality scores are carried over.
#
# For DNA the complement of A is T, for RNA it is U; both T and U are
# complemented to A in either case.
# Raises SeqError on an empty sequence or a type that is not :dna or :rna.
def complement
  raise SeqError, "Cannot complement 0 length sequence" if length == 0

  residues = 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn'

  partners = if is_dna?
               'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn'
             elsif is_rna?
               'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn'
             else
               raise SeqError, "Cannot complement sequence type: #{type}"
             end

  # The sequence is assigned after construction, as in the constructor the
  # seq/qual length check would otherwise apply.
  entry = Seq.new(seq_name: seq_name, type: type, qual: qual)
  entry.seq = seq.tr(residues, partners)
  entry
end
|
329
|
+
|
330
|
+
# Method that complements the sequence in place, including ambiguity
# codes. Returns self.
#
# Raises SeqError on an empty sequence or a type that is not :dna or :rna.
def complement!
  raise SeqError, "Cannot complement 0 length sequence" if length == 0

  partners = if is_dna?
               'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn'
             elsif is_rna?
               'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn'
             else
               raise SeqError, "Cannot complement sequence type: #{type}"
             end

  seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', partners)

  self
end
|
344
|
+
|
345
|
+
# Method to determine the Hamming Distance between two Sequence objects.
#
# Both sequences are upcased before comparison unless options[:ambiguity]
# is set, in which case they are handed to BioDSL::Hamming.distance as is.
# The options hash is passed on unchanged.
def hamming_distance(entry, options = {})
  pair = [seq, entry.seq]
  pair = pair.map { |residues| residues.upcase } unless options[:ambiguity]

  BioDSL::Hamming.distance(pair[0], pair[1], options)
end
|
354
|
+
|
355
|
+
# Method to determine the Edit Distance between two Sequence objects, as
# computed by Levenshtein.distance on the two sequence strings.
# NOTE(review): the sequences are passed on without case folding; whether
# the comparison is case insensitive depends on Levenshtein - confirm.
def edit_distance(entry)
  own   = seq
  other = entry.seq

  Levenshtein.distance(own, other)
end
|
360
|
+
|
361
|
+
# Method that generates a random sequence of a given length and type
# (:dna, :rna or :protein), stores it in seq, sets type accordingly and
# returns the new sequence string.
#
# Raises SeqError on a length below 1 or an unknown type.
def generate(length, type)
  raise SeqError, "Cannot generate sequence length < 1: #{length}" if length <= 0

  alph = case type
         when :dna     then DNA
         when :rna     then RNA
         when :protein then PROTEIN
         else
           raise SeqError, "Unknown sequence type: #{type}"
         end

  residues = Array.new(length) { alph[rand(alph.size)] }
  seq_new  = residues.join("")

  self.seq  = seq_new
  self.type = type

  seq_new
end
|
378
|
+
|
379
|
+
# Method that returns a new Seq object with the residues in random order.
# Name, type and quality scores are carried over; the scores are passed on
# in their original order.
def shuffle
  shuffled = seq.chars.shuffle.join

  Seq.new(seq_name: seq_name, seq: shuffled, type: type, qual: qual)
end
|
388
|
+
|
389
|
+
# Method that replaces the sequence with a randomly shuffled copy of its
# residues. Quality scores are not touched. Returns self.
def shuffle!
  residues = seq.chars
  self.seq = residues.shuffle.join

  self
end
|
394
|
+
|
395
|
+
# Method to add two Seq objects, returning a new Seq object.
#
# The sequences are concatenated; the type is set only if both entries
# share it, and the scores are concatenated only if both entries have
# scores. The new entry has no name.
def +(entry)
  Seq.new.tap do |sum|
    sum.seq  = seq + entry.seq
    sum.type = type if type == entry.type
    sum.qual = qual + entry.qual if qual and entry.qual
  end
end
|
403
|
+
|
404
|
+
# Method that appends the sequence - and the scores, if any - of another
# entry to this one in place. Returns self.
#
# Raises SeqError if the types differ or if the classes of the two qual
# values differ, i.e. only one of the entries has scores.
def <<(entry)
  raise SeqError, "sequences of different types" unless type == entry.type

  unless qual.class == entry.qual.class
    raise SeqError, "qual is missing in one entry"
  end

  seq << entry.seq

  other_qual = entry.qual
  qual << other_qual unless other_qual.nil?

  self
end
|
414
|
+
|
415
|
+
# Index method for Seq objects. The arguments are those of String#[].
#
# Returns a new Seq object with a copy of the name, the same type and the
# indexed part of the sequence and - if present - the scores. An index
# outside the sequence yields an empty string rather than nil.
def [](*args)
  Seq.new.tap do |slice|
    slice.seq_name = seq_name.dup unless seq_name.nil?
    slice.seq      = seq[*args] || ""
    slice.type     = type
    slice.qual     = qual[*args] || "" unless qual.nil?
  end
end
|
425
|
+
|
426
|
+
# Index assignment method for Seq objects. The arguments are those of
# String#[]=, followed by the entry to assign from.
#
# The indexed part of this sequence is replaced by the part of entry.seq
# found at the same index; the scores are handled likewise when this entry
# has scores. Returns self.
# NOTE(review): the replacement is entry.seq[*args], not the whole of
# entry.seq - confirm this is the intended contract.
def []=(*args, entry)
  seq_part = entry.seq[*args]
  seq[*args] = seq_part

  qual[*args] = entry.qual[*args] unless qual.nil?

  self
end
|
433
|
+
|
434
|
+
# Method that returns the residue composition of the sequence as a hash
# where the key is the upcased residue and the value is the residue count.
# The hash yields 0 for residues that do not occur.
def composition
  seq.upcase.each_char.each_with_object(Hash.new(0)) do |char, comp|
    comp[char] += 1
  end
end
|
446
|
+
|
447
|
+
# Method that returns the percentage of hard masked residues, i.e. N's
# (case insensitive), relative to the sequence length excluding indels.
# The result is rounded to two decimals.
def hard_mask
  masked   = seq.upcase.count("N").to_f
  residues = (len - indels).to_f

  ((masked / residues) * 100).round(2)
end
|
452
|
+
|
453
|
+
# Method that returns the percentage of soft masked residues, i.e. lower
# case letters, relative to the sequence length excluding indels.
# The result is rounded to two decimals.
def soft_mask
  masked   = seq.count("a-z").to_f
  residues = (len - indels).to_f

  ((masked / residues) * 100).round(2)
end
|
458
|
+
|
459
|
+
# Hard masks sequence residues where the corresponding quality score is
# below a given cutoff: such residues are replaced by N, except for '-'.
# The whole sequence is upcased in the process. Returns self.
#
# Raises SeqError if seq or qual is nil, or if cutoff is outside
# SCORE_MIN .. SCORE_MAX.
# NOTE(review): only '-' is protected; the other INDELS characters are
# replaced by N as well - confirm that is intended.
def mask_seq_hard!(cutoff)
  raise SeqError, "seq is nil"  if seq.nil?
  raise SeqError, "qual is nil" if qual.nil?

  unless (SCORE_MIN .. SCORE_MAX).include? cutoff
    raise SeqError, "cufoff value: #{cutoff} out of range #{SCORE_MIN} .. #{SCORE_MAX}"
  end

  residues = NArray.to_na(seq.upcase, "byte")
  scores   = NArray.to_na(qual, "byte")

  # Positions that are both low scoring and not a '-' gap.
  low = ((scores - SCORE_BASE) < cutoff) * residues.ne("-".ord)

  residues[low] = 'N'.ord

  self.seq = residues.to_s

  self
end
|
477
|
+
|
478
|
+
# Soft masks sequence residues where the corresponding quality score is
# below a given cutoff. Masked residues will be lowercased and the
# remaining residues will be uppercased. Returns self.
#
# Only the letters A-Z are lowercased; gaps and any other non-letter
# characters are left untouched.
#
# Raises SeqError if seq or qual is nil, or if cutoff is outside
# SCORE_MIN .. SCORE_MAX.
def mask_seq_soft!(cutoff)
  raise SeqError, "seq is nil"  if seq.nil?
  raise SeqError, "qual is nil" if qual.nil?

  unless (SCORE_MIN .. SCORE_MAX).include? cutoff
    raise SeqError, "cufoff value: #{cutoff} out of range #{SCORE_MIN} .. #{SCORE_MAX}"
  end

  na_seq  = NArray.to_na(seq.upcase, "byte")
  na_qual = NArray.to_na(qual, "byte")

  mask  = (na_qual - SCORE_BASE) < cutoff
  mask *= na_seq.ne("-".ord)

  # Flipping bit 0x20 lowercases A-Z only. Applied to other bytes it yields
  # unrelated characters ('.' becomes 0x0E, '*' becomes a newline), so the
  # mask is restricted to letters.
  mask *= (na_seq >= 'A'.ord)
  mask *= (na_seq <= 'Z'.ord)

  na_seq[mask] ^= ' '.ord

  self.seq = na_seq.to_s

  self
end
|
497
|
+
|
498
|
+
# Method that determines if a quality score string can be absolutely
# identified as base 33, i.e. it holds at least one character in the
# range '!' to ':'.
def qual_base33?
  !qual.match(/[!-:]/).nil?
end
|
503
|
+
|
504
|
+
# Method that determines if a quality score string may be base 64, i.e.
# it holds at least one character in the range 'K' to 'h'.
def qual_base64?
  !qual.match(/[K-h]/).nil?
end
|
508
|
+
|
509
|
+
# Method to determine if the quality scores are valid, accepting only the
# 0-40 range for the given encoding (:base_33 or :base_64).
#
# Returns true or false. Raises SeqError if qual is nil or the encoding
# is unknown.
def qual_valid?(encoding)
  raise SeqError, "Missing qual" if qual.nil?

  pattern = case encoding
            when :base_33 then /^[!-I]*$/
            when :base_64 then /^[@-h]*$/
            else raise SeqError, "unknown quality score encoding: #{encoding}"
            end

  qual.match(pattern) ? true : false
end
|
521
|
+
|
522
|
+
# Method to coerce quality scores in place to be within the 0-40 range of
# the given encoding (:base_33 or :base_64). Returns self.
#
# Raises SeqError if qual is nil or the encoding is unknown.
def qual_coerce!(encoding)
  raise SeqError, "Missing qual" if qual.nil?

  bounds = case encoding
           when :base_33 then [33, 73]  # '!' .. 'I'
           when :base_64 then [64, 104] # '@' .. 'h'
           else
             raise SeqError, "unknown quality score encoding: #{encoding}"
           end

  qual_coerce_C(qual, qual.length, bounds[0], bounds[1])

  self
end
|
535
|
+
|
536
|
+
# Method to convert quality scores in place between the :base_33 and
# :base_64 encodings. Nothing is changed when from equals to.
# Returns self. Raises SeqError on an unknown encoding.
def qual_convert!(from, to)
  [from, to].each do |encoding|
    unless encoding == :base_33 or encoding == :base_64
      raise SeqError, "unknown quality score encoding: #{encoding}"
    end
  end

  if from == :base_33 and to == :base_64
    qual_convert_C(qual, qual.length, 31) # += 64 - 33
  elsif from == :base_64 and to == :base_33
    # Handle negative Solexa values from -5 to -1 (set these to 0).
    qual_coerce_C(qual, qual.length, 64, 104)
    qual_convert_C(qual, qual.length, -31) # -= 64 - 33
  end

  self
end
|
550
|
+
|
551
|
+
# Method to calculate and return the mean quality score, with SCORE_BASE
# subtracted from every score byte. Raises SeqError if qual is nil.
def scores_mean
  raise SeqError, "Missing qual in entry" if qual.nil?

  scores = NArray.to_na(qual, "byte") - SCORE_BASE

  scores.mean
end
|
560
|
+
|
561
|
+
# Method to calculate and return the min quality score, with SCORE_BASE
# subtracted from every score byte. Raises SeqError if qual is nil.
def scores_min
  raise SeqError, "Missing qual in entry" if qual.nil?

  scores = NArray.to_na(qual, "byte") - SCORE_BASE

  scores.min
end
|
570
|
+
|
571
|
+
# Method to calculate and return the max quality score, with SCORE_BASE
# subtracted from every score byte. Raises SeqError if qual is nil.
def scores_max
  raise SeqError, "Missing qual in entry" if qual.nil?

  scores = NArray.to_na(qual, "byte") - SCORE_BASE

  scores.max
end
|
580
|
+
|
581
|
+
# Method to run a sliding window of a specified size across a Phred type
# scores string, calculate the mean score of each window and return the
# minimum of these means. The work is delegated to scores_mean_local_C
# using SCORE_BASE as the score offset.
#
# Raises SeqError if qual is nil.
def scores_mean_local(window_size)
  raise SeqError, "Missing qual in entry" if qual.nil?

  scores = qual

  scores_mean_local_C(scores, scores.length, SCORE_BASE, window_size)
end
|
589
|
+
|
590
|
+
# Method to find open reading frames (ORFs).
#
# Options:
#   - :size_min      Minimum ORF length including stop codon (default 0).
#   - :size_max      Maximum ORF length (default the sequence length).
#   - :start_codons  Comma separated start codons, matched case insensitively.
#   - :stop_codons   Comma separated stop codons, matched case insensitively.
#   - :pick_longest  Keep only the first found ORF per stop position.
#
# Every start codon is paired with the first stop codon found at or after
# it; the pair is kept if the span is a multiple of 3 and within the size
# limits. Yields each Orf if a block is given; returns the Array of Orfs
# either way.
# NOTE(review): a start whose nearest stop codon is out of frame is dropped
# rather than searched further for an in-frame stop - confirm intended.
def each_orf(options = {})
  size_min     = options[:size_min]     || 0
  size_max     = options[:size_max]     || self.length
  start_codons = options[:start_codons] || "ATG,GTG,AUG,GUG"
  stop_codons  = options[:stop_codons]  || "TAA,TGA,TAG,UAA,UGA,UAG"
  pick_longest = options[:pick_longest]

  regex_start = Regexp.new(start_codons.split(',').join('|'), true)
  regex_stop  = Regexp.new(stop_codons.split(',').join('|'), true)

  orfs   = []
  cursor = 0

  while cursor and cursor < self.length - size_min
    cursor = seq.index(regex_start, cursor)

    break unless cursor

    stop_at = seq.index(regex_stop, cursor)

    if stop_at
      orf_len  = (stop_at - cursor) + 3
      in_frame = (orf_len % 3) == 0

      if in_frame and size_min <= orf_len and orf_len <= size_max
        orfs << Orf.new(self[cursor ... cursor + orf_len], cursor, stop_at + 2)
      end
    end

    # Resume the search one position past the current start codon.
    cursor += 1
  end

  # Starts are visited left to right, so the first ORF per stop position is
  # the longest one ending there.
  orfs = orfs.uniq { |orf| orf.stop } if pick_longest

  return orfs unless block_given?

  orfs.each { |orf| yield orf }
end
|
636
|
+
|
637
|
+
# Read-only value object describing one open reading frame: the
# subsequence entry plus its start and stop positions.
class Orf
  attr_reader :entry
  attr_reader :start
  attr_reader :stop

  def initialize(entry, start, stop)
    @entry, @start, @stop = entry, start, stop
  end
end
|
646
|
+
|
647
|
+
private
|
648
|
+
|
649
|
+
inline do |builder|
|
650
|
+
builder.c %{
|
651
|
+
VALUE qual_coerce_C(
  VALUE _qual,
  VALUE _qual_len,
  VALUE _min_value,
  VALUE _max_value
)
{
  /*
   * Clamp the first _qual_len bytes of _qual in place to the inclusive
   * range _min_value .. _max_value. Returns nil.
   */
  unsigned char *scores = (unsigned char *) StringValuePtr(_qual);
  unsigned int   count  = FIX2UINT(_qual_len);
  unsigned int   low    = FIX2UINT(_min_value);
  unsigned int   high   = FIX2UINT(_max_value);
  unsigned int   pos    = 0;

  while (pos < count)
  {
    unsigned char score = scores[pos];

    if (score > high) {
      scores[pos] = high;
    } else if (score < low) {
      scores[pos] = low;
    }

    pos++;
  }

  return Qnil;
}
|
675
|
+
}
|
676
|
+
|
677
|
+
builder.c %{
|
678
|
+
VALUE qual_convert_C(
  VALUE _qual,
  VALUE _qual_len,
  VALUE _value
)
{
  /*
   * Add the signed offset _value to each of the first _qual_len bytes of
   * _qual in place. Returns nil.
   *
   * The offset is negative when converting from base 64 to base 33, so it
   * is read as a signed int rather than through the unsigned conversion.
   */
  unsigned char *qual     = (unsigned char *) StringValuePtr(_qual);
  unsigned int   qual_len = FIX2UINT(_qual_len);
  int            value    = FIX2INT(_value);
  unsigned int   i        = 0;

  for (i = 0; i < qual_len; i++)
  {
    qual[i] = (unsigned char) (qual[i] + value);
  }

  return Qnil;
}
|
696
|
+
}
|
697
|
+
|
698
|
+
builder.c %{
|
699
|
+
VALUE scores_mean_local_C(
  VALUE _qual,
  VALUE _qual_len,
  VALUE _score_base,
  VALUE _window_size
)
{
  /*
   * Slide a window of _window_size scores across the first _qual_len bytes
   * of _qual, subtracting _score_base from each byte, and return the
   * smallest window mean as a Float.
   *
   * A window larger than the score string is reduced to the string length,
   * so the buffer is never read out of bounds. An empty string or a window
   * size of 0 yields 0.0 instead of a division by zero.
   */
  unsigned char *qual        = (unsigned char *) StringValuePtr(_qual);
  unsigned int   qual_len    = FIX2UINT(_qual_len);
  unsigned int   score_base  = FIX2UINT(_score_base);
  unsigned int   window_size = FIX2UINT(_window_size);
  long           sum         = 0;
  unsigned int   i           = 0;
  float          mean        = 0.0;
  float          new_mean    = 0.0;

  if (window_size > qual_len)
  {
    window_size = qual_len;
  }

  if (window_size == 0)
  {
    return rb_float_new(0.0);
  }

  /* fill window */
  for (i = 0; i < window_size; i++)
  {
    sum += (long) qual[i] - (long) score_base;
  }

  /* floating point division, the mean must not be truncated */
  mean = (float) sum / (float) window_size;

  /* run window across the rest of the scores */
  while (i < qual_len)
  {
    sum += (long) qual[i] - (long) score_base;
    sum -= (long) qual[i - window_size] - (long) score_base;

    new_mean = (float) sum / (float) window_size;

    if (new_mean < mean)
    {
      mean = new_mean;
    }

    i++;
  }

  return rb_float_new(mean);
}
|
737
|
+
}
|
738
|
+
end
|
739
|
+
end
|
740
|
+
end
|
741
|
+
|
742
|
+
__END__
|