BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
data/lib/BioDSL/seq.rb
ADDED
|
@@ -0,0 +1,742 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
|
|
18
|
+
# #
|
|
19
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
20
|
+
# #
|
|
21
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
22
|
+
# #
|
|
23
|
+
# This software is part of BioDSL (www.BioDSL.org). #
|
|
24
|
+
# #
|
|
25
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
26
|
+
|
|
27
|
+
module BioDSL
|
|
28
|
+
require 'narray'
|
|
29
|
+
require 'BioDSL/seq/ambiguity'
|
|
30
|
+
require 'BioDSL/seq/assemble'
|
|
31
|
+
require 'BioDSL/seq/digest'
|
|
32
|
+
require 'BioDSL/seq/kmer'
|
|
33
|
+
require 'BioDSL/seq/translate'
|
|
34
|
+
require 'BioDSL/seq/trim'
|
|
35
|
+
require 'BioDSL/seq/backtrack'
|
|
36
|
+
require 'BioDSL/seq/dynamic'
|
|
37
|
+
require 'BioDSL/seq/homopolymer'
|
|
38
|
+
require 'BioDSL/seq/levenshtein'
|
|
39
|
+
|
|
40
|
+
# Error class for all exceptions to do with Seq.
|
|
41
|
+
class SeqError < StandardError
end
|
|
42
|
+
|
|
43
|
+
class Seq
|
|
44
|
+
# Residue alphabets
|
|
45
|
+
DNA = %w[a t c g]
|
|
46
|
+
RNA = %w[a u c g]
|
|
47
|
+
PROTEIN = %w[f l s y c w p h q r i m t n k v a d e g]
|
|
48
|
+
INDELS = %w[. - _ ~]
|
|
49
|
+
|
|
50
|
+
# Quality scores bases
|
|
51
|
+
SCORE_BASE = 33
|
|
52
|
+
SCORE_MIN = 0
|
|
53
|
+
SCORE_MAX = 40
|
|
54
|
+
|
|
55
|
+
include BioDSL::Digest
|
|
56
|
+
include BioDSL::Homopolymer
|
|
57
|
+
include BioDSL::Translate
|
|
58
|
+
include BioDSL::Trim
|
|
59
|
+
include BioDSL::Kmer
|
|
60
|
+
include BioDSL::BackTrack
|
|
61
|
+
|
|
62
|
+
attr_accessor :seq_name, :seq, :type, :qual
|
|
63
|
+
|
|
64
|
+
# Class method to instantiate a new Sequence object given
|
|
65
|
+
# a Biopiece record.
|
|
66
|
+
def self.new_bp(record)
  # Reads :SEQ_NAME, :SEQ, :SEQ_TYPE and :SCORES from +record+ and passes
  # them to Seq.new. :SEQ_TYPE is converted with #to_sym when present;
  # otherwise the type is left nil.
  seq_type = record[:SEQ_TYPE]

  new(
    seq_name: record[:SEQ_NAME],
    seq:      record[:SEQ],
    type:     (seq_type ? seq_type.to_sym : nil),
    qual:     record[:SCORES]
  )
end
|
|
74
|
+
|
|
75
|
+
# Class method that generates all possible oligos of a specified length and type.
|
|
76
|
+
def self.generate_oligos(length, type)
  # Returns an Array of Strings holding every oligo of +length+ residues
  # built from the alphabet selected by +type+ (:dna, :rna or :protein).
  # +type+ may be a Symbol or a String and is matched case-insensitively.
  #
  # Raises SeqError if +length+ is zero or negative, or if +type+ does not
  # name a known alphabet.
  raise SeqError, "Cannot generate oligos of zero or negative length: #{length}" if length <= 0

  # Normalise through String so that "DNA", :DNA and :dna all select DNA and
  # a nil type is reported as an unknown type rather than a NoMethodError.
  alph = case type.to_s.downcase.to_sym
         when :dna     then DNA
         when :rna     then RNA
         when :protein then PROTEIN
         else raise SeqError, "Unknown sequence type: #{type}"
         end

  # Extend every oligo by every residue, one position per round, so the
  # last position varies fastest in the resulting list.
  (1..length).inject(['']) do |oligos, _|
    oligos.flat_map { |oligo| alph.map { |char| oligo + char } }
  end
end
|
|
103
|
+
|
|
104
|
+
def self.check_name_pair(entry1, entry2)
  # Verifies that two entries carry the same base name. Two seq_name layouts
  # are recognised: "<name> <digit>:..." (base name is the text before the
  # first space) and "<name>/<digit>" (base name is the text before the
  # final slash).
  #
  # Raises SeqError if either name matches neither layout, or if the two
  # base names differ. Returns nil otherwise.
  base_names = [entry1, entry2].map do |entry|
    case entry.seq_name
    when /^([^ ]+) \d:/ then $1
    when /^(.+)\/\d$/   then $1
    else raise SeqError, "Could not match sequence name: #{entry.seq_name}"
    end
  end

  first, second = base_names

  raise SeqError, "Name mismatch: #{first} != #{second}" if first != second
end
|
|
125
|
+
|
|
126
|
+
# Initialize a sequence object with the following options:
|
|
127
|
+
# - :seq_name Name of the sequence.
|
|
128
|
+
# - :seq The sequence.
|
|
129
|
+
# - :type The sequence type - DNA, RNA, or protein
|
|
130
|
+
# - :qual An Illumina type quality scores string.
|
|
131
|
+
def initialize(options = {})
  # Stores the :seq_name, :seq, :type and :qual options (all optional).
  # When both a sequence and quality scores are given they must have the
  # same length; otherwise SeqError is raised.
  @seq_name = options[:seq_name]
  @seq      = options[:seq]
  @type     = options[:type]
  @qual     = options[:qual]

  return unless @seq && @qual
  return if @seq.length == @qual.length

  # The trailing space keeps the lengths separated from the colon.
  raise SeqError, 'Sequence length and score length mismatch: ' \
                  "#{@seq.length} != #{@qual.length}"
end
|
|
142
|
+
|
|
143
|
+
# Method that guesses and returns the sequence type
|
|
144
|
+
# by inspecting the first 100 residues.
|
|
145
|
+
def type_guess
  # Looks at the first 100 residues (case-insensitively): any of f, l, p, q,
  # i or e means :protein; otherwise any u means :rna; otherwise :dna.
  # Raises SeqError if the sequence is nil.
  raise SeqError, "Guess failed: sequence is nil" if seq.nil?

  head = seq[0...100].downcase

  return :protein if head =~ /[flpqie]/
  return :rna     if head =~ /[u]/

  :dna
end
|
|
154
|
+
|
|
155
|
+
# Method that guesses and sets the sequence type
|
|
156
|
+
# by inspecting the first 100 residues.
|
|
157
|
+
def type_guess!
  # Stores the result of #type_guess in the type attribute and returns self
  # so calls can be chained.
  guessed = type_guess
  self.type = guessed
  self
end
|
|
161
|
+
|
|
162
|
+
# Returns the length of a sequence.
|
|
163
|
+
def length
  # A missing (nil) sequence counts as length 0.
  return 0 if seq.nil?

  seq.length
end
|
|
166
|
+
|
|
167
|
+
alias :len :length
|
|
168
|
+
|
|
169
|
+
# Return the number of indels in a sequence.
|
|
170
|
+
def indels
  # Counts the characters of seq that appear in INDELS. The characters are
  # regexp-escaped so that '-' and '.' are literal inside the character class.
  gap_class = Regexp.escape(INDELS.join(""))

  seq.scan(/[#{gap_class}]/).size
end
|
|
174
|
+
|
|
175
|
+
# Method to remove indels from seq and, if quality scores are present, the matching positions from qual.
|
|
176
|
+
def indels_remove
  # Strips every INDELS character from seq in place. When quality scores
  # are present, the scores at the same positions are dropped as well so
  # seq and qual stay aligned. Returns self.
  if qual.nil?
    # The escaped string is used as a String#delete character set; the
    # backslashes keep '-' from being read as a range operator.
    seq.delete!(Regexp.escape(INDELS.join('')))
    return self
  end

  seq_bytes  = NArray.to_na(seq, "byte")
  qual_bytes = NArray.to_na(qual, "byte")

  # Sum the per-character equality masks; a position is non-zero if it
  # holds any indel character.
  gap_hits = INDELS.inject(NArray.byte(length)) do |hits, gap|
    hits + seq_bytes.eq(gap.ord)
  end

  keep = gap_hits.eq(0)

  self.seq  = seq_bytes[keep].to_s
  self.qual = qual_bytes[keep].to_s

  self
end
|
|
196
|
+
|
|
197
|
+
# Method that returns true if the sequence type is DNA.
|
|
198
|
+
def is_dna?
  # Compares the stored type attribute only; the residues are not inspected.
  type == :dna
end
|
|
201
|
+
|
|
202
|
+
# Method that returns true if the sequence type is RNA.
|
|
203
|
+
def is_rna?
  # Compares the stored type attribute only; the residues are not inspected.
  type == :rna
end
|
|
206
|
+
|
|
207
|
+
# Method that returns true if the sequence type is protein.
|
|
208
|
+
def is_protein?
  # Compares the stored type attribute only; the residues are not inspected.
  type == :protein
end
|
|
211
|
+
|
|
212
|
+
# Method to transcribe DNA to RNA.
|
|
213
|
+
def to_rna
  # Transcribes in place: sets the type to :rna and replaces T/t with U/u.
  # Returns the (modified) sequence string.
  #
  # Raises SeqError if the sequence is empty or nil, or if the type is not
  # :dna.
  raise SeqError, "Cannot transcribe 0 length sequence" if length == 0
  raise SeqError, "Cannot transcribe sequence type: #{type}" unless is_dna?

  self.type = :rna
  seq.tr!('Tt', 'Uu')

  # String#tr! returns nil when nothing was replaced (e.g. a sequence with
  # no T), so return the sequence explicitly instead of the tr! result.
  seq
end
|
|
219
|
+
|
|
220
|
+
# Method to reverse-transcribe RNA to DNA.
|
|
221
|
+
def to_dna
  # Reverse-transcribes in place: sets the type to :dna and replaces U/u
  # with T/t. Returns the (modified) sequence string.
  #
  # Raises SeqError if the sequence is empty or nil, or if the type is not
  # :rna.
  raise SeqError, "Cannot reverse-transcribe 0 length sequence" if length == 0
  raise SeqError, "Cannot reverse-transcribe sequence type: #{type}" unless is_rna?

  self.type = :dna
  seq.tr!('Uu', 'Tt')

  # String#tr! returns nil when nothing was replaced (e.g. a sequence with
  # no U), so return the sequence explicitly instead of the tr! result.
  seq
end
|
|
227
|
+
|
|
228
|
+
# Method that given a Seq entry returns a BioDSL record (a hash).
|
|
229
|
+
def to_bp
  # Builds a Hash with :SEQ_NAME, :SEQ, :SEQ_LEN and :SCORES. A key is only
  # added when the corresponding attribute is set; the type is not exported.
  record = {}

  record[:SEQ_NAME] = seq_name if seq_name

  if seq
    record[:SEQ]     = seq
    record[:SEQ_LEN] = seq.length
  end

  record[:SCORES] = qual if qual

  record
end
|
|
237
|
+
|
|
238
|
+
# Method that given a Seq entry returns a FASTA entry (a string).
|
|
239
|
+
def to_fasta(wrap = nil)
  # Returns ">name", the sequence and a trailing record separator ($/).
  # When +wrap+ is given, a separator is inserted after every +wrap+
  # residues. The entry itself is left untouched.
  #
  # Raises SeqError if seq_name is nil or '' or if seq is nil or empty.
  raise SeqError, "Missing seq_name" if seq_name.nil? or seq_name == ''
  raise SeqError, "Missing seq"      if seq.nil? or seq.empty?

  name     = seq_name.to_s
  residues = seq.to_s

  unless wrap.nil?
    # String#to_s returns the receiver itself, so the wrapping must be done
    # on a copy (gsub/chomp, not gsub!/chomp!) or the separators would end
    # up inside the entry's own sequence.
    residues = residues.gsub(/(.{#{wrap}})/) { |chunk| chunk + $/ }.chomp
  end

  ">#{name}#{$/}#{residues}#{$/}"
end
|
|
256
|
+
|
|
257
|
+
# Method that given a Seq entry returns a FASTQ entry (a string).
|
|
258
|
+
def to_fastq
  # Returns the four FASTQ lines ("@name", sequence, "+", scores), each
  # terminated by the record separator ($/).
  #
  # Raises SeqError if seq_name, seq or qual is nil.
  raise SeqError, "Missing seq_name" if seq_name.nil?
  raise SeqError, "Missing seq"      if seq.nil?
  raise SeqError, "Missing qual"     if qual.nil?

  lines = ["@#{seq_name}", seq.to_s, "+", qual.to_s]

  lines.map { |line| "#{line}#{$/}" }.join
end
|
|
269
|
+
|
|
270
|
+
# Method that generates a unique key for a
|
|
271
|
+
# DNA sequence and return this key as a Fixnum.
|
|
272
|
+
def to_key
  # Packs the sequence into an Integer using two bits per residue
  # (A=0, C=1, G=2, T=3, case-insensitive), first residue in the most
  # significant position. An empty sequence gives 0.
  #
  # Raises SeqError on any other character.
  codes = { 'A' => 0, 'C' => 1, 'G' => 2, 'T' => 3 }

  seq.upcase.each_char.inject(0) do |key, char|
    code = codes[char]

    raise SeqError, "Bad residue: #{char}" if code.nil?

    (key << 2) | code
  end
end
|
|
289
|
+
|
|
290
|
+
# Method to reverse the sequence.
|
|
291
|
+
def reverse
  # Returns a new Seq with seq (and qual, when present) reversed; name and
  # type are carried over. The receiver is not modified.
  flipped_qual = qual ? qual.reverse : qual

  Seq.new(seq_name: seq_name, seq: seq.reverse, type: type, qual: flipped_qual)
end
|
|
301
|
+
|
|
302
|
+
# Method to reverse the sequence (and quality scores, if present) in place.
|
|
303
|
+
def reverse!
  # Reverses seq in place, and qual too when present. Returns self.
  seq.reverse!

  scores = qual
  scores.reverse! if scores

  self
end
|
|
308
|
+
|
|
309
|
+
# Method that complements sequence including ambiguity codes.
|
|
310
|
+
def complement
  # Returns a new Seq holding the complement (IUPAC ambiguity codes
  # included, case preserved); name, type and qual are carried over and the
  # receiver is not modified. The result is not reversed.
  #
  # Raises SeqError for an empty sequence or a type other than :dna/:rna.
  raise SeqError, "Cannot complement 0 length sequence" if length == 0

  # Built without a sequence first, then filled in, as the type check comes
  # after construction.
  entry = Seq.new(seq_name: seq_name, type: type, qual: qual)

  from = 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn'

  to = if is_dna?
         'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn'
       elsif is_rna?
         'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn'
       else
         raise SeqError, "Cannot complement sequence type: #{type}"
       end

  entry.seq = seq.tr(from, to)
  entry
end
|
|
329
|
+
|
|
330
|
+
# Method that complements sequence in place, including ambiguity codes.
|
|
331
|
+
def complement!
  # Complements seq in place (IUPAC ambiguity codes included, case
  # preserved) and returns self. The sequence is not reversed.
  #
  # Raises SeqError for an empty sequence or a type other than :dna/:rna.
  raise SeqError, "Cannot complement 0 length sequence" if length == 0

  from = 'AGCUTRYWSMKHDVBNagcutrywsmkhdvbn'

  to = if is_dna?
         'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn'
       elsif is_rna?
         'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn'
       else
         raise SeqError, "Cannot complement sequence type: #{type}"
       end

  seq.tr!(from, to)

  self
end
|
|
344
|
+
|
|
345
|
+
# Method to determine the Hamming Distance between
|
|
346
|
+
# two Sequence objects (case insensitive).
|
|
347
|
+
def hamming_distance(entry, options = {})
  # Delegates to BioDSL::Hamming.distance, forwarding +options+ unchanged.
  # Both sequences are upcased first unless options[:ambiguity] is set, in
  # which case they are passed through as they are.
  own   = seq
  other = entry.seq

  own, other = own.upcase, other.upcase unless options[:ambiguity]

  BioDSL::Hamming.distance(own, other, options)
end
|
|
354
|
+
|
|
355
|
+
# Method to determine the Edit Distance between
|
|
356
|
+
# two Sequence objects (case insensitive).
|
|
357
|
+
def edit_distance(entry)
  # Delegates to Levenshtein.distance with both sequences passed as they
  # are. NOTE(review): no upcasing happens here, so case-insensitivity
  # depends on Levenshtein.distance itself - confirm.
  own   = seq
  other = entry.seq

  Levenshtein.distance(own, other)
end
|
|
360
|
+
|
|
361
|
+
# Method that generates a random sequence of a given length and type.
|
|
362
|
+
def generate(length, type)
  # Replaces seq with +length+ residues drawn at random (Kernel#rand) from
  # the alphabet of +type+ (:dna, :rna or :protein), sets the type, and
  # returns the new sequence string. qual is left untouched.
  #
  # Raises SeqError for a non-positive length or an unknown type.
  raise SeqError, "Cannot generate sequence length < 1: #{length}" if length <= 0

  alph = case type
         when :dna     then DNA
         when :rna     then RNA
         when :protein then PROTEIN
         else raise SeqError, "Unknown sequence type: #{type}"
         end

  residues = Array.new(length) { alph[rand(alph.size)] }
  fresh    = residues.join("")

  self.seq  = fresh
  self.type = type

  fresh
end
|
|
378
|
+
|
|
379
|
+
# Method to return a new Seq object with shuffled sequence.
|
|
380
|
+
def shuffle
  # Returns a new Seq whose residues are a random permutation of this
  # one's; name, type and qual are carried over. qual is passed along in
  # its original order.
  scrambled = seq.split('').shuffle.join

  Seq.new(seq_name: seq_name, seq: scrambled, type: type, qual: qual)
end
|
|
388
|
+
|
|
389
|
+
# Method to shuffle a sequence randomly inline.
|
|
390
|
+
def shuffle!
  # Replaces seq with a random permutation of its residues and returns
  # self. qual is not reordered.
  scrambled = seq.split('').shuffle.join

  self.seq = scrambled
  self
end
|
|
394
|
+
|
|
395
|
+
# Method to add two Seq objects.
|
|
396
|
+
def +(entry)
  # Returns a new Seq holding the two sequences joined. The type is kept
  # only when both entries agree on it, and qual is joined only when both
  # entries have one. The new entry has no seq_name.
  Seq.new.tap do |joined|
    joined.seq  = seq + entry.seq
    joined.type = type if type == entry.type
    joined.qual = qual + entry.qual if qual and entry.qual
  end
end
|
|
403
|
+
|
|
404
|
+
# Method to concatenate sequence entries.
|
|
405
|
+
def <<(entry)
  # Appends +entry+'s seq (and qual, when present) to this entry in place
  # and returns self.
  #
  # Raises SeqError when the types differ or when the two qual attributes
  # are of different classes (e.g. one is nil and the other a String).
  raise SeqError, "sequences of different types" unless type == entry.type
  raise SeqError, "qual is missing in one entry" unless qual.class == entry.qual.class

  seq << entry.seq

  extra_scores = entry.qual
  qual << extra_scores unless extra_scores.nil?

  self
end
|
|
414
|
+
|
|
415
|
+
# Index method for Seq objects.
|
|
416
|
+
def [](*args)
  # Returns a new Seq holding the slice String#[] selects from seq (and
  # from qual, when present) for +args+. A slice that comes back nil is
  # replaced by "". The name is duplicated and the type copied.
  piece = Seq.new

  piece.seq_name = seq_name.dup unless seq_name.nil?
  piece.seq      = seq[*args] || ""
  piece.type     = type

  unless qual.nil?
    piece.qual = qual[*args] || ""
  end

  piece
end
|
|
425
|
+
|
|
426
|
+
# Index assignment method for Seq objects.
|
|
427
|
+
def []=(*args, entry)
  # Overwrites the slice of seq selected by +args+ with the slice of
  # +entry+.seq selected by the same +args+; qual is treated the same way
  # when this entry has one.
  # NOTE(review): +entry+ must respond to #seq (and #qual when this entry
  # has quality scores) - confirm against callers.
  replacement = entry.seq[*args]
  seq[*args] = replacement

  unless qual.nil?
    qual[*args] = entry.qual[*args]
  end

  self
end
|
|
433
|
+
|
|
434
|
+
# Method that returns the residue compositions of a sequence in
|
|
435
|
+
# a hash where the key is the residue and the value is the residue
|
|
436
|
+
# count.
|
|
437
|
+
def composition
  # Counts residues case-insensitively (keys are upper-case characters).
  # The returned Hash has a default of 0, so absent residues read as 0.
  seq.upcase.each_char.with_object(Hash.new(0)) do |char, tally|
    tally[char] += 1
  end
end
|
|
446
|
+
|
|
447
|
+
# Method that returns the percentage of hard masked residues
|
|
448
|
+
# or N's in a sequence.
|
|
449
|
+
def hard_mask
  # Percentage (rounded to two decimals) of N/n residues relative to the
  # sequence length with indels excluded.
  masked   = seq.upcase.scan("N").size.to_f
  residues = (len - indels).to_f

  (masked / residues * 100).round(2)
end
|
|
452
|
+
|
|
453
|
+
# Method that returns the percentage of soft masked residues
|
|
454
|
+
# or lower cased residues in a sequence.
|
|
455
|
+
def soft_mask
  # Percentage (rounded to two decimals) of lower-case a-z residues
  # relative to the sequence length with indels excluded.
  masked   = seq.scan(/[a-z]/).size.to_f
  residues = (len - indels).to_f

  (masked / residues * 100).round(2)
end
|
|
458
|
+
|
|
459
|
+
# Hard masks sequence residues where the corresponding quality score
|
|
460
|
+
# is below a given cutoff.
|
|
461
|
+
def mask_seq_hard!(cutoff)
  # Replaces every residue whose score (qual byte minus SCORE_BASE) is
  # below +cutoff+ with 'N'. '-' positions are never masked. The whole
  # sequence is upcased as part of the operation. Returns self.
  #
  # Raises SeqError if seq or qual is nil, or if +cutoff+ lies outside
  # SCORE_MIN..SCORE_MAX.
  raise SeqError, "seq is nil"  if seq.nil?
  raise SeqError, "qual is nil" if qual.nil?

  unless (SCORE_MIN..SCORE_MAX).include?(cutoff)
    raise SeqError, "cutoff value: #{cutoff} out of range #{SCORE_MIN} .. #{SCORE_MAX}"
  end

  na_seq  = NArray.to_na(seq.upcase, "byte")
  na_qual = NArray.to_na(qual, "byte")

  # 1 where the score is too low, then zeroed again wherever seq has '-'.
  mask  = (na_qual - SCORE_BASE) < cutoff
  mask *= na_seq.ne("-".ord)

  na_seq[mask] = 'N'.ord

  self.seq = na_seq.to_s

  self
end
|
|
477
|
+
|
|
478
|
+
# Soft masks sequence residues where the corresponding quality score
|
|
479
|
+
# is below a given cutoff. Masked sequence will be lowercased and
|
|
480
|
+
# remaining will be uppercased.
|
|
481
|
+
def mask_seq_soft!(cutoff)
  # Lower-cases every residue whose score (qual byte minus SCORE_BASE) is
  # below +cutoff+; all other residues end up upper-case. '-' positions
  # are never masked. Returns self.
  #
  # Raises SeqError if seq or qual is nil, or if +cutoff+ lies outside
  # SCORE_MIN..SCORE_MAX.
  raise SeqError, "seq is nil"  if seq.nil?
  raise SeqError, "qual is nil" if qual.nil?

  unless (SCORE_MIN..SCORE_MAX).include?(cutoff)
    raise SeqError, "cutoff value: #{cutoff} out of range #{SCORE_MIN} .. #{SCORE_MAX}"
  end

  na_seq  = NArray.to_na(seq.upcase, "byte")
  na_qual = NArray.to_na(qual, "byte")

  # 1 where the score is too low, then zeroed again wherever seq has '-'.
  mask  = (na_qual - SCORE_BASE) < cutoff
  mask *= na_seq.ne("-".ord)

  # XOR with 0x20 (' ') flips the ASCII case bit; the sequence was upcased
  # above, so the selected letters become lower-case.
  na_seq[mask] ^= ' '.ord

  self.seq = na_seq.to_s

  self
end
|
|
497
|
+
|
|
498
|
+
# Method that determines if a quality score string can be
|
|
499
|
+
# absolutely identified as base 33.
|
|
500
|
+
def qual_base33?
  # True when qual contains at least one character in the range '!'..':'.
  !qual.match(/[!-:]/).nil?
end
|
|
503
|
+
|
|
504
|
+
# Method that determines if a quality score string may be base 64.
|
|
505
|
+
def qual_base64?
  # True when qual contains at least one character in the range 'K'..'h'.
  !qual.match(/[K-h]/).nil?
end
|
|
508
|
+
|
|
509
|
+
# Method to determine if a quality score is valid accepting only 0-40 range.
|
|
510
|
+
def qual_valid?(encoding)
  # Returns true when every character of qual lies in the range for
  # +encoding+: '!'..'I' for :base_33, '@'..'h' for :base_64. An empty qual
  # is valid.
  #
  # Raises SeqError if qual is nil or +encoding+ is unknown.
  raise SeqError, "Missing qual" if qual.nil?

  # \A and \z anchor the whole string. ^ and $ anchor single lines, which
  # would accept a multi-line string as soon as one line is valid.
  pattern = case encoding
            when :base_33 then /\A[!-I]*\z/
            when :base_64 then /\A[@-h]*\z/
            else raise SeqError, "unknown quality score encoding: #{encoding}"
            end

  !qual.match(pattern).nil?
end
|
|
521
|
+
|
|
522
|
+
# Method to coerce quality scores to be within the 0-40 range.
|
|
523
|
+
def qual_coerce!(encoding)
  # Hands qual to the inline C helper qual_coerce_C together with the byte
  # bounds for +encoding+: 33 and 73 ('!' and 'I') for :base_33, 64 and 104
  # ('@' and 'h') for :base_64. Returns self.
  # NOTE(review): presumably the helper clamps qual in place to those
  # bounds - confirm against its C body.
  #
  # Raises SeqError if qual is nil or +encoding+ is unknown.
  raise SeqError, "Missing qual" if qual.nil?

  bounds = case encoding
           when :base_33 then [33, 73]
           when :base_64 then [64, 104]
           else raise SeqError, "unknown quality score encoding: #{encoding}"
           end

  qual_coerce_C(qual, qual.length, *bounds)

  self
end
|
|
535
|
+
|
|
536
|
+
# Method to convert quality scores.
|
|
537
|
+
def qual_convert!(from, to)
  # Converts qual between :base_33 and :base_64 via the inline C helpers.
  # Converting to the same encoding does nothing. Returns self.
  #
  # Raises SeqError if either encoding is unknown.
  raise SeqError, "unknown quality score encoding: #{from}" unless from == :base_33 or from == :base_64
  raise SeqError, "unknown quality score encoding: #{to}"   unless to == :base_33 or to == :base_64

  shift = 64 - 33

  if from == :base_33 && to == :base_64
    qual_convert_C(qual, qual.length, shift)
  elsif from == :base_64 && to == :base_33
    # Coerce first so negative Solexa values (-5 to -1) are set to 0
    # before shifting down.
    qual_coerce_C(qual, qual.length, 64, 104)
    qual_convert_C(qual, qual.length, -shift)
  end

  self
end
|
|
550
|
+
|
|
551
|
+
# Method to calculate and return the mean quality score.
|
|
552
|
+
def scores_mean
  # Mean of the qual bytes after subtracting SCORE_BASE.
  # Raises SeqError if qual is nil.
  raise SeqError, "Missing qual in entry" if qual.nil?

  phred = NArray.to_na(qual, "byte") - SCORE_BASE

  phred.mean
end
|
|
560
|
+
|
|
561
|
+
# Method to calculate and return the min quality score.
|
|
562
|
+
def scores_min
  # Smallest qual byte after subtracting SCORE_BASE.
  # Raises SeqError if qual is nil.
  raise SeqError, "Missing qual in entry" if qual.nil?

  phred = NArray.to_na(qual, "byte") - SCORE_BASE

  phred.min
end
|
|
570
|
+
|
|
571
|
+
# Method to calculate and return the max quality score.
|
|
572
|
+
def scores_max
  # Largest qual byte after subtracting SCORE_BASE.
  # Raises SeqError if qual is nil.
  raise SeqError, "Missing qual in entry" if qual.nil?

  phred = NArray.to_na(qual, "byte") - SCORE_BASE

  phred.max
end
|
|
580
|
+
|
|
581
|
+
# Method to run a sliding window of a specified size across a Phred type
|
|
582
|
+
# scores string and calculate for each window the mean score and return
|
|
583
|
+
# the minimum mean score.
|
|
584
|
+
def scores_mean_local(window_size)
  # Passes qual, its length, SCORE_BASE and +window_size+ to the inline C
  # helper scores_mean_local_C and returns its result.
  # Raises SeqError if qual is nil.
  raise SeqError, "Missing qual in entry" if qual.nil?

  scores = qual

  scores_mean_local_C(scores, scores.length, SCORE_BASE, window_size)
end
|
|
589
|
+
|
|
590
|
+
# Scan the sequence for open reading frames (ORFs).
#
# Options (all optional):
#   :size_min     - shortest ORF length to report (default 0).
#   :size_max     - longest ORF length to report (default: sequence length).
#   :start_codons - comma separated start codons (default "ATG,GTG,AUG,GUG").
#   :stop_codons  - comma separated stop codons
#                   (default "TAA,TGA,TAG,UAA,UGA,UAG").
#   :pick_longest - when truthy, keep only the first ORF found per stop
#                   position.
#
# Codons are matched case insensitively. For every start codon the nearest
# stop codon at or after it is located; the pair is reported only when the
# span including the stop codon is a multiple of three and lies within
# size_min..size_max.
#
# Each ORF is yielded to the block when one is given. The Array of Orf objects
# is returned either way.
def each_orf(options = {})
  size_min     = options[:size_min] || 0
  size_max     = options[:size_max] || self.length
  start_codons = options[:start_codons] || "ATG,GTG,AUG,GUG"
  stop_codons  = options[:stop_codons] || "TAA,TGA,TAG,UAA,UGA,UAG"

  start_re = Regexp.new(start_codons.split(',').join('|'), Regexp::IGNORECASE)
  stop_re  = Regexp.new(stop_codons.split(',').join('|'), Regexp::IGNORECASE)

  found  = []
  cursor = 0

  while cursor < self.length - size_min
    cursor = seq.index(start_re, cursor)

    # No further start codon: nothing left to report.
    break if cursor.nil?

    stop_at = seq.index(stop_re, cursor)

    if stop_at
      # Span from the first base of the start codon through the stop codon.
      span = stop_at - cursor + 3

      if (span % 3).zero? && span.between?(size_min, size_max)
        found << Orf.new(self[cursor...cursor + span], cursor, stop_at + 2)
      end
    end

    cursor += 1
  end

  # The scan moves left to right, so the first ORF recorded for a given stop
  # position is the one with the leftmost start.
  found = found.uniq { |orf| orf.stop } if options[:pick_longest]

  found.each { |orf| yield orf } if block_given?

  found
end
|
|
636
|
+
|
|
637
|
+
# Read-only record for one open reading frame: the entry it was built from
# plus the start and stop positions handed to the constructor.
class Orf
  attr_reader :entry
  attr_reader :start
  attr_reader :stop

  # Store the given entry, start and stop unchanged.
  def initialize(entry, start, stop)
    @entry, @start, @stop = entry, start, stop
  end
end
|
|
646
|
+
|
|
647
|
+
private
|
|
648
|
+
|
|
649
|
+
inline do |builder|
|
|
650
|
+
builder.c %{
|
|
651
|
+
/*
 * Clamp the first _qual_len bytes of the _qual string, in place.
 *
 * A byte above _max_value is overwritten with _max_value; otherwise a byte
 * below _min_value is overwritten with _min_value.
 *
 * Always returns nil.
 */
VALUE qual_coerce_C(
  VALUE _qual,
  VALUE _qual_len,
  VALUE _min_value,
  VALUE _max_value
)
{
  unsigned char *cursor    = (unsigned char *) StringValuePtr(_qual);
  unsigned int   remaining = FIX2UINT(_qual_len);
  unsigned int   lower     = FIX2UINT(_min_value);
  unsigned int   upper     = FIX2UINT(_max_value);

  for (; remaining > 0; remaining--, cursor++)
  {
    unsigned int score = *cursor;

    if (score > upper) {
      *cursor = (unsigned char) upper;
    } else if (score < lower) {
      *cursor = (unsigned char) lower;
    }
  }

  return Qnil;
}
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
builder.c %{
|
|
678
|
+
/*
 * Add _value to each of the first _qual_len bytes of the _qual string, in
 * place.
 *
 * The offset is held as an unsigned int and the sum is narrowed back to one
 * byte, so the arithmetic wraps modulo 256.
 * NOTE(review): callers pass a negative offset and depend on that wrap;
 * confirm the unsigned conversion accepts negative Fixnums on all targets.
 *
 * Always returns nil.
 */
VALUE qual_convert_C(
  VALUE _qual,
  VALUE _qual_len,
  VALUE _value
)
{
  unsigned char *cursor = (unsigned char *) StringValuePtr(_qual);
  unsigned int   count  = FIX2UINT(_qual_len);
  unsigned int   delta  = FIX2UINT(_value);
  unsigned char *stop   = cursor + count;

  while (cursor < stop)
  {
    *cursor = (unsigned char) (*cursor + delta);
    cursor++;
  }

  return Qnil;
}
|
|
696
|
+
}
|
|
697
|
+
|
|
698
|
+
builder.c %{
|
|
699
|
+
/*
 * Slide a window of _window_size scores across the first _qual_len bytes of
 * the _qual string and return the lowest window mean as a Float. A score is
 * the byte value minus _score_base.
 *
 * The mean uses floating point division, so fractional means are kept rather
 * than truncated. The running sum is signed, so a byte below _score_base
 * counts as a negative score instead of wrapping around.
 *
 * A window longer than the string is shrunk to the string length. An empty
 * string or a zero window yields 0.0.
 */
VALUE scores_mean_local_C(
  VALUE _qual,
  VALUE _qual_len,
  VALUE _score_base,
  VALUE _window_size
)
{
  unsigned char *qual        = (unsigned char *) StringValuePtr(_qual);
  unsigned int   qual_len    = FIX2UINT(_qual_len);
  unsigned int   score_base  = FIX2UINT(_score_base);
  unsigned int   window_size = FIX2UINT(_window_size);
  long           sum         = 0;
  unsigned int   i           = 0;
  float          mean        = 0.0;
  float          new_mean    = 0.0;

  // nothing to average: avoids a division by zero further down
  if (qual_len == 0 || window_size == 0)
    return rb_float_new(0.0);

  // never read past the end of the string
  if (window_size > qual_len)
    window_size = qual_len;

  // fill window
  for (i = 0; i < window_size; i++)
    sum += (long) qual[i] - (long) score_base;

  mean = (float) sum / (float) window_size;

  // run window across the rest of the scores
  while (i < qual_len)
  {
    // score entering the window minus score leaving it; the base cancels out
    sum += (long) qual[i] - (long) qual[i - window_size];

    new_mean = (float) sum / (float) window_size;

    if (new_mean < mean)
      mean = new_mean;

    i++;
  }

  return rb_float_new(mean);
}
|
|
737
|
+
}
|
|
738
|
+
end
|
|
739
|
+
end
|
|
740
|
+
end
|
|
741
|
+
|
|
742
|
+
__END__
|