BioDSL 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/BioDSL.gemspec +1 -1
- data/Gemfile +6 -0
- data/README.md +289 -155
- data/Rakefile +18 -16
- data/lib/BioDSL.rb +1 -1
- data/lib/BioDSL/cary.rb +78 -53
- data/lib/BioDSL/command.rb +2 -2
- data/lib/BioDSL/commands.rb +1 -1
- data/lib/BioDSL/commands/add_key.rb +1 -1
- data/lib/BioDSL/commands/align_seq_mothur.rb +4 -4
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +5 -5
- data/lib/BioDSL/commands/assemble_pairs.rb +13 -13
- data/lib/BioDSL/commands/assemble_seq_idba.rb +7 -9
- data/lib/BioDSL/commands/assemble_seq_ray.rb +13 -13
- data/lib/BioDSL/commands/assemble_seq_spades.rb +4 -4
- data/lib/BioDSL/commands/classify_seq.rb +8 -8
- data/lib/BioDSL/commands/classify_seq_mothur.rb +5 -5
- data/lib/BioDSL/commands/clip_primer.rb +7 -7
- data/lib/BioDSL/commands/cluster_otus.rb +5 -5
- data/lib/BioDSL/commands/collapse_otus.rb +2 -2
- data/lib/BioDSL/commands/collect_otus.rb +2 -2
- data/lib/BioDSL/commands/complement_seq.rb +4 -4
- data/lib/BioDSL/commands/count.rb +1 -1
- data/lib/BioDSL/commands/count_values.rb +2 -2
- data/lib/BioDSL/commands/degap_seq.rb +6 -7
- data/lib/BioDSL/commands/dereplicate_seq.rb +1 -1
- data/lib/BioDSL/commands/dump.rb +2 -2
- data/lib/BioDSL/commands/filter_rrna.rb +4 -4
- data/lib/BioDSL/commands/genecall.rb +7 -7
- data/lib/BioDSL/commands/grab.rb +1 -1
- data/lib/BioDSL/commands/index_taxonomy.rb +3 -3
- data/lib/BioDSL/commands/mask_seq.rb +4 -4
- data/lib/BioDSL/commands/mean_scores.rb +2 -2
- data/lib/BioDSL/commands/merge_pair_seq.rb +3 -3
- data/lib/BioDSL/commands/merge_table.rb +1 -1
- data/lib/BioDSL/commands/merge_values.rb +1 -1
- data/lib/BioDSL/commands/plot_heatmap.rb +4 -5
- data/lib/BioDSL/commands/plot_histogram.rb +4 -4
- data/lib/BioDSL/commands/plot_matches.rb +5 -5
- data/lib/BioDSL/commands/plot_residue_distribution.rb +6 -6
- data/lib/BioDSL/commands/plot_scores.rb +7 -7
- data/lib/BioDSL/commands/random.rb +1 -1
- data/lib/BioDSL/commands/read_fasta.rb +9 -9
- data/lib/BioDSL/commands/read_fastq.rb +16 -16
- data/lib/BioDSL/commands/read_table.rb +2 -3
- data/lib/BioDSL/commands/reverse_seq.rb +4 -4
- data/lib/BioDSL/commands/slice_align.rb +4 -4
- data/lib/BioDSL/commands/slice_seq.rb +3 -3
- data/lib/BioDSL/commands/sort.rb +1 -1
- data/lib/BioDSL/commands/split_pair_seq.rb +6 -7
- data/lib/BioDSL/commands/split_values.rb +2 -2
- data/lib/BioDSL/commands/trim_primer.rb +13 -8
- data/lib/BioDSL/commands/trim_seq.rb +5 -5
- data/lib/BioDSL/commands/uchime_ref.rb +6 -6
- data/lib/BioDSL/commands/uclust.rb +5 -5
- data/lib/BioDSL/commands/unique_values.rb +1 -1
- data/lib/BioDSL/commands/usearch_global.rb +2 -2
- data/lib/BioDSL/commands/usearch_local.rb +2 -2
- data/lib/BioDSL/commands/write_fasta.rb +7 -9
- data/lib/BioDSL/commands/write_fastq.rb +4 -4
- data/lib/BioDSL/commands/write_table.rb +3 -3
- data/lib/BioDSL/commands/write_tree.rb +2 -3
- data/lib/BioDSL/config.rb +2 -2
- data/lib/BioDSL/csv.rb +8 -10
- data/lib/BioDSL/debug.rb +1 -1
- data/lib/BioDSL/fasta.rb +54 -40
- data/lib/BioDSL/fastq.rb +35 -32
- data/lib/BioDSL/filesys.rb +56 -47
- data/lib/BioDSL/fork.rb +1 -1
- data/lib/BioDSL/hamming.rb +1 -1
- data/lib/BioDSL/helpers.rb +1 -1
- data/lib/BioDSL/helpers/aux_helper.rb +1 -1
- data/lib/BioDSL/helpers/email_helper.rb +1 -1
- data/lib/BioDSL/helpers/history_helper.rb +1 -1
- data/lib/BioDSL/helpers/log_helper.rb +1 -1
- data/lib/BioDSL/helpers/options_helper.rb +1 -1
- data/lib/BioDSL/helpers/status_helper.rb +1 -1
- data/lib/BioDSL/html_report.rb +1 -1
- data/lib/BioDSL/math.rb +1 -1
- data/lib/BioDSL/mummer.rb +1 -1
- data/lib/BioDSL/pipeline.rb +1 -1
- data/lib/BioDSL/seq.rb +240 -231
- data/lib/BioDSL/seq/ambiguity.rb +1 -1
- data/lib/BioDSL/seq/assemble.rb +1 -1
- data/lib/BioDSL/seq/backtrack.rb +93 -76
- data/lib/BioDSL/seq/digest.rb +1 -1
- data/lib/BioDSL/seq/dynamic.rb +43 -55
- data/lib/BioDSL/seq/homopolymer.rb +34 -36
- data/lib/BioDSL/seq/kmer.rb +67 -50
- data/lib/BioDSL/seq/levenshtein.rb +35 -40
- data/lib/BioDSL/seq/translate.rb +64 -55
- data/lib/BioDSL/seq/trim.rb +60 -50
- data/lib/BioDSL/serializer.rb +1 -1
- data/lib/BioDSL/stream.rb +1 -1
- data/lib/BioDSL/taxonomy.rb +1 -1
- data/lib/BioDSL/test.rb +1 -1
- data/lib/BioDSL/tmp_dir.rb +1 -1
- data/lib/BioDSL/usearch.rb +1 -1
- data/lib/BioDSL/verbose.rb +1 -1
- data/lib/BioDSL/version.rb +2 -2
- data/test/BioDSL/commands/test_add_key.rb +1 -1
- data/test/BioDSL/commands/test_align_seq_mothur.rb +1 -1
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +1 -1
- data/test/BioDSL/commands/test_assemble_pairs.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +1 -1
- data/test/BioDSL/commands/test_classify_seq.rb +1 -1
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +1 -1
- data/test/BioDSL/commands/test_clip_primer.rb +1 -1
- data/test/BioDSL/commands/test_cluster_otus.rb +1 -1
- data/test/BioDSL/commands/test_collapse_otus.rb +1 -1
- data/test/BioDSL/commands/test_collect_otus.rb +1 -1
- data/test/BioDSL/commands/test_complement_seq.rb +1 -1
- data/test/BioDSL/commands/test_count.rb +1 -1
- data/test/BioDSL/commands/test_count_values.rb +1 -1
- data/test/BioDSL/commands/test_degap_seq.rb +1 -1
- data/test/BioDSL/commands/test_dereplicate_seq.rb +1 -1
- data/test/BioDSL/commands/test_dump.rb +1 -1
- data/test/BioDSL/commands/test_filter_rrna.rb +1 -1
- data/test/BioDSL/commands/test_genecall.rb +1 -1
- data/test/BioDSL/commands/test_grab.rb +1 -1
- data/test/BioDSL/commands/test_index_taxonomy.rb +1 -1
- data/test/BioDSL/commands/test_mask_seq.rb +1 -1
- data/test/BioDSL/commands/test_mean_scores.rb +1 -1
- data/test/BioDSL/commands/test_merge_pair_seq.rb +1 -1
- data/test/BioDSL/commands/test_merge_table.rb +1 -1
- data/test/BioDSL/commands/test_merge_values.rb +1 -1
- data/test/BioDSL/commands/test_plot_heatmap.rb +1 -1
- data/test/BioDSL/commands/test_plot_histogram.rb +1 -1
- data/test/BioDSL/commands/test_plot_matches.rb +1 -1
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +1 -1
- data/test/BioDSL/commands/test_plot_scores.rb +1 -1
- data/test/BioDSL/commands/test_random.rb +1 -1
- data/test/BioDSL/commands/test_read_fasta.rb +1 -1
- data/test/BioDSL/commands/test_read_fastq.rb +1 -1
- data/test/BioDSL/commands/test_read_table.rb +1 -1
- data/test/BioDSL/commands/test_reverse_seq.rb +1 -1
- data/test/BioDSL/commands/test_slice_align.rb +1 -1
- data/test/BioDSL/commands/test_slice_seq.rb +1 -1
- data/test/BioDSL/commands/test_sort.rb +1 -1
- data/test/BioDSL/commands/test_split_pair_seq.rb +1 -1
- data/test/BioDSL/commands/test_split_values.rb +1 -1
- data/test/BioDSL/commands/test_trim_primer.rb +1 -1
- data/test/BioDSL/commands/test_trim_seq.rb +1 -1
- data/test/BioDSL/commands/test_uchime_ref.rb +1 -1
- data/test/BioDSL/commands/test_uclust.rb +1 -1
- data/test/BioDSL/commands/test_unique_values.rb +1 -1
- data/test/BioDSL/commands/test_usearch_global.rb +1 -1
- data/test/BioDSL/commands/test_usearch_local.rb +1 -1
- data/test/BioDSL/commands/test_write_fasta.rb +1 -1
- data/test/BioDSL/commands/test_write_fastq.rb +1 -1
- data/test/BioDSL/commands/test_write_table.rb +1 -1
- data/test/BioDSL/commands/test_write_tree.rb +1 -1
- data/test/BioDSL/helpers/test_options_helper.rb +3 -3
- data/test/BioDSL/seq/test_assemble.rb +58 -56
- data/test/BioDSL/seq/test_backtrack.rb +83 -81
- data/test/BioDSL/seq/test_digest.rb +47 -45
- data/test/BioDSL/seq/test_dynamic.rb +66 -64
- data/test/BioDSL/seq/test_homopolymer.rb +35 -33
- data/test/BioDSL/seq/test_kmer.rb +29 -28
- data/test/BioDSL/seq/test_translate.rb +44 -42
- data/test/BioDSL/seq/test_trim.rb +59 -57
- data/test/BioDSL/test_cary.rb +1 -1
- data/test/BioDSL/test_command.rb +2 -2
- data/test/BioDSL/test_csv.rb +34 -31
- data/test/BioDSL/test_debug.rb +31 -31
- data/test/BioDSL/test_fasta.rb +30 -29
- data/test/BioDSL/test_fastq.rb +27 -26
- data/test/BioDSL/test_filesys.rb +28 -27
- data/test/BioDSL/test_fork.rb +29 -28
- data/test/BioDSL/test_math.rb +31 -30
- data/test/BioDSL/test_mummer.rb +1 -1
- data/test/BioDSL/test_pipeline.rb +1 -1
- data/test/BioDSL/test_seq.rb +42 -41
- data/test/BioDSL/test_serializer.rb +35 -33
- data/test/BioDSL/test_stream.rb +28 -27
- data/test/BioDSL/test_taxonomy.rb +38 -37
- data/test/BioDSL/test_test.rb +32 -31
- data/test/BioDSL/test_tmp_dir.rb +1 -1
- data/test/BioDSL/test_usearch.rb +28 -27
- data/test/BioDSL/test_verbose.rb +32 -31
- data/test/helper.rb +34 -31
- metadata +3 -2
data/lib/BioDSL/filesys.rb
CHANGED
@@ -1,35 +1,39 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk).
|
4
|
-
#
|
5
|
-
# This program is free software; you can redistribute it and/or
|
6
|
-
# modify it under the terms of the GNU General Public License
|
7
|
-
# as published by the Free Software Foundation; either version 2
|
8
|
-
# of the License, or (at your option) any later version.
|
9
|
-
#
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
-
# GNU General Public License for more details.
|
14
|
-
#
|
15
|
-
# You should have received a copy of the GNU General Public License
|
16
|
-
# along with this program; if not, write to the Free Software
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
|
+
|
28
|
+
# Namespace for BioDSL.
|
27
29
|
module BioDSL
|
28
30
|
# Error class for all exceptions to do with Filesys.
|
29
31
|
class FilesysError < StandardError; end
|
30
32
|
|
33
|
+
# Class for handling filesystem manipulations.
|
31
34
|
class Filesys
|
32
35
|
require 'open3'
|
36
|
+
require 'English'
|
33
37
|
|
34
38
|
include Enumerable
|
35
39
|
|
@@ -40,10 +44,10 @@ module BioDSL
|
|
40
44
|
exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
|
41
45
|
|
42
46
|
ENV['PATH'].split(File::PATH_SEPARATOR).each do |path|
|
43
|
-
exts.each
|
47
|
+
exts.each do |ext|
|
44
48
|
exe = File.join(path, "#{cmd}#{ext}")
|
45
49
|
return exe if File.executable?(exe) && !File.directory?(exe)
|
46
|
-
|
50
|
+
end
|
47
51
|
end
|
48
52
|
|
49
53
|
nil
|
@@ -51,14 +55,15 @@ module BioDSL
|
|
51
55
|
|
52
56
|
# Class method that returns a path to a unique temporary file.
|
53
57
|
# If no directory is specified reverts to the systems tmp directory.
|
54
|
-
def self.tmpfile(tmp_dir = ENV[
|
58
|
+
def self.tmpfile(tmp_dir = ENV['TMPDIR'])
|
55
59
|
time = Time.now.to_i
|
56
|
-
user = ENV[
|
57
|
-
pid =
|
58
|
-
path = tmp_dir + [user, time + pid, pid].join(
|
60
|
+
user = ENV['USER']
|
61
|
+
pid = $PID
|
62
|
+
path = tmp_dir + [user, time + pid, pid].join('_') + '.tmp'
|
59
63
|
path
|
60
64
|
end
|
61
65
|
|
66
|
+
# Open a file which may be compressed with gzip or bzip2.
|
62
67
|
def self.open(*args)
|
63
68
|
file = args.shift
|
64
69
|
mode = args.shift
|
@@ -67,32 +72,37 @@ module BioDSL
|
|
67
72
|
if mode == 'w'
|
68
73
|
case options[:compress]
|
69
74
|
when :gzip
|
70
|
-
ios, = Open3.pipeline_w(
|
75
|
+
ios, = Open3.pipeline_w('gzip -f', out: file)
|
71
76
|
when :bzip, :bzip2
|
72
|
-
ios, = Open3.pipeline_w(
|
73
|
-
else
|
74
|
-
ios = File.open(file, mode, options)
|
75
|
-
end
|
76
|
-
else
|
77
|
-
type = (file.respond_to? :path) ? `file -Lk #{file.path}` : `file -Lk #{file}`
|
78
|
-
case type
|
79
|
-
when /gzip/
|
80
|
-
ios = IO.popen("gzip -cd #{file}")
|
81
|
-
when /bzip/
|
82
|
-
ios = IO.popen("bzcat #{file}")
|
77
|
+
ios, = Open3.pipeline_w('bzip2 -c', out: file)
|
83
78
|
else
|
84
79
|
ios = File.open(file, mode, options)
|
85
80
|
end
|
81
|
+
else
|
82
|
+
type = if file.respond_to? :path
|
83
|
+
`file -Lk #{file.path}`
|
84
|
+
else
|
85
|
+
`file -Lk #{file}`
|
86
|
+
end
|
87
|
+
|
88
|
+
ios = case type
|
89
|
+
when /gzip/
|
90
|
+
IO.popen("gzip -cd #{file}")
|
91
|
+
when /bzip/
|
92
|
+
IO.popen("bzcat #{file}")
|
93
|
+
else
|
94
|
+
File.open(file, mode, options)
|
95
|
+
end
|
86
96
|
end
|
87
97
|
|
88
98
|
if block_given?
|
89
99
|
begin
|
90
|
-
yield
|
100
|
+
yield new(ios)
|
91
101
|
ensure
|
92
102
|
ios.close
|
93
103
|
end
|
94
104
|
else
|
95
|
-
return
|
105
|
+
return new(ios)
|
96
106
|
end
|
97
107
|
end
|
98
108
|
|
@@ -134,4 +144,3 @@ module BioDSL
|
|
134
144
|
end
|
135
145
|
end
|
136
146
|
end
|
137
|
-
|
data/lib/BioDSL/fork.rb
CHANGED
@@ -21,7 +21,7 @@
|
|
21
21
|
# #
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
23
|
# #
|
24
|
-
# This software is part of BioDSL (
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
25
|
# #
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
27
|
|
data/lib/BioDSL/hamming.rb
CHANGED
@@ -21,7 +21,7 @@
|
|
21
21
|
# #
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
23
|
# #
|
24
|
-
# This software is part of BioDSL (
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
25
|
# #
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
27
|
|
data/lib/BioDSL/helpers.rb
CHANGED
@@ -21,7 +21,7 @@
|
|
21
21
|
# #
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
23
|
# #
|
24
|
-
# This software is part of BioDSL (
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
25
|
# #
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
27
|
|
@@ -21,7 +21,7 @@
|
|
21
21
|
# #
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
23
|
# #
|
24
|
-
# This software is part of BioDSL (
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
25
|
# #
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
27
|
|
@@ -21,7 +21,7 @@
|
|
21
21
|
# #
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
23
|
# #
|
24
|
-
# This software is part of BioDSL (
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
25
|
# #
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
27
|
|
@@ -21,7 +21,7 @@
|
|
21
21
|
# #
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
23
|
# #
|
24
|
-
# This software is part of BioDSL (
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
25
|
# #
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
27
|
|
@@ -21,7 +21,7 @@
|
|
21
21
|
# #
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
23
|
# #
|
24
|
-
# This software is part of BioDSL (
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
25
|
# #
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
27
|
|
@@ -21,7 +21,7 @@
|
|
21
21
|
# #
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
23
|
# #
|
24
|
-
# This software is part of BioDSL (
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
25
|
# #
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
27
|
module BioDSL
|
@@ -21,7 +21,7 @@
|
|
21
21
|
# #
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
23
|
# #
|
24
|
-
# This software is part of BioDSL (
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
25
|
# #
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
27
|
|
data/lib/BioDSL/html_report.rb
CHANGED
@@ -21,7 +21,7 @@
|
|
21
21
|
# #
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
23
|
# #
|
24
|
-
# This software is part of BioDSL (
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
25
|
# #
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
27
|
|
data/lib/BioDSL/math.rb
CHANGED
@@ -21,7 +21,7 @@
|
|
21
21
|
# #
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
23
|
# #
|
24
|
-
# This software is part of BioDSL (
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
25
25
|
# #
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
27
|
|
data/lib/BioDSL/mummer.rb
CHANGED
@@ -20,7 +20,7 @@
|
|
20
20
|
# #
|
21
21
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
22
22
|
# #
|
23
|
-
# This software is part of BioDSL (
|
23
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
24
24
|
# #
|
25
25
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
26
26
|
|
data/lib/BioDSL/pipeline.rb
CHANGED
@@ -20,7 +20,7 @@
|
|
20
20
|
# #
|
21
21
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
22
22
|
# #
|
23
|
-
# This software is part of BioDSL (
|
23
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
24
24
|
# #
|
25
25
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
26
26
|
module BioDSL
|
data/lib/BioDSL/seq.rb
CHANGED
@@ -1,30 +1,33 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk).
|
4
|
-
#
|
5
|
-
# This program is free software; you can redistribute it and/or
|
6
|
-
# modify it under the terms of the GNU General Public License
|
7
|
-
# as published by the Free Software Foundation; either version 2
|
8
|
-
# of the License, or (at your option) any later version.
|
9
|
-
#
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
-
# GNU General Public License for more details.
|
14
|
-
#
|
15
|
-
# You should have received a copy of the GNU General Public License
|
16
|
-
# along with this program; if not, write to the Free Software
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
2
|
+
# #
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
4
|
+
# #
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
8
|
+
# of the License, or (at your option) any later version. #
|
9
|
+
# #
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
13
|
+
# GNU General Public License for more details. #
|
14
|
+
# #
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
16
|
+
# along with this program; if not, write to the Free Software #
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
18
|
+
# USA. #
|
19
|
+
# #
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
21
|
+
# #
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
23
|
+
# #
|
24
|
+
# This software is part BioDSL (www.BioDSL.org). #
|
25
|
+
# #
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
27
|
+
|
28
|
+
# Namespace for BioDSL.
|
27
29
|
module BioDSL
|
30
|
+
require 'English'
|
28
31
|
require 'narray'
|
29
32
|
require 'BioDSL/seq/ambiguity'
|
30
33
|
require 'BioDSL/seq/assemble'
|
@@ -40,12 +43,15 @@ module BioDSL
|
|
40
43
|
# Error class for all exceptions to do with Seq.
|
41
44
|
class SeqError < StandardError; end
|
42
45
|
|
46
|
+
# rubocop: disable ClassLength
|
47
|
+
|
48
|
+
# Class for manipulating sequences.
|
43
49
|
class Seq
|
44
50
|
# Residue alphabets
|
45
|
-
DNA = %w
|
46
|
-
RNA = %w
|
47
|
-
PROTEIN = %w
|
48
|
-
INDELS = %w
|
51
|
+
DNA = %w(a t c g)
|
52
|
+
RNA = %w(a u c g)
|
53
|
+
PROTEIN = %w(f l s y c w p h q r i m t n k v a d e g)
|
54
|
+
INDELS = %w(. - _ ~)
|
49
55
|
|
50
56
|
# Quality scores bases
|
51
57
|
SCORE_BASE = 33
|
@@ -69,30 +75,29 @@ module BioDSL
|
|
69
75
|
type = record[:SEQ_TYPE].to_sym if record[:SEQ_TYPE]
|
70
76
|
qual = record[:SCORES]
|
71
77
|
|
72
|
-
|
78
|
+
new(seq_name: seq_name, seq: seq, type: type, qual: qual)
|
73
79
|
end
|
74
80
|
|
75
|
-
# Class method that generates all possible oligos of a specifed length and
|
81
|
+
# Class method that generates all possible oligos of a specified length and
|
82
|
+
# type.
|
76
83
|
def self.generate_oligos(length, type)
|
77
|
-
|
84
|
+
fail SeqError, "Bad length: #{length}" if length <= 0
|
78
85
|
|
79
86
|
case type.downcase
|
80
87
|
when :dna then alph = DNA
|
81
88
|
when :rna then alph = RNA
|
82
89
|
when :protein then alph = PROTEIN
|
83
90
|
else
|
84
|
-
|
91
|
+
fail SeqError, "Unknown sequence type: #{type}"
|
85
92
|
end
|
86
93
|
|
87
|
-
oligos = [
|
94
|
+
oligos = ['']
|
88
95
|
|
89
|
-
(1
|
96
|
+
(1..length).each do
|
90
97
|
list = []
|
91
98
|
|
92
99
|
oligos.each do |oligo|
|
93
|
-
alph.each
|
94
|
-
list << oligo + char
|
95
|
-
end
|
100
|
+
alph.each { |char| list << oligo + char }
|
96
101
|
end
|
97
102
|
|
98
103
|
oligos = list
|
@@ -103,24 +108,22 @@ module BioDSL
|
|
103
108
|
|
104
109
|
def self.check_name_pair(entry1, entry2)
|
105
110
|
if entry1.seq_name =~ /^([^ ]+) \d:/
|
106
|
-
name1 =
|
107
|
-
elsif entry1.seq_name =~
|
108
|
-
name1 =
|
111
|
+
name1 = Regexp.last_match[1]
|
112
|
+
elsif entry1.seq_name =~ %r{^(.+)\/\d$}
|
113
|
+
name1 = Regexp.last_match[1]
|
109
114
|
else
|
110
|
-
|
115
|
+
fail SeqError, "Could not match sequence name: #{entry1.seq_name}"
|
111
116
|
end
|
112
117
|
|
113
118
|
if entry2.seq_name =~ /^([^ ]+) \d:/
|
114
|
-
name2 =
|
115
|
-
elsif entry2.seq_name =~
|
116
|
-
name2 =
|
119
|
+
name2 = Regexp.last_match[1]
|
120
|
+
elsif entry2.seq_name =~ %r{^(.+)\/\d$}
|
121
|
+
name2 = Regexp.last_match[1]
|
117
122
|
else
|
118
|
-
|
123
|
+
fail SeqError, "Could not match sequence name: #{entry2.seq_name}"
|
119
124
|
end
|
120
125
|
|
121
|
-
if name1 != name2
|
122
|
-
raise SeqError, "Name mismatch: #{name1} != #{name2}"
|
123
|
-
end
|
126
|
+
fail SeqError, "Name mismatch: #{name1} != #{name2}" if name1 != name2
|
124
127
|
end
|
125
128
|
|
126
129
|
# Initialize a sequence object with the following options:
|
@@ -134,18 +137,19 @@ module BioDSL
|
|
134
137
|
@type = options[:type]
|
135
138
|
@qual = options[:qual]
|
136
139
|
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
140
|
+
return unless @seq && @qual
|
141
|
+
return if @seq.length == @qual.length
|
142
|
+
|
143
|
+
fail SeqError, 'Sequence length and score length mismatch: ' \
|
144
|
+
"#{@seq.length} != #{@qual.length}"
|
141
145
|
end
|
142
146
|
|
143
147
|
# Method that guesses and returns the sequence type
|
144
148
|
# by inspecting the first 100 residues.
|
145
149
|
def type_guess
|
146
|
-
|
150
|
+
fail SeqError, 'Guess failed: sequence is nil' if @seq.nil?
|
147
151
|
|
148
|
-
case
|
152
|
+
case @seq[0...100].downcase
|
149
153
|
when /[flpqie]/ then return :protein
|
150
154
|
when /[u]/ then return :rna
|
151
155
|
else return :dna
|
@@ -155,31 +159,31 @@ module BioDSL
|
|
155
159
|
# Method that guesses and sets the sequence type
|
156
160
|
# by inspecting the first 100 residues.
|
157
161
|
def type_guess!
|
158
|
-
|
162
|
+
@type = type_guess
|
159
163
|
self
|
160
164
|
end
|
161
165
|
|
162
166
|
# Returns the length of a sequence.
|
163
167
|
def length
|
164
|
-
|
168
|
+
@seq.nil? ? 0 : @seq.length
|
165
169
|
end
|
166
170
|
|
167
|
-
|
171
|
+
alias_method :len, :length
|
168
172
|
|
169
173
|
# Return the number of indels in a sequence.
|
170
174
|
def indels
|
171
175
|
regex = Regexp.new(/[#{Regexp.escape(INDELS.join(""))}]/)
|
172
|
-
|
176
|
+
@seq.scan(regex).size
|
173
177
|
end
|
174
178
|
|
175
179
|
# Method to remove indels from seq, and from qual if qual is set.
|
176
180
|
def indels_remove
|
177
|
-
if
|
178
|
-
|
181
|
+
if @qual.nil?
|
182
|
+
@seq.delete!(Regexp.escape(INDELS.join('')))
|
179
183
|
else
|
180
|
-
na_seq = NArray.to_na(
|
181
|
-
na_qual = NArray.to_na(
|
182
|
-
mask = NArray.byte(
|
184
|
+
na_seq = NArray.to_na(@seq, 'byte')
|
185
|
+
na_qual = NArray.to_na(@qual, 'byte')
|
186
|
+
mask = NArray.byte(length)
|
183
187
|
|
184
188
|
INDELS.each do |c|
|
185
189
|
mask += na_seq.eq(c.ord)
|
@@ -187,113 +191,113 @@ module BioDSL
|
|
187
191
|
|
188
192
|
mask = mask.eq(0)
|
189
193
|
|
190
|
-
|
191
|
-
|
194
|
+
@seq = na_seq[mask].to_s
|
195
|
+
@qual = na_qual[mask].to_s
|
192
196
|
end
|
193
197
|
|
194
198
|
self
|
195
199
|
end
|
196
200
|
|
197
201
|
# Method that returns true if the sequence type is DNA.
|
198
|
-
def
|
199
|
-
|
202
|
+
def dna?
|
203
|
+
@type == :dna
|
200
204
|
end
|
201
205
|
|
202
206
|
# Method that returns true if the sequence type is RNA.
|
203
|
-
def
|
204
|
-
|
207
|
+
def rna?
|
208
|
+
@type == :rna
|
205
209
|
end
|
206
210
|
|
207
211
|
# Method that returns true if the sequence type is protein.
|
208
|
-
def
|
209
|
-
|
212
|
+
def protein?
|
213
|
+
@type == :protein
|
210
214
|
end
|
211
215
|
|
212
216
|
# Method to transcribe DNA to RNA.
|
213
217
|
def to_rna
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
+
fail SeqError, 'Cannot transcribe 0 length sequence' if length == 0
|
219
|
+
fail SeqError, 'Cannot transcribe sequence type: #{@type}' unless dna?
|
220
|
+
@type = :rna
|
221
|
+
@seq.tr!('Tt', 'Uu')
|
218
222
|
end
|
219
223
|
|
220
224
|
# Method to reverse-transcribe RNA to DNA.
|
221
225
|
def to_dna
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
+
fail SeqError, 'Cant reverse-transcribe 0 length sequence' if length == 0
|
227
|
+
fail SeqError, "Cant reverse-transcribe seq type: #{@type}" unless rna?
|
228
|
+
@type = :dna
|
229
|
+
@seq.tr!('Uu', 'Tt')
|
226
230
|
end
|
227
231
|
|
228
232
|
# Method that given a Seq entry returns a BioDSL record (a hash).
|
229
233
|
def to_bp
|
230
234
|
record = {}
|
231
|
-
record[:SEQ_NAME] =
|
232
|
-
record[:SEQ] =
|
233
|
-
record[:SEQ_LEN] =
|
234
|
-
record[:SCORES] =
|
235
|
+
record[:SEQ_NAME] = @seq_name if @seq_name
|
236
|
+
record[:SEQ] = @seq if @seq
|
237
|
+
record[:SEQ_LEN] = length if @seq
|
238
|
+
record[:SCORES] = @qual if @qual
|
235
239
|
record
|
236
240
|
end
|
237
241
|
|
238
242
|
# Method that given a Seq entry returns a FASTA entry (a string).
|
239
243
|
def to_fasta(wrap = nil)
|
240
|
-
|
241
|
-
|
244
|
+
fail SeqError, 'Missing seq_name' if @seq_name.nil? || @seq_name == ''
|
245
|
+
fail SeqError, 'Missing seq' if @seq.nil? || @seq.empty?
|
242
246
|
|
243
|
-
seq_name =
|
244
|
-
seq =
|
247
|
+
seq_name = @seq_name.to_s
|
248
|
+
seq = @seq.to_s
|
245
249
|
|
246
250
|
unless wrap.nil?
|
247
251
|
seq.gsub!(/(.{#{wrap}})/) do |match|
|
248
|
-
match <<
|
252
|
+
match << $INPUT_RECORD_SEPARATOR
|
249
253
|
end
|
250
254
|
|
251
255
|
seq.chomp!
|
252
256
|
end
|
253
257
|
|
254
|
-
">#{seq_name}#{
|
258
|
+
">#{seq_name}#{$INPUT_RECORD_SEPARATOR}#{seq}#{$INPUT_RECORD_SEPARATOR}"
|
255
259
|
end
|
256
260
|
|
257
261
|
# Method that given a Seq entry returns a FASTQ entry (a string).
|
258
262
|
def to_fastq
|
259
|
-
|
260
|
-
|
261
|
-
|
263
|
+
fail SeqError, 'Missing seq_name' if @seq_name.nil?
|
264
|
+
fail SeqError, 'Missing seq' if @seq.nil?
|
265
|
+
fail SeqError, 'Missing qual' if @qual.nil?
|
262
266
|
|
263
|
-
seq_name =
|
264
|
-
seq =
|
265
|
-
qual =
|
267
|
+
seq_name = @seq_name.to_s
|
268
|
+
seq = @seq.to_s
|
269
|
+
qual = @qual.to_s
|
266
270
|
|
267
|
-
"@#{seq_name}#{
|
271
|
+
"@#{seq_name}#{$RS}#{seq}#{$RS}+#{$RS}#{qual}#{$RS}"
|
268
272
|
end
|
269
273
|
|
270
274
|
# Method that generates a unique key for a
|
271
275
|
# DNA sequence and returns this key as a Fixnum.
|
272
276
|
def to_key
|
273
277
|
key = 0
|
274
|
-
|
275
|
-
|
278
|
+
|
279
|
+
@seq.upcase.each_char do |char|
|
276
280
|
key <<= 2
|
277
|
-
|
281
|
+
|
278
282
|
case char
|
279
283
|
when 'A' then key |= 0
|
280
284
|
when 'C' then key |= 1
|
281
285
|
when 'G' then key |= 2
|
282
286
|
when 'T' then key |= 3
|
283
|
-
else
|
287
|
+
else fail SeqError, "Bad residue: #{char}"
|
284
288
|
end
|
285
289
|
end
|
286
|
-
|
290
|
+
|
287
291
|
key
|
288
292
|
end
|
289
293
|
|
290
294
|
# Method to reverse the sequence.
|
291
295
|
def reverse
|
292
296
|
entry = Seq.new(
|
293
|
-
seq_name:
|
294
|
-
seq:
|
295
|
-
type:
|
296
|
-
qual: (
|
297
|
+
seq_name: @seq_name,
|
298
|
+
seq: @seq.reverse,
|
299
|
+
type: @type,
|
300
|
+
qual: (@qual ? @qual.reverse : @qual)
|
297
301
|
)
|
298
302
|
|
299
303
|
entry
|
@@ -301,27 +305,25 @@ module BioDSL
|
|
301
305
|
|
302
306
|
# Method to reverse the sequence (and qual, if set) in place.
|
303
307
|
def reverse!
|
304
|
-
|
305
|
-
|
308
|
+
@seq.reverse!
|
309
|
+
@qual.reverse! if @qual
|
306
310
|
self
|
307
311
|
end
|
308
312
|
|
309
313
|
# Method that complements sequence including ambiguity codes.
|
310
314
|
def complement
|
311
|
-
|
315
|
+
fail SeqError, 'Cannot complement 0 length sequence' if length == 0
|
312
316
|
|
313
|
-
entry = Seq.new(
|
314
|
-
seq_name: self.seq_name,
|
315
|
-
type: self.type,
|
316
|
-
qual: self.qual
|
317
|
-
)
|
317
|
+
entry = Seq.new(seq_name: @seq_name, type: @type, qual: @qual)
|
318
318
|
|
319
|
-
if
|
320
|
-
entry.seq =
|
321
|
-
|
322
|
-
|
319
|
+
if dna?
|
320
|
+
entry.seq = @seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
|
321
|
+
'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn')
|
322
|
+
elsif rna?
|
323
|
+
entry.seq = @seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
|
324
|
+
'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn')
|
323
325
|
else
|
324
|
-
|
326
|
+
fail SeqError, "Cannot complement sequence type: #{@type}"
|
325
327
|
end
|
326
328
|
|
327
329
|
entry
|
@@ -329,14 +331,16 @@ module BioDSL
|
|
329
331
|
|
330
332
|
# Method that complements sequence including ambiguity codes.
|
331
333
|
def complement!
|
332
|
-
|
333
|
-
|
334
|
-
if
|
335
|
-
|
336
|
-
|
337
|
-
|
334
|
+
fail SeqError, 'Cannot complement 0 length sequence' if length == 0
|
335
|
+
|
336
|
+
if dna?
|
337
|
+
@seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
|
338
|
+
'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn')
|
339
|
+
elsif rna?
|
340
|
+
@seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
|
341
|
+
'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn')
|
338
342
|
else
|
339
|
-
|
343
|
+
fail SeqError, "Cannot complement sequence type: #{@type}"
|
340
344
|
end
|
341
345
|
|
342
346
|
self
|
@@ -346,68 +350,70 @@ module BioDSL
|
|
346
350
|
# two Sequence objects (case insensitive).
|
347
351
|
def hamming_distance(entry, options = {})
|
348
352
|
if options[:ambiguity]
|
349
|
-
BioDSL::Hamming.distance(
|
353
|
+
BioDSL::Hamming.distance(@seq, entry.seq, options)
|
350
354
|
else
|
351
|
-
BioDSL::Hamming.distance(
|
355
|
+
BioDSL::Hamming.distance(@seq.upcase, entry.seq.upcase, options)
|
352
356
|
end
|
353
357
|
end
|
354
358
|
|
355
359
|
# Method to determine the Edit Distance between
|
356
360
|
# two Sequence objects (case insensitive).
|
357
361
|
def edit_distance(entry)
|
358
|
-
Levenshtein.distance(
|
362
|
+
Levenshtein.distance(@seq, entry.seq)
|
359
363
|
end
|
360
364
|
|
361
365
|
# Method that generates a random sequence of a given length and type.
|
362
366
|
def generate(length, type)
|
363
|
-
|
367
|
+
fail SeqError, "Cannot generate seq length < 1: #{length}" if length <= 0
|
364
368
|
|
365
369
|
case type
|
366
370
|
when :dna then alph = DNA
|
367
371
|
when :rna then alph = RNA
|
368
372
|
when :protein then alph = PROTEIN
|
369
373
|
else
|
370
|
-
|
374
|
+
fail SeqError, "Unknown sequence type: #{type}"
|
371
375
|
end
|
372
376
|
|
373
|
-
seq_new
|
374
|
-
|
375
|
-
|
377
|
+
seq_new = Array.new(length) { alph[rand(alph.size)] }.join('')
|
378
|
+
@seq = seq_new
|
379
|
+
@type = type
|
380
|
+
|
376
381
|
seq_new
|
377
382
|
end
|
378
383
|
|
379
384
|
# Method to return a new Seq object with shuffled sequence.
|
380
385
|
def shuffle
|
381
386
|
Seq.new(
|
382
|
-
seq_name:
|
383
|
-
seq:
|
384
|
-
type:
|
385
|
-
qual:
|
387
|
+
seq_name: @seq_name,
|
388
|
+
seq: @seq.split('').shuffle!.join,
|
389
|
+
type: @type,
|
390
|
+
qual: @qual
|
386
391
|
)
|
387
392
|
end
|
388
393
|
|
389
394
|
# Method to shuffle a sequence randomly inline.
|
390
395
|
def shuffle!
|
391
|
-
|
396
|
+
@seq = @seq.split('').shuffle!.join
|
392
397
|
self
|
393
398
|
end
|
394
399
|
|
395
400
|
# Method to add two Seq objects.
|
396
|
-
def +(
|
397
|
-
new_entry = Seq.new
|
398
|
-
new_entry.seq =
|
399
|
-
new_entry.type =
|
400
|
-
new_entry.qual =
|
401
|
+
def +(other)
|
402
|
+
new_entry = Seq.new
|
403
|
+
new_entry.seq = @seq + other.seq
|
404
|
+
new_entry.type = @type if @type == other.type
|
405
|
+
new_entry.qual = @qual + other.qual if @qual && other.qual
|
401
406
|
new_entry
|
402
407
|
end
|
403
408
|
|
404
409
|
# Method to concatenate sequence entries.
|
405
410
|
def <<(entry)
|
406
|
-
|
407
|
-
|
411
|
+
fail SeqError, 'sequences of different types' unless @type == entry.type
|
412
|
+
fail SeqError, 'qual is missing in one entry' unless @qual.class ==
|
413
|
+
entry.qual.class
|
408
414
|
|
409
|
-
|
410
|
-
|
415
|
+
@seq << entry.seq
|
416
|
+
@qual << entry.qual unless entry.qual.nil?
|
411
417
|
|
412
418
|
self
|
413
419
|
end
|
@@ -415,18 +421,18 @@ module BioDSL
|
|
415
421
|
# Index method for Seq objects.
|
416
422
|
def [](*args)
|
417
423
|
entry = Seq.new
|
418
|
-
entry.seq_name =
|
419
|
-
entry.seq =
|
420
|
-
entry.type =
|
421
|
-
entry.qual =
|
424
|
+
entry.seq_name = @seq_name.dup unless @seq_name.nil?
|
425
|
+
entry.seq = @seq[*args] || ''
|
426
|
+
entry.type = @type
|
427
|
+
entry.qual = @qual[*args] || '' unless @qual.nil?
|
422
428
|
|
423
429
|
entry
|
424
430
|
end
|
425
431
|
|
426
432
|
# Index assignment method for Seq objects.
|
427
433
|
def []=(*args, entry)
|
428
|
-
|
429
|
-
|
434
|
+
@seq[*args] = entry.seq[*args]
|
435
|
+
@qual[*args] = entry.qual[*args] unless @qual.nil?
|
430
436
|
|
431
437
|
self
|
432
438
|
end
|
@@ -437,7 +443,7 @@ module BioDSL
|
|
437
443
|
def composition
|
438
444
|
comp = Hash.new(0);
|
439
445
|
|
440
|
-
|
446
|
+
@seq.upcase.each_char do |char|
|
441
447
|
comp[char] += 1
|
442
448
|
end
|
443
449
|
|
@@ -447,30 +453,33 @@ module BioDSL
|
|
447
453
|
# Method that returns the percentage of hard masked residues
|
448
454
|
# or N's in a sequence.
|
449
455
|
def hard_mask
|
450
|
-
((
|
456
|
+
((@seq.upcase.scan('N').size.to_f / (length - indels).to_f) * 100).
|
457
|
+
round(2)
|
451
458
|
end
|
452
459
|
|
453
460
|
# Method that returns the percentage of soft masked residues
|
454
461
|
# or lower cased residues in a sequence.
|
455
462
|
def soft_mask
|
456
|
-
((
|
463
|
+
((@seq.scan(/[a-z]/).size.to_f / (length - indels).to_f) * 100).round(2)
|
457
464
|
end
|
458
465
|
|
459
|
-
# Hard masks sequence residues where the corresponding quality
|
460
|
-
#
|
466
|
+
# Hard masks sequence residues where the corresponding quality scoreis below
|
467
|
+
# a given cutoff.
|
461
468
|
def mask_seq_hard!(cutoff)
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
469
|
+
fail SeqError, 'seq is nil' if @seq.nil?
|
470
|
+
fail SeqError, 'qual is nil' if @qual.nil?
|
471
|
+
fail SeqError, "cufoff value: #{cutoff} out of range: " \
|
472
|
+
"#{SCORE_MIN}..#{SCORE_MAX}" unless (SCORE_MIN..SCORE_MAX).
|
473
|
+
include? cutoff
|
474
|
+
|
475
|
+
na_seq = NArray.to_na(@seq.upcase, 'byte')
|
476
|
+
na_qual = NArray.to_na(@qual, 'byte')
|
468
477
|
mask = (na_qual - SCORE_BASE) < cutoff
|
469
|
-
mask
|
478
|
+
mask *= na_seq.ne('-'.ord)
|
470
479
|
|
471
480
|
na_seq[mask] = 'N'.ord
|
472
481
|
|
473
|
-
|
482
|
+
@seq = na_seq.to_s
|
474
483
|
|
475
484
|
self
|
476
485
|
end
|
@@ -479,18 +488,20 @@ module BioDSL
|
|
479
488
|
# is below a given cutoff. Masked sequence will be lowercased and
|
480
489
|
# remaining will be uppercased.
|
481
490
|
def mask_seq_soft!(cutoff)
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
491
|
+
fail SeqError, 'seq is nil' if @seq.nil?
|
492
|
+
fail SeqError, 'qual is nil' if @qual.nil?
|
493
|
+
fail SeqError, "cufoff value: #{cutoff} out of range: " \
|
494
|
+
"#{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN..SCORE_MAX).
|
495
|
+
include? cutoff
|
496
|
+
|
497
|
+
na_seq = NArray.to_na(@seq.upcase, 'byte')
|
498
|
+
na_qual = NArray.to_na(@qual, 'byte')
|
488
499
|
mask = (na_qual - SCORE_BASE) < cutoff
|
489
|
-
mask
|
500
|
+
mask *= na_seq.ne('-'.ord)
|
490
501
|
|
491
502
|
na_seq[mask] ^= ' '.ord
|
492
503
|
|
493
|
-
|
504
|
+
@seq = na_seq.to_s
|
494
505
|
|
495
506
|
self
|
496
507
|
end
|
@@ -498,22 +509,22 @@ module BioDSL
|
|
498
509
|
# Method that determines if a quality score string can be
|
499
510
|
# absolutely identified as base 33.
|
500
511
|
def qual_base33?
|
501
|
-
|
512
|
+
@qual.match(/[!-:]/) ? true : false
|
502
513
|
end
|
503
|
-
|
514
|
+
|
504
515
|
# Method that determines if a quality score string may be base 64.
|
505
516
|
def qual_base64?
|
506
|
-
|
517
|
+
@qual.match(/[K-h]/) ? true : false
|
507
518
|
end
|
508
519
|
|
509
520
|
# Method to determine if a quality score is valid accepting only 0-40 range.
|
510
521
|
def qual_valid?(encoding)
|
511
|
-
|
522
|
+
fail SeqError, 'Missing qual' if @qual.nil?
|
512
523
|
|
513
524
|
case encoding
|
514
|
-
when :base_33 then return true if
|
515
|
-
when :base_64 then return true if
|
516
|
-
else
|
525
|
+
when :base_33 then return true if @qual.match(/^[!-I]*$/)
|
526
|
+
when :base_64 then return true if @qual.match(/^[@-h]*$/)
|
527
|
+
else fail SeqError, "unknown quality score encoding: #{encoding}"
|
517
528
|
end
|
518
529
|
|
519
530
|
false
|
@@ -521,28 +532,34 @@ module BioDSL
|
|
521
532
|
|
522
533
|
# Method to coerce quality scores to be within the 0-40 range.
|
523
534
|
def qual_coerce!(encoding)
|
524
|
-
|
535
|
+
fail SeqError, 'Missing qual' if @qual.nil?
|
525
536
|
|
526
537
|
case encoding
|
527
|
-
when :base_33 then qual_coerce_C(
|
528
|
-
when :base_64 then qual_coerce_C(
|
538
|
+
when :base_33 then qual_coerce_C(@qual, @qual.length, 33, 73) # !-J
|
539
|
+
when :base_64 then qual_coerce_C(@qual, @qual.length, 64, 104) # @-h
|
529
540
|
else
|
530
|
-
|
531
|
-
end
|
541
|
+
fail SeqError, "unknown quality score encoding: #{encoding}"
|
542
|
+
end
|
532
543
|
|
533
544
|
self
|
534
545
|
end
|
535
546
|
|
536
547
|
# Method to convert quality scores.
|
537
548
|
def qual_convert!(from, to)
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
549
|
+
unless from == :base_33 || from == :base_64
|
550
|
+
fail SeqError, "unknown quality score encoding: #{from}"
|
551
|
+
end
|
552
|
+
|
553
|
+
unless to == :base_33 || to == :base_64
|
554
|
+
fail SeqError, "unknown quality score encoding: #{to}"
|
555
|
+
end
|
556
|
+
|
557
|
+
if from == :base_33 && to == :base_64
|
558
|
+
qual_convert_C(@qual, @qual.length, 31) # += 64 - 33
|
559
|
+
elsif from == :base_64 && to == :base_33
|
560
|
+
# Handle negative Solexa values from -5 to -1 (set these to 0).
|
561
|
+
qual_coerce_C(@qual, @qual.length, 64, 104)
|
562
|
+
qual_convert_C(@qual, @qual.length, -31) # -= 64 - 33
|
546
563
|
end
|
547
564
|
|
548
565
|
self
|
@@ -550,9 +567,9 @@ module BioDSL
|
|
550
567
|
|
551
568
|
# Method to calculate and return the mean quality score.
|
552
569
|
def scores_mean
|
553
|
-
|
570
|
+
fail SeqError, 'Missing qual in entry' if @qual.nil?
|
554
571
|
|
555
|
-
na_qual = NArray.to_na(
|
572
|
+
na_qual = NArray.to_na(@qual, 'byte')
|
556
573
|
na_qual -= SCORE_BASE
|
557
574
|
|
558
575
|
na_qual.mean
|
@@ -560,9 +577,9 @@ module BioDSL
|
|
560
577
|
|
561
578
|
# Method to calculate and return the min quality score.
|
562
579
|
def scores_min
|
563
|
-
|
580
|
+
fail SeqError, 'Missing qual in entry' if @qual.nil?
|
564
581
|
|
565
|
-
na_qual = NArray.to_na(
|
582
|
+
na_qual = NArray.to_na(@qual, 'byte')
|
566
583
|
na_qual -= SCORE_BASE
|
567
584
|
|
568
585
|
na_qual.min
|
@@ -570,9 +587,9 @@ module BioDSL
|
|
570
587
|
|
571
588
|
# Method to calculate and return the max quality score.
|
572
589
|
def scores_max
|
573
|
-
|
590
|
+
fail SeqError, 'Missing qual in entry' if @qual.nil?
|
574
591
|
|
575
|
-
na_qual = NArray.to_na(
|
592
|
+
na_qual = NArray.to_na(@qual, 'byte')
|
576
593
|
na_qual -= SCORE_BASE
|
577
594
|
|
578
595
|
na_qual.max
|
@@ -582,17 +599,17 @@ module BioDSL
|
|
582
599
|
# scores string and calculate for each window the mean score and return
|
583
600
|
# the minimum mean score.
|
584
601
|
def scores_mean_local(window_size)
|
585
|
-
|
602
|
+
fail SeqError, 'Missing qual in entry' if @qual.nil?
|
586
603
|
|
587
|
-
scores_mean_local_C(
|
604
|
+
scores_mean_local_C(@qual, @qual.length, SCORE_BASE, window_size)
|
588
605
|
end
|
589
606
|
|
590
607
|
# Method to find open reading frames (ORFs).
|
591
608
|
def each_orf(options = {})
|
592
|
-
size_min = options[:size_min]
|
593
|
-
size_max = options[:size_max]
|
594
|
-
start_codons = options[:start_codons] ||
|
595
|
-
stop_codons = options[:stop_codons]
|
609
|
+
size_min = options[:size_min] || 0
|
610
|
+
size_max = options[:size_max] || length
|
611
|
+
start_codons = options[:start_codons] || 'ATG,GTG,AUG,GUG'
|
612
|
+
stop_codons = options[:stop_codons] || 'TAA,TGA,TAG,UAA,UGA,UAG'
|
596
613
|
pick_longest = options[:pick_longest]
|
597
614
|
|
598
615
|
orfs = []
|
@@ -601,22 +618,23 @@ module BioDSL
|
|
601
618
|
regex_start = Regexp.new(start_codons.split(',').join('|'), true)
|
602
619
|
regex_stop = Regexp.new(stop_codons.split(',').join('|'), true)
|
603
620
|
|
604
|
-
while pos_beg
|
605
|
-
|
606
|
-
|
607
|
-
|
621
|
+
while pos_beg && pos_beg < length - size_min
|
622
|
+
pos_beg = @seq.index(regex_start, pos_beg)
|
623
|
+
next unless pos_beg
|
624
|
+
pos_end = @seq.index(regex_stop, pos_beg)
|
625
|
+
next unless pos_end
|
608
626
|
|
609
|
-
|
610
|
-
if size_min <= length and length <= size_max
|
611
|
-
subseq = self[pos_beg ... pos_beg + length]
|
627
|
+
orf_length = (pos_end - pos_beg) + 3
|
612
628
|
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
end
|
629
|
+
if (orf_length % 3) == 0
|
630
|
+
if size_min <= orf_length && orf_length <= size_max
|
631
|
+
subseq = self[pos_beg...pos_beg + orf_length]
|
617
632
|
|
618
|
-
|
633
|
+
orfs << Orf.new(subseq, pos_beg, pos_end + 2)
|
634
|
+
end
|
619
635
|
end
|
636
|
+
|
637
|
+
pos_beg += 1
|
620
638
|
end
|
621
639
|
|
622
640
|
if pick_longest
|
@@ -634,17 +652,8 @@ module BioDSL
|
|
634
652
|
end
|
635
653
|
end
|
636
654
|
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
def initialize(entry, start, stop)
|
641
|
-
@entry = entry
|
642
|
-
@start = start
|
643
|
-
@stop = stop
|
644
|
-
end
|
645
|
-
end
|
646
|
-
|
647
|
-
private
|
655
|
+
# Struct for holding an ORF.
|
656
|
+
Orf = Struct.new(:entry, :start, :stop)
|
648
657
|
|
649
658
|
inline do |builder|
|
650
659
|
builder.c %{
|