BioDSL 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/BioDSL.gemspec +1 -1
- data/Gemfile +6 -0
- data/README.md +289 -155
- data/Rakefile +18 -16
- data/lib/BioDSL.rb +1 -1
- data/lib/BioDSL/cary.rb +78 -53
- data/lib/BioDSL/command.rb +2 -2
- data/lib/BioDSL/commands.rb +1 -1
- data/lib/BioDSL/commands/add_key.rb +1 -1
- data/lib/BioDSL/commands/align_seq_mothur.rb +4 -4
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +5 -5
- data/lib/BioDSL/commands/assemble_pairs.rb +13 -13
- data/lib/BioDSL/commands/assemble_seq_idba.rb +7 -9
- data/lib/BioDSL/commands/assemble_seq_ray.rb +13 -13
- data/lib/BioDSL/commands/assemble_seq_spades.rb +4 -4
- data/lib/BioDSL/commands/classify_seq.rb +8 -8
- data/lib/BioDSL/commands/classify_seq_mothur.rb +5 -5
- data/lib/BioDSL/commands/clip_primer.rb +7 -7
- data/lib/BioDSL/commands/cluster_otus.rb +5 -5
- data/lib/BioDSL/commands/collapse_otus.rb +2 -2
- data/lib/BioDSL/commands/collect_otus.rb +2 -2
- data/lib/BioDSL/commands/complement_seq.rb +4 -4
- data/lib/BioDSL/commands/count.rb +1 -1
- data/lib/BioDSL/commands/count_values.rb +2 -2
- data/lib/BioDSL/commands/degap_seq.rb +6 -7
- data/lib/BioDSL/commands/dereplicate_seq.rb +1 -1
- data/lib/BioDSL/commands/dump.rb +2 -2
- data/lib/BioDSL/commands/filter_rrna.rb +4 -4
- data/lib/BioDSL/commands/genecall.rb +7 -7
- data/lib/BioDSL/commands/grab.rb +1 -1
- data/lib/BioDSL/commands/index_taxonomy.rb +3 -3
- data/lib/BioDSL/commands/mask_seq.rb +4 -4
- data/lib/BioDSL/commands/mean_scores.rb +2 -2
- data/lib/BioDSL/commands/merge_pair_seq.rb +3 -3
- data/lib/BioDSL/commands/merge_table.rb +1 -1
- data/lib/BioDSL/commands/merge_values.rb +1 -1
- data/lib/BioDSL/commands/plot_heatmap.rb +4 -5
- data/lib/BioDSL/commands/plot_histogram.rb +4 -4
- data/lib/BioDSL/commands/plot_matches.rb +5 -5
- data/lib/BioDSL/commands/plot_residue_distribution.rb +6 -6
- data/lib/BioDSL/commands/plot_scores.rb +7 -7
- data/lib/BioDSL/commands/random.rb +1 -1
- data/lib/BioDSL/commands/read_fasta.rb +9 -9
- data/lib/BioDSL/commands/read_fastq.rb +16 -16
- data/lib/BioDSL/commands/read_table.rb +2 -3
- data/lib/BioDSL/commands/reverse_seq.rb +4 -4
- data/lib/BioDSL/commands/slice_align.rb +4 -4
- data/lib/BioDSL/commands/slice_seq.rb +3 -3
- data/lib/BioDSL/commands/sort.rb +1 -1
- data/lib/BioDSL/commands/split_pair_seq.rb +6 -7
- data/lib/BioDSL/commands/split_values.rb +2 -2
- data/lib/BioDSL/commands/trim_primer.rb +13 -8
- data/lib/BioDSL/commands/trim_seq.rb +5 -5
- data/lib/BioDSL/commands/uchime_ref.rb +6 -6
- data/lib/BioDSL/commands/uclust.rb +5 -5
- data/lib/BioDSL/commands/unique_values.rb +1 -1
- data/lib/BioDSL/commands/usearch_global.rb +2 -2
- data/lib/BioDSL/commands/usearch_local.rb +2 -2
- data/lib/BioDSL/commands/write_fasta.rb +7 -9
- data/lib/BioDSL/commands/write_fastq.rb +4 -4
- data/lib/BioDSL/commands/write_table.rb +3 -3
- data/lib/BioDSL/commands/write_tree.rb +2 -3
- data/lib/BioDSL/config.rb +2 -2
- data/lib/BioDSL/csv.rb +8 -10
- data/lib/BioDSL/debug.rb +1 -1
- data/lib/BioDSL/fasta.rb +54 -40
- data/lib/BioDSL/fastq.rb +35 -32
- data/lib/BioDSL/filesys.rb +56 -47
- data/lib/BioDSL/fork.rb +1 -1
- data/lib/BioDSL/hamming.rb +1 -1
- data/lib/BioDSL/helpers.rb +1 -1
- data/lib/BioDSL/helpers/aux_helper.rb +1 -1
- data/lib/BioDSL/helpers/email_helper.rb +1 -1
- data/lib/BioDSL/helpers/history_helper.rb +1 -1
- data/lib/BioDSL/helpers/log_helper.rb +1 -1
- data/lib/BioDSL/helpers/options_helper.rb +1 -1
- data/lib/BioDSL/helpers/status_helper.rb +1 -1
- data/lib/BioDSL/html_report.rb +1 -1
- data/lib/BioDSL/math.rb +1 -1
- data/lib/BioDSL/mummer.rb +1 -1
- data/lib/BioDSL/pipeline.rb +1 -1
- data/lib/BioDSL/seq.rb +240 -231
- data/lib/BioDSL/seq/ambiguity.rb +1 -1
- data/lib/BioDSL/seq/assemble.rb +1 -1
- data/lib/BioDSL/seq/backtrack.rb +93 -76
- data/lib/BioDSL/seq/digest.rb +1 -1
- data/lib/BioDSL/seq/dynamic.rb +43 -55
- data/lib/BioDSL/seq/homopolymer.rb +34 -36
- data/lib/BioDSL/seq/kmer.rb +67 -50
- data/lib/BioDSL/seq/levenshtein.rb +35 -40
- data/lib/BioDSL/seq/translate.rb +64 -55
- data/lib/BioDSL/seq/trim.rb +60 -50
- data/lib/BioDSL/serializer.rb +1 -1
- data/lib/BioDSL/stream.rb +1 -1
- data/lib/BioDSL/taxonomy.rb +1 -1
- data/lib/BioDSL/test.rb +1 -1
- data/lib/BioDSL/tmp_dir.rb +1 -1
- data/lib/BioDSL/usearch.rb +1 -1
- data/lib/BioDSL/verbose.rb +1 -1
- data/lib/BioDSL/version.rb +2 -2
- data/test/BioDSL/commands/test_add_key.rb +1 -1
- data/test/BioDSL/commands/test_align_seq_mothur.rb +1 -1
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +1 -1
- data/test/BioDSL/commands/test_assemble_pairs.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +1 -1
- data/test/BioDSL/commands/test_classify_seq.rb +1 -1
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +1 -1
- data/test/BioDSL/commands/test_clip_primer.rb +1 -1
- data/test/BioDSL/commands/test_cluster_otus.rb +1 -1
- data/test/BioDSL/commands/test_collapse_otus.rb +1 -1
- data/test/BioDSL/commands/test_collect_otus.rb +1 -1
- data/test/BioDSL/commands/test_complement_seq.rb +1 -1
- data/test/BioDSL/commands/test_count.rb +1 -1
- data/test/BioDSL/commands/test_count_values.rb +1 -1
- data/test/BioDSL/commands/test_degap_seq.rb +1 -1
- data/test/BioDSL/commands/test_dereplicate_seq.rb +1 -1
- data/test/BioDSL/commands/test_dump.rb +1 -1
- data/test/BioDSL/commands/test_filter_rrna.rb +1 -1
- data/test/BioDSL/commands/test_genecall.rb +1 -1
- data/test/BioDSL/commands/test_grab.rb +1 -1
- data/test/BioDSL/commands/test_index_taxonomy.rb +1 -1
- data/test/BioDSL/commands/test_mask_seq.rb +1 -1
- data/test/BioDSL/commands/test_mean_scores.rb +1 -1
- data/test/BioDSL/commands/test_merge_pair_seq.rb +1 -1
- data/test/BioDSL/commands/test_merge_table.rb +1 -1
- data/test/BioDSL/commands/test_merge_values.rb +1 -1
- data/test/BioDSL/commands/test_plot_heatmap.rb +1 -1
- data/test/BioDSL/commands/test_plot_histogram.rb +1 -1
- data/test/BioDSL/commands/test_plot_matches.rb +1 -1
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +1 -1
- data/test/BioDSL/commands/test_plot_scores.rb +1 -1
- data/test/BioDSL/commands/test_random.rb +1 -1
- data/test/BioDSL/commands/test_read_fasta.rb +1 -1
- data/test/BioDSL/commands/test_read_fastq.rb +1 -1
- data/test/BioDSL/commands/test_read_table.rb +1 -1
- data/test/BioDSL/commands/test_reverse_seq.rb +1 -1
- data/test/BioDSL/commands/test_slice_align.rb +1 -1
- data/test/BioDSL/commands/test_slice_seq.rb +1 -1
- data/test/BioDSL/commands/test_sort.rb +1 -1
- data/test/BioDSL/commands/test_split_pair_seq.rb +1 -1
- data/test/BioDSL/commands/test_split_values.rb +1 -1
- data/test/BioDSL/commands/test_trim_primer.rb +1 -1
- data/test/BioDSL/commands/test_trim_seq.rb +1 -1
- data/test/BioDSL/commands/test_uchime_ref.rb +1 -1
- data/test/BioDSL/commands/test_uclust.rb +1 -1
- data/test/BioDSL/commands/test_unique_values.rb +1 -1
- data/test/BioDSL/commands/test_usearch_global.rb +1 -1
- data/test/BioDSL/commands/test_usearch_local.rb +1 -1
- data/test/BioDSL/commands/test_write_fasta.rb +1 -1
- data/test/BioDSL/commands/test_write_fastq.rb +1 -1
- data/test/BioDSL/commands/test_write_table.rb +1 -1
- data/test/BioDSL/commands/test_write_tree.rb +1 -1
- data/test/BioDSL/helpers/test_options_helper.rb +3 -3
- data/test/BioDSL/seq/test_assemble.rb +58 -56
- data/test/BioDSL/seq/test_backtrack.rb +83 -81
- data/test/BioDSL/seq/test_digest.rb +47 -45
- data/test/BioDSL/seq/test_dynamic.rb +66 -64
- data/test/BioDSL/seq/test_homopolymer.rb +35 -33
- data/test/BioDSL/seq/test_kmer.rb +29 -28
- data/test/BioDSL/seq/test_translate.rb +44 -42
- data/test/BioDSL/seq/test_trim.rb +59 -57
- data/test/BioDSL/test_cary.rb +1 -1
- data/test/BioDSL/test_command.rb +2 -2
- data/test/BioDSL/test_csv.rb +34 -31
- data/test/BioDSL/test_debug.rb +31 -31
- data/test/BioDSL/test_fasta.rb +30 -29
- data/test/BioDSL/test_fastq.rb +27 -26
- data/test/BioDSL/test_filesys.rb +28 -27
- data/test/BioDSL/test_fork.rb +29 -28
- data/test/BioDSL/test_math.rb +31 -30
- data/test/BioDSL/test_mummer.rb +1 -1
- data/test/BioDSL/test_pipeline.rb +1 -1
- data/test/BioDSL/test_seq.rb +42 -41
- data/test/BioDSL/test_serializer.rb +35 -33
- data/test/BioDSL/test_stream.rb +28 -27
- data/test/BioDSL/test_taxonomy.rb +38 -37
- data/test/BioDSL/test_test.rb +32 -31
- data/test/BioDSL/test_tmp_dir.rb +1 -1
- data/test/BioDSL/test_usearch.rb +28 -27
- data/test/BioDSL/test_verbose.rb +32 -31
- data/test/helper.rb +34 -31
- metadata +3 -2
data/lib/BioDSL/filesys.rb
CHANGED
|
@@ -1,35 +1,39 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
3
|
-
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk).
|
|
4
|
-
#
|
|
5
|
-
# This program is free software; you can redistribute it and/or
|
|
6
|
-
# modify it under the terms of the GNU General Public License
|
|
7
|
-
# as published by the Free Software Foundation; either version 2
|
|
8
|
-
# of the License, or (at your option) any later version.
|
|
9
|
-
#
|
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13
|
-
# GNU General Public License for more details.
|
|
14
|
-
#
|
|
15
|
-
# You should have received a copy of the GNU General Public License
|
|
16
|
-
# along with this program; if not, write to the Free Software
|
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
#
|
|
25
|
-
#
|
|
26
|
-
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
# Namespace for BioDSL.
|
|
27
29
|
module BioDSL
|
|
28
30
|
# Error class for all exceptions to do with Filesys.
|
|
29
31
|
class FilesysError < StandardError; end
|
|
30
32
|
|
|
33
|
+
# Class for handling filesystem manipulations.
|
|
31
34
|
class Filesys
|
|
32
35
|
require 'open3'
|
|
36
|
+
require 'English'
|
|
33
37
|
|
|
34
38
|
include Enumerable
|
|
35
39
|
|
|
@@ -40,10 +44,10 @@ module BioDSL
|
|
|
40
44
|
exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
|
|
41
45
|
|
|
42
46
|
ENV['PATH'].split(File::PATH_SEPARATOR).each do |path|
|
|
43
|
-
exts.each
|
|
47
|
+
exts.each do |ext|
|
|
44
48
|
exe = File.join(path, "#{cmd}#{ext}")
|
|
45
49
|
return exe if File.executable?(exe) && !File.directory?(exe)
|
|
46
|
-
|
|
50
|
+
end
|
|
47
51
|
end
|
|
48
52
|
|
|
49
53
|
nil
|
|
@@ -51,14 +55,15 @@ module BioDSL
|
|
|
51
55
|
|
|
52
56
|
# Class method that returns a path to a unique temporary file.
|
|
53
57
|
# If no directory is specified reverts to the systems tmp directory.
|
|
54
|
-
def self.tmpfile(tmp_dir = ENV[
|
|
58
|
+
def self.tmpfile(tmp_dir = ENV['TMPDIR'])
|
|
55
59
|
time = Time.now.to_i
|
|
56
|
-
user = ENV[
|
|
57
|
-
pid =
|
|
58
|
-
path = tmp_dir + [user, time + pid, pid].join(
|
|
60
|
+
user = ENV['USER']
|
|
61
|
+
pid = $PID
|
|
62
|
+
path = tmp_dir + [user, time + pid, pid].join('_') + '.tmp'
|
|
59
63
|
path
|
|
60
64
|
end
|
|
61
65
|
|
|
66
|
+
# Open a file which may be compressed with gzip or bzip2.
|
|
62
67
|
def self.open(*args)
|
|
63
68
|
file = args.shift
|
|
64
69
|
mode = args.shift
|
|
@@ -67,32 +72,37 @@ module BioDSL
|
|
|
67
72
|
if mode == 'w'
|
|
68
73
|
case options[:compress]
|
|
69
74
|
when :gzip
|
|
70
|
-
ios, = Open3.pipeline_w(
|
|
75
|
+
ios, = Open3.pipeline_w('gzip -f', out: file)
|
|
71
76
|
when :bzip, :bzip2
|
|
72
|
-
ios, = Open3.pipeline_w(
|
|
73
|
-
else
|
|
74
|
-
ios = File.open(file, mode, options)
|
|
75
|
-
end
|
|
76
|
-
else
|
|
77
|
-
type = (file.respond_to? :path) ? `file -Lk #{file.path}` : `file -Lk #{file}`
|
|
78
|
-
case type
|
|
79
|
-
when /gzip/
|
|
80
|
-
ios = IO.popen("gzip -cd #{file}")
|
|
81
|
-
when /bzip/
|
|
82
|
-
ios = IO.popen("bzcat #{file}")
|
|
77
|
+
ios, = Open3.pipeline_w('bzip2 -c', out: file)
|
|
83
78
|
else
|
|
84
79
|
ios = File.open(file, mode, options)
|
|
85
80
|
end
|
|
81
|
+
else
|
|
82
|
+
type = if file.respond_to? :path
|
|
83
|
+
`file -Lk #{file.path}`
|
|
84
|
+
else
|
|
85
|
+
`file -Lk #{file}`
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
ios = case type
|
|
89
|
+
when /gzip/
|
|
90
|
+
IO.popen("gzip -cd #{file}")
|
|
91
|
+
when /bzip/
|
|
92
|
+
IO.popen("bzcat #{file}")
|
|
93
|
+
else
|
|
94
|
+
File.open(file, mode, options)
|
|
95
|
+
end
|
|
86
96
|
end
|
|
87
97
|
|
|
88
98
|
if block_given?
|
|
89
99
|
begin
|
|
90
|
-
yield
|
|
100
|
+
yield new(ios)
|
|
91
101
|
ensure
|
|
92
102
|
ios.close
|
|
93
103
|
end
|
|
94
104
|
else
|
|
95
|
-
return
|
|
105
|
+
return new(ios)
|
|
96
106
|
end
|
|
97
107
|
end
|
|
98
108
|
|
|
@@ -134,4 +144,3 @@ module BioDSL
|
|
|
134
144
|
end
|
|
135
145
|
end
|
|
136
146
|
end
|
|
137
|
-
|
data/lib/BioDSL/fork.rb
CHANGED
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of BioDSL (
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
data/lib/BioDSL/hamming.rb
CHANGED
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of BioDSL (
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
data/lib/BioDSL/helpers.rb
CHANGED
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of BioDSL (
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of BioDSL (
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of BioDSL (
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of BioDSL (
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of BioDSL (
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of BioDSL (
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
module BioDSL
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of BioDSL (
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
data/lib/BioDSL/html_report.rb
CHANGED
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of BioDSL (
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
data/lib/BioDSL/math.rb
CHANGED
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of BioDSL (
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
data/lib/BioDSL/mummer.rb
CHANGED
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
# #
|
|
21
21
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
22
22
|
# #
|
|
23
|
-
# This software is part of BioDSL (
|
|
23
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
24
24
|
# #
|
|
25
25
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
26
26
|
|
data/lib/BioDSL/pipeline.rb
CHANGED
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
# #
|
|
21
21
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
22
22
|
# #
|
|
23
|
-
# This software is part of BioDSL (
|
|
23
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
24
24
|
# #
|
|
25
25
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
26
26
|
module BioDSL
|
data/lib/BioDSL/seq.rb
CHANGED
|
@@ -1,30 +1,33 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
3
|
-
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk).
|
|
4
|
-
#
|
|
5
|
-
# This program is free software; you can redistribute it and/or
|
|
6
|
-
# modify it under the terms of the GNU General Public License
|
|
7
|
-
# as published by the Free Software Foundation; either version 2
|
|
8
|
-
# of the License, or (at your option) any later version.
|
|
9
|
-
#
|
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13
|
-
# GNU General Public License for more details.
|
|
14
|
-
#
|
|
15
|
-
# You should have received a copy of the GNU General Public License
|
|
16
|
-
# along with this program; if not, write to the Free Software
|
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
#
|
|
25
|
-
#
|
|
26
|
-
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of BioDSL (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
# Namespace for BioDSL.
|
|
27
29
|
module BioDSL
|
|
30
|
+
require 'English'
|
|
28
31
|
require 'narray'
|
|
29
32
|
require 'BioDSL/seq/ambiguity'
|
|
30
33
|
require 'BioDSL/seq/assemble'
|
|
@@ -40,12 +43,15 @@ module BioDSL
|
|
|
40
43
|
# Error class for all exceptions to do with Seq.
|
|
41
44
|
class SeqError < StandardError; end
|
|
42
45
|
|
|
46
|
+
# rubocop: disable ClassLength
|
|
47
|
+
|
|
48
|
+
# Class for manipulating sequences.
|
|
43
49
|
class Seq
|
|
44
50
|
# Residue alphabets
|
|
45
|
-
DNA = %w
|
|
46
|
-
RNA = %w
|
|
47
|
-
PROTEIN = %w
|
|
48
|
-
INDELS = %w
|
|
51
|
+
DNA = %w(a t c g)
|
|
52
|
+
RNA = %w(a u c g)
|
|
53
|
+
PROTEIN = %w(f l s y c w p h q r i m t n k v a d e g)
|
|
54
|
+
INDELS = %w(. - _ ~)
|
|
49
55
|
|
|
50
56
|
# Quality scores bases
|
|
51
57
|
SCORE_BASE = 33
|
|
@@ -69,30 +75,29 @@ module BioDSL
|
|
|
69
75
|
type = record[:SEQ_TYPE].to_sym if record[:SEQ_TYPE]
|
|
70
76
|
qual = record[:SCORES]
|
|
71
77
|
|
|
72
|
-
|
|
78
|
+
new(seq_name: seq_name, seq: seq, type: type, qual: qual)
|
|
73
79
|
end
|
|
74
80
|
|
|
75
|
-
# Class method that generates all possible oligos of a specified length and
|
|
81
|
+
# Class method that generates all possible oligos of a specified length and
|
|
82
|
+
# type.
|
|
76
83
|
def self.generate_oligos(length, type)
|
|
77
|
-
|
|
84
|
+
fail SeqError, "Bad length: #{length}" if length <= 0
|
|
78
85
|
|
|
79
86
|
case type.downcase
|
|
80
87
|
when :dna then alph = DNA
|
|
81
88
|
when :rna then alph = RNA
|
|
82
89
|
when :protein then alph = PROTEIN
|
|
83
90
|
else
|
|
84
|
-
|
|
91
|
+
fail SeqError, "Unknown sequence type: #{type}"
|
|
85
92
|
end
|
|
86
93
|
|
|
87
|
-
oligos = [
|
|
94
|
+
oligos = ['']
|
|
88
95
|
|
|
89
|
-
(1
|
|
96
|
+
(1..length).each do
|
|
90
97
|
list = []
|
|
91
98
|
|
|
92
99
|
oligos.each do |oligo|
|
|
93
|
-
alph.each
|
|
94
|
-
list << oligo + char
|
|
95
|
-
end
|
|
100
|
+
alph.each { |char| list << oligo + char }
|
|
96
101
|
end
|
|
97
102
|
|
|
98
103
|
oligos = list
|
|
@@ -103,24 +108,22 @@ module BioDSL
|
|
|
103
108
|
|
|
104
109
|
def self.check_name_pair(entry1, entry2)
|
|
105
110
|
if entry1.seq_name =~ /^([^ ]+) \d:/
|
|
106
|
-
name1 =
|
|
107
|
-
elsif entry1.seq_name =~
|
|
108
|
-
name1 =
|
|
111
|
+
name1 = Regexp.last_match[1]
|
|
112
|
+
elsif entry1.seq_name =~ %r{^(.+)\/\d$}
|
|
113
|
+
name1 = Regexp.last_match[1]
|
|
109
114
|
else
|
|
110
|
-
|
|
115
|
+
fail SeqError, "Could not match sequence name: #{entry1.seq_name}"
|
|
111
116
|
end
|
|
112
117
|
|
|
113
118
|
if entry2.seq_name =~ /^([^ ]+) \d:/
|
|
114
|
-
name2 =
|
|
115
|
-
elsif entry2.seq_name =~
|
|
116
|
-
name2 =
|
|
119
|
+
name2 = Regexp.last_match[1]
|
|
120
|
+
elsif entry2.seq_name =~ %r{^(.+)\/\d$}
|
|
121
|
+
name2 = Regexp.last_match[1]
|
|
117
122
|
else
|
|
118
|
-
|
|
123
|
+
fail SeqError, "Could not match sequence name: #{entry2.seq_name}"
|
|
119
124
|
end
|
|
120
125
|
|
|
121
|
-
if name1 != name2
|
|
122
|
-
raise SeqError, "Name mismatch: #{name1} != #{name2}"
|
|
123
|
-
end
|
|
126
|
+
fail SeqError, "Name mismatch: #{name1} != #{name2}" if name1 != name2
|
|
124
127
|
end
|
|
125
128
|
|
|
126
129
|
# Initialize a sequence object with the following options:
|
|
@@ -134,18 +137,19 @@ module BioDSL
|
|
|
134
137
|
@type = options[:type]
|
|
135
138
|
@qual = options[:qual]
|
|
136
139
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
140
|
+
return unless @seq && @qual
|
|
141
|
+
return if @seq.length == @qual.length
|
|
142
|
+
|
|
143
|
+
fail SeqError, 'Sequence length and score length mismatch: ' \
|
|
144
|
+
"#{@seq.length} != #{@qual.length}"
|
|
141
145
|
end
|
|
142
146
|
|
|
143
147
|
# Method that guesses and returns the sequence type
|
|
144
148
|
# by inspecting the first 100 residues.
|
|
145
149
|
def type_guess
|
|
146
|
-
|
|
150
|
+
fail SeqError, 'Guess failed: sequence is nil' if @seq.nil?
|
|
147
151
|
|
|
148
|
-
case
|
|
152
|
+
case @seq[0...100].downcase
|
|
149
153
|
when /[flpqie]/ then return :protein
|
|
150
154
|
when /[u]/ then return :rna
|
|
151
155
|
else return :dna
|
|
@@ -155,31 +159,31 @@ module BioDSL
|
|
|
155
159
|
# Method that guesses and sets the sequence type
|
|
156
160
|
# by inspecting the first 100 residues.
|
|
157
161
|
def type_guess!
|
|
158
|
-
|
|
162
|
+
@type = type_guess
|
|
159
163
|
self
|
|
160
164
|
end
|
|
161
165
|
|
|
162
166
|
# Returns the length of a sequence.
|
|
163
167
|
def length
|
|
164
|
-
|
|
168
|
+
@seq.nil? ? 0 : @seq.length
|
|
165
169
|
end
|
|
166
170
|
|
|
167
|
-
|
|
171
|
+
alias_method :len, :length
|
|
168
172
|
|
|
169
173
|
# Return the number of indels in a sequence.
|
|
170
174
|
def indels
|
|
171
175
|
regex = Regexp.new(/[#{Regexp.escape(INDELS.join(""))}]/)
|
|
172
|
-
|
|
176
|
+
@seq.scan(regex).size
|
|
173
177
|
end
|
|
174
178
|
|
|
175
179
|
# Method to remove indels from seq and, if present, from qual.
|
|
176
180
|
def indels_remove
|
|
177
|
-
if
|
|
178
|
-
|
|
181
|
+
if @qual.nil?
|
|
182
|
+
@seq.delete!(Regexp.escape(INDELS.join('')))
|
|
179
183
|
else
|
|
180
|
-
na_seq = NArray.to_na(
|
|
181
|
-
na_qual = NArray.to_na(
|
|
182
|
-
mask = NArray.byte(
|
|
184
|
+
na_seq = NArray.to_na(@seq, 'byte')
|
|
185
|
+
na_qual = NArray.to_na(@qual, 'byte')
|
|
186
|
+
mask = NArray.byte(length)
|
|
183
187
|
|
|
184
188
|
INDELS.each do |c|
|
|
185
189
|
mask += na_seq.eq(c.ord)
|
|
@@ -187,113 +191,113 @@ module BioDSL
|
|
|
187
191
|
|
|
188
192
|
mask = mask.eq(0)
|
|
189
193
|
|
|
190
|
-
|
|
191
|
-
|
|
194
|
+
@seq = na_seq[mask].to_s
|
|
195
|
+
@qual = na_qual[mask].to_s
|
|
192
196
|
end
|
|
193
197
|
|
|
194
198
|
self
|
|
195
199
|
end
|
|
196
200
|
|
|
197
201
|
# Method that returns true if a given sequence type is DNA.
|
|
198
|
-
def
|
|
199
|
-
|
|
202
|
+
def dna?
|
|
203
|
+
@type == :dna
|
|
200
204
|
end
|
|
201
205
|
|
|
202
206
|
# Method that returns true if a given sequence type is RNA.
|
|
203
|
-
def
|
|
204
|
-
|
|
207
|
+
def rna?
|
|
208
|
+
@type == :rna
|
|
205
209
|
end
|
|
206
210
|
|
|
207
211
|
# Method that returns true if a given sequence type is protein.
|
|
208
|
-
def
|
|
209
|
-
|
|
212
|
+
def protein?
|
|
213
|
+
@type == :protein
|
|
210
214
|
end
|
|
211
215
|
|
|
212
216
|
# Method to transcribe DNA to RNA.
|
|
213
217
|
def to_rna
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
+
fail SeqError, 'Cannot transcribe 0 length sequence' if length == 0
|
|
219
|
+
fail SeqError, 'Cannot transcribe sequence type: #{@type}' unless dna?
|
|
220
|
+
@type = :rna
|
|
221
|
+
@seq.tr!('Tt', 'Uu')
|
|
218
222
|
end
|
|
219
223
|
|
|
220
224
|
# Method to reverse-transcribe RNA to DNA.
|
|
221
225
|
def to_dna
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
+
fail SeqError, 'Cant reverse-transcribe 0 length sequence' if length == 0
|
|
227
|
+
fail SeqError, "Cant reverse-transcribe seq type: #{@type}" unless rna?
|
|
228
|
+
@type = :dna
|
|
229
|
+
@seq.tr!('Uu', 'Tt')
|
|
226
230
|
end
|
|
227
231
|
|
|
228
232
|
# Method that given a Seq entry returns a BioDSL record (a hash).
|
|
229
233
|
def to_bp
|
|
230
234
|
record = {}
|
|
231
|
-
record[:SEQ_NAME] =
|
|
232
|
-
record[:SEQ] =
|
|
233
|
-
record[:SEQ_LEN] =
|
|
234
|
-
record[:SCORES] =
|
|
235
|
+
record[:SEQ_NAME] = @seq_name if @seq_name
|
|
236
|
+
record[:SEQ] = @seq if @seq
|
|
237
|
+
record[:SEQ_LEN] = length if @seq
|
|
238
|
+
record[:SCORES] = @qual if @qual
|
|
235
239
|
record
|
|
236
240
|
end
|
|
237
241
|
|
|
238
242
|
# Method that given a Seq entry returns a FASTA entry (a string).
|
|
239
243
|
def to_fasta(wrap = nil)
|
|
240
|
-
|
|
241
|
-
|
|
244
|
+
fail SeqError, 'Missing seq_name' if @seq_name.nil? || @seq_name == ''
|
|
245
|
+
fail SeqError, 'Missing seq' if @seq.nil? || @seq.empty?
|
|
242
246
|
|
|
243
|
-
seq_name =
|
|
244
|
-
seq =
|
|
247
|
+
seq_name = @seq_name.to_s
|
|
248
|
+
seq = @seq.to_s
|
|
245
249
|
|
|
246
250
|
unless wrap.nil?
|
|
247
251
|
seq.gsub!(/(.{#{wrap}})/) do |match|
|
|
248
|
-
match <<
|
|
252
|
+
match << $INPUT_RECORD_SEPARATOR
|
|
249
253
|
end
|
|
250
254
|
|
|
251
255
|
seq.chomp!
|
|
252
256
|
end
|
|
253
257
|
|
|
254
|
-
">#{seq_name}#{
|
|
258
|
+
">#{seq_name}#{$INPUT_RECORD_SEPARATOR}#{seq}#{$INPUT_RECORD_SEPARATOR}"
|
|
255
259
|
end
|
|
256
260
|
|
|
257
261
|
# Method that given a Seq entry returns a FASTQ entry (a string).
|
|
258
262
|
def to_fastq
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
263
|
+
fail SeqError, 'Missing seq_name' if @seq_name.nil?
|
|
264
|
+
fail SeqError, 'Missing seq' if @seq.nil?
|
|
265
|
+
fail SeqError, 'Missing qual' if @qual.nil?
|
|
262
266
|
|
|
263
|
-
seq_name =
|
|
264
|
-
seq =
|
|
265
|
-
qual =
|
|
267
|
+
seq_name = @seq_name.to_s
|
|
268
|
+
seq = @seq.to_s
|
|
269
|
+
qual = @qual.to_s
|
|
266
270
|
|
|
267
|
-
"@#{seq_name}#{
|
|
271
|
+
"@#{seq_name}#{$RS}#{seq}#{$RS}+#{$RS}#{qual}#{$RS}"
|
|
268
272
|
end
|
|
269
273
|
|
|
270
274
|
# Method that generates a unique key for a
|
|
271
275
|
# DNA sequence and returns this key as a Fixnum.
|
|
272
276
|
def to_key
|
|
273
277
|
key = 0
|
|
274
|
-
|
|
275
|
-
|
|
278
|
+
|
|
279
|
+
@seq.upcase.each_char do |char|
|
|
276
280
|
key <<= 2
|
|
277
|
-
|
|
281
|
+
|
|
278
282
|
case char
|
|
279
283
|
when 'A' then key |= 0
|
|
280
284
|
when 'C' then key |= 1
|
|
281
285
|
when 'G' then key |= 2
|
|
282
286
|
when 'T' then key |= 3
|
|
283
|
-
else
|
|
287
|
+
else fail SeqError, "Bad residue: #{char}"
|
|
284
288
|
end
|
|
285
289
|
end
|
|
286
|
-
|
|
290
|
+
|
|
287
291
|
key
|
|
288
292
|
end
|
|
289
293
|
|
|
290
294
|
# Method to reverse the sequence.
|
|
291
295
|
def reverse
|
|
292
296
|
entry = Seq.new(
|
|
293
|
-
seq_name:
|
|
294
|
-
seq:
|
|
295
|
-
type:
|
|
296
|
-
qual: (
|
|
297
|
+
seq_name: @seq_name,
|
|
298
|
+
seq: @seq.reverse,
|
|
299
|
+
type: @type,
|
|
300
|
+
qual: (@qual ? @qual.reverse : @qual)
|
|
297
301
|
)
|
|
298
302
|
|
|
299
303
|
entry
|
|
@@ -301,27 +305,25 @@ module BioDSL
|
|
|
301
305
|
|
|
302
306
|
# Method to reverse the sequence.
|
|
303
307
|
def reverse!
|
|
304
|
-
|
|
305
|
-
|
|
308
|
+
@seq.reverse!
|
|
309
|
+
@qual.reverse! if @qual
|
|
306
310
|
self
|
|
307
311
|
end
|
|
308
312
|
|
|
309
313
|
# Method that complements sequence including ambiguity codes.
|
|
310
314
|
def complement
|
|
311
|
-
|
|
315
|
+
fail SeqError, 'Cannot complement 0 length sequence' if length == 0
|
|
312
316
|
|
|
313
|
-
entry = Seq.new(
|
|
314
|
-
seq_name: self.seq_name,
|
|
315
|
-
type: self.type,
|
|
316
|
-
qual: self.qual
|
|
317
|
-
)
|
|
317
|
+
entry = Seq.new(seq_name: @seq_name, type: @type, qual: @qual)
|
|
318
318
|
|
|
319
|
-
if
|
|
320
|
-
entry.seq =
|
|
321
|
-
|
|
322
|
-
|
|
319
|
+
if dna?
|
|
320
|
+
entry.seq = @seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
|
|
321
|
+
'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn')
|
|
322
|
+
elsif rna?
|
|
323
|
+
entry.seq = @seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
|
|
324
|
+
'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn')
|
|
323
325
|
else
|
|
324
|
-
|
|
326
|
+
fail SeqError, "Cannot complement sequence type: #{@type}"
|
|
325
327
|
end
|
|
326
328
|
|
|
327
329
|
entry
|
|
@@ -329,14 +331,16 @@ module BioDSL
|
|
|
329
331
|
|
|
330
332
|
# Method that complements sequence including ambiguity codes.
|
|
331
333
|
def complement!
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
if
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
334
|
+
fail SeqError, 'Cannot complement 0 length sequence' if length == 0
|
|
335
|
+
|
|
336
|
+
if dna?
|
|
337
|
+
@seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
|
|
338
|
+
'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn')
|
|
339
|
+
elsif rna?
|
|
340
|
+
@seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
|
|
341
|
+
'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn')
|
|
338
342
|
else
|
|
339
|
-
|
|
343
|
+
fail SeqError, "Cannot complement sequence type: #{@type}"
|
|
340
344
|
end
|
|
341
345
|
|
|
342
346
|
self
|
|
@@ -346,68 +350,70 @@ module BioDSL
|
|
|
346
350
|
# two Sequence objects (case insensitive).
|
|
347
351
|
def hamming_distance(entry, options = {})
|
|
348
352
|
if options[:ambiguity]
|
|
349
|
-
BioDSL::Hamming.distance(
|
|
353
|
+
BioDSL::Hamming.distance(@seq, entry.seq, options)
|
|
350
354
|
else
|
|
351
|
-
BioDSL::Hamming.distance(
|
|
355
|
+
BioDSL::Hamming.distance(@seq.upcase, entry.seq.upcase, options)
|
|
352
356
|
end
|
|
353
357
|
end
|
|
354
358
|
|
|
355
359
|
# Method to determine the Edit Distance between
|
|
356
360
|
# two Sequence objects (case insensitive).
|
|
357
361
|
def edit_distance(entry)
|
|
358
|
-
Levenshtein.distance(
|
|
362
|
+
Levenshtein.distance(@seq, entry.seq)
|
|
359
363
|
end
|
|
360
364
|
|
|
361
365
|
# Method that generates a random sequence of a given length and type.
|
|
362
366
|
def generate(length, type)
|
|
363
|
-
|
|
367
|
+
fail SeqError, "Cannot generate seq length < 1: #{length}" if length <= 0
|
|
364
368
|
|
|
365
369
|
case type
|
|
366
370
|
when :dna then alph = DNA
|
|
367
371
|
when :rna then alph = RNA
|
|
368
372
|
when :protein then alph = PROTEIN
|
|
369
373
|
else
|
|
370
|
-
|
|
374
|
+
fail SeqError, "Unknown sequence type: #{type}"
|
|
371
375
|
end
|
|
372
376
|
|
|
373
|
-
seq_new
|
|
374
|
-
|
|
375
|
-
|
|
377
|
+
seq_new = Array.new(length) { alph[rand(alph.size)] }.join('')
|
|
378
|
+
@seq = seq_new
|
|
379
|
+
@type = type
|
|
380
|
+
|
|
376
381
|
seq_new
|
|
377
382
|
end
|
|
378
383
|
|
|
379
384
|
# Method to return a new Seq object with shuffled sequence.
|
|
380
385
|
def shuffle
|
|
381
386
|
Seq.new(
|
|
382
|
-
seq_name:
|
|
383
|
-
seq:
|
|
384
|
-
type:
|
|
385
|
-
qual:
|
|
387
|
+
seq_name: @seq_name,
|
|
388
|
+
seq: @seq.split('').shuffle!.join,
|
|
389
|
+
type: @type,
|
|
390
|
+
qual: @qual
|
|
386
391
|
)
|
|
387
392
|
end
|
|
388
393
|
|
|
389
394
|
# Method to shuffle a sequence randomly inline.
|
|
390
395
|
def shuffle!
|
|
391
|
-
|
|
396
|
+
@seq = @seq.split('').shuffle!.join
|
|
392
397
|
self
|
|
393
398
|
end
|
|
394
399
|
|
|
395
400
|
# Method to add two Seq objects.
|
|
396
|
-
def +(
|
|
397
|
-
new_entry = Seq.new
|
|
398
|
-
new_entry.seq =
|
|
399
|
-
new_entry.type =
|
|
400
|
-
new_entry.qual =
|
|
401
|
+
def +(other)
|
|
402
|
+
new_entry = Seq.new
|
|
403
|
+
new_entry.seq = @seq + other.seq
|
|
404
|
+
new_entry.type = @type if @type == other.type
|
|
405
|
+
new_entry.qual = @qual + other.qual if @qual && other.qual
|
|
401
406
|
new_entry
|
|
402
407
|
end
|
|
403
408
|
|
|
404
409
|
# Method to concatenate sequence entries.
|
|
405
410
|
def <<(entry)
|
|
406
|
-
|
|
407
|
-
|
|
411
|
+
fail SeqError, 'sequences of different types' unless @type == entry.type
|
|
412
|
+
fail SeqError, 'qual is missing in one entry' unless @qual.class ==
|
|
413
|
+
entry.qual.class
|
|
408
414
|
|
|
409
|
-
|
|
410
|
-
|
|
415
|
+
@seq << entry.seq
|
|
416
|
+
@qual << entry.qual unless entry.qual.nil?
|
|
411
417
|
|
|
412
418
|
self
|
|
413
419
|
end
|
|
@@ -415,18 +421,18 @@ module BioDSL
|
|
|
415
421
|
# Index method for Seq objects.
|
|
416
422
|
def [](*args)
|
|
417
423
|
entry = Seq.new
|
|
418
|
-
entry.seq_name =
|
|
419
|
-
entry.seq =
|
|
420
|
-
entry.type =
|
|
421
|
-
entry.qual =
|
|
424
|
+
entry.seq_name = @seq_name.dup unless @seq_name.nil?
|
|
425
|
+
entry.seq = @seq[*args] || ''
|
|
426
|
+
entry.type = @type
|
|
427
|
+
entry.qual = @qual[*args] || '' unless @qual.nil?
|
|
422
428
|
|
|
423
429
|
entry
|
|
424
430
|
end
|
|
425
431
|
|
|
426
432
|
# Index assignment method for Seq objects.
|
|
427
433
|
def []=(*args, entry)
|
|
428
|
-
|
|
429
|
-
|
|
434
|
+
@seq[*args] = entry.seq[*args]
|
|
435
|
+
@qual[*args] = entry.qual[*args] unless @qual.nil?
|
|
430
436
|
|
|
431
437
|
self
|
|
432
438
|
end
|
|
@@ -437,7 +443,7 @@ module BioDSL
|
|
|
437
443
|
def composition
|
|
438
444
|
comp = Hash.new(0);
|
|
439
445
|
|
|
440
|
-
|
|
446
|
+
@seq.upcase.each_char do |char|
|
|
441
447
|
comp[char] += 1
|
|
442
448
|
end
|
|
443
449
|
|
|
@@ -447,30 +453,33 @@ module BioDSL
|
|
|
447
453
|
# Method that returns the percentage of hard masked residues
|
|
448
454
|
# or N's in a sequence.
|
|
449
455
|
def hard_mask
|
|
450
|
-
((
|
|
456
|
+
((@seq.upcase.scan('N').size.to_f / (length - indels).to_f) * 100).
|
|
457
|
+
round(2)
|
|
451
458
|
end
|
|
452
459
|
|
|
453
460
|
# Method that returns the percentage of soft masked residues
|
|
454
461
|
# or lower cased residues in a sequence.
|
|
455
462
|
def soft_mask
|
|
456
|
-
((
|
|
463
|
+
((@seq.scan(/[a-z]/).size.to_f / (length - indels).to_f) * 100).round(2)
|
|
457
464
|
end
|
|
458
465
|
|
|
459
|
-
# Hard masks sequence residues where the corresponding quality
|
|
460
|
-
#
|
|
466
|
+
# Hard masks sequence residues where the corresponding quality score is below
|
|
467
|
+
# a given cutoff.
|
|
461
468
|
def mask_seq_hard!(cutoff)
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
469
|
+
fail SeqError, 'seq is nil' if @seq.nil?
|
|
470
|
+
fail SeqError, 'qual is nil' if @qual.nil?
|
|
471
|
+
fail SeqError, "cufoff value: #{cutoff} out of range: " \
|
|
472
|
+
"#{SCORE_MIN}..#{SCORE_MAX}" unless (SCORE_MIN..SCORE_MAX).
|
|
473
|
+
include? cutoff
|
|
474
|
+
|
|
475
|
+
na_seq = NArray.to_na(@seq.upcase, 'byte')
|
|
476
|
+
na_qual = NArray.to_na(@qual, 'byte')
|
|
468
477
|
mask = (na_qual - SCORE_BASE) < cutoff
|
|
469
|
-
mask
|
|
478
|
+
mask *= na_seq.ne('-'.ord)
|
|
470
479
|
|
|
471
480
|
na_seq[mask] = 'N'.ord
|
|
472
481
|
|
|
473
|
-
|
|
482
|
+
@seq = na_seq.to_s
|
|
474
483
|
|
|
475
484
|
self
|
|
476
485
|
end
|
|
@@ -479,18 +488,20 @@ module BioDSL
|
|
|
479
488
|
# is below a given cutoff. Masked sequence will be lowercased and
|
|
480
489
|
# remaining will be uppercased.
|
|
481
490
|
def mask_seq_soft!(cutoff)
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
491
|
+
fail SeqError, 'seq is nil' if @seq.nil?
|
|
492
|
+
fail SeqError, 'qual is nil' if @qual.nil?
|
|
493
|
+
fail SeqError, "cufoff value: #{cutoff} out of range: " \
|
|
494
|
+
"#{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN..SCORE_MAX).
|
|
495
|
+
include? cutoff
|
|
496
|
+
|
|
497
|
+
na_seq = NArray.to_na(@seq.upcase, 'byte')
|
|
498
|
+
na_qual = NArray.to_na(@qual, 'byte')
|
|
488
499
|
mask = (na_qual - SCORE_BASE) < cutoff
|
|
489
|
-
mask
|
|
500
|
+
mask *= na_seq.ne('-'.ord)
|
|
490
501
|
|
|
491
502
|
na_seq[mask] ^= ' '.ord
|
|
492
503
|
|
|
493
|
-
|
|
504
|
+
@seq = na_seq.to_s
|
|
494
505
|
|
|
495
506
|
self
|
|
496
507
|
end
|
|
@@ -498,22 +509,22 @@ module BioDSL
|
|
|
498
509
|
# Method that determines if a quality score string can be
|
|
499
510
|
# absolutely identified as base 33.
|
|
500
511
|
def qual_base33?
|
|
501
|
-
|
|
512
|
+
@qual.match(/[!-:]/) ? true : false
|
|
502
513
|
end
|
|
503
|
-
|
|
514
|
+
|
|
504
515
|
# Method that determines if a quality score string may be base 64.
|
|
505
516
|
def qual_base64?
|
|
506
|
-
|
|
517
|
+
@qual.match(/[K-h]/) ? true : false
|
|
507
518
|
end
|
|
508
519
|
|
|
509
520
|
# Method to determine if a quality score is valid accepting only 0-40 range.
|
|
510
521
|
def qual_valid?(encoding)
|
|
511
|
-
|
|
522
|
+
fail SeqError, 'Missing qual' if @qual.nil?
|
|
512
523
|
|
|
513
524
|
case encoding
|
|
514
|
-
when :base_33 then return true if
|
|
515
|
-
when :base_64 then return true if
|
|
516
|
-
else
|
|
525
|
+
when :base_33 then return true if @qual.match(/^[!-I]*$/)
|
|
526
|
+
when :base_64 then return true if @qual.match(/^[@-h]*$/)
|
|
527
|
+
else fail SeqError, "unknown quality score encoding: #{encoding}"
|
|
517
528
|
end
|
|
518
529
|
|
|
519
530
|
false
|
|
@@ -521,28 +532,34 @@ module BioDSL
|
|
|
521
532
|
|
|
522
533
|
# Method to coerce quality scores to be within the 0-40 range.
|
|
523
534
|
def qual_coerce!(encoding)
|
|
524
|
-
|
|
535
|
+
fail SeqError, 'Missing qual' if @qual.nil?
|
|
525
536
|
|
|
526
537
|
case encoding
|
|
527
|
-
when :base_33 then qual_coerce_C(
|
|
528
|
-
when :base_64 then qual_coerce_C(
|
|
538
|
+
when :base_33 then qual_coerce_C(@qual, @qual.length, 33, 73) # !-J
|
|
539
|
+
when :base_64 then qual_coerce_C(@qual, @qual.length, 64, 104) # @-h
|
|
529
540
|
else
|
|
530
|
-
|
|
531
|
-
end
|
|
541
|
+
fail SeqError, "unknown quality score encoding: #{encoding}"
|
|
542
|
+
end
|
|
532
543
|
|
|
533
544
|
self
|
|
534
545
|
end
|
|
535
546
|
|
|
536
547
|
# Method to convert quality scores.
|
|
537
548
|
def qual_convert!(from, to)
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
549
|
+
unless from == :base_33 || from == :base_64
|
|
550
|
+
fail SeqError, "unknown quality score encoding: #{from}"
|
|
551
|
+
end
|
|
552
|
+
|
|
553
|
+
unless to == :base_33 || to == :base_64
|
|
554
|
+
fail SeqError, "unknown quality score encoding: #{to}"
|
|
555
|
+
end
|
|
556
|
+
|
|
557
|
+
if from == :base_33 && to == :base_64
|
|
558
|
+
qual_convert_C(@qual, @qual.length, 31) # += 64 - 33
|
|
559
|
+
elsif from == :base_64 && to == :base_33
|
|
560
|
+
# Handle negative Solexa values from -5 to -1 (set these to 0).
|
|
561
|
+
qual_coerce_C(@qual, @qual.length, 64, 104)
|
|
562
|
+
qual_convert_C(@qual, @qual.length, -31) # -= 64 - 33
|
|
546
563
|
end
|
|
547
564
|
|
|
548
565
|
self
|
|
@@ -550,9 +567,9 @@ module BioDSL
|
|
|
550
567
|
|
|
551
568
|
# Method to calculate and return the mean quality score.
|
|
552
569
|
def scores_mean
|
|
553
|
-
|
|
570
|
+
fail SeqError, 'Missing qual in entry' if @qual.nil?
|
|
554
571
|
|
|
555
|
-
na_qual = NArray.to_na(
|
|
572
|
+
na_qual = NArray.to_na(@qual, 'byte')
|
|
556
573
|
na_qual -= SCORE_BASE
|
|
557
574
|
|
|
558
575
|
na_qual.mean
|
|
@@ -560,9 +577,9 @@ module BioDSL
|
|
|
560
577
|
|
|
561
578
|
# Method to calculate and return the min quality score.
|
|
562
579
|
def scores_min
|
|
563
|
-
|
|
580
|
+
fail SeqError, 'Missing qual in entry' if @qual.nil?
|
|
564
581
|
|
|
565
|
-
na_qual = NArray.to_na(
|
|
582
|
+
na_qual = NArray.to_na(@qual, 'byte')
|
|
566
583
|
na_qual -= SCORE_BASE
|
|
567
584
|
|
|
568
585
|
na_qual.min
|
|
@@ -570,9 +587,9 @@ module BioDSL
|
|
|
570
587
|
|
|
571
588
|
# Method to calculate and return the max quality score.
|
|
572
589
|
def scores_max
|
|
573
|
-
|
|
590
|
+
fail SeqError, 'Missing qual in entry' if @qual.nil?
|
|
574
591
|
|
|
575
|
-
na_qual = NArray.to_na(
|
|
592
|
+
na_qual = NArray.to_na(@qual, 'byte')
|
|
576
593
|
na_qual -= SCORE_BASE
|
|
577
594
|
|
|
578
595
|
na_qual.max
|
|
@@ -582,17 +599,17 @@ module BioDSL
|
|
|
582
599
|
# scores string and calculate for each window the mean score and return
|
|
583
600
|
# the minimum mean score.
|
|
584
601
|
def scores_mean_local(window_size)
|
|
585
|
-
|
|
602
|
+
fail SeqError, 'Missing qual in entry' if @qual.nil?
|
|
586
603
|
|
|
587
|
-
scores_mean_local_C(
|
|
604
|
+
scores_mean_local_C(@qual, @qual.length, SCORE_BASE, window_size)
|
|
588
605
|
end
|
|
589
606
|
|
|
590
607
|
# Method to find open reading frames (ORFs).
|
|
591
608
|
def each_orf(options = {})
|
|
592
|
-
size_min = options[:size_min]
|
|
593
|
-
size_max = options[:size_max]
|
|
594
|
-
start_codons = options[:start_codons] ||
|
|
595
|
-
stop_codons = options[:stop_codons]
|
|
609
|
+
size_min = options[:size_min] || 0
|
|
610
|
+
size_max = options[:size_max] || length
|
|
611
|
+
start_codons = options[:start_codons] || 'ATG,GTG,AUG,GUG'
|
|
612
|
+
stop_codons = options[:stop_codons] || 'TAA,TGA,TAG,UAA,UGA,UAG'
|
|
596
613
|
pick_longest = options[:pick_longest]
|
|
597
614
|
|
|
598
615
|
orfs = []
|
|
@@ -601,22 +618,23 @@ module BioDSL
|
|
|
601
618
|
regex_start = Regexp.new(start_codons.split(',').join('|'), true)
|
|
602
619
|
regex_stop = Regexp.new(stop_codons.split(',').join('|'), true)
|
|
603
620
|
|
|
604
|
-
while pos_beg
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
621
|
+
while pos_beg && pos_beg < length - size_min
|
|
622
|
+
pos_beg = @seq.index(regex_start, pos_beg)
|
|
623
|
+
next unless pos_beg
|
|
624
|
+
pos_end = @seq.index(regex_stop, pos_beg)
|
|
625
|
+
next unless pos_end
|
|
608
626
|
|
|
609
|
-
|
|
610
|
-
if size_min <= length and length <= size_max
|
|
611
|
-
subseq = self[pos_beg ... pos_beg + length]
|
|
627
|
+
orf_length = (pos_end - pos_beg) + 3
|
|
612
628
|
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
end
|
|
629
|
+
if (orf_length % 3) == 0
|
|
630
|
+
if size_min <= orf_length && orf_length <= size_max
|
|
631
|
+
subseq = self[pos_beg...pos_beg + orf_length]
|
|
617
632
|
|
|
618
|
-
|
|
633
|
+
orfs << Orf.new(subseq, pos_beg, pos_end + 2)
|
|
634
|
+
end
|
|
619
635
|
end
|
|
636
|
+
|
|
637
|
+
pos_beg += 1
|
|
620
638
|
end
|
|
621
639
|
|
|
622
640
|
if pick_longest
|
|
@@ -634,17 +652,8 @@ module BioDSL
|
|
|
634
652
|
end
|
|
635
653
|
end
|
|
636
654
|
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
def initialize(entry, start, stop)
|
|
641
|
-
@entry = entry
|
|
642
|
-
@start = start
|
|
643
|
-
@stop = stop
|
|
644
|
-
end
|
|
645
|
-
end
|
|
646
|
-
|
|
647
|
-
private
|
|
655
|
+
# Struct for holding an ORF.
|
|
656
|
+
Orf = Struct.new(:entry, :start, :stop)
|
|
648
657
|
|
|
649
658
|
inline do |builder|
|
|
650
659
|
builder.c %{
|