BioDSL 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/BioDSL.gemspec +1 -1
- data/Gemfile +6 -0
- data/README.md +289 -155
- data/Rakefile +18 -16
- data/lib/BioDSL.rb +1 -1
- data/lib/BioDSL/cary.rb +78 -53
- data/lib/BioDSL/command.rb +2 -2
- data/lib/BioDSL/commands.rb +1 -1
- data/lib/BioDSL/commands/add_key.rb +1 -1
- data/lib/BioDSL/commands/align_seq_mothur.rb +4 -4
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +5 -5
- data/lib/BioDSL/commands/assemble_pairs.rb +13 -13
- data/lib/BioDSL/commands/assemble_seq_idba.rb +7 -9
- data/lib/BioDSL/commands/assemble_seq_ray.rb +13 -13
- data/lib/BioDSL/commands/assemble_seq_spades.rb +4 -4
- data/lib/BioDSL/commands/classify_seq.rb +8 -8
- data/lib/BioDSL/commands/classify_seq_mothur.rb +5 -5
- data/lib/BioDSL/commands/clip_primer.rb +7 -7
- data/lib/BioDSL/commands/cluster_otus.rb +5 -5
- data/lib/BioDSL/commands/collapse_otus.rb +2 -2
- data/lib/BioDSL/commands/collect_otus.rb +2 -2
- data/lib/BioDSL/commands/complement_seq.rb +4 -4
- data/lib/BioDSL/commands/count.rb +1 -1
- data/lib/BioDSL/commands/count_values.rb +2 -2
- data/lib/BioDSL/commands/degap_seq.rb +6 -7
- data/lib/BioDSL/commands/dereplicate_seq.rb +1 -1
- data/lib/BioDSL/commands/dump.rb +2 -2
- data/lib/BioDSL/commands/filter_rrna.rb +4 -4
- data/lib/BioDSL/commands/genecall.rb +7 -7
- data/lib/BioDSL/commands/grab.rb +1 -1
- data/lib/BioDSL/commands/index_taxonomy.rb +3 -3
- data/lib/BioDSL/commands/mask_seq.rb +4 -4
- data/lib/BioDSL/commands/mean_scores.rb +2 -2
- data/lib/BioDSL/commands/merge_pair_seq.rb +3 -3
- data/lib/BioDSL/commands/merge_table.rb +1 -1
- data/lib/BioDSL/commands/merge_values.rb +1 -1
- data/lib/BioDSL/commands/plot_heatmap.rb +4 -5
- data/lib/BioDSL/commands/plot_histogram.rb +4 -4
- data/lib/BioDSL/commands/plot_matches.rb +5 -5
- data/lib/BioDSL/commands/plot_residue_distribution.rb +6 -6
- data/lib/BioDSL/commands/plot_scores.rb +7 -7
- data/lib/BioDSL/commands/random.rb +1 -1
- data/lib/BioDSL/commands/read_fasta.rb +9 -9
- data/lib/BioDSL/commands/read_fastq.rb +16 -16
- data/lib/BioDSL/commands/read_table.rb +2 -3
- data/lib/BioDSL/commands/reverse_seq.rb +4 -4
- data/lib/BioDSL/commands/slice_align.rb +4 -4
- data/lib/BioDSL/commands/slice_seq.rb +3 -3
- data/lib/BioDSL/commands/sort.rb +1 -1
- data/lib/BioDSL/commands/split_pair_seq.rb +6 -7
- data/lib/BioDSL/commands/split_values.rb +2 -2
- data/lib/BioDSL/commands/trim_primer.rb +13 -8
- data/lib/BioDSL/commands/trim_seq.rb +5 -5
- data/lib/BioDSL/commands/uchime_ref.rb +6 -6
- data/lib/BioDSL/commands/uclust.rb +5 -5
- data/lib/BioDSL/commands/unique_values.rb +1 -1
- data/lib/BioDSL/commands/usearch_global.rb +2 -2
- data/lib/BioDSL/commands/usearch_local.rb +2 -2
- data/lib/BioDSL/commands/write_fasta.rb +7 -9
- data/lib/BioDSL/commands/write_fastq.rb +4 -4
- data/lib/BioDSL/commands/write_table.rb +3 -3
- data/lib/BioDSL/commands/write_tree.rb +2 -3
- data/lib/BioDSL/config.rb +2 -2
- data/lib/BioDSL/csv.rb +8 -10
- data/lib/BioDSL/debug.rb +1 -1
- data/lib/BioDSL/fasta.rb +54 -40
- data/lib/BioDSL/fastq.rb +35 -32
- data/lib/BioDSL/filesys.rb +56 -47
- data/lib/BioDSL/fork.rb +1 -1
- data/lib/BioDSL/hamming.rb +1 -1
- data/lib/BioDSL/helpers.rb +1 -1
- data/lib/BioDSL/helpers/aux_helper.rb +1 -1
- data/lib/BioDSL/helpers/email_helper.rb +1 -1
- data/lib/BioDSL/helpers/history_helper.rb +1 -1
- data/lib/BioDSL/helpers/log_helper.rb +1 -1
- data/lib/BioDSL/helpers/options_helper.rb +1 -1
- data/lib/BioDSL/helpers/status_helper.rb +1 -1
- data/lib/BioDSL/html_report.rb +1 -1
- data/lib/BioDSL/math.rb +1 -1
- data/lib/BioDSL/mummer.rb +1 -1
- data/lib/BioDSL/pipeline.rb +1 -1
- data/lib/BioDSL/seq.rb +240 -231
- data/lib/BioDSL/seq/ambiguity.rb +1 -1
- data/lib/BioDSL/seq/assemble.rb +1 -1
- data/lib/BioDSL/seq/backtrack.rb +93 -76
- data/lib/BioDSL/seq/digest.rb +1 -1
- data/lib/BioDSL/seq/dynamic.rb +43 -55
- data/lib/BioDSL/seq/homopolymer.rb +34 -36
- data/lib/BioDSL/seq/kmer.rb +67 -50
- data/lib/BioDSL/seq/levenshtein.rb +35 -40
- data/lib/BioDSL/seq/translate.rb +64 -55
- data/lib/BioDSL/seq/trim.rb +60 -50
- data/lib/BioDSL/serializer.rb +1 -1
- data/lib/BioDSL/stream.rb +1 -1
- data/lib/BioDSL/taxonomy.rb +1 -1
- data/lib/BioDSL/test.rb +1 -1
- data/lib/BioDSL/tmp_dir.rb +1 -1
- data/lib/BioDSL/usearch.rb +1 -1
- data/lib/BioDSL/verbose.rb +1 -1
- data/lib/BioDSL/version.rb +2 -2
- data/test/BioDSL/commands/test_add_key.rb +1 -1
- data/test/BioDSL/commands/test_align_seq_mothur.rb +1 -1
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +1 -1
- data/test/BioDSL/commands/test_assemble_pairs.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +1 -1
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +1 -1
- data/test/BioDSL/commands/test_classify_seq.rb +1 -1
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +1 -1
- data/test/BioDSL/commands/test_clip_primer.rb +1 -1
- data/test/BioDSL/commands/test_cluster_otus.rb +1 -1
- data/test/BioDSL/commands/test_collapse_otus.rb +1 -1
- data/test/BioDSL/commands/test_collect_otus.rb +1 -1
- data/test/BioDSL/commands/test_complement_seq.rb +1 -1
- data/test/BioDSL/commands/test_count.rb +1 -1
- data/test/BioDSL/commands/test_count_values.rb +1 -1
- data/test/BioDSL/commands/test_degap_seq.rb +1 -1
- data/test/BioDSL/commands/test_dereplicate_seq.rb +1 -1
- data/test/BioDSL/commands/test_dump.rb +1 -1
- data/test/BioDSL/commands/test_filter_rrna.rb +1 -1
- data/test/BioDSL/commands/test_genecall.rb +1 -1
- data/test/BioDSL/commands/test_grab.rb +1 -1
- data/test/BioDSL/commands/test_index_taxonomy.rb +1 -1
- data/test/BioDSL/commands/test_mask_seq.rb +1 -1
- data/test/BioDSL/commands/test_mean_scores.rb +1 -1
- data/test/BioDSL/commands/test_merge_pair_seq.rb +1 -1
- data/test/BioDSL/commands/test_merge_table.rb +1 -1
- data/test/BioDSL/commands/test_merge_values.rb +1 -1
- data/test/BioDSL/commands/test_plot_heatmap.rb +1 -1
- data/test/BioDSL/commands/test_plot_histogram.rb +1 -1
- data/test/BioDSL/commands/test_plot_matches.rb +1 -1
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +1 -1
- data/test/BioDSL/commands/test_plot_scores.rb +1 -1
- data/test/BioDSL/commands/test_random.rb +1 -1
- data/test/BioDSL/commands/test_read_fasta.rb +1 -1
- data/test/BioDSL/commands/test_read_fastq.rb +1 -1
- data/test/BioDSL/commands/test_read_table.rb +1 -1
- data/test/BioDSL/commands/test_reverse_seq.rb +1 -1
- data/test/BioDSL/commands/test_slice_align.rb +1 -1
- data/test/BioDSL/commands/test_slice_seq.rb +1 -1
- data/test/BioDSL/commands/test_sort.rb +1 -1
- data/test/BioDSL/commands/test_split_pair_seq.rb +1 -1
- data/test/BioDSL/commands/test_split_values.rb +1 -1
- data/test/BioDSL/commands/test_trim_primer.rb +1 -1
- data/test/BioDSL/commands/test_trim_seq.rb +1 -1
- data/test/BioDSL/commands/test_uchime_ref.rb +1 -1
- data/test/BioDSL/commands/test_uclust.rb +1 -1
- data/test/BioDSL/commands/test_unique_values.rb +1 -1
- data/test/BioDSL/commands/test_usearch_global.rb +1 -1
- data/test/BioDSL/commands/test_usearch_local.rb +1 -1
- data/test/BioDSL/commands/test_write_fasta.rb +1 -1
- data/test/BioDSL/commands/test_write_fastq.rb +1 -1
- data/test/BioDSL/commands/test_write_table.rb +1 -1
- data/test/BioDSL/commands/test_write_tree.rb +1 -1
- data/test/BioDSL/helpers/test_options_helper.rb +3 -3
- data/test/BioDSL/seq/test_assemble.rb +58 -56
- data/test/BioDSL/seq/test_backtrack.rb +83 -81
- data/test/BioDSL/seq/test_digest.rb +47 -45
- data/test/BioDSL/seq/test_dynamic.rb +66 -64
- data/test/BioDSL/seq/test_homopolymer.rb +35 -33
- data/test/BioDSL/seq/test_kmer.rb +29 -28
- data/test/BioDSL/seq/test_translate.rb +44 -42
- data/test/BioDSL/seq/test_trim.rb +59 -57
- data/test/BioDSL/test_cary.rb +1 -1
- data/test/BioDSL/test_command.rb +2 -2
- data/test/BioDSL/test_csv.rb +34 -31
- data/test/BioDSL/test_debug.rb +31 -31
- data/test/BioDSL/test_fasta.rb +30 -29
- data/test/BioDSL/test_fastq.rb +27 -26
- data/test/BioDSL/test_filesys.rb +28 -27
- data/test/BioDSL/test_fork.rb +29 -28
- data/test/BioDSL/test_math.rb +31 -30
- data/test/BioDSL/test_mummer.rb +1 -1
- data/test/BioDSL/test_pipeline.rb +1 -1
- data/test/BioDSL/test_seq.rb +42 -41
- data/test/BioDSL/test_serializer.rb +35 -33
- data/test/BioDSL/test_stream.rb +28 -27
- data/test/BioDSL/test_taxonomy.rb +38 -37
- data/test/BioDSL/test_test.rb +32 -31
- data/test/BioDSL/test_tmp_dir.rb +1 -1
- data/test/BioDSL/test_usearch.rb +28 -27
- data/test/BioDSL/test_verbose.rb +32 -31
- data/test/helper.rb +34 -31
- metadata +3 -2
data/lib/BioDSL/seq/ambiguity.rb
CHANGED
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of BioDSL (
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
data/lib/BioDSL/seq/assemble.rb
CHANGED
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of the BioDSL
|
|
24
|
+
# This software is part of the BioDSL (www.BioDSL.org). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
data/lib/BioDSL/seq/backtrack.rb
CHANGED
|
@@ -1,36 +1,37 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
3
|
-
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk).
|
|
4
|
-
#
|
|
5
|
-
# This program is free software; you can redistribute it and/or
|
|
6
|
-
# modify it under the terms of the GNU General Public License
|
|
7
|
-
# as published by the Free Software Foundation; either version 2
|
|
8
|
-
# of the License, or (at your option) any later version.
|
|
9
|
-
#
|
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13
|
-
# GNU General Public License for more details.
|
|
14
|
-
#
|
|
15
|
-
# You should have received a copy of the GNU General Public License
|
|
16
|
-
# along with this program; if not, write to the Free Software
|
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
#
|
|
25
|
-
#
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
26
27
|
|
|
27
28
|
module BioDSL
|
|
28
29
|
# Error class for all exceptions to do with BackTrack.
|
|
29
30
|
class BackTrackError < StandardError; end
|
|
30
31
|
|
|
31
|
-
# Module containing code to locate nucleotide patterns in sequences allowing
|
|
32
|
-
# ambiguity codes and a given maximum mismatches, insertions, and
|
|
33
|
-
# pattern match engine is based on a backtrack algorithm.
|
|
32
|
+
# Module containing code to locate nucleotide patterns in sequences allowing
|
|
33
|
+
# for ambiguity codes and a given maximum mismatches, insertions, and
|
|
34
|
+
# deletions. The pattern match engine is based on a backtrack algorithm.
|
|
34
35
|
# Insertions are nucleotides found in the pattern but not in the sequence.
|
|
35
36
|
# Deletions are nucleotides found in the sequence but not in the pattern.
|
|
36
37
|
# Algorithm based on code kindly provided by j_random_hacker @ Stackoverflow:
|
|
@@ -43,7 +44,7 @@ module BioDSL
|
|
|
43
44
|
MAX_INS = 5 # Maximum number of insertions allowed
|
|
44
45
|
MAX_DEL = 5 # Maximum number of deletions allowed
|
|
45
46
|
|
|
46
|
-
#
|
|
47
|
+
# --------------------------------------------------------------------------
|
|
47
48
|
# str.patmatch(pattern[, options])
|
|
48
49
|
# -> Match
|
|
49
50
|
# str.patmatch(pattern[, options]) { |match|
|
|
@@ -58,20 +59,20 @@ module BioDSL
|
|
|
58
59
|
# :max_insertions
|
|
59
60
|
# :max_deletions
|
|
60
61
|
#
|
|
61
|
-
#
|
|
62
|
-
# Method to iterate through a sequence from a given start position to the
|
|
63
|
-
# the sequence or to a given stop position to locate a pattern
|
|
64
|
-
# maximum number of mismatches, insertions, and deletions.
|
|
65
|
-
# nucleotides found in the pattern but not in the sequence.
|
|
66
|
-
# nucleotides found in the sequence but not in the pattern.
|
|
62
|
+
# --------------------------------------------------------------------------
|
|
63
|
+
# Method to iterate through a sequence from a given start position to the
|
|
64
|
+
# end of the sequence or to a given stop position to locate a pattern
|
|
65
|
+
# allowing for a maximum number of mismatches, insertions, and deletions.
|
|
66
|
+
# Insertions are nucleotides found in the pattern but not in the sequence.
|
|
67
|
+
# Deletions are nucleotides found in the sequence but not in the pattern.
|
|
67
68
|
def patmatch(pattern, options = {})
|
|
68
|
-
options[:start]
|
|
69
|
-
options[:stop]
|
|
69
|
+
options[:start] ||= 0
|
|
70
|
+
options[:stop] ||= length - 1
|
|
70
71
|
options[:max_mismatches] ||= 0
|
|
71
72
|
options[:max_insertions] ||= 0
|
|
72
|
-
options[:max_deletions]
|
|
73
|
+
options[:max_deletions] ||= 0
|
|
73
74
|
|
|
74
|
-
|
|
75
|
+
patscan(pattern, options) do |m|
|
|
75
76
|
if block_given?
|
|
76
77
|
yield m
|
|
77
78
|
else
|
|
@@ -80,7 +81,7 @@ module BioDSL
|
|
|
80
81
|
end
|
|
81
82
|
end
|
|
82
83
|
|
|
83
|
-
#
|
|
84
|
+
# --------------------------------------------------------------------------
|
|
84
85
|
# str.patscan(pattern[, options])
|
|
85
86
|
# -> Array
|
|
86
87
|
# str.patscan(pattern[, options]) { |match|
|
|
@@ -95,39 +96,57 @@ module BioDSL
|
|
|
95
96
|
# :max_insertions
|
|
96
97
|
# :max_deletions
|
|
97
98
|
#
|
|
98
|
-
#
|
|
99
|
-
# Method to iterate through a sequence from a given start position to the
|
|
100
|
-
# the sequence or to a given stop position to locate a pattern
|
|
101
|
-
# maximum number of mismatches, insertions, and deletions.
|
|
102
|
-
# nucleotides found in the pattern but not in the sequence.
|
|
103
|
-
# nucleotides found in the sequence but not in the pattern.
|
|
104
|
-
# block context return the Match object. Otherwise matches
|
|
105
|
-
# Array of Match objects.
|
|
99
|
+
# --------------------------------------------------------------------------
|
|
100
|
+
# Method to iterate through a sequence from a given start position to the
|
|
101
|
+
# end of the sequence or to a given stop position to locate a pattern
|
|
102
|
+
# allowing for a maximum number of mismatches, insertions, and deletions.
|
|
103
|
+
# Insertions are nucleotides found in the pattern but not in the sequence.
|
|
104
|
+
# Deletions are nucleotides found in the sequence but not in the pattern.
|
|
105
|
+
# Matches found in block context return the Match object. Otherwise matches
|
|
106
|
+
# are returned in an Array of Match objects.
|
|
106
107
|
def patscan(pattern, options = {})
|
|
107
|
-
options[:start]
|
|
108
|
-
options[:stop]
|
|
108
|
+
options[:start] ||= 0
|
|
109
|
+
options[:stop] ||= length - 1
|
|
109
110
|
options[:max_mismatches] ||= 0
|
|
110
111
|
options[:max_insertions] ||= 0
|
|
111
|
-
options[:max_deletions]
|
|
112
|
+
options[:max_deletions] ||= 0
|
|
112
113
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
114
|
+
unless pattern.downcase =~ OK_PATTERN
|
|
115
|
+
fail BackTrackError, "Bad pattern: #{pattern}"
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
unless (0...length).include? options[:start]
|
|
119
|
+
fail BackTrackError, "start: #{options[:start]} out of range " \
|
|
120
|
+
"(0..#{length - 1})"
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
unless (0...length).include? options[:stop]
|
|
124
|
+
fail BackTrackError, "stop: #{options[:stop]} out of range " \
|
|
125
|
+
"(0..#{length - 1})"
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
unless (0..MAX_MIS).include? options[:max_mismatches]
|
|
129
|
+
fail BackTrackError, "max_mismatches: #{options[:max_mismatches]} " \
|
|
130
|
+
"out of range (0..#{MAX_MIS})"
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
unless (0..MAX_INS).include? options[:max_insertions]
|
|
134
|
+
fail BackTrackError, "max_insertions: #{options[:max_insertions]} " \
|
|
135
|
+
"out of range (0..#{MAX_INS})"
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
unless (0..MAX_DEL).include? options[:max_deletions]
|
|
139
|
+
fail BackTrackError, "max_deletions: #{options[:max_deletions]} " \
|
|
140
|
+
"out of range (0..#{MAX_DEL})"
|
|
141
|
+
end
|
|
119
142
|
|
|
120
143
|
matches = []
|
|
121
144
|
|
|
122
|
-
while result = scan_C(
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
options[:max_insertions],
|
|
128
|
-
options[:max_deletions]
|
|
129
|
-
)
|
|
130
|
-
match = Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
|
|
145
|
+
while (result = scan_C(@seq, pattern, options[:start], options[:stop],
|
|
146
|
+
options[:max_mismatches], options[:max_insertions],
|
|
147
|
+
options[:max_deletions]))
|
|
148
|
+
match = Match.new(result.first, result.last,
|
|
149
|
+
@seq[result.first...result.first + result.last])
|
|
131
150
|
|
|
132
151
|
if block_given?
|
|
133
152
|
yield match
|
|
@@ -146,10 +165,11 @@ module BioDSL
|
|
|
146
165
|
inline do |builder|
|
|
147
166
|
add_ambiguity_macro(builder)
|
|
148
167
|
|
|
149
|
-
# Backtrack algorithm for matching a pattern (p) starting in a sequence
|
|
150
|
-
# mismatches, ins insertions and del deletions. ss is
|
|
151
|
-
#
|
|
152
|
-
# are
|
|
168
|
+
# Backtrack algorithm for matching a pattern (p) starting in a sequence
|
|
169
|
+
# (s) allowing for mis mismatches, ins insertions and del deletions. ss is
|
|
170
|
+
# the start of the sequence, used only for reporting the match endpoints.
|
|
171
|
+
# State is used to avoid ins followed by del and vice versa which are
|
|
172
|
+
# nonsense.
|
|
153
173
|
builder.prefix %{
|
|
154
174
|
unsigned int backtrack(
|
|
155
175
|
char *ss, // Sequence start
|
|
@@ -177,9 +197,9 @@ module BioDSL
|
|
|
177
197
|
return 0;
|
|
178
198
|
}
|
|
179
199
|
}
|
|
180
|
-
|
|
181
|
-
# Find pattern (p) in a sequence (s) starting at pos, with at most mis
|
|
182
|
-
# insertions and del deletions.
|
|
200
|
+
|
|
201
|
+
# Find pattern (p) in a sequence (s) starting at pos, with at most mis
|
|
202
|
+
# mismatches, ins insertions and del deletions.
|
|
183
203
|
builder.c %{
|
|
184
204
|
VALUE scan_C(
|
|
185
205
|
VALUE _s, // Sequence
|
|
@@ -247,6 +267,3 @@ module BioDSL
|
|
|
247
267
|
end
|
|
248
268
|
end
|
|
249
269
|
end
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
__END__
|
data/lib/BioDSL/seq/digest.rb
CHANGED
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
# #
|
|
22
22
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
23
|
# #
|
|
24
|
-
# This software is part of BioDSL (
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
25
|
# #
|
|
26
26
|
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
27
|
|
data/lib/BioDSL/seq/dynamic.rb
CHANGED
|
@@ -1,35 +1,36 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
3
|
-
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk).
|
|
4
|
-
#
|
|
5
|
-
# This program is free software; you can redistribute it and/or
|
|
6
|
-
# modify it under the terms of the GNU General Public License
|
|
7
|
-
# as published by the Free Software Foundation; either version 2
|
|
8
|
-
# of the License, or (at your option) any later version.
|
|
9
|
-
#
|
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13
|
-
# GNU General Public License for more details.
|
|
14
|
-
#
|
|
15
|
-
# You should have received a copy of the GNU General Public License
|
|
16
|
-
# along with this program; if not, write to the Free Software
|
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
#
|
|
25
|
-
#
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
26
27
|
|
|
27
28
|
module BioDSL
|
|
28
29
|
# Error class for Dynamic.
|
|
29
30
|
class DynamicError < StandardError; end
|
|
30
31
|
|
|
31
|
-
# Module containing code to locate nucleotide patterns in sequences allowing
|
|
32
|
-
# ambiguity codes and a given maximum edit distance.
|
|
32
|
+
# Module containing code to locate nucleotide patterns in sequences allowing
|
|
33
|
+
# for ambiguity codes and a given maximum edit distance.
|
|
33
34
|
# Insertions are nucleotides found in the pattern but not in the sequence.
|
|
34
35
|
# Deletions are nucleotides found in the sequence but not in the pattern.
|
|
35
36
|
#
|
|
@@ -38,7 +39,7 @@ module BioDSL
|
|
|
38
39
|
module Dynamic
|
|
39
40
|
extend BioDSL::Ambiguity
|
|
40
41
|
|
|
41
|
-
#
|
|
42
|
+
# --------------------------------------------------------------------------
|
|
42
43
|
# str.patmatch(pattern[, pos[, max_edit_distance]])
|
|
43
44
|
# -> Match or nil
|
|
44
45
|
# str.patscan(pattern[, pos[, max_edit_distance]]) { |match|
|
|
@@ -46,16 +47,16 @@ module BioDSL
|
|
|
46
47
|
# }
|
|
47
48
|
# -> Match
|
|
48
49
|
#
|
|
49
|
-
#
|
|
50
|
+
# --------------------------------------------------------------------------
|
|
50
51
|
# Method to iterate through a sequence to locate the first pattern match
|
|
51
52
|
# starting from a given position and allowing for a maximum edit distance.
|
|
52
53
|
def patmatch(pattern, pos = 0, max_edit_distance = 0)
|
|
53
|
-
|
|
54
|
+
patscan(pattern, pos, max_edit_distance) do |m|
|
|
54
55
|
return m
|
|
55
56
|
end
|
|
56
57
|
end
|
|
57
58
|
|
|
58
|
-
#
|
|
59
|
+
# --------------------------------------------------------------------------
|
|
59
60
|
# str.patscan(pattern[, pos[, max_edit_distance]])
|
|
60
61
|
# -> Array or nil
|
|
61
62
|
# str.patscan(pattern[, pos[, max_edit_distance]]) { |match|
|
|
@@ -63,16 +64,17 @@ module BioDSL
|
|
|
63
64
|
# }
|
|
64
65
|
# -> Match
|
|
65
66
|
#
|
|
66
|
-
#
|
|
67
|
-
# Method to iterate through a sequence to locate pattern matches starting
|
|
68
|
-
# given position and allowing for a maximum edit distance. Matches
|
|
69
|
-
# block context return the Match object. Otherwise matches are
|
|
70
|
-
# Array.
|
|
67
|
+
# --------------------------------------------------------------------------
|
|
68
|
+
# Method to iterate through a sequence to locate pattern matches starting
|
|
69
|
+
# from a given position and allowing for a maximum edit distance. Matches
|
|
70
|
+
# found in block context return the Match object. Otherwise matches are
|
|
71
|
+
# returned in an Array.
|
|
71
72
|
def patscan(pattern, pos = 0, max_edit_distance = 0)
|
|
72
73
|
matches = []
|
|
73
74
|
|
|
74
|
-
while result = match_C(
|
|
75
|
-
|
|
75
|
+
while (result = match_C(@seq, length, pattern, pattern.length, pos,
|
|
76
|
+
max_edit_distance))
|
|
77
|
+
match = Match.new(*result, @seq[result[0]...result[0] + result[1]])
|
|
76
78
|
|
|
77
79
|
if block_given?
|
|
78
80
|
yield match
|
|
@@ -97,7 +99,7 @@ module BioDSL
|
|
|
97
99
|
}
|
|
98
100
|
|
|
99
101
|
builder.prefix %{
|
|
100
|
-
typedef struct
|
|
102
|
+
typedef struct
|
|
101
103
|
{
|
|
102
104
|
unsigned int mis;
|
|
103
105
|
unsigned int ins;
|
|
@@ -207,7 +209,7 @@ module BioDSL
|
|
|
207
209
|
unsigned int pat_len = FIX2UINT(_pat_len);
|
|
208
210
|
unsigned int pos = FIX2UINT(_pos);
|
|
209
211
|
unsigned int max_ed = FIX2UINT(_max_ed);
|
|
210
|
-
|
|
212
|
+
|
|
211
213
|
score vec[MAX_PAT] = {0};
|
|
212
214
|
unsigned int vec_len = pat_len + 1;
|
|
213
215
|
unsigned int match_beg = 0;
|
|
@@ -244,20 +246,6 @@ module BioDSL
|
|
|
244
246
|
}
|
|
245
247
|
end
|
|
246
248
|
|
|
247
|
-
|
|
248
|
-
attr_accessor :beg, :length, :mis, :ins, :del, :match
|
|
249
|
-
|
|
250
|
-
def initialize(beg, length, mis, ins, del, match)
|
|
251
|
-
@beg = beg
|
|
252
|
-
@length = length
|
|
253
|
-
@mis = mis
|
|
254
|
-
@ins = ins
|
|
255
|
-
@del = del
|
|
256
|
-
@match = match
|
|
257
|
-
end
|
|
258
|
-
end
|
|
249
|
+
Match = Struct.new(:beg, :length, :mis, :ins, :del, :match)
|
|
259
250
|
end
|
|
260
251
|
end
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
__END__
|
data/lib/BioDSL/seq/homopolymer.rb
CHANGED
|
@@ -1,39 +1,45 @@
|
|
|
1
|
-
#
|
|
2
|
-
#
|
|
3
|
-
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk).
|
|
4
|
-
#
|
|
5
|
-
# This program is free software; you can redistribute it and/or
|
|
6
|
-
# modify it under the terms of the GNU General Public License
|
|
7
|
-
# as published by the Free Software Foundation; either version 2
|
|
8
|
-
# of the License, or (at your option) any later version.
|
|
9
|
-
#
|
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13
|
-
# GNU General Public License for more details.
|
|
14
|
-
#
|
|
15
|
-
# You should have received a copy of the GNU General Public License
|
|
16
|
-
# along with this program; if not, write to the Free Software
|
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
#
|
|
25
|
-
#
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of BioDSL (http://maasha.github.io/BioDSL). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
26
27
|
|
|
28
|
+
# Namespace for BioDSL.
|
|
27
29
|
module BioDSL
|
|
28
30
|
# Error class for all exceptions to do with Homopolymer.
|
|
29
31
|
class HomopolymerError < StandardError; end
|
|
30
32
|
|
|
33
|
+
# Namespace for Homopolymer
|
|
31
34
|
module Homopolymer
|
|
32
35
|
def each_homopolymer(min = 1)
|
|
33
|
-
|
|
36
|
+
fail HomopolymerError, "Bad min value: #{min}" if min <= 0
|
|
34
37
|
list = []
|
|
35
38
|
|
|
36
|
-
|
|
39
|
+
regex = Regexp.new("A{#{min},}|T{#{min},}|G{#{min},}|C{#{min},}|" \
|
|
40
|
+
"N{#{min},}")
|
|
41
|
+
|
|
42
|
+
@seq.upcase.scan(regex) do |match|
|
|
37
43
|
hp = Homopolymer.new(match, match.length, $`.length)
|
|
38
44
|
|
|
39
45
|
if block_given?
|
|
@@ -46,14 +52,6 @@ module BioDSL
|
|
|
46
52
|
block_given? ? self : list
|
|
47
53
|
end
|
|
48
54
|
|
|
49
|
-
|
|
50
|
-
attr_reader :pattern, :length, :pos
|
|
51
|
-
|
|
52
|
-
def initialize(pattern, length, pos)
|
|
53
|
-
@pattern = pattern
|
|
54
|
-
@length = length
|
|
55
|
-
@pos = pos
|
|
56
|
-
end
|
|
57
|
-
end
|
|
55
|
+
Homopolymer = Struct.new(:pattern, :length, :pos)
|
|
58
56
|
end
|
|
59
57
|
end
|