BioDSL 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (186) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/BioDSL.gemspec +1 -1
  4. data/Gemfile +6 -0
  5. data/README.md +289 -155
  6. data/Rakefile +18 -16
  7. data/lib/BioDSL.rb +1 -1
  8. data/lib/BioDSL/cary.rb +78 -53
  9. data/lib/BioDSL/command.rb +2 -2
  10. data/lib/BioDSL/commands.rb +1 -1
  11. data/lib/BioDSL/commands/add_key.rb +1 -1
  12. data/lib/BioDSL/commands/align_seq_mothur.rb +4 -4
  13. data/lib/BioDSL/commands/analyze_residue_distribution.rb +5 -5
  14. data/lib/BioDSL/commands/assemble_pairs.rb +13 -13
  15. data/lib/BioDSL/commands/assemble_seq_idba.rb +7 -9
  16. data/lib/BioDSL/commands/assemble_seq_ray.rb +13 -13
  17. data/lib/BioDSL/commands/assemble_seq_spades.rb +4 -4
  18. data/lib/BioDSL/commands/classify_seq.rb +8 -8
  19. data/lib/BioDSL/commands/classify_seq_mothur.rb +5 -5
  20. data/lib/BioDSL/commands/clip_primer.rb +7 -7
  21. data/lib/BioDSL/commands/cluster_otus.rb +5 -5
  22. data/lib/BioDSL/commands/collapse_otus.rb +2 -2
  23. data/lib/BioDSL/commands/collect_otus.rb +2 -2
  24. data/lib/BioDSL/commands/complement_seq.rb +4 -4
  25. data/lib/BioDSL/commands/count.rb +1 -1
  26. data/lib/BioDSL/commands/count_values.rb +2 -2
  27. data/lib/BioDSL/commands/degap_seq.rb +6 -7
  28. data/lib/BioDSL/commands/dereplicate_seq.rb +1 -1
  29. data/lib/BioDSL/commands/dump.rb +2 -2
  30. data/lib/BioDSL/commands/filter_rrna.rb +4 -4
  31. data/lib/BioDSL/commands/genecall.rb +7 -7
  32. data/lib/BioDSL/commands/grab.rb +1 -1
  33. data/lib/BioDSL/commands/index_taxonomy.rb +3 -3
  34. data/lib/BioDSL/commands/mask_seq.rb +4 -4
  35. data/lib/BioDSL/commands/mean_scores.rb +2 -2
  36. data/lib/BioDSL/commands/merge_pair_seq.rb +3 -3
  37. data/lib/BioDSL/commands/merge_table.rb +1 -1
  38. data/lib/BioDSL/commands/merge_values.rb +1 -1
  39. data/lib/BioDSL/commands/plot_heatmap.rb +4 -5
  40. data/lib/BioDSL/commands/plot_histogram.rb +4 -4
  41. data/lib/BioDSL/commands/plot_matches.rb +5 -5
  42. data/lib/BioDSL/commands/plot_residue_distribution.rb +6 -6
  43. data/lib/BioDSL/commands/plot_scores.rb +7 -7
  44. data/lib/BioDSL/commands/random.rb +1 -1
  45. data/lib/BioDSL/commands/read_fasta.rb +9 -9
  46. data/lib/BioDSL/commands/read_fastq.rb +16 -16
  47. data/lib/BioDSL/commands/read_table.rb +2 -3
  48. data/lib/BioDSL/commands/reverse_seq.rb +4 -4
  49. data/lib/BioDSL/commands/slice_align.rb +4 -4
  50. data/lib/BioDSL/commands/slice_seq.rb +3 -3
  51. data/lib/BioDSL/commands/sort.rb +1 -1
  52. data/lib/BioDSL/commands/split_pair_seq.rb +6 -7
  53. data/lib/BioDSL/commands/split_values.rb +2 -2
  54. data/lib/BioDSL/commands/trim_primer.rb +13 -8
  55. data/lib/BioDSL/commands/trim_seq.rb +5 -5
  56. data/lib/BioDSL/commands/uchime_ref.rb +6 -6
  57. data/lib/BioDSL/commands/uclust.rb +5 -5
  58. data/lib/BioDSL/commands/unique_values.rb +1 -1
  59. data/lib/BioDSL/commands/usearch_global.rb +2 -2
  60. data/lib/BioDSL/commands/usearch_local.rb +2 -2
  61. data/lib/BioDSL/commands/write_fasta.rb +7 -9
  62. data/lib/BioDSL/commands/write_fastq.rb +4 -4
  63. data/lib/BioDSL/commands/write_table.rb +3 -3
  64. data/lib/BioDSL/commands/write_tree.rb +2 -3
  65. data/lib/BioDSL/config.rb +2 -2
  66. data/lib/BioDSL/csv.rb +8 -10
  67. data/lib/BioDSL/debug.rb +1 -1
  68. data/lib/BioDSL/fasta.rb +54 -40
  69. data/lib/BioDSL/fastq.rb +35 -32
  70. data/lib/BioDSL/filesys.rb +56 -47
  71. data/lib/BioDSL/fork.rb +1 -1
  72. data/lib/BioDSL/hamming.rb +1 -1
  73. data/lib/BioDSL/helpers.rb +1 -1
  74. data/lib/BioDSL/helpers/aux_helper.rb +1 -1
  75. data/lib/BioDSL/helpers/email_helper.rb +1 -1
  76. data/lib/BioDSL/helpers/history_helper.rb +1 -1
  77. data/lib/BioDSL/helpers/log_helper.rb +1 -1
  78. data/lib/BioDSL/helpers/options_helper.rb +1 -1
  79. data/lib/BioDSL/helpers/status_helper.rb +1 -1
  80. data/lib/BioDSL/html_report.rb +1 -1
  81. data/lib/BioDSL/math.rb +1 -1
  82. data/lib/BioDSL/mummer.rb +1 -1
  83. data/lib/BioDSL/pipeline.rb +1 -1
  84. data/lib/BioDSL/seq.rb +240 -231
  85. data/lib/BioDSL/seq/ambiguity.rb +1 -1
  86. data/lib/BioDSL/seq/assemble.rb +1 -1
  87. data/lib/BioDSL/seq/backtrack.rb +93 -76
  88. data/lib/BioDSL/seq/digest.rb +1 -1
  89. data/lib/BioDSL/seq/dynamic.rb +43 -55
  90. data/lib/BioDSL/seq/homopolymer.rb +34 -36
  91. data/lib/BioDSL/seq/kmer.rb +67 -50
  92. data/lib/BioDSL/seq/levenshtein.rb +35 -40
  93. data/lib/BioDSL/seq/translate.rb +64 -55
  94. data/lib/BioDSL/seq/trim.rb +60 -50
  95. data/lib/BioDSL/serializer.rb +1 -1
  96. data/lib/BioDSL/stream.rb +1 -1
  97. data/lib/BioDSL/taxonomy.rb +1 -1
  98. data/lib/BioDSL/test.rb +1 -1
  99. data/lib/BioDSL/tmp_dir.rb +1 -1
  100. data/lib/BioDSL/usearch.rb +1 -1
  101. data/lib/BioDSL/verbose.rb +1 -1
  102. data/lib/BioDSL/version.rb +2 -2
  103. data/test/BioDSL/commands/test_add_key.rb +1 -1
  104. data/test/BioDSL/commands/test_align_seq_mothur.rb +1 -1
  105. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +1 -1
  106. data/test/BioDSL/commands/test_assemble_pairs.rb +1 -1
  107. data/test/BioDSL/commands/test_assemble_seq_idba.rb +1 -1
  108. data/test/BioDSL/commands/test_assemble_seq_ray.rb +1 -1
  109. data/test/BioDSL/commands/test_assemble_seq_spades.rb +1 -1
  110. data/test/BioDSL/commands/test_classify_seq.rb +1 -1
  111. data/test/BioDSL/commands/test_classify_seq_mothur.rb +1 -1
  112. data/test/BioDSL/commands/test_clip_primer.rb +1 -1
  113. data/test/BioDSL/commands/test_cluster_otus.rb +1 -1
  114. data/test/BioDSL/commands/test_collapse_otus.rb +1 -1
  115. data/test/BioDSL/commands/test_collect_otus.rb +1 -1
  116. data/test/BioDSL/commands/test_complement_seq.rb +1 -1
  117. data/test/BioDSL/commands/test_count.rb +1 -1
  118. data/test/BioDSL/commands/test_count_values.rb +1 -1
  119. data/test/BioDSL/commands/test_degap_seq.rb +1 -1
  120. data/test/BioDSL/commands/test_dereplicate_seq.rb +1 -1
  121. data/test/BioDSL/commands/test_dump.rb +1 -1
  122. data/test/BioDSL/commands/test_filter_rrna.rb +1 -1
  123. data/test/BioDSL/commands/test_genecall.rb +1 -1
  124. data/test/BioDSL/commands/test_grab.rb +1 -1
  125. data/test/BioDSL/commands/test_index_taxonomy.rb +1 -1
  126. data/test/BioDSL/commands/test_mask_seq.rb +1 -1
  127. data/test/BioDSL/commands/test_mean_scores.rb +1 -1
  128. data/test/BioDSL/commands/test_merge_pair_seq.rb +1 -1
  129. data/test/BioDSL/commands/test_merge_table.rb +1 -1
  130. data/test/BioDSL/commands/test_merge_values.rb +1 -1
  131. data/test/BioDSL/commands/test_plot_heatmap.rb +1 -1
  132. data/test/BioDSL/commands/test_plot_histogram.rb +1 -1
  133. data/test/BioDSL/commands/test_plot_matches.rb +1 -1
  134. data/test/BioDSL/commands/test_plot_residue_distribution.rb +1 -1
  135. data/test/BioDSL/commands/test_plot_scores.rb +1 -1
  136. data/test/BioDSL/commands/test_random.rb +1 -1
  137. data/test/BioDSL/commands/test_read_fasta.rb +1 -1
  138. data/test/BioDSL/commands/test_read_fastq.rb +1 -1
  139. data/test/BioDSL/commands/test_read_table.rb +1 -1
  140. data/test/BioDSL/commands/test_reverse_seq.rb +1 -1
  141. data/test/BioDSL/commands/test_slice_align.rb +1 -1
  142. data/test/BioDSL/commands/test_slice_seq.rb +1 -1
  143. data/test/BioDSL/commands/test_sort.rb +1 -1
  144. data/test/BioDSL/commands/test_split_pair_seq.rb +1 -1
  145. data/test/BioDSL/commands/test_split_values.rb +1 -1
  146. data/test/BioDSL/commands/test_trim_primer.rb +1 -1
  147. data/test/BioDSL/commands/test_trim_seq.rb +1 -1
  148. data/test/BioDSL/commands/test_uchime_ref.rb +1 -1
  149. data/test/BioDSL/commands/test_uclust.rb +1 -1
  150. data/test/BioDSL/commands/test_unique_values.rb +1 -1
  151. data/test/BioDSL/commands/test_usearch_global.rb +1 -1
  152. data/test/BioDSL/commands/test_usearch_local.rb +1 -1
  153. data/test/BioDSL/commands/test_write_fasta.rb +1 -1
  154. data/test/BioDSL/commands/test_write_fastq.rb +1 -1
  155. data/test/BioDSL/commands/test_write_table.rb +1 -1
  156. data/test/BioDSL/commands/test_write_tree.rb +1 -1
  157. data/test/BioDSL/helpers/test_options_helper.rb +3 -3
  158. data/test/BioDSL/seq/test_assemble.rb +58 -56
  159. data/test/BioDSL/seq/test_backtrack.rb +83 -81
  160. data/test/BioDSL/seq/test_digest.rb +47 -45
  161. data/test/BioDSL/seq/test_dynamic.rb +66 -64
  162. data/test/BioDSL/seq/test_homopolymer.rb +35 -33
  163. data/test/BioDSL/seq/test_kmer.rb +29 -28
  164. data/test/BioDSL/seq/test_translate.rb +44 -42
  165. data/test/BioDSL/seq/test_trim.rb +59 -57
  166. data/test/BioDSL/test_cary.rb +1 -1
  167. data/test/BioDSL/test_command.rb +2 -2
  168. data/test/BioDSL/test_csv.rb +34 -31
  169. data/test/BioDSL/test_debug.rb +31 -31
  170. data/test/BioDSL/test_fasta.rb +30 -29
  171. data/test/BioDSL/test_fastq.rb +27 -26
  172. data/test/BioDSL/test_filesys.rb +28 -27
  173. data/test/BioDSL/test_fork.rb +29 -28
  174. data/test/BioDSL/test_math.rb +31 -30
  175. data/test/BioDSL/test_mummer.rb +1 -1
  176. data/test/BioDSL/test_pipeline.rb +1 -1
  177. data/test/BioDSL/test_seq.rb +42 -41
  178. data/test/BioDSL/test_serializer.rb +35 -33
  179. data/test/BioDSL/test_stream.rb +28 -27
  180. data/test/BioDSL/test_taxonomy.rb +38 -37
  181. data/test/BioDSL/test_test.rb +32 -31
  182. data/test/BioDSL/test_tmp_dir.rb +1 -1
  183. data/test/BioDSL/test_usearch.rb +28 -27
  184. data/test/BioDSL/test_verbose.rb +32 -31
  185. data/test/helper.rb +34 -31
  186. metadata +3 -2
@@ -1,35 +1,39 @@
1
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
- # #
3
- # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
- # #
5
- # This program is free software; you can redistribute it and/or #
6
- # modify it under the terms of the GNU General Public License #
7
- # as published by the Free Software Foundation; either version 2 #
8
- # of the License, or (at your option) any later version. #
9
- # #
10
- # This program is distributed in the hope that it will be useful, #
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
- # GNU General Public License for more details. #
14
- # #
15
- # You should have received a copy of the GNU General Public License #
16
- # along with this program; if not, write to the Free Software #
17
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
- # #
19
- # http://www.gnu.org/copyleft/gpl.html #
20
- # #
21
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
- # #
23
- # This software is part of BioDSL (www.BioDSL.org). #
24
- # #
25
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
-
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
27
29
  module BioDSL
28
30
  # Error class for all exceptions to do with Filesys.
29
31
  class FilesysError < StandardError; end
30
32
 
33
+ # Class for handling filesystem manipulations.
31
34
  class Filesys
32
35
  require 'open3'
36
+ require 'English'
33
37
 
34
38
  include Enumerable
35
39
 
@@ -40,10 +44,10 @@ module BioDSL
40
44
  exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
41
45
 
42
46
  ENV['PATH'].split(File::PATH_SEPARATOR).each do |path|
43
- exts.each { |ext|
47
+ exts.each do |ext|
44
48
  exe = File.join(path, "#{cmd}#{ext}")
45
49
  return exe if File.executable?(exe) && !File.directory?(exe)
46
- }
50
+ end
47
51
  end
48
52
 
49
53
  nil
@@ -51,14 +55,15 @@ module BioDSL
51
55
 
52
56
  # Class method that returns a path to a unique temporary file.
53
57
  # If no directory is specified reverts to the systems tmp directory.
54
- def self.tmpfile(tmp_dir = ENV["TMPDIR"])
58
+ def self.tmpfile(tmp_dir = ENV['TMPDIR'])
55
59
  time = Time.now.to_i
56
- user = ENV["USER"]
57
- pid = $$
58
- path = tmp_dir + [user, time + pid, pid].join("_") + ".tmp"
60
+ user = ENV['USER']
61
+ pid = $PID
62
+ path = tmp_dir + [user, time + pid, pid].join('_') + '.tmp'
59
63
  path
60
64
  end
61
65
 
66
+ # Open a file which may be compressed with gzip or bzip2.
62
67
  def self.open(*args)
63
68
  file = args.shift
64
69
  mode = args.shift
@@ -67,32 +72,37 @@ module BioDSL
67
72
  if mode == 'w'
68
73
  case options[:compress]
69
74
  when :gzip
70
- ios, = Open3.pipeline_w("gzip -f", out: file)
75
+ ios, = Open3.pipeline_w('gzip -f', out: file)
71
76
  when :bzip, :bzip2
72
- ios, = Open3.pipeline_w("bzip2 -c", out: file)
73
- else
74
- ios = File.open(file, mode, options)
75
- end
76
- else
77
- type = (file.respond_to? :path) ? `file -Lk #{file.path}` : `file -Lk #{file}`
78
- case type
79
- when /gzip/
80
- ios = IO.popen("gzip -cd #{file}")
81
- when /bzip/
82
- ios = IO.popen("bzcat #{file}")
77
+ ios, = Open3.pipeline_w('bzip2 -c', out: file)
83
78
  else
84
79
  ios = File.open(file, mode, options)
85
80
  end
81
+ else
82
+ type = if file.respond_to? :path
83
+ `file -Lk #{file.path}`
84
+ else
85
+ `file -Lk #{file}`
86
+ end
87
+
88
+ ios = case type
89
+ when /gzip/
90
+ IO.popen("gzip -cd #{file}")
91
+ when /bzip/
92
+ IO.popen("bzcat #{file}")
93
+ else
94
+ File.open(file, mode, options)
95
+ end
86
96
  end
87
97
 
88
98
  if block_given?
89
99
  begin
90
- yield self.new(ios)
100
+ yield new(ios)
91
101
  ensure
92
102
  ios.close
93
103
  end
94
104
  else
95
- return self.new(ios)
105
+ return new(ios)
96
106
  end
97
107
  end
98
108
 
@@ -134,4 +144,3 @@ module BioDSL
134
144
  end
135
145
  end
136
146
  end
137
-
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
  module BioDSL
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -20,7 +20,7 @@
20
20
  # #
21
21
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
22
  # #
23
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
23
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
24
24
  # #
25
25
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
26
 
@@ -20,7 +20,7 @@
20
20
  # #
21
21
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
22
  # #
23
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
23
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
24
24
  # #
25
25
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
26
  module BioDSL
@@ -1,30 +1,33 @@
1
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
- # #
3
- # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
- # #
5
- # This program is free software; you can redistribute it and/or #
6
- # modify it under the terms of the GNU General Public License #
7
- # as published by the Free Software Foundation; either version 2 #
8
- # of the License, or (at your option) any later version. #
9
- # #
10
- # This program is distributed in the hope that it will be useful, #
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
- # GNU General Public License for more details. #
14
- # #
15
- # You should have received a copy of the GNU General Public License #
16
- # along with this program; if not, write to the Free Software #
17
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
- # #
19
- # http://www.gnu.org/copyleft/gpl.html #
20
- # #
21
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
- # #
23
- # This software is part BioDSL (www.BioDSL.org). #
24
- # #
25
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
-
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
27
29
  module BioDSL
30
+ require 'English'
28
31
  require 'narray'
29
32
  require 'BioDSL/seq/ambiguity'
30
33
  require 'BioDSL/seq/assemble'
@@ -40,12 +43,15 @@ module BioDSL
40
43
  # Error class for all exceptions to do with Seq.
41
44
  class SeqError < StandardError; end
42
45
 
46
+ # rubocop: disable ClassLength
47
+
48
+ # Class for manipulating sequences.
43
49
  class Seq
44
50
  # Residue alphabets
45
- DNA = %w[a t c g]
46
- RNA = %w[a u c g]
47
- PROTEIN = %w[f l s y c w p h q r i m t n k v a d e g]
48
- INDELS = %w[. - _ ~]
51
+ DNA = %w(a t c g)
52
+ RNA = %w(a u c g)
53
+ PROTEIN = %w(f l s y c w p h q r i m t n k v a d e g)
54
+ INDELS = %w(. - _ ~)
49
55
 
50
56
  # Quality scores bases
51
57
  SCORE_BASE = 33
@@ -69,30 +75,29 @@ module BioDSL
69
75
  type = record[:SEQ_TYPE].to_sym if record[:SEQ_TYPE]
70
76
  qual = record[:SCORES]
71
77
 
72
- self.new(seq_name: seq_name, seq: seq, type: type, qual: qual)
78
+ new(seq_name: seq_name, seq: seq, type: type, qual: qual)
73
79
  end
74
80
 
75
- # Class method that generates all possible oligos of a specifed length and type.
81
+ # Class method that generates all possible oligos of a specified length and
82
+ # type.
76
83
  def self.generate_oligos(length, type)
77
- raise SeqError, "Cannot generate oligos of zero or negative length: #{length}" if length <= 0
84
+ fail SeqError, "Bad length: #{length}" if length <= 0
78
85
 
79
86
  case type.downcase
80
87
  when :dna then alph = DNA
81
88
  when :rna then alph = RNA
82
89
  when :protein then alph = PROTEIN
83
90
  else
84
- raise SeqError, "Unknown sequence type: #{type}"
91
+ fail SeqError, "Unknown sequence type: #{type}"
85
92
  end
86
93
 
87
- oligos = [""]
94
+ oligos = ['']
88
95
 
89
- (1 .. length).each do
96
+ (1..length).each do
90
97
  list = []
91
98
 
92
99
  oligos.each do |oligo|
93
- alph.each do |char|
94
- list << oligo + char
95
- end
100
+ alph.each { |char| list << oligo + char }
96
101
  end
97
102
 
98
103
  oligos = list
@@ -103,24 +108,22 @@ module BioDSL
103
108
 
104
109
  def self.check_name_pair(entry1, entry2)
105
110
  if entry1.seq_name =~ /^([^ ]+) \d:/
106
- name1 = $1
107
- elsif entry1.seq_name =~ /^(.+)\/\d$/
108
- name1 = $1
111
+ name1 = Regexp.last_match[1]
112
+ elsif entry1.seq_name =~ %r{^(.+)\/\d$}
113
+ name1 = Regexp.last_match[1]
109
114
  else
110
- raise SeqError, "Could not match sequence name: #{entry1.seq_name}"
115
+ fail SeqError, "Could not match sequence name: #{entry1.seq_name}"
111
116
  end
112
117
 
113
118
  if entry2.seq_name =~ /^([^ ]+) \d:/
114
- name2 = $1
115
- elsif entry2.seq_name =~ /^(.+)\/\d$/
116
- name2 = $1
119
+ name2 = Regexp.last_match[1]
120
+ elsif entry2.seq_name =~ %r{^(.+)\/\d$}
121
+ name2 = Regexp.last_match[1]
117
122
  else
118
- raise SeqError, "Could not match sequence name: #{entry2.seq_name}"
123
+ fail SeqError, "Could not match sequence name: #{entry2.seq_name}"
119
124
  end
120
125
 
121
- if name1 != name2
122
- raise SeqError, "Name mismatch: #{name1} != #{name2}"
123
- end
126
+ fail SeqError, "Name mismatch: #{name1} != #{name2}" if name1 != name2
124
127
  end
125
128
 
126
129
  # Initialize a sequence object with the following options:
@@ -134,18 +137,19 @@ module BioDSL
134
137
  @type = options[:type]
135
138
  @qual = options[:qual]
136
139
 
137
- if @seq and @qual and @seq.length != @qual.length
138
- raise SeqError, "Sequence length and score length mismatch:" \
139
- "#{@seq.length} != #{@qual.length}"
140
- end
140
+ return unless @seq && @qual
141
+ return if @seq.length == @qual.length
142
+
143
+ fail SeqError, 'Sequence length and score length mismatch: ' \
144
+ "#{@seq.length} != #{@qual.length}"
141
145
  end
142
146
 
143
147
  # Method that guesses and returns the sequence type
144
148
  # by inspecting the first 100 residues.
145
149
  def type_guess
146
- raise SeqError, "Guess failed: sequence is nil" if self.seq.nil?
150
+ fail SeqError, 'Guess failed: sequence is nil' if @seq.nil?
147
151
 
148
- case self.seq[0 ... 100].downcase
152
+ case @seq[0...100].downcase
149
153
  when /[flpqie]/ then return :protein
150
154
  when /[u]/ then return :rna
151
155
  else return :dna
@@ -155,31 +159,31 @@ module BioDSL
155
159
  # Method that guesses and sets the sequence type
156
160
  # by inspecting the first 100 residues.
157
161
  def type_guess!
158
- self.type = self.type_guess
162
+ @type = type_guess
159
163
  self
160
164
  end
161
165
 
162
166
  # Returns the length of a sequence.
163
167
  def length
164
- self.seq.nil? ? 0 : self.seq.length
168
+ @seq.nil? ? 0 : @seq.length
165
169
  end
166
170
 
167
- alias :len :length
171
+ alias_method :len, :length
168
172
 
169
173
  # Return the number of indels in a sequence.
170
174
  def indels
171
175
  regex = Regexp.new(/[#{Regexp.escape(INDELS.join(""))}]/)
172
- self.seq.scan(regex).size
176
+ @seq.scan(regex).size
173
177
  end
174
178
 
175
179
  # Method to remove indels from seq and qual if qual.
176
180
  def indels_remove
177
- if self.qual.nil?
178
- self.seq.delete!(Regexp.escape(INDELS.join('')))
181
+ if @qual.nil?
182
+ @seq.delete!(Regexp.escape(INDELS.join('')))
179
183
  else
180
- na_seq = NArray.to_na(self.seq, "byte")
181
- na_qual = NArray.to_na(self.qual, "byte")
182
- mask = NArray.byte(self.length)
184
+ na_seq = NArray.to_na(@seq, 'byte')
185
+ na_qual = NArray.to_na(@qual, 'byte')
186
+ mask = NArray.byte(length)
183
187
 
184
188
  INDELS.each do |c|
185
189
  mask += na_seq.eq(c.ord)
@@ -187,113 +191,113 @@ module BioDSL
187
191
 
188
192
  mask = mask.eq(0)
189
193
 
190
- self.seq = na_seq[mask].to_s
191
- self.qual = na_qual[mask].to_s
194
+ @seq = na_seq[mask].to_s
195
+ @qual = na_qual[mask].to_s
192
196
  end
193
197
 
194
198
  self
195
199
  end
196
200
 
197
201
  # Method that returns true if the sequence type is DNA.
198
- def is_dna?
199
- self.type == :dna
202
+ def dna?
203
+ @type == :dna
200
204
  end
201
205
 
202
206
  # Method that returns true if the sequence type is RNA.
203
- def is_rna?
204
- self.type == :rna
207
+ def rna?
208
+ @type == :rna
205
209
  end
206
210
 
207
211
  # Method that returns true if the sequence type is protein.
208
- def is_protein?
209
- self.type == :protein
212
+ def protein?
213
+ @type == :protein
210
214
  end
211
215
 
212
216
  # Method to transcribe DNA to RNA.
213
217
  def to_rna
214
- raise SeqError, "Cannot transcribe 0 length sequence" if self.length == 0
215
- raise SeqError, "Cannot transcribe sequence type: #{self.type}" unless self.is_dna?
216
- self.type = :rna
217
- self.seq.tr!('Tt','Uu')
218
+ fail SeqError, 'Cannot transcribe 0 length sequence' if length == 0
219
+ fail SeqError, 'Cannot transcribe sequence type: #{@type}' unless dna?
220
+ @type = :rna
221
+ @seq.tr!('Tt', 'Uu')
218
222
  end
219
223
 
220
224
  # Method to reverse-transcribe RNA to DNA.
221
225
  def to_dna
222
- raise SeqError, "Cannot reverse-transcribe 0 length sequence" if self.length == 0
223
- raise SeqError, "Cannot reverse-transcribe sequence type: #{self.type}" unless self.is_rna?
224
- self.type = :dna
225
- self.seq.tr!('Uu','Tt')
226
+ fail SeqError, 'Cant reverse-transcribe 0 length sequence' if length == 0
227
+ fail SeqError, "Cant reverse-transcribe seq type: #{@type}" unless rna?
228
+ @type = :dna
229
+ @seq.tr!('Uu', 'Tt')
226
230
  end
227
231
 
228
232
  # Method that given a Seq entry returns a BioDSL record (a hash).
229
233
  def to_bp
230
234
  record = {}
231
- record[:SEQ_NAME] = self.seq_name if self.seq_name
232
- record[:SEQ] = self.seq if self.seq
233
- record[:SEQ_LEN] = self.seq.length if self.seq
234
- record[:SCORES] = self.qual if self.qual
235
+ record[:SEQ_NAME] = @seq_name if @seq_name
236
+ record[:SEQ] = @seq if @seq
237
+ record[:SEQ_LEN] = length if @seq
238
+ record[:SCORES] = @qual if @qual
235
239
  record
236
240
  end
237
241
 
238
242
  # Method that given a Seq entry returns a FASTA entry (a string).
239
243
  def to_fasta(wrap = nil)
240
- raise SeqError, "Missing seq_name" if self.seq_name.nil? or self.seq_name == ''
241
- raise SeqError, "Missing seq" if self.seq.nil? or self.seq.empty?
244
+ fail SeqError, 'Missing seq_name' if @seq_name.nil? || @seq_name == ''
245
+ fail SeqError, 'Missing seq' if @seq.nil? || @seq.empty?
242
246
 
243
- seq_name = self.seq_name.to_s
244
- seq = self.seq.to_s
247
+ seq_name = @seq_name.to_s
248
+ seq = @seq.to_s
245
249
 
246
250
  unless wrap.nil?
247
251
  seq.gsub!(/(.{#{wrap}})/) do |match|
248
- match << $/
252
+ match << $INPUT_RECORD_SEPARATOR
249
253
  end
250
254
 
251
255
  seq.chomp!
252
256
  end
253
257
 
254
- ">#{seq_name}#{$/}#{seq}#{$/}"
258
+ ">#{seq_name}#{$INPUT_RECORD_SEPARATOR}#{seq}#{$INPUT_RECORD_SEPARATOR}"
255
259
  end
256
260
 
257
261
  # Method that given a Seq entry returns a FASTQ entry (a string).
258
262
  def to_fastq
259
- raise SeqError, "Missing seq_name" if self.seq_name.nil?
260
- raise SeqError, "Missing seq" if self.seq.nil?
261
- raise SeqError, "Missing qual" if self.qual.nil?
263
+ fail SeqError, 'Missing seq_name' if @seq_name.nil?
264
+ fail SeqError, 'Missing seq' if @seq.nil?
265
+ fail SeqError, 'Missing qual' if @qual.nil?
262
266
 
263
- seq_name = self.seq_name.to_s
264
- seq = self.seq.to_s
265
- qual = self.qual.to_s
267
+ seq_name = @seq_name.to_s
268
+ seq = @seq.to_s
269
+ qual = @qual.to_s
266
270
 
267
- "@#{seq_name}#{$/}#{seq}#{$/}+#{$/}#{qual}#{$/}"
271
+ "@#{seq_name}#{$RS}#{seq}#{$RS}+#{$RS}#{qual}#{$RS}"
268
272
  end
269
273
 
270
274
  # Method that generates a unique key for a
271
275
  # DNA sequence and return this key as a Fixnum.
272
276
  def to_key
273
277
  key = 0
274
-
275
- self.seq.upcase.each_char do |char|
278
+
279
+ @seq.upcase.each_char do |char|
276
280
  key <<= 2
277
-
281
+
278
282
  case char
279
283
  when 'A' then key |= 0
280
284
  when 'C' then key |= 1
281
285
  when 'G' then key |= 2
282
286
  when 'T' then key |= 3
283
- else raise SeqError, "Bad residue: #{char}"
287
+ else fail SeqError, "Bad residue: #{char}"
284
288
  end
285
289
  end
286
-
290
+
287
291
  key
288
292
  end
289
293
 
290
294
  # Method to reverse the sequence.
291
295
  def reverse
292
296
  entry = Seq.new(
293
- seq_name: self.seq_name,
294
- seq: self.seq.reverse,
295
- type: self.type,
296
- qual: (self.qual ? self.qual.reverse : self.qual)
297
+ seq_name: @seq_name,
298
+ seq: @seq.reverse,
299
+ type: @type,
300
+ qual: (@qual ? @qual.reverse : @qual)
297
301
  )
298
302
 
299
303
  entry
@@ -301,27 +305,25 @@ module BioDSL
301
305
 
302
306
  # Method to reverse the sequence (and qual, if present) in place.
303
307
  def reverse!
304
- self.seq.reverse!
305
- self.qual.reverse! if self.qual
308
+ @seq.reverse!
309
+ @qual.reverse! if @qual
306
310
  self
307
311
  end
308
312
 
309
313
  # Method that complements sequence including ambiguity codes.
310
314
  def complement
311
- raise SeqError, "Cannot complement 0 length sequence" if self.length == 0
315
+ fail SeqError, 'Cannot complement 0 length sequence' if length == 0
312
316
 
313
- entry = Seq.new(
314
- seq_name: self.seq_name,
315
- type: self.type,
316
- qual: self.qual
317
- )
317
+ entry = Seq.new(seq_name: @seq_name, type: @type, qual: @qual)
318
318
 
319
- if self.is_dna?
320
- entry.seq = self.seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn')
321
- elsif self.is_rna?
322
- entry.seq = self.seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn')
319
+ if dna?
320
+ entry.seq = @seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
321
+ 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn')
322
+ elsif rna?
323
+ entry.seq = @seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
324
+ 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn')
323
325
  else
324
- raise SeqError, "Cannot complement sequence type: #{self.type}"
326
+ fail SeqError, "Cannot complement sequence type: #{@type}"
325
327
  end
326
328
 
327
329
  entry
@@ -329,14 +331,16 @@ module BioDSL
329
331
 
330
332
  # Method that complements sequence in place including ambiguity codes.
331
333
  def complement!
332
- raise SeqError, "Cannot complement 0 length sequence" if self.length == 0
333
-
334
- if self.is_dna?
335
- self.seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn')
336
- elsif self.is_rna?
337
- self.seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn')
334
+ fail SeqError, 'Cannot complement 0 length sequence' if length == 0
335
+
336
+ if dna?
337
+ @seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
338
+ 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn')
339
+ elsif rna?
340
+ @seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
341
+ 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn')
338
342
  else
339
- raise SeqError, "Cannot complement sequence type: #{self.type}"
343
+ fail SeqError, "Cannot complement sequence type: #{@type}"
340
344
  end
341
345
 
342
346
  self
@@ -346,68 +350,70 @@ module BioDSL
346
350
  # two Sequence objects (case insensitive).
347
351
  def hamming_distance(entry, options = {})
348
352
  if options[:ambiguity]
349
- BioDSL::Hamming.distance(self.seq, entry.seq, options)
353
+ BioDSL::Hamming.distance(@seq, entry.seq, options)
350
354
  else
351
- BioDSL::Hamming.distance(self.seq.upcase, entry.seq.upcase, options)
355
+ BioDSL::Hamming.distance(@seq.upcase, entry.seq.upcase, options)
352
356
  end
353
357
  end
354
358
 
355
359
  # Method to determine the Edit Distance between
356
360
  # two Sequence objects (case insensitive).
357
361
  def edit_distance(entry)
358
- Levenshtein.distance(self.seq, entry.seq)
362
+ Levenshtein.distance(@seq, entry.seq)
359
363
  end
360
364
 
361
365
  # Method that generates a random sequence of a given length and type.
362
366
  def generate(length, type)
363
- raise SeqError, "Cannot generate sequence length < 1: #{length}" if length <= 0
367
+ fail SeqError, "Cannot generate seq length < 1: #{length}" if length <= 0
364
368
 
365
369
  case type
366
370
  when :dna then alph = DNA
367
371
  when :rna then alph = RNA
368
372
  when :protein then alph = PROTEIN
369
373
  else
370
- raise SeqError, "Unknown sequence type: #{type}"
374
+ fail SeqError, "Unknown sequence type: #{type}"
371
375
  end
372
376
 
373
- seq_new = Array.new(length) { alph[rand(alph.size)] }.join("")
374
- self.seq = seq_new
375
- self.type = type
377
+ seq_new = Array.new(length) { alph[rand(alph.size)] }.join('')
378
+ @seq = seq_new
379
+ @type = type
380
+
376
381
  seq_new
377
382
  end
378
383
 
379
384
  # Method to return a new Seq object with shuffled sequence.
380
385
  def shuffle
381
386
  Seq.new(
382
- seq_name: self.seq_name,
383
- seq: self.seq.split('').shuffle!.join,
384
- type: self.type,
385
- qual: self.qual
387
+ seq_name: @seq_name,
388
+ seq: @seq.split('').shuffle!.join,
389
+ type: @type,
390
+ qual: @qual
386
391
  )
387
392
  end
388
393
 
389
394
  # Method to shuffle a sequence randomly inline.
390
395
  def shuffle!
391
- self.seq = self.seq.split('').shuffle!.join
396
+ @seq = @seq.split('').shuffle!.join
392
397
  self
393
398
  end
394
399
 
395
400
  # Method to add two Seq objects.
396
- def +(entry)
397
- new_entry = Seq.new()
398
- new_entry.seq = self.seq + entry.seq
399
- new_entry.type = self.type if self.type == entry.type
400
- new_entry.qual = self.qual + entry.qual if self.qual and entry.qual
401
+ def +(other)
402
+ new_entry = Seq.new
403
+ new_entry.seq = @seq + other.seq
404
+ new_entry.type = @type if @type == other.type
405
+ new_entry.qual = @qual + other.qual if @qual && other.qual
401
406
  new_entry
402
407
  end
403
408
 
404
409
  # Method to concatenate sequence entries.
405
410
  def <<(entry)
406
- raise SeqError, "sequences of different types" unless self.type == entry.type
407
- raise SeqError, "qual is missing in one entry" unless self.qual.class == entry.qual.class
411
+ fail SeqError, 'sequences of different types' unless @type == entry.type
412
+ fail SeqError, 'qual is missing in one entry' unless @qual.class ==
413
+ entry.qual.class
408
414
 
409
- self.seq << entry.seq
410
- self.qual << entry.qual unless entry.qual.nil?
415
+ @seq << entry.seq
416
+ @qual << entry.qual unless entry.qual.nil?
411
417
 
412
418
  self
413
419
  end
@@ -415,18 +421,18 @@ module BioDSL
415
421
  # Index method for Seq objects.
416
422
  def [](*args)
417
423
  entry = Seq.new
418
- entry.seq_name = self.seq_name.dup unless self.seq_name.nil?
419
- entry.seq = self.seq[*args] || ""
420
- entry.type = self.type
421
- entry.qual = self.qual[*args] || "" unless self.qual.nil?
424
+ entry.seq_name = @seq_name.dup unless @seq_name.nil?
425
+ entry.seq = @seq[*args] || ''
426
+ entry.type = @type
427
+ entry.qual = @qual[*args] || '' unless @qual.nil?
422
428
 
423
429
  entry
424
430
  end
425
431
 
426
432
  # Index assignment method for Seq objects.
427
433
  def []=(*args, entry)
428
- self.seq[*args] = entry.seq[*args]
429
- self.qual[*args] = entry.qual[*args] unless self.qual.nil?
434
+ @seq[*args] = entry.seq[*args]
435
+ @qual[*args] = entry.qual[*args] unless @qual.nil?
430
436
 
431
437
  self
432
438
  end
@@ -437,7 +443,7 @@ module BioDSL
437
443
  def composition
438
444
  comp = Hash.new(0);
439
445
 
440
- self.seq.upcase.each_char do |char|
446
+ @seq.upcase.each_char do |char|
441
447
  comp[char] += 1
442
448
  end
443
449
 
@@ -447,30 +453,33 @@ module BioDSL
447
453
  # Method that returns the percentage of hard masked residues
448
454
  # or N's in a sequence.
449
455
  def hard_mask
450
- ((self.seq.upcase.scan("N").size.to_f / (self.len - self.indels).to_f) * 100).round(2)
456
+ ((@seq.upcase.scan('N').size.to_f / (length - indels).to_f) * 100).
457
+ round(2)
451
458
  end
452
459
 
453
460
  # Method that returns the percentage of soft masked residues
454
461
  # or lower cased residues in a sequence.
455
462
  def soft_mask
456
- ((self.seq.scan(/[a-z]/).size.to_f / (self.len - self.indels).to_f) * 100).round(2)
463
+ ((@seq.scan(/[a-z]/).size.to_f / (length - indels).to_f) * 100).round(2)
457
464
  end
458
465
 
459
- # Hard masks sequence residues where the corresponding quality score
460
- # is below a given cutoff.
466
+ # Hard masks sequence residues where the corresponding quality score is below
467
+ # a given cutoff.
461
468
  def mask_seq_hard!(cutoff)
462
- raise SeqError, "seq is nil" if self.seq.nil?
463
- raise SeqError, "qual is nil" if self.qual.nil?
464
- raise SeqError, "cufoff value: #{cutoff} out of range #{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN .. SCORE_MAX).include? cutoff
465
-
466
- na_seq = NArray.to_na(self.seq.upcase, "byte")
467
- na_qual = NArray.to_na(self.qual, "byte")
469
+ fail SeqError, 'seq is nil' if @seq.nil?
470
+ fail SeqError, 'qual is nil' if @qual.nil?
471
+ fail SeqError, "cufoff value: #{cutoff} out of range: " \
472
+ "#{SCORE_MIN}..#{SCORE_MAX}" unless (SCORE_MIN..SCORE_MAX).
473
+ include? cutoff
474
+
475
+ na_seq = NArray.to_na(@seq.upcase, 'byte')
476
+ na_qual = NArray.to_na(@qual, 'byte')
468
477
  mask = (na_qual - SCORE_BASE) < cutoff
469
- mask *= na_seq.ne("-".ord)
478
+ mask *= na_seq.ne('-'.ord)
470
479
 
471
480
  na_seq[mask] = 'N'.ord
472
481
 
473
- self.seq = na_seq.to_s
482
+ @seq = na_seq.to_s
474
483
 
475
484
  self
476
485
  end
@@ -479,18 +488,20 @@ module BioDSL
479
488
  # is below a given cutoff. Masked sequence will be lowercased and
480
489
  # remaining will be uppercased.
481
490
  def mask_seq_soft!(cutoff)
482
- raise SeqError, "seq is nil" if self.seq.nil?
483
- raise SeqError, "qual is nil" if self.qual.nil?
484
- raise SeqError, "cufoff value: #{cutoff} out of range #{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN .. SCORE_MAX).include? cutoff
485
-
486
- na_seq = NArray.to_na(self.seq.upcase, "byte")
487
- na_qual = NArray.to_na(self.qual, "byte")
491
+ fail SeqError, 'seq is nil' if @seq.nil?
492
+ fail SeqError, 'qual is nil' if @qual.nil?
493
+ fail SeqError, "cufoff value: #{cutoff} out of range: " \
494
+ "#{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN..SCORE_MAX).
495
+ include? cutoff
496
+
497
+ na_seq = NArray.to_na(@seq.upcase, 'byte')
498
+ na_qual = NArray.to_na(@qual, 'byte')
488
499
  mask = (na_qual - SCORE_BASE) < cutoff
489
- mask *= na_seq.ne("-".ord)
500
+ mask *= na_seq.ne('-'.ord)
490
501
 
491
502
  na_seq[mask] ^= ' '.ord
492
503
 
493
- self.seq = na_seq.to_s
504
+ @seq = na_seq.to_s
494
505
 
495
506
  self
496
507
  end
@@ -498,22 +509,22 @@ module BioDSL
498
509
  # Method that determines if a quality score string can be
499
510
  # absolutely identified as base 33.
500
511
  def qual_base33?
501
- self.qual.match(/[!-:]/) ? true : false
512
+ @qual.match(/[!-:]/) ? true : false
502
513
  end
503
-
514
+
504
515
  # Method that determines if a quality score string may be base 64.
505
516
  def qual_base64?
506
- self.qual.match(/[K-h]/) ? true : false
517
+ @qual.match(/[K-h]/) ? true : false
507
518
  end
508
519
 
509
520
  # Method to determine if a quality score is valid accepting only 0-40 range.
510
521
  def qual_valid?(encoding)
511
- raise SeqError, "Missing qual" if self.qual.nil?
522
+ fail SeqError, 'Missing qual' if @qual.nil?
512
523
 
513
524
  case encoding
514
- when :base_33 then return true if self.qual.match(/^[!-I]*$/)
515
- when :base_64 then return true if self.qual.match(/^[@-h]*$/)
516
- else raise SeqError, "unknown quality score encoding: #{encoding}"
525
+ when :base_33 then return true if @qual.match(/^[!-I]*$/)
526
+ when :base_64 then return true if @qual.match(/^[@-h]*$/)
527
+ else fail SeqError, "unknown quality score encoding: #{encoding}"
517
528
  end
518
529
 
519
530
  false
@@ -521,28 +532,34 @@ module BioDSL
521
532
 
522
533
  # Method to coerce quality scores to be within the 0-40 range.
523
534
  def qual_coerce!(encoding)
524
- raise SeqError, "Missing qual" if self.qual.nil?
535
+ fail SeqError, 'Missing qual' if @qual.nil?
525
536
 
526
537
  case encoding
527
- when :base_33 then qual_coerce_C(self.qual, self.qual.length, 33, 73) # !-J
528
- when :base_64 then qual_coerce_C(self.qual, self.qual.length, 64, 104) # @-h
538
+ when :base_33 then qual_coerce_C(@qual, @qual.length, 33, 73) # !-J
539
+ when :base_64 then qual_coerce_C(@qual, @qual.length, 64, 104) # @-h
529
540
  else
530
- raise SeqError, "unknown quality score encoding: #{encoding}"
531
- end
541
+ fail SeqError, "unknown quality score encoding: #{encoding}"
542
+ end
532
543
 
533
544
  self
534
545
  end
535
546
 
536
547
  # Method to convert quality scores.
537
548
  def qual_convert!(from, to)
538
- raise SeqError, "unknown quality score encoding: #{from}" unless from == :base_33 or from == :base_64
539
- raise SeqError, "unknown quality score encoding: #{to}" unless to == :base_33 or to == :base_64
540
-
541
- if from == :base_33 and to == :base_64
542
- qual_convert_C(self.qual, self.qual.length, 31) # += 64 - 33
543
- elsif from == :base_64 and to == :base_33
544
- qual_coerce_C(self.qual, self.qual.length, 64, 104) # Handle negative Solexa values from -5 to -1 (set these to 0).
545
- qual_convert_C(self.qual, self.qual.length, -31) # -= 64 - 33
549
+ unless from == :base_33 || from == :base_64
550
+ fail SeqError, "unknown quality score encoding: #{from}"
551
+ end
552
+
553
+ unless to == :base_33 || to == :base_64
554
+ fail SeqError, "unknown quality score encoding: #{to}"
555
+ end
556
+
557
+ if from == :base_33 && to == :base_64
558
+ qual_convert_C(@qual, @qual.length, 31) # += 64 - 33
559
+ elsif from == :base_64 && to == :base_33
560
+ # Handle negative Solexa values from -5 to -1 (set these to 0).
561
+ qual_coerce_C(@qual, @qual.length, 64, 104)
562
+ qual_convert_C(@qual, @qual.length, -31) # -= 64 - 33
546
563
  end
547
564
 
548
565
  self
@@ -550,9 +567,9 @@ module BioDSL
550
567
 
551
568
  # Method to calculate and return the mean quality score.
552
569
  def scores_mean
553
- raise SeqError, "Missing qual in entry" if self.qual.nil?
570
+ fail SeqError, 'Missing qual in entry' if @qual.nil?
554
571
 
555
- na_qual = NArray.to_na(self.qual, "byte")
572
+ na_qual = NArray.to_na(@qual, 'byte')
556
573
  na_qual -= SCORE_BASE
557
574
 
558
575
  na_qual.mean
@@ -560,9 +577,9 @@ module BioDSL
560
577
 
561
578
  # Method to calculate and return the min quality score.
562
579
  def scores_min
563
- raise SeqError, "Missing qual in entry" if self.qual.nil?
580
+ fail SeqError, 'Missing qual in entry' if @qual.nil?
564
581
 
565
- na_qual = NArray.to_na(self.qual, "byte")
582
+ na_qual = NArray.to_na(@qual, 'byte')
566
583
  na_qual -= SCORE_BASE
567
584
 
568
585
  na_qual.min
@@ -570,9 +587,9 @@ module BioDSL
570
587
 
571
588
  # Method to calculate and return the max quality score.
572
589
  def scores_max
573
- raise SeqError, "Missing qual in entry" if self.qual.nil?
590
+ fail SeqError, 'Missing qual in entry' if @qual.nil?
574
591
 
575
- na_qual = NArray.to_na(self.qual, "byte")
592
+ na_qual = NArray.to_na(@qual, 'byte')
576
593
  na_qual -= SCORE_BASE
577
594
 
578
595
  na_qual.max
@@ -582,17 +599,17 @@ module BioDSL
582
599
  # scores string and calculate for each window the mean score and return
583
600
  # the minimum mean score.
584
601
  def scores_mean_local(window_size)
585
- raise SeqError, "Missing qual in entry" if self.qual.nil?
602
+ fail SeqError, 'Missing qual in entry' if @qual.nil?
586
603
 
587
- scores_mean_local_C(self.qual, self.qual.length, SCORE_BASE, window_size)
604
+ scores_mean_local_C(@qual, @qual.length, SCORE_BASE, window_size)
588
605
  end
589
606
 
590
607
  # Method to find open reading frames (ORFs).
591
608
  def each_orf(options = {})
592
- size_min = options[:size_min] || 0
593
- size_max = options[:size_max] || self.length
594
- start_codons = options[:start_codons] || "ATG,GTG,AUG,GUG"
595
- stop_codons = options[:stop_codons] || "TAA,TGA,TAG,UAA,UGA,UAG"
609
+ size_min = options[:size_min] || 0
610
+ size_max = options[:size_max] || length
611
+ start_codons = options[:start_codons] || 'ATG,GTG,AUG,GUG'
612
+ stop_codons = options[:stop_codons] || 'TAA,TGA,TAG,UAA,UGA,UAG'
596
613
  pick_longest = options[:pick_longest]
597
614
 
598
615
  orfs = []
@@ -601,22 +618,23 @@ module BioDSL
601
618
  regex_start = Regexp.new(start_codons.split(',').join('|'), true)
602
619
  regex_stop = Regexp.new(stop_codons.split(',').join('|'), true)
603
620
 
604
- while pos_beg and pos_beg < self.length - size_min
605
- if pos_beg = self.seq.index(regex_start, pos_beg)
606
- if pos_end = self.seq.index(regex_stop, pos_beg)
607
- length = (pos_end - pos_beg) + 3
621
+ while pos_beg && pos_beg < length - size_min
622
+ pos_beg = @seq.index(regex_start, pos_beg)
623
+ next unless pos_beg
624
+ pos_end = @seq.index(regex_stop, pos_beg)
625
+ next unless pos_end
608
626
 
609
- if (length % 3) == 0
610
- if size_min <= length and length <= size_max
611
- subseq = self[pos_beg ... pos_beg + length]
627
+ orf_length = (pos_end - pos_beg) + 3
612
628
 
613
- orfs << Orf.new(subseq, pos_beg, pos_end + 2)
614
- end
615
- end
616
- end
629
+ if (orf_length % 3) == 0
630
+ if size_min <= orf_length && orf_length <= size_max
631
+ subseq = self[pos_beg...pos_beg + orf_length]
617
632
 
618
- pos_beg += 1
633
+ orfs << Orf.new(subseq, pos_beg, pos_end + 2)
634
+ end
619
635
  end
636
+
637
+ pos_beg += 1
620
638
  end
621
639
 
622
640
  if pick_longest
@@ -634,17 +652,8 @@ module BioDSL
634
652
  end
635
653
  end
636
654
 
637
- class Orf
638
- attr_reader :entry, :start, :stop
639
-
640
- def initialize(entry, start, stop)
641
- @entry = entry
642
- @start = start
643
- @stop = stop
644
- end
645
- end
646
-
647
- private
655
+ # Struct for holding an ORF.
656
+ Orf = Struct.new(:entry, :start, :stop)
648
657
 
649
658
  inline do |builder|
650
659
  builder.c %{