BioDSL 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/BioDSL.gemspec +1 -1
  4. data/Gemfile +6 -0
  5. data/README.md +289 -155
  6. data/Rakefile +18 -16
  7. data/lib/BioDSL.rb +1 -1
  8. data/lib/BioDSL/cary.rb +78 -53
  9. data/lib/BioDSL/command.rb +2 -2
  10. data/lib/BioDSL/commands.rb +1 -1
  11. data/lib/BioDSL/commands/add_key.rb +1 -1
  12. data/lib/BioDSL/commands/align_seq_mothur.rb +4 -4
  13. data/lib/BioDSL/commands/analyze_residue_distribution.rb +5 -5
  14. data/lib/BioDSL/commands/assemble_pairs.rb +13 -13
  15. data/lib/BioDSL/commands/assemble_seq_idba.rb +7 -9
  16. data/lib/BioDSL/commands/assemble_seq_ray.rb +13 -13
  17. data/lib/BioDSL/commands/assemble_seq_spades.rb +4 -4
  18. data/lib/BioDSL/commands/classify_seq.rb +8 -8
  19. data/lib/BioDSL/commands/classify_seq_mothur.rb +5 -5
  20. data/lib/BioDSL/commands/clip_primer.rb +7 -7
  21. data/lib/BioDSL/commands/cluster_otus.rb +5 -5
  22. data/lib/BioDSL/commands/collapse_otus.rb +2 -2
  23. data/lib/BioDSL/commands/collect_otus.rb +2 -2
  24. data/lib/BioDSL/commands/complement_seq.rb +4 -4
  25. data/lib/BioDSL/commands/count.rb +1 -1
  26. data/lib/BioDSL/commands/count_values.rb +2 -2
  27. data/lib/BioDSL/commands/degap_seq.rb +6 -7
  28. data/lib/BioDSL/commands/dereplicate_seq.rb +1 -1
  29. data/lib/BioDSL/commands/dump.rb +2 -2
  30. data/lib/BioDSL/commands/filter_rrna.rb +4 -4
  31. data/lib/BioDSL/commands/genecall.rb +7 -7
  32. data/lib/BioDSL/commands/grab.rb +1 -1
  33. data/lib/BioDSL/commands/index_taxonomy.rb +3 -3
  34. data/lib/BioDSL/commands/mask_seq.rb +4 -4
  35. data/lib/BioDSL/commands/mean_scores.rb +2 -2
  36. data/lib/BioDSL/commands/merge_pair_seq.rb +3 -3
  37. data/lib/BioDSL/commands/merge_table.rb +1 -1
  38. data/lib/BioDSL/commands/merge_values.rb +1 -1
  39. data/lib/BioDSL/commands/plot_heatmap.rb +4 -5
  40. data/lib/BioDSL/commands/plot_histogram.rb +4 -4
  41. data/lib/BioDSL/commands/plot_matches.rb +5 -5
  42. data/lib/BioDSL/commands/plot_residue_distribution.rb +6 -6
  43. data/lib/BioDSL/commands/plot_scores.rb +7 -7
  44. data/lib/BioDSL/commands/random.rb +1 -1
  45. data/lib/BioDSL/commands/read_fasta.rb +9 -9
  46. data/lib/BioDSL/commands/read_fastq.rb +16 -16
  47. data/lib/BioDSL/commands/read_table.rb +2 -3
  48. data/lib/BioDSL/commands/reverse_seq.rb +4 -4
  49. data/lib/BioDSL/commands/slice_align.rb +4 -4
  50. data/lib/BioDSL/commands/slice_seq.rb +3 -3
  51. data/lib/BioDSL/commands/sort.rb +1 -1
  52. data/lib/BioDSL/commands/split_pair_seq.rb +6 -7
  53. data/lib/BioDSL/commands/split_values.rb +2 -2
  54. data/lib/BioDSL/commands/trim_primer.rb +13 -8
  55. data/lib/BioDSL/commands/trim_seq.rb +5 -5
  56. data/lib/BioDSL/commands/uchime_ref.rb +6 -6
  57. data/lib/BioDSL/commands/uclust.rb +5 -5
  58. data/lib/BioDSL/commands/unique_values.rb +1 -1
  59. data/lib/BioDSL/commands/usearch_global.rb +2 -2
  60. data/lib/BioDSL/commands/usearch_local.rb +2 -2
  61. data/lib/BioDSL/commands/write_fasta.rb +7 -9
  62. data/lib/BioDSL/commands/write_fastq.rb +4 -4
  63. data/lib/BioDSL/commands/write_table.rb +3 -3
  64. data/lib/BioDSL/commands/write_tree.rb +2 -3
  65. data/lib/BioDSL/config.rb +2 -2
  66. data/lib/BioDSL/csv.rb +8 -10
  67. data/lib/BioDSL/debug.rb +1 -1
  68. data/lib/BioDSL/fasta.rb +54 -40
  69. data/lib/BioDSL/fastq.rb +35 -32
  70. data/lib/BioDSL/filesys.rb +56 -47
  71. data/lib/BioDSL/fork.rb +1 -1
  72. data/lib/BioDSL/hamming.rb +1 -1
  73. data/lib/BioDSL/helpers.rb +1 -1
  74. data/lib/BioDSL/helpers/aux_helper.rb +1 -1
  75. data/lib/BioDSL/helpers/email_helper.rb +1 -1
  76. data/lib/BioDSL/helpers/history_helper.rb +1 -1
  77. data/lib/BioDSL/helpers/log_helper.rb +1 -1
  78. data/lib/BioDSL/helpers/options_helper.rb +1 -1
  79. data/lib/BioDSL/helpers/status_helper.rb +1 -1
  80. data/lib/BioDSL/html_report.rb +1 -1
  81. data/lib/BioDSL/math.rb +1 -1
  82. data/lib/BioDSL/mummer.rb +1 -1
  83. data/lib/BioDSL/pipeline.rb +1 -1
  84. data/lib/BioDSL/seq.rb +240 -231
  85. data/lib/BioDSL/seq/ambiguity.rb +1 -1
  86. data/lib/BioDSL/seq/assemble.rb +1 -1
  87. data/lib/BioDSL/seq/backtrack.rb +93 -76
  88. data/lib/BioDSL/seq/digest.rb +1 -1
  89. data/lib/BioDSL/seq/dynamic.rb +43 -55
  90. data/lib/BioDSL/seq/homopolymer.rb +34 -36
  91. data/lib/BioDSL/seq/kmer.rb +67 -50
  92. data/lib/BioDSL/seq/levenshtein.rb +35 -40
  93. data/lib/BioDSL/seq/translate.rb +64 -55
  94. data/lib/BioDSL/seq/trim.rb +60 -50
  95. data/lib/BioDSL/serializer.rb +1 -1
  96. data/lib/BioDSL/stream.rb +1 -1
  97. data/lib/BioDSL/taxonomy.rb +1 -1
  98. data/lib/BioDSL/test.rb +1 -1
  99. data/lib/BioDSL/tmp_dir.rb +1 -1
  100. data/lib/BioDSL/usearch.rb +1 -1
  101. data/lib/BioDSL/verbose.rb +1 -1
  102. data/lib/BioDSL/version.rb +2 -2
  103. data/test/BioDSL/commands/test_add_key.rb +1 -1
  104. data/test/BioDSL/commands/test_align_seq_mothur.rb +1 -1
  105. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +1 -1
  106. data/test/BioDSL/commands/test_assemble_pairs.rb +1 -1
  107. data/test/BioDSL/commands/test_assemble_seq_idba.rb +1 -1
  108. data/test/BioDSL/commands/test_assemble_seq_ray.rb +1 -1
  109. data/test/BioDSL/commands/test_assemble_seq_spades.rb +1 -1
  110. data/test/BioDSL/commands/test_classify_seq.rb +1 -1
  111. data/test/BioDSL/commands/test_classify_seq_mothur.rb +1 -1
  112. data/test/BioDSL/commands/test_clip_primer.rb +1 -1
  113. data/test/BioDSL/commands/test_cluster_otus.rb +1 -1
  114. data/test/BioDSL/commands/test_collapse_otus.rb +1 -1
  115. data/test/BioDSL/commands/test_collect_otus.rb +1 -1
  116. data/test/BioDSL/commands/test_complement_seq.rb +1 -1
  117. data/test/BioDSL/commands/test_count.rb +1 -1
  118. data/test/BioDSL/commands/test_count_values.rb +1 -1
  119. data/test/BioDSL/commands/test_degap_seq.rb +1 -1
  120. data/test/BioDSL/commands/test_dereplicate_seq.rb +1 -1
  121. data/test/BioDSL/commands/test_dump.rb +1 -1
  122. data/test/BioDSL/commands/test_filter_rrna.rb +1 -1
  123. data/test/BioDSL/commands/test_genecall.rb +1 -1
  124. data/test/BioDSL/commands/test_grab.rb +1 -1
  125. data/test/BioDSL/commands/test_index_taxonomy.rb +1 -1
  126. data/test/BioDSL/commands/test_mask_seq.rb +1 -1
  127. data/test/BioDSL/commands/test_mean_scores.rb +1 -1
  128. data/test/BioDSL/commands/test_merge_pair_seq.rb +1 -1
  129. data/test/BioDSL/commands/test_merge_table.rb +1 -1
  130. data/test/BioDSL/commands/test_merge_values.rb +1 -1
  131. data/test/BioDSL/commands/test_plot_heatmap.rb +1 -1
  132. data/test/BioDSL/commands/test_plot_histogram.rb +1 -1
  133. data/test/BioDSL/commands/test_plot_matches.rb +1 -1
  134. data/test/BioDSL/commands/test_plot_residue_distribution.rb +1 -1
  135. data/test/BioDSL/commands/test_plot_scores.rb +1 -1
  136. data/test/BioDSL/commands/test_random.rb +1 -1
  137. data/test/BioDSL/commands/test_read_fasta.rb +1 -1
  138. data/test/BioDSL/commands/test_read_fastq.rb +1 -1
  139. data/test/BioDSL/commands/test_read_table.rb +1 -1
  140. data/test/BioDSL/commands/test_reverse_seq.rb +1 -1
  141. data/test/BioDSL/commands/test_slice_align.rb +1 -1
  142. data/test/BioDSL/commands/test_slice_seq.rb +1 -1
  143. data/test/BioDSL/commands/test_sort.rb +1 -1
  144. data/test/BioDSL/commands/test_split_pair_seq.rb +1 -1
  145. data/test/BioDSL/commands/test_split_values.rb +1 -1
  146. data/test/BioDSL/commands/test_trim_primer.rb +1 -1
  147. data/test/BioDSL/commands/test_trim_seq.rb +1 -1
  148. data/test/BioDSL/commands/test_uchime_ref.rb +1 -1
  149. data/test/BioDSL/commands/test_uclust.rb +1 -1
  150. data/test/BioDSL/commands/test_unique_values.rb +1 -1
  151. data/test/BioDSL/commands/test_usearch_global.rb +1 -1
  152. data/test/BioDSL/commands/test_usearch_local.rb +1 -1
  153. data/test/BioDSL/commands/test_write_fasta.rb +1 -1
  154. data/test/BioDSL/commands/test_write_fastq.rb +1 -1
  155. data/test/BioDSL/commands/test_write_table.rb +1 -1
  156. data/test/BioDSL/commands/test_write_tree.rb +1 -1
  157. data/test/BioDSL/helpers/test_options_helper.rb +3 -3
  158. data/test/BioDSL/seq/test_assemble.rb +58 -56
  159. data/test/BioDSL/seq/test_backtrack.rb +83 -81
  160. data/test/BioDSL/seq/test_digest.rb +47 -45
  161. data/test/BioDSL/seq/test_dynamic.rb +66 -64
  162. data/test/BioDSL/seq/test_homopolymer.rb +35 -33
  163. data/test/BioDSL/seq/test_kmer.rb +29 -28
  164. data/test/BioDSL/seq/test_translate.rb +44 -42
  165. data/test/BioDSL/seq/test_trim.rb +59 -57
  166. data/test/BioDSL/test_cary.rb +1 -1
  167. data/test/BioDSL/test_command.rb +2 -2
  168. data/test/BioDSL/test_csv.rb +34 -31
  169. data/test/BioDSL/test_debug.rb +31 -31
  170. data/test/BioDSL/test_fasta.rb +30 -29
  171. data/test/BioDSL/test_fastq.rb +27 -26
  172. data/test/BioDSL/test_filesys.rb +28 -27
  173. data/test/BioDSL/test_fork.rb +29 -28
  174. data/test/BioDSL/test_math.rb +31 -30
  175. data/test/BioDSL/test_mummer.rb +1 -1
  176. data/test/BioDSL/test_pipeline.rb +1 -1
  177. data/test/BioDSL/test_seq.rb +42 -41
  178. data/test/BioDSL/test_serializer.rb +35 -33
  179. data/test/BioDSL/test_stream.rb +28 -27
  180. data/test/BioDSL/test_taxonomy.rb +38 -37
  181. data/test/BioDSL/test_test.rb +32 -31
  182. data/test/BioDSL/test_tmp_dir.rb +1 -1
  183. data/test/BioDSL/test_usearch.rb +28 -27
  184. data/test/BioDSL/test_verbose.rb +32 -31
  185. data/test/helper.rb +34 -31
  186. metadata +3 -2
@@ -1,35 +1,39 @@
1
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
- # #
3
- # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
- # #
5
- # This program is free software; you can redistribute it and/or #
6
- # modify it under the terms of the GNU General Public License #
7
- # as published by the Free Software Foundation; either version 2 #
8
- # of the License, or (at your option) any later version. #
9
- # #
10
- # This program is distributed in the hope that it will be useful, #
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
- # GNU General Public License for more details. #
14
- # #
15
- # You should have received a copy of the GNU General Public License #
16
- # along with this program; if not, write to the Free Software #
17
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
- # #
19
- # http://www.gnu.org/copyleft/gpl.html #
20
- # #
21
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
- # #
23
- # This software is part of BioDSL (www.BioDSL.org). #
24
- # #
25
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
-
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
27
29
  module BioDSL
28
30
  # Error class for all exceptions to do with Filesys.
29
31
  class FilesysError < StandardError; end
30
32
 
33
+ # Class for handling filesystem manipulations.
31
34
  class Filesys
32
35
  require 'open3'
36
+ require 'English'
33
37
 
34
38
  include Enumerable
35
39
 
@@ -40,10 +44,10 @@ module BioDSL
40
44
  exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
41
45
 
42
46
  ENV['PATH'].split(File::PATH_SEPARATOR).each do |path|
43
- exts.each { |ext|
47
+ exts.each do |ext|
44
48
  exe = File.join(path, "#{cmd}#{ext}")
45
49
  return exe if File.executable?(exe) && !File.directory?(exe)
46
- }
50
+ end
47
51
  end
48
52
 
49
53
  nil
@@ -51,14 +55,15 @@ module BioDSL
51
55
 
52
56
  # Class method that returns a path to a unique temporary file.
53
57
  # If no directory is specified reverts to the systems tmp directory.
54
- def self.tmpfile(tmp_dir = ENV["TMPDIR"])
58
+ def self.tmpfile(tmp_dir = ENV['TMPDIR'])
55
59
  time = Time.now.to_i
56
- user = ENV["USER"]
57
- pid = $$
58
- path = tmp_dir + [user, time + pid, pid].join("_") + ".tmp"
60
+ user = ENV['USER']
61
+ pid = $PID
62
+ path = tmp_dir + [user, time + pid, pid].join('_') + '.tmp'
59
63
  path
60
64
  end
61
65
 
66
+ # Open a file which may be compressed with gzip or bzip2.
62
67
  def self.open(*args)
63
68
  file = args.shift
64
69
  mode = args.shift
@@ -67,32 +72,37 @@ module BioDSL
67
72
  if mode == 'w'
68
73
  case options[:compress]
69
74
  when :gzip
70
- ios, = Open3.pipeline_w("gzip -f", out: file)
75
+ ios, = Open3.pipeline_w('gzip -f', out: file)
71
76
  when :bzip, :bzip2
72
- ios, = Open3.pipeline_w("bzip2 -c", out: file)
73
- else
74
- ios = File.open(file, mode, options)
75
- end
76
- else
77
- type = (file.respond_to? :path) ? `file -Lk #{file.path}` : `file -Lk #{file}`
78
- case type
79
- when /gzip/
80
- ios = IO.popen("gzip -cd #{file}")
81
- when /bzip/
82
- ios = IO.popen("bzcat #{file}")
77
+ ios, = Open3.pipeline_w('bzip2 -c', out: file)
83
78
  else
84
79
  ios = File.open(file, mode, options)
85
80
  end
81
+ else
82
+ type = if file.respond_to? :path
83
+ `file -Lk #{file.path}`
84
+ else
85
+ `file -Lk #{file}`
86
+ end
87
+
88
+ ios = case type
89
+ when /gzip/
90
+ IO.popen("gzip -cd #{file}")
91
+ when /bzip/
92
+ IO.popen("bzcat #{file}")
93
+ else
94
+ File.open(file, mode, options)
95
+ end
86
96
  end
87
97
 
88
98
  if block_given?
89
99
  begin
90
- yield self.new(ios)
100
+ yield new(ios)
91
101
  ensure
92
102
  ios.close
93
103
  end
94
104
  else
95
- return self.new(ios)
105
+ return new(ios)
96
106
  end
97
107
  end
98
108
 
@@ -134,4 +144,3 @@ module BioDSL
134
144
  end
135
145
  end
136
146
  end
137
-
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
  module BioDSL
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -21,7 +21,7 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
  # #
24
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
24
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
25
25
  # #
26
26
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
27
 
@@ -20,7 +20,7 @@
20
20
  # #
21
21
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
22
  # #
23
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
23
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
24
24
  # #
25
25
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
26
 
@@ -20,7 +20,7 @@
20
20
  # #
21
21
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
22
  # #
23
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
23
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
24
24
  # #
25
25
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
26
  module BioDSL
@@ -1,30 +1,33 @@
1
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
- # #
3
- # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
- # #
5
- # This program is free software; you can redistribute it and/or #
6
- # modify it under the terms of the GNU General Public License #
7
- # as published by the Free Software Foundation; either version 2 #
8
- # of the License, or (at your option) any later version. #
9
- # #
10
- # This program is distributed in the hope that it will be useful, #
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
- # GNU General Public License for more details. #
14
- # #
15
- # You should have received a copy of the GNU General Public License #
16
- # along with this program; if not, write to the Free Software #
17
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #
18
- # #
19
- # http://www.gnu.org/copyleft/gpl.html #
20
- # #
21
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
- # #
23
- # This software is part BioDSL (www.BioDSL.org). #
24
- # #
25
- # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
-
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of BioDSL (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
27
29
  module BioDSL
30
+ require 'English'
28
31
  require 'narray'
29
32
  require 'BioDSL/seq/ambiguity'
30
33
  require 'BioDSL/seq/assemble'
@@ -40,12 +43,15 @@ module BioDSL
40
43
  # Error class for all exceptions to do with Seq.
41
44
  class SeqError < StandardError; end
42
45
 
46
+ # rubocop: disable ClassLength
47
+
48
+ # Class for manipulating sequences.
43
49
  class Seq
44
50
  # Residue alphabets
45
- DNA = %w[a t c g]
46
- RNA = %w[a u c g]
47
- PROTEIN = %w[f l s y c w p h q r i m t n k v a d e g]
48
- INDELS = %w[. - _ ~]
51
+ DNA = %w(a t c g)
52
+ RNA = %w(a u c g)
53
+ PROTEIN = %w(f l s y c w p h q r i m t n k v a d e g)
54
+ INDELS = %w(. - _ ~)
49
55
 
50
56
  # Quality scores bases
51
57
  SCORE_BASE = 33
@@ -69,30 +75,29 @@ module BioDSL
69
75
  type = record[:SEQ_TYPE].to_sym if record[:SEQ_TYPE]
70
76
  qual = record[:SCORES]
71
77
 
72
- self.new(seq_name: seq_name, seq: seq, type: type, qual: qual)
78
+ new(seq_name: seq_name, seq: seq, type: type, qual: qual)
73
79
  end
74
80
 
75
- # Class method that generates all possible oligos of a specifed length and type.
81
+ # Class method that generates all possible oligos of a specified length and
82
+ # type.
76
83
  def self.generate_oligos(length, type)
77
- raise SeqError, "Cannot generate oligos of zero or negative length: #{length}" if length <= 0
84
+ fail SeqError, "Bad length: #{length}" if length <= 0
78
85
 
79
86
  case type.downcase
80
87
  when :dna then alph = DNA
81
88
  when :rna then alph = RNA
82
89
  when :protein then alph = PROTEIN
83
90
  else
84
- raise SeqError, "Unknown sequence type: #{type}"
91
+ fail SeqError, "Unknown sequence type: #{type}"
85
92
  end
86
93
 
87
- oligos = [""]
94
+ oligos = ['']
88
95
 
89
- (1 .. length).each do
96
+ (1..length).each do
90
97
  list = []
91
98
 
92
99
  oligos.each do |oligo|
93
- alph.each do |char|
94
- list << oligo + char
95
- end
100
+ alph.each { |char| list << oligo + char }
96
101
  end
97
102
 
98
103
  oligos = list
@@ -103,24 +108,22 @@ module BioDSL
103
108
 
104
109
  def self.check_name_pair(entry1, entry2)
105
110
  if entry1.seq_name =~ /^([^ ]+) \d:/
106
- name1 = $1
107
- elsif entry1.seq_name =~ /^(.+)\/\d$/
108
- name1 = $1
111
+ name1 = Regexp.last_match[1]
112
+ elsif entry1.seq_name =~ %r{^(.+)\/\d$}
113
+ name1 = Regexp.last_match[1]
109
114
  else
110
- raise SeqError, "Could not match sequence name: #{entry1.seq_name}"
115
+ fail SeqError, "Could not match sequence name: #{entry1.seq_name}"
111
116
  end
112
117
 
113
118
  if entry2.seq_name =~ /^([^ ]+) \d:/
114
- name2 = $1
115
- elsif entry2.seq_name =~ /^(.+)\/\d$/
116
- name2 = $1
119
+ name2 = Regexp.last_match[1]
120
+ elsif entry2.seq_name =~ %r{^(.+)\/\d$}
121
+ name2 = Regexp.last_match[1]
117
122
  else
118
- raise SeqError, "Could not match sequence name: #{entry2.seq_name}"
123
+ fail SeqError, "Could not match sequence name: #{entry2.seq_name}"
119
124
  end
120
125
 
121
- if name1 != name2
122
- raise SeqError, "Name mismatch: #{name1} != #{name2}"
123
- end
126
+ fail SeqError, "Name mismatch: #{name1} != #{name2}" if name1 != name2
124
127
  end
125
128
 
126
129
  # Initialize a sequence object with the following options:
@@ -134,18 +137,19 @@ module BioDSL
134
137
  @type = options[:type]
135
138
  @qual = options[:qual]
136
139
 
137
- if @seq and @qual and @seq.length != @qual.length
138
- raise SeqError, "Sequence length and score length mismatch:" \
139
- "#{@seq.length} != #{@qual.length}"
140
- end
140
+ return unless @seq && @qual
141
+ return if @seq.length == @qual.length
142
+
143
+ fail SeqError, 'Sequence length and score length mismatch: ' \
144
+ "#{@seq.length} != #{@qual.length}"
141
145
  end
142
146
 
143
147
  # Method that guesses and returns the sequence type
144
148
  # by inspecting the first 100 residues.
145
149
  def type_guess
146
- raise SeqError, "Guess failed: sequence is nil" if self.seq.nil?
150
+ fail SeqError, 'Guess failed: sequence is nil' if @seq.nil?
147
151
 
148
- case self.seq[0 ... 100].downcase
152
+ case @seq[0...100].downcase
149
153
  when /[flpqie]/ then return :protein
150
154
  when /[u]/ then return :rna
151
155
  else return :dna
@@ -155,31 +159,31 @@ module BioDSL
155
159
  # Method that guesses and sets the sequence type
156
160
  # by inspecting the first 100 residues.
157
161
  def type_guess!
158
- self.type = self.type_guess
162
+ @type = type_guess
159
163
  self
160
164
  end
161
165
 
162
166
  # Returns the length of a sequence.
163
167
  def length
164
- self.seq.nil? ? 0 : self.seq.length
168
+ @seq.nil? ? 0 : @seq.length
165
169
  end
166
170
 
167
- alias :len :length
171
+ alias_method :len, :length
168
172
 
169
173
  # Return the number of indels in a sequence.
170
174
  def indels
171
175
  regex = Regexp.new(/[#{Regexp.escape(INDELS.join(""))}]/)
172
- self.seq.scan(regex).size
176
+ @seq.scan(regex).size
173
177
  end
174
178
 
175
179
  # Method to remove indels from seq and qual if qual.
176
180
  def indels_remove
177
- if self.qual.nil?
178
- self.seq.delete!(Regexp.escape(INDELS.join('')))
181
+ if @qual.nil?
182
+ @seq.delete!(Regexp.escape(INDELS.join('')))
179
183
  else
180
- na_seq = NArray.to_na(self.seq, "byte")
181
- na_qual = NArray.to_na(self.qual, "byte")
182
- mask = NArray.byte(self.length)
184
+ na_seq = NArray.to_na(@seq, 'byte')
185
+ na_qual = NArray.to_na(@qual, 'byte')
186
+ mask = NArray.byte(length)
183
187
 
184
188
  INDELS.each do |c|
185
189
  mask += na_seq.eq(c.ord)
@@ -187,113 +191,113 @@ module BioDSL
187
191
 
188
192
  mask = mask.eq(0)
189
193
 
190
- self.seq = na_seq[mask].to_s
191
- self.qual = na_qual[mask].to_s
194
+ @seq = na_seq[mask].to_s
195
+ @qual = na_qual[mask].to_s
192
196
  end
193
197
 
194
198
  self
195
199
  end
196
200
 
197
201
  # Method that returns true if the sequence type is DNA.
198
- def is_dna?
199
- self.type == :dna
202
+ def dna?
203
+ @type == :dna
200
204
  end
201
205
 
202
206
  # Method that returns true if the sequence type is RNA.
203
- def is_rna?
204
- self.type == :rna
207
+ def rna?
208
+ @type == :rna
205
209
  end
206
210
 
207
211
  # Method that returns true if the sequence type is protein.
208
- def is_protein?
209
- self.type == :protein
212
+ def protein?
213
+ @type == :protein
210
214
  end
211
215
 
212
216
  # Method to transcribe DNA to RNA.
213
217
  def to_rna
214
- raise SeqError, "Cannot transcribe 0 length sequence" if self.length == 0
215
- raise SeqError, "Cannot transcribe sequence type: #{self.type}" unless self.is_dna?
216
- self.type = :rna
217
- self.seq.tr!('Tt','Uu')
218
+ fail SeqError, 'Cannot transcribe 0 length sequence' if length == 0
219
+ fail SeqError, 'Cannot transcribe sequence type: #{@type}' unless dna?
220
+ @type = :rna
221
+ @seq.tr!('Tt', 'Uu')
218
222
  end
219
223
 
220
224
  # Method to reverse-transcribe RNA to DNA.
221
225
  def to_dna
222
- raise SeqError, "Cannot reverse-transcribe 0 length sequence" if self.length == 0
223
- raise SeqError, "Cannot reverse-transcribe sequence type: #{self.type}" unless self.is_rna?
224
- self.type = :dna
225
- self.seq.tr!('Uu','Tt')
226
+ fail SeqError, 'Cant reverse-transcribe 0 length sequence' if length == 0
227
+ fail SeqError, "Cant reverse-transcribe seq type: #{@type}" unless rna?
228
+ @type = :dna
229
+ @seq.tr!('Uu', 'Tt')
226
230
  end
227
231
 
228
232
  # Method that given a Seq entry returns a BioDSL record (a hash).
229
233
  def to_bp
230
234
  record = {}
231
- record[:SEQ_NAME] = self.seq_name if self.seq_name
232
- record[:SEQ] = self.seq if self.seq
233
- record[:SEQ_LEN] = self.seq.length if self.seq
234
- record[:SCORES] = self.qual if self.qual
235
+ record[:SEQ_NAME] = @seq_name if @seq_name
236
+ record[:SEQ] = @seq if @seq
237
+ record[:SEQ_LEN] = length if @seq
238
+ record[:SCORES] = @qual if @qual
235
239
  record
236
240
  end
237
241
 
238
242
  # Method that given a Seq entry returns a FASTA entry (a string).
239
243
  def to_fasta(wrap = nil)
240
- raise SeqError, "Missing seq_name" if self.seq_name.nil? or self.seq_name == ''
241
- raise SeqError, "Missing seq" if self.seq.nil? or self.seq.empty?
244
+ fail SeqError, 'Missing seq_name' if @seq_name.nil? || @seq_name == ''
245
+ fail SeqError, 'Missing seq' if @seq.nil? || @seq.empty?
242
246
 
243
- seq_name = self.seq_name.to_s
244
- seq = self.seq.to_s
247
+ seq_name = @seq_name.to_s
248
+ seq = @seq.to_s
245
249
 
246
250
  unless wrap.nil?
247
251
  seq.gsub!(/(.{#{wrap}})/) do |match|
248
- match << $/
252
+ match << $INPUT_RECORD_SEPARATOR
249
253
  end
250
254
 
251
255
  seq.chomp!
252
256
  end
253
257
 
254
- ">#{seq_name}#{$/}#{seq}#{$/}"
258
+ ">#{seq_name}#{$INPUT_RECORD_SEPARATOR}#{seq}#{$INPUT_RECORD_SEPARATOR}"
255
259
  end
256
260
 
257
261
  # Method that given a Seq entry returns a FASTQ entry (a string).
258
262
  def to_fastq
259
- raise SeqError, "Missing seq_name" if self.seq_name.nil?
260
- raise SeqError, "Missing seq" if self.seq.nil?
261
- raise SeqError, "Missing qual" if self.qual.nil?
263
+ fail SeqError, 'Missing seq_name' if @seq_name.nil?
264
+ fail SeqError, 'Missing seq' if @seq.nil?
265
+ fail SeqError, 'Missing qual' if @qual.nil?
262
266
 
263
- seq_name = self.seq_name.to_s
264
- seq = self.seq.to_s
265
- qual = self.qual.to_s
267
+ seq_name = @seq_name.to_s
268
+ seq = @seq.to_s
269
+ qual = @qual.to_s
266
270
 
267
- "@#{seq_name}#{$/}#{seq}#{$/}+#{$/}#{qual}#{$/}"
271
+ "@#{seq_name}#{$RS}#{seq}#{$RS}+#{$RS}#{qual}#{$RS}"
268
272
  end
269
273
 
270
274
  # Method that generates a unique key for a
271
275
  # DNA sequence and return this key as a Fixnum.
272
276
  def to_key
273
277
  key = 0
274
-
275
- self.seq.upcase.each_char do |char|
278
+
279
+ @seq.upcase.each_char do |char|
276
280
  key <<= 2
277
-
281
+
278
282
  case char
279
283
  when 'A' then key |= 0
280
284
  when 'C' then key |= 1
281
285
  when 'G' then key |= 2
282
286
  when 'T' then key |= 3
283
- else raise SeqError, "Bad residue: #{char}"
287
+ else fail SeqError, "Bad residue: #{char}"
284
288
  end
285
289
  end
286
-
290
+
287
291
  key
288
292
  end
289
293
 
290
294
  # Method to reverse the sequence.
291
295
  def reverse
292
296
  entry = Seq.new(
293
- seq_name: self.seq_name,
294
- seq: self.seq.reverse,
295
- type: self.type,
296
- qual: (self.qual ? self.qual.reverse : self.qual)
297
+ seq_name: @seq_name,
298
+ seq: @seq.reverse,
299
+ type: @type,
300
+ qual: (@qual ? @qual.reverse : @qual)
297
301
  )
298
302
 
299
303
  entry
@@ -301,27 +305,25 @@ module BioDSL
301
305
 
302
306
  # Method to reverse the sequence (and qual, if present) in place.
303
307
  def reverse!
304
- self.seq.reverse!
305
- self.qual.reverse! if self.qual
308
+ @seq.reverse!
309
+ @qual.reverse! if @qual
306
310
  self
307
311
  end
308
312
 
309
313
  # Method that complements sequence including ambiguity codes.
310
314
  def complement
311
- raise SeqError, "Cannot complement 0 length sequence" if self.length == 0
315
+ fail SeqError, 'Cannot complement 0 length sequence' if length == 0
312
316
 
313
- entry = Seq.new(
314
- seq_name: self.seq_name,
315
- type: self.type,
316
- qual: self.qual
317
- )
317
+ entry = Seq.new(seq_name: @seq_name, type: @type, qual: @qual)
318
318
 
319
- if self.is_dna?
320
- entry.seq = self.seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn')
321
- elsif self.is_rna?
322
- entry.seq = self.seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn')
319
+ if dna?
320
+ entry.seq = @seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
321
+ 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn')
322
+ elsif rna?
323
+ entry.seq = @seq.tr('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
324
+ 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn')
323
325
  else
324
- raise SeqError, "Cannot complement sequence type: #{self.type}"
326
+ fail SeqError, "Cannot complement sequence type: #{@type}"
325
327
  end
326
328
 
327
329
  entry
@@ -329,14 +331,16 @@ module BioDSL
329
331
 
330
332
  # Method that complements sequence in place including ambiguity codes.
331
333
  def complement!
332
- raise SeqError, "Cannot complement 0 length sequence" if self.length == 0
333
-
334
- if self.is_dna?
335
- self.seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn')
336
- elsif self.is_rna?
337
- self.seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn', 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn')
334
+ fail SeqError, 'Cannot complement 0 length sequence' if length == 0
335
+
336
+ if dna?
337
+ @seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
338
+ 'TCGAAYRWSKMDHBVNtcgaayrwskmdhbvn')
339
+ elsif rna?
340
+ @seq.tr!('AGCUTRYWSMKHDVBNagcutrywsmkhdvbn',
341
+ 'UCGAAYRWSKMDHBVNucgaayrwskmdhbvn')
338
342
  else
339
- raise SeqError, "Cannot complement sequence type: #{self.type}"
343
+ fail SeqError, "Cannot complement sequence type: #{@type}"
340
344
  end
341
345
 
342
346
  self
@@ -346,68 +350,70 @@ module BioDSL
346
350
  # two Sequence objects (case insensitive).
347
351
  def hamming_distance(entry, options = {})
348
352
  if options[:ambiguity]
349
- BioDSL::Hamming.distance(self.seq, entry.seq, options)
353
+ BioDSL::Hamming.distance(@seq, entry.seq, options)
350
354
  else
351
- BioDSL::Hamming.distance(self.seq.upcase, entry.seq.upcase, options)
355
+ BioDSL::Hamming.distance(@seq.upcase, entry.seq.upcase, options)
352
356
  end
353
357
  end
354
358
 
355
359
  # Method to determine the Edit Distance between
356
360
  # two Sequence objects (case insensitive).
357
361
  def edit_distance(entry)
358
- Levenshtein.distance(self.seq, entry.seq)
362
+ Levenshtein.distance(@seq, entry.seq)
359
363
  end
360
364
 
361
365
  # Method that generates a random sequence of a given length and type.
362
366
  def generate(length, type)
363
- raise SeqError, "Cannot generate sequence length < 1: #{length}" if length <= 0
367
+ fail SeqError, "Cannot generate seq length < 1: #{length}" if length <= 0
364
368
 
365
369
  case type
366
370
  when :dna then alph = DNA
367
371
  when :rna then alph = RNA
368
372
  when :protein then alph = PROTEIN
369
373
  else
370
- raise SeqError, "Unknown sequence type: #{type}"
374
+ fail SeqError, "Unknown sequence type: #{type}"
371
375
  end
372
376
 
373
- seq_new = Array.new(length) { alph[rand(alph.size)] }.join("")
374
- self.seq = seq_new
375
- self.type = type
377
+ seq_new = Array.new(length) { alph[rand(alph.size)] }.join('')
378
+ @seq = seq_new
379
+ @type = type
380
+
376
381
  seq_new
377
382
  end
378
383
 
379
384
  # Method to return a new Seq object with shuffled sequence.
380
385
  def shuffle
381
386
  Seq.new(
382
- seq_name: self.seq_name,
383
- seq: self.seq.split('').shuffle!.join,
384
- type: self.type,
385
- qual: self.qual
387
+ seq_name: @seq_name,
388
+ seq: @seq.split('').shuffle!.join,
389
+ type: @type,
390
+ qual: @qual
386
391
  )
387
392
  end
388
393
 
389
394
  # Method to shuffle a sequence randomly inline.
390
395
  def shuffle!
391
- self.seq = self.seq.split('').shuffle!.join
396
+ @seq = @seq.split('').shuffle!.join
392
397
  self
393
398
  end
394
399
 
395
400
  # Method to add two Seq objects.
396
- def +(entry)
397
- new_entry = Seq.new()
398
- new_entry.seq = self.seq + entry.seq
399
- new_entry.type = self.type if self.type == entry.type
400
- new_entry.qual = self.qual + entry.qual if self.qual and entry.qual
401
+ def +(other)
402
+ new_entry = Seq.new
403
+ new_entry.seq = @seq + other.seq
404
+ new_entry.type = @type if @type == other.type
405
+ new_entry.qual = @qual + other.qual if @qual && other.qual
401
406
  new_entry
402
407
  end
403
408
 
404
409
  # Method to concatenate sequence entries.
405
410
  def <<(entry)
406
- raise SeqError, "sequences of different types" unless self.type == entry.type
407
- raise SeqError, "qual is missing in one entry" unless self.qual.class == entry.qual.class
411
+ fail SeqError, 'sequences of different types' unless @type == entry.type
412
+ fail SeqError, 'qual is missing in one entry' unless @qual.class ==
413
+ entry.qual.class
408
414
 
409
- self.seq << entry.seq
410
- self.qual << entry.qual unless entry.qual.nil?
415
+ @seq << entry.seq
416
+ @qual << entry.qual unless entry.qual.nil?
411
417
 
412
418
  self
413
419
  end
@@ -415,18 +421,18 @@ module BioDSL
415
421
  # Index method for Seq objects.
416
422
  def [](*args)
417
423
  entry = Seq.new
418
- entry.seq_name = self.seq_name.dup unless self.seq_name.nil?
419
- entry.seq = self.seq[*args] || ""
420
- entry.type = self.type
421
- entry.qual = self.qual[*args] || "" unless self.qual.nil?
424
+ entry.seq_name = @seq_name.dup unless @seq_name.nil?
425
+ entry.seq = @seq[*args] || ''
426
+ entry.type = @type
427
+ entry.qual = @qual[*args] || '' unless @qual.nil?
422
428
 
423
429
  entry
424
430
  end
425
431
 
426
432
  # Index assignment method for Seq objects.
427
433
  def []=(*args, entry)
428
- self.seq[*args] = entry.seq[*args]
429
- self.qual[*args] = entry.qual[*args] unless self.qual.nil?
434
+ @seq[*args] = entry.seq[*args]
435
+ @qual[*args] = entry.qual[*args] unless @qual.nil?
430
436
 
431
437
  self
432
438
  end
@@ -437,7 +443,7 @@ module BioDSL
437
443
  def composition
438
444
  comp = Hash.new(0);
439
445
 
440
- self.seq.upcase.each_char do |char|
446
+ @seq.upcase.each_char do |char|
441
447
  comp[char] += 1
442
448
  end
443
449
 
@@ -447,30 +453,33 @@ module BioDSL
447
453
  # Method that returns the percentage of hard masked residues
448
454
  # or N's in a sequence.
449
455
  def hard_mask
450
- ((self.seq.upcase.scan("N").size.to_f / (self.len - self.indels).to_f) * 100).round(2)
456
+ ((@seq.upcase.scan('N').size.to_f / (length - indels).to_f) * 100).
457
+ round(2)
451
458
  end
452
459
 
453
460
  # Method that returns the percentage of soft masked residues
454
461
  # or lower cased residues in a sequence.
455
462
  def soft_mask
456
- ((self.seq.scan(/[a-z]/).size.to_f / (self.len - self.indels).to_f) * 100).round(2)
463
+ ((@seq.scan(/[a-z]/).size.to_f / (length - indels).to_f) * 100).round(2)
457
464
  end
458
465
 
459
- # Hard masks sequence residues where the corresponding quality score
460
- # is below a given cutoff.
466
+ # Hard masks sequence residues where the corresponding quality score is below
467
+ # a given cutoff.
461
468
  def mask_seq_hard!(cutoff)
462
- raise SeqError, "seq is nil" if self.seq.nil?
463
- raise SeqError, "qual is nil" if self.qual.nil?
464
- raise SeqError, "cufoff value: #{cutoff} out of range #{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN .. SCORE_MAX).include? cutoff
465
-
466
- na_seq = NArray.to_na(self.seq.upcase, "byte")
467
- na_qual = NArray.to_na(self.qual, "byte")
469
+ fail SeqError, 'seq is nil' if @seq.nil?
470
+ fail SeqError, 'qual is nil' if @qual.nil?
471
+ fail SeqError, "cufoff value: #{cutoff} out of range: " \
472
+ "#{SCORE_MIN}..#{SCORE_MAX}" unless (SCORE_MIN..SCORE_MAX).
473
+ include? cutoff
474
+
475
+ na_seq = NArray.to_na(@seq.upcase, 'byte')
476
+ na_qual = NArray.to_na(@qual, 'byte')
468
477
  mask = (na_qual - SCORE_BASE) < cutoff
469
- mask *= na_seq.ne("-".ord)
478
+ mask *= na_seq.ne('-'.ord)
470
479
 
471
480
  na_seq[mask] = 'N'.ord
472
481
 
473
- self.seq = na_seq.to_s
482
+ @seq = na_seq.to_s
474
483
 
475
484
  self
476
485
  end
@@ -479,18 +488,20 @@ module BioDSL
479
488
  # is below a given cutoff. Masked sequence will be lowercased and
480
489
  # remaining will be uppercased.
481
490
  def mask_seq_soft!(cutoff)
482
- raise SeqError, "seq is nil" if self.seq.nil?
483
- raise SeqError, "qual is nil" if self.qual.nil?
484
- raise SeqError, "cufoff value: #{cutoff} out of range #{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN .. SCORE_MAX).include? cutoff
485
-
486
- na_seq = NArray.to_na(self.seq.upcase, "byte")
487
- na_qual = NArray.to_na(self.qual, "byte")
491
+ fail SeqError, 'seq is nil' if @seq.nil?
492
+ fail SeqError, 'qual is nil' if @qual.nil?
493
+ fail SeqError, "cufoff value: #{cutoff} out of range: " \
494
+ "#{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN..SCORE_MAX).
495
+ include? cutoff
496
+
497
+ na_seq = NArray.to_na(@seq.upcase, 'byte')
498
+ na_qual = NArray.to_na(@qual, 'byte')
488
499
  mask = (na_qual - SCORE_BASE) < cutoff
489
- mask *= na_seq.ne("-".ord)
500
+ mask *= na_seq.ne('-'.ord)
490
501
 
491
502
  na_seq[mask] ^= ' '.ord
492
503
 
493
- self.seq = na_seq.to_s
504
+ @seq = na_seq.to_s
494
505
 
495
506
  self
496
507
  end
@@ -498,22 +509,22 @@ module BioDSL
498
509
  # Method that determines if a quality score string can be
499
510
  # absolutely identified as base 33.
500
511
  def qual_base33?
501
- self.qual.match(/[!-:]/) ? true : false
512
+ @qual.match(/[!-:]/) ? true : false
502
513
  end
503
-
514
+
504
515
  # Method that determines if a quality score string may be base 64.
505
516
  def qual_base64?
506
- self.qual.match(/[K-h]/) ? true : false
517
+ @qual.match(/[K-h]/) ? true : false
507
518
  end
508
519
 
509
520
  # Method to determine if a quality score is valid accepting only 0-40 range.
510
521
  def qual_valid?(encoding)
511
- raise SeqError, "Missing qual" if self.qual.nil?
522
+ fail SeqError, 'Missing qual' if @qual.nil?
512
523
 
513
524
  case encoding
514
- when :base_33 then return true if self.qual.match(/^[!-I]*$/)
515
- when :base_64 then return true if self.qual.match(/^[@-h]*$/)
516
- else raise SeqError, "unknown quality score encoding: #{encoding}"
525
+ when :base_33 then return true if @qual.match(/^[!-I]*$/)
526
+ when :base_64 then return true if @qual.match(/^[@-h]*$/)
527
+ else fail SeqError, "unknown quality score encoding: #{encoding}"
517
528
  end
518
529
 
519
530
  false
@@ -521,28 +532,34 @@ module BioDSL
521
532
 
522
533
  # Method to coerce quality scores to be within the 0-40 range.
523
534
  def qual_coerce!(encoding)
524
- raise SeqError, "Missing qual" if self.qual.nil?
535
+ fail SeqError, 'Missing qual' if @qual.nil?
525
536
 
526
537
  case encoding
527
- when :base_33 then qual_coerce_C(self.qual, self.qual.length, 33, 73) # !-J
528
- when :base_64 then qual_coerce_C(self.qual, self.qual.length, 64, 104) # @-h
538
+ when :base_33 then qual_coerce_C(@qual, @qual.length, 33, 73) # !-J
539
+ when :base_64 then qual_coerce_C(@qual, @qual.length, 64, 104) # @-h
529
540
  else
530
- raise SeqError, "unknown quality score encoding: #{encoding}"
531
- end
541
+ fail SeqError, "unknown quality score encoding: #{encoding}"
542
+ end
532
543
 
533
544
  self
534
545
  end
535
546
 
536
547
  # Method to convert quality scores.
537
548
  def qual_convert!(from, to)
538
- raise SeqError, "unknown quality score encoding: #{from}" unless from == :base_33 or from == :base_64
539
- raise SeqError, "unknown quality score encoding: #{to}" unless to == :base_33 or to == :base_64
540
-
541
- if from == :base_33 and to == :base_64
542
- qual_convert_C(self.qual, self.qual.length, 31) # += 64 - 33
543
- elsif from == :base_64 and to == :base_33
544
- qual_coerce_C(self.qual, self.qual.length, 64, 104) # Handle negative Solexa values from -5 to -1 (set these to 0).
545
- qual_convert_C(self.qual, self.qual.length, -31) # -= 64 - 33
549
+ unless from == :base_33 || from == :base_64
550
+ fail SeqError, "unknown quality score encoding: #{from}"
551
+ end
552
+
553
+ unless to == :base_33 || to == :base_64
554
+ fail SeqError, "unknown quality score encoding: #{to}"
555
+ end
556
+
557
+ if from == :base_33 && to == :base_64
558
+ qual_convert_C(@qual, @qual.length, 31) # += 64 - 33
559
+ elsif from == :base_64 && to == :base_33
560
+ # Handle negative Solexa values from -5 to -1 (set these to 0).
561
+ qual_coerce_C(@qual, @qual.length, 64, 104)
562
+ qual_convert_C(@qual, @qual.length, -31) # -= 64 - 33
546
563
  end
547
564
 
548
565
  self
@@ -550,9 +567,9 @@ module BioDSL
550
567
 
551
568
  # Method to calculate and return the mean quality score.
552
569
  def scores_mean
553
- raise SeqError, "Missing qual in entry" if self.qual.nil?
570
+ fail SeqError, 'Missing qual in entry' if @qual.nil?
554
571
 
555
- na_qual = NArray.to_na(self.qual, "byte")
572
+ na_qual = NArray.to_na(@qual, 'byte')
556
573
  na_qual -= SCORE_BASE
557
574
 
558
575
  na_qual.mean
@@ -560,9 +577,9 @@ module BioDSL
560
577
 
561
578
  # Method to calculate and return the min quality score.
562
579
  def scores_min
563
- raise SeqError, "Missing qual in entry" if self.qual.nil?
580
+ fail SeqError, 'Missing qual in entry' if @qual.nil?
564
581
 
565
- na_qual = NArray.to_na(self.qual, "byte")
582
+ na_qual = NArray.to_na(@qual, 'byte')
566
583
  na_qual -= SCORE_BASE
567
584
 
568
585
  na_qual.min
@@ -570,9 +587,9 @@ module BioDSL
570
587
 
571
588
  # Method to calculate and return the max quality score.
572
589
  def scores_max
573
- raise SeqError, "Missing qual in entry" if self.qual.nil?
590
+ fail SeqError, 'Missing qual in entry' if @qual.nil?
574
591
 
575
- na_qual = NArray.to_na(self.qual, "byte")
592
+ na_qual = NArray.to_na(@qual, 'byte')
576
593
  na_qual -= SCORE_BASE
577
594
 
578
595
  na_qual.max
@@ -582,17 +599,17 @@ module BioDSL
582
599
  # scores string and calculate for each window the mean score and return
583
600
  # the minimum mean score.
584
601
  def scores_mean_local(window_size)
585
- raise SeqError, "Missing qual in entry" if self.qual.nil?
602
+ fail SeqError, 'Missing qual in entry' if @qual.nil?
586
603
 
587
- scores_mean_local_C(self.qual, self.qual.length, SCORE_BASE, window_size)
604
+ scores_mean_local_C(@qual, @qual.length, SCORE_BASE, window_size)
588
605
  end
589
606
 
590
607
  # Method to find open reading frames (ORFs).
591
608
  def each_orf(options = {})
592
- size_min = options[:size_min] || 0
593
- size_max = options[:size_max] || self.length
594
- start_codons = options[:start_codons] || "ATG,GTG,AUG,GUG"
595
- stop_codons = options[:stop_codons] || "TAA,TGA,TAG,UAA,UGA,UAG"
609
+ size_min = options[:size_min] || 0
610
+ size_max = options[:size_max] || length
611
+ start_codons = options[:start_codons] || 'ATG,GTG,AUG,GUG'
612
+ stop_codons = options[:stop_codons] || 'TAA,TGA,TAG,UAA,UGA,UAG'
596
613
  pick_longest = options[:pick_longest]
597
614
 
598
615
  orfs = []
@@ -601,22 +618,23 @@ module BioDSL
601
618
  regex_start = Regexp.new(start_codons.split(',').join('|'), true)
602
619
  regex_stop = Regexp.new(stop_codons.split(',').join('|'), true)
603
620
 
604
- while pos_beg and pos_beg < self.length - size_min
605
- if pos_beg = self.seq.index(regex_start, pos_beg)
606
- if pos_end = self.seq.index(regex_stop, pos_beg)
607
- length = (pos_end - pos_beg) + 3
621
+ while pos_beg && pos_beg < length - size_min
622
+ pos_beg = @seq.index(regex_start, pos_beg)
623
+ next unless pos_beg
624
+ pos_end = @seq.index(regex_stop, pos_beg)
625
+ next unless pos_end
608
626
 
609
- if (length % 3) == 0
610
- if size_min <= length and length <= size_max
611
- subseq = self[pos_beg ... pos_beg + length]
627
+ orf_length = (pos_end - pos_beg) + 3
612
628
 
613
- orfs << Orf.new(subseq, pos_beg, pos_end + 2)
614
- end
615
- end
616
- end
629
+ if (orf_length % 3) == 0
630
+ if size_min <= orf_length && orf_length <= size_max
631
+ subseq = self[pos_beg...pos_beg + orf_length]
617
632
 
618
- pos_beg += 1
633
+ orfs << Orf.new(subseq, pos_beg, pos_end + 2)
634
+ end
619
635
  end
636
+
637
+ pos_beg += 1
620
638
  end
621
639
 
622
640
  if pick_longest
@@ -634,17 +652,8 @@ module BioDSL
634
652
  end
635
653
  end
636
654
 
637
- class Orf
638
- attr_reader :entry, :start, :stop
639
-
640
- def initialize(entry, start, stop)
641
- @entry = entry
642
- @start = start
643
- @stop = stop
644
- end
645
- end
646
-
647
- private
655
+ # Struct for holding an ORF.
656
+ Orf = Struct.new(:entry, :start, :stop)
648
657
 
649
658
  inline do |builder|
650
659
  builder.c %{