BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,194 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Align sequences in the stream using Mothur.
30
+ #
31
+ # This is a wrapper for the +mothur+ command +align.seqs()+. Basically,
32
+ # it aligns sequences to a reference alignment.
33
+ #
34
+ # Please refer to the manual:
35
+ #
36
+ # http://www.mothur.org/wiki/Align.seqs
37
+ #
38
+ # Mothur must be installed for +align_seq_mothur+ to work. Read more here:
39
+ #
40
+ # http://www.mothur.org/
41
+ #
42
+ # == Usage
43
+ #
44
+ # align_seq_mothur(<template_file: <file>>[, cpus: <uint>])
45
+ #
46
+ # === Options
47
+ #
48
+ # * template_file: <file> - File with template alignment in FASTA format.
49
+ # * cpus: <uint> - Number of CPU cores to use (default=1).
50
+ #
51
+ # == Examples
52
+ #
53
+ # To align the entries in the FASTA file `test.fna` to the template alignment
54
+ # in the file `template.fna` do:
55
+ #
56
+ # BP.new.
57
+ # read_fasta(input: "test.fna").
58
+ # align_seq_mothur(template_file: "template.fna").
59
+ # run
60
+ class AlignSeqMothur
61
+ require 'English'
62
+ require 'BioDSL/helpers/aux_helper'
63
+
64
+ include AuxHelper
65
+
66
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
67
+ residues_out)
68
+
69
+ # Constructor for the AlignSeqMothur class.
70
+ #
71
+ # @param [Hash] options Options hash.
72
+ # @option options [String] :template_file Path to template file.
73
+ # @option options [Integer] :cpus Number of CPUs to use.
74
+ #
75
+ # @return [AlignSeqMothur] Returns an instance of the class.
76
+ def initialize(options)
77
+ @options = options
78
+
79
+ aux_exist('mothur')
80
+ check_options
81
+ defaults
82
+ end
83
+
84
+ # Return a lambda for the align_seq_mothur command.
85
+ #
86
+ # @return [Proc] Returns the align_seq_mothur command lambda.
87
+ def lmb
88
+ lambda do |input, output, status|
89
+ status_init(status, STATS)
90
+
91
+ TmpDir.create('input.fna', 'input.align') do |tmp_in, tmp_out, tmp_dir|
92
+ process_input(input, output, tmp_in)
93
+ run_mothur(@options[:template_file], @options[:cpus], tmp_dir, tmp_in)
94
+ process_output(output, tmp_out)
95
+ end
96
+ end
97
+ end
98
+
99
+ private
100
+
101
+ # Check the options.
102
+ def check_options
103
+ options_allowed(@options, :template_file, :cpus)
104
+ options_required(@options, :template_file)
105
+ options_files_exist(@options, :template_file)
106
+ options_assert(@options, ':cpus >= 1')
107
+ options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")
108
+ end
109
+
110
+ # Set default options.
111
+ def defaults
112
+ @options[:cpus] ||= 1
113
+ end
114
+
115
+ # Process all records in the input stream and write those with sequences to
116
+ # file and all other records to the output stream.
117
+ #
118
+ # @param input [BioDSL::Stream] The input stream.
119
+ # @param output [BioDSL::Stream] The output stream.
120
+ # @param tmp_in [String] Path to temporary file.
121
+ def process_input(input, output, tmp_in)
122
+ BioDSL::Fasta.open(tmp_in, 'w') do |ios|
123
+ input.each_with_index do |record, i|
124
+ @status[:records_in] += 1
125
+
126
+ if record[:SEQ]
127
+ write_entry(ios, record, i)
128
+ else
129
+ output << record
130
+ @status[:records_out] += 1
131
+ end
132
+ end
133
+ end
134
+ end
135
+
136
+ # Write a record containing sequence information to a FASTA file IO handle.
137
+ # If no sequence_name is found in the record use the sequence index
138
+ # instead.
139
+ #
140
+ # @param ios [Fasta::IO] FASTA IO.
141
+ # @param record [Hash] BioDSL record to create FASTA entry from.
142
+ # @param i [Integer] Sequence index.
143
+ def write_entry(ios, record, i)
144
+ seq_name = record[:SEQ_NAME] || i.to_s
145
+ entry = BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ])
146
+
147
+ @status[:sequences_in] += 1
148
+ @status[:residues_in] += entry.length
149
+
150
+ ios.puts entry.to_fasta
151
+ end
152
+
153
+ # Read all FASTA entries from output file and emit to the output stream.
154
+ #
155
+ # @param output [BioDSL::Stream] The output stream.
156
+ # @param tmp_out [String] Path to temporary file.
157
+ def process_output(output, tmp_out)
158
+ BioDSL::Fasta.open(tmp_out) do |ios|
159
+ ios.each do |entry|
160
+ output << entry.to_bp
161
+ @status[:records_out] += 1
162
+ @status[:sequences_out] += 1
163
+ @status[:residues_out] += entry.length
164
+ end
165
+ end
166
+ end
167
+
168
+ # Run Mothur using a system call.
169
+ #
170
+ # @param template_file [String] Path to template file.
171
+ # @param cpus [Integer] Number of CPUs to use.
172
+ # @param tmp_dir [String] Path to temporary dir.
173
+ # @param tmp_in [String] Path to temporary file.
174
+ #
175
+ # @raise [RuntimeError] If system call fails.
176
+ def run_mothur(template_file, cpus, tmp_dir, tmp_in)
177
+ cmd = <<-CMD.gsub(/^\s+\|/, '').delete("\n")
178
+ |mothur "#set.dir(input=#{tmp_dir});
179
+ |set.dir(output=#{tmp_dir});
180
+ |align.seqs(candidate=#{tmp_in},
181
+ |template=#{template_file},
182
+ |processors=#{cpus})"
183
+ CMD
184
+
185
+ if BioDSL.verbose
186
+ system(cmd)
187
+ else
188
+ system("#{cmd} > /dev/null 2>&1")
189
+ end
190
+
191
+ fail 'Mothur failed' unless $CHILD_STATUS.success?
192
+ end
193
+ end
194
+ end
@@ -0,0 +1,222 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Analyze the residue distribution from sequences in the stream.
30
+ #
31
+ # +analyze_residue_distribution+ determines the distribution per position
32
+ # of residues from sequences and outputs records per observed residue with
33
+ # counts at the different positions. Using the +percent+ option outputs the
34
+ # count as percentages of observed residues per position.
35
+ #
36
+ # The records output looks like this:
37
+ #
38
+ # {:RECORD_TYPE=>"residue distribution",
39
+ # :V0=>"A",
40
+ # :V1=>5,
41
+ # :V2=>0,
42
+ # :V3=>0,
43
+ # :V4=>0}
44
+ #
45
+ # Which are ready for +write_table+. See examples.
46
+ #
47
+ # == Usage
48
+ #
49
+ # analyze_residue_distribution([percent: <bool>])
50
+ #
51
+ # === Options
52
+ #
53
+ # * percent: <bool> - Output distributions in percent (default=false).
54
+ #
55
+ # == Examples
56
+ #
57
+ # Consider the following entries in the file `test.fna`:
58
+ #
59
+ # >DNA
60
+ # AGCT
61
+ # >RNA
62
+ # AGCU
63
+ # >Protein
64
+ # FLS*
65
+ # >Gaps
66
+ # -.~
67
+ #
68
+ # Now we run the data through the following pipeline and get the resulting
69
+ # table:
70
+ #
71
+ # BP.new.
72
+ # read_fasta(input: "test.fna").
73
+ # analyze_residue_distribution.
74
+ # grab(select: "residue").
75
+ # write_table(skip: [:RECORD_TYPE]).
76
+ # run
77
+ #
78
+ # A 2 0 0 0
79
+ # G 0 2 0 0
80
+ # C 0 0 2 0
81
+ # T 0 0 0 1
82
+ # U 0 0 0 1
83
+ # F 1 0 0 0
84
+ # L 0 1 0 0
85
+ # S 0 0 1 0
86
+ # * 0 0 0 1
87
+ # - 1 0 0 0
88
+ # . 0 1 0 0
89
+ # ~ 0 0 1 0
90
+ #
91
+ # Here we do the same as above, but output percentages instead of absolute
92
+ # counts:
93
+ #
94
+ # BP.new.
95
+ # read_fasta(input: "test.fna").
96
+ # analyze_residue_distribution(percent: true).
97
+ # grab(select: "residue").
98
+ # write_table(skip: [:RECORD_TYPE]).
99
+ # run
100
+ #
101
+ # A 50 0 0 0
102
+ # G 0 50 0 0
103
+ # C 0 0 50 0
104
+ # T 0 0 0 33
105
+ # U 0 0 0 33
106
+ # F 25 0 0 0
107
+ # L 0 25 0 0
108
+ # S 0 0 25 0
109
+ # * 0 0 0 33
110
+ # - 25 0 0 0
111
+ # . 0 25 0 0
112
+ # ~ 0 0 25 0
113
+ class AnalyzeResidueDistribution
114
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
115
+ residues_out)
116
+
117
+ # Constructor for the AnalyzeResidueDistribution class.
118
+ #
119
+ # @param [Hash] options Options hash.
120
+ # @option options [Boolean] :percent Output distribution in percent.
121
+ #
122
+ # @return [AnalyzeResidueDistribution] Returns an instance of the class.
123
+ def initialize(options)
124
+ @options = options
125
+
126
+ check_options
127
+
128
+ @counts = Hash.new { |h, k| h[k] = Hash.new(0) }
129
+ @total = Hash.new(0)
130
+ @residues = Set.new
131
+ end
132
+
133
+ # Return a lambda for the analyze_residue_distribution command.
134
+ #
135
+ # @return [Proc] Returns the analyze_residue_distribution command lambda.
136
+ def lmb
137
+ require 'set'
138
+
139
+ lambda do |input, output, status|
140
+ status_init(status, STATS)
141
+
142
+ input.each do |record|
143
+ @status[:records_in] += 1
144
+
145
+ analyze_residues(record[:SEQ]) if record[:SEQ]
146
+
147
+ if output
148
+ output << record
149
+ @status[:records_out] += 1
150
+ end
151
+ end
152
+
153
+ calc_dist(output)
154
+ end
155
+ end
156
+
157
+ private
158
+
159
+ # Check the options.
160
+ def check_options
161
+ options_allowed(@options, :percent)
162
+ options_allowed_values(@options, percent: [nil, true, false])
163
+ end
164
+
165
+ # Analyze the residue distribution of a given sequence.
166
+ #
167
+ # @param seq [String] - Sequence to analyze.
168
+ def analyze_residues(seq)
169
+ @status[:sequences_in] += 1
170
+ @status[:sequences_out] += 1
171
+ @status[:residues_in] += seq.length
172
+ @status[:residues_out] += seq.length
173
+
174
+ seq.upcase.chars.each_with_index do |char, i|
175
+ c = char.to_sym
176
+ @counts[i][c] += 1
177
+ @total[i] += 1
178
+ @residues.add(c)
179
+ end
180
+ end
181
+
182
+ # Calculate the residue distribution.
183
+ #
184
+ # @param output [BioDSL::Stream] Output stream.
185
+ def calc_dist(output)
186
+ @residues.each do |res|
187
+ record = {}
188
+ record[:RECORD_TYPE] = 'residue distribution'
189
+ record[:V0] = res.to_s
190
+
191
+ if @options[:percent]
192
+ calc_dist_percent(record, res)
193
+ else
194
+ calc_dist_count(record, res)
195
+ end
196
+
197
+ output << record
198
+ end
199
+ end
200
+
201
+ # Calculate the residue distribution in percent for a given residue.
202
+ #
203
+ # @param record [Hash] BioDSL record.
204
+ # @param res [Symbol] Residue.
205
+ def calc_dist_percent(record, res)
206
+ @counts.each do |pos, dist|
207
+ value = (@total[pos] == 0) ? 0 : 100 * dist[res] / @total[pos]
208
+ record["V#{pos + 1}".to_sym] = value
209
+ end
210
+ end
211
+
212
+ # Calculate the residue distribution for a given residue.
213
+ #
214
+ # @param record [Hash] BioDSL record.
215
+ # @param res [Symbol] Residue.
216
+ def calc_dist_count(record, res)
217
+ @counts.each do |pos, dist|
218
+ record["V#{pos + 1}".to_sym] = dist[res]
219
+ end
220
+ end
221
+ end
222
+ end