BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,150 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Collect OTU data from records in the stream.
30
+ #
31
+ # +collect_otus+ counts the number of times each OTU is found in a set of
32
+ # samples. OTUs are given by the :S_ID key and samples by the :SAMPLE key.
33
+ # If a :SEQ_COUNT key is present it will be used to increment the OTU count,
34
+ # allowing for dereplicated sequences to be used.
35
+ #
36
+ # == Usage
37
+ #
38
+ # collect_otus()
39
+ #
40
+ # === Options
41
+ #
42
+ # == Examples
43
+ #
44
+ class CollectOtus
45
+ require 'set'
46
+
47
+ STATS = %i(records_in records_out hits_in hits_out)
48
+
49
+ # Constructor for CollectOtus.
50
+ #
51
+ # @param options [Hash] Options hash.
52
+ def initialize(options)
53
+ @options = options
54
+
55
+ check_options
56
+ end
57
+
58
+ # Return lambda for CollectOtus command.
59
+ #
60
+ # @return [Proc] Command lambda.
61
+ def lmb
62
+ lambda do |input, output, status|
63
+ status_init(status, STATS)
64
+
65
+ count_hash = process_input(input, output)
66
+ samples = collect_samples(count_hash)
67
+ process_output(count_hash, samples, output)
68
+ end
69
+ end
70
+
71
+ private
72
+
73
+ # Check options.
74
+ def check_options
75
+ options_allowed(@options, nil)
76
+ end
77
+
78
+ # Read input stream and for all hit records add these to the count hash.
79
+ #
80
+ # @param input [Enumerator] Input stream.
81
+ # @param output [Enumerator::Yielder] Output stream.
82
+ #
83
+ # @return [Hash] Returns the count_hash.
84
+ def process_input(input, output)
85
+ count_hash = Hash.new { |h, k| h[k] = Hash.new(0) }
86
+
87
+ input.each do |record|
88
+ @status[:records_in] += 1
89
+
90
+ if record[:TYPE] && record[:TYPE] == 'H'
91
+ add_to_count_hash(count_hash, record)
92
+ end
93
+
94
+ output << record
95
+ @status[:records_out] += 1
96
+ end
97
+
98
+ count_hash
99
+ end
100
+
101
+ # Add to the count_hash a given record.
102
+ #
103
+ # @param count_hash [Hash] Hash with sample counts
104
+ # @param record [Hash] BioDSL record with sample and count.
105
+ def add_to_count_hash(count_hash, record)
106
+ id = record[:S_ID].to_sym
107
+ sample = record[:SAMPLE].upcase.to_sym
108
+ count_hash[id][sample] += (record[:SEQ_COUNT] || 1)
109
+ @status[:hits_in] += 1
110
+ end
111
+
112
+ # Collect all samples in the count_hash into a sorted set.
113
+ #
114
+ # @param count_hash [Hash] Hash with sample counts.
115
+ #
116
+ # @return [SortedSet] Sample names.
117
+ def collect_samples(count_hash)
118
+ samples = SortedSet.new
119
+
120
+ count_hash.values.each do |value|
121
+ value.keys.map { |key| samples << key }
122
+ end
123
+
124
+ samples
125
+ end
126
+
127
+ # Output all samples and counts from the count_hash and samples to the
128
+ # output stream.
129
+ #
130
+ # @param count_hash [Hash] Hash with sample counts
131
+ # @param samples [SortedSet] Set with sample names.
132
+ # @param output [Enumerator::Yielder] Output stream.
133
+ def process_output(count_hash, samples, output)
134
+ count_hash.each do |key, value|
135
+ record = {}
136
+ record[:RECORD_TYPE] = 'OTU'
137
+ record[:OTU] = key.to_s
138
+
139
+ samples.each do |sample|
140
+ record["#{sample}_COUNT".to_sym] = value[sample]
141
+ end
142
+
143
+ output << record
144
+
145
+ @status[:hits_out] += 1
146
+ @status[:records_out] += 1
147
+ end
148
+ end
149
+ end
150
+ end
@@ -0,0 +1,117 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Complement sequences in the stream.
30
+ #
31
+ # +complement_seq+ complements sequences in the stream. The sequence type -
32
+ # DNA or RNA - is guessed by inspecting the first sequence in the stream.
33
+ #
34
+ # +complement_seq+ can be used together with +reverse_seq+ to reverse-
35
+ # complement sequences.
36
+ #
37
+ # == Usage
38
+ #
39
+ # complement_seq()
40
+ #
41
+ # === Options
42
+ #
43
+ # == Examples
44
+ #
45
+ # Consider the following FASTQ entry in the file test.fq:
46
+ #
47
+ # @M02529:88:000000000-AC0WY:1:1101:12879:1928 2:N:0:185
48
+ # TTGTAAAACGACGGCCAGTG
49
+ # +
50
+ # >>>>>FFFFD@A?A0AE0FG
51
+ #
52
+ # To complement the sequence do:
53
+ #
54
+ # BP.new.read_fastq(input:"test.fq").complement_seq.dump.run
55
+ #
56
+ # {:SEQ_NAME=>"M02529:88:000000000-AC0WY:1:1101:12879:1928 2:N:0:185",
57
+ # :SEQ=>"AACATTTTGCTGCCGGTCAC",
58
+ # :SEQ_LEN=>20,
59
+ # :SCORES=>">>>>>FFFFD@A?A0AE0FG"}
60
+ class ComplementSeq
61
+ STATS = %i(records_in records_out sequences_in sequences_out residues_in
62
+ residues_out)
63
+
64
+ # Constructor for ComplementSeq.
65
+ #
66
+ # @param options [Hash] Options hash.
67
+ def initialize(options)
68
+ @options = options
69
+ @type = nil
70
+
71
+ check_options
72
+ end
73
+
74
+ # Return the command lambda for ComplementSeq.
75
+ #
76
+ # @return [Proc] Command lambda
77
+ def lmb
78
+ lambda do |input, output, status|
79
+ status_init(status, STATS)
80
+
81
+ input.each do |record|
82
+ @status[:records_in] += 1
83
+
84
+ complement(record) if record.key? :SEQ
85
+
86
+ output << record
87
+
88
+ @status[:records_out] += 1
89
+ end
90
+ end
91
+ end
92
+
93
+ private
94
+
95
+ # Check options.
96
+ def check_options
97
+ options_allowed(@options, nil)
98
+ end
99
+
100
+ # Complements sequence in record.
101
+ #
102
+ # @param record [Hash] BioDSL record with sequence.
103
+ def complement(record)
104
+ entry = BioDSL::Seq.new_bp(record)
105
+ @type = entry.type_guess unless @type
106
+ entry.type = @type
107
+ entry.complement!
108
+
109
+ @status[:sequences_in] += 1
110
+ @status[:sequences_out] += 1
111
+ @status[:residues_in] += entry.length
112
+ @status[:residues_out] += entry.length
113
+
114
+ record.merge! entry.to_bp
115
+ end
116
+ end
117
+ end
@@ -0,0 +1,135 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ # Namespace for BioDSL.
29
+ module BioDSL
30
+ # == Count the number of records in the stream.
31
+ #
32
+ # +count+ counts the number of records in the stream and outputs the
33
+ # count as a record that is itself _not_ counted. Using the +output+
34
+ # option will output the count in a file as a table with header.
35
+ #
36
+ # == Usage
37
+ #
38
+ # count([output: <file>[, force: <bool>]])
39
+ #
40
+ # === Options
41
+ #
42
+ # * output: <file> - Output file.
43
+ # * force: <bool> - Force overwrite existing output file.
44
+ #
45
+ # == Examples
46
+ #
47
+ # To count the number of records in the file `test.fq`:
48
+ #
49
+ # BP.new.read_fastq(input: "test.fq").count(output: "count.txt").dump.run
50
+ #
51
+ # {:SEQ_NAME=>"ILLUMINA-52179E_0004:2:1:1040:5263#TTAGGC/1",
52
+ # :SEQ=>"TTCGGCATCGGCGGCGACGTTGGCGGCGGGGCCGGGCGGGTCGANNNCAT",
53
+ # :SEQ_LEN=>50,
54
+ # :SCORES=>"GGFBGGEADFAFFDDD,-5AC5?>C:)7?#####################"}
55
+ # {:SEQ_NAME=>"ILLUMINA-52179E_0004:2:1:1041:14486#TTAGGC/1",
56
+ # :SEQ=>"CATGGCGTATGCCAGACGGCCAGAACGATGGCCGCCGGGCTTCANNNAAG",
57
+ # :SEQ_LEN=>50,
58
+ # :SCORES=>"FFFFDBD?EEEEEEEFGGFAGAGEFDF=BFGFFGGDDDD=ABAA######"}
59
+ # {:SEQ_NAME=>"ILLUMINA-52179E_0004:2:1:1043:19446#TTAGGC/1",
60
+ # :SEQ=>"CGGTACTGATCGAGTGTCAGGCTGTTGATCGCCGCGGGCGGGGGTNNGAC",
61
+ # :SEQ_LEN=>50,
62
+ # :SCORES=>"ECAEBEEEEEFFFFFEFFFFDDEEEGGGGGDEBEECBDAE@#########"}
63
+ # {:RECORD_TYPE=>"count", :COUNT=>3}
64
+ #
65
+ # And the count is also saved in the file `count.txt`:
66
+ # #RECORD_TYPE COUNT
67
+ # count 3
68
+ class Count
69
+ STATS = %i(records_in records_out)
70
+
71
+ # Constructor for the count command.
72
+ #
73
+ # @param options [Hash] Options hash.
74
+ # @option options [String] :output Path to output file.
75
+ # @option options [Boolean] :force Force overwrite of output file.
76
+ #
77
+ # @return [Count] Instance of class Count.
78
+ def initialize(options)
79
+ @options = options
80
+
81
+ check_options
82
+ end
83
+
84
+ # Return the command lambda for count.
85
+ #
86
+ # @return [Proc] Command lambda.
87
+ def lmb
88
+ lambda do |input, output, status|
89
+ status_init(status, STATS)
90
+
91
+ process_input(input, output)
92
+
93
+ new_record = {
94
+ RECORD_TYPE: 'count',
95
+ COUNT: @status[:records_in]
96
+ }
97
+
98
+ output << new_record
99
+ @status[:records_out] += 1
100
+
101
+ write_output if @options[:output]
102
+ end
103
+ end
104
+
105
+ private
106
+
107
+ # Check options.
108
+ def check_options
109
+ options_allowed(@options, :output, :force)
110
+ options_allowed_values(@options, force: [true, false, nil])
111
+ options_files_exist_force(@options, :output)
112
+ end
113
+
114
+ # Process the input stream and emit all records to the output stream.
115
+ #
116
+ # @param input [Enumerator] Input stream
117
+ # @param output [Enumerator::Yielder] Output stream
118
+ def process_input(input, output)
119
+ input.each do |record|
120
+ @status[:records_in] += 1
121
+
122
+ output << record
123
+ @status[:records_out] += 1
124
+ end
125
+ end
126
+
127
+ # Write output table to file.
128
+ def write_output
129
+ Filesys.open(@options[:output], 'w') do |ios|
130
+ ios.puts "#RECORD_TYPE\tCOUNT"
131
+ ios.puts "count\t#{@status[:records_in]}"
132
+ end
133
+ end
134
+ end
135
+ end
@@ -0,0 +1,149 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Count the number of times values of given keys exist in the stream.
30
+ #
31
+ # +count_values+ counts the values for a given comma separated list of keys.
32
+ #
33
+ # == Usage
34
+ #
35
+ # count_values(keys: <list>)
36
+ #
37
+ # === Options
38
+ #
39
+ # * keys: <list> - Keys whose values to count.
40
+ #
41
+ # == Examples
42
+ #
43
+ # Consider the following two column table in the file `test.tab`:
44
+ #
45
+ # Human H1
46
+ # Human H2
47
+ # Human H3
48
+ # Dog D1
49
+ # Dog D2
50
+ # Mouse M1
51
+ #
52
+ # To count the values of both columns we first read the table with
53
+ # +read_table+ and then pass the result to +count_values+:
54
+ #
55
+ # BP.new.
56
+ # read_table(input: "test.tab").
57
+ # count_values(keys: [:V0, :V1]).
58
+ # dump.
59
+ # run
60
+ #
61
+ # {:V0=>"Human", :V1=>"H1", :V0_COUNT=>3, :V1_COUNT=>1}
62
+ # {:V0=>"Human", :V1=>"H2", :V0_COUNT=>3, :V1_COUNT=>1}
63
+ # {:V0=>"Human", :V1=>"H3", :V0_COUNT=>3, :V1_COUNT=>1}
64
+ # {:V0=>"Dog", :V1=>"D1", :V0_COUNT=>2, :V1_COUNT=>1}
65
+ # {:V0=>"Dog", :V1=>"D2", :V0_COUNT=>2, :V1_COUNT=>1}
66
+ # {:V0=>"Mouse", :V1=>"M1", :V0_COUNT=>1, :V1_COUNT=>1}
67
+ class CountValues
68
+ STATS = %i(records_in records_out)
69
+
70
+ # Constructor for CountValues.
71
+ #
72
+ # @param options [Hash] Options hash.
73
+ # @option options [Array] :keys List of keys whose values to count.
74
+ #
75
+ # @return [CountValues] Instance of class.
76
+ def initialize(options)
77
+ @options = options
78
+
79
+ check_options
80
+
81
+ @keys = @options[:keys].map(&:to_sym)
82
+ @count_hash = Hash.new { |h, k| h[k] = Hash.new(0) }
83
+ end
84
+
85
+ # Return the command lambda for the count_values command.
86
+ #
87
+ # @return [Proc] Return command lambda.
88
+ def lmb
89
+ lambda do |input, output, status|
90
+ status_init(status, STATS)
91
+
92
+ TmpDir.create('count_values') do |tmp_file, _|
93
+ process_input(input, tmp_file)
94
+ process_output(output, tmp_file)
95
+ end
96
+ end
97
+ end
98
+
99
+ private
100
+
101
+ # Check options.
102
+ def check_options
103
+ options_allowed(@options, :keys)
104
+ options_required(@options, :keys)
105
+ end
106
+
107
+ # Save serialized stream to a temporary file while counting the requested
108
+ # values.
109
+ #
110
+ # @param input [Enumerator] Input stream.
111
+ # @param tmp_file [String] Path to temp file.
112
+ def process_input(input, tmp_file)
113
+ File.open(tmp_file, 'wb') do |ios|
114
+ BioDSL::Serializer.new(ios) do |s|
115
+ input.each do |record|
116
+ @keys.map do |key|
117
+ @count_hash[key][record[key]] += 1 if record.key? key
118
+ end
119
+
120
+ @status[:records_in] += 1
121
+
122
+ s << record
123
+ end
124
+ end
125
+ end
126
+ end
127
+
128
+ # Output serialized stream to the output stream including value counts.
129
+ #
130
+ # @param output [Enumerator::Yielder] Output stream.
131
+ # @param tmp_file [String] Path to temp file with serialized input stream.
132
+ def process_output(output, tmp_file)
133
+ File.open(tmp_file, 'rb') do |ios|
134
+ BioDSL::Serializer.new(ios) do |s|
135
+ s.each do |record|
136
+ @keys.map do |key|
137
+ if record.key? key
138
+ record["#{key}_COUNT".to_sym] = @count_hash[key][record[key]]
139
+ end
140
+ end
141
+
142
+ output << record
143
+ @status[:records_out] += 1
144
+ end
145
+ end
146
+ end
147
+ end
148
+ end
149
+ end