BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,175 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
module BioDSL
  # == Merge pair-end sequences in the stream.
  #
  # +merge_pair_seq+ merges paired sequences in the stream, if these are
  # interleaved. Sequence names must be in either Illumina1.3/1.5 format
  # trailing a /1 or /2 or Illumina1.8 containing 1: or 2:. Sequence names must
  # match accordingly in order to merge sequences.
  #
  # Records are consumed two at a time. A pair where either record lacks a
  # :SEQ value, and a trailing unpaired record at the end of the stream, are
  # passed through to the output unchanged.
  #
  # == Usage
  #
  #    merge_pair_seq
  #
  # === Options
  #
  # == Examples
  #
  # Consider the following FASTQ entry in the file test.fq:
  #
  #    @M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14
  #    TGGGGAATATTGGACAATGG
  #    +
  #    <??????BDDDDDDDDGGGG
  #    @M01168:16:000000000-A1R9L:1:1101:14862:1868 2:N:0:14
  #    CCTGTTTGCTACCCACGCTT
  #    +
  #    ?????BB<-<BDDDDDFEEF
  #    @M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14
  #    TAGGGAATCTTGCACAATGG
  #    +
  #    <???9?BBBDBDDBDDFFFF
  #    @M01168:16:000000000-A1R9L:1:1101:13906:2139 2:N:0:14
  #    ACTCTTCGCTACCCATGCTT
  #    +
  #    ,5<??BB?DDABDBDDFFFF
  #    @M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14
  #    TAGGGAATCTTGCACAATGG
  #    +
  #    ?????BBBBBDDBDDBFFFF
  #    @M01168:16:000000000-A1R9L:1:1101:14865:2158 2:N:0:14
  #    CCTCTTCGCTACCCATGCTT
  #    +
  #    ??,<??B?BB?BBBBBFF?F
  #
  # To merge these interleaved pair-end sequences use merge_pair_seq:
  #
  #    BP.new.
  #    read_fastq(input: "test.fq", encoding: :base_33).
  #    merge_pair_seq.
  #    dump.
  #    run
  #
  #    {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14862:1868 1:N:0:14",
  #     :SEQ=>"TGGGGAATATTGGACAATGGCCTGTTTGCTACCCACGCTT",
  #     :SEQ_LEN=>40,
  #     :SCORES=>"<??????BDDDDDDDDGGGG?????BB<-<BDDDDDFEEF",
  #     :SEQ_LEN_LEFT=>20,
  #     :SEQ_LEN_RIGHT=>20}
  #    {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:13906:2139 1:N:0:14",
  #     :SEQ=>"TAGGGAATCTTGCACAATGGACTCTTCGCTACCCATGCTT",
  #     :SEQ_LEN=>40,
  #     :SCORES=>"<???9?BBBDBDDBDDFFFF,5<??BB?DDABDBDDFFFF",
  #     :SEQ_LEN_LEFT=>20,
  #     :SEQ_LEN_RIGHT=>20}
  #    {:SEQ_NAME=>"M01168:16:000000000-A1R9L:1:1101:14865:2158 1:N:0:14",
  #     :SEQ=>"TAGGGAATCTTGCACAATGGCCTCTTCGCTACCCATGCTT",
  #     :SEQ_LEN=>40,
  #     :SCORES=>"?????BBBBBDDBDDBFFFF??,<??B?BB?BBBBBFF?F",
  #     :SEQ_LEN_LEFT=>20,
  #     :SEQ_LEN_RIGHT=>20}
  class MergePairSeq
    STATS = %i(records_in records_out sequences_in sequences_out residues_in
               residues_out)

    # Constructor for MergePairSeq.
    #
    # @param options [Hash] Options hash.
    #
    # @return [MergePairSeq] Instance of MergePairSeq.
    def initialize(options)
      @options = options

      check_options
    end

    # Return the command lambda for merge_pair_seq.
    #
    # The lambda reads records from +input+ two at a time. When both records
    # of a slice carry a :SEQ value they are merged into one output record;
    # otherwise every record present in the slice is written to +output+
    # unchanged.
    #
    # @return [Proc] Command lambda for merge_pair_seq.
    def lmb
      lambda do |input, output, status|
        status_init(status, STATS)

        input.each_slice(2) do |record1, record2|
          # record2 is nil for the last slice of an odd-length stream.
          @status[:records_in] += record2 ? 2 : 1

          if record2 && record1[:SEQ] && record2[:SEQ]
            output << merge_pair_seq(record1, record2)

            @status[:sequences_in]  += 2
            @status[:sequences_out] += 1
            @status[:records_out]   += 1
          else
            pass_through(output, record1, record2)
          end
        end
      end
    end

    private

    # Check options. No options are allowed for this command.
    def check_options
      options_allowed(@options, nil)
    end

    # Write the records that could not be merged to the output unchanged,
    # counting only the records that actually exist.
    #
    # @param output  [#<<]       Output stream.
    # @param record1 [Hash]      BioDSL record 1.
    # @param record2 [Hash, nil] BioDSL record 2, or nil if unpaired.
    def pass_through(output, record1, record2)
      [record1, record2].compact.each do |record|
        output << record

        @status[:records_out] += 1
      end
    end

    # Merge entry pair and return a new BioDSL record with this.
    #
    # @param record1 [Hash] BioDSL record 1.
    # @param record2 [Hash] BioDSL record 2.
    #
    # @return [Hash] BioDSL record.
    def merge_pair_seq(record1, record2)
      entry1 = BioDSL::Seq.new_bp(record1)
      entry2 = BioDSL::Seq.new_bp(record2)

      BioDSL::Seq.check_name_pair(entry1, entry2)

      # Capture the individual lengths before entry2 is appended to entry1.
      length1 = entry1.length
      length2 = entry2.length

      @status[:residues_in] += length1 + length2

      entry1 << entry2

      @status[:residues_out] += entry1.length

      new_record(entry1, length1, length2)
    end

    # Convert the merged entry to a BioDSL record annotated with the lengths
    # of the two sequences it was built from.
    #
    # @param entry1  [BioDSL::Seq] Merged sequence entry.
    # @param length1 [Integer]     Length of the left sequence.
    # @param length2 [Integer]     Length of the right sequence.
    #
    # @return [Hash] BioDSL record.
    def new_record(entry1, length1, length2)
      record = entry1.to_bp
      record[:SEQ_LEN_LEFT]  = length1
      record[:SEQ_LEN_RIGHT] = length2
      record
    end
  end
end
@@ -0,0 +1,225 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
module BioDSL
  # == Merge records on a given key with tabular data from one or more files.
  #
  # +merge_table+ reads in one or more tabular files and merges any records in
  # the stream with identical values for a given key. The values for the given
  # key must be unique in the tabular files, but not necessarily in the stream.
  #
  # Consult *read_table* for details on how the tabular files are read.
  #
  # The stats for +merge_table+ includes the following values:
  #
  # * rows_total     - total number of table rows.
  # * rows_matched   - number of table rows with the given key.
  # * rows_unmatched - number of table rows without the given key.
  # * merged         - number of records that was merged.
  # * non_merged     - number of records that was not merged.
  #
  # == Usage
  #    merge_table(<input: <glob>>, <key: <string>>[, columns: <list>
  #                [, keys: <list>[, skip: <uint>[, delimiter: <string>]]]])
  #
  # === Options
  # * input <glob>       - Input file or file glob expression.
  # * key <string>       - Key used to merge
  # * columns <list>     - List of columns to read in that order.
  # * keys <list>        - List of key identifiers to use for each column.
  # * skip <uint>        - Number of initial lines to skip (default=0).
  # * delimiter <string> - Delimiter to use for separating columns
  #                        (default="\s+").
  #
  # == Examples
  #
  # Consider the following two files:
  #
  # test1.tab:
  #    #ID ORGANISM
  #    1 parrot
  #    2 eel
  #    3 platypus
  #    4 beetle
  #
  # test2.tab:
  #
  #    #ID COUNT
  #    1 5423
  #    2 34
  #    3 2423
  #    4 234
  #
  # We can merge the data with +merge_table+ like this:
  #
  #    BP.new.
  #    read_table(input: "test1.tab").
  #    merge_table(input: "test2.tab", key: :ID).
  #    dump.
  #    run
  #
  #    {:ID=>1, :ORGANISM=>"parrot", :COUNT=>5423}
  #    {:ID=>2, :ORGANISM=>"eel", :COUNT=>34}
  #    {:ID=>3, :ORGANISM=>"platypus", :COUNT=>2423}
  #    {:ID=>4, :ORGANISM=>"beetle", :COUNT=>234}
  class MergeTable
    STATS = %i(records_in records_out rows_total rows_matched rows_unmatched
               merged non_merged)

    # Constructor for MergeTable.
    #
    # @param options [Hash]
    #   Options hash.
    #
    # @option options [String] :input
    #   Input glob expression.
    #
    # @option options [String, Symbol] :key
    #   Key used to merge.
    #
    # @option options [Array] :keys
    #   List of key identifiers to use for each column.
    #
    # @option options [Array] :columns
    #   List of columns to read in that order.
    #
    # @option options [Integer] :skip
    #   Number of initial lines to skip.
    #
    # @option options [String] :delimiter
    #   Delimiter to use for separating columns.
    #
    # @return [MergeTable] Class instance.
    def initialize(options)
      @options = options

      check_options
      defaults

      @table = {}
      @key   = @options[:key].to_sym
      @keys  = @options[:keys] ? @options[:keys].map(&:to_sym) : nil
    end

    # Return command lambda for merge_table.
    #
    # The lambda first loads all table rows into memory, then merges each
    # stream record whose value for the merge key is found in the table. All
    # records, merged or not, are written to the output.
    #
    # @return [Proc] Command lambda.
    def lmb
      lambda do |input, output, status|
        status_init(status, STATS)

        parse_input_tables

        input.each do |record|
          @status[:records_in] += 1

          row = record[@key] && @table[record[@key]]

          if row
            @status[:merged] += 1
            record = record.merge(row)
          else
            @status[:non_merged] += 1
          end

          output << record
          @status[:records_out] += 1
        end

        @status[:rows_total] = @status[:rows_matched] + @status[:rows_unmatched]
      end
    end

    private

    # Check options.
    def check_options
      options_allowed(@options, :input, :key, :keys, :columns, :skip,
                      :delimiter)
      options_required(@options, :input, :key)
      options_files_exist(@options, :input)
      options_list_unique(@options, :keys, :columns)
      options_assert(@options, ':skip >= 0')
    end

    # Set default options.
    def defaults
      @options[:skip] ||= 0
    end

    # Parse input table files and add each row to a table hash.
    def parse_input_tables
      options_glob(@options[:input]).each do |file|
        BioDSL::CSV.open(file) do |ios|
          ios.skip(@options[:skip])

          ios.each_hash(delimiter: @options[:delimiter],
                        select: @options[:columns]) do |record|
            trim_record(record) if @keys

            add_row(record)
          end
        end
      end
    end

    # Rename the leading columns of a given record, in place, using the
    # identifiers in @keys: the value of the i'th column is stored under
    # @keys[i]. Columns beyond @keys.size are left untouched and keep their
    # position before the renamed ones.
    #
    # All leading columns are removed before any renamed pair is inserted so
    # that a new identifier equal to the name of another leading column (e.g.
    # swapping two columns) cannot cause a value to be deleted again.
    #
    # @param record [Hash] BioDSL record.
    #
    # @return [Hash] The modified record.
    def trim_record(record)
      leading = record.first(@keys.size)

      leading.each { |name, _| record.delete(name) }

      leading.each_with_index do |(_, value), i|
        record[@keys[i]] = value
      end

      record
    end

    # Add a given record to the table hash.
    #
    # Rows lacking a value for the merge key are only counted as unmatched.
    #
    # @param record [Hash] BioDSL record.
    #
    # @raise [RuntimeError] if duplicate values are found.
    def add_row(record)
      if record[@key]
        check_duplicate(record)

        @status[:rows_matched] += 1

        @table[record[@key]] = record
      else
        @status[:rows_unmatched] += 1
      end
    end

    # Check if a given record is already added to the table and raise if so.
    #
    # @param record [Hash] BioDSL record.
    #
    # @raise [RuntimeError] if duplicate values are found.
    def check_duplicate(record)
      return unless @table[record[@key]]
      fail "Duplicate values found for key: #{@key} value: #{record[@key]}"
    end
  end
end
@@ -0,0 +1,113 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
module BioDSL
  # == Merge values of specified keys.
  #
  # +merge_values+ merges the values of a list of keys using a given delimiter
  # and saves the new value as the value of the first key.
  #
  # Records lacking one or more of the specified keys are passed through
  # unchanged.
  #
  # == Usage
  #
  #    merge_values(<keys: <list>>[, delimiter: <string>])
  #
  # === Options
  #
  # * keys: <list>        - List of keys to merge.
  # * delimiter: <string> - Delimiter (default='_').
  #
  # == Examples
  #
  # Consider the following record:
  #
  #    {ID: "FOO", COUNT: 10, SEQ: "gataag"}
  #
  # To merge the values so that the COUNT and ID is merged in that order do:
  #
  #    merge_values(keys: [:COUNT, :ID])
  #
  #    {:ID=>"FOO", :COUNT=>"10_FOO", :SEQ=>"gataag"}
  #
  # Changing the +delimiter+ and order:
  #
  #    merge_values(keys: [:ID, :COUNT], delimiter: ':count=')
  #
  #    {:ID=>"FOO:count=10", :COUNT=>10, :SEQ=>"gataag"}
  class MergeValues
    STATS = %i(records_in records_out)

    # Constructor for MergeValues.
    #
    # @param options [Hash] Options hash.
    # @option options [Array] :keys Keys whose values to merge.
    # @option options [String] :delimiter Delimiter for joining.
    #
    # @return [MergeValues] Class instance of MergeValues.
    def initialize(options)
      @options = options
      check_options
      defaults

      # Normalise once so that the presence check, the value lookup and the
      # assignment below all address the record with the same symbol keys.
      @keys      = @options[:keys].map(&:to_sym)
      @delimiter = @options[:delimiter]
    end

    # Return command lambda for merge_values.
    #
    # For every record containing all of the specified keys, the values are
    # joined with the delimiter and stored under the first key. All records
    # are written to the output.
    #
    # @return [Proc] Command lambda.
    def lmb
      lambda do |input, output, status|
        status_init(status, STATS)

        input.each do |record|
          @status[:records_in] += 1

          if @keys.all? { |key| record.key? key }
            record[@keys.first] = record.values_at(*@keys).join(@delimiter)
          end

          output << record
          @status[:records_out] += 1
        end
      end
    end

    private

    # Check options.
    def check_options
      options_allowed(@options, :keys, :delimiter)
      options_required(@options, :keys)
    end

    # Set default options.
    def defaults
      @options[:delimiter] ||= '_'
    end
  end
end