BioDSL 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,535 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Grab records in stream.
30
+ #
31
+ # +grab+ selects records from the stream by matching patterns to keys or
32
+ # values. +grab+ is BioDSL's equivalent of Unix's +grep+; however, +grab+
33
+ # is much more versatile.
34
+ #
35
+ # NB! If chaining multiple +grab+ commands then use the most restrictive
36
+ # +grab+ first in order to get the best performance.
37
+ #
38
+ # NB! Avoid using exact with long values because of memory use.
39
+ #
40
+ # == Usage
41
+ #
42
+ # grab(<select: <pattern>|select_file: <file>|reject: <pattern>|
43
+ # reject_file: <file>|evaluate: <expression>|exact: <bool>>
44
+ # [, keys: <list>|keys_only: <bool>|values_only: <bool>|
45
+ # ignore_case: <bool>])
46
+ #
47
+ # === Options
48
+ #
49
+ # * select: <pattern> - Select records matching <pattern> which is
50
+ # a regex or an exact match if the exact option is set.
51
+ # * select_file: <file> - File with one <pattern> per line to select.
52
+ # * reject: <pattern> - Reject records matching <pattern> which is
53
+ # a regex or an exact match if the exact option is set.
54
+ # * reject_file: <file> - File with one <pattern> per line to reject.
55
+ # * evaluate: <expression> - Select records where <expression> is true.
56
+ # * exact: <bool> - Turn on exact matching for improved speed.
57
+ # * keys: <list> - Comma separated list or array of keys to grab
58
+ # the value for.
59
+ # * keys_only: <bool> - Only grab for keys.
60
+ # * values_only: <bool> - Only grab for values.
61
+ # * ignore_case: <bool> - Ignore case when grabbing with regex (does not
62
+ # work with +evaluate+ and +exact+).
63
+ #
64
+ # == Examples
65
+ #
66
+ # To easily grab all records in the stream that have any mention of the
67
+ # pattern 'human' just pipe the data stream through grab like this:
68
+ #
69
+ # grab(select: "human")
70
+ #
71
+ # This will search for the pattern 'human' in all keys and all values. The
72
+ # +select+ option alternatively uses an array of patterns, so in order to
73
+ # match one of multiple patterns do:
74
+ #
75
+ # grab(select: ["human", "mouse"])
76
+ #
77
+ # It is also possible to invoke flexible matching using regex (regular
78
+ # expressions) instead of simple pattern matching. If you want to +grab+
79
+ # records with the sequence +ATCG+ or +GCTA+ you can do this:
80
+ #
81
+ # grab(select: "ATCG|GCTA")
82
+ #
83
+ # Or if you want to +grab+ sequences beginning with +ATCG+:
84
+ #
85
+ # grab(select: "^ATCG")
86
+ #
87
+ # It is also possible to use the +select_file+ option to load patterns from
88
+ # a file with one pattern per line.
89
+ #
90
+ # grab(select_file: "patterns.txt")
91
+ #
92
+ # If you want the opposite result - to find all records that do not match
93
+ # a pattern, use the +reject+ option:
94
+ #
95
+ # grab(reject: "human")
96
+ #
97
+ # Similar to +select_file+ there is a +reject_file+ option to load patterns
98
+ # from a file, and use any of these patterns to reject records:
99
+ #
100
+ # grab(reject_file: "patterns.txt")
101
+ #
102
+ # If you want to search the record keys only, e.g. to +grab+ all records
103
+ # containing the key +SEQ+ you can use the +keys_only+ option. This will
104
+ # prevent matching of +SEQ+ in any record value - since +SEQ+ is a not
105
+ # uncommon peptide sequence, you could otherwise get an unwanted record. Also, this
106
+ # will give an increase in speed since only the keys are searched:
107
+ #
108
+ # grab(select: "SEQ", keys_only: true)
109
+ #
110
+ # However, if you are interested in +grabbing+ the peptide sequence +SEQ+ and
111
+ # not the +SEQ+ key, just use the +values_only+ option:
112
+ #
113
+ # grab(select: "SEQ", values_only: true)
114
+ #
115
+ # Also, if you want to +grab+ for certain key/value pairs you can supply a
116
+ # comma separated list or an array of keys whose values will then be grabbed
117
+ # using the +keys+ option. This is handy if your records contain large
118
+ # genomic sequences and you don't want to search the entire sequence for
119
+ # e.g. the organism name - it is much faster to tell +grab+ which keys to
120
+ # search the value for:
121
+ #
122
+ # grab(select: "human", keys: :SEQ_NAME)
123
+ #
124
+ # You can also use the +evaluate+ option to +grab+ records that fulfill an
125
+ # expression. So to +grab+ all records with a sequence length greater than 30:
126
+ #
127
+ # grab(evaluate: ':SEQ_LEN > 30')
128
+ #
129
+ # If you want to +grab+ all records containing the pattern 'human' and where
130
+ # the sequence length is greater than 30, you do this by running the stream
131
+ # through +grab+ twice:
132
+ #
133
+ # grab(select: 'human').grab(evaluate: ':SEQ_LEN > 30')
134
+ #
135
+ # Finally, it is possible to +grab+ for exact pattern using the +exact+
136
+ # option. This is much faster than the default regex pattern grabbing
137
+ # because with +exact+ the patterns are used to create a lookup hash for
138
+ # instant matching of keys or values. This is useful if you e.g. have a
139
+ # file with ID numbers and you want to +grab+ matching records from the
140
+ # stream:
141
+ #
142
+ # grab(select_file: "ids.txt", keys: :ID, exact: true)
143
+ #
144
+ # rubocop:disable ClassLength
145
class Grab
  # Names of the status counters updated while records are processed.
  STATS = %i(records_in records_out)

  # Constructor for the Grab class.
  #
  # @param [Hash] options Options hash.
  #
  # @option options [String, Array] :select
  #   Patterns or list of patterns to select records.
  #
  # @option options [String] :select_file
  #   File path with patterns, one per line, to select records.
  #
  # @option options [String, Array] :reject
  #   Patterns or list of patterns to reject records.
  #
  # @option options [String] :reject_file
  #   File path with patterns, one per line, to reject records.
  #
  # @option options [String] :evaluate
  #   Expression that is evaluated to select records. Tokens prefixed with
  #   a colon (e.g. +:SEQ_LEN+) are replaced with the record value for
  #   that key before evaluation.
  #
  # @option options [Boolean] :exact
  #   Flag indicating that a given pattern must match over its entire length.
  #
  # @option options [Symbol, Array, String] :keys
  #   Key or list of keys whose key/value pairs to grab for.
  #
  # @option options [Boolean] :keys_only
  #   Flag indicating to grab for key only - not values.
  #
  # @option options [Boolean] :values_only
  #   Flag indicating to grab for values only - not keys.
  #
  # @option options [Boolean] :ignore_case
  #   Flag indicating that pattern matching should be case insensitive.
  #
  # @return [Grab] Returns an instance of the class.
  def initialize(options)
    @options = options

    check_options

    @keys_only = @options[:keys_only]
    @vals_only = @options[:values_only]
    # Any reject option inverts the meaning of a match (see emit_match).
    @invert    = @options[:reject] || @options[:reject_file]
    @eval      = @options[:evaluate]
    # The lookup structures are compiled lazily when the lambda is run.
    @exact     = nil
    @regex     = nil
    @keys      = nil
  end

  # Return a lambda for the grab command.
  #
  # The lambda compiles the key list and the exact/regex lookups once and
  # then tests every record from the input stream, emitting those that
  # qualify to the output stream.
  #
  # @return [Proc] Returns the grab command lambda.
  def lmb
    lambda do |input, output, status|
      # NOTE(review): status_init is defined elsewhere; presumably it
      # sets up @status with the STATS counters - confirm.
      status_init(status, STATS)
      compile_keys
      compile_exact
      compile_regexes

      input.each do |record|
        @status[:records_in] += 1

        # Only one of @exact, @regex and @eval is set for a given run.
        match = case
                when @exact then exact_match? record
                when @regex then regex_match? record
                when @eval  then eval_match? record
                end

        emit_match(output, record, match)
      end
    end
  end

  private

  # Check the options.
  def check_options
    options_allowed(@options, :select, :select_file, :reject, :reject_file,
                    :evaluate, :exact, :keys, :keys_only, :values_only,
                    :ignore_case)
    options_required_unique(@options, :select, :select_file, :reject,
                            :reject_file, :evaluate)
    options_conflict(@options, keys: :evaluate, keys_only: :evaluate,
                               values_only: :evaluate, ignore_case: :evaluate,
                               exact: :evaluate)
    options_unique(@options, :keys_only, :values_only)
    options_files_exist(@options, :select_file, :reject_file)
  end

  # Emit a record to the output stream if a match was found and w/o invert
  # matching, or if no match was found and with invert matching.
  #
  # @param output [Enumerator::Yielder] Output stream.
  # @param record [Hash] Record to emit.
  # @param match [Boolean] Flag indicating a positive match.
  def emit_match(output, record, match)
    # Emit when exactly one of match and @invert is truthy.
    return unless match ? !@invert : @invert

    output << record
    @status[:records_out] += 1
  end

  # Compile a list of keys from the options hash, which may contain either
  # a list of keys, a symbol or a comma separated string of keys. A leading
  # colon on keys given in a string is stripped. For any other type @keys
  # is left as nil, in which case all record keys are searched.
  def compile_keys
    keys = @options[:keys]

    return unless keys

    @keys = case keys
            when Array  then keys.map(&:to_sym)
            when Symbol then [keys]
            when String
              keys.split(/, */).map { |key| key.sub(/^:/, '').to_sym }
            end
  end

  # Compile a list of regexes for matching. Skipped for exact and evaluate
  # grabbing, leaving @regex as nil.
  def compile_regexes
    return if @options[:exact]
    return if @options[:evaluate]

    @regex = []

    compile_regex_patterns(@options[:select])
    compile_regex_patterns(@options[:reject])
    compile_regex_file(@options[:select_file])
    compile_regex_file(@options[:reject_file])
  end

  # Compile a list of regex from a list of given patterns.
  #
  # @param patterns [String, Array] Pattern or list of patterns.
  def compile_regex_patterns(patterns)
    return unless patterns

    [patterns].flatten.each { |pattern| @regex << pattern_regex(pattern) }
  end

  # Compile a list of regex from a given file with one pattern per line.
  # Blank lines are skipped, since an empty regex matches every record.
  #
  # @param file [String] Path to file with patterns.
  def compile_regex_file(file)
    return unless file

    File.foreach(file) do |line|
      pattern = line.chomp

      next if pattern.empty?

      @regex << pattern_regex(pattern)
    end
  end

  # Build a regex from a pattern, case insensitive if the ignore_case
  # option is set.
  #
  # @param pattern [Object] Pattern, interpolated through +to_s+.
  #
  # @return [Regexp] Compiled regex.
  def pattern_regex(pattern)
    flags = @options[:ignore_case] ? Regexp::IGNORECASE : 0

    Regexp.new(pattern.to_s, flags)
  end

  # Compile a lookup hash for fast exact matching. Skipped unless the exact
  # option is set, leaving @exact as nil.
  #
  # @return [Hash, nil] Whatever the last compile step returns; callers in
  #   this class ignore it.
  def compile_exact
    return unless @options[:exact]

    @exact = {}

    compile_exact_patterns(@options[:select])
    compile_exact_patterns(@options[:reject])
    compile_exact_file(@options[:select_file])
    compile_exact_file(@options[:reject_file])
  end

  # Compile a lookup hash for a given list of patterns.
  #
  # @param patterns [Object, Array] Pattern or list of patterns.
  def compile_exact_patterns(patterns)
    return unless patterns

    [patterns].flatten.each { |pattern| @exact[exact_key(pattern)] = true }
  end

  # Compile a lookup hash from a given file with one pattern per line.
  #
  # Each line is converted with String#to_num (defined elsewhere in the
  # project; presumably it returns a number for numeric text and the
  # string itself otherwise - confirm). Numeric patterns are stored under
  # their numeric value so they can match numeric record values, which is
  # how exact_match_values? looks them up.
  #
  # @param file [String] Path to file with patterns.
  def compile_exact_file(file)
    return unless file

    File.foreach(file) do |line|
      # Non-destructive chomp: chomp! returns nil when the last line has no
      # trailing newline.
      @exact[exact_key(line.chomp.to_num)] = true
    end
  end

  # Normalize a pattern or record value to the key form used in the exact
  # lookup hash: strings are stored as symbols, anything else as is.
  #
  # @param value [Object] Pattern or record value.
  #
  # @return [Object] Lookup key.
  def exact_key(value)
    value.is_a?(String) ? value.to_sym : value
  end

  # Match exactly record keys or values
  #
  # @param record [Hash] Record to match.
  #
  # @return [Boolean] True if exact match found.
  def exact_match?(record)
    keys = @keys || record.keys

    if @keys_only
      exact_match_keys?(keys)
    elsif @vals_only
      exact_match_values?(record, keys)
    else
      exact_match_key_values?(record, keys)
    end
  end

  # Match exactly any record keys.
  #
  # @param keys [Array] List of keys to match.
  #
  # @return [Boolean] True if exact match found.
  def exact_match_keys?(keys)
    keys.any? { |key| @exact[key] }
  end

  # Match exactly any record values.
  #
  # @param record [Hash] Record to match.
  # @param keys [Array] List of keys whose values to match.
  #
  # @return [Boolean] True if exact match found.
  def exact_match_values?(record, keys)
    keys.any? { |key| exact_value_hit?(record[key]) }
  end

  # Match exactly any record keys or values.
  #
  # @param record [Hash] Record to match.
  # @param keys [Array] List of keys or values to match.
  #
  # @return [Boolean] True if exact match found.
  def exact_match_key_values?(record, keys)
    keys.any? do |key|
      @exact.include?(key) || exact_value_hit?(record[key])
    end
  end

  # Test if a record value is present in the exact lookup hash. Missing
  # (nil or false) values never match.
  #
  # @param value [Object] Record value.
  #
  # @return [Boolean] True if the value is a known exact pattern.
  def exact_value_hit?(value)
    value ? @exact.include?(exact_key(value)) : false
  end

  # Match using regex record keys or values
  #
  # @param record [Hash] Record to match.
  #
  # @return [Boolean] True if regex match found.
  def regex_match?(record)
    keys = @keys || record.keys

    if @keys_only
      regex_match_keys?(keys)
    elsif @vals_only
      regex_match_values?(record, keys)
    else
      regex_match_key_values?(record, keys)
    end
  end

  # Match using regex any record keys.
  #
  # @param keys [Array] List of keys to match.
  #
  # @return [Boolean] True if regex match found.
  def regex_match_keys?(keys)
    keys.any? { |key| regex_hit?(key) }
  end

  # Match using regex any record values.
  #
  # @param record [Hash] Record to match.
  # @param keys [Array] List of keys whose values to match.
  #
  # @return [Boolean] True if regex match found.
  def regex_match_values?(record, keys)
    keys.any? do |key|
      value = record[key]

      value ? regex_hit?(value) : false
    end
  end

  # Match using regex any record keys or values.
  #
  # @param record [Hash] Record to match.
  # @param keys [Array] List of keys or values to match.
  #
  # @return [Boolean] True if regex match found.
  def regex_match_key_values?(record, keys)
    keys.any? do |key|
      next true if regex_hit?(key)

      value = record[key]

      value ? regex_hit?(value) : false
    end
  end

  # Test the string form of an item against all compiled regexes.
  #
  # @param item [Object] Key or value to test, converted with +to_s+.
  #
  # @return [Boolean] True if any regex matches.
  def regex_hit?(item)
    text = item.to_s

    @regex.any? { |regex| text =~ regex }
  end

  # Match using eval expression on record values.
  #
  # The expression is split on whitespace and every token with a leading
  # colon is replaced by the record value for that key. If any such key is
  # missing from the record, the record does not match.
  #
  # NOTE(review): the expression is passed to +eval+ and thus executes as
  # arbitrary Ruby code - it must only come from a trusted source.
  #
  # @param record [Hash] Record to match.
  #
  # @return [Object] Result of the evaluated expression, or false if a
  #   referenced key is missing from the record.
  def eval_match?(record)
    tokens = @eval.split("\s").map do |token|
      next token unless token[0] == ':'

      value = record[token[1..-1].to_sym]

      return false unless value

      value
    end

    eval tokens.join(' ')
  end
end
535
+ end