BioDSL 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (197) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/BioDSL.gemspec +64 -0
  4. data/LICENSE +339 -0
  5. data/README.md +205 -0
  6. data/Rakefile +94 -0
  7. data/examples/fastq_to_fasta.rb +8 -0
  8. data/lib/BioDSL/cary.rb +242 -0
  9. data/lib/BioDSL/command.rb +133 -0
  10. data/lib/BioDSL/commands/add_key.rb +110 -0
  11. data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
  12. data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
  13. data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
  14. data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
  15. data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
  16. data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
  17. data/lib/BioDSL/commands/classify_seq.rb +217 -0
  18. data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
  19. data/lib/BioDSL/commands/clip_primer.rb +318 -0
  20. data/lib/BioDSL/commands/cluster_otus.rb +181 -0
  21. data/lib/BioDSL/commands/collapse_otus.rb +170 -0
  22. data/lib/BioDSL/commands/collect_otus.rb +150 -0
  23. data/lib/BioDSL/commands/complement_seq.rb +117 -0
  24. data/lib/BioDSL/commands/count.rb +135 -0
  25. data/lib/BioDSL/commands/count_values.rb +149 -0
  26. data/lib/BioDSL/commands/degap_seq.rb +253 -0
  27. data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
  28. data/lib/BioDSL/commands/dump.rb +157 -0
  29. data/lib/BioDSL/commands/filter_rrna.rb +239 -0
  30. data/lib/BioDSL/commands/genecall.rb +237 -0
  31. data/lib/BioDSL/commands/grab.rb +535 -0
  32. data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
  33. data/lib/BioDSL/commands/mask_seq.rb +175 -0
  34. data/lib/BioDSL/commands/mean_scores.rb +168 -0
  35. data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
  36. data/lib/BioDSL/commands/merge_table.rb +225 -0
  37. data/lib/BioDSL/commands/merge_values.rb +113 -0
  38. data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
  39. data/lib/BioDSL/commands/plot_histogram.rb +306 -0
  40. data/lib/BioDSL/commands/plot_matches.rb +282 -0
  41. data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
  42. data/lib/BioDSL/commands/plot_scores.rb +285 -0
  43. data/lib/BioDSL/commands/random.rb +153 -0
  44. data/lib/BioDSL/commands/read_fasta.rb +222 -0
  45. data/lib/BioDSL/commands/read_fastq.rb +414 -0
  46. data/lib/BioDSL/commands/read_table.rb +329 -0
  47. data/lib/BioDSL/commands/reverse_seq.rb +113 -0
  48. data/lib/BioDSL/commands/slice_align.rb +400 -0
  49. data/lib/BioDSL/commands/slice_seq.rb +151 -0
  50. data/lib/BioDSL/commands/sort.rb +223 -0
  51. data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
  52. data/lib/BioDSL/commands/split_values.rb +165 -0
  53. data/lib/BioDSL/commands/trim_primer.rb +314 -0
  54. data/lib/BioDSL/commands/trim_seq.rb +192 -0
  55. data/lib/BioDSL/commands/uchime_ref.rb +170 -0
  56. data/lib/BioDSL/commands/uclust.rb +286 -0
  57. data/lib/BioDSL/commands/unique_values.rb +145 -0
  58. data/lib/BioDSL/commands/usearch_global.rb +171 -0
  59. data/lib/BioDSL/commands/usearch_local.rb +171 -0
  60. data/lib/BioDSL/commands/write_fasta.rb +207 -0
  61. data/lib/BioDSL/commands/write_fastq.rb +191 -0
  62. data/lib/BioDSL/commands/write_table.rb +419 -0
  63. data/lib/BioDSL/commands/write_tree.rb +167 -0
  64. data/lib/BioDSL/commands.rb +31 -0
  65. data/lib/BioDSL/config.rb +55 -0
  66. data/lib/BioDSL/csv.rb +307 -0
  67. data/lib/BioDSL/debug.rb +42 -0
  68. data/lib/BioDSL/fasta.rb +133 -0
  69. data/lib/BioDSL/fastq.rb +77 -0
  70. data/lib/BioDSL/filesys.rb +137 -0
  71. data/lib/BioDSL/fork.rb +145 -0
  72. data/lib/BioDSL/hamming.rb +128 -0
  73. data/lib/BioDSL/helpers/aux_helper.rb +44 -0
  74. data/lib/BioDSL/helpers/email_helper.rb +66 -0
  75. data/lib/BioDSL/helpers/history_helper.rb +40 -0
  76. data/lib/BioDSL/helpers/log_helper.rb +55 -0
  77. data/lib/BioDSL/helpers/options_helper.rb +405 -0
  78. data/lib/BioDSL/helpers/status_helper.rb +132 -0
  79. data/lib/BioDSL/helpers.rb +35 -0
  80. data/lib/BioDSL/html_report.rb +200 -0
  81. data/lib/BioDSL/math.rb +55 -0
  82. data/lib/BioDSL/mummer.rb +216 -0
  83. data/lib/BioDSL/pipeline.rb +354 -0
  84. data/lib/BioDSL/seq/ambiguity.rb +66 -0
  85. data/lib/BioDSL/seq/assemble.rb +240 -0
  86. data/lib/BioDSL/seq/backtrack.rb +252 -0
  87. data/lib/BioDSL/seq/digest.rb +99 -0
  88. data/lib/BioDSL/seq/dynamic.rb +263 -0
  89. data/lib/BioDSL/seq/homopolymer.rb +59 -0
  90. data/lib/BioDSL/seq/kmer.rb +293 -0
  91. data/lib/BioDSL/seq/levenshtein.rb +113 -0
  92. data/lib/BioDSL/seq/translate.rb +109 -0
  93. data/lib/BioDSL/seq/trim.rb +188 -0
  94. data/lib/BioDSL/seq.rb +742 -0
  95. data/lib/BioDSL/serializer.rb +98 -0
  96. data/lib/BioDSL/stream.rb +113 -0
  97. data/lib/BioDSL/taxonomy.rb +691 -0
  98. data/lib/BioDSL/test.rb +42 -0
  99. data/lib/BioDSL/tmp_dir.rb +68 -0
  100. data/lib/BioDSL/usearch.rb +301 -0
  101. data/lib/BioDSL/verbose.rb +42 -0
  102. data/lib/BioDSL/version.rb +31 -0
  103. data/lib/BioDSL.rb +81 -0
  104. data/test/BioDSL/commands/test_add_key.rb +105 -0
  105. data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
  106. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
  107. data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
  108. data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
  109. data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
  110. data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
  111. data/test/BioDSL/commands/test_classify_seq.rb +50 -0
  112. data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
  113. data/test/BioDSL/commands/test_clip_primer.rb +377 -0
  114. data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
  115. data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
  116. data/test/BioDSL/commands/test_collect_otus.rb +82 -0
  117. data/test/BioDSL/commands/test_complement_seq.rb +78 -0
  118. data/test/BioDSL/commands/test_count.rb +103 -0
  119. data/test/BioDSL/commands/test_count_values.rb +85 -0
  120. data/test/BioDSL/commands/test_degap_seq.rb +96 -0
  121. data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
  122. data/test/BioDSL/commands/test_dump.rb +109 -0
  123. data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
  124. data/test/BioDSL/commands/test_genecall.rb +50 -0
  125. data/test/BioDSL/commands/test_grab.rb +398 -0
  126. data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
  127. data/test/BioDSL/commands/test_mask_seq.rb +98 -0
  128. data/test/BioDSL/commands/test_mean_scores.rb +111 -0
  129. data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
  130. data/test/BioDSL/commands/test_merge_table.rb +131 -0
  131. data/test/BioDSL/commands/test_merge_values.rb +83 -0
  132. data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
  133. data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
  134. data/test/BioDSL/commands/test_plot_matches.rb +157 -0
  135. data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
  136. data/test/BioDSL/commands/test_plot_scores.rb +308 -0
  137. data/test/BioDSL/commands/test_random.rb +88 -0
  138. data/test/BioDSL/commands/test_read_fasta.rb +229 -0
  139. data/test/BioDSL/commands/test_read_fastq.rb +552 -0
  140. data/test/BioDSL/commands/test_read_table.rb +327 -0
  141. data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
  142. data/test/BioDSL/commands/test_slice_align.rb +218 -0
  143. data/test/BioDSL/commands/test_slice_seq.rb +131 -0
  144. data/test/BioDSL/commands/test_sort.rb +128 -0
  145. data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
  146. data/test/BioDSL/commands/test_split_values.rb +95 -0
  147. data/test/BioDSL/commands/test_trim_primer.rb +329 -0
  148. data/test/BioDSL/commands/test_trim_seq.rb +150 -0
  149. data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
  150. data/test/BioDSL/commands/test_uclust.rb +139 -0
  151. data/test/BioDSL/commands/test_unique_values.rb +98 -0
  152. data/test/BioDSL/commands/test_usearch_global.rb +123 -0
  153. data/test/BioDSL/commands/test_usearch_local.rb +125 -0
  154. data/test/BioDSL/commands/test_write_fasta.rb +159 -0
  155. data/test/BioDSL/commands/test_write_fastq.rb +166 -0
  156. data/test/BioDSL/commands/test_write_table.rb +411 -0
  157. data/test/BioDSL/commands/test_write_tree.rb +122 -0
  158. data/test/BioDSL/helpers/test_options_helper.rb +272 -0
  159. data/test/BioDSL/seq/test_assemble.rb +98 -0
  160. data/test/BioDSL/seq/test_backtrack.rb +176 -0
  161. data/test/BioDSL/seq/test_digest.rb +71 -0
  162. data/test/BioDSL/seq/test_dynamic.rb +133 -0
  163. data/test/BioDSL/seq/test_homopolymer.rb +58 -0
  164. data/test/BioDSL/seq/test_kmer.rb +134 -0
  165. data/test/BioDSL/seq/test_translate.rb +75 -0
  166. data/test/BioDSL/seq/test_trim.rb +101 -0
  167. data/test/BioDSL/test_cary.rb +176 -0
  168. data/test/BioDSL/test_command.rb +45 -0
  169. data/test/BioDSL/test_csv.rb +514 -0
  170. data/test/BioDSL/test_debug.rb +42 -0
  171. data/test/BioDSL/test_fasta.rb +154 -0
  172. data/test/BioDSL/test_fastq.rb +46 -0
  173. data/test/BioDSL/test_filesys.rb +145 -0
  174. data/test/BioDSL/test_fork.rb +85 -0
  175. data/test/BioDSL/test_math.rb +41 -0
  176. data/test/BioDSL/test_mummer.rb +79 -0
  177. data/test/BioDSL/test_pipeline.rb +187 -0
  178. data/test/BioDSL/test_seq.rb +790 -0
  179. data/test/BioDSL/test_serializer.rb +72 -0
  180. data/test/BioDSL/test_stream.rb +55 -0
  181. data/test/BioDSL/test_taxonomy.rb +336 -0
  182. data/test/BioDSL/test_test.rb +42 -0
  183. data/test/BioDSL/test_tmp_dir.rb +58 -0
  184. data/test/BioDSL/test_usearch.rb +33 -0
  185. data/test/BioDSL/test_verbose.rb +42 -0
  186. data/test/helper.rb +82 -0
  187. data/www/command.html.haml +14 -0
  188. data/www/css.html.haml +55 -0
  189. data/www/input_files.html.haml +3 -0
  190. data/www/layout.html.haml +12 -0
  191. data/www/output_files.html.haml +3 -0
  192. data/www/overview.html.haml +15 -0
  193. data/www/pipeline.html.haml +4 -0
  194. data/www/png.html.haml +2 -0
  195. data/www/status.html.haml +9 -0
  196. data/www/time.html.haml +11 -0
  197. metadata +503 -0
@@ -0,0 +1,535 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+ # #
24
+ # This software is part of the BioDSL framework (www.BioDSL.org). #
25
+ # #
26
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
27
+
28
+ module BioDSL
29
+ # == Grab records in stream.
30
+ #
31
+ # +grab+ select records from the stream by matching patterns to keys or
32
+ # values. +grab+ is BioDSL's equivalent of Unix' +grep+; however, +grab+
33
+ # is much more versatile.
34
+ #
35
+ # NB! If chaining multiple +grab+ commands then use the most restrictive
36
+ # +grab+ first in order to get the best performance.
37
+ #
38
+ # NB! Avoid using exact with long values because of memory use.
39
+ #
40
+ # == Usage
41
+ #
42
+ # grab(<select: <pattern>|select_file: <file>|reject: <pattern>|
43
+ # reject_file: <file>|evaluate: <expression>|exact: <bool>>
44
+ # [, keys: <list>|keys_only: <bool>|values_only: <bool>|
45
+ # ignore_case: <bool>])
46
+ #
47
+ # === Options
48
+ #
49
+ # * select: <pattern> - Select records matching <pattern> which is
50
+ # a regex or an exact match if the exact option is set.
51
+ # * select_file: <file> - File with one <pattern> per line to select.
52
+ # * reject: <pattern> - Reject records matching <pattern> which is
53
+ # a regex or an exact match if the exact option is set.
54
+ # * reject_file: <file> - File with one <pattern> per line to reject.
55
+ # * evaluate: <expression> - Select records where <expression> is true.
56
+ # * exact: <bool> - Turn on exact matching for improved speed.
57
+ # * keys: <list> - Comma separated list or array of keys to grab
58
+ # the value for.
59
+ # * keys_only: <bool> - Only grab for keys.
60
+ # * values_only: <bool> - Only grab for values.
61
+ # * ignore_case: <bool> - Ignore case when grabbing with regex (does not
62
+ # work with +evaluate+ and +exact+).
63
+ #
64
+ # == Examples
65
+ #
66
+ # To easily grab all records in the stream that have any mention of the
67
+ # pattern 'human' just pipe the data stream through grab like this:
68
+ #
69
+ # grab(select: "human")
70
+ #
71
+ # This will search for the pattern 'human' in all keys and all values. The
72
+ # +select+ option alternatively uses an array of patterns, so in order to
73
+ # match one of multiple patterns do:
74
+ #
75
+ # grab(select: ["human", "mouse"])
76
+ #
77
+ # It is also possible to invoke flexible matching using regex (regular
78
+ # expressions) instead of simple pattern matching. If you want to +grab+
79
+ # records with the sequence +ATCG+ or +GCTA+ you can do this:
80
+ #
81
+ # grab(select: "ATCG|GCTA")
82
+ #
83
+ # Or if you want to +grab+ sequences beginning with +ATCG+:
84
+ #
85
+ # grab(select: "^ATCG")
86
+ #
87
+ # It is also possible to use the +select_file+ option to load patterns from
88
+ # a file with one pattern per line.
89
+ #
90
+ # grab(select_file: "patterns.txt")
91
+ #
92
+ # If you want the opposite result - to find all records that do not match
93
+ # a pattern - use the +reject+ option:
94
+ #
95
+ # grab(reject: "human")
96
+ #
97
+ # Similar to +select_file+ there is a +reject_file+ option to load patterns
98
+ # from a file, and use any of these patterns to reject records:
99
+ #
100
+ # grab(reject_file: "patterns.txt")
101
+ #
102
+ # If you want to search the record keys only, e.g. to +grab+ all records
103
+ # containing the key +SEQ+ you can use the +keys_only+ option. This will
104
+ # prevent matching of +SEQ+ in any record value - since +SEQ+ is a not
105
+ # uncommon peptide sequence, you could otherwise get an unwanted record. Also, this
106
+ # will give an increase in speed since only the keys are searched:
107
+ #
108
+ # grab(select: "SEQ", keys_only: true)
109
+ #
110
+ # However, if you are interested in +grabbing+ the peptide sequence +SEQ+ and
111
+ # not the +SEQ+ key, just use the +values_only+ option:
112
+ #
113
+ # grab(select: "SEQ", values_only: true)
114
+ #
115
+ # Also, if you want to +grab+ for certain key/value pairs you can supply a
116
+ # comma separated list or an array of keys whose values will then be grabbed
117
+ # using the +keys+ option. This is handy if your records contain large
118
+ # genomic sequences and you don't want to search the entire sequence for
119
+ # e.g. the organism name - it is much faster to tell +grab+ which keys to
120
+ # search the value for:
121
+ #
122
+ # grab(select: "human", keys: :SEQ_NAME)
123
+ #
124
+ # You can also use the +evaluate+ option to +grab+ records that fulfill an
125
+ # expression. So to +grab+ all records with a sequence length greater than 30:
126
+ #
127
+ # grab(evaluate: ':SEQ_LEN > 30')
128
+ #
129
+ # If you want to +grab+ all records containing the pattern 'human' and where
130
+ # the sequence length is greater than 30, you do this by running the stream
131
+ # through +grab+ twice:
132
+ #
133
+ # grab(select: 'human').grab(evaluate: ':SEQ_LEN > 30')
134
+ #
135
+ # Finally, it is possible to +grab+ for exact pattern using the +exact+
136
+ # option. This is much faster than the default regex pattern grabbing
137
+ # because with +exact+ the patterns are used to create a lookup hash for
138
+ # instant matching of keys or values. This is useful if you e.g. have a
139
+ # file with ID numbers and you want to +grab+ matching records from the
140
+ # stream:
141
+ #
142
+ # grab(select_file: "ids.txt", keys: :ID, exact: true)
143
+ #
144
+ # rubocop:disable ClassLength
145
class Grab
  STATS = %i(records_in records_out)

  # Constructor for the Grab class.
  #
  # @param [Hash] options Options hash.
  #
  # @option options [String, Array] :select
  #   Patterns or list of patterns to select records.
  #
  # @option options [String] :select_file
  #   File path with patterns, one per line, to select records.
  #
  # @option options [String, Array] :reject
  #   Patterns or list of patterns to reject records.
  #
  # @option options [String] :reject_file
  #   File path with patterns, one per line, to reject records.
  #
  # @option options [String] :evaluate
  #   Expression that is evaluated to select records.
  #
  # @option options [Boolean] :exact
  #   Flag indicating that a given pattern must match over its entire length.
  #
  # @option options [Symbol, Array, String] :keys
  #   Key, list of keys or comma separated string of keys whose key/value
  #   pairs to grab for.
  #
  # @option options [Boolean] :keys_only
  #   Flag indicating to grab for key only - not values.
  #
  # @option options [Boolean] :values_only
  #   Flag indicating to grab for values only - not keys.
  #
  # @option options [Boolean] :ignore_case
  #   Flag indicating that pattern matching should be case insensitive.
  #
  # @return [Grab] Returns an instance of the class.
  def initialize(options)
    @options = options

    check_options

    @keys_only = @options[:keys_only]
    @vals_only = @options[:values_only]
    @invert    = @options[:reject] || @options[:reject_file]
    @eval      = @options[:evaluate]
    @exact     = nil
    @regex     = nil
    @keys      = nil
  end

  # Return a lambda for the grab command.
  #
  # The lambda compiles keys and patterns once, then tests every record read
  # from +input+ and emits the selected ones to +output+.
  #
  # @return [Proc] Returns the grab command lambda.
  def lmb
    lambda do |input, output, status|
      status_init(status, STATS)
      compile_keys
      compile_exact
      compile_regexes

      input.each do |record|
        @status[:records_in] += 1

        # Only one of @exact, @regex and @eval is set for a given option set.
        match = if @exact then exact_match?(record)
                elsif @regex then regex_match?(record)
                elsif @eval then eval_match?(record)
                end

        emit_match(output, record, match)
      end
    end
  end

  private

  # Check the options.
  def check_options
    options_allowed(@options, :select, :select_file, :reject, :reject_file,
                    :evaluate, :exact, :keys, :keys_only, :values_only,
                    :ignore_case)
    options_required_unique(@options, :select, :select_file, :reject,
                            :reject_file, :evaluate)
    options_conflict(@options, keys: :evaluate, keys_only: :evaluate,
                               values_only: :evaluate, ignore_case: :evaluate,
                               exact: :evaluate)
    options_unique(@options, :keys_only, :values_only)
    options_files_exist(@options, :select_file, :reject_file)
  end

  # Emit a record to the output stream if a match was found and w/o invert
  # matching, or if no match was found and with invert matching.
  #
  # @param output [Enumerator::Yielder] Output stream.
  # @param record [Hash]                Record to emit.
  # @param match  [Boolean]             Flag indicating a positive match.
  def emit_match(output, record, match)
    return unless @invert ? !match : match

    output << record
    @status[:records_out] += 1
  end

  # Compile a list of keys from the options hash, which may contain either a
  # list of keys, a symbol or a comma separated string of keys (each
  # optionally prefixed with ':'). Any other type leaves @keys as nil.
  def compile_keys
    keys = @options[:keys]

    return unless keys

    @keys = case keys
            when Array  then keys.map(&:to_sym)
            when Symbol then [keys]
            when String
              keys.split(/, */).map { |key| key.sub(/^:/, '').to_sym }
            end
  end

  # Compile a list of regexes for matching. Skipped when exact matching or
  # expression evaluation was requested.
  def compile_regexes
    return if @options[:exact]
    return if @options[:evaluate]

    @regex = []

    compile_regex_patterns(@options[:select])
    compile_regex_patterns(@options[:reject])
    compile_regex_file(@options[:select_file])
    compile_regex_file(@options[:reject_file])
  end

  # Build a single regex from a pattern, honoring the ignore_case option.
  #
  # @param pattern [Object] Pattern interpolated into the regex.
  #
  # @return [Regexp] Compiled regex.
  def build_regex(pattern)
    @options[:ignore_case] ? /#{pattern}/i : /#{pattern}/
  end

  # Compile a list of regex from a list of given patterns.
  #
  # @param patterns [String, Array] Pattern or list of patterns.
  def compile_regex_patterns(patterns)
    return unless patterns

    [patterns].flatten.each { |pattern| @regex << build_regex(pattern) }
  end

  # Compile a list of regex from a given file with one pattern per line.
  #
  # @param file [String] Path to file with patterns.
  def compile_regex_file(file)
    return unless file

    File.open(file) do |ios|
      ios.each_line { |line| @regex << build_regex(line.chomp) }
    end
  end

  # Compile a lookup hash for fast exact matching. The patterns become the
  # keys of the @exact hash. Skipped unless the exact option is set.
  def compile_exact
    return unless @options[:exact]

    @exact = {}

    compile_exact_patterns(@options[:select])
    compile_exact_patterns(@options[:reject])
    compile_exact_file(@options[:select_file])
    compile_exact_file(@options[:reject_file])
  end

  # Normalize a pattern or a record value into the form used as key in the
  # @exact lookup hash: strings are stored as symbols, anything else as is.
  #
  # @param item [Object] Pattern or record value.
  #
  # @return [Object] Lookup key.
  def exact_key(item)
    item.is_a?(String) ? item.to_sym : item
  end

  # Compile a lookup hash for a given list of patterns.
  #
  # @param patterns [Object, Array] Pattern or list of patterns.
  def compile_exact_patterns(patterns)
    return unless patterns

    [patterns].flatten.each { |pattern| @exact[exact_key(pattern)] = true }
  end

  # Compile a lookup hash from a given file with one pattern per line.
  #
  # @param file [String] Path to file with patterns.
  def compile_exact_file(file)
    return unless file

    File.open(file) do |ios|
      ios.each_line do |line|
        # Non-bang chomp: chomp! returns nil when nothing was removed, which
        # happens for a final line without a trailing newline.
        pattern = line.chomp

        # String#to_num is not defined in this class. NOTE(review): presumably
        # it returns a number for numeric strings and a String otherwise -
        # confirm against its definition.
        number = pattern.to_num

        # Non-string results are stored converted so they can match record
        # values of the same type, as compile_exact_patterns does.
        @exact[number.is_a?(String) ? pattern.to_sym : number] = true
      end
    end
  end

  # Match exactly record keys or values
  #
  # @param record [Hash] Record to match.
  #
  # @return [Boolean] True if exact match found.
  def exact_match?(record)
    keys = @keys || record.keys

    if @keys_only
      exact_match_keys?(keys)
    elsif @vals_only
      exact_match_values?(record, keys)
    else
      exact_match_key_values?(record, keys)
    end
  end

  # Match exactly any record keys.
  #
  # @param keys [Array] List of keys to match.
  #
  # @return [Boolean] True if exact match found.
  def exact_match_keys?(keys)
    keys.any? { |key| @exact[key] }
  end

  # Match exactly any record values.
  #
  # @param record [Hash]  Record to match.
  # @param keys   [Array] List of keys whose values to match.
  #
  # @return [Boolean] True if exact match found.
  def exact_match_values?(record, keys)
    keys.any? do |key|
      value = record[key]

      value && @exact.include?(exact_key(value))
    end
  end

  # Match exactly any record keys or values.
  #
  # @param record [Hash]  Record to match.
  # @param keys   [Array] List of keys or values to match.
  #
  # @return [Boolean] True if exact match found.
  def exact_match_key_values?(record, keys)
    keys.any? do |key|
      next true if @exact.include?(key)

      value = record[key]

      value && @exact.include?(exact_key(value))
    end
  end

  # Match using regex record keys or values
  #
  # @param record [Hash] Record to match.
  #
  # @return [Boolean] True if regex match found.
  def regex_match?(record)
    keys = @keys || record.keys

    if @keys_only
      regex_match_keys?(keys)
    elsif @vals_only
      regex_match_values?(record, keys)
    else
      regex_match_key_values?(record, keys)
    end
  end

  # Test if the string form of a given item matches any compiled regex.
  #
  # @param item [Object] Key or value to test.
  #
  # @return [Boolean] True if any regex matches.
  def any_regex_match?(item)
    text = item.to_s

    @regex.any? { |regex| text =~ regex }
  end

  # Match using regex any record keys.
  #
  # @param keys [Array] List of keys to match.
  #
  # @return [Boolean] True if regex match found.
  def regex_match_keys?(keys)
    keys.any? { |key| any_regex_match?(key) }
  end

  # Match using regex any record values.
  #
  # @param record [Hash]  Record to match.
  # @param keys   [Array] List of keys whose values to match.
  #
  # @return [Boolean] True if regex match found.
  def regex_match_values?(record, keys)
    keys.any? do |key|
      value = record[key]

      value && any_regex_match?(value)
    end
  end

  # Match using regex any record keys or values.
  #
  # @param record [Hash]  Record to match.
  # @param keys   [Array] List of keys or values to match.
  #
  # @return [Boolean] True if regex match found.
  def regex_match_key_values?(record, keys)
    keys.any? do |key|
      next true if any_regex_match?(key)

      value = record[key]

      value && any_regex_match?(value)
    end
  end

  # Match using eval expression on record values.
  #
  # The expression is split on whitespace; every item starting with ':' is
  # treated as a record key and replaced by the value of that key.
  #
  # @param record [Hash] Record to match.
  #
  # @return [Object] Result of the evaluated expression (truthy on match), or
  #   false if a referenced key is missing from the record.
  def eval_match?(record)
    expr = []

    @eval.split("\s").each do |item|
      if item[0] == ':'
        key = item[1..-1].to_sym

        return false unless record[key]

        expr << record[key]
      else
        expr << item
      end
    end

    # NOTE(review): eval executes the expression with record values
    # substituted in as raw text. Only use with trusted expressions and
    # trusted record data.
    eval expr.join(' ')
  end
end
535
+ end