BioDSL 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/BioDSL.gemspec +64 -0
- data/LICENSE +339 -0
- data/README.md +205 -0
- data/Rakefile +94 -0
- data/examples/fastq_to_fasta.rb +8 -0
- data/lib/BioDSL/cary.rb +242 -0
- data/lib/BioDSL/command.rb +133 -0
- data/lib/BioDSL/commands/add_key.rb +110 -0
- data/lib/BioDSL/commands/align_seq_mothur.rb +194 -0
- data/lib/BioDSL/commands/analyze_residue_distribution.rb +222 -0
- data/lib/BioDSL/commands/assemble_pairs.rb +336 -0
- data/lib/BioDSL/commands/assemble_seq_idba.rb +230 -0
- data/lib/BioDSL/commands/assemble_seq_ray.rb +345 -0
- data/lib/BioDSL/commands/assemble_seq_spades.rb +252 -0
- data/lib/BioDSL/commands/classify_seq.rb +217 -0
- data/lib/BioDSL/commands/classify_seq_mothur.rb +226 -0
- data/lib/BioDSL/commands/clip_primer.rb +318 -0
- data/lib/BioDSL/commands/cluster_otus.rb +181 -0
- data/lib/BioDSL/commands/collapse_otus.rb +170 -0
- data/lib/BioDSL/commands/collect_otus.rb +150 -0
- data/lib/BioDSL/commands/complement_seq.rb +117 -0
- data/lib/BioDSL/commands/count.rb +135 -0
- data/lib/BioDSL/commands/count_values.rb +149 -0
- data/lib/BioDSL/commands/degap_seq.rb +253 -0
- data/lib/BioDSL/commands/dereplicate_seq.rb +168 -0
- data/lib/BioDSL/commands/dump.rb +157 -0
- data/lib/BioDSL/commands/filter_rrna.rb +239 -0
- data/lib/BioDSL/commands/genecall.rb +237 -0
- data/lib/BioDSL/commands/grab.rb +535 -0
- data/lib/BioDSL/commands/index_taxonomy.rb +226 -0
- data/lib/BioDSL/commands/mask_seq.rb +175 -0
- data/lib/BioDSL/commands/mean_scores.rb +168 -0
- data/lib/BioDSL/commands/merge_pair_seq.rb +175 -0
- data/lib/BioDSL/commands/merge_table.rb +225 -0
- data/lib/BioDSL/commands/merge_values.rb +113 -0
- data/lib/BioDSL/commands/plot_heatmap.rb +233 -0
- data/lib/BioDSL/commands/plot_histogram.rb +306 -0
- data/lib/BioDSL/commands/plot_matches.rb +282 -0
- data/lib/BioDSL/commands/plot_residue_distribution.rb +278 -0
- data/lib/BioDSL/commands/plot_scores.rb +285 -0
- data/lib/BioDSL/commands/random.rb +153 -0
- data/lib/BioDSL/commands/read_fasta.rb +222 -0
- data/lib/BioDSL/commands/read_fastq.rb +414 -0
- data/lib/BioDSL/commands/read_table.rb +329 -0
- data/lib/BioDSL/commands/reverse_seq.rb +113 -0
- data/lib/BioDSL/commands/slice_align.rb +400 -0
- data/lib/BioDSL/commands/slice_seq.rb +151 -0
- data/lib/BioDSL/commands/sort.rb +223 -0
- data/lib/BioDSL/commands/split_pair_seq.rb +220 -0
- data/lib/BioDSL/commands/split_values.rb +165 -0
- data/lib/BioDSL/commands/trim_primer.rb +314 -0
- data/lib/BioDSL/commands/trim_seq.rb +192 -0
- data/lib/BioDSL/commands/uchime_ref.rb +170 -0
- data/lib/BioDSL/commands/uclust.rb +286 -0
- data/lib/BioDSL/commands/unique_values.rb +145 -0
- data/lib/BioDSL/commands/usearch_global.rb +171 -0
- data/lib/BioDSL/commands/usearch_local.rb +171 -0
- data/lib/BioDSL/commands/write_fasta.rb +207 -0
- data/lib/BioDSL/commands/write_fastq.rb +191 -0
- data/lib/BioDSL/commands/write_table.rb +419 -0
- data/lib/BioDSL/commands/write_tree.rb +167 -0
- data/lib/BioDSL/commands.rb +31 -0
- data/lib/BioDSL/config.rb +55 -0
- data/lib/BioDSL/csv.rb +307 -0
- data/lib/BioDSL/debug.rb +42 -0
- data/lib/BioDSL/fasta.rb +133 -0
- data/lib/BioDSL/fastq.rb +77 -0
- data/lib/BioDSL/filesys.rb +137 -0
- data/lib/BioDSL/fork.rb +145 -0
- data/lib/BioDSL/hamming.rb +128 -0
- data/lib/BioDSL/helpers/aux_helper.rb +44 -0
- data/lib/BioDSL/helpers/email_helper.rb +66 -0
- data/lib/BioDSL/helpers/history_helper.rb +40 -0
- data/lib/BioDSL/helpers/log_helper.rb +55 -0
- data/lib/BioDSL/helpers/options_helper.rb +405 -0
- data/lib/BioDSL/helpers/status_helper.rb +132 -0
- data/lib/BioDSL/helpers.rb +35 -0
- data/lib/BioDSL/html_report.rb +200 -0
- data/lib/BioDSL/math.rb +55 -0
- data/lib/BioDSL/mummer.rb +216 -0
- data/lib/BioDSL/pipeline.rb +354 -0
- data/lib/BioDSL/seq/ambiguity.rb +66 -0
- data/lib/BioDSL/seq/assemble.rb +240 -0
- data/lib/BioDSL/seq/backtrack.rb +252 -0
- data/lib/BioDSL/seq/digest.rb +99 -0
- data/lib/BioDSL/seq/dynamic.rb +263 -0
- data/lib/BioDSL/seq/homopolymer.rb +59 -0
- data/lib/BioDSL/seq/kmer.rb +293 -0
- data/lib/BioDSL/seq/levenshtein.rb +113 -0
- data/lib/BioDSL/seq/translate.rb +109 -0
- data/lib/BioDSL/seq/trim.rb +188 -0
- data/lib/BioDSL/seq.rb +742 -0
- data/lib/BioDSL/serializer.rb +98 -0
- data/lib/BioDSL/stream.rb +113 -0
- data/lib/BioDSL/taxonomy.rb +691 -0
- data/lib/BioDSL/test.rb +42 -0
- data/lib/BioDSL/tmp_dir.rb +68 -0
- data/lib/BioDSL/usearch.rb +301 -0
- data/lib/BioDSL/verbose.rb +42 -0
- data/lib/BioDSL/version.rb +31 -0
- data/lib/BioDSL.rb +81 -0
- data/test/BioDSL/commands/test_add_key.rb +105 -0
- data/test/BioDSL/commands/test_align_seq_mothur.rb +99 -0
- data/test/BioDSL/commands/test_analyze_residue_distribution.rb +134 -0
- data/test/BioDSL/commands/test_assemble_pairs.rb +459 -0
- data/test/BioDSL/commands/test_assemble_seq_idba.rb +50 -0
- data/test/BioDSL/commands/test_assemble_seq_ray.rb +51 -0
- data/test/BioDSL/commands/test_assemble_seq_spades.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq.rb +50 -0
- data/test/BioDSL/commands/test_classify_seq_mothur.rb +59 -0
- data/test/BioDSL/commands/test_clip_primer.rb +377 -0
- data/test/BioDSL/commands/test_cluster_otus.rb +128 -0
- data/test/BioDSL/commands/test_collapse_otus.rb +81 -0
- data/test/BioDSL/commands/test_collect_otus.rb +82 -0
- data/test/BioDSL/commands/test_complement_seq.rb +78 -0
- data/test/BioDSL/commands/test_count.rb +103 -0
- data/test/BioDSL/commands/test_count_values.rb +85 -0
- data/test/BioDSL/commands/test_degap_seq.rb +96 -0
- data/test/BioDSL/commands/test_dereplicate_seq.rb +92 -0
- data/test/BioDSL/commands/test_dump.rb +109 -0
- data/test/BioDSL/commands/test_filter_rrna.rb +128 -0
- data/test/BioDSL/commands/test_genecall.rb +50 -0
- data/test/BioDSL/commands/test_grab.rb +398 -0
- data/test/BioDSL/commands/test_index_taxonomy.rb +62 -0
- data/test/BioDSL/commands/test_mask_seq.rb +98 -0
- data/test/BioDSL/commands/test_mean_scores.rb +111 -0
- data/test/BioDSL/commands/test_merge_pair_seq.rb +115 -0
- data/test/BioDSL/commands/test_merge_table.rb +131 -0
- data/test/BioDSL/commands/test_merge_values.rb +83 -0
- data/test/BioDSL/commands/test_plot_heatmap.rb +185 -0
- data/test/BioDSL/commands/test_plot_histogram.rb +194 -0
- data/test/BioDSL/commands/test_plot_matches.rb +157 -0
- data/test/BioDSL/commands/test_plot_residue_distribution.rb +309 -0
- data/test/BioDSL/commands/test_plot_scores.rb +308 -0
- data/test/BioDSL/commands/test_random.rb +88 -0
- data/test/BioDSL/commands/test_read_fasta.rb +229 -0
- data/test/BioDSL/commands/test_read_fastq.rb +552 -0
- data/test/BioDSL/commands/test_read_table.rb +327 -0
- data/test/BioDSL/commands/test_reverse_seq.rb +79 -0
- data/test/BioDSL/commands/test_slice_align.rb +218 -0
- data/test/BioDSL/commands/test_slice_seq.rb +131 -0
- data/test/BioDSL/commands/test_sort.rb +128 -0
- data/test/BioDSL/commands/test_split_pair_seq.rb +164 -0
- data/test/BioDSL/commands/test_split_values.rb +95 -0
- data/test/BioDSL/commands/test_trim_primer.rb +329 -0
- data/test/BioDSL/commands/test_trim_seq.rb +150 -0
- data/test/BioDSL/commands/test_uchime_ref.rb +113 -0
- data/test/BioDSL/commands/test_uclust.rb +139 -0
- data/test/BioDSL/commands/test_unique_values.rb +98 -0
- data/test/BioDSL/commands/test_usearch_global.rb +123 -0
- data/test/BioDSL/commands/test_usearch_local.rb +125 -0
- data/test/BioDSL/commands/test_write_fasta.rb +159 -0
- data/test/BioDSL/commands/test_write_fastq.rb +166 -0
- data/test/BioDSL/commands/test_write_table.rb +411 -0
- data/test/BioDSL/commands/test_write_tree.rb +122 -0
- data/test/BioDSL/helpers/test_options_helper.rb +272 -0
- data/test/BioDSL/seq/test_assemble.rb +98 -0
- data/test/BioDSL/seq/test_backtrack.rb +176 -0
- data/test/BioDSL/seq/test_digest.rb +71 -0
- data/test/BioDSL/seq/test_dynamic.rb +133 -0
- data/test/BioDSL/seq/test_homopolymer.rb +58 -0
- data/test/BioDSL/seq/test_kmer.rb +134 -0
- data/test/BioDSL/seq/test_translate.rb +75 -0
- data/test/BioDSL/seq/test_trim.rb +101 -0
- data/test/BioDSL/test_cary.rb +176 -0
- data/test/BioDSL/test_command.rb +45 -0
- data/test/BioDSL/test_csv.rb +514 -0
- data/test/BioDSL/test_debug.rb +42 -0
- data/test/BioDSL/test_fasta.rb +154 -0
- data/test/BioDSL/test_fastq.rb +46 -0
- data/test/BioDSL/test_filesys.rb +145 -0
- data/test/BioDSL/test_fork.rb +85 -0
- data/test/BioDSL/test_math.rb +41 -0
- data/test/BioDSL/test_mummer.rb +79 -0
- data/test/BioDSL/test_pipeline.rb +187 -0
- data/test/BioDSL/test_seq.rb +790 -0
- data/test/BioDSL/test_serializer.rb +72 -0
- data/test/BioDSL/test_stream.rb +55 -0
- data/test/BioDSL/test_taxonomy.rb +336 -0
- data/test/BioDSL/test_test.rb +42 -0
- data/test/BioDSL/test_tmp_dir.rb +58 -0
- data/test/BioDSL/test_usearch.rb +33 -0
- data/test/BioDSL/test_verbose.rb +42 -0
- data/test/helper.rb +82 -0
- data/www/command.html.haml +14 -0
- data/www/css.html.haml +55 -0
- data/www/input_files.html.haml +3 -0
- data/www/layout.html.haml +12 -0
- data/www/output_files.html.haml +3 -0
- data/www/overview.html.haml +15 -0
- data/www/pipeline.html.haml +4 -0
- data/www/png.html.haml +2 -0
- data/www/status.html.haml +9 -0
- data/www/time.html.haml +11 -0
- metadata +503 -0
|
@@ -0,0 +1,535 @@
|
|
|
1
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
2
|
+
# #
|
|
3
|
+
# Copyright (C) 2007-2015 Martin Asser Hansen (mail@maasha.dk). #
|
|
4
|
+
# #
|
|
5
|
+
# This program is free software; you can redistribute it and/or #
|
|
6
|
+
# modify it under the terms of the GNU General Public License #
|
|
7
|
+
# as published by the Free Software Foundation; either version 2 #
|
|
8
|
+
# of the License, or (at your option) any later version. #
|
|
9
|
+
# #
|
|
10
|
+
# This program is distributed in the hope that it will be useful, #
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
13
|
+
# GNU General Public License for more details. #
|
|
14
|
+
# #
|
|
15
|
+
# You should have received a copy of the GNU General Public License #
|
|
16
|
+
# along with this program; if not, write to the Free Software #
|
|
17
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
|
|
18
|
+
# USA. #
|
|
19
|
+
# #
|
|
20
|
+
# http://www.gnu.org/copyleft/gpl.html #
|
|
21
|
+
# #
|
|
22
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
23
|
+
# #
|
|
24
|
+
# This software is part of the BioDSL framework (www.BioDSL.org). #
|
|
25
|
+
# #
|
|
26
|
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
|
|
27
|
+
|
|
28
|
+
module BioDSL
|
|
29
|
+
# == Grab records in stream.
|
|
30
|
+
#
|
|
31
|
+
# +grab+ select records from the stream by matching patterns to keys or
|
|
32
|
+
# values. +grab+ is BioDSL's equivalent of Unix's +grep+; however, +grab+
|
|
33
|
+
# is much more versatile.
|
|
34
|
+
#
|
|
35
|
+
# NB! If chaining multiple +grab+ commands then use the most restrictive
|
|
36
|
+
# +grab+ first in order to get the best performance.
|
|
37
|
+
#
|
|
38
|
+
# NB! Avoid using exact with long values because of memory use.
|
|
39
|
+
#
|
|
40
|
+
# == Usage
|
|
41
|
+
#
|
|
42
|
+
# grab(<select: <pattern>|select_file: <file>|reject: <pattern>|
|
|
43
|
+
# reject_file: <file>|evaluate: <expression>|exact: <bool>>
|
|
44
|
+
# [, keys: <list>|keys_only: <bool>|values_only: <bool>|
|
|
45
|
+
# ignore_case: <bool>])
|
|
46
|
+
#
|
|
47
|
+
# === Options
|
|
48
|
+
#
|
|
49
|
+
# * select: <pattern> - Select records matching <pattern> which is
|
|
50
|
+
# a regex or an exact match if the exact option is set.
|
|
51
|
+
# * select_file: <file> - File with one <pattern> per line to select.
|
|
52
|
+
# * reject: <pattern> - Reject records matching <pattern> which is
|
|
53
|
+
# a regex or an exact match if the exact option is set.
|
|
54
|
+
# * reject_file: <file> - File with one <pattern> per line to reject.
|
|
55
|
+
# * evaluate: <expression> - Select records where <expression> is true.
|
|
56
|
+
# * exact: <bool> - Turn on exact matching for improved speed.
|
|
57
|
+
# * keys: <list> - Comma separated list or array of keys to grab
|
|
58
|
+
# the value for.
|
|
59
|
+
# * keys_only: <bool> - Only grab for keys.
|
|
60
|
+
# * values_only: <bool> - Only grab for values.
|
|
61
|
+
# * ignore_case: <bool> - Ignore case when grabbing with regex (does not
|
|
62
|
+
# work with +evaluate+ and +exact+).
|
|
63
|
+
#
|
|
64
|
+
# == Examples
|
|
65
|
+
#
|
|
66
|
+
# To easily grab all records in the stream that have any mention of the
|
|
67
|
+
# pattern 'human' just pipe the data stream through grab like this:
|
|
68
|
+
#
|
|
69
|
+
# grab(select: "human")
|
|
70
|
+
#
|
|
71
|
+
# This will search for the pattern 'human' in all keys and all values. The
|
|
72
|
+
# +select+ option alternatively uses an array of patterns, so in order to
|
|
73
|
+
# match one of multiple patterns do:
|
|
74
|
+
#
|
|
75
|
+
# grab(select: ["human", "mouse"])
|
|
76
|
+
#
|
|
77
|
+
# It is also possible to invoke flexible matching using regex (regular
|
|
78
|
+
# expressions) instead of simple pattern matching. If you want to +grab+
|
|
79
|
+
# records with the sequence +ATCG+ or +GCTA+ you can do this:
|
|
80
|
+
#
|
|
81
|
+
# grab(select: "ATCG|GCTA")
|
|
82
|
+
#
|
|
83
|
+
# Or if you want to +grab+ sequences beginning with +ATCG+:
|
|
84
|
+
#
|
|
85
|
+
# grab(select: "^ATCG")
|
|
86
|
+
#
|
|
87
|
+
# It is also possible to use the +select_file+ option to load patterns from
|
|
88
|
+
# a file with one pattern per line.
|
|
89
|
+
#
|
|
90
|
+
# grab(select_file: "patterns.txt")
|
|
91
|
+
#
|
|
92
|
+
# If you want the opposite result - to find all records that do not match
|
|
93
|
+
# a pattern, use the +reject+ option:
|
|
94
|
+
#
|
|
95
|
+
# grab(reject: "human")
|
|
96
|
+
#
|
|
97
|
+
# Similar to +select_file+ there is a +reject_file+ option to load patterns
|
|
98
|
+
# from a file, and use any of these patterns to reject records:
|
|
99
|
+
#
|
|
100
|
+
# grab(reject_file: "patterns.txt")
|
|
101
|
+
#
|
|
102
|
+
# If you want to search the record keys only, e.g. to +grab+ all records
|
|
103
|
+
# containing the key +SEQ+ you can use the +keys_only+ option. This will
|
|
104
|
+
# prevent matching of +SEQ+ in any record value, and in fact +SEQ+ is a not
|
|
105
|
+
# uncommon peptide sequence you could get an unwanted record. Also, this
|
|
106
|
+
# will give an increase in speed since only the keys are searched:
|
|
107
|
+
#
|
|
108
|
+
# grab(select: "SEQ", keys_only: true)
|
|
109
|
+
#
|
|
110
|
+
# However, if you are interested in +grabbing+ the peptide sequence +SEQ+ and
|
|
111
|
+
# not the +SEQ+ key, just use the +values_only+ option:
|
|
112
|
+
#
|
|
113
|
+
# grab(select: "SEQ", values_only: true)
|
|
114
|
+
#
|
|
115
|
+
# Also, if you want to +grab+ for certain key/value pairs you can supply a
|
|
116
|
+
# comma separated list or an array of keys whose values will then be grabbed
|
|
117
|
+
# using the +keys+ option. This is handy if your records contain large
|
|
118
|
+
# genomic sequences and you don't want to search the entire sequence for
|
|
119
|
+
# e.g. the organism name - it is much faster to tell +grab+ which keys to
|
|
120
|
+
# search the value for:
|
|
121
|
+
#
|
|
122
|
+
# grab(select: "human", keys: :SEQ_NAME)
|
|
123
|
+
#
|
|
124
|
+
# You can also use the +evaluate+ option to +grab+ records that fulfill an
|
|
125
|
+
# expression. So to +grab+ all records with a sequence length greater than 30:
|
|
126
|
+
#
|
|
127
|
+
# grab(evaluate: ':SEQ_LEN > 30')
|
|
128
|
+
#
|
|
129
|
+
# If you want to +grab+ all records containing the pattern 'human' and where
|
|
130
|
+
# the sequence length is greater than 30, you do this by running the stream
|
|
131
|
+
# through +grab+ twice:
|
|
132
|
+
#
|
|
133
|
+
# grab(select: 'human').grab(evaluate: ':SEQ_LEN > 30')
|
|
134
|
+
#
|
|
135
|
+
# Finally, it is possible to +grab+ for exact pattern using the +exact+
|
|
136
|
+
# option. This is much faster than the default regex pattern grabbing
|
|
137
|
+
# because with +exact+ the patterns are used to create a lookup hash for
|
|
138
|
+
# instant matching of keys or values. This is useful if you e.g. have a
|
|
139
|
+
# file with ID numbers and you want to +grab+ matching records from the
|
|
140
|
+
# stream:
|
|
141
|
+
#
|
|
142
|
+
# grab(select_file: "ids.txt", keys: :ID, exact: true)
|
|
143
|
+
#
|
|
144
|
+
# rubocop:disable ClassLength
|
|
145
|
+
class Grab
|
|
146
|
+
STATS = %i(records_in records_out)
|
|
147
|
+
|
|
148
|
+
# Constructor for the Grab class.
#
# Stores the options, validates them with check_options and caches the
# flags used while matching. The @exact, @regex and @keys lookups are left
# nil here; they are filled in by the compile_* methods when the command
# lambda is run.
#
# @param [Hash] options Options hash.
#
# @option options [String, Array] :select
#   Patterns or list of patterns to select records.
#
# @option options [String] :select_file
#   File path with patterns, one per line, to select records.
#
# @option options [String, Array] :reject
#   Patterns or list of patterns to reject records.
#
# @option options [String] :reject_file
#   File path with patterns, one per line, to reject records.
#
# @option options [String] :evaluate
#   Expression that is evaluated to select records.
#
# @option options [Boolean] :exact
#   Flag indicating that a given pattern must match over its entire length.
#
# @option options [Symbol, Array] :keys
#   Key or list of keys whose key/value pairs to grab for.
#
# @option options [Boolean] :keys_only
#   Flag indicating to grab for keys only - not values.
#
# @option options [Boolean] :values_only
#   Flag indicating to grab for values only - not keys.
#
# @option options [Boolean] :ignore_case
#   Flag indicating that pattern matching should be case insensitive.
#
# @return [Grab] Returns an instance of the class.
def initialize(options)
  @options = options

  check_options

  # Lookups compiled lazily by compile_exact, compile_regexes, compile_keys.
  @exact = @regex = @keys = nil

  @eval   = @options[:evaluate]
  # Either reject option turns the command into an inverted match.
  @invert = @options[:reject] || @options[:reject_file]

  @keys_only, @vals_only = @options.values_at(:keys_only, :values_only)
end
|
|
196
|
+
|
|
197
|
+
# Return a lambda for the grab command.
#
# When called, the lambda initialises the status (via the status_init
# helper, which is defined outside this file and presumably assigns
# @status - confirm), compiles the key/exact/regex lookups and then runs
# every input record through the first applicable matcher: exact, regex or
# evaluate, in that order. The outcome is handed to emit_match.
#
# @return [Proc] Returns the grab command lambda.
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    compile_keys
    compile_exact
    compile_regexes

    input.each do |record|
      @status[:records_in] += 1

      # Stays nil when none of the matchers is configured.
      hit =
        if @exact
          exact_match?(record)
        elsif @regex
          regex_match?(record)
        elsif @eval
          eval_match?(record)
        end

      emit_match(output, record, hit)
    end
  end
end
|
|
220
|
+
|
|
221
|
+
private
|
|
222
|
+
|
|
223
|
+
# Check the options.
#
# Delegates to the options_* helpers (defined outside this file; presumably
# they raise on a violation - confirm) in this order: allowed option names,
# the pattern-source options passed to options_required_unique, options
# that conflict with +evaluate+, the keys_only/values_only pair passed to
# options_unique, and the pattern files passed to options_files_exist.
def check_options
  pattern_sources = %i(select select_file reject reject_file evaluate)
  modifiers       = %i(exact keys keys_only values_only ignore_case)

  options_allowed(@options, *pattern_sources, *modifiers)
  options_required_unique(@options, *pattern_sources)
  options_conflict(@options,
                   keys: :evaluate,
                   keys_only: :evaluate,
                   values_only: :evaluate,
                   ignore_case: :evaluate,
                   exact: :evaluate)
  options_unique(@options, :keys_only, :values_only)
  options_files_exist(@options, :select_file, :reject_file)
end
|
|
236
|
+
|
|
237
|
+
# Emit a record to the output stream when the match outcome and the invert
# setting disagree: a match without inversion, or no match with inversion.
# Each emitted record increments the :records_out status counter.
#
# @param output [Enumerator::Yielder] Output stream.
# @param record [Hash] Record to emit.
# @param match  [Boolean] Flag indicating a positive match.
def emit_match(output, record, match)
  # Only truthiness matters: match may be nil or any eval result, and
  # @invert holds the reject option value rather than a strict boolean.
  matched  = match ? true : false
  inverted = @invert ? true : false

  return if matched == inverted

  output << record
  @status[:records_out] += 1
end
|
|
252
|
+
|
|
253
|
+
# Compile a list of keys from the options hash, which may contain either a
# list of keys, a symbol or a comma separated string of keys.
#
# The result is stored in @keys as an Array of Symbols. Does nothing when
# the :keys option is absent. A leading ':' on keys given in a string is
# removed, so "ID, :SEQ" yields [:ID, :SEQ]. Any other option type leaves
# @keys nil.
#
# @return [Array<Symbol>, nil] The compiled keys.
def compile_keys
  keys = @options[:keys]

  return unless keys

  # Dispatch on the object itself (Module#===) rather than on the class
  # name, so subclasses of Array, Symbol and String are handled too.
  @keys = case keys
          when Array
            keys.map(&:to_sym)
          when Symbol
            [keys]
          when String
            keys.split(/, */).map { |key| key.sub(/^:/, '').to_sym }
          end
end
|
|
269
|
+
|
|
270
|
+
# Compile a list of regexes for matching.
#
# Skipped when the +exact+ or +evaluate+ option is given. Otherwise @regex
# is reset to an empty list and filled from the select/reject patterns and
# then from the select_file/reject_file pattern files.
def compile_regexes
  skip = @options[:exact] || @options[:evaluate]

  return if skip

  @regex = []

  select, reject, select_file, reject_file =
    @options.values_at(:select, :reject, :select_file, :reject_file)

  compile_regex_patterns(select)
  compile_regex_patterns(reject)
  compile_regex_file(select_file)
  compile_regex_file(reject_file)
end
|
|
282
|
+
|
|
283
|
+
# Compile regexes from a pattern or a (possibly nested) list of patterns
# and append them to @regex. The regexes are case insensitive when the
# +ignore_case+ option is set.
#
# @param patterns [String, Array, nil] Pattern or list of patterns.
def compile_regex_patterns(patterns)
  return unless patterns

  insensitive = @options[:ignore_case]

  [patterns].flatten.each do |source|
    @regex << (insensitive ? /#{source}/i : /#{source}/)
  end
end
|
|
297
|
+
|
|
298
|
+
# Compile regexes from a file with one pattern per line and append them to
# @regex. Line terminators are removed before compiling, and the regexes
# are case insensitive when the +ignore_case+ option is set.
#
# @param file [String, nil] Path to file with patterns.
def compile_regex_file(file)
  return unless file

  insensitive = @options[:ignore_case]

  File.open(file) do |handle|
    handle.each_line do |raw|
      source = raw.chomp

      @regex << (insensitive ? /#{source}/i : /#{source}/)
    end
  end
end
|
|
316
|
+
|
|
317
|
+
# Compile a lookup hash for fast exact matching.
#
# Only runs when the +exact+ option is set. @exact is reset to an empty
# Hash and filled - patterns become the keys - from the select/reject
# patterns and then from the select_file/reject_file pattern files.
def compile_exact
  enabled = @options[:exact]

  return unless enabled

  @exact = {}

  select, reject, select_file, reject_file =
    @options.values_at(:select, :reject, :select_file, :reject_file)

  compile_exact_patterns(select)
  compile_exact_patterns(reject)
  compile_exact_file(select_file)
  compile_exact_file(reject_file)
end
|
|
330
|
+
|
|
331
|
+
# Add a pattern or a (possibly nested) list of patterns to the @exact
# lookup hash. String patterns are stored as symbols; all other patterns
# are stored as they are.
#
# @param patterns [Object, Array, nil] Pattern or list of patterns.
def compile_exact_patterns(patterns)
  return unless patterns

  [patterns].flatten.each do |entry|
    lookup_key = entry.instance_of?(String) ? entry.to_sym : entry

    @exact[lookup_key] = true
  end
end
|
|
345
|
+
|
|
346
|
+
# Add the patterns in a given file, one per line, to the @exact lookup
# hash.
#
# Each line is stripped of its line terminator and converted with
# String#to_num (a helper defined outside this file; presumably it returns
# an Integer or Float for numeric text and a String otherwise - confirm).
# Textual patterns are stored as symbols and numeric patterns as numbers,
# which is how the exact_match_* methods look up record values.
#
# @param file [String, nil] Path to file with patterns.
def compile_exact_file(file)
  return unless file

  File.open(file) do |ios|
    ios.each_line do |line|
      # chomp rather than chomp!: the latter returns nil for a final line
      # that has no trailing newline.
      pattern = line.chomp
      number  = pattern.to_num

      if number.instance_of?(String)
        @exact[pattern.to_sym] = true
      else
        # Store the converted number; a numeric pattern kept as a String
        # could never equal a numeric record value.
        @exact[number] = true
      end
    end
  end
end
|
|
366
|
+
|
|
367
|
+
# Match exactly record keys or values.
#
# Uses the keys compiled from the +keys+ option or, without those, all
# keys of the record, and dispatches on the keys_only/values_only flags.
#
# @param record [Hash] Record to match.
#
# @return [Boolean] True if exact match found.
def exact_match?(record)
  candidates = @keys || record.keys

  return exact_match_keys?(candidates) if @keys_only
  return exact_match_values?(record, candidates) if @vals_only

  exact_match_key_values?(record, candidates)
end
|
|
383
|
+
|
|
384
|
+
# Match exactly any record keys.
#
# @param keys [Array] List of keys to match.
#
# @return [Boolean] True if any key is present in the exact lookup hash.
def exact_match_keys?(keys)
  keys.any? { |key| @exact[key] }
end
|
|
396
|
+
|
|
397
|
+
# Match exactly any record values.
#
# Values that are nil or false are skipped. String values are looked up as
# symbols, all other values as they are.
#
# @param record [Hash] Record to match.
# @param keys   [Array] List of keys whose values to match.
#
# @return [Boolean] True if exact match found.
def exact_match_values?(record, keys)
  keys.any? do |key|
    value = record[key]

    next false unless value

    probe = value.instance_of?(String) ? value.to_sym : value

    @exact.include?(probe)
  end
end
|
|
418
|
+
|
|
419
|
+
# Match exactly any record keys or values.
#
# For every key the key itself is looked up first and then its value.
# Values that are nil or false are skipped; String values are looked up as
# symbols, all other values as they are.
#
# @param record [Hash] Record to match.
# @param keys   [Array] List of keys or values to match.
#
# @return [Boolean] True if exact match found.
def exact_match_key_values?(record, keys)
  keys.any? do |key|
    next true if @exact.include?(key)

    value = record[key]

    next false unless value

    probe = value.instance_of?(String) ? value.to_sym : value

    @exact.include?(probe)
  end
end
|
|
442
|
+
|
|
443
|
+
# Match record keys or values using the compiled regexes.
#
# Uses the keys compiled from the +keys+ option or, without those, all
# keys of the record, and dispatches on the keys_only/values_only flags.
#
# @param record [Hash] Record to match.
#
# @return [Boolean] True if regex match found.
def regex_match?(record)
  candidates = @keys || record.keys

  return regex_match_keys?(candidates) if @keys_only
  return regex_match_values?(record, candidates) if @vals_only

  regex_match_key_values?(record, candidates)
end
|
|
454
|
+
|
|
455
|
+
# Match using regex any record keys.
#
# @param keys [Array] List of keys to match.
#
# @return [Boolean] True if any key, as a string, matches any regex.
def regex_match_keys?(keys)
  keys.any? do |key|
    name = key.to_s

    @regex.any? { |regex| name =~ regex }
  end
end
|
|
469
|
+
|
|
470
|
+
# Match using regex any record values.
#
# Values that are nil or false are skipped; all other values are matched
# in their string form.
#
# @param record [Hash] Record to match.
# @param keys   [Array] List of keys whose values to match.
#
# @return [Boolean] True if regex match found.
def regex_match_values?(record, keys)
  keys.any? do |key|
    value = record[key]

    next false unless value

    text = value.to_s

    @regex.any? { |regex| text =~ regex }
  end
end
|
|
488
|
+
|
|
489
|
+
# Match using regex any record keys or values.
#
# For every key the key name is matched first and then the string form of
# its value. Values that are nil or false are skipped.
#
# @param record [Hash] Record to match.
# @param keys   [Array] List of keys or values to match.
#
# @return [Boolean] True if regex match found.
def regex_match_key_values?(record, keys)
  keys.any? do |key|
    name = key.to_s

    next true if @regex.any? { |regex| name =~ regex }

    value = record[key]

    next false unless value

    text = value.to_s

    @regex.any? { |regex| text =~ regex }
  end
end
|
|
511
|
+
|
|
512
|
+
# Match using eval expression on record values.
#
# The expression is split on whitespace. Every item starting with ':' is
# treated as a record key and replaced with the value of that key; if the
# record has no (or a nil/false) value for such a key the record does not
# match. The resulting expression is then evaluated as Ruby code.
#
# String values are inserted as quoted string literals, so that
# ':NAME == "human"' compares strings and record data cannot be executed as
# code. Strings that look like plain numbers are inserted verbatim, so
# numeric comparisons on numbers stored as text keep working.
#
# SECURITY NOTE: the expression itself is still passed to eval and must
# come from a trusted source.
#
# @param record [Hash] Record to match.
#
# @return [Boolean] True if eval match found.
def eval_match?(record)
  terms = @eval.split("\s").map do |item|
    next item unless item[0] == ':'

    value = record[item[1..-1].to_sym]

    return false unless value

    if value.is_a?(String) && value !~ /\A[-+]?\d+(\.\d+)?([eE][-+]?\d+)?\z/
      value.inspect
    else
      value
    end
  end

  eval terms.join(' ')
end
|
|
534
|
+
end
|
|
535
|
+
end
|