human-ql 1.0.0

#--
# Copyright (c) 2016 David Kellum
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.
#++

module HumanQL

  # Human friendly, lenient query parser. Parses an arbitrary input
  # string and outputs an abstract syntax tree (AST), which uses ruby
  # arrays as S-expressions.
  #
  # === Supported Syntax Summary
  #
  # As per defaults. In the table below, input string variations on
  # the left are separated by ',' and the output AST is shown on the
  # right.
  #
  #   a                        --> 'a'
  #   "a b c"                  --> [ :phrase, 'a', 'b', 'c' ]
  #   a b c                    --> [ :and, 'a', 'b', 'c' ]
  #   a OR b, a|b              --> [ :or, 'a', 'b' ]
  #   a AND b, a&b             --> [ :and, 'a', 'b' ]
  #   a b|c                    --> [ :and, 'a', [:or, 'b', 'c'] ]
  #   (a b) OR (c d)           --> [ :or, [:and, 'a', 'b'], [:and, 'c', 'd'] ]
  #   NOT expr, -expr          --> [ :not, expr ]
  #   SCOPE:expr, SCOPE : expr --> [ 'SCOPE', expr ]
  #
  # Where:
  # * 'expr' may be a simple term, phrase, or parenthetical expression.
  # * SCOPEs must be specified. By default, no scopes are
  #   supported.
  #
  # The AST output from #parse may have various no-ops and
  # redundancies. Run it through a TreeNormalizer to avoid seeing or
  # needing to handle these cases.
  #
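  # === Usage Example
  #
  # A minimal sketch of typical use with the defaults (expected output
  # per the syntax summary above):
  #
  #   parser = HumanQL::QueryParser.new
  #   parser.parse( 'a b OR c' )   # => [ :and, 'a', [ :or, 'b', 'c' ] ]
  #   parser.parse( 'apple -jam' ) # => [ :and, 'apple', [ :not, 'jam' ] ]
  #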
  # === Customization
  #
  # The lexing and token matching patterns, as well as other
  # attributes used in the parser may be adjusted via constructor
  # options or attribute writer methods. Many of these attributes may
  # either be String constants or Regex patterns supporting multiple
  # values as needed. Some features may be disabled by setting these
  # values to nil (e.g. match no tokens). While accessors are defined,
  # internally the instance variables are accessed directly for
  # speed. Tests show this is as fast as using constants (which would
  # be harder to modify) and faster than reader method calls.
  #
  # === Implementation Notes
  #
  # The parser implementation adapts the infix precedence handling and
  # operator stack of the
  # {Shunting Yard Algorithm}[https://en.wikipedia.org/wiki/Shunting-yard_algorithm]
  # originally described by Edsger Dijkstra. Attributes #default_op
  # and #precedence control the handling of explicit or implied infix
  # operators.
  class QueryParser

    # String pattern for Unicode spaces
    SP = "[[:space:]]".freeze

    # String pattern for Unicode non-spaces
    NSP = "[^#{SP}]".freeze

    # Regex for 1-to-many Unicode spaces
    SPACES = /#{SP}+/.freeze

    # Default precedence of supported operators.
    DEFAULT_PRECEDENCE = {
      not: 11,
      or: 2,
      and: 1
    }.freeze

    # The default operator when none is otherwise given between parsed
    # terms.
    # Default: :and
    attr_accessor :default_op

    # Hash of operators to precedence Integer value. The hash should
    # also provide a default value for unlisted operators like any
    # supported scopes. To limit human surprise, the #default_op
    # should have the lowest precedence. The default is as per
    # DEFAULT_PRECEDENCE with a default value of 10, thus :not has the
    # highest precedence at 11.
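    #
    # For example, the default table is equivalent to this sketch:
    #
    #   Hash.new( 10 ).merge!( not: 11, or: 2, and: 1 )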
    attr_accessor :precedence

    # Pattern matching one or more characters to treat as white-space.
    # Default: SPACES
    attr_accessor :spaces

    # Pattern used for lexing to treat certain punctuation characters
    # as separate tokens, even if they are not space separated.
    # Default: Pattern matching any of the characters '(', ')', '|',
    # '&', '"' as used as operator/parenthesis tokens in defaults below.
    attr_accessor :infix_token

    # Pattern used for lexing to treat certain characters as separate
    # tokens when appearing as a prefix only.
    # Default: '-' (as used in the default #not_token)
    attr_accessor :prefix_token

    # OR operator token pattern. Should match the entire token using
    # the '\A' and '\z' syntax for beginning and end of string.
    # Default: Pattern matching complete tokens 'OR', 'or', or '|'
    attr_accessor :or_token

    # AND operator token pattern. Should match the entire token using
    # the '\A' and '\z' syntax for beginning and end of string.
    # Default: Pattern matching complete tokens 'AND', 'and', or '&'
    attr_accessor :and_token

    # NOT operator token pattern. Should match the entire token using
    # the '\A' and '\z' syntax for beginning and end of string.
    # Default: Pattern matching complete tokens 'NOT', 'not', or '-'
    attr_accessor :not_token

    # Left quote pattern or value
    # Default: '"'
    attr_accessor :lquote

    # Right quote pattern or value. It's fine if this is the same as
    # #lquote.
    # Default: '"'
    attr_accessor :rquote

    # Left parenthesis pattern or value
    # Default: '('
    attr_accessor :lparen

    # Right parenthesis pattern or value
    # Default: ')'
    attr_accessor :rparen

    # Given one or an Array of scope prefixes, generate the #scope and
    # #scope_token patterns. A trailing hash is interpreted
    # as options, see below.
    #
    # ==== Options
    #
    # :ignorecase:: If true, generate case insensitive regexes and
    #               upcase the scope in AST output (per #scope_upcase)
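    #
    # For example, a minimal sketch with a single hypothetical 'TITLE'
    # scope (expected output per the SCOPE syntax summarized above):
    #
    #   parser = HumanQL::QueryParser.new( scopes: [ 'TITLE' ] )
    #   parser.parse( 'TITLE:war peace' )
    #   # => [ :and, [ 'TITLE', 'war' ], 'peace' ]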
    def scopes=( scopes )
      scopes = Array( scopes )
      opts = scopes.last.is_a?( Hash ) && scopes.pop || {}
      ignorecase = !!(opts[:ignorecase])
      if scopes.empty?
        @scope = nil
        @scope_token = nil
      elsif scopes.length == 1 && !ignorecase
        s = scopes.first
        @scope = ( s + ':' ).freeze
        @scope_token = /((?<=\A|#{SP})(#{s}))?#{SP}*:/.freeze
      else
        opts = ignorecase ? Regexp::IGNORECASE : nil
        s = Regexp.union( *scopes ).source
        @scope = Regexp.new( '\A(' + s + '):\z', opts ).freeze
        @scope_token = Regexp.new( "((?<=\\A|#{SP})(#{s}))?#{SP}*:",
                                   opts ).freeze
      end
      @scope_upcase = ignorecase
      nil
    end

    # Scope pattern or value matching post-normalized scope token,
    # including trailing ':' but without whitespace.
    # Default: nil -> no scopes
    attr_accessor :scope

    # SCOPE unary operator pattern used for lexing to treat a scope
    # prefix, e.g. 'SCOPE' + ':', with or without internal or trailing
    # whitespace, as a single token. Used by #norm_scope, where it also
    # treats a non-matching ':' as whitespace. This would normally be
    # set via #scopes=.
    # Default: nil -> no scopes
    attr_accessor :scope_token

    # Should scope tokens be upcased in the AST? This would imply
    # case-insensitive #scope, and #scope_token as generated via
    # #scopes= with the `ignorecase: true` option.
    # Default: false
    attr_accessor :scope_upcase

    # If true, log parsing progress and state to $stderr.
    # Default: false
    attr_accessor :verbose

    # Construct given options which are interpreted as attribute names
    # to set.
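    #
    # For example (a sketch; any of the writable attributes above may
    # be passed as options):
    #
    #   HumanQL::QueryParser.new( default_op: :or, verbose: true )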
    def initialize( opts = {} )
      @default_op = :and

      @precedence = Hash.new(10)
      @precedence.merge!( DEFAULT_PRECEDENCE )
      @precedence.freeze

      @spaces = SPACES
      @infix_token = /[()|&"]/.freeze
      @prefix_token = /(?<=\A|#{SP})-(?=#{NSP})/.freeze
      @or_token = /\A(OR|\|)\z/i.freeze
      @and_token = /\A(AND|\&)\z/i.freeze
      @not_token = /\A(NOT|\-)\z/i.freeze
      @lquote = @rquote = '"'.freeze
      @lparen = '('.freeze
      @rparen = ')'.freeze

      @scope = nil
      @scope_token = nil
      @scope_upcase = false

      @verbose = false

      opts.each do |name,val|
        send( name.to_s + '=', val )
      end
    end

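    # Parse the given query string and return the resulting AST, as
    # produced by #parse_tree over the normalized, space-split tokens.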
    def parse( q )
      unless @default_op == :and || @default_op == :or
        raise( "QueryParser#default_op is (#{@default_op.inspect}) " +
               "(should be :and or :or)" )
      end
      q = normalize( q )
      tokens = q ? q.split(' ') : []
      log { "Parse: " + tokens.join( ' ' ) }
      ast = parse_tree( tokens )
      log { "AST: " + ast.inspect }
      ast
    end

    def log( l = nil )
      if @verbose
        l = yield if block_given?
        $stderr.puts( l )
      end
    end

    def parse_tree( tokens )
      s = ParseState.new( self )
      while ( t = tokens.shift )
        case t
        when @lquote
          rqi = tokens.index { |tt| @rquote === tt }
          if rqi
            s.push_term( [ :phrase, *norm_phrase_tokens(tokens[0...rqi]) ] )
            tokens = tokens[rqi+1..-1]
          end # else ignore
        when @lparen
          rpi = rparen_index( tokens )
          if rpi
            s.push_term( parse_tree( tokens[0...rpi] ) )
            tokens = tokens[rpi+1..-1]
          end # else ignore
        when @rquote
          # ignore
        when @rparen
          # ignore
        when @scope
          s.push_op( scope_op( t ) )
        when @or_token
          s.push_op( :or )
        when @and_token
          s.push_op( :and )
        when @not_token
          s.push_op( :not )
        else
          s.push_term( norm_term( t ) )
        end
      end
      s.flush_tree
    end

    # Given a scope token, return the name (minus the trailing ':'),
    # upcased if #scope_upcase.
    def scope_op( token )
      t = token[0...-1]
      t.upcase! if @scope_upcase
      t
    end

    # Find the index of the balancing #rparen token in the remaining
    # tokens, or nil if not found.
    def rparen_index( tokens )
      li = 1
      phrase = false
      tokens.index do |tt|
        if phrase
          phrase = false if @rquote === tt
        else
          case tt
          when @rparen
            li -= 1
          when @lparen
            li += 1
          when @lquote
            phrase = true
          end
        end
        (li == 0)
      end
    end

    # Treat various punctuation-form operators as _always_ being
    # separate tokens, per the #infix_token pattern.
    # Note: Must always call #norm_space _after_ this.
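    #
    # For example, with the default #infix_token (a sketch):
    #
    #   norm_infix( 'a(b|c)' ) # => "a ( b | c ) "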
    def norm_infix( q )
      q.gsub( @infix_token, ' \0 ' )
    end

    # Split prefixes off as separate tokens, per the #prefix_token
    # pattern.
    def norm_prefix( q )
      if @prefix_token
        q.gsub( @prefix_token, '\0 ' )
      else
        q
      end
    end

    # If #scope_token is specified, normalize scopes as separate
    # 'SCOPE:' tokens.
    # This expects the 2nd capture group of #scope_token to be the
    # actual matching scope name, if present.
    def norm_scope( q )
      if @scope_token
        q.gsub( @scope_token ) do
          if $2
            $2 + ': '
          else
            ' '
          end
        end
      else
        q
      end
    end

    # Normalize any whitespace to a single ASCII space character and
    # strip leading/trailing whitespace.
    def norm_space( q )
      q.gsub(@spaces, ' ').strip
    end

    # Runs the suite of initial input norm_* functions. Returns nil if
    # the result is empty.
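    #
    # For example, with the defaults (a sketch):
    #
    #   normalize( 'a AND (b OR "c d")' )
    #   # => 'a AND ( b OR " c d " )'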
    def normalize( q )
      q ||= ''
      q = norm_infix( q )
      q = norm_scope( q )
      q = norm_prefix( q )
      q = norm_space( q )
      q unless q.empty?
    end

    # Select which tokens survive in a phrase. Also passes each token
    # through #norm_term. Tokens matching #lparen and #rparen are
    # dropped.
    def norm_phrase_tokens( tokens )
      tokens.
        reject { |t| @lparen === t || @rparen === t }.
        map { |t| norm_term( t ) }
    end

    # No-op in this implementation but may be used to replace
    # characters. Should not receive or return nil or empty values.
    def norm_term( t )
      t
    end

    # Internal state keeping
    class ParseState # :nodoc:

      def initialize( parser )
        @default_op = parser.default_op
        @precedence = parser.precedence
        @verbose = parser.verbose
        @node = [ @default_op ]
        @ops = []
        @has_op = true
        @index = 0
        @last_term = -1
      end

      def log( l = nil )
        if @verbose
          l = yield if block_given?
          $stderr.puts( l )
        end
      end

      def dump( fr )
        if @verbose
          log( "%2d %2s ops: %-12s node: %-30s" %
               [ @index, fr, @ops.inspect, @node.inspect ] )
        end
      end

      def push_term( t )
        if @has_op
          @index += 1
        else
          push_op( @default_op )
        end
        @node << t
        @last_term = @index
        @has_op = false
        dump 'PT'
      end

      def precedence_lte?( op1, op2 )
        @precedence[op1] <= @precedence[op2]
      end

      def unary?( op )
        ( op == :not || op.is_a?( String ) )
      end

      def push_op( op )
        @index += 1
        # Possible special case: implied default_op in front of :not or
        # a scope.
        if unary?( op )
          push_op( @default_op ) unless @has_op
        elsif @node.length < 2 # no preceding term
          log { "Ignoring leading #{op.inspect} (index #{@index})" }
          return
        end
        loop do
          n, last = @ops.last
          if last && precedence_lte?( op, last )
            @ops.pop
            op_to_node( n, last )
            dump 'PL'
          else
            break
          end
        end
        @ops << [ @index, op ]
        @has_op = true
        dump 'PO'
      end

      def flush_tree
        loop do
          n, last = @ops.pop
          break unless last
          op_to_node( n, last )
          dump 'FO'
        end
        @node
      end

      def pop_term
        @node.pop if @node.length > 1
      end

      def op_to_node( opi, op )
        if opi >= @last_term
          log { "Ignoring trailing #{op.inspect} (index #{opi})" }
          return
        end
        o1 = pop_term
        if o1
          if unary?( op )
            @node << [ op, o1 ]
          else
            o0 = pop_term
            if o0
              if @node[0] == op
                @node << o0 << o1
              else
                @node << [ op, o0, o1 ]
              end
            else
              if @node[0] == op
                @node << o1
              else
                @node = [ op, @node, o1 ]
              end
            end
          end
        else
          log { "No argument to #{op.inspect}, ignoring" }
        end
      end
    end

  end

end