human-ql 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,498 @@
1
+ #--
2
+ # Copyright (c) 2016 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
module HumanQL

  # Human friendly, lenient query parser. Parses an arbitrary input
  # string and outputs an abstract syntax tree (AST), which uses ruby
  # arrays as S-expressions.
  #
  # === Supported Syntax Summary
  #
  # As per defaults. In the table below, input string variations on
  # the left are separated by ',' and output AST is shown on the right.
  #
  #    a                        --> 'a'
  #    "a b c"                  --> [ :phrase, 'a', 'b', 'c' ]
  #    a b c                    --> [ :and, 'a', 'b', 'c' ]
  #    a OR b, a|b              --> [ :or, 'a', 'b' ]
  #    a AND b, a&b             --> [ :and, 'a', 'b' ]
  #    a b|c                    --> [ :and, 'a', [:or, 'b', 'c'] ]
  #    (a b) OR (c d)           --> [ :or, [:and, 'a', 'b'], [:and, 'c', 'd'] ]
  #    NOT expr, -expr          --> [ :not, expr ]
  #    SCOPE:expr, SCOPE : expr --> [ 'SCOPE', expr ]
  #
  # Where:
  # * 'expr' may be simple term, phrase, or parenthetical expression.
  # * SCOPEs must be specified. By default, no scopes are
  #   supported.
  #
  # The AST output from #parse may have various no-ops and
  # redundancies (for example, #parse always wraps its result in a
  # #default_op node, so a lone term 'a' comes back as [ :and, 'a' ]).
  # Run it through a TreeNormalizer to avoid seeing or needing to
  # handle these cases.
  #
  # === Customization
  #
  # The lexing and token matching patterns, as well as other
  # attributes used in the parser may be adjusted via constructor
  # options or attribute writer methods. Many of these attributes may
  # either be String constants or Regex patterns supporting multiple
  # values as needed. Some features may be disabled by setting these
  # values to nil (e.g. match no tokens). While accessors are defined,
  # internally the instance variables are accessed directly for
  # speed. Tests show this is as fast as using constants (which would
  # be harder to modify) and faster than reader method calls.
  #
  # === Implementation Notes
  #
  # The parser implementation adapts the infix precedence handling and
  # operator stack of the
  # {Shunting Yard Algorithm}[https://en.wikipedia.org/wiki/Shunting-yard_algorithm]
  # originally described by Edsger Dijkstra. Attributes #default_op
  # and #precedence control the handling of explicit or implied infix
  # operators.
  class QueryParser

    # String pattern for Unicode spaces
    SP  = "[[:space:]]".freeze

    # String pattern for Unicode non-spaces
    NSP = "[^#{SP}]".freeze

    # Regex for 1-to-many Unicode spaces
    SPACES = /#{SP}+/.freeze

    # Default precedence of supported operators.
    DEFAULT_PRECEDENCE = {
      not: 11,
      or:   2,
      and:  1
    }.freeze

    # The default operator when none is otherwise given between parsed
    # terms.
    # Default: :and
    attr_accessor :default_op

    # Hash of operators to precedence Integer value. The hash should
    # also provide a default value for unlisted operators like any
    # supported scopes. To limit human surprise, the #default_op
    # should have the lowest precedence. The default is as per
    # DEFAULT_PRECEDENCE with a default value of 10, thus :not has the
    # highest precedence at 11.
    attr_accessor :precedence

    # Pattern matching one or more characters to treat as white-space
    # Default: SPACES
    attr_accessor :spaces

    # Pattern used for lexing to treat certain punctuation characters
    # as separate tokens, even if they are not space separated.
    # Default: Pattern matching any characters '(', ')', '|', '&', '"'
    # as used as operator/parenthesis tokens in defaults below.
    attr_accessor :infix_token

    # Pattern used for lexing to treat certain characters as separate
    # tokens when appearing as a prefix only.
    # Default '-' (as used in default #not_token)
    attr_accessor :prefix_token

    # OR operator token pattern. Should match the entire token using
    # the '\A' and '\z' syntax for beginning and end of string.
    # Default: Pattern matching complete tokens 'OR', 'or', or '|'
    attr_accessor :or_token

    # AND operator token pattern. Should match the entire token using
    # the '\A' and '\z' syntax for beginning and end of string.
    # Default: Pattern matching complete tokens 'AND', 'and', or '&'
    attr_accessor :and_token

    # NOT operator token pattern. Should match the entire token using
    # the '\A' and '\z' syntax for beginning and end of string.
    # Default: Pattern matching complete tokens 'NOT', 'not', or '-'
    attr_accessor :not_token

    # Left quote pattern or value
    # Default: '"'
    attr_accessor :lquote

    # Right quote pattern or value. It's fine if this is the same as
    # #lquote.
    # Default: '"'
    attr_accessor :rquote

    # Left parentheses pattern or value
    # Default: '('
    attr_accessor :lparen

    # Right parentheses pattern or value
    # Default: ')'
    attr_accessor :rparen

    # Given one or an Array of scope prefixes, generate the #scope and
    # #scope_token patterns. A trailing hash is interpreted
    # as options, see below. The passed Array is not modified.
    #
    # With no scopes (nil or an empty Array), #scope and #scope_token
    # are reset to nil. With exactly one scope and no :ignorecase,
    # #scope is set to a plain String ('SCOPE:'); otherwise it is a
    # Regexp. Scope names are matched literally (regex
    # meta-characters in a name are escaped).
    #
    # ==== Options
    #
    # :ignorecase:: If true, generate case insensitive regexes and
    #               upcase the scope in AST output (per #scope_upcase)
    def scopes=( scopes )
      # Kernel#Array hands back an Array argument as-is, so work on a
      # copy: popping the options must not alter the caller's array.
      scopes = Array( scopes ).dup
      opts = scopes.last.is_a?( Hash ) ? scopes.pop : {}
      ignorecase = !!opts[:ignorecase]
      if scopes.empty?
        @scope = nil
        @scope_token = nil
      elsif scopes.length == 1 && !ignorecase
        s = scopes.first
        @scope = ( s + ':' ).freeze
        # Escape so that a name like 'C++' matches literally, as
        # Regexp.union does for the multi-scope case below.
        @scope_token =
          /((?<=\A|#{SP})(#{Regexp.escape( s )}))?#{SP}*:/.freeze
      else
        flags = ignorecase ? Regexp::IGNORECASE : nil
        s = Regexp.union( *scopes ).source
        @scope = Regexp.new( '\A(' + s + '):\z', flags ).freeze
        @scope_token = Regexp.new( "((?<=\\A|#{SP})(#{s}))?#{SP}*:",
                                   flags ).freeze
      end
      @scope_upcase = ignorecase
      nil
    end

    # Scope pattern or value matching post-normalized scope token,
    # including trailing ':' but without whitespace.
    # Default: nil -> no scopes
    attr_accessor :scope

    # SCOPE unary operator pattern used for lexing to treat a scope
    # prefix, e.g. 'SCOPE' + ':', with or without internal or trailing
    # whitespace as single token. Used by #norm_scope, where it also
    # treats a non-matching ':' as whitespace. This would normally be
    # set via #scopes=.
    # Default: nil -> no scopes
    attr_accessor :scope_token

    # Should scope tokens be upcased in the AST? This would imply
    # case-insensitive #scope, and #scope_token as generated via
    # #scopes= with the `ignorecase: true` option.
    # Default: false
    attr_accessor :scope_upcase

    # If true, log parsing progress and state to $stderr.
    # Default: false
    attr_accessor :verbose

    # Construct given options which are interpreted as attribute names
    # to set. Each key is applied through the public writer of the
    # same name (e.g. `scopes: %w[FOO BAR]` calls #scopes=), after all
    # defaults have been assigned.
    def initialize( opts = {} )
      @default_op = :and

      @precedence = Hash.new(10)
      @precedence.merge!( DEFAULT_PRECEDENCE )
      @precedence.freeze

      @spaces = SPACES
      @infix_token  = /[()|&"]/.freeze
      @prefix_token = /(?<=\A|#{SP})-(?=#{NSP})/.freeze
      @or_token  = /\A(OR|\|)\z/i.freeze
      @and_token = /\A(AND|\&)\z/i.freeze
      @not_token = /\A(NOT|\-)\z/i.freeze
      @lquote = @rquote = '"'.freeze
      @lparen = '('.freeze
      @rparen = ')'.freeze

      @scope = nil
      @scope_token = nil
      @scope_upcase = false

      @verbose = false

      # public_send: options may only reach the public attribute
      # writers, never private methods that happen to end in '='.
      opts.each do |name, val|
        public_send( "#{name}=", val )
      end
    end

    # Parse the query String q (nil is treated as empty) and return
    # the AST as nested Arrays. The outermost node is always headed by
    # #default_op; an empty query yields just [ default_op ].
    #
    # Raises RuntimeError if #default_op is neither :and nor :or.
    def parse( q )
      unless @default_op == :and || @default_op == :or
        raise( "QueryParser#default_op is (#{@default_op.inspect}) " +
               "(should be :and or :or)" )
      end
      q = normalize( q )
      tokens = q ? q.split(' ') : []
      log { "Parse: " + tokens.join( ' ' ) }
      ast = parse_tree( tokens )
      log { "AST: " + ast.inspect }
      ast
    end

    # If #verbose, write the given message, or the result of the given
    # block when one is passed, to $stderr. The block is only
    # evaluated when #verbose is set.
    def log( l = nil )
      if @verbose
        l = yield if block_given?
        $stderr.puts( l )
      end
    end

    # Build the AST for an Array of already normalized tokens,
    # recursing for parenthetical sub-expressions. Unbalanced quotes
    # and parentheses are silently ignored. The passed Array is not
    # modified.
    def parse_tree( tokens )
      # Tokens are consumed via shift below; operate on a copy so the
      # caller's Array is left intact.
      tokens = tokens.dup
      s = ParseState.new( self )
      while ( t = tokens.shift )
        case t
        when @lquote
          rqi = tokens.index { |tt| @rquote === tt }
          if rqi
            s.push_term( [ :phrase, *norm_phrase_tokens(tokens[0...rqi]) ] )
            tokens = tokens[rqi+1..-1]
          end # else ignore
        when @lparen
          rpi = rparen_index( tokens )
          if rpi
            s.push_term( parse_tree( tokens[0...rpi] ) )
            tokens = tokens[rpi+1..-1]
          end # else ignore
        when @rquote
          #ignore
        when @rparen
          #ignore
        when @scope
          s.push_op( scope_op( t ) )
        when @or_token
          s.push_op( :or )
        when @and_token
          s.push_op( :and )
        when @not_token
          s.push_op( :not )
        else
          s.push_term( norm_term( t ) )
        end
      end
      s.flush_tree
    end

    # Given scope token, return the name (minus trailing ':'),
    # upcased if #scope_upcase.
    def scope_op( token )
      t = token[0...-1]
      t.upcase! if @scope_upcase
      t
    end

    # Find token matching #rparen in remaining tokens. Returns the
    # index of the token closing the already opened parenthesis,
    # accounting for nested parentheses and skipping over quoted
    # phrases, or nil if there is no such token.
    def rparen_index( tokens )
      li = 1          # depth of currently open parentheses
      phrase = false  # true while inside a quoted phrase
      tokens.index do |tt|
        if phrase
          phrase = false if @rquote === tt
        else
          case tt
          when @rparen
            li -= 1
          when @lparen
            li += 1
          when @lquote
            phrase = true
          end
        end
        (li == 0)
      end
    end

    # Treat various punctuation form operators as _always_ being
    # separate tokens per #infix_token pattern. Returns q unchanged if
    # #infix_token is nil (disabled).
    # Note: Must always call norm_space _after_ this
    def norm_infix( q )
      if @infix_token
        q.gsub( @infix_token, ' \0 ' )
      else
        q
      end
    end

    # Split prefixes as separate tokens per #prefix_token pattern
    def norm_prefix( q )
      if @prefix_token
        q.gsub( @prefix_token, '\0 ' )
      else
        q
      end
    end

    # If #scope_token is specified, normalize scopes as separate
    # 'SCOPE:' tokens.
    # This expects the 2nd capture group of #scope_token to be the
    # actual matching scope name, if present. A ':' without a
    # matching scope name is replaced by a space.
    def norm_scope( q )
      if @scope_token
        q.gsub( @scope_token ) do
          if $2
            $2 + ': '
          else
            ' '
          end
        end
      else
        q
      end
    end

    # Normalize any whitespace to a single ASCII space character and
    # strip leading/trailing whitespace.
    def norm_space( q )
      q.gsub(@spaces, ' ').strip
    end

    # Runs the suite of initial input norm_* functions. Returns nil if
    # the result is empty. A nil q is treated as an empty String.
    def normalize( q )
      q ||= ''
      q = norm_infix( q )
      q = norm_scope( q )
      q = norm_prefix( q )
      q = norm_space( q )
      q unless q.empty?
    end

    # Select which tokens survive in a phrase. Also passes each token
    # through #norm_term. Tokens matching #lparen and #rparen are
    # dropped.
    def norm_phrase_tokens( tokens )
      tokens.
        reject { |t| @lparen === t || @rparen === t }.
        map { |t| norm_term( t ) }
    end

    # No-op in this implementation but may be used to replace
    # characters. Should not receive nor return null or empty values.
    def norm_term( t )
      t
    end

    # Internal state keeping: the node under construction plus the
    # stack of pending operators, for one level of #parse_tree.
    class ParseState # :nodoc:

      # Copies default_op, precedence and verbose from the given
      # parser and starts with an empty node headed by the default
      # operator.
      def initialize( parser )
        @default_op = parser.default_op
        @precedence = parser.precedence
        @verbose = parser.verbose
        @node = [ @default_op ]  # S-expression being built
        @ops = []                # stack of pending [ index, op ] pairs
        @has_op = true           # no implied operator needed before next term
        @index = 0               # running position counter of ops/terms
        @last_term = -1          # @index value of the most recent term
      end

      # If verbose, write the message (or block result) to $stderr.
      def log( l = nil )
        if @verbose
          l = yield if block_given?
          $stderr.puts( l )
        end
      end

      # If verbose, log the current index, the given short tag, the
      # operator stack and the node.
      def dump( fr )
        if @verbose
          log( "%2d %2s ops: %-12s node: %-30s" %
               [ @index, fr, @ops.inspect, @node.inspect ] )
        end
      end

      # Append term t to the node, first pushing the implied default
      # operator if no operator was seen since the previous term.
      def push_term( t )
        if @has_op
          @index += 1
        else
          push_op( @default_op )
        end
        @node << t
        @last_term = @index
        @has_op = false
        dump 'PT'
      end

      # True if the precedence of op1 is lower than or equal to op2.
      def precedence_lte?( op1, op2 )
        @precedence[op1] <= @precedence[op2]
      end

      # True for :not and for scopes (which are given as Strings).
      def unary?( op )
        ( op == :not || op.is_a?( String ) )
      end

      # Push operator op on the stack, first applying any stacked
      # operators of equal or higher precedence to the node. A binary
      # operator with no term before it is ignored.
      def push_op( op )
        @index += 1
        # Possible special case implied DEFAULT_OP in front of :not or
        # :scope.
        if unary?( op )
          push_op( @default_op ) unless @has_op
        elsif @node.length < 2 # no preceding term
          log { "Ignoring leading #{op.inspect} (index #{@index})" }
          return
        end
        loop do
          n, last = @ops.last
          if last && precedence_lte?( op, last )
            @ops.pop
            op_to_node( n, last )
            dump 'PL'
          else
            break
          end
        end
        @ops << [ @index, op ]
        @has_op = true
        dump 'PO'
      end

      # Apply all remaining stacked operators and return the
      # completed node.
      def flush_tree
        loop do
          n, last = @ops.pop
          break unless last
          op_to_node( n, last )
          dump 'FO'
        end
        @node
      end

      # Remove and return the last term of the node, or nil if only
      # the head operator remains.
      def pop_term
        @node.pop if @node.length > 1
      end

      # Apply operator op, which was pushed at index opi, to the
      # trailing term(s) of the node.
      def op_to_node( opi, op )
        # No term was pushed after this operator: nothing to apply
        # it to.
        if opi >= @last_term
          log { "Ignoring trailing #{op.inspect} (index #{opi})" }
          return
        end
        o1 = pop_term
        if o1
          if unary?( op )
            @node << [ op, o1 ]
          else
            o0 = pop_term
            if o0
              if @node[0] == op
                # Same operator as the node head: keep the list flat
                @node << o0 << o1
              else
                @node << [ op, o0, o1 ]
              end
            else
              if @node[0] == op
                @node << o1
              else
                @node = [ op, @node, o1 ]
              end
            end
          end
        else
          log { "No argument to #{op.inspect}, ignoring" }
        end
      end
    end

  end

end