janeway-jsonpath 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +7 -0
  3. data/README.md +43 -0
  4. data/bin/janeway +130 -0
  5. data/lib/janeway/ast/array_slice_selector.rb +104 -0
  6. data/lib/janeway/ast/binary_operator.rb +101 -0
  7. data/lib/janeway/ast/boolean.rb +24 -0
  8. data/lib/janeway/ast/child_segment.rb +48 -0
  9. data/lib/janeway/ast/current_node.rb +71 -0
  10. data/lib/janeway/ast/descendant_segment.rb +44 -0
  11. data/lib/janeway/ast/error.rb +10 -0
  12. data/lib/janeway/ast/expression.rb +55 -0
  13. data/lib/janeway/ast/filter_selector.rb +61 -0
  14. data/lib/janeway/ast/function.rb +40 -0
  15. data/lib/janeway/ast/helpers.rb +27 -0
  16. data/lib/janeway/ast/identifier.rb +35 -0
  17. data/lib/janeway/ast/index_selector.rb +27 -0
  18. data/lib/janeway/ast/name_selector.rb +52 -0
  19. data/lib/janeway/ast/null.rb +23 -0
  20. data/lib/janeway/ast/number.rb +21 -0
  21. data/lib/janeway/ast/query.rb +41 -0
  22. data/lib/janeway/ast/root_node.rb +50 -0
  23. data/lib/janeway/ast/selector.rb +32 -0
  24. data/lib/janeway/ast/string_type.rb +21 -0
  25. data/lib/janeway/ast/unary_operator.rb +26 -0
  26. data/lib/janeway/ast/wildcard_selector.rb +46 -0
  27. data/lib/janeway/error.rb +23 -0
  28. data/lib/janeway/functions/count.rb +39 -0
  29. data/lib/janeway/functions/length.rb +33 -0
  30. data/lib/janeway/functions/match.rb +69 -0
  31. data/lib/janeway/functions/search.rb +63 -0
  32. data/lib/janeway/functions/value.rb +47 -0
  33. data/lib/janeway/functions.rb +62 -0
  34. data/lib/janeway/interpreter.rb +644 -0
  35. data/lib/janeway/lexer.rb +514 -0
  36. data/lib/janeway/location.rb +3 -0
  37. data/lib/janeway/parser.rb +608 -0
  38. data/lib/janeway/token.rb +39 -0
  39. data/lib/janeway/version.rb +5 -0
  40. data/lib/janeway.rb +51 -0
  41. metadata +92 -0
@@ -0,0 +1,514 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'location'
4
+ require_relative 'token'
5
+ require_relative 'error'
6
+
7
+ module Janeway
8
# Maps each operator token type to the lexeme that spells it in a query.
OPERATORS = {
  and: '&&',
  array_slice_separator: ':',
  child_end: ']',
  child_start: '[',
  current_node: '@',
  descendants: '..',
  dot: '.',
  equal: '==',
  filter: '?',
  greater_than: '>',
  greater_than_or_equal: '>=',
  group_end: ')',
  group_start: '(',
  less_than: '<',
  less_than_or_equal: '<=',
  minus: '-',
  not: '!',
  not_equal: '!=',
  or: '||',
  root: '$',
  union: ',',
  wildcard: '*',
}.freeze

# Operator lexemes grouped by length.
ONE_CHAR_LEX = OPERATORS.values.select { |lexeme| lexeme.size == 1 }.freeze
TWO_CHAR_LEX = OPERATORS.values.select { |lexeme| lexeme.size == 2 }.freeze

# First character of every two-character operator.
TWO_CHAR_LEX_FIRST = TWO_CHAR_LEX.map { |lexeme| lexeme[0] }.freeze

# Characters that are a complete operator on their own AND the start of a two-character operator.
# The parentheses matter: without them `.freeze` would bind to the right-hand operand only,
# leaving the intersection itself mutable.
ONE_OR_TWO_CHAR_LEX = (ONE_CHAR_LEX & TWO_CHAR_LEX_FIRST).freeze

WHITESPACE = " \t\n\r"
KEYWORD = %w[true false null].freeze
FUNCTIONS = %w[length count match search value].freeze

# faster to check membership in a string than an array of char (benchmarked ruby 3.1.2)
ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
DIGITS = '0123456789'

# chars that may be used as the first letter of member-name-shorthand
NAME_FIRST = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
47
+
48
+ # Transforms source code into tokens
49
+ class Lexer
50
+ class Error < Janeway::Error; end
51
+
52
+ attr_reader :source, :tokens
53
+ attr_accessor :next_p, :lexeme_start_p
54
+
55
# Lex a complete query in one call and hand back its tokens.
#
# @param query [String] jsonpath query
# @return [Array<Token>]
# @raise [ArgumentError] if query is not a String
def self.lex(query)
  raise ArgumentError, "expect string, got #{query.inspect}" unless query.is_a?(String)

  new(query).tap(&:start_tokenization).tokens
end
66
+
67
# Set up a lexer positioned at the start of the query, with an empty token list.
#
# @param source [String] jsonpath query to tokenize
def initialize(source)
  @source = source
  @tokens = []
  # Both pointers are character offsets into @source.
  @lexeme_start_p = 0
  @next_p = 0
end
73
+
74
# Tokenize the whole source, appending to #tokens and finishing with an :eof token.
#
# An empty query has no first or last character, so the surrounding-whitespace check
# is skipped for it (String#include? raises TypeError when given nil). The token list
# for an empty query is just the :eof token.
#
# @return [Array<Token>] the token list
# @raise [Lexer::Error] if the query starts or ends with whitespace
def start_tokenization
  if !@source.empty? && (WHITESPACE.include?(@source[0]) || WHITESPACE.include?(@source[-1]))
    raise Error, 'JSONPath query may not start or end with whitespace'
  end

  tokenize while source_uncompleted?

  tokens << Token.new(:eof, '', nil, after_source_end_location)
end
82
+
83
# Read one token from @source and advance the pointers.
# A whitespace character is consumed without producing a token.
#
# @raise [Lexer::Error] if the character cannot start any token
def tokenize
  self.lexeme_start_p = next_p

  c = consume
  return if WHITESPACE.include?(c)

  token =
    if ONE_OR_TWO_CHAR_LEX.include?(c)
      token_from_one_or_two_char_lex(c)
    elsif ONE_CHAR_LEX.include?(c)
      token_from_one_char_lex(c)
    elsif TWO_CHAR_LEX_FIRST.include?(c)
      token_from_two_char_lex(c)
    elsif %w[" '].include?(c)
      lex_delimited_string(c)
    elsif digit?(c)
      lex_number
    elsif name_first_char?(c)
      # Directly after a dot, words such as "true" or "length" are plain member names.
      # The token list is still empty when the query starts with a name character,
      # hence the safe navigation.
      lex_member_name_shorthand(ignore_keywords: tokens.last&.type == :dot)
    end

  raise Error.new("Unknown character: #{c.inspect}", @source, current_location) unless token

  tokens << token
end
111
+
112
# @param lexeme [String] character to test
# @return [Boolean] true if lexeme is found in DIGITS
def digit?(lexeme)
  !DIGITS.index(lexeme).nil?
end
115
+
116
# @param lexeme [String] character to test
# @return [Boolean] true if lexeme is found in ALPHABET or DIGITS
def alpha_numeric?(lexeme)
  [ALPHABET, DIGITS].any? { |charset| charset.include?(lexeme) }
end
119
+
120
# Peek at an upcoming character without consuming it.
#
# @param offset [Integer] 1 for the next unread character, 2 for the one after it, and so on
# @return [String] the character, or "\0" when that position is at or past the end of the source
def lookahead(offset = 1)
  index = next_p + offset - 1
  index < source.length ? source[index] : "\0"
end
126
+
127
# Build the token for a single-character operator.
#
# @param lexeme [String] the operator character, already consumed
# @return [Token]
# @raise [Lexer::Error] if "." or "-" is followed by whitespace
def token_from_one_char_lex(lexeme)
  case lexeme
  when '.', '-'
    raise Error, "Operator #{lexeme.inspect} must not be followed by whitespace" if WHITESPACE.include?(lookahead)
  end

  Token.new(OPERATORS.key(lexeme), lexeme, nil, current_location)
end
134
+
135
# Build the token for an operator that may be one or two characters long.
# The two-character reading wins whenever it spells a known operator.
#
# @param lexeme [String] first operator character, already consumed
# @return [Token]
# @raise [Lexer::Error] if ".." is followed by whitespace
def token_from_one_or_two_char_lex(lexeme)
  pair = "#{lexeme}#{lookahead}"
  return token_from_one_char_lex(lexeme) unless TWO_CHAR_LEX.include?(pair)

  consume # second character of the operator
  if pair == '..' && WHITESPACE.include?(lookahead)
    raise Error, "Operator #{pair.inspect} must not be followed by whitespace"
  end

  Token.new(OPERATORS.key(pair), pair, nil, current_location)
end
149
+
150
# Build the token for an operator that exists only in two-character form.
#
# @param lexeme [String] first operator character, already consumed
# @return [Token]
# @raise [Lexer::Error] if the two characters do not spell a known operator
def token_from_two_char_lex(lexeme)
  pair = "#{lexeme}#{lookahead}"
  if TWO_CHAR_LEX.include?(pair)
    consume # second character of the operator
    return Token.new(OPERATORS.key(pair), pair, nil, current_location)
  end

  raise Error.new("Unknown operator \"#{pair}\"", @source)
end
159
+
160
# Advance the read pointer by one character.
#
# @return [String] the character that was just passed over (whatever #lookahead reported)
def consume
  lookahead.tap { @next_p += 1 }
end
165
+
166
# Consume characters for as long as the next one is a digit.
#
# @return [nil]
def consume_digits
  loop do
    break unless digit?(lookahead)

    consume
  end
end
169
+
170
# Lex a quoted string. The opening delimiter has already been consumed.
#
# @param delimiter [String] eg. ' or "
# @return [Token] string token; its literal has escapes resolved and no delimiters,
#   its lexeme is the raw source text including delimiters
# @raise [Lexer::Error] on an invalid character, a forbidden escape, or a missing closing delimiter
def lex_delimited_string(delimiter)
  non_delimiter = %w[' "].reject { _1 == delimiter }.first

  literal_chars = []
  while lookahead != delimiter && source_uncompleted?
    # Transform escaped representation to literal chars
    next_char = lookahead
    literal_chars <<
      if next_char == '\\'
        if lookahead(2) == delimiter
          consume # \
          consume # delimiter
        elsif lookahead(2) == non_delimiter
          qtype = delimiter == '"' ? 'double' : 'single'
          raise Error, "Character #{non_delimiter} must not be escaped within #{qtype} quotes"
        else
          consume_escape_sequence # consumes multiple chars
        end
      elsif unescaped?(next_char)
        consume
      elsif %w[' "].include?(next_char) && next_char != delimiter
        # the other quote character needs no escaping inside this kind of string
        consume
      else
        # Pass the source before the location, matching the other three-argument
        # Error.new call in this lexer.
        raise Error.new("invalid character #{next_char.inspect}", @source, current_location)
      end
  end
  raise Error.new("Unterminated string error: #{literal_chars.join.inspect}") if source_completed?

  consume # closing delimiter

  # literal value omits delimiters and includes un-escaped values
  literal = literal_chars.join

  # lexeme value includes delimiters and literal escape characters
  lexeme = source[lexeme_start_p..(next_p - 1)]

  Token.new(:string, lexeme, literal, current_location)
end
210
+
211
# Read an escape sequence (backslash included) and return the character it stands for.
#
# @return [String] single character (possibly multi-byte)
# @raise [RuntimeError] if the next character is not a backslash
# @raise [Lexer::Error] if the escaped character may not be escaped, or is not allowed at all
def consume_escape_sequence
  raise 'not an escape sequence' unless consume == '\\'

  char = consume
  case char
  when 'b' then "\b"
  when 'f' then "\f"
  when 'n' then "\n"
  when 'r' then "\r"
  when 't' then "\t"
  when '/', '\\', '"', "'" then char
  when 'u' then consume_unicode_escape_sequence
  else
    raise Error, "Character #{char} must not be escaped" if unescaped?(char)

    # Whatever this is, it is not allowed even when escaped.
    # Pass the source before the location, matching the other three-argument
    # Error.new call in this lexer.
    raise Error.new("Invalid character #{char.inspect}", @source, current_location)
  end
end
234
+
235
# Consume a unicode escape that matches this ABNF grammar:
# https://www.rfc-editor.org/rfc/rfc9535.html#section-2.3.1.1-2
#
#     hexchar             = non-surrogate / (high-surrogate "\" %x75 low-surrogate)
#     non-surrogate       = ((DIGIT / "A"/"B"/"C" / "E"/"F") 3HEXDIG) /
#                            ("D" %x30-37 2HEXDIG )
#     high-surrogate      = "D" ("8"/"9"/"A"/"B") 2HEXDIG
#     low-surrogate       = "D" ("C"/"D"/"E"/"F") 2HEXDIG
#
#     HEXDIG              = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
#
# Both lower and uppercase are allowed. The grammar does not show this here
# but clarifies that in a following note.
#
# The preceding `\u` prefix has already been consumed.
#
# @return [String] single character (possibly multi-byte)
# @raise [Lexer::Error] if a high surrogate is not followed by `\u` + low surrogate,
#   or a low surrogate appears on its own
def consume_unicode_escape_sequence
  hex_str = consume_four_hex_digits

  if high_surrogate?(hex_str)
    # valid, as long as it is followed by \u low-surrogate
    prefix = [consume, consume].join
    hex_str2 = consume_four_hex_digits

    # A high surrogate followed by a low surrogate is the UTF-16 way of representing
    # a code point above U+FFFF. Those two values are not valid UTF-8 code points
    # themselves, so the pair is converted into the single code point it stands for.
    return convert_surrogate_pair_to_codepoint(hex_str, hex_str2) if prefix == '\\u' && low_surrogate?(hex_str2)

    # Not allowed to have high surrogate that is not followed by low surrogate
    raise Error, "invalid unicode escape sequence: \\u#{hex_str2}"
  end

  # Not allowed to have low surrogate that is not preceded by high surrogate
  raise Error, "invalid unicode escape sequence: \\u#{hex_str}" if low_surrogate?(hex_str)

  # Everything else, including D000..D7FF, is a non-surrogate code point.
  hex_str.hex.chr('UTF-8')
end
282
+
283
# Convert a valid UTF-16 surrogate pair into a UTF-8 string containing a single code point.
#
# @param high_surrogate_hex [String] string of hex digits, eg. "D83D"
# @param low_surrogate_hex [String] string of hex digits, eg. "DE09"
# @return [String] UTF-8 string containing a single multi-byte unicode character, eg. "😉"
# @raise [ArgumentError] if either argument is not exactly 4 characters long
def convert_surrogate_pair_to_codepoint(high_surrogate_hex, low_surrogate_hex)
  [high_surrogate_hex, low_surrogate_hex].each do |hex_str|
    raise ArgumentError, "expect 4 hex digits, got #{hex_str.inspect}" unless hex_str.size == 4
  end

  # Calculate the code point from the surrogate pair values
  # algorithm from https://russellcottrell.com/greek/utilities/SurrogatePairCalculator.htm
  high = high_surrogate_hex.hex
  low = low_surrogate_hex.hex
  codepoint = ((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000
  [codepoint].pack('U') # convert integer codepoint to single character string
end
300
+
301
# True if the given string is 4 characters long and begins with D8, D9, DA or DB
# (either case), i.e. is a "high-surrogate" in the unicode escape grammar.
#
# @param hex_digits [String]
# @return [Boolean]
def high_surrogate?(hex_digits)
  hex_digits.size == 4 && hex_digits.upcase.start_with?('D8', 'D9', 'DA', 'DB')
end
307
+
308
# True if the given string is 4 characters long and begins with DC, DD, DE or DF
# (either case), i.e. is a "low-surrogate" in the unicode escape grammar.
#
# @param hex_digits [String]
# @return [Boolean]
def low_surrogate?(hex_digits)
  hex_digits.size == 4 && hex_digits.upcase.start_with?('DC', 'DD', 'DE', 'DF')
end
314
+
315
# Consume and return 4 hex digits from the source. Either upper or lower case is accepted.
# No judgement is made here on whether the resulting sequence is valid,
# as long as it is 4 hex digits.
#
# Running off the end of the source also fails here, because #consume then yields "\0",
# which is not a hex digit.
#
# NOTE(review): failures raise a plain RuntimeError rather than Lexer::Error; kept as-is
# so callers see the same exception type as before.
#
# @return [String]
# @raise [RuntimeError] if any of the 4 characters is not a hex digit
def consume_four_hex_digits
  hex_digits = []
  4.times do
    hex_digits << consume
    next if hex_digits.last.match?(/\A[0-9A-Fa-f]\z/)

    raise "invalid unicode escape sequence: \\u#{hex_digits.join}"
  end

  hex_digits.join
end
336
+
337
# Consume a numeric string. May be an integer, fractional, or exponent.
#   number = (int / "-0") [ frac ] [ exp ] ; decimal number
#   frac   = "." 1*DIGIT                  ; decimal fraction
#   exp    = "e" [ "-" / "+" ] 1*DIGIT    ; decimal exponent
#
# The first digit has already been consumed by the caller.
#
# @return [Token] :number token whose literal is a Float when the lexeme has a
#   fraction or exponent, otherwise an Integer
# @raise [Lexer::Error] if the exponent has no digits, or the integer part has a leading zero
def lex_number
  consume_digits

  # Look for a fractional part
  if lookahead == '.' && digit?(lookahead(2))
    consume # "."
    consume_digits
  end

  # Look for an exponent part
  if 'Ee'.include?(lookahead)
    consume # "e", "E"
    consume if %w[+ -].include?(lookahead) # "+" / "-"
    unless digit?(lookahead)
      lexeme = source[lexeme_start_p..(next_p - 1)]
      raise Error, "Exponent 'e' must be followed by number: #{lexeme.inspect}"
    end
    consume_digits
  end

  lexeme = source[lexeme_start_p..(next_p - 1)]

  # The leading-zero rule applies to the integer part only: "0", "0.5" and "0e1" are
  # valid numbers, while "01" and "00.5" are not.
  raise Error, "Number may not start with leading zero: #{lexeme.inspect}" if lexeme.match?(/\A0\d/)

  literal =
    if lexeme.include?('.') || lexeme.downcase.include?('e')
      lexeme.to_f
    else
      lexeme.to_i
    end
  Token.new(:number, lexeme, literal, current_location)
end
376
+
377
# Consume an alphanumeric string.
# If `ignore_keywords`, the result is always an :identifier token.
# Otherwise, keywords and function names are recognized and tokenized as those types.
#
# @param ignore_keywords [Boolean]
# @return [Token]
def lex_identifier(ignore_keywords: false)
  consume while alpha_numeric?(lookahead)

  word = source[lexeme_start_p..(next_p - 1)]
  kind = :identifier
  unless ignore_keywords
    if KEYWORD.include?(word)
      kind = word.to_sym
    elsif FUNCTIONS.include?(word)
      kind = :function
    end
  end

  Token.new(kind, word, word, current_location)
end
397
+
398
# Lex an identifier that is not within delimiters.
# Characters are consumed for as long as #unescaped? accepts them; no escape
# sequences are interpreted, and keywords and function names get no special type.
#
# @return [Token] always an :identifier token
def lex_unescaped_identifier
  loop do
    break unless unescaped?(lookahead)

    consume
  end

  text = source[lexeme_start_p..(next_p - 1)]
  Token.new(:identifier, text, text, current_location)
end
408
+
409
# Return true if the character matches the definition of "unescaped" from RFC9535:
#   unescaped = %x20-21 /      ; see RFC 8259
#                  ; omit 0x22 "
#               %x23-26 /
#                  ; omit 0x27 '
#               %x28-5B /
#                  ; omit 0x5C \
#               %x5D-D7FF /
#                  ; skip surrogate code points
#               %xE000-10FFFF
#
# @param char [String] single character, possibly multi-byte
# @return [Boolean]
def unescaped?(char)
  codepoint = char.ord
  # the three code points carved out of the first range: " ' \
  return false if [0x22, 0x27, 0x5C].include?(codepoint)

  (0x20..0xD7FF).cover?(codepoint) || (0xE000..0x10FFFF).cover?(codepoint)
end
431
+
432
# True if the character is one of: b f n r t / \
# The letters are the escape-sequence names for backspace, form feed, line feed,
# carriage return and horizontal tab.
#
# @param char [String] single character
# @return [Boolean]
def escapable?(char)
  [0x62, 0x66, 0x6E, 0x72, 0x74, 0x2F, 0x5C].include?(char.ord)
end
444
+
445
# True if character is suitable as the first character in a name selector
# using shorthand notation (ie. no bracket notation.)
#
# Defined in RFC9535 by this ABNF grammar:
#   name-first = ALPHA /
#                "_"   /
#                %x80-D7FF /
#                   ; skip surrogate code points
#                %xE000-10FFFF
#
# @param char [String] single character, possibly multi-byte
# @return [Boolean]
def name_first_char?(char)
  return true if NAME_FIRST.include?(char)

  codepoint = char.ord
  (0x80..0xD7FF).cover?(codepoint) || (0xE000..0x10FFFF).cover?(codepoint)
end
462
+
463
# True if character is acceptable in a name selector using shorthand notation (ie. no bracket notation.)
# This is the same set as #name_first_char? except that it also allows digits.
#
# @param char [String] single character, possibly multi-byte
# @return [Boolean]
def name_char?(char)
  return true if NAME_FIRST.include?(char) || DIGITS.include?(char)

  codepoint = char.ord
  (0x80..0xD7FF).cover?(codepoint) || (0xE000..0x10FFFF).cover?(codepoint)
end
473
+
474
# Lex a member name that is found within dot notation.
#
# Recognize keywords and function names and give them the correct type,
# unless `ignore_keywords` is set, in which case the token is always an :identifier.
# @see https://www.rfc-editor.org/rfc/rfc9535.html#section-2.5.1.1-3
#
# @param ignore_keywords [Boolean]
# @return [Token]
# @raise [Lexer::Error] if a function name is followed by whitespace
def lex_member_name_shorthand(ignore_keywords: false)
  consume while name_char?(lookahead)

  name = source[lexeme_start_p..(next_p - 1)]
  kind = :identifier
  unless ignore_keywords
    if KEYWORD.include?(name)
      kind = name.to_sym
    elsif FUNCTIONS.include?(name)
      kind = :function
    end
  end

  if kind == :function && WHITESPACE.include?(lookahead)
    raise Error, "Function name \"#{name}\" must not be followed by whitespace"
  end

  Token.new(kind, name, name, current_location)
end
497
+
498
# @return [Boolean] true once the read pointer has moved past the last character
#   (the pointer is zero-based, so the last character sits at length - 1)
def source_completed?
  source.length <= next_p
end
501
+
502
# @return [Boolean] true while there is at least one unread character left
def source_uncompleted?
  next_p < source.length
end
505
+
506
# @return [Location] start offset and length of the lexeme currently being read
def current_location
  start = lexeme_start_p
  Location.new(start, next_p - start)
end
509
+
510
# @return [Location] a one-character location starting at the current read pointer;
#   used for the :eof token
def after_source_end_location
  position = next_p
  Location.new(position, 1)
end
513
+ end
514
+ end
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
# Position of a lexeme within the query: `col` is the start offset, `length` the number of characters.
Location = Struct.new(*%i[col length])