graphql 2.1.12 → 2.2.0

--- a/lib/graphql/language/lexer.rb
+++ b/lib/graphql/language/lexer.rb
@@ -1,39 +1,228 @@
 # frozen_string_literal: true
-
-require "strscan"
-
 module GraphQL
   module Language
+
     class Lexer
-      IDENTIFIER = /[_A-Za-z][_0-9A-Za-z]*/
-      NEWLINE = /[\c\r\n]/
-      BLANK = /[, \t]+/
-      COMMENT = /#[^\n\r]*/
-      INT = /[-]?(?:[0]|[1-9][0-9]*)/
-      FLOAT_DECIMAL = /[.][0-9]+/
-      FLOAT_EXP = /[eE][+-]?[0-9]+/
-      FLOAT = /#{INT}(#{FLOAT_DECIMAL}#{FLOAT_EXP}|#{FLOAT_DECIMAL}|#{FLOAT_EXP})/
-
-      module Literals
-        ON = /on\b/
-        FRAGMENT = /fragment\b/
-        TRUE = /true\b/
-        FALSE = /false\b/
-        NULL = /null\b/
-        QUERY = /query\b/
-        MUTATION = /mutation\b/
-        SUBSCRIPTION = /subscription\b/
-        SCHEMA = /schema\b/
-        SCALAR = /scalar\b/
-        TYPE = /type\b/
-        EXTEND = /extend\b/
-        IMPLEMENTS = /implements\b/
-        INTERFACE = /interface\b/
-        UNION = /union\b/
-        ENUM = /enum\b/
-        INPUT = /input\b/
-        DIRECTIVE = /directive\b/
-        REPEATABLE = /repeatable\b/
+      def initialize(graphql_str, filename: nil)
+        if !(graphql_str.encoding == Encoding::UTF_8 || graphql_str.ascii_only?)
+          graphql_str = graphql_str.dup.force_encoding(Encoding::UTF_8)
+        end
+        @string = graphql_str
+        @filename = filename
+        @scanner = StringScanner.new(graphql_str)
+        @pos = nil
+      end
+
+      def eos?
+        @scanner.eos?
+      end
+
+      attr_reader :pos
+
+      def advance
+        @scanner.skip(IGNORE_REGEXP)
+        return false if @scanner.eos?
+        @pos = @scanner.pos
+        next_byte = @string.getbyte(@pos)
+        next_byte_is_for = FIRST_BYTES[next_byte]
+        case next_byte_is_for
+        when ByteFor::PUNCTUATION
+          @scanner.pos += 1
+          PUNCTUATION_NAME_FOR_BYTE[next_byte]
+        when ByteFor::NAME
+          if len = @scanner.skip(KEYWORD_REGEXP)
+            case len
+            when 2
+              :ON
+            when 12
+              :SUBSCRIPTION
+            else
+              pos = @pos
+
+              # Use bytes 2 and 3 as a unique identifier for this keyword
+              bytes = (@string.getbyte(pos + 2) << 8) | @string.getbyte(pos + 1)
+              KEYWORD_BY_TWO_BYTES[_hash(bytes)]
+            end
+          else
+            @scanner.skip(IDENTIFIER_REGEXP)
+            :IDENTIFIER
+          end
+        when ByteFor::IDENTIFIER
+          @scanner.skip(IDENTIFIER_REGEXP)
+          :IDENTIFIER
+        when ByteFor::NUMBER
+          @scanner.skip(NUMERIC_REGEXP)
+          # Check for a matched decimal:
+          @scanner[1] ? :FLOAT : :INT
+        when ByteFor::ELLIPSIS
+          if @string.getbyte(@pos + 1) != 46 || @string.getbyte(@pos + 2) != 46
+            raise_parse_error("Expected `...`, actual: #{@string[@pos..@pos + 2].inspect}")
+          end
+          @scanner.pos += 3
+          :ELLIPSIS
+        when ByteFor::STRING
+          if @scanner.skip(BLOCK_STRING_REGEXP) || @scanner.skip(QUOTED_STRING_REGEXP)
+            :STRING
+          else
+            raise_parse_error("Expected string or block string, but it was malformed")
+          end
+        else
+          @scanner.pos += 1
+          :UNKNOWN_CHAR
+        end
+      rescue ArgumentError => err
+        if err.message == "invalid byte sequence in UTF-8"
+          raise_parse_error("Parse error on bad Unicode escape sequence", nil, nil)
+        end
+      end
+
+      def token_value
+        @string.byteslice(@scanner.pos - @scanner.matched_size, @scanner.matched_size)
+      rescue StandardError => err
+        raise GraphQL::Error, "(token_value failed: #{err.class}: #{err.message})"
+      end
+
+      def debug_token_value(token_name)
+        if token_name && Lexer::Punctuation.const_defined?(token_name)
+          Lexer::Punctuation.const_get(token_name)
+        elsif token_name == :ELLIPSIS
+          "..."
+        elsif token_name == :STRING
+          string_value
+        else
+          token_value
+        end
+      end
+
+      ESCAPES = /\\["\\\/bfnrt]/
+      ESCAPES_REPLACE = {
+        '\\"' => '"',
+        "\\\\" => "\\",
+        "\\/" => '/',
+        "\\b" => "\b",
+        "\\f" => "\f",
+        "\\n" => "\n",
+        "\\r" => "\r",
+        "\\t" => "\t",
+      }
+      UTF_8 = /\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i
+      VALID_STRING = /\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o
+
+      def string_value
+        str = token_value
+        is_block = str.start_with?('"""')
+        if is_block
+          str.gsub!(/\A"""|"""\z/, '')
+        else
+          str.gsub!(/\A"|"\z/, '')
+        end
+
+        if is_block
+          str = Language::BlockString.trim_whitespace(str)
+        end
+
+        if !str.valid_encoding? || !str.match?(VALID_STRING)
+          raise_parse_error("Bad unicode escape in #{str.inspect}")
+        else
+          Lexer.replace_escaped_characters_in_place(str)
+
+          if !str.valid_encoding?
+            raise_parse_error("Bad unicode escape in #{str.inspect}")
+          else
+            str
+          end
+        end
+      end
+
+      def line_number
+        @scanner.string[0..@pos].count("\n") + 1
+      end
+
+      def column_number
+        @scanner.string[0..@pos].split("\n").last.length
+      end
+
+      def raise_parse_error(message, line = line_number, col = column_number)
+        raise GraphQL::ParseError.new(message, line, col, @string, filename: @filename)
+      end
+
+      IGNORE_REGEXP = %r{
+        (?:
+          [, \c\r\n\t]+ |
+          \#.*$
+        )*
+      }x
+      IDENTIFIER_REGEXP = /[_A-Za-z][_0-9A-Za-z]*/
+      INT_REGEXP = /-?(?:[0]|[1-9][0-9]*)/
+      FLOAT_DECIMAL_REGEXP = /[.][0-9]+/
+      FLOAT_EXP_REGEXP = /[eE][+-]?[0-9]+/
+      NUMERIC_REGEXP = /#{INT_REGEXP}(#{FLOAT_DECIMAL_REGEXP}#{FLOAT_EXP_REGEXP}|#{FLOAT_DECIMAL_REGEXP}|#{FLOAT_EXP_REGEXP})?/
+
+      KEYWORDS = [
+        "on",
+        "fragment",
+        "true",
+        "false",
+        "null",
+        "query",
+        "mutation",
+        "subscription",
+        "schema",
+        "scalar",
+        "type",
+        "extend",
+        "implements",
+        "interface",
+        "union",
+        "enum",
+        "input",
+        "directive",
+        "repeatable"
+      ].freeze
+
+      KEYWORD_REGEXP = /#{Regexp.union(KEYWORDS.sort)}\b/
+      KEYWORD_BY_TWO_BYTES = [
+        :INTERFACE,
+        :MUTATION,
+        :EXTEND,
+        :FALSE,
+        :ENUM,
+        :TRUE,
+        :NULL,
+        nil,
+        nil,
+        nil,
+        nil,
+        nil,
+        nil,
+        nil,
+        :QUERY,
+        nil,
+        nil,
+        :REPEATABLE,
+        :IMPLEMENTS,
+        :INPUT,
+        :TYPE,
+        :SCHEMA,
+        nil,
+        nil,
+        nil,
+        :DIRECTIVE,
+        :UNION,
+        nil,
+        nil,
+        :SCALAR,
+        nil,
+        :FRAGMENT
+      ]
+
+      # This produces a unique integer for bytes 2 and 3 of each keyword string
+      # See https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html
+      def _hash key
+        (key * 18592990) >> 27 & 0x1f
+      end
+
+      module Punctuation
         LCURLY = '{'
         RCURLY = '}'
         LPAREN = '('
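
Note on the hunk above: `_hash` is a perfect-hash keyword lookup in the style of the linked tenderlovemaking.com post. Bytes 2 and 3 of a matched keyword are packed into one integer, multiplied by a magic constant, and reduced to a 5-bit index into the 32-entry KEYWORD_BY_TWO_BYTES table. A standalone sketch of why that works (not part of the gem's API; the magic constant is copied from the diff):

    def two_byte_hash(key)
      (key * 18592990) >> 27 & 0x1f
    end

    %w[query mutation fragment interface].each do |kw|
      packed = (kw.getbyte(2) << 8) | kw.getbyte(1)
      puts format("%-10s -> slot %2d", kw, two_byte_hash(packed))
    end
    # query      -> slot 14   (KEYWORD_BY_TWO_BYTES[14] == :QUERY)
    # mutation   -> slot  1   (:MUTATION)
    # fragment   -> slot 31   (:FRAGMENT)
    # interface  -> slot  0   (:INTERFACE)

"on" and "subscription" never reach this lookup: `advance` dispatches them by match length first, which also keeps "on" (which has no byte at index 2 to pack) out of the hash.
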
@@ -43,36 +232,31 @@ module GraphQL
         COLON = ':'
         VAR_SIGN = '$'
         DIR_SIGN = '@'
-        ELLIPSIS = '...'
         EQUALS = '='
         BANG = '!'
         PIPE = '|'
         AMP = '&'
       end
 
-      include Literals
+      # A sparse array mapping the bytes for each punctuation
+      # to a symbol name for that punctuation
+      PUNCTUATION_NAME_FOR_BYTE = Punctuation.constants.each_with_object([]) { |name, arr|
+        punct = Punctuation.const_get(name)
+        arr[punct.ord] = name
+      }
 
       QUOTE = '"'
       UNICODE_DIGIT = /[0-9A-Za-z]/
       FOUR_DIGIT_UNICODE = /#{UNICODE_DIGIT}{4}/
-      N_DIGIT_UNICODE = %r{#{LCURLY}#{UNICODE_DIGIT}{4,}#{RCURLY}}x
+      N_DIGIT_UNICODE = %r{#{Punctuation::LCURLY}#{UNICODE_DIGIT}{4,}#{Punctuation::RCURLY}}x
       UNICODE_ESCAPE = %r{\\u(?:#{FOUR_DIGIT_UNICODE}|#{N_DIGIT_UNICODE})}
-      # # https://graphql.github.io/graphql-spec/June2018/#sec-String-Value
+      # # https://graphql.github.io/graphql-spec/June2018/#sec-String-Value
       STRING_ESCAPE = %r{[\\][\\/bfnrt]}
       BLOCK_QUOTE = '"""'
       ESCAPED_QUOTE = /\\"/;
       STRING_CHAR = /#{ESCAPED_QUOTE}|[^"\\]|#{UNICODE_ESCAPE}|#{STRING_ESCAPE}/
-
-      LIT_NAME_LUT = Literals.constants.each_with_object({}) { |n, o|
-        key = Literals.const_get(n)
-        key = key.is_a?(Regexp) ? key.source.gsub(/(\\b|\\)/, '') : key
-        o[key] = n
-      }
-
-      LIT = Regexp.union(Literals.constants.map { |n| Literals.const_get(n) })
-
-      QUOTED_STRING = %r{#{QUOTE} (?:#{STRING_CHAR})* #{QUOTE}}x
-      BLOCK_STRING = %r{
+      QUOTED_STRING_REGEXP = %r{#{QUOTE} (?:#{STRING_CHAR})* #{QUOTE}}x
+      BLOCK_STRING_REGEXP = %r{
         #{BLOCK_QUOTE}
         (?: [^"\\] | # Any characters that aren't a quote or slash
            (?<!") ["]{1,2} (?!") | # Any quotes that don't have quotes next to them
@@ -84,85 +268,33 @@ module GraphQL
         #{BLOCK_QUOTE}
       }xm
 
-      # # catch-all for anything else. must be at the bottom for precedence.
-      UNKNOWN_CHAR = /./
-
-      def initialize(value)
-        @line = 1
-        @col = 1
-        @previous_token = nil
-
-        @scan = scanner value
+      # Use this array to check, for a given byte that will start a token,
+      # what kind of token might it start?
+      FIRST_BYTES = Array.new(255)
+
+      module ByteFor
+        NUMBER = 0 # int or float
+        NAME = 1 # identifier or keyword
+        STRING = 2
+        ELLIPSIS = 3
+        IDENTIFIER = 4 # identifier, *not* a keyword
+        PUNCTUATION = 5
       end
 
-      class BadEncoding < Lexer # :nodoc:
-        def scanner(value)
-          [emit(:BAD_UNICODE_ESCAPE, 0, 0, value)]
-        end
-
-        def next_token
-          @scan.pop
-        end
+      (0..9).each { |i| FIRST_BYTES[i.to_s.ord] = ByteFor::NUMBER }
+      FIRST_BYTES["-".ord] = ByteFor::NUMBER
+      # Some of these may be overwritten below, if keywords start with the same character
+      ("A".."Z").each { |char| FIRST_BYTES[char.ord] = ByteFor::IDENTIFIER }
+      ("a".."z").each { |char| FIRST_BYTES[char.ord] = ByteFor::IDENTIFIER }
+      FIRST_BYTES['_'.ord] = ByteFor::IDENTIFIER
+      FIRST_BYTES['.'.ord] = ByteFor::ELLIPSIS
+      FIRST_BYTES['"'.ord] = ByteFor::STRING
+      KEYWORDS.each { |kw| FIRST_BYTES[kw.getbyte(0)] = ByteFor::NAME }
+      Punctuation.constants.each do |punct_name|
+        punct = Punctuation.const_get(punct_name)
+        FIRST_BYTES[punct.ord] = ByteFor::PUNCTUATION
       end
 
-      def self.tokenize(string)
-        value = string.dup.force_encoding(Encoding::UTF_8)
-
-        scanner = if value.valid_encoding?
-          new value
-        else
-          BadEncoding.new value
-        end
-
-        toks = []
-
-        while tok = scanner.next_token
-          toks << tok
-        end
-
-        toks
-      end
-
-      def next_token
-        return if @scan.eos?
-
-        pos = @scan.pos
-
-        case
-        when str = @scan.scan(FLOAT) then emit(:FLOAT, pos, @scan.pos, str)
-        when str = @scan.scan(INT) then emit(:INT, pos, @scan.pos, str)
-        when str = @scan.scan(LIT) then emit(LIT_NAME_LUT[str], pos, @scan.pos, -str)
-        when str = @scan.scan(IDENTIFIER) then emit(:IDENTIFIER, pos, @scan.pos, str)
-        when str = @scan.scan(BLOCK_STRING) then emit_block(pos, @scan.pos, str.gsub(/\A#{BLOCK_QUOTE}|#{BLOCK_QUOTE}\z/, ''))
-        when str = @scan.scan(QUOTED_STRING) then emit_string(pos, @scan.pos, str.gsub(/^"|"$/, ''))
-        when str = @scan.scan(COMMENT) then record_comment(pos, @scan.pos, str)
-        when str = @scan.scan(NEWLINE)
-          @line += 1
-          @col = 1
-          next_token
-        when @scan.scan(BLANK)
-          @col += @scan.pos - pos
-          next_token
-        when str = @scan.scan(UNKNOWN_CHAR) then emit(:UNKNOWN_CHAR, pos, @scan.pos, str)
-        else
-          # This should never happen since `UNKNOWN_CHAR` ensures we make progress
-          raise "Unknown string?"
-        end
-      end
-
-      def emit(token_name, ts, te, token_value)
-        token = [
-          token_name,
-          @line,
-          @col,
-          token_value,
-          @previous_token,
-        ]
-        @previous_token = token
-        # Bump the column counter for the next token
-        @col += te - ts
-        token
-      end
 
       # Replace any escaped unicode or whitespace with the _actual_ characters
       # To avoid allocating more strings, this modifies the string passed into it
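
Note on FIRST_BYTES above: dispatch becomes a single array read on the first byte of the upcoming token, and keyword first-bytes overwrite the generic identifier entries so `advance` knows to try KEYWORD_REGEXP before falling back to IDENTIFIER_REGEXP. A standalone sketch of the same idea with a simplified table (demo code, not the gem's API):

    require "strscan"

    FIRST = Array.new(256)
    ("0".."9").each { |d| FIRST[d.ord] = :number }
    FIRST["-".ord] = :number
    ("a".."z").each { |c| FIRST[c.ord] = :name }
    ("A".."Z").each { |c| FIRST[c.ord] = :name }
    FIRST['"'.ord] = :string
    ["{", "}", ":"].each { |p| FIRST[p.ord] = :punctuation }

    scanner = StringScanner.new('{ answer: 42 }')
    until scanner.eos?
      scanner.skip(/[,\s]+/)                    # skip ignored chars, like IGNORE_REGEXP
      break if scanner.eos?
      kind = FIRST[scanner.peek(1).ord] || :unknown
      value = scanner.scan(/[_0-9A-Za-z-]+|./)  # crude token grab, demo only
      puts "#{kind}: #{value.inspect}"
    end
    # punctuation: "{"
    # name: "answer"
    # punctuation: ":"
    # number: "42"
    # punctuation: "}"
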
@@ -190,63 +322,25 @@ module GraphQL
         nil
       end
 
-      def record_comment(ts, te, str)
-        token = [
-          :COMMENT,
-          @line,
-          @col,
-          str,
-          @previous_token,
-        ]
-
-        @previous_token = token
-
-        @col += te - ts
-        next_token
-      end
-
-      ESCAPES = /\\["\\\/bfnrt]/
-      ESCAPES_REPLACE = {
-        '\\"' => '"',
-        "\\\\" => "\\",
-        "\\/" => '/',
-        "\\b" => "\b",
-        "\\f" => "\f",
-        "\\n" => "\n",
-        "\\r" => "\r",
-        "\\t" => "\t",
-      }
-      UTF_8 = /\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i
-      VALID_STRING = /\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o
-
-      def emit_block(ts, te, value)
-        line_incr = value.count("\n")
-        value = GraphQL::Language::BlockString.trim_whitespace(value)
-        tok = emit_string(ts, te, value)
-        @line += line_incr
-        tok
-      end
-
-      def emit_string(ts, te, value)
-        if !value.valid_encoding? || !value.match?(VALID_STRING)
-          emit(:BAD_UNICODE_ESCAPE, ts, te, value)
-        else
-          self.class.replace_escaped_characters_in_place(value)
-
-          if !value.valid_encoding?
-            emit(:BAD_UNICODE_ESCAPE, ts, te, value)
-          else
-            emit(:STRING, ts, te, value)
-          end
+      # This is not used during parsing because the parser
+      # doesn't actually need tokens.
+      def self.tokenize(string)
+        lexer = GraphQL::Language::Lexer.new(string)
+        tokens = []
+        prev_token = nil
+        while (token_name = lexer.advance)
+          new_token = [
+            token_name,
+            lexer.line_number,
+            lexer.column_number,
+            lexer.debug_token_value(token_name),
+            prev_token,
+          ]
+          tokens << new_token
+          prev_token = new_token
         end
+        tokens
       end
-
-      private
-
-      def scanner(value)
-        StringScanner.new value
-      end
-
     end
   end
 end
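
Note on `self.tokenize` above: it survives as a debugging aid; each token is a 5-element array of [name, line, column, value, previous_token]. A quick smoke test, assuming the graphql gem at 2.2.0 is installed (expected output worked out from the methods in this diff):

    require "graphql"

    GraphQL::Language::Lexer.tokenize("query { viewer }").each do |tok|
      p tok.first(4) # drop the previous-token link for readability
    end
    # [:QUERY, 1, 1, "query"]
    # [:LCURLY, 1, 7, "{"]
    # [:IDENTIFIER, 1, 9, "viewer"]
    # [:RCURLY, 1, 16, "}"]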