violet 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/violet +34 -0
- data/lib/violet.rb +22 -0
- data/lib/violet/lexer.rb +671 -0
- data/lib/violet/parser.rb +1440 -0
- data/lib/violet/token.rb +78 -0
- data/test/test_assertions.rb +550 -0
- data/test/test_violet.rb +36 -0
- metadata +90 -0
data/bin/violet
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/env ruby

module Violet
  # Load the library itself (relative to this executable) and the Pal gem,
  # which supplies the interactive-shell plumbing.
  require File.expand_path("../lib/violet", File.dirname(__FILE__))
  require "pal"

  # Internal: The evaluation context for the interactive shell. Public
  # methods defined here become the commands available at the prompt.
  class Command < Pal::Context
    # Public: Lexes a string of JavaScript source code.
    #
    # source   - The source `String`.
    # patterns - Boolean arguments, one per lexed token, stating whether the
    #   `/` and `/=` tokens may be interpreted as regular expressions
    #   (`true`) or as division operators (`false`).
    #
    # Returns an `Array` of `Token`s.
    def lex(source, *patterns)
      lexer = Lexer.new(source)
      lexer.tokens(*patterns)
    end

    # Public: Parses a string of JavaScript source code.
    #
    # source - The source `String`.
    #
    # Returns an `Array` of `Token`s.
    def parse(source)
      Parser.parse(source)
    end
  end

  # Start the read-eval-print loop with a fresh command context.
  Pal::REPL.new("violet", Command.new).loop
end
|
data/lib/violet.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-

module Violet
  # Public: Contains the version information.
  module Version
    # Public: The current version of Violet. The major, minor, and patch
    # components are exposed as individual constants and joined to form the
    # semantic version string.
    MAJOR, MINOR, PATCH = 0, 0, 1
    STRING = [MAJOR, MINOR, PATCH].join(".")
  end

  # Internal: A named `Error` class, used for reporting parse errors.
  Error = Class.new(StandardError)

  # Prepend the `lib` directory to the load path so that Violet can be
  # loaded without RubyGems. Modules and classes are loaded on demand.
  $:.unshift File.expand_path(File.dirname(__FILE__))

  autoload :Token,  "violet/token"
  autoload :Lexer,  "violet/lexer"
  autoload :Parser, "violet/parser"
end
|
data/lib/violet/lexer.rb
ADDED
@@ -0,0 +1,671 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-

module Violet
  # Internal: Records exceptions emitted by the lexer.
  LexerError = Class.new(Error)

  # Public: Lexes a JavaScript source string.
  class Lexer
    # Public: Matches line terminators: line feeds, carriage returns, line
    # separators, and paragraph separators. See section 7.3 of the ES 5.1 spec.
    LINE_TERMINATORS = /[\n\r\u2028\u2029]/

    # Public: Matches line separators, paragraph separators, and carriage
    # returns not followed by line feeds. Used to convert all line
    # terminators to line feeds. CRLF line endings are preserved.
    #
    # FIX: the previous pattern, /[\u2028\u2029]|(?:\r[^\n])/, consumed the
    # character FOLLOWING the `\r`, so `gsub(..., "\n")` deleted it, skipped a
    # trailing `\r`, and — because two characters collapsed into one — threw
    # every index into `@normalized_source` out of sync with `@source` (the
    # lexer addresses both strings with the same offsets). The negative
    # lookahead keeps the replacement strictly one-for-one.
    NORMALIZE_LINE_ENDINGS = /[\u2028\u2029]|\r(?!\n)/

    # Public: Matches Unicode letters, `$`, `_`, and Unicode escape sequences.
    # See section 7.6.
    IDENTIFIER_START = /[$_\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]/

    # Public: Matches identifier starting characters, Unicode combining marks,
    # Unicode digits, Unicode connector punctuators, zero-width non-joiners,
    # and zero-width joiners. See section 7.1.
    IDENTIFIER_FRAGMENT = Regexp.union(IDENTIFIER_START, /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}\u200c\u200d]/)

    # Public: Matches an ECMAScript token. This is a superset of the `Token`
    # production defined in section 7.5 of the spec. Every capture is
    # optional, so the pattern always matches at any scan position; `lex`
    # dispatches on whichever named capture succeeded.
    TOKEN = %r(
      # Whitespace characters: tab, vertical tab, form feed, space,
      # non-breaking space, byte-order mark, and other Unicode space
      # separators (Category Z). See section 7.2 of the ES spec.
      (?<whitespace>[\t\v\f\ufeff\uffff\p{Z}])?
      # Line terminators. See section 7.3.
      (?<line_terminator>#{LINE_TERMINATORS})?
      # Line and block comments. See section 7.4.
      (?<line_comment>//)?
      (?<block_comment>/\*)?
      # Single- and double-quoted string literals. See section 7.8.4.
      (?<single_quoted_string>')?
      (?<double_quoted_string>")?
      # Numeric literals. See section 7.8.3.
      (?<number>\.?[0-9])?
      # RegExp literals. See section 7.8.5. This capture may also match the
      # `DivPunctuator` production.
      (?:(?<pattern>/)[^=])?
      # Punctuators. See section 7.7.
      (?<punctuator>\>>>=|===|!==|>>>|<<=|>>=|<=|>=|==|!=|\+\+|--|<<|>>|&&|
        \|\||\+=|-=|\*=|%=|&=|\|=|\^=|/=|\{|\}|\(|\)|\[|\]|\.|;|,|<|>|\+|-|
        \*|%|\||&|\||\^|!|~|\?|:|=|/)?
    )x

    # Internal: The `true`, `false`, and `null` literals, as well as the
    # `undefined` value. The lexer marks these four values as primitives.
    LITERALS = %w( undefined null true false )

    # Internal: A `Hash` that contains the quote character, token kind, and
    # the unterminated-string and invalid-line-continuation error messages for
    # single- and double-quoted string tokens.
    STRINGS = %w( single ' double " ).each_slice(2).with_object({}) do |(kind, quote), value|
      value[kind.to_sym] = {
        :quote => quote,
        :kind => "#{kind}_quoted_string".to_sym,
        :unterminated_string_error => "Unterminated #{kind}-quoted string literal.",
        :invalid_continuation_error => "Unescaped line terminators are not permitted within #{kind}-quoted string literals."
      }
    end

    # Public: Gets the source string.
    attr_reader :source

    # Public: Gets the current line.
    attr_reader :line

    # Public: Gets the current column.
    attr_reader :column

    # Public: Creates a new `Lexer` with a source string.
    #
    # source - The source `String`.
    def initialize(source)
      @source = source
      # Replace all line terminators with a single line feed, but preserve
      # CRLF line endings. The normalized copy is the same length as the
      # source, so both strings can be addressed with the same offsets.
      @normalized_source = @source.gsub(NORMALIZE_LINE_ENDINGS, ?\n)
      reset!
    end

    # Public: Resets the lexer to its original position and clears the token
    # stream.
    def reset!
      @index = @line = @column = 0
      @terminated = false
      (@tokens ||= []).clear
    end

    # Public: Produces a complete token stream from the source. This method
    # resets the lexer prior to lexing the source string.
    #
    # patterns - Zero or more boolean arguments that correspond to each lexed
    #   token and specify if the `/` and `/=` tokens may be interpreted as
    #   regular expressions (`true`) or division operators (`false`). This
    #   flag only applies to division and regular expression tokens; setting
    #   it for other tokens has no effect.
    #
    # Returns the `Array` of lexed `Token`s.
    def tokens(*patterns)
      reset!
      index = -1
      # Lex tokens until the end-of-file mark is reached.
      loop { break unless lex patterns[index += 1] }
      @tokens
    end

    # Public: Inserts a new token into the token stream, before a reference
    # token. If the reference token is the end-of-file mark, the token is
    # appended instead.
    #
    # NOTE(review): for the non-EOF case this writes `original` into the slot
    # at `original[:index] + 1`, overwriting whatever token was there, rather
    # than shifting the remainder of the stream — confirm against callers
    # whether a true `Array#insert` (plus renumbering) was intended.
    #
    # token    - The `Token` to be inserted into the token stream.
    # original - The reference `Token` before which the new `Token` is
    #   inserted.
    #
    # Returns the new `Token`.
    def insert_before(token, original)
      if original[:name] == Token::Types[:eof]
        token[:index] = @tokens.size
        @tokens << token
      else
        token[:index] = original[:index]
        @tokens[token[:index]] = token
        original[:index] += 1
        @tokens[original[:index]] = original
      end
      token
    end

    # Internal: Returns the maximum number of characters, relative to the
    # current scan pointer, that may be parsed as valid identifier
    # characters. The scan pointer is not advanced.
    #
    # lex_as_fragment - A boolean that specifies whether the identifier may be
    #   lexed as a fragment. Certain productions allow identifier fragments,
    #   while others require that the identifier begin with a subset of valid
    #   fragment characters (default: false).
    def match_identifier?(lex_as_fragment = false)
      size = @index
      # NOTE(review): the guard tests `eof?` (which reads `@index`, not the
      # local `size`); termination past the end of the source relies on
      # `@source[size]` returning `nil`, which fails the `=~` tests below.
      until eof?
        # Unicode escape sequences may occur anywhere within an identifier.
        if /^\\u\h{4}$/ =~ @source[size, 6]
          # Advance the scan pointer past the Unicode escape sequence.
          size += 6
        else
          character = @source[size]
          if lex_as_fragment
            # Use the full `IdentifierPart` production.
            break unless character =~ IDENTIFIER_FRAGMENT
          else
            # The initial character must conform to the more restrictive
            # `IdentifierStart` production.
            break unless character =~ IDENTIFIER_START
            # All subsequent characters may be lexed as identifier fragments.
            lex_as_fragment = true
          end
          size += 1
        end
      end
      size - @index
    end

    # Internal: Returns the maximum number of characters, relative to the
    # current scan pointer, that may be parsed as valid decimal digits.
    # The scan pointer is not advanced.
    def match_decimal?
      size = @index
      size += 1 until eof? || @source[size] !~ /\d/
      size - @index
    end

    # Public: Returns `true` if the lexer has reached the end of the source
    # string.
    def eof?
      @terminated || @index >= @source.size
    end

    # Public: Lexes a token.
    #
    # pattern - If the token is `/` or `/=`, specifies whether it may be lexed
    #   as part of a regular expression. If `false`, the token will be lexed
    #   as a division operator instead (default: true).
    #
    # Returns the lexed `Token`, or `nil` if the lexer has finished scanning
    # the source.
    def lex(pattern = true)
      return if @terminated
      if eof?
        @terminated ||= true
        # NOTE(review): the EOF token is returned but never appended to
        # `@tokens` — `insert_before` nevertheless tests for it in the
        # stream; confirm which behavior the parser expects.
        token = Token.new(self, :eof, @source.size...@source.size)
        return token
      end
      token = TOKEN.match(@source, @index) do |match|
        case
        # Produces a whitespace, line terminator, line comment (`// ...`), or
        # block comment (`/* ... */`) token.
        when match[:whitespace] then lex_whitespace
        when match[:line_terminator] then lex_line_terminator
        when match[:line_comment] then lex_line_comment
        when match[:block_comment] then lex_block_comment
        # Produces a single- or double-quoted string token. A single method
        # is used to produce both kinds of tokens.
        when match[:single_quoted_string] then lex_string :single
        when match[:double_quoted_string] then lex_string :double
        # Produces a hexadecimal or decimal token. Octal numbers produce an
        # error, as they are prohibited in ES 5.
        when match[:number] then lex_number
        # `/` and `/=` may be interpreted as either regular expressions or
        # division operators. The `pattern` argument specifies whether these
        # tokens should be lexed as RegExps or punctuators.
        when pattern && match[:pattern] then lex_pattern
        else
          # The `<pattern>` capture may contain the `/` and `/=` tokens.
          if result = match[:pattern] || match[:punctuator]
            token = Token.new(self, :punctuator, @index...@index += result.size)
            @column += token.size
            token
          else
            # Lex the token as an identifier.
            lex_identifier
          end
        end
      end
      # Record the position of the token in the token stream.
      token[:index] = @tokens.size
      @tokens << token
      token
    end

    # Internal: Lexes a whitespace token at the current scan position.
    #
    # Returns the lexed `Token`.
    def lex_whitespace
      token = Token.new(self, :whitespace, @index...@index += 1)
      token[:isWhite] = true
      @column += 1
      token
    end

    # Internal: Lexes a line terminator at the current scan position: either a
    # line feed, carriage return, line separator, or paragraph separator. See
    # section 7.3 of the spec.
    #
    # Returns the lexed `Token`.
    def lex_line_terminator
      character = @source[@index]
      stop = @index + 1
      # If the current character is a carriage return and the next character
      # is a line feed, the source string contains CRLF line endings. The
      # `stop` position is advanced one additional character, so that "\r\n"
      # is treated as a single terminator.
      stop += 1 if character == ?\r && @source[stop] == ?\n
      # Advance the current index past the terminator.
      token = Token.new(self, :line_terminator, @index...@index = stop)
      token[:lines] = 1
      token[:isWhite] = true
      @line += 1
      @column = 0
      token
    end

    # Internal: Lexes a line comment at the current scan position.
    #
    # Returns the lexed `Token`.
    def lex_line_comment
      # NOTE(review): this stores an absolute source offset in `@column`, not
      # a column number — confirm whether downstream consumers rely on it.
      @column = @normalized_source.index(?\n, @index) || @source.length
      token = Token.new(self, :line_comment, @index...@index = @column)
      token[:isComment] = token[:isWhite] = true
      token
    end

    # Internal: Lexes a block comment at the current scan position.
    #
    # Returns the lexed `Token`.
    def lex_block_comment
      start = @index
      # Mark the ending position of the comment.
      stop = @source.index("*/", start)
      if stop
        # Advance the current position past the end of the comment.
        @index = stop + 2
        token = Token.new(self, :block_comment, start...@index)
        token[:isComment] = token[:isWhite] = true
        # Block comments trigger automatic semicolon insertion only if they
        # span multiple lines. The normalized source is used to quickly
        # detect line terminators.
        index = lines = 0
        # Count the line terminators inside the comment.
        lines += 1 while index = @normalized_source[start...@index].index(?\n, index + 1)
        if lines.zero?
          # For single-line block comments, increase the column by the size
          # of the token.
          @column += token[:size]
        else
          # For multiline block comments, record the number of lines
          # comprising the comment and reset the column.
          @line += token[:lines] = lines
          @column = 0
        end
      else
        # Unterminated block comment. If a line terminator is found, the
        # comment is assumed to end immediately before it. Otherwise, the
        # comment is assumed to end two characters after the current scan
        # position.
        stop = @normalized_source.index(?\n, @index)
        @index = stop || @index + 2
        token = Token.new(self, :error, start...@index)
        token[:error] = "Unterminated block comment."
        token[:isComment] = token[:isWhite] = token[:tokenError] = true
        @column += token[:size]
      end
      token
    end

    # Internal: Lexes a single- or double-quoted string primitive at the
    # current scan position.
    #
    # style - A `Symbol` that specifies the quoting style. The quoting style
    #   must be defined as a key in the `Lexer::STRINGS` hash.
    #
    # Returns the lexed `Token`.
    # Raises `KeyError` if the quoting style is not defined in the hash.
    def lex_string(style)
      style = STRINGS.fetch(style)
      start = @index
      lines = 0
      # FIX: `token` must be declared before the `loop` block; previously the
      # invalid-continuation error token was assigned to a block-local
      # variable, discarded, and silently replaced by a generic
      # "unterminated" error below.
      token = nil
      loop do
        # Parse escape sequences in strings.
        until eof? || @source[@index += 1] != ?\\
          # Record escaped line continuations. The normalized source is used
          # to avoid repeatedly normalizing line endings.
          # FIX: the previous `@line += (lines += 1)` added the running
          # continuation count each time, over-incrementing `@line`.
          if @normalized_source[@index + 1] == ?\n
            lines += 1
            @line += 1
          end
          # Advance past the escaped character.
          @index += 1
        end
        # An unescaped line terminator within a string is a syntax error.
        # Some environments permit unescaped new lines in strings; however,
        # the spec disallows them.
        if @source[@index] =~ LINE_TERMINATORS
          token = Token.new(self, :error, start...@index)
          token[:error] = style[:invalid_continuation_error]
          token[:isString] = token[:tokenError] = true
          break
        end
        # Consume characters until either the end of the source or the
        # end-of-string character is reached.
        break if eof? || @source[@index] == style[:quote]
      end
      if token
        # Keep the invalid-continuation error produced inside the loop.
        @column += token[:size]
      elsif @source[@index] == style[:quote]
        # Advance the index past the end-of-string character.
        @index += 1
        token = Token.new(self, style[:kind], start...@index)
        token[:isPrimitive] = token[:isString] = true
        # Update the line and column entries accordingly.
        if lines.zero?
          @column += token[:size]
        else
          token[:lines] = lines
          @column = 0
        end
      else
        # The end of the source was reached without consuming the
        # end-of-string character: an unterminated string literal.
        token = Token.new(self, :error, start...@index)
        token[:error] = style[:unterminated_string_error]
        token[:isString] = token[:tokenError] = true
        @column += token[:size]
      end
      token
    end

    # Internal: Lexes a decimal or hexadecimal numeric value. See section
    # 7.8.3.
    #
    # Returns the lexed `Token`.
    def lex_number
      start = @index
      @index += 1
      # If the token begins with `0x`, parse the remainder as a hexadecimal
      # value.
      if @source[start..@index] =~ /0[xX]/
        position = @index += 1
        # Consume characters until the end of the string or a non-hexdigit
        # character is encountered.
        @index += 1 until eof? || @source[@index] !~ /\h/
        # If no additional characters were consumed, the hex value is
        # invalid.
        if position == @index
          token = Token.new(self, :error, start...@index)
          token[:error] = "Invalid hexdigit value."
          token[:isNumber] = token[:tokenError] = true
        else
          # The value is syntactically sound.
          token = Token.new(self, :hexadecimal_number, start...@index)
          token[:isPrimitive] = token[:isNumber] = true
        end
      else
        # Determine if an octal escape sequence is being parsed (i.e., a
        # leading zero followed by a decimal digit).
        is_octal = @source[start..@index] =~ /0\d/
        # Parse the integral expression before the decimal point.
        unless @source[start] == ?.
          # Consume characters until the end of the string or a non-decimal
          # character is encountered.
          @index += match_decimal?
          # Advance past the decimal point.
          @index += 1 if @source[@index] == ?.
        end
        # Parse the decimal component.
        @index += match_decimal?
        # Parse the exponent.
        if @source[@index] =~ /[eE]/
          # Advance past the sign.
          @index += 1 if @source[@index += 1] =~ /[+-]/
          # Mark the current position and consume decimal digits past the
          # exponential.
          position = @index
          @index += match_decimal?
          # If no additional characters were consumed but an exponent was
          # lexed, the decimal value is invalid.
          if position == @index
            token = Token.new(self, :error, start...@index)
            token[:error] = "Exponents may not be empty."
            token[:tokenError] = true
          end
        end
        unless token
          # Octal literals are invalid in ES 5.
          if is_octal
            token = Token.new(self, :error, start...@index)
            token[:error] = "Invalid octal escape sequence."
            token[:isNumber] = token[:isOctal] = token[:tokenError] = true
          else
            # Syntactically valid decimal value. As with hexdigits, the
            # parser will determine if the lexed value is semantically sound.
            token = Token.new(self, :decimal_number, start...@index)
            token[:isPrimitive] = token[:isNumber] = true
          end
        end
      end
      @column += token[:size]
      token
    end

    # Internal: Lexes a regular expression literal. See section 7.8.5.
    #
    # Returns the lexed `Token`.
    def lex_pattern
      start = @index
      # Maintains a hash of the initial and terminal positions of balanced
      # regular expression characters: grouping parentheses, character class
      # brackets, and quantifier braces.
      balanced = {}
      # Ensures that all capturing groups in the pattern are balanced.
      groups = []
      # A flag that specifies if the regular expression is terminated.
      terminated = false
      # Only the last syntax error is preserved for improperly constructed
      # regular expressions.
      syntax_error = nil
      # FIX: `token` must be declared before the `loop` block; previously the
      # error tokens created inside the loop (line terminator, unbalanced
      # character class) were block-local and were clobbered by the token
      # construction that follows the loop — an erroneous RegExp could even
      # be emitted as a valid `:pattern` token.
      token = nil
      loop do
        @index += 1
        break if eof?
        # Use the normalized input to quickly detect line terminators.
        case character = @normalized_source[@index]
        when ?\n
          # Line terminators cannot occur within RegExp literals.
          token = Token.new(self, :error, start...@index)
          token[:error] = "Line terminators are not permitted within RegExp literals."
          token[:tokenError] = token[:errorHasContent] = true
          # Avoid emitting a second unterminated RegExp error once lexing is
          # complete.
          terminated = true
          break
        when ?/
          # An unescaped `/` marks the end of the regular expression.
          terminated = true
          break
        when /[?*+]/
          syntax_error = "`?`, `*`, and `+` require a value to repeat."
        when ?^
          # `^` may only occur immediately following `|`, or at the beginning
          # of either the pattern, a capturing group, or a lookahead
          # assertion (`?:`, `?=`, or `?!`). Note that `^` may also negate a
          # character class; however, character classes have different
          # semantics and are lexed separately.
          unless @source[@index - 1] =~ %r{[/|(]} || @source[@index - 3, 3] =~ /\(\?[:!=]/
            syntax_error = "`^` may not occur here."
          end
        when ?$
          # `$` may only occur immediately before `|`, or at the end of
          # either the pattern, a capturing group, or a lookahead assertion.
          unless @source[@index + 1] =~ %r{[/|)]}
            syntax_error = "`$` may not occur here."
          end
        when ?}
          # Both the Violet lexer and the ZeParser tokenizer assume that all
          # unescaped braces delimit quantifiers, and emit errors
          # accordingly.
          syntax_error = "Mismatched `}`."
        else
          # Lex capturing groups.
          if character == ?(
            # Mark the initial position of the capturing group.
            groups << @index - start
          elsif character == ?)
            if groups.empty?
              syntax_error = "Capturing group parentheses must be balanced."
            else
              # Record the initial and terminal positions of the parentheses
              # delimiting the group.
              terminal = @index - start
              balanced[initial = groups.pop] = terminal
              balanced[terminal] = initial
            end
          end

          # Character Classes.
          # ------------------
          if character == ?[
            # Record the initial position of the character class.
            initial = @index - start
            # Characters in character classes are treated literally; the
            # exceptions are line terminators and unescaped closing brackets,
            # which are not part of the `RegularExpressionClassChar` grammar.
            loop do
              @index += 1
              break if eof? || @normalized_source[@index] == ?\n || @source[@index] == ?]
              if @source[@index] == ?\\
                if @normalized_source[@index + 1] == ?\n
                  # Abort lexing if a line terminator is encountered.
                  break
                else
                  # Skip lexing the subsequent escaped character. This
                  # ensures that escaped closing brackets (`\]`) are lexed
                  # correctly.
                  @index += 1
                end
              end
            end
            if @source[@index] == ?]
              # Record the initial and terminal positions of the brackets
              # delimiting the class.
              terminal = @index - start
              balanced[initial] = terminal
              balanced[terminal] = initial
            else
              token = Token.new(self, :error, start...@index)
              token[:error] = "Character class brackets must be balanced."
              token[:tokenError] = true
              # Avoid emitting an unterminated RegExp error once lexing is
              # complete.
              terminated = true
              break
            end
          # Lex escaped characters. Escape sequences may occur anywhere
          # within the RegExp, and indicate that the following character
          # should be interpreted literally.
          elsif character == ?\\ && @normalized_source[@index + 1] != ?\n
            @index += 1
          end

          # Lookahead Assertions and Quantifiers.
          # -------------------------------------
          if character == ?(
            # Lex a non-capturing group, positive lookahead, or negative
            # lookahead.
            @index += 2 if @source[@index + 1, 2] =~ /\?[:=!]/
          else
            # Lex quantifiers.
            case @source[@index + 1]
            when ??
              # The `?` quantifier matches the preceding character zero or
              # one times.
              @index += 1
            when /[*+]/
              # The `*` quantifier matches the preceding character zero or
              # more times; `+` matches a character one or more times. `*?`
              # and `+?` indicate a non-greedy match.
              @index += 1 if @source[@index += 1] == ??
            when ?{
              # Advance one character and mark the initial position of the
              # quantifier.
              @index += 1
              initial = @index - start
              # The `{n}` quantifier matches the preceding character exactly
              # `n` times. `{n,}` matches at least `n` occurrences. `{n,m}`
              # matches at least `n` and at most `m` occurrences.
              unless @source[@index += 1] =~ /\d/
                syntax_error = "Quantifier curly requires at least one digit before the comma"
              end
              # Lex the `n` value.
              @index += match_decimal?
              # Lex the `m` value, if any, if a comma is specified.
              @index += match_decimal? if @source[@index += 1] == ?,
              # Quantifier braces must be balanced.
              if @source[@index + 1] == ?}
                @index += 1
                terminal = @index - start
                balanced[initial] = terminal
                balanced[terminal] = initial
                # A trailing `?` indicates a non-greedy match.
                @index += 1 if @source[@index + 1] == ??
              else
                syntax_error = "Quantifier curly requires to be closed"
              end
            end
          end
        end
      end

      # Construct the token.
      # --------------------
      # An error token produced inside the scan loop takes precedence and is
      # returned as-is.
      if token.nil?
        if terminated
          # Advance one character and lex the regular expression flags, if
          # any, as an identifier fragment (the grammar for
          # `RegularExpressionFlags` is that of `IdentifierPart`).
          @index += 1
          @index += match_identifier? :fragment
          if !groups.empty?
            # If the `groups` list is not empty, at least one set of
            # capturing group parentheses was not balanced.
            token = Token.new(self, :error, start...@index)
            token[:tokenError] = true
            token[:error] = "Mismatched `(` or `)`."
          elsif syntax_error
            # Report the last recorded syntax error.
            token = Token.new(self, :error, start...@index)
            token[:tokenError] = token[:errorHasContent] = true
            token[:error] = syntax_error
          else
            token = Token.new(self, :pattern, start...@index)
            token[:isPrimitive] = true
            token[:pairs] = balanced
          end
        else
          token = Token.new(self, :error, start...@index)
          token[:error] = "Unterminated RegExp literal."
          token[:tokenError] = true
        end
      end
      @column += @index - start
      token
    end

    # Internal: Lexes an identifier. See sections 7.1 and 7.6.
    #
    # Returns the lexed `Token`.
    def lex_identifier
      size = match_identifier?
      if size.zero?
        character = @source[@index]
        token = Token.new(self, :error, @index...@index += 1)
        token[:tokenError] = true
        token[:error] = if character == ?\\
          @source[@index] == ?u ? "Invalid Unicode escape sequence." : "Illegal escape sequence."
        else
          "Invalid token."
        end
      else
        token = Token.new(self, :identifier, @index...@index += size)
        # Mark the token as a primitive if it is in the `Lexer::LITERALS`
        # array.
        token[:isPrimitive] = LITERALS.include? token[:value]
      end
      @column += token[:size]
      token
    end
  end
end
|