violet 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/violet +34 -0
- data/lib/violet.rb +22 -0
- data/lib/violet/lexer.rb +671 -0
- data/lib/violet/parser.rb +1440 -0
- data/lib/violet/token.rb +78 -0
- data/test/test_assertions.rb +550 -0
- data/test/test_violet.rb +36 -0
- metadata +90 -0
data/bin/violet
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/env ruby

module Violet
  # Load the library relative to this executable so the binary works from a
  # source checkout without RubyGems, then pull in the `pal` REPL gem.
  require File.expand_path("../lib/violet", File.dirname(__FILE__))
  require "pal"

  # Internal: Defines the commands and variables available to the interactive
  # shell.
  class Command < Pal::Context
    # Public: Lexes a string of JavaScript source code.
    #
    # source   - The source `String`.
    # patterns - Boolean arguments that correspond to each lexed token and
    #   specify if the `/` and `/=` tokens may be interpreted as regular
    #   expressions (`true`) or division operators (`false`).
    #
    # Returns an `Array` of `Token`s.
    def lex(source, *patterns)
      Lexer.new(source).tokens(*patterns)
    end

    # Public: Parses a string of JavaScript source code.
    #
    # source - The source `String`.
    #
    # Returns the result of `Parser.parse`. NOTE(review): the original comment
    # claimed an `Array` of `Token`s, copied from `#lex`; `Parser.parse` is not
    # visible here — confirm its actual return value.
    def parse(source)
      Parser.parse(source)
    end
  end

  # Start the interactive shell; blocks until the user exits.
  Pal::REPL.new("violet", Command.new).loop
end
|
data/lib/violet.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-

module Violet
  # Public: Contains the version information.
  module Version
    # Public: The current version of Violet. The major, minor, and patch
    # versions are exposed as individual constants, and comprise the
    # semantic version string.
    MAJOR = 0
    MINOR = 0
    PATCH = 1
    STRING = [MAJOR, MINOR, PATCH].join(".")
  end

  # Internal: A named `Error` class, used for reporting parse errors.
  Error = Class.new(StandardError)

  # Prepend the `lib` directory to the load path to facilitate loading Violet
  # without RubyGems. Modules and classes will be loaded as needed.
  $:.unshift File.expand_path(File.dirname(__FILE__))

  autoload :Token, "violet/token"
  autoload :Lexer, "violet/lexer"
  autoload :Parser, "violet/parser"
end
|
data/lib/violet/lexer.rb
ADDED
@@ -0,0 +1,671 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-

module Violet
  # Internal: Records exceptions emitted by the lexer. Subclasses the
  # library-wide `Violet::Error` defined in `lib/violet.rb`.
  LexerError = Class.new(Error)

  # Public: Lexes a JavaScript source string.
  class Lexer
    # Public: Matches line terminators: line feeds, carriage returns, line
    # separators, and paragraph separators. See section 7.3 of the ES 5.1 spec.
    LINE_TERMINATORS = /[\n\r\u2028\u2029]/
|
12
|
+
|
13
|
+
# Public: Matches line separators, paragraph separators, and carriage
|
14
|
+
# returns not followed by line separators. Used to convert all line
|
15
|
+
# terminators to line feeds. CRLF line endings are preserved.
|
16
|
+
NORMALIZE_LINE_ENDINGS = /[\u2028\u2029]|(?:\r[^\n])/
|
17
|
+
|
18
|
+
# Public: Matches Unicode letters, `$`, `_`, and Unicode escape sequences.
|
19
|
+
# See section 7.6.
|
20
|
+
IDENTIFIER_START = /[$_\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]/
|
21
|
+
|
22
|
+
    # Public: Matches any character valid *inside* an identifier: identifier
    # starting characters, Unicode combining marks (Mn, Mc), Unicode digits
    # (Nd), Unicode connector punctuators (Pc), zero-width non-joiners
    # (U+200C), and zero-width joiners (U+200D). This is the `IdentifierPart`
    # production of section 7.6.
    IDENTIFIER_FRAGMENT = Regexp.union(IDENTIFIER_START, /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}\u200c\u200d]/)
|
26
|
+
|
27
|
+
    # Public: Matches an ECMAScript token. This is a superset of the `Token`
    # production defined in section 7.5 of the spec. Every capture is
    # optional, so the pattern always matches; `#lex` dispatches on whichever
    # named capture is non-nil.
    TOKEN = %r(
      # Whitespace characters: tab, vertical tab, form feed, space,
      # non-breaking space, byte-order mark, and other Unicode space
      # separators (category Z, matched by \p{Z}). See section 7.2.
      # NOTE(review): \uffff is not ES whitespace — confirm it is intended
      # here as a sentinel.
      (?<whitespace>[\t\v\f\ufeff\uffff\p{Z}])?
      # Line terminators. See section 7.3.
      (?<line_terminator>#{LINE_TERMINATORS})?
      # Line and block comments. See section 7.4.
      (?<line_comment>//)?
      (?<block_comment>/\*)?
      # Single- and double-quoted string literals. See section 7.8.4.
      (?<single_quoted_string>')?
      (?<double_quoted_string>")?
      # Numeric literals (leading digit or `.digit`). See section 7.8.3.
      (?<number>\.?[0-9])?
      # RegExp literals. See section 7.8.5. This capture may also match the
      # `DivPunctuator` production; `#lex` decides which via its `pattern`
      # argument. The `[^=]` excludes `/=` from this capture.
      (?:(?<pattern>/)[^=])?
      # Punctuators, longest first. See section 7.7. NOTE(review): `\|`
      # appears twice among the single-character alternatives — harmless
      # duplication.
      (?<punctuator>\>>>=|===|!==|>>>|<<=|>>=|<=|>=|==|!=|\+\+|--|<<|>>|&&|
      \|\||\+=|-=|\*=|%=|&=|\|=|\^=|/=|\{|\}|\(|\)|\[|\]|\.|;|,|<|>|\+|-|
      \*|%|\||&|\||\^|!|~|\?|:|=|/)?
    )x
|
53
|
+
|
54
|
+
# Internal: The `true`, `false`, and `null` literals, as well as the
|
55
|
+
# `undefined` value. The lexer marks these four values as primitives.
|
56
|
+
LITERALS = %w( undefined null true false )
|
57
|
+
|
58
|
+
# Internal: A `Hash` that contains the quote character, token kind, and the
|
59
|
+
# unterminated string and invalid line continuation error messages for
|
60
|
+
# single- and double-quoted string tokens.
|
61
|
+
STRINGS = %w( single ' double " ).each_slice(2).with_object({}) do |(kind, quote), value|
|
62
|
+
value[kind.to_sym] = {
|
63
|
+
:quote => quote,
|
64
|
+
:kind => "#{kind}_quoted_string".to_sym,
|
65
|
+
:unterminated_string_error => "Unterminated #{kind}-quoted string literal.",
|
66
|
+
:invalid_continuation_error => "Unescaped line terminators are not permitted within #{kind}-quoted string literals."
|
67
|
+
}
|
68
|
+
end
|
69
|
+
|
70
|
+
    # Public: Gets the source string.
    attr_reader :source

    # Public: Gets the current line (zero-based; advanced as line terminators
    # are lexed).
    attr_reader :line

    # Public: Gets the current column (zero-based; reset to 0 after each line
    # terminator).
    attr_reader :column
|
78
|
+
|
79
|
+
# Public: Creates a new `Lexer` with a source string.
|
80
|
+
#
|
81
|
+
# source - The source `String`.
|
82
|
+
def initialize(source)
|
83
|
+
@source = source
|
84
|
+
# Replace all line terminators with a single line feed, but preserve CRLF
|
85
|
+
# line endings.
|
86
|
+
@normalized_source = @source.gsub(NORMALIZE_LINE_ENDINGS, ?\n)
|
87
|
+
reset!
|
88
|
+
end
|
89
|
+
|
90
|
+
# Public: Resets the lexer to its original position and clears the token
|
91
|
+
# stream.
|
92
|
+
def reset!
|
93
|
+
@index = @line = @column = 0
|
94
|
+
@terminated = false
|
95
|
+
(@tokens ||= []).clear
|
96
|
+
end
|
97
|
+
|
98
|
+
# Public: Produces a complete token stream from the source. This method
|
99
|
+
# resets the lexer prior to lexing the source string.
|
100
|
+
#
|
101
|
+
# patterns - Zero or more boolean arguments that correspond to each lexed
|
102
|
+
# token and specify if the `/` and `/=` tokens may be interpreted as
|
103
|
+
# regular expressions (`true`) or division operators (`false`). This
|
104
|
+
# flag only applies to division and regular expression tokens; setting
|
105
|
+
# it for other tokens has no effect.
|
106
|
+
def tokens(*patterns)
|
107
|
+
reset!
|
108
|
+
index = -1
|
109
|
+
# Lex tokens until the end-of-file mark is reached.
|
110
|
+
loop { break unless lex patterns[index += 1] }
|
111
|
+
@tokens
|
112
|
+
end
|
113
|
+
|
114
|
+
    # Public: Inserts a new token into the token stream, before a reference
    # token. If the reference token is the end-of-file mark, the token is
    # appended instead.
    #
    # token    - The `Token` to be inserted into the token stream.
    # original - The reference `Token` before which the new `Token` is
    #            inserted.
    #
    # NOTE(review): the non-eof branch does not shift the array — it writes
    # `token` into `original`'s slot and writes `original` into the *next*
    # slot, overwriting whatever token was there. This is only safe when
    # `original` is the last element of the stream; later tokens' `:index`
    # entries are also left stale. Confirm the parser only ever inserts
    # before the most recently lexed token.
    #
    # Returns the new `Token`.
    def insert_before(token, original)
      if original[:name] == Token::Types[:eof]
        token[:index] = @tokens.size
        @tokens << token
      else
        token[:index] = original[:index]
        @tokens[token[:index]] = token
        original[:index] += 1
        @tokens[original[:index]] = original
      end
      token
    end
|
134
|
+
|
135
|
+
    # Internal: Returns the maximum number of characters, relative to the
    # current scan pointer, that may be parsed as valid identifier
    # characters. The scan pointer is not advanced.
    #
    # lex_as_fragment - A boolean that specifies whether the identifier may
    #   be lexed as a fragment (`IdentifierPart`). When false, the first
    #   character must satisfy the stricter `IdentifierStart` production
    #   (default: false).
    #
    # NOTE(review): the loop condition `until eof?` tests `@index`, which
    # never changes inside this method — termination actually relies on
    # `@source[size]` returning nil past the end of the string, which fails
    # both `=~` tests and triggers a `break`. The condition is effectively
    # "until the lexer was already at eof on entry".
    #
    # Returns the matched length as an `Integer` (zero if no identifier).
    def match_identifier?(lex_as_fragment = false)
      size = @index
      until eof?
        # Unicode escape sequences (\uXXXX) may occur anywhere within an
        # identifier; consume all six characters at once.
        if /^\\u\h{4}$/ =~ @source[size, 6]
          size += 6
        else
          character = @source[size]
          if lex_as_fragment
            # Use the full `IdentifierPart` production.
            break unless character =~ IDENTIFIER_FRAGMENT
          else
            # The initial character must conform to the more restrictive
            # `IdentifierStart` production.
            break unless character =~ IDENTIFIER_START
            # All subsequent characters may be lexed as identifier fragments.
            lex_as_fragment = true
          end
          size += 1
        end
      end
      size - @index
    end
|
169
|
+
|
170
|
+
# Internal: Returns the maximum number of characters, relative to the
|
171
|
+
# current scan pointer, that may be parsed as valid decimal characters.
|
172
|
+
# The scan pointer is not advanced.
|
173
|
+
def match_decimal?
|
174
|
+
size = @index
|
175
|
+
size += 1 until eof? || @source[size] !~ /\d/
|
176
|
+
size - @index
|
177
|
+
end
|
178
|
+
|
179
|
+
# Public: Returns `true` if the lexer has reached the end of the source
|
180
|
+
# string.
|
181
|
+
def eof?
|
182
|
+
@terminated || @index >= @source.size
|
183
|
+
end
|
184
|
+
|
185
|
+
    # Public: Lexes a single token at the current scan position.
    #
    # pattern - If the token is `/` or `/=`, specifies whether it may be
    #   lexed as part of a regular expression. If `false`, the token will be
    #   lexed as a division operator instead (default: true).
    #
    # Returns the lexed `Token`, or `nil` if the lexer has finished scanning
    # the source.
    def lex(pattern = true)
      return if @terminated
      if eof?
        # Produce the end-of-file mark once; subsequent calls return nil.
        @terminated ||= true
        token = Token.new(self, :eof, @source.size...@source.size)
        # NOTE(review): this early return skips the append below, so the eof
        # token is returned to the caller but never stored in @tokens —
        # confirm this is intended.
        return token
      end
      # TOKEN always matches (every capture is optional); the block receives
      # the MatchData and its result becomes `token`.
      token = TOKEN.match(@source, @index) do |match|
        case
        # Produces a whitespace, line terminator, line comment (`// ...`), or
        # block comment (`/* ... */`) token.
        when match[:whitespace] then lex_whitespace
        when match[:line_terminator] then lex_line_terminator
        when match[:line_comment] then lex_line_comment
        when match[:block_comment] then lex_block_comment
        # Produces a single- or double-quoted string token. A single method
        # is used to produce both kinds of tokens.
        when match[:single_quoted_string] then lex_string :single
        when match[:double_quoted_string] then lex_string :double
        # Produces a hexadecimal or decimal token. Octal numbers produce an
        # error, as they are prohibited in ES 5.
        when match[:number] then lex_number
        # `/` and `/=` may be interpreted as either regular expressions or
        # division operators. The `pattern` argument specifies whether these
        # tokens should be lexed as RegExps or punctuators.
        when pattern && match[:pattern] then lex_pattern
        else
          # The `<pattern>` capture may contain the `/` and `/=` tokens.
          if result = match[:pattern] || match[:punctuator]
            token = Token.new(self, :punctuator, @index...@index += result.size)
            @column += token.size
            token
          else
            # Lex the token as an identifier.
            lex_identifier
          end
        end
      end
      # Record the position of the token in the token stream.
      token[:index] = @tokens.size
      @tokens << token
      token
    end
|
236
|
+
|
237
|
+
# Internal: Lexes a whitespace token at the current scan position.
|
238
|
+
#
|
239
|
+
# Returns the lexed `Token`.
|
240
|
+
def lex_whitespace
|
241
|
+
token = Token.new(self, :whitespace, @index...@index += 1)
|
242
|
+
token[:isWhite] = true
|
243
|
+
@column += 1
|
244
|
+
token
|
245
|
+
end
|
246
|
+
|
247
|
+
# Internal: Lexes a line terminator at the current scan position: either a
|
248
|
+
# line feed, carriage return, line separator, or paragraph separator. See
|
249
|
+
# section 7.3 of the spec.
|
250
|
+
#
|
251
|
+
# Returns the lexed `Token`.
|
252
|
+
def lex_line_terminator
|
253
|
+
character = @source[@index]
|
254
|
+
stop = @index + 1
|
255
|
+
# If the current character is a carriage return and the next character is
|
256
|
+
# a line feed, the source string contains CRLF line endings. The `stop`
|
257
|
+
# position is advanced one additional character, so that "\r\n" is treated
|
258
|
+
# as a single terminator.
|
259
|
+
stop += 1 if character == ?\r && @source[stop] == ?\n
|
260
|
+
# Advance the current index past the terminator.
|
261
|
+
token = Token.new(self, :line_terminator, @index...@index = stop)
|
262
|
+
token[:lines] = 1
|
263
|
+
token[:isWhite] = true
|
264
|
+
@line += 1
|
265
|
+
@column = 0
|
266
|
+
token
|
267
|
+
end
|
268
|
+
|
269
|
+
    # Internal: Lexes a line comment (`// ...`) at the current scan position.
    # The comment extends to the next line terminator (found via the
    # normalized source) or to the end of the source.
    #
    # NOTE(review): `@column` is assigned the *absolute source index* of the
    # newline here, not a column number — every other method treats @column
    # as a per-line offset. Confirm whether this is intentional.
    #
    # Returns the lexed `Token`.
    def lex_line_comment
      @column = @normalized_source.index(?\n, @index) || @source.length
      token = Token.new(self, :line_comment, @index...@index = @column)
      token[:isComment] = token[:isWhite] = true
      token
    end
|
278
|
+
|
279
|
+
    # Internal: Lexes a block comment (`/* ... */`) at the current scan
    # position.
    #
    # Returns the lexed `Token` (`:block_comment`, or `:error` when the
    # comment is unterminated).
    def lex_block_comment
      start = @index
      # Mark the ending position of the comment.
      stop = @source.index("*/", start)
      if stop
        # Advance the current position past the two-character "*/" closer.
        @index = stop + 2
        token = Token.new(self, :block_comment, start...@index)
        token[:isComment] = token[:isWhite] = true
        # Block comments trigger automatic semicolon insertion only if they
        # span multiple lines. The normalized source is used to quickly
        # detect line terminators.
        index = lines = 0
        # Count "\n" occurrences inside the comment span. NOTE(review): the
        # search starts at substring offset `index + 1`, so a terminator at
        # offset 0 would be missed — benign here, since the span always
        # begins with "/*".
        lines += 1 while index = @normalized_source[start...@index].index(?\n, index + 1)
        if lines.zero?
          # For single-line block comments, increase the column by the size
          # of the token.
          @column += token[:size]
        else
          # For multiline block comments, record the number of lines
          # comprising the comment and reset the column.
          @line += token[:lines] = lines
          @column = 0
        end
      else
        # Unterminated block comment. If a line terminator is found, the
        # comment is assumed to end immediately before it. Otherwise, the
        # comment is assumed to end two characters after the current scan
        # position.
        stop = @normalized_source.index(?\n, @index)
        @index = stop || @index + 2
        token = Token.new(self, :error, start...@index)
        token[:error] = "Unterminated block comment."
        token[:isComment] = token[:isWhite] = token[:tokenError] = true
        @column += token[:size]
      end
      token
    end
|
320
|
+
|
321
|
+
# Internal: Lexes a single- or double-quoted string primitive at the
|
322
|
+
# current scan position.
|
323
|
+
#
|
324
|
+
# style - A `Symbol` that specifies the quoting style. The quoting style
|
325
|
+
# must be defined as a key in the `Lexer::STRINGS` hash.
|
326
|
+
#
|
327
|
+
# Returns the lexed `Token`.
|
328
|
+
# Raises `KeyError` if the quoting style is not defined in the hash.
|
329
|
+
def lex_string(style)
|
330
|
+
style = STRINGS.fetch(style)
|
331
|
+
start = @index
|
332
|
+
lines = 0
|
333
|
+
loop do
|
334
|
+
# Parse escape sequences in strings.
|
335
|
+
until eof? || @source[@index += 1] != ?\\
|
336
|
+
# Record the number of new lines if the string contains linefeeds. The shadow input is
|
337
|
+
# used to avoid repeatedly normalizing line endings.
|
338
|
+
@line += (lines += 1) if @normalized_source[@index + 1] == ?\n
|
339
|
+
# Advance to the next character.
|
340
|
+
@index += 1
|
341
|
+
end
|
342
|
+
# If the string contains an unescaped line terminator, it is a syntax error. Some
|
343
|
+
# environments permit unescaped new lines in strings; however, the spec disallows them.
|
344
|
+
if @source[@index] =~ LINE_TERMINATORS
|
345
|
+
token = Token.new(self, :error, start...@index)
|
346
|
+
token[:error] = style[:invalid_continuation_error]
|
347
|
+
token[:isString] = token[:tokenError] = true
|
348
|
+
break
|
349
|
+
end
|
350
|
+
# Consume escape sequences until either the end of the source or the end-of-string character
|
351
|
+
# is reached.
|
352
|
+
break if eof? || @source[@index] == style[:quote]
|
353
|
+
end
|
354
|
+
# If the end of the source is reached without consuming the end-of-string character, the
|
355
|
+
# source contains an unterminated string literal.
|
356
|
+
if @source[@index] == style[:quote]
|
357
|
+
# Advance the index past the end-of-string character.
|
358
|
+
@index += 1
|
359
|
+
token = Token.new(self, style[:kind], start...@index)
|
360
|
+
token[:isPrimitive] = token[:isString] = true
|
361
|
+
# Update the line and column entries accordingly.
|
362
|
+
if lines.zero?
|
363
|
+
@column += token[:size]
|
364
|
+
else
|
365
|
+
token[:lines] = lines
|
366
|
+
@column = 0
|
367
|
+
end
|
368
|
+
else
|
369
|
+
token = Token.new(self, :error, start...@index)
|
370
|
+
token[:error] = style[:unterminated_string_error]
|
371
|
+
token[:isString] = token[:tokenError] = true
|
372
|
+
@column += token[:size]
|
373
|
+
end
|
374
|
+
token
|
375
|
+
end
|
376
|
+
|
377
|
+
    # Internal: Lexes a decimal or hexadecimal numeric value. See section
    # 7.8.3. Octal literals (leading zero followed by a digit) produce an
    # error token, as they are prohibited in ES 5.
    #
    # Returns the lexed `Token`.
    def lex_number
      start = @index
      @index += 1
      # If the token begins with `0x`/`0X`, parse the remainder as a
      # hexadecimal value.
      if @source[start..@index] =~ /0[xX]/
        position = @index += 1
        # Consume characters until the end of the string or a non-hexdigit
        # character is encountered.
        @index += 1 until eof? || @source[@index] !~ /\h/
        # If no additional characters were consumed, the hex value is invalid
        # (a bare "0x").
        if position == @index
          token = Token.new(self, :error, start...@index)
          token[:error] = "Invalid hexdigit value."
          token[:isNumber] = token[:tokenError] = true
        else
          # Syntactically sound; the parser judges semantics.
          token = Token.new(self, :hexadecimal_number, start...@index)
          token[:isPrimitive] = token[:isNumber] = true
        end
      else
        # Determine if an octal literal is being parsed (a leading zero
        # followed by a decimal digit).
        is_octal = @source[start..@index] =~ /0\d/
        # Parse the integral component before the decimal point (skipped
        # when the literal starts with `.`).
        unless @source[start] == ?.
          @index += match_decimal?
          # Advance past the decimal point.
          @index += 1 if @source[@index] == ?.
        end
        # Parse the fractional component.
        @index += match_decimal?
        # Parse the exponent.
        if @source[@index] =~ /[eE]/
          # Advance past the optional sign.
          @index += 1 if @source[@index += 1] =~ /[+-]/
          # Mark the current position and consume decimal digits after the
          # exponent marker.
          position = @index
          @index += match_decimal?
          # An exponent marker with no digits is invalid.
          # NOTE(review): unlike the other numeric errors this token does not
          # set :isNumber — confirm whether that asymmetry is intended.
          if position == @index
            token = Token.new(self, :error, start...@index)
            token[:error] = "Exponents may not be empty."
            token[:tokenError] = true
          end
        end
        # `token` is still nil here unless the exponent branch produced an
        # error token.
        unless token
          # Octal literals are invalid in ES 5.
          if is_octal
            token = Token.new(self, :error, start...@index)
            token[:error] = "Invalid octal escape sequence."
            token[:isNumber] = token[:isOctal] = token[:tokenError] = true
          else
            # Syntactically valid decimal value. As with hexdigits, the
            # parser will determine if the lexed value is semantically sound.
            token = Token.new(self, :decimal_number, start...@index)
            token[:isPrimitive] = token[:isNumber] = true
          end
        end
      end
      @column += token[:size]
      token
    end
|
446
|
+
|
447
|
+
    # Internal: Lexes a regular expression literal. See section 7.8.5.
    #
    # Returns the lexed `Token` (`:pattern`, or `:error` for malformed or
    # unterminated literals).
    def lex_pattern
      start = @index
      # Maps the initial position of each balanced construct (capturing
      # group, character class, quantifier braces) to its terminal position,
      # and vice versa. Positions are relative to `start`.
      balanced = {}
      # Stack of open capturing-group positions; must be empty at the end.
      groups = []
      # A flag that specifies if the regular expression is terminated.
      terminated = false
      # Only the last syntax error is preserved for improperly constructed
      # regular expressions.
      syntax_error = nil
      loop do
        @index += 1
        break if eof?
        # Use the normalized input to quickly detect line terminators.
        case character = @normalized_source[@index]
        when ?\n
          # Line terminators cannot occur within RegExp literals.
          # NOTE(review): this error token — like the unbalanced-bracket one
          # below — can be overwritten after the loop, since the `terminated`
          # branch unconditionally builds a new token. Confirm the intended
          # precedence.
          token = Token.new(self, :error, start...@index)
          token[:error] = "Line terminators are not permitted within RegExp literals."
          token[:tokenError] = token[:errorHasContent] = true
          # Avoid emitting a second unterminated RegExp error once lexing is
          # complete.
          terminated = true
          break
        when ?/
          # An unescaped `/` marks the end of the regular expression.
          terminated = true
          break
        when /[?*+]/
          syntax_error = "`?`, `*`, and `+` require a value to repeat."
        when ?^
          # `^` may only occur immediately following `|`, or at the beginning
          # of either the pattern, a capturing group, or a lookahead
          # assertion (`?:`, `?=`, or `?!`). Character classes are lexed
          # separately below.
          unless @source[@index - 1] =~ %r{[/|(]} || @source[@index - 3, 3] =~ /\(\?[:!=]/
            syntax_error = "`^` may not occur here."
          end
        when ?$
          # `$` may only occur immediately before `|`, or at the end of
          # either the pattern, a capturing group, or a lookahead assertion.
          unless @source[@index + 1] =~ %r{[/|)]}
            syntax_error = "`$` may not occur here."
          end
        when ?}
          # All unescaped braces are assumed to delimit quantifiers (as in
          # the ZeParser tokenizer), so a bare `}` is a mismatch.
          syntax_error = "Mismatched `}`."
        else
          # Lex capturing groups.
          if character == ?(
            # Mark the initial position of the capturing group.
            groups << @index - start
          elsif character == ?)
            if groups.empty?
              syntax_error = "Capturing group parentheses must be balanced."
            else
              # Record the initial and terminal positions of the parentheses
              # delimiting the group.
              terminal = @index - start
              balanced[initial = groups.pop] = terminal
              balanced[terminal] = initial
            end
          end

          # Character classes.
          if character == ?[
            # Record the initial position of the character class.
            initial = @index - start
            # Characters in character classes are treated literally; only
            # line terminators and unescaped closing brackets end the scan
            # (they are not part of `RegularExpressionClassChar`).
            loop do
              @index += 1
              break if eof? || @normalized_source[@index] == ?\n || @source[@index] == ?]
              if @source[@index] == ?\\
                if @normalized_source[@index + 1] == ?\n
                  # Abort lexing if a line terminator is encountered.
                  break
                else
                  # Skip the escaped character so `\]` is lexed correctly.
                  @index += 1
                end
              end
            end
            if @source[@index] == ?]
              # Record the initial and terminal positions of the brackets
              # delimiting the class.
              terminal = @index - start
              balanced[initial] = terminal
              balanced[terminal] = initial
            else
              token = Token.new(self, :error, start...@index)
              token[:error] = "Character class brackets must be balanced."
              token[:tokenError] = true
              # Avoid emitting an unterminated RegExp error once lexing is
              # complete.
              terminated = true
              break
            end
          # Escape sequences may occur anywhere within the RegExp and mark
          # the following character as literal (unless it is a terminator).
          elsif character == ?\\ && @normalized_source[@index + 1] != ?\n
            @index += 1
          end

          # Lookahead assertions and quantifiers.
          if character == ?(
            # Lex a non-capturing group, positive lookahead, or negative
            # lookahead marker.
            @index += 2 if @source[@index + 1, 2] =~ /\?[:=!]/
          else
            # Lex quantifiers.
            case @source[@index + 1]
            when ??
              # `?` matches the preceding character zero or one times.
              @index += 1
            when /[*+]/
              # `*` matches zero or more, `+` one or more occurrences; a
              # trailing `?` makes the match non-greedy.
              @index += 1 if @source[@index += 1] == ??
            when ?{
              # Advance one character and mark the initial position of the
              # `{n}` / `{n,}` / `{n,m}` quantifier.
              @index += 1
              initial = @index - start
              unless @source[@index += 1] =~ /\d/
                syntax_error = "Quantifier curly requires at least one digit before the comma"
              end
              # Lex the `n` value.
              @index += match_decimal?
              # Lex the `m` value, if any, if a comma is specified.
              @index += match_decimal? if @source[@index += 1] == ?,
              # Quantifier braces must be balanced.
              if @source[@index + 1] == ?}
                @index += 1
                terminal = @index - start
                balanced[initial] = terminal
                balanced[terminal] = initial
                # A trailing `?` indicates a non-greedy match.
                @index += 1 if @source[@index + 1] == ??
              else
                syntax_error = "Quantifier curly requires to be closed"
              end
            end
          end
        end
      end

      # Construct the token.
      unless terminated
        token = Token.new(self, :error, start...@index)
        token[:error] = "Unterminated RegExp literal."
        token[:tokenError] = true
      else
        # Advance past the closing `/` and lex the regular expression flags,
        # if any, as an identifier fragment (the `RegularExpressionFlags`
        # grammar is that of `IdentifierPart`).
        @index += 1
        @index += match_identifier? :fragment
        if !groups.empty?
          # At least one set of capturing group parentheses was unbalanced.
          token = Token.new(self, :error, start...@index)
          token[:tokenError] = true
          token[:error] = "Mismatched `(` or `)`."
        elsif syntax_error
          # Report the last recorded syntax error.
          token = Token.new(self, :error, start...@index)
          token[:tokenError] = token[:errorHasContent] = true
          token[:error] = syntax_error
        else
          token = Token.new(self, :pattern, start...@index)
          token[:isPrimitive] = true
          token[:pairs] = balanced
        end
      end
      @column += @index - start
      token
    end
|
647
|
+
|
648
|
+
# Internal: Lexes a regular expression literal. See sections 7.1 and 7.6.
|
649
|
+
#
|
650
|
+
# Returns the lexed `Token`.
|
651
|
+
def lex_identifier
|
652
|
+
size = match_identifier?
|
653
|
+
if size.zero?
|
654
|
+
character = @source[@index]
|
655
|
+
token = Token.new(self, :error, @index...@index += 1)
|
656
|
+
token[:tokenError] = true
|
657
|
+
token[:error] = if character == ?\\
|
658
|
+
@source[@index] == ?u ? "Invalid Unicode escape sequence." : "Illegal escape sequence."
|
659
|
+
else
|
660
|
+
"Invalid token."
|
661
|
+
end
|
662
|
+
else
|
663
|
+
token = Token.new(self, :identifier, @index...@index += size)
|
664
|
+
# Mark the token as a primitive if it is in the `Lexer::LITERALS` array.
|
665
|
+
token[:isPrimitive] = LITERALS.include? token[:value]
|
666
|
+
end
|
667
|
+
@column += token[:size]
|
668
|
+
token
|
669
|
+
end
|
670
|
+
end
|
671
|
+
end
|