violet 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,34 @@
#!/usr/bin/env ruby

# Interactive console for Violet. Loads the library and the `pal` REPL
# toolkit, then drops the user into a read-eval-print loop.
require File.expand_path("../lib/violet", File.dirname(__FILE__))
require "pal"

module Violet
  # Internal: Defines the commands and variables available to the interactive
  # shell.
  class Command < Pal::Context
    # Public: Lexes a string of JavaScript source code.
    #
    # source   - The source `String`.
    # patterns - Boolean arguments that correspond to each lexed token and
    #   specify if the `/` and `/=` tokens may be interpreted as regular
    #   expressions (`true`) or division operators (`false`).
    #
    # Returns an `Array` of `Token`s.
    def lex(source, *patterns)
      Lexer.new(source).tokens(*patterns)
    end

    # Public: Parses a string of JavaScript source code.
    #
    # source - The source `String`.
    #
    # Returns an `Array` of `Token`s.
    def parse(source)
      Parser.parse(source)
    end
  end

  # Start the interactive shell.
  Pal::REPL.new("violet", Command.new).loop
end
@@ -0,0 +1,22 @@
# -*- encoding: utf-8 -*-

module Violet
  # Public: Contains the version information.
  module Version
    # Public: The individual semantic version components.
    MAJOR = 0
    MINOR = 0
    PATCH = 1

    # Public: The current version of Violet as a semantic version string,
    # "MAJOR.MINOR.PATCH".
    STRING = [MAJOR, MINOR, PATCH].join(".")
  end

  # Internal: A named `Error` class, used for reporting parse errors.
  Error = Class.new(StandardError)

  # Prepend the `lib` directory to the load path to facilitate loading Violet
  # without RubyGems. Modules and classes will be loaded as needed.
  $:.unshift File.expand_path(File.dirname(__FILE__))

  autoload :Token, "violet/token"
  autoload :Lexer, "violet/lexer"
  autoload :Parser, "violet/parser"
end
@@ -0,0 +1,671 @@
# -*- encoding: utf-8 -*-

module Violet
  # Internal: Records exceptions emitted by the lexer. Subclasses the
  # library-wide `Violet::Error`.
  LexerError = Class.new(Error)

  # Public: Lexes a JavaScript source string into a stream of `Token`s.
  class Lexer
9
+ # Public: Matches line terminators: line feeds, carriage returns, line
10
+ # separators, and paragraph separators. See section 7.3 of the ES 5.1 spec.
11
+ LINE_TERMINATORS = /[\n\r\u2028\u2029]/
12
+
13
+ # Public: Matches line separators, paragraph separators, and carriage
14
+ # returns not followed by line separators. Used to convert all line
15
+ # terminators to line feeds. CRLF line endings are preserved.
16
+ NORMALIZE_LINE_ENDINGS = /[\u2028\u2029]|(?:\r[^\n])/
17
+
18
+ # Public: Matches Unicode letters, `$`, `_`, and Unicode escape sequences.
19
+ # See section 7.6.
20
+ IDENTIFIER_START = /[$_\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]/
21
+
22
+ # Public: Matches identifier starting characters, Unicode combining marks,
23
+ # Unicode digits, Unicode connector punctuators, zero-width non-joiners, and
24
+ # zero-width joiners. See section 7.1.
25
+ IDENTIFIER_FRAGMENT = Regexp.union(IDENTIFIER_START, /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}\u200c\u200d]/)
26
+
27
+ # Public: Matches an ECMAScript token. This is a superset of the `Token`
28
+ # production defined in section 7.5 of the spec.
29
+ TOKEN = %r(
30
+ ## Whitespace characters: tab, vertical tab, form feed, space,
31
+ # non-breaking space, byte-order mark, and other Unicode space separators
32
+ # (Category Z). The space and non-breaking space characters are matched by
33
+ # the \p{Z} Unicode category class. See section 7.2 of the ES spec.
34
+ (?<whitespace>[\t\v\f\ufeff\uffff\p{Z}])?
35
+ # Line terminators. See section 7.3.
36
+ (?<line_terminator>#{LINE_TERMINATORS})?
37
+ # Line and block comments. See section 7.4.
38
+ (?<line_comment>//)?
39
+ (?<block_comment>/\*)?
40
+ # Single- and double-quoted string literals. See section 7.8.4.
41
+ (?<single_quoted_string>')?
42
+ (?<double_quoted_string>")?
43
+ # Numeric literals. See section 7.8.3.
44
+ (?<number>\.?[0-9])?
45
+ # RegExp literals. See section 7.8.5. This capture may also match the
46
+ # `DivPunctuator` production.
47
+ (?:(?<pattern>/)[^=])?
48
+ # Punctuators. See section 7.7.
49
+ (?<punctuator>\>>>=|===|!==|>>>|<<=|>>=|<=|>=|==|!=|\+\+|--|<<|>>|&&|
50
+ \|\||\+=|-=|\*=|%=|&=|\|=|\^=|/=|\{|\}|\(|\)|\[|\]|\.|;|,|<|>|\+|-|
51
+ \*|%|\||&|\||\^|!|~|\?|:|=|/)?
52
+ )x
53
+
54
+ # Internal: The `true`, `false`, and `null` literals, as well as the
55
+ # `undefined` value. The lexer marks these four values as primitives.
56
+ LITERALS = %w( undefined null true false )
57
+
58
+ # Internal: A `Hash` that contains the quote character, token kind, and the
59
+ # unterminated string and invalid line continuation error messages for
60
+ # single- and double-quoted string tokens.
61
+ STRINGS = %w( single ' double " ).each_slice(2).with_object({}) do |(kind, quote), value|
62
+ value[kind.to_sym] = {
63
+ :quote => quote,
64
+ :kind => "#{kind}_quoted_string".to_sym,
65
+ :unterminated_string_error => "Unterminated #{kind}-quoted string literal.",
66
+ :invalid_continuation_error => "Unescaped line terminators are not permitted within #{kind}-quoted string literals."
67
+ }
68
+ end
69
+
70
+ # Public: Gets the source string.
71
+ attr_reader :source
72
+
73
+ # Public: Gets the current line.
74
+ attr_reader :line
75
+
76
+ # Public: Gets the current column.
77
+ attr_reader :column
78
+
79
+ # Public: Creates a new `Lexer` with a source string.
80
+ #
81
+ # source - The source `String`.
82
+ def initialize(source)
83
+ @source = source
84
+ # Replace all line terminators with a single line feed, but preserve CRLF
85
+ # line endings.
86
+ @normalized_source = @source.gsub(NORMALIZE_LINE_ENDINGS, ?\n)
87
+ reset!
88
+ end
89
+
90
+ # Public: Resets the lexer to its original position and clears the token
91
+ # stream.
92
+ def reset!
93
+ @index = @line = @column = 0
94
+ @terminated = false
95
+ (@tokens ||= []).clear
96
+ end
97
+
98
+ # Public: Produces a complete token stream from the source. This method
99
+ # resets the lexer prior to lexing the source string.
100
+ #
101
+ # patterns - Zero or more boolean arguments that correspond to each lexed
102
+ # token and specify if the `/` and `/=` tokens may be interpreted as
103
+ # regular expressions (`true`) or division operators (`false`). This
104
+ # flag only applies to division and regular expression tokens; setting
105
+ # it for other tokens has no effect.
106
+ def tokens(*patterns)
107
+ reset!
108
+ index = -1
109
+ # Lex tokens until the end-of-file mark is reached.
110
+ loop { break unless lex patterns[index += 1] }
111
+ @tokens
112
+ end
113
+
114
+ # Public: Inserts a new token into the token stream, before a reference
115
+ # token. If the reference token is the end-of-file mark, the token is
116
+ # appended instead.
117
+ #
118
+ # token - The `Token` to be inserted into the token stream.
119
+ # original - The reference `Token` before which the new `Token` is inserted.
120
+ #
121
+ # Returns the new `Token`.
122
+ def insert_before(token, original)
123
+ if original[:name] == Token::Types[:eof]
124
+ token[:index] = @tokens.size
125
+ @tokens << token
126
+ else
127
+ token[:index] = original[:index]
128
+ @tokens[token[:index]] = token
129
+ original[:index] += 1
130
+ @tokens[original[:index]] = original
131
+ end
132
+ token
133
+ end
134
+
135
+ # Internal: Returns the maximum number of characters, relative to the
136
+ # current scan pointer, that may be parsed as valid identifier
137
+ # characters. The scan pointer is not advanced.
138
+ #
139
+ # lex_as_fragment - A boolean that specifies whether the identifier may be
140
+ # lexed as a fragment. Certain productions allow identifier fragments,
141
+ # while others require that the identifier begin with a subset of valid
142
+ # fragment characters (default: false).
143
+ def match_identifier?(lex_as_fragment = false)
144
+ size = @index
145
+ # Identifier starting characters are restricted to a subset of valid
146
+ # identifier fragment characters.
147
+ until eof?
148
+ # Unicode escape sequences may occur anywhere within an identifier.
149
+ if /^\\u\h{4}$/ =~ @source[size, 6]
150
+ # Advance the scan pointer past the Unicode escape sequence.
151
+ size += 6
152
+ else
153
+ character = @source[size]
154
+ if lex_as_fragment
155
+ # Use the full `IdentifierPart` production.
156
+ break unless character =~ IDENTIFIER_FRAGMENT
157
+ else
158
+ # The initial character must conform to the more restrictive
159
+ # `IdentifierStart` production.
160
+ break unless character =~ IDENTIFIER_START
161
+ # All subsequent characters may be lexed as identifier fragments.
162
+ lex_as_fragment = true
163
+ end
164
+ size += 1
165
+ end
166
+ end
167
+ size - @index
168
+ end
169
+
170
+ # Internal: Returns the maximum number of characters, relative to the
171
+ # current scan pointer, that may be parsed as valid decimal characters.
172
+ # The scan pointer is not advanced.
173
+ def match_decimal?
174
+ size = @index
175
+ size += 1 until eof? || @source[size] !~ /\d/
176
+ size - @index
177
+ end
178
+
179
+ # Public: Returns `true` if the lexer has reached the end of the source
180
+ # string.
181
+ def eof?
182
+ @terminated || @index >= @source.size
183
+ end
    # Public: Lexes a single token at the current scan position.
    #
    # pattern - If the token is `/` or `/=`, specifies whether it may be lexed
    #   as part of a regular expression. If `false`, the token will be lexed
    #   as a division operator instead (default: true).
    #
    # Returns the lexed `Token`, or `nil` if the lexer has finished scanning
    # the source.
    def lex(pattern = true)
      # Once the end-of-file mark has been emitted, every subsequent call
      # returns nil.
      return if @terminated
      if eof?
        @terminated ||= true
        token = Token.new(self, :eof, @source.size...@source.size)
        # NOTE(review): the EOF token is returned but — unlike every other
        # token below — never assigned an :index or appended to @tokens.
        # `insert_before` has a branch keyed on the EOF token, so confirm
        # whether the early return here is intentional.
        return token
      end
      # `Regexp#match` with a start position and a block yields the MatchData
      # and returns the block's value; exactly one named capture is expected
      # to be non-nil for the character at @index.
      token = TOKEN.match(@source, @index) do |match|
        case
        # Produces a whitespace, line terminator, line comment (`// ...`), or
        # block comment (`/* ... */`) token.
        when match[:whitespace] then lex_whitespace
        when match[:line_terminator] then lex_line_terminator
        when match[:line_comment] then lex_line_comment
        when match[:block_comment] then lex_block_comment
        # Produces a single- or double-quoted string token. A single method is
        # used to produce both kinds of tokens.
        when match[:single_quoted_string] then lex_string :single
        when match[:double_quoted_string] then lex_string :double
        # Produces a hexadecimal or decimal token. Octal numbers produce an
        # error, as they are prohibited in ES 5.
        when match[:number] then lex_number
        # `/` and `/=` may be interpreted as either regular expressions or
        # division operators. The `pattern` argument specifies whether these
        # tokens should be lexed as RegExps or punctuators.
        when pattern && match[:pattern] then lex_pattern
        else
          # The `<pattern>` capture may contain the `/` and `/=` tokens.
          if result = match[:pattern] || match[:punctuator]
            token = Token.new(self, :punctuator, @index...@index += result.size)
            # NOTE(review): `token.size` here vs. `token[:size]` everywhere
            # else in this class — confirm both resolve to the same value on
            # `Token`.
            @column += token.size
            token
          else
            # Nothing else matched: lex the token as an identifier (or emit
            # an invalid-token error).
            lex_identifier
          end
        end
      end
      # Record the position of the token in the token stream.
      token[:index] = @tokens.size
      @tokens << token
      token
    end
236
+
237
+ # Internal: Lexes a whitespace token at the current scan position.
238
+ #
239
+ # Returns the lexed `Token`.
240
+ def lex_whitespace
241
+ token = Token.new(self, :whitespace, @index...@index += 1)
242
+ token[:isWhite] = true
243
+ @column += 1
244
+ token
245
+ end
246
+
247
+ # Internal: Lexes a line terminator at the current scan position: either a
248
+ # line feed, carriage return, line separator, or paragraph separator. See
249
+ # section 7.3 of the spec.
250
+ #
251
+ # Returns the lexed `Token`.
252
+ def lex_line_terminator
253
+ character = @source[@index]
254
+ stop = @index + 1
255
+ # If the current character is a carriage return and the next character is
256
+ # a line feed, the source string contains CRLF line endings. The `stop`
257
+ # position is advanced one additional character, so that "\r\n" is treated
258
+ # as a single terminator.
259
+ stop += 1 if character == ?\r && @source[stop] == ?\n
260
+ # Advance the current index past the terminator.
261
+ token = Token.new(self, :line_terminator, @index...@index = stop)
262
+ token[:lines] = 1
263
+ token[:isWhite] = true
264
+ @line += 1
265
+ @column = 0
266
+ token
267
+ end
268
+
269
+ # Internal: Lexes a line comment at the current scan position.
270
+ #
271
+ # Returns the lexed `Token`.
272
+ def lex_line_comment
273
+ @column = @normalized_source.index(?\n, @index) || @source.length
274
+ token = Token.new(self, :line_comment, @index...@index = @column)
275
+ token[:isComment] = token[:isWhite] = true
276
+ token
277
+ end
278
+
279
+ # Internal: Lexes a block comment at the current scan position.
280
+ #
281
+ # Returns the lexed `Token`.
282
+ def lex_block_comment
283
+ start = @index
284
+ # Mark the ending position of the comment.
285
+ stop = @source.index("*/", start)
286
+ if stop
287
+ # Advance the current position past the end of the comment.
288
+ @index = stop + 2
289
+ token = Token.new(self, :block_comment, start...@index)
290
+ token[:isComment] = token[:isWhite] = true
291
+ # Block comments trigger automatic semicolon insertion only if they
292
+ # span multiple lines. The normalized source is used to quickly
293
+ # detect line terminators.
294
+ index = lines = 0
295
+ # Advance the current line.
296
+ lines += 1 while index = @normalized_source[start...@index].index(?\n, index + 1)
297
+ if lines.zero?
298
+ # For single-line block comments, increase the column by the size of
299
+ # the token.
300
+ @column += token[:size]
301
+ else
302
+ # For multiline block comments, record the number of lines comprising
303
+ # the comment and reset the column.
304
+ @line += token[:lines] = lines
305
+ @column = 0
306
+ end
307
+ else
308
+ # Unterminated block comment. If a line terminator is found, the comment
309
+ # is assumed to end immediately before it. Otherwise, the comment is
310
+ # assumed to end two characters after the current scan position.
311
+ stop = @normalized_source.index(?\n, @index)
312
+ @index = stop || @index + 2
313
+ token = Token.new(self, :error, start...@index)
314
+ token[:error] = "Unterminated block comment."
315
+ token[:isComment] = token[:isWhite] = token[:tokenError] = true
316
+ @column += token[:size]
317
+ end
318
+ token
319
+ end
320
+
321
+ # Internal: Lexes a single- or double-quoted string primitive at the
322
+ # current scan position.
323
+ #
324
+ # style - A `Symbol` that specifies the quoting style. The quoting style
325
+ # must be defined as a key in the `Lexer::STRINGS` hash.
326
+ #
327
+ # Returns the lexed `Token`.
328
+ # Raises `KeyError` if the quoting style is not defined in the hash.
329
+ def lex_string(style)
330
+ style = STRINGS.fetch(style)
331
+ start = @index
332
+ lines = 0
333
+ loop do
334
+ # Parse escape sequences in strings.
335
+ until eof? || @source[@index += 1] != ?\\
336
+ # Record the number of new lines if the string contains linefeeds. The shadow input is
337
+ # used to avoid repeatedly normalizing line endings.
338
+ @line += (lines += 1) if @normalized_source[@index + 1] == ?\n
339
+ # Advance to the next character.
340
+ @index += 1
341
+ end
342
+ # If the string contains an unescaped line terminator, it is a syntax error. Some
343
+ # environments permit unescaped new lines in strings; however, the spec disallows them.
344
+ if @source[@index] =~ LINE_TERMINATORS
345
+ token = Token.new(self, :error, start...@index)
346
+ token[:error] = style[:invalid_continuation_error]
347
+ token[:isString] = token[:tokenError] = true
348
+ break
349
+ end
350
+ # Consume escape sequences until either the end of the source or the end-of-string character
351
+ # is reached.
352
+ break if eof? || @source[@index] == style[:quote]
353
+ end
354
+ # If the end of the source is reached without consuming the end-of-string character, the
355
+ # source contains an unterminated string literal.
356
+ if @source[@index] == style[:quote]
357
+ # Advance the index past the end-of-string character.
358
+ @index += 1
359
+ token = Token.new(self, style[:kind], start...@index)
360
+ token[:isPrimitive] = token[:isString] = true
361
+ # Update the line and column entries accordingly.
362
+ if lines.zero?
363
+ @column += token[:size]
364
+ else
365
+ token[:lines] = lines
366
+ @column = 0
367
+ end
368
+ else
369
+ token = Token.new(self, :error, start...@index)
370
+ token[:error] = style[:unterminated_string_error]
371
+ token[:isString] = token[:tokenError] = true
372
+ @column += token[:size]
373
+ end
374
+ token
375
+ end
376
+
377
+ # Internal: Lexes a decimal or hexadecimal numeric value. See section 7.8.3.
378
+ #
379
+ # Returns the lexed `Token`.
380
+ def lex_number
381
+ start = @index
382
+ @index += 1
383
+ # If the token begins with a `0x`, parse the remainder as a hexadecimal value.
384
+ if @source[start..@index] =~ /0[xX]/
385
+ position = @index += 1
386
+ # Consume characters until the end of the string or a non-hexdigit
387
+ # character is encountered.
388
+ @index += 1 until eof? || @source[@index] !~ /\h/
389
+ # If no additional characters were consumed, the hex value is invalid.
390
+ if position == @index
391
+ token = Token.new(self, :error, start...@index)
392
+ token[:error] = "Invalid hexdigit value."
393
+ token[:isNumber] = token[:tokenError] = true
394
+ else
395
+ # The value is syntactically sound.
396
+ token = Token.new(self, :hexadecimal_number, start...@index)
397
+ token[:isPrimitive] = token[:isNumber] = true
398
+ end
399
+ else
400
+ # Determine if an octal escape sequence is being parsed (i.e., a leading
401
+ # zero followed by a decimal digit).
402
+ is_octal = @source[start..@index] =~ /0\d/
403
+ # Parse the integral expression before the decimal point.
404
+ unless @source[start] == ?.
405
+ # Consume characters until the end of the string or a non-decimal
406
+ # character is encountered.
407
+ @index += match_decimal?
408
+ # Advance past the decimal point.
409
+ @index += 1 if @source[@index] == ?.
410
+ end
411
+ # Parse the decimal component.
412
+ @index += match_decimal?
413
+ # Parse the exponent.
414
+ if @source[@index] =~ /[eE]/
415
+ # Advance past the sign.
416
+ @index += 1 if @source[@index += 1] =~ /[+-]/
417
+ # Mark the current position and consume decimal digits past the
418
+ # exponential.
419
+ position = @index
420
+ @index += match_decimal?
421
+ # If no additional characters were consumed but an exponent was lexed,
422
+ # the decimal value is invalid.
423
+ if position == @index
424
+ token = Token.new(self, :error, start...@index)
425
+ token[:error] = "Exponents may not be empty."
426
+ token[:tokenError] = true
427
+ end
428
+ end
429
+ unless token
430
+ # Octal literals are invalid in ES 5.
431
+ if is_octal
432
+ token = Token.new(self, :error, start...@index)
433
+ token[:error] = "Invalid octal escape sequence."
434
+ token[:isNumber] = token[:isOctal] = token[:tokenError] = true
435
+ else
436
+ # Syntactically valid decimal value. As with hexdigits, the parser
437
+ # will determine if the lexed value is semantically sound.
438
+ token = Token.new(self, :decimal_number, start...@index)
439
+ token[:isPrimitive] = token[:isNumber] = true
440
+ end
441
+ end
442
+ end
443
+ @column += token[:size]
444
+ token
445
+ end
446
+
447
+ # Internal: Lexes a regular expression literal. See section 7.8.5.
448
+ #
449
+ # Returns the lexed `Token`.
450
+ def lex_pattern
451
+ start = @index
452
+ # Maintains a hash of the initial and terminal positions of balanced
453
+ # regular expression characters: grouping parentheses, character class
454
+ # brackets, and quantifier braces.
455
+ balanced = {}
456
+ # Ensures that all capturing groups in the pattern are balanced.
457
+ groups = []
458
+ # A flag that specifies if the regular expression is terminated.
459
+ terminated = false
460
+ # Only the last syntax error is preserved for improperly constructed
461
+ # regular expressions.
462
+ syntax_error = nil
463
+ loop do
464
+ @index += 1
465
+ break if eof?
466
+ # Use the normalized input to quickly detect line terminators.
467
+ case character = @normalized_source[@index]
468
+ when ?\n
469
+ # Line terminators cannot occur within RegExp literals.
470
+ token = Token.new(self, :error, start...@index)
471
+ token[:error] = "Line terminators are not permitted within RegExp literals."
472
+ token[:tokenError] = token[:errorHasContent] = true
473
+ # Avoid emitting a second unterminated RegExp error once lexing is
474
+ # complete.
475
+ terminated = true
476
+ break
477
+ when ?/
478
+ # An unescaped `/` marks the end of the regular expression.
479
+ terminated = true
480
+ break
481
+ when /[?*+]/
482
+ syntax_error = "`?`, `*`, and `+` require a value to repeat."
483
+ when ?^
484
+ # `^` may only occur immediately following `|`, or at the beginning
485
+ # of either the pattern, a capturing group, or a lookahead assertion
486
+ # (`?:`, `?=`, or `?!`). Note that `^` may also negate a character
487
+ # class; however, character classes have different semantics and are
488
+ # lexed separately.
489
+ unless @source[@index - 1] =~ %r{[/|(]} || @source[@index - 3, 3] =~ /\(\?[:!=]/
490
+ syntax_error = "`^` may not occur here."
491
+ end
492
+ when ?$
493
+ # `$` may only occur immediately before `|`, or at the end of either
494
+ # the pattern, a capturing group, or a lookahead assertion.
495
+ unless @source[@index + 1] =~ %r{[/|)]}
496
+ syntax_error = "`$` may not occur here."
497
+ end
498
+ when ?}
499
+ # Interpreters can distinguish between and automatically escape braces
500
+ # not used to delimit quantifiers. Nevertheless, it's considered a bad
501
+ # practice to leave special characters unescaped in RegExps. Both the
502
+ # Violet lexer and the ZeParser tokenizer assume that all unescaped
503
+ # braces delimit quantifiers, and emit errors accordingly.
504
+ syntax_error = "Mismatched `}`."
505
+ else
506
+ # Lex capturing groups.
507
+ if character == ?(
508
+ # Mark the initial position of the capturing group.
509
+ groups << @index - start
510
+ elsif character == ?)
511
+ if groups.empty?
512
+ syntax_error = "Capturing group parentheses must be balanced."
513
+ else
514
+ # Record the initial and terminal positions of the parentheses delimiting the group.
515
+ terminal = @index - start
516
+ balanced[initial = groups.pop] = terminal
517
+ balanced[terminal] = initial
518
+ end
519
+ end
520
+
521
+ # Character Classes.
522
+ # ------------------
523
+ if character == ?[
524
+ # Record the initial position of the character class.
525
+ initial = @index - start
526
+ # Characters in character classes are treated literally, so there
527
+ # is no need to escape them. The exceptions are line terminators and
528
+ # unescaped closing brackets, which are not part of the
529
+ # `RegularExpressionClassChar` grammar.
530
+ loop do
531
+ @index += 1
532
+ break if eof? || @normalized_source[@index] == ?\n || @source[@index] == ?]
533
+ if @source[@index] == ?\\
534
+ if @normalized_source[@index + 1] == ?\n
535
+ # Abort lexing if a line terminator is encountered.
536
+ break
537
+ else
538
+ # Skip lexing the subsequent escaped character. This ensures
539
+ # that escaped closing brackets (`\]`) are lexed correctly.
540
+ @index += 1
541
+ end
542
+ end
543
+ end
544
+ if @source[@index] == ?]
545
+ # Record the initial and terminal positions of the brackets
546
+ # delimiting the class.
547
+ terminal = @index - start
548
+ balanced[initial] = terminal
549
+ balanced[terminal] = initial
550
+ else
551
+ token = Token.new(self, :error, start...@index)
552
+ token[:error] = "Character class brackets must be balanced."
553
+ token[:tokenError] = true
554
+ # Avoid emitting an unterminated RegExp error once lexing is
555
+ # complete.
556
+ terminated = true
557
+ break
558
+ end
559
+ # Lex escaped characters. Escape sequences may occur anywhere within
560
+ # the RegExp, and indicate that the following character should be
561
+ # interpreted literally.
562
+ elsif character == ?\\ && @normalized_source[@index + 1] != ?\n
563
+ @index += 1
564
+ end
565
+
566
+ # Lookahead Assertions and Quantifiers.
567
+ # -------------------------------------
568
+ if character == ?(
569
+ # Lex a non-capturing group, positive lookahead, or negative lookahead.
570
+ @index += 2 if @source[@index + 1, 2] =~ /\?[:=!]/
571
+ else
572
+ # Lex quantifiers.
573
+ case @source[@index + 1]
574
+ when ??
575
+ # The `?` quantifier matches the preceding character zero or one
576
+ # times.
577
+ @index += 1
578
+ when /[*+]/
579
+ # The `*` quantifier matches the preceding character zero or more
580
+ # times; `+` matches a character one or more times. `*?` and `+?`
581
+ # indicate a non-greedy match.
582
+ @index += 1 if @source[@index += 1] == ??
583
+ when ?{
584
+ # Advance one character and mark the initial position of the
585
+ # quantifier.
586
+ @index += 1
587
+ initial = @index - start
588
+ # The `{n}` quantifier matches the preceding character exactly
589
+ # `n` times. `{n,}` matches at least `n` occurrences of the
590
+ # preceding character. `{n,m}` matches at least `n` and at most
591
+ # `m` occurrences.
592
+ unless @source[@index += 1] =~ /\d/
593
+ syntax_error = "Quantifier curly requires at least one digit before the comma"
594
+ end
595
+ # Lex the `n` value.
596
+ @index += match_decimal?
597
+ # Lex the `m` value, if any, if a comma is specified.
598
+ @index += match_decimal? if @source[@index += 1] == ?,
599
+ # Quantifier braces must be balanced.
600
+ if @source[@index + 1] == ?}
601
+ @index += 1
602
+ terminal = @index - start
603
+ balanced[initial] = terminal
604
+ balanced[terminal] = initial
605
+ # A trailing `?` indicates a non-greedy match.
606
+ @index += 1 if @source[@index + 1] == ??
607
+ else
608
+ syntax_error = "Quantifier curly requires to be closed"
609
+ end
610
+ end
611
+ end
612
+ end
613
+ end
614
+
615
+ # Construct the token.
616
+ # --------------------
617
+ unless terminated
618
+ token = Token.new(self, :error, start...@index)
619
+ token[:error] = "Unterminated RegExp literal."
620
+ token[:tokenError] = true
621
+ else
622
+ # Advance one character and lex the regular expression flags, if any,
623
+ # as an identifier fragment (the grammar for `RegularExpressionFlags`
624
+ # is that of `IdentifierPart`).
625
+ @index += 1
626
+ @index += match_identifier? :fragment
627
+ if !groups.empty?
628
+ # If the `groups` list is not empty, at least one set of capturing
629
+ # group parentheses was not balanced.
630
+ token = Token.new(self, :error, start...@index)
631
+ token[:tokenError] = true
632
+ token[:error] = "Mismatched `(` or `)`."
633
+ elsif syntax_error
634
+ # Add the last syntax error to the stack.
635
+ token = Token.new(self, :error, start...@index)
636
+ token[:tokenError] = token[:errorHasContent] = true
637
+ token[:error] = syntax_error
638
+ else
639
+ token = Token.new(self, :pattern, start...@index)
640
+ token[:isPrimitive] = true
641
+ token[:pairs] = balanced
642
+ end
643
+ end
644
+ @column += @index - start
645
+ token
646
+ end
    # Internal: Lexes an identifier — or emits an error token for an invalid
    # character — at the current scan position. (The original header comment
    # said "regular expression literal"; that was a copy-paste slip.) See
    # sections 7.1 and 7.6.
    #
    # Returns the lexed `Token`.
    def lex_identifier
      size = match_identifier?
      if size.zero?
        # No identifier characters here: consume one character and report it.
        character = @source[@index]
        token = Token.new(self, :error, @index...@index += 1)
        token[:tokenError] = true
        # @index has already advanced past the backslash, so @source[@index]
        # is the character following it.
        token[:error] = if character == ?\\
          @source[@index] == ?u ? "Invalid Unicode escape sequence." : "Illegal escape sequence."
        else
          "Invalid token."
        end
      else
        token = Token.new(self, :identifier, @index...@index += size)
        # Mark the token as a primitive if it is one of `undefined`, `null`,
        # `true`, or `false` (see `Lexer::LITERALS`).
        token[:isPrimitive] = LITERALS.include? token[:value]
      end
      @column += token[:size]
      token
    end
  end
end