iv-phonic 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. data/.autotest +24 -0
  2. data/Manifest.txt +49 -0
  3. data/README.rdoc +32 -0
  4. data/Rakefile +54 -0
  5. data/ext/include/iv/algorithm.h +23 -0
  6. data/ext/include/iv/alloc.h +200 -0
  7. data/ext/include/iv/any.h +71 -0
  8. data/ext/include/iv/ast-factory.h +277 -0
  9. data/ext/include/iv/ast-fwd.h +92 -0
  10. data/ext/include/iv/ast-serializer.h +579 -0
  11. data/ext/include/iv/ast-visitor.h +121 -0
  12. data/ext/include/iv/ast.h +1127 -0
  13. data/ext/include/iv/chars.h +83 -0
  14. data/ext/include/iv/cmdline.h +830 -0
  15. data/ext/include/iv/conversions.h +308 -0
  16. data/ext/include/iv/dtoa.h +20 -0
  17. data/ext/include/iv/enable_if.h +18 -0
  18. data/ext/include/iv/errors.h +15 -0
  19. data/ext/include/iv/fixedcontainer.h +42 -0
  20. data/ext/include/iv/functor.h +29 -0
  21. data/ext/include/iv/lexer.h +1281 -0
  22. data/ext/include/iv/location.h +23 -0
  23. data/ext/include/iv/mt19937.h +175 -0
  24. data/ext/include/iv/noncopyable.h +30 -0
  25. data/ext/include/iv/none.h +10 -0
  26. data/ext/include/iv/parser.h +2150 -0
  27. data/ext/include/iv/source.h +27 -0
  28. data/ext/include/iv/space.h +178 -0
  29. data/ext/include/iv/static_assert.h +30 -0
  30. data/ext/include/iv/stringpiece.h +385 -0
  31. data/ext/include/iv/token.h +311 -0
  32. data/ext/include/iv/ucdata.h +58 -0
  33. data/ext/include/iv/uchar.h +8 -0
  34. data/ext/include/iv/ustring.h +28 -0
  35. data/ext/include/iv/ustringpiece.h +9 -0
  36. data/ext/include/iv/utils.h +83 -0
  37. data/ext/include/iv/xorshift.h +74 -0
  38. data/ext/iv/phonic/ast-fwd.h +21 -0
  39. data/ext/iv/phonic/ast.h +10 -0
  40. data/ext/iv/phonic/creator.h +530 -0
  41. data/ext/iv/phonic/encoding.h +110 -0
  42. data/ext/iv/phonic/extconf.rb +5 -0
  43. data/ext/iv/phonic/factory.h +247 -0
  44. data/ext/iv/phonic/parser.h +12 -0
  45. data/ext/iv/phonic/phonic.cc +69 -0
  46. data/ext/iv/phonic/rnode.h +15 -0
  47. data/ext/iv/phonic/rparser.h +48 -0
  48. data/ext/iv/phonic/source.h +146 -0
  49. data/test/test_iv_phonic.rb +32 -0
  50. metadata +159 -0
@@ -0,0 +1,1281 @@
1
+ #ifndef _IV_LEXER_H_
2
+ #define _IV_LEXER_H_
3
+
4
+ #include <cstddef>
5
+ #include <cassert>
6
+ #include <cstdlib>
7
+ #include <vector>
8
+ #include <string>
9
+ #include "uchar.h"
10
+ #include "chars.h"
11
+ #include "token.h"
12
+ #include "source.h"
13
+ #include "location.h"
14
+ #include "noncopyable.h"
15
+
16
+ namespace iv {
17
+ namespace core {
18
+
19
+ class Lexer: private Noncopyable<Lexer>::type {
20
+ public:
21
// Bit flags passed to Next(); ScanIdentifier() tests them with '&'.
enum LexType {
  kClear = 0,
  kIdentifyReservedWords = 1 << 0,
  kIgnoreReservedWords = 1 << 1,
  kIgnoreReservedWordsAndIdentifyGetterOrSetter = 1 << 2,
  kStrict = 1 << 3
};
28
// Classification stored in type_: ScanNumber() records the radix
// (DECIMAL / HEX / OCTAL), ScanString() records NONE / ESCAPE / OCTAL.
enum State {
  NONE = 0,
  ESCAPE = 1,
  DECIMAL = 2,
  HEX = 3,
  OCTAL = 4
};
35
+
36
+ explicit Lexer(BasicSource* src)
37
+ : source_(src),
38
+ buffer8_(kInitialReadBufferCapacity),
39
+ buffer16_(kInitialReadBufferCapacity),
40
+ pos_(0),
41
+ end_(source_->size()),
42
+ has_line_terminator_before_next_(false),
43
+ has_shebang_(false),
44
+ line_number_(1),
45
+ location_() {
46
+ Initialize();
47
+ }
48
+
49
+ Token::Type Next(int type) {
50
+ Token::Type token;
51
+ has_line_terminator_before_next_ = false;
52
+ do {
53
+ location_.begin_position_ = pos();
54
+ while (Chars::IsWhiteSpace(c_)) {
55
+ // white space
56
+ Advance();
57
+ }
58
+ switch (c_) {
59
+ case '"':
60
+ case '\'':
61
+ // string literal
62
+ token = ScanString();
63
+ break;
64
+
65
+ case '<':
66
+ // < <= << <<= <!--
67
+ Advance();
68
+ if (c_ == '=') {
69
+ Advance();
70
+ token = Token::LTE;
71
+ } else if (c_ == '<') {
72
+ Advance();
73
+ if (c_ == '=') {
74
+ Advance();
75
+ token = Token::ASSIGN_SHL;
76
+ } else {
77
+ token = Token::SHL;
78
+ }
79
+ } else if (c_ == '!') {
80
+ token = ScanHtmlComment();
81
+ } else {
82
+ token = Token::LT;
83
+ }
84
+ break;
85
+
86
+ case '>':
87
+ // > >= >> >>= >>> >>>=
88
+ Advance();
89
+ if (c_ == '=') {
90
+ Advance();
91
+ token = Token::GTE;
92
+ } else if (c_ == '>') {
93
+ Advance();
94
+ if (c_ == '=') {
95
+ Advance();
96
+ token = Token::ASSIGN_SAR;
97
+ } else if (c_ == '>') {
98
+ Advance();
99
+ if (c_ == '=') {
100
+ Advance();
101
+ token = Token::ASSIGN_SHR;
102
+ } else {
103
+ token = Token::SHR;
104
+ }
105
+ } else {
106
+ token = Token::SAR;
107
+ }
108
+ } else {
109
+ token = Token::GT;
110
+ }
111
+ break;
112
+
113
+ case '=':
114
+ // = == ===
115
+ Advance();
116
+ if (c_ == '=') {
117
+ Advance();
118
+ if (c_ == '=') {
119
+ Advance();
120
+ token = Token::EQ_STRICT;
121
+ } else {
122
+ token = Token::EQ;
123
+ }
124
+ } else {
125
+ token = Token::ASSIGN;
126
+ }
127
+ break;
128
+
129
+ case '!':
130
+ // ! != !==
131
+ Advance();
132
+ if (c_ == '=') {
133
+ Advance();
134
+ if (c_ == '=') {
135
+ Advance();
136
+ token = Token::NE_STRICT;
137
+ } else {
138
+ token = Token::NE;
139
+ }
140
+ } else {
141
+ token = Token::NOT;
142
+ }
143
+ break;
144
+
145
+ case '+':
146
+ // + ++ +=
147
+ Advance();
148
+ if (c_ == '+') {
149
+ Advance();
150
+ token = Token::INC;
151
+ } else if (c_ == '=') {
152
+ Advance();
153
+ token = Token::ASSIGN_ADD;
154
+ } else {
155
+ token = Token::ADD;
156
+ }
157
+ break;
158
+
159
+ case '-':
160
+ // - -- --> -=
161
+ Advance();
162
+ if (c_ == '-') {
163
+ Advance();
164
+ if (c_ == '>' && has_line_terminator_before_next_) {
165
+ token = SkipSingleLineComment();
166
+ } else {
167
+ token = Token::DEC;
168
+ }
169
+ } else if (c_ == '=') {
170
+ Advance();
171
+ token = Token::ASSIGN_SUB;
172
+ } else {
173
+ token = Token::SUB;
174
+ }
175
+ break;
176
+
177
+ case '*':
178
+ // * *=
179
+ Advance();
180
+ if (c_ == '=') {
181
+ Advance();
182
+ token = Token::ASSIGN_MUL;
183
+ } else {
184
+ token = Token::MUL;
185
+ }
186
+ break;
187
+
188
+ case '%':
189
+ // % %=
190
+ Advance();
191
+ if (c_ == '=') {
192
+ Advance();
193
+ token = Token::ASSIGN_MOD;
194
+ } else {
195
+ token = Token::MOD;
196
+ }
197
+ break;
198
+
199
+ case '/':
200
+ // / // /* /=
201
+ // ASSIGN_DIV and DIV remain to be solved which is RegExp or not.
202
+ Advance();
203
+ if (c_ == '/') {
204
+ // SINGLE LINE COMMENT
205
+ if (line_number_ == (has_shebang_ ? 1 : 2)) {
206
+ // magic comment
207
+ token = ScanMagicComment();
208
+ } else {
209
+ token = SkipSingleLineComment();
210
+ }
211
+ } else if (c_ == '*') {
212
+ // MULTI LINES COMMENT
213
+ token = SkipMultiLineComment();
214
+ } else if (c_ == '=') {
215
+ // ASSIGN_DIV
216
+ Advance();
217
+ token = Token::ASSIGN_DIV;
218
+ } else {
219
+ // DIV
220
+ token = Token::DIV;
221
+ }
222
+ break;
223
+
224
+ case '&':
225
+ // && &= &
226
+ Advance();
227
+ if (c_ == '&') {
228
+ Advance();
229
+ token = Token::LOGICAL_AND;
230
+ } else if (c_ == '=') {
231
+ Advance();
232
+ token = Token::ASSIGN_BIT_AND;
233
+ } else {
234
+ token = Token::BIT_AND;
235
+ }
236
+ break;
237
+
238
+ case '|':
239
+ // || |= |
240
+ Advance();
241
+ if (c_ == '|') {
242
+ Advance();
243
+ token = Token::LOGICAL_OR;
244
+ } else if (c_ == '=') {
245
+ Advance();
246
+ token = Token::ASSIGN_BIT_OR;
247
+ } else {
248
+ token = Token::BIT_OR;
249
+ }
250
+ break;
251
+
252
+ case '^':
253
+ // ^
254
+ Advance();
255
+ token = Token::BIT_XOR;
256
+ break;
257
+
258
+ case '.':
259
+ // . Number
260
+ Advance();
261
+ if (Chars::IsDecimalDigit(c_)) {
262
+ // float number parse
263
+ token = ScanNumber(true);
264
+ } else {
265
+ token = Token::PERIOD;
266
+ }
267
+ break;
268
+
269
+ case ':':
270
+ Advance();
271
+ token = Token::COLON;
272
+ break;
273
+
274
+ case ';':
275
+ Advance();
276
+ token = Token::SEMICOLON;
277
+ break;
278
+
279
+ case ',':
280
+ Advance();
281
+ token = Token::COMMA;
282
+ break;
283
+
284
+ case '(':
285
+ Advance();
286
+ token = Token::LPAREN;
287
+ break;
288
+
289
+ case ')':
290
+ Advance();
291
+ token = Token::RPAREN;
292
+ break;
293
+
294
+ case '[':
295
+ Advance();
296
+ token = Token::LBRACK;
297
+ break;
298
+
299
+ case ']':
300
+ Advance();
301
+ token = Token::RBRACK;
302
+ break;
303
+
304
+ case '{':
305
+ Advance();
306
+ token = Token::LBRACE;
307
+ break;
308
+
309
+ case '}':
310
+ Advance();
311
+ token = Token::RBRACE;
312
+ break;
313
+
314
+ case '?':
315
+ Advance();
316
+ token = Token::CONDITIONAL;
317
+ break;
318
+
319
+ case '~':
320
+ Advance();
321
+ token = Token::BIT_NOT;
322
+ break;
323
+
324
+ case '#':
325
+ // #!
326
+ // skip shebang as single line comment
327
+ if (pos_ == 1) {
328
+ assert(line_number_ == 1);
329
+ Advance();
330
+ if (c_ == '!') {
331
+ // shebang
332
+ has_shebang_ = true;
333
+ token = SkipSingleLineComment();
334
+ break;
335
+ }
336
+ PushBack();
337
+ }
338
+
339
+ default:
340
+ if (Chars::IsIdentifierStart(c_)) {
341
+ token = ScanIdentifier(type);
342
+ } else if (Chars::IsDecimalDigit(c_)) {
343
+ token = ScanNumber(false);
344
+ } else if (Chars::IsLineTerminator(c_)) {
345
+ SkipLineTerminator();
346
+ has_line_terminator_before_next_ = true;
347
+ token = Token::NOT_FOUND;
348
+ } else if (c_ < 0) {
349
+ // EOS
350
+ token = Token::EOS;
351
+ } else {
352
+ token = Token::ILLEGAL;
353
+ }
354
+ break;
355
+ }
356
+ } while (token == Token::NOT_FOUND);
357
+ location_.end_position_ = pos();
358
+ return token;
359
+ }
360
+
361
+ inline const std::vector<uc16>& Buffer() const {
362
+ return buffer16_;
363
+ }
364
+
365
+ inline const std::vector<char>& Buffer8() const {
366
+ return buffer8_;
367
+ }
368
+
369
+ inline const double& Numeric() const {
370
+ return numeric_;
371
+ }
372
+
373
+ inline State NumericType() const {
374
+ assert(type_ == DECIMAL ||
375
+ type_ == HEX ||
376
+ type_ == OCTAL);
377
+ return type_;
378
+ }
379
+
380
+ inline State StringEscapeType() const {
381
+ assert(type_ == NONE ||
382
+ type_ == ESCAPE ||
383
+ type_ == OCTAL);
384
+ return type_;
385
+ }
386
+
387
+ inline bool has_line_terminator_before_next() const {
388
+ return has_line_terminator_before_next_;
389
+ }
390
+
391
+ std::size_t line_number() const {
392
+ return line_number_;
393
+ }
394
+
395
+ const std::string& filename() const {
396
+ return source_->filename();
397
+ }
398
+
399
+ std::size_t pos() const {
400
+ return pos_;
401
+ }
402
+
403
+ inline BasicSource* source() const {
404
+ return source_;
405
+ }
406
+
407
+ inline Location location() const {
408
+ return location_;
409
+ }
410
+
411
+ bool ScanRegExpLiteral(bool contains_eq) {
412
+ bool character = false;
413
+ buffer16_.clear();
414
+ if (contains_eq) {
415
+ Record16('=');
416
+ }
417
+ while (c_ != '/' || character) {
418
+ // invalid RegExp pattern
419
+ if (Chars::IsLineTerminator(c_) || c_ < 0) {
420
+ return false;
421
+ }
422
+ if (c_ == '\\') {
423
+ // escape
424
+ Record16Advance();
425
+ if (Chars::IsLineTerminator(c_) || c_ < 0) {
426
+ return false;
427
+ }
428
+ Record16Advance();
429
+ } else {
430
+ if (c_ == '[') {
431
+ character = true;
432
+ } else if (c_ == ']') {
433
+ character = false;
434
+ }
435
+ Record16Advance();
436
+ }
437
+ }
438
+ Advance();
439
+ return true;
440
+ }
441
+
442
+ bool ScanRegExpFlags() {
443
+ buffer16_.clear();
444
+ uc16 uc;
445
+ while (Chars::IsIdentifierPart(c_)) {
446
+ if (c_ == '\\') {
447
+ Advance();
448
+ if (c_ != 'u') {
449
+ return false;
450
+ }
451
+ Advance();
452
+ uc = ScanHexEscape('u', 4);
453
+ if (uc == '\\') {
454
+ return false;
455
+ }
456
+ Record16(uc);
457
+ } else {
458
+ Record16Advance();
459
+ }
460
+ }
461
+ return true;
462
+ }
463
+
464
+ private:
465
// Initial sizing hint the constructor applies to buffer8_ and buffer16_.
+ static const std::size_t kInitialReadBufferCapacity = 32;
466
+
467
+ void Initialize() {
468
+ Advance();
469
+ }
470
+
471
+ inline void Advance() {
472
+ if (pos_ == end_) {
473
+ c_ = -1;
474
+ } else {
475
+ c_ = source_->Get(pos_++);
476
+ }
477
+ }
478
+ inline void Record8() {
479
+ buffer8_.push_back(static_cast<char>(c_));
480
+ }
481
+ inline void Record8(const int ch) {
482
+ buffer8_.push_back(static_cast<char>(ch));
483
+ }
484
+ inline void Record16() { buffer16_.push_back(c_); }
485
+ inline void Record16(const int ch) { buffer16_.push_back(ch); }
486
+ inline void Record8Advance() {
487
+ Record8();
488
+ Advance();
489
+ }
490
+ inline void Record16Advance() {
491
+ Record16();
492
+ Advance();
493
+ }
494
+
495
+ void PushBack() {
496
+ if (pos_ < 2) {
497
+ c_ = -1;
498
+ } else {
499
+ c_ = source_->Get(pos_-2);
500
+ --pos_;
501
+ }
502
+ }
503
+
504
+ inline Token::Type IsMatch(char const * keyword,
505
+ std::size_t len,
506
+ Token::Type guess, bool strict) const {
507
+ if (!strict) {
508
+ return Token::IDENTIFIER;
509
+ }
510
+ std::vector<uc16>::const_iterator it = buffer16_.begin();
511
+ do {
512
+ if (*it++ != *keyword++) {
513
+ return Token::IDENTIFIER;
514
+ }
515
+ } while (--len);
516
+ return guess;
517
+ }
518
+
519
+ inline Token::Type IsMatch(char const * keyword,
520
+ std::size_t len,
521
+ Token::Type guess) const {
522
+ std::vector<uc16>::const_iterator it = buffer16_.begin();
523
+ do {
524
+ if (*it++ != *keyword++) {
525
+ return Token::IDENTIFIER;
526
+ }
527
+ } while (--len);
528
+ return guess;
529
+ }
530
+
531
+ Token::Type SkipSingleLineComment() {
532
+ Advance();
533
+ // see ECMA-262 section 7.4
534
+ while (c_ >= 0 && !Chars::IsLineTerminator(c_)) {
535
+ Advance();
536
+ }
537
+ return Token::NOT_FOUND;
538
+ }
539
+
540
+ Token::Type SkipMultiLineComment() {
541
+ Advance();
542
+ // remember previous ch
543
+ uc16 ch;
544
+ while (c_ >= 0) {
545
+ ch = c_;
546
+ Advance();
547
+ if (ch == '*' && c_ == '/') {
548
+ c_ = ' ';
549
+ return Token::NOT_FOUND;
550
+ } else if (Chars::IsLineTerminator(c_)) {
551
+ // see ECMA-262 section 7.4
552
+ SkipLineTerminator();
553
+ has_line_terminator_before_next_ = true;
554
+ ch = '\n';
555
+ }
556
+ }
557
+ return Token::ILLEGAL;
558
+ }
559
+
560
+ Token::Type ScanHtmlComment() {
561
+ Advance();
562
+ if (c_ == '-') {
563
+ // <!-
564
+ Advance();
565
+ if (c_ == '-') {
566
+ // <!--
567
+ return SkipSingleLineComment();
568
+ }
569
+ PushBack();
570
+ }
571
+ // <! is LT and NOT
572
+ PushBack();
573
+ return Token::LT;
574
+ }
575
+
576
+ Token::Type ScanMagicComment() {
577
+ Advance();
578
+ // see ECMA-262 section 7.4
579
+ while (c_ >= 0 && !Chars::IsLineTerminator(c_)) {
580
+ Advance();
581
+ }
582
+ return Token::NOT_FOUND;
583
+ }
584
+
585
+ Token::Type ScanIdentifier(int type) {
586
+ Token::Type token = Token::IDENTIFIER;
587
+ uc16 uc;
588
+
589
+ buffer16_.clear();
590
+
591
+ if (c_ == '\\') {
592
+ Advance();
593
+ if (c_ != 'u') {
594
+ return Token::ILLEGAL;
595
+ }
596
+ Advance();
597
+ uc = ScanHexEscape('u', 4);
598
+ if (uc == '\\' || !Chars::IsIdentifierStart(uc)) {
599
+ return Token::ILLEGAL;
600
+ }
601
+ Record16(uc);
602
+ } else {
603
+ Record16Advance();
604
+ }
605
+
606
+ while (Chars::IsIdentifierPart(c_)) {
607
+ if (c_ == '\\') {
608
+ Advance();
609
+ if (c_ != 'u') {
610
+ return Token::ILLEGAL;
611
+ }
612
+ Advance();
613
+ uc = ScanHexEscape('u', 4);
614
+ if (uc == '\\' || !Chars::IsIdentifierPart(uc)) {
615
+ return Token::ILLEGAL;
616
+ }
617
+ Record16(uc);
618
+ } else {
619
+ Record16Advance();
620
+ }
621
+ }
622
+
623
+ if (type & kIdentifyReservedWords) {
624
+ token = DetectKeyword(type & kStrict);
625
+ } else if (type & kIgnoreReservedWordsAndIdentifyGetterOrSetter) {
626
+ token = DetectGetOrSet();
627
+ }
628
+
629
+ return token;
630
+ }
631
+
632
+ // detect which Identifier is Keyword, FutureReservedWord or not
633
+ // Keyword and FutureReservedWord are defined in ECMA-262 5th.
634
+ //
635
+ // Some words such as :
636
+ // int, short, boolean, byte, long, char, float, double, abstract, volatile,
637
+ // transient, final, throws, goto, native, synchronized
638
+ // were defined as FutureReservedWord in ECMA-262 3rd, but not in 5th.
639
+ // So, DetectKeyword interprets them as Identifier.
640
+ Token::Type DetectKeyword(bool strict) const {
641
+ const std::size_t len = buffer16_.size();
642
+ Token::Type token = Token::IDENTIFIER;
643
+ switch (len) {
644
+ case 2:
645
+ // if in do
646
+ if (buffer16_[0] == 'i') {
647
+ if (buffer16_[1] == 'f') {
648
+ token = Token::IF;
649
+ } else if (buffer16_[1] == 'n') {
650
+ token = Token::IN;
651
+ }
652
+ } else if (buffer16_[0] == 'd' && buffer16_[1] == 'o') {
653
+ // do
654
+ token = Token::DO;
655
+ }
656
+ break;
657
+ case 3:
658
+ // for var int new try let
659
+ switch (buffer16_[2]) {
660
+ case 't':
661
+ if (buffer16_[0] == 'l' && buffer16_[1] == 'e' && strict) {
662
+ // let
663
+ token = Token::LET;
664
+ } else if (buffer16_[0] == 'i' && buffer16_[1] == 'n') {
665
+ // int (removed)
666
+ // token = Token::INT;
667
+ }
668
+ break;
669
+ case 'r':
670
+ // for var
671
+ if (buffer16_[0] == 'f' && buffer16_[1] == 'o') {
672
+ // for
673
+ token = Token::FOR;
674
+ } else if (buffer16_[0] == 'v' && buffer16_[1] == 'a') {
675
+ // var
676
+ token = Token::VAR;
677
+ }
678
+ break;
679
+ case 'y':
680
+ // try
681
+ if (buffer16_[0] == 't' && buffer16_[1] == 'r') {
682
+ token = Token::TRY;
683
+ }
684
+ break;
685
+ case 'w':
686
+ // new
687
+ if (buffer16_[0] == 'n' && buffer16_[1] == 'e') {
688
+ token = Token::NEW;
689
+ }
690
+ break;
691
+ }
692
+ break;
693
+ case 4:
694
+ // else case true byte null this
695
+ // void with long enum char goto
696
+ // number 3 character is most duplicated
697
+ switch (buffer16_[3]) {
698
+ case 'e':
699
+ // else case true byte
700
+ if (buffer16_[2] == 's') {
701
+ if (buffer16_[0] == 'e' && buffer16_[1] == 'l') {
702
+ // else
703
+ token = Token::ELSE;
704
+ } else if (buffer16_[0] == 'c' && buffer16_[1] == 'a') {
705
+ // case
706
+ token = Token::CASE;
707
+ }
708
+ } else if (buffer16_[0] == 't' &&
709
+ buffer16_[1] == 'r' && buffer16_[2] == 'u') {
710
+ // true
711
+ token = Token::TRUE_LITERAL;
712
+ } else if (buffer16_[0] == 'b' &&
713
+ buffer16_[1] == 'y' && buffer16_[2] == 't') {
714
+ // byte (removed)
715
+ // token = Token::BYTE;
716
+ }
717
+ break;
718
+ case 'l':
719
+ // null
720
+ if (buffer16_[0] == 'n' &&
721
+ buffer16_[1] == 'u' && buffer16_[2] == 'l') {
722
+ token = Token::NULL_LITERAL;
723
+ }
724
+ break;
725
+ case 's':
726
+ // this
727
+ if (buffer16_[0] == 't' &&
728
+ buffer16_[1] == 'h' && buffer16_[2] == 'i') {
729
+ token = Token::THIS;
730
+ }
731
+ break;
732
+ case 'd':
733
+ // void
734
+ if (buffer16_[0] == 'v' &&
735
+ buffer16_[1] == 'o' && buffer16_[2] == 'i') {
736
+ token = Token::VOID;
737
+ }
738
+ break;
739
+ case 'h':
740
+ // with
741
+ if (buffer16_[0] == 'w' &&
742
+ buffer16_[1] == 'i' && buffer16_[2] == 't') {
743
+ token = Token::WITH;
744
+ }
745
+ break;
746
+ case 'g':
747
+ // long (removed)
748
+ if (buffer16_[0] == 'l' &&
749
+ buffer16_[1] == 'o' && buffer16_[2] == 'n') {
750
+ // token = Token::LONG;
751
+ }
752
+ break;
753
+ case 'm':
754
+ // enum
755
+ if (buffer16_[0] == 'e' &&
756
+ buffer16_[1] == 'n' && buffer16_[2] == 'u') {
757
+ token = Token::ENUM;
758
+ }
759
+ break;
760
+ case 'r':
761
+ // char (removed)
762
+ if (buffer16_[0] == 'c' &&
763
+ buffer16_[1] == 'h' && buffer16_[2] == 'a') {
764
+ // token = Token::CHAR;
765
+ }
766
+ break;
767
+ case 'o':
768
+ // goto (removed)
769
+ if (buffer16_[0] == 'g' &&
770
+ buffer16_[1] == 'o' && buffer16_[2] == 't') {
771
+ // token = Token::GOTO;
772
+ }
773
+ break;
774
+ }
775
+ break;
776
+ case 5:
777
+ // break final float catch super while
778
+ // throw short class const false yield
779
+ // number 3 character is most duplicated
780
+ switch (buffer16_[3]) {
781
+ case 'a':
782
+ // break final float
783
+ if (buffer16_[0] == 'b' && buffer16_[1] == 'r' &&
784
+ buffer16_[2] == 'e' && buffer16_[4] == 'k') {
785
+ // break
786
+ token = Token::BREAK;
787
+ } else if (buffer16_[0] == 'f') {
788
+ if (buffer16_[1] == 'i' &&
789
+ buffer16_[2] == 'n' && buffer16_[4] == 'l') {
790
+ // final (removed)
791
+ // token = Token::FINAL;
792
+ } else if (buffer16_[1] == 'l' &&
793
+ buffer16_[2] == 'o' && buffer16_[4] == 't') {
794
+ // float (removed)
795
+ // token = Token::FLOAT;
796
+ }
797
+ }
798
+ break;
799
+ case 'c':
800
+ if (buffer16_[0] == 'c' && buffer16_[1] == 'a' &&
801
+ buffer16_[2] == 't' && buffer16_[4] == 'h') {
802
+ // catch
803
+ token = Token::CATCH;
804
+ }
805
+ break;
806
+ case 'e':
807
+ if (buffer16_[0] == 's' && buffer16_[1] == 'u' &&
808
+ buffer16_[2] == 'p' && buffer16_[4] == 'r') {
809
+ // super
810
+ token = Token::SUPER;
811
+ }
812
+ break;
813
+ case 'l':
814
+ if (buffer16_[0] == 'w' && buffer16_[1] == 'h' &&
815
+ buffer16_[2] == 'i' && buffer16_[4] == 'e') {
816
+ // while
817
+ token = Token::WHILE;
818
+ } else if (strict &&
819
+ buffer16_[0] == 'y' && buffer16_[1] == 'i' &&
820
+ buffer16_[2] == 'e' && buffer16_[4] == 'd') {
821
+ // yield
822
+ token = Token::YIELD;
823
+ }
824
+ break;
825
+ case 'o':
826
+ if (buffer16_[0] == 't' && buffer16_[1] == 'h' &&
827
+ buffer16_[2] == 'r' && buffer16_[4] == 'w') {
828
+ // throw
829
+ token = Token::THROW;
830
+ }
831
+ break;
832
+ case 'r':
833
+ if (buffer16_[0] == 's' && buffer16_[1] == 'h' &&
834
+ buffer16_[2] == 'o' && buffer16_[4] == 't') {
835
+ // short (removed)
836
+ // token = Token::SHORT;
837
+ }
838
+ break;
839
+ case 's':
840
+ // class const false
841
+ if (buffer16_[0] == 'c') {
842
+ if (buffer16_[1] == 'l' &&
843
+ buffer16_[2] == 'a' && buffer16_[4] == 's') {
844
+ // class
845
+ token = Token::CLASS;
846
+ } else if (buffer16_[1] == 'o' &&
847
+ buffer16_[2] == 'n' && buffer16_[4] == 't') {
848
+ // const
849
+ token = Token::CONST;
850
+ }
851
+ } else if (buffer16_[0] == 'f' && buffer16_[1] == 'a' &&
852
+ buffer16_[2] == 'l' && buffer16_[4] == 'e') {
853
+ // false
854
+ token = Token::FALSE_LITERAL;
855
+ }
856
+ break;
857
+ }
858
+ break;
859
+ case 6:
860
+ // double delete export import native
861
+ // public return static switch typeof throws
862
+ // number 0 character is most duplicated
863
+ switch (buffer16_[0]) {
864
+ case 'd':
865
+ // double delete
866
+ if (buffer16_[5] == 'e' &&
867
+ buffer16_[4] == 'l' && buffer16_[3] == 'b' &&
868
+ buffer16_[2] == 'u' && buffer16_[1] == 'o') {
869
+ // double
870
+ // token = Token::DOUBLE;
871
+ } else if (buffer16_[5] == 'e' &&
872
+ buffer16_[4] == 't' && buffer16_[3] == 'e' &&
873
+ buffer16_[2] == 'l' && buffer16_[1] == 'e') {
874
+ // delete
875
+ token = Token::DELETE;
876
+ }
877
+ break;
878
+ case 'e':
879
+ // export
880
+ token = IsMatch("export", len, Token::EXPORT);
881
+ break;
882
+ case 'i':
883
+ // import
884
+ token = IsMatch("import", len, Token::IMPORT);
885
+ break;
886
+ case 'n':
887
+ // native (removed)
888
+ // token = IsMatch("native", len, Token::NATIVE);
889
+ break;
890
+ case 'p':
891
+ // public
892
+ token = IsMatch("public", len, Token::PUBLIC, strict);
893
+ break;
894
+ case 'r':
895
+ // return
896
+ token = IsMatch("return", len, Token::RETURN);
897
+ break;
898
+ case 's':
899
+ // switch static
900
+ if (buffer16_[1] == 'w' &&
901
+ buffer16_[2] == 'i' && buffer16_[3] == 't' &&
902
+ buffer16_[4] == 'c' && buffer16_[5] == 'h') {
903
+ // switch
904
+ token = Token::SWITCH;
905
+ } else if (strict &&
906
+ buffer16_[1] == 't' &&
907
+ buffer16_[2] == 'a' && buffer16_[3] == 't' &&
908
+ buffer16_[4] == 'i' && buffer16_[5] == 'c') {
909
+ // static
910
+ token = Token::STATIC;
911
+ }
912
+ break;
913
+ case 't':
914
+ // typeof throws
915
+ if (buffer16_[5] == 'f' &&
916
+ buffer16_[4] == 'o' && buffer16_[3] == 'e' &&
917
+ buffer16_[2] == 'p' && buffer16_[1] == 'y') {
918
+ // typeof
919
+ token = Token::TYPEOF;
920
+ } else if (buffer16_[5] == 's' &&
921
+ buffer16_[4] == 'w' && buffer16_[3] == 'o' &&
922
+ buffer16_[2] == 'r' && buffer16_[1] == 'h') {
923
+ // throws (removed)
924
+ // token = Token::THROWS;
925
+ }
926
+ break;
927
+ }
928
+ break;
929
+ case 7:
930
+ // boolean default extends finally package private
931
+ // number 0 character is most duplicated
932
+ switch (buffer16_[0]) {
933
+ case 'b':
934
+ // boolean (removed)
935
+ // token = IsMatch("boolean", len, Token::BOOLEAN);
936
+ break;
937
+ case 'd':
938
+ token = IsMatch("default", len, Token::DEFAULT);
939
+ break;
940
+ case 'e':
941
+ token = IsMatch("extends", len, Token::EXTENDS);
942
+ break;
943
+ case 'f':
944
+ token = IsMatch("finally", len, Token::FINALLY);
945
+ break;
946
+ case 'p':
947
+ if (buffer16_[1] == 'a') {
948
+ token = IsMatch("package", len, Token::PACKAGE, strict);
949
+ } else if (buffer16_[1] == 'r') {
950
+ token = IsMatch("private", len, Token::PRIVATE, strict);
951
+ }
952
+ break;
953
+ }
954
+ break;
955
+ case 8:
956
+ // debugger continue abstract volatile function
957
+ // number 4 character is most duplicated
958
+ switch (buffer16_[4]) {
959
+ case 'g':
960
+ token = IsMatch("debugger", len, Token::DEBUGGER);
961
+ break;
962
+ case 'i':
963
+ token = IsMatch("continue", len, Token::CONTINUE);
964
+ break;
965
+ case 'r':
966
+ // abstract (removed)
967
+ // token = IsMatch("abstract", len, Token::ABSTRACT);
968
+ break;
969
+ case 't':
970
+ if (buffer16_[1] == 'o') {
971
+ // token = IsMatch("volatile", len, Token::VOLATILE);
972
+ } else if (buffer16_[1] == 'u') {
973
+ token = IsMatch("function", len, Token::FUNCTION);
974
+ }
975
+ break;
976
+ }
977
+ break;
978
+ case 9:
979
+ // interface protected transient
980
+ if (buffer16_[1] == 'n') {
981
+ token = IsMatch("interface", len, Token::INTERFACE, strict);
982
+ } else if (buffer16_[1] == 'r') {
983
+ if (buffer16_[0] == 'p') {
984
+ token = IsMatch("protected", len, Token::PROTECTED, strict);
985
+ } else if (buffer16_[0] == 't') {
986
+ // transient (removed)
987
+ // token = IsMatch("transient", len, Token::TRANSIENT);
988
+ }
989
+ }
990
+ break;
991
+ case 10:
992
+ // instanceof implements
993
+ if (buffer16_[1] == 'n') {
994
+ token = IsMatch("instanceof", len, Token::INSTANCEOF);
995
+ } else if (buffer16_[1] == 'm') {
996
+ token = IsMatch("implements", len, Token::IMPLEMENTS, strict);
997
+ }
998
+ break;
999
+ case 12:
1000
+ // synchronized (removed)
1001
+ // token = IsMatch("synchronized", len, Token::SYNCHRONIZED);
1002
+ token = Token::IDENTIFIER;
1003
+ break;
1004
+ }
1005
+ return token;
1006
+ }
1007
+
1008
+ Token::Type DetectGetOrSet() const {
1009
+ if (buffer16_.size() == 3) {
1010
+ if (buffer16_[1] == 'e' && buffer16_[2] == 't') {
1011
+ if (buffer16_[0] == 'g') {
1012
+ return Token::GET;
1013
+ } else if (buffer16_[0] == 's') {
1014
+ return Token::SET;
1015
+ }
1016
+ }
1017
+ }
1018
+ return Token::IDENTIFIER;
1019
+ }
1020
+
1021
+ Token::Type ScanString() {
1022
+ type_ = NONE;
1023
+ const uc16 quote = c_;
1024
+ buffer16_.clear();
1025
+ Advance();
1026
+ while (c_ != quote && c_ >= 0 && !Chars::IsLineTerminator(c_)) {
1027
+ if (c_ == '\\') {
1028
+ Advance();
1029
+ // escape sequence
1030
+ if (c_ < 0) return Token::ILLEGAL;
1031
+ if (type_ == NONE) {
1032
+ type_ = ESCAPE;
1033
+ }
1034
+ ScanEscape();
1035
+ } else {
1036
+ Record16Advance();
1037
+ }
1038
+ }
1039
+ if (c_ != quote) {
1040
+ // not closed
1041
+ return Token::ILLEGAL;
1042
+ }
1043
+ Advance();
1044
+
1045
+ return Token::STRING;
1046
+ }
1047
+
1048
+ void ScanEscape() {
1049
+ if (Chars::IsLineTerminator(c_)) {
1050
+ SkipLineTerminator();
1051
+ return;
1052
+ }
1053
+ switch (c_) {
1054
+ case '\'':
1055
+ case '"' :
1056
+ case '\\':
1057
+ Record16Advance();
1058
+ break;
1059
+ case 'b' :
1060
+ Record16('\b');
1061
+ Advance();
1062
+ break;
1063
+ case 'f' :
1064
+ Record16('\f');
1065
+ Advance();
1066
+ break;
1067
+ case 'n' :
1068
+ Record16('\n');
1069
+ Advance();
1070
+ break;
1071
+ case 'r' :
1072
+ Record16('\r');
1073
+ Advance();
1074
+ break;
1075
+ case 't' :
1076
+ Record16('\t');
1077
+ Advance();
1078
+ break;
1079
+ case 'u' :
1080
+ Advance();
1081
+ Record16(ScanHexEscape('u', 4));
1082
+ break;
1083
+ case 'v' :
1084
+ Record16('\v');
1085
+ Advance();
1086
+ break;
1087
+ case 'x' :
1088
+ Advance();
1089
+ Record16(ScanHexEscape('x', 2));
1090
+ break;
1091
+ case '0' :
1092
+ case '1' :
1093
+ case '2' :
1094
+ case '3' :
1095
+ case '4' :
1096
+ case '5' :
1097
+ case '6' :
1098
+ case '7' :
1099
+ if (type_ != OCTAL) {
1100
+ type_ = OCTAL;
1101
+ }
1102
+ Record16(ScanOctalEscape());
1103
+ break;
1104
+
1105
+ default:
1106
+ Record16Advance();
1107
+ break;
1108
+ }
1109
+ }
1110
+
1111
+ Token::Type ScanNumber(const bool period) {
1112
+ buffer8_.clear();
1113
+ State type = DECIMAL;
1114
+ if (period) {
1115
+ Record8('0');
1116
+ Record8('.');
1117
+ ScanDecimalDigits();
1118
+ } else {
1119
+ if (c_ == '0') {
1120
+ // 0x (hex) or 0 (octal)
1121
+ Record8Advance();
1122
+ if (c_ == 'x' || c_ == 'X') {
1123
+ // 0x (hex)
1124
+ type = HEX;
1125
+ Record8Advance();
1126
+ if (!Chars::IsHexDigit(c_)) {
1127
+ return Token::ILLEGAL;
1128
+ }
1129
+ while (Chars::IsHexDigit(c_)) {
1130
+ Record8Advance();
1131
+ }
1132
+ } else if (Chars::IsOctalDigit(c_)) {
1133
+ // 0 (octal)
1134
+ // octal number cannot convert with strtod
1135
+ type = OCTAL;
1136
+ Record8Advance();
1137
+ while (true) {
1138
+ if (c_ == '8' || c_ == '9') {
1139
+ // not octal digits
1140
+ type = DECIMAL;
1141
+ break;
1142
+ }
1143
+ if (c_ < '0' || '7' < c_) {
1144
+ break;
1145
+ }
1146
+ Record8Advance();
1147
+ }
1148
+ }
1149
+ }
1150
+ if (type == DECIMAL) {
1151
+ ScanDecimalDigits();
1152
+ if (c_ == '.') {
1153
+ Record8Advance();
1154
+ ScanDecimalDigits();
1155
+ }
1156
+ }
1157
+ }
1158
+
1159
+ // exponent part
1160
+ if (c_ == 'e' || c_ == 'E') {
1161
+ if (type != DECIMAL) {
1162
+ return Token::ILLEGAL;
1163
+ }
1164
+ Record8Advance();
1165
+ if (c_ == '+' || c_ == '-') {
1166
+ Record8Advance();
1167
+ }
1168
+ // more than 1 decimal digit required
1169
+ if (!Chars::IsDecimalDigit(c_)) {
1170
+ return Token::ILLEGAL;
1171
+ }
1172
+ ScanDecimalDigits();
1173
+ }
1174
+
1175
+ // see ECMA-262 section 7.8.3
1176
+ // "immediately following a NumericLiteral must not be an IdentifierStart or
1177
+ // DecimalDigit."
1178
+ if (Chars::IsDecimalDigit(c_) || Chars::IsIdentifierStart(c_)) {
1179
+ return Token::ILLEGAL;
1180
+ }
1181
+
1182
+ if (type == OCTAL) {
1183
+ double val = 0;
1184
+ for (std::vector<char>::const_iterator it = buffer8_.begin(),
1185
+ last = buffer8_.end(); it != last; ++it) {
1186
+ val = val * 8 + (*it - '0');
1187
+ }
1188
+ numeric_ = val;
1189
+ } else {
1190
+ Record8('\0'); // Null Terminated String
1191
+ numeric_ = std::strtod(buffer8_.data(), NULL);
1192
+ }
1193
+ type_ = type;
1194
+ return Token::NUMBER;
1195
+ }
1196
+
1197
+ uc16 ScanOctalEscape() {
1198
+ uc16 res = 0;
1199
+ for (int i = 0; i < 3; ++i) {
1200
+ const int d = OctalValue(c_);
1201
+ if (d < 0) {
1202
+ break;
1203
+ }
1204
+ const int t = res * 8 + d;
1205
+ if (t > 255) {
1206
+ break;
1207
+ }
1208
+ res = t;
1209
+ Advance();
1210
+ }
1211
+ return res;
1212
+ }
1213
+
1214
+ uc16 ScanHexEscape(uc16 c, int len) {
1215
+ uc16 res = 0;
1216
+ for (int i = 0; i < len; ++i) {
1217
+ const int d = HexValue(c_);
1218
+ if (d < 0) {
1219
+ for (int j = i - 1; j >= 0; --j) {
1220
+ PushBack();
1221
+ }
1222
+ return c;
1223
+ }
1224
+ res = res * 16 + d;
1225
+ Advance();
1226
+ }
1227
+ return res;
1228
+ }
1229
+
1230
// Returns the numeric value of the octal digit |c| ('0'..'7'), or -1 when
// |c| is not an octal digit. '8' is rejected: accepting it made an escape
// such as "\18" decode to a single character with code 16.
// Declared static because it reads no lexer state.
static inline int OctalValue(const int c) {
  if ('0' <= c && c <= '7') {
    return c - '0';
  }
  return -1;
}
1236
+
1237
// Returns the numeric value (0..15) of the hexadecimal digit |c|, accepting
// both upper- and lower-case letters, or -1 when |c| is not a hex digit
// (including the -1 end-of-input marker).
// Declared static because it reads no lexer state.
static inline int HexValue(const int c) {
  if (c >= '0' && c <= '9') {
    return c - '0';
  }
  if (c >= 'a' && c <= 'f') {
    return 10 + (c - 'a');
  }
  if (c >= 'A' && c <= 'F') {
    return 10 + (c - 'A');
  }
  return -1;
}
1249
+
1250
+ void ScanDecimalDigits() {
1251
+ while (Chars::IsDecimalDigit(c_)) {
1252
+ Record8Advance();
1253
+ }
1254
+ }
1255
+
1256
+ void SkipLineTerminator() {
1257
+ const uc16 c = c_;
1258
+ Advance();
1259
+ if (c + c_ == '\n' + '\r') {
1260
+ Advance();
1261
+ }
1262
+ ++line_number_;
1263
+ }
1264
+
1265
// Character source being lexed; read through Get(), size() and filename().
+ BasicSource* source_;
1266
// Narrow scratch buffer; ScanNumber() stores the literal's characters here.
+ std::vector<char> buffer8_;
1267
// Wide scratch buffer for identifiers, string values and regexp text.
+ std::vector<uc16> buffer16_;
1268
// Value of the most recently scanned numeric literal.
+ double numeric_;
1269
// Radix of the last number, or escape kind of the last string.
+ State type_;
1270
// Read cursor into source_; Advance() fetches Get(pos_) and increments it.
+ std::size_t pos_;
1271
// source_->size(), captured at construction.
+ const std::size_t end_;
1272
// Set when a line terminator was crossed before the token Next() returned.
+ bool has_line_terminator_before_next_;
1273
// Set once a leading "#!" line has been seen.
+ bool has_shebang_;
1274
// Current lookahead character, or -1 at end of input.
+ int c_;
1275
// Line counter; starts at 1 and is bumped by SkipLineTerminator().
+ std::size_t line_number_;
1276
// begin/end positions recorded by Next().
+ Location location_;
1277
+ };
1278
+
1279
+
1280
+ } } // namespace iv::core
1281
+ #endif // _IV_LEXER_H_