iv-phonic 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. data/.autotest +24 -0
  2. data/Manifest.txt +49 -0
  3. data/README.rdoc +32 -0
  4. data/Rakefile +54 -0
  5. data/ext/include/iv/algorithm.h +23 -0
  6. data/ext/include/iv/alloc.h +200 -0
  7. data/ext/include/iv/any.h +71 -0
  8. data/ext/include/iv/ast-factory.h +277 -0
  9. data/ext/include/iv/ast-fwd.h +92 -0
  10. data/ext/include/iv/ast-serializer.h +579 -0
  11. data/ext/include/iv/ast-visitor.h +121 -0
  12. data/ext/include/iv/ast.h +1127 -0
  13. data/ext/include/iv/chars.h +83 -0
  14. data/ext/include/iv/cmdline.h +830 -0
  15. data/ext/include/iv/conversions.h +308 -0
  16. data/ext/include/iv/dtoa.h +20 -0
  17. data/ext/include/iv/enable_if.h +18 -0
  18. data/ext/include/iv/errors.h +15 -0
  19. data/ext/include/iv/fixedcontainer.h +42 -0
  20. data/ext/include/iv/functor.h +29 -0
  21. data/ext/include/iv/lexer.h +1281 -0
  22. data/ext/include/iv/location.h +23 -0
  23. data/ext/include/iv/mt19937.h +175 -0
  24. data/ext/include/iv/noncopyable.h +30 -0
  25. data/ext/include/iv/none.h +10 -0
  26. data/ext/include/iv/parser.h +2150 -0
  27. data/ext/include/iv/source.h +27 -0
  28. data/ext/include/iv/space.h +178 -0
  29. data/ext/include/iv/static_assert.h +30 -0
  30. data/ext/include/iv/stringpiece.h +385 -0
  31. data/ext/include/iv/token.h +311 -0
  32. data/ext/include/iv/ucdata.h +58 -0
  33. data/ext/include/iv/uchar.h +8 -0
  34. data/ext/include/iv/ustring.h +28 -0
  35. data/ext/include/iv/ustringpiece.h +9 -0
  36. data/ext/include/iv/utils.h +83 -0
  37. data/ext/include/iv/xorshift.h +74 -0
  38. data/ext/iv/phonic/ast-fwd.h +21 -0
  39. data/ext/iv/phonic/ast.h +10 -0
  40. data/ext/iv/phonic/creator.h +530 -0
  41. data/ext/iv/phonic/encoding.h +110 -0
  42. data/ext/iv/phonic/extconf.rb +5 -0
  43. data/ext/iv/phonic/factory.h +247 -0
  44. data/ext/iv/phonic/parser.h +12 -0
  45. data/ext/iv/phonic/phonic.cc +69 -0
  46. data/ext/iv/phonic/rnode.h +15 -0
  47. data/ext/iv/phonic/rparser.h +48 -0
  48. data/ext/iv/phonic/source.h +146 -0
  49. data/test/test_iv_phonic.rb +32 -0
  50. metadata +159 -0
@@ -0,0 +1,1281 @@
1
+ #ifndef _IV_LEXER_H_
2
+ #define _IV_LEXER_H_
3
+
4
+ #include <cstddef>
5
+ #include <cassert>
6
+ #include <cstdlib>
7
+ #include <vector>
8
+ #include <string>
9
+ #include "uchar.h"
10
+ #include "chars.h"
11
+ #include "token.h"
12
+ #include "source.h"
13
+ #include "location.h"
14
+ #include "noncopyable.h"
15
+
16
+ namespace iv {
17
+ namespace core {
18
+
19
+ class Lexer: private Noncopyable<Lexer>::type {
20
+ public:
21
+ enum LexType {  // Bit flags combined into the |type| argument of Next().
22
+ kClear = 0,  // no flag set: identifiers are returned as Token::IDENTIFIER
23
+ kIdentifyReservedWords = 1,  // ScanIdentifier() classifies the word with DetectKeyword()
24
+ kIgnoreReservedWords = 2,  // never tested inside this class; words stay Token::IDENTIFIER
25
+ kIgnoreReservedWordsAndIdentifyGetterOrSetter = 4,  // ScanIdentifier() classifies the word with DetectGetOrSet()
26
+ kStrict = 8  // forwarded to DetectKeyword() as its |strict| argument
27
+ };
28
+ enum State {  // Kind of the literal most recently scanned by ScanString() or ScanNumber().
29
+ NONE,  // string literal without any escape sequence
30
+ ESCAPE,  // string literal with at least one escape sequence, none of them octal
31
+ DECIMAL,  // decimal numeric literal
32
+ HEX,  // numeric literal introduced by 0x / 0X
33
+ OCTAL  // octal numeric literal, or string literal containing an octal escape
34
+ };
35
+
36
+ explicit Lexer(BasicSource* src)
37
+ : source_(src),
38
+ buffer8_(kInitialReadBufferCapacity),
39
+ buffer16_(kInitialReadBufferCapacity),
40
+ pos_(0),
41
+ end_(source_->size()),
42
+ has_line_terminator_before_next_(false),
43
+ has_shebang_(false),
44
+ line_number_(1),
45
+ location_() {
46
+ Initialize();
47
+ }
48
+
49
+ Token::Type Next(int type) {
50
+ Token::Type token;
51
+ has_line_terminator_before_next_ = false;
52
+ do {
53
+ location_.begin_position_ = pos();
54
+ while (Chars::IsWhiteSpace(c_)) {
55
+ // white space
56
+ Advance();
57
+ }
58
+ switch (c_) {
59
+ case '"':
60
+ case '\'':
61
+ // string literal
62
+ token = ScanString();
63
+ break;
64
+
65
+ case '<':
66
+ // < <= << <<= <!--
67
+ Advance();
68
+ if (c_ == '=') {
69
+ Advance();
70
+ token = Token::LTE;
71
+ } else if (c_ == '<') {
72
+ Advance();
73
+ if (c_ == '=') {
74
+ Advance();
75
+ token = Token::ASSIGN_SHL;
76
+ } else {
77
+ token = Token::SHL;
78
+ }
79
+ } else if (c_ == '!') {
80
+ token = ScanHtmlComment();
81
+ } else {
82
+ token = Token::LT;
83
+ }
84
+ break;
85
+
86
+ case '>':
87
+ // > >= >> >>= >>> >>>=
88
+ Advance();
89
+ if (c_ == '=') {
90
+ Advance();
91
+ token = Token::GTE;
92
+ } else if (c_ == '>') {
93
+ Advance();
94
+ if (c_ == '=') {
95
+ Advance();
96
+ token = Token::ASSIGN_SAR;
97
+ } else if (c_ == '>') {
98
+ Advance();
99
+ if (c_ == '=') {
100
+ Advance();
101
+ token = Token::ASSIGN_SHR;
102
+ } else {
103
+ token = Token::SHR;
104
+ }
105
+ } else {
106
+ token = Token::SAR;
107
+ }
108
+ } else {
109
+ token = Token::GT;
110
+ }
111
+ break;
112
+
113
+ case '=':
114
+ // = == ===
115
+ Advance();
116
+ if (c_ == '=') {
117
+ Advance();
118
+ if (c_ == '=') {
119
+ Advance();
120
+ token = Token::EQ_STRICT;
121
+ } else {
122
+ token = Token::EQ;
123
+ }
124
+ } else {
125
+ token = Token::ASSIGN;
126
+ }
127
+ break;
128
+
129
+ case '!':
130
+ // ! != !==
131
+ Advance();
132
+ if (c_ == '=') {
133
+ Advance();
134
+ if (c_ == '=') {
135
+ Advance();
136
+ token = Token::NE_STRICT;
137
+ } else {
138
+ token = Token::NE;
139
+ }
140
+ } else {
141
+ token = Token::NOT;
142
+ }
143
+ break;
144
+
145
+ case '+':
146
+ // + ++ +=
147
+ Advance();
148
+ if (c_ == '+') {
149
+ Advance();
150
+ token = Token::INC;
151
+ } else if (c_ == '=') {
152
+ Advance();
153
+ token = Token::ASSIGN_ADD;
154
+ } else {
155
+ token = Token::ADD;
156
+ }
157
+ break;
158
+
159
+ case '-':
160
+ // - -- --> -=
161
+ Advance();
162
+ if (c_ == '-') {
163
+ Advance();
164
+ if (c_ == '>' && has_line_terminator_before_next_) {
165
+ token = SkipSingleLineComment();
166
+ } else {
167
+ token = Token::DEC;
168
+ }
169
+ } else if (c_ == '=') {
170
+ Advance();
171
+ token = Token::ASSIGN_SUB;
172
+ } else {
173
+ token = Token::SUB;
174
+ }
175
+ break;
176
+
177
+ case '*':
178
+ // * *=
179
+ Advance();
180
+ if (c_ == '=') {
181
+ Advance();
182
+ token = Token::ASSIGN_MUL;
183
+ } else {
184
+ token = Token::MUL;
185
+ }
186
+ break;
187
+
188
+ case '%':
189
+ // % %=
190
+ Advance();
191
+ if (c_ == '=') {
192
+ Advance();
193
+ token = Token::ASSIGN_MOD;
194
+ } else {
195
+ token = Token::MOD;
196
+ }
197
+ break;
198
+
199
+ case '/':
200
+ // / // /* /=
201
+ // ASSIGN_DIV and DIV remain to be solved which is RegExp or not.
202
+ Advance();
203
+ if (c_ == '/') {
204
+ // SINGLE LINE COMMENT
205
+ if (line_number_ == (has_shebang_ ? 1 : 2)) {
206
+ // magic comment
207
+ token = ScanMagicComment();
208
+ } else {
209
+ token = SkipSingleLineComment();
210
+ }
211
+ } else if (c_ == '*') {
212
+ // MULTI LINES COMMENT
213
+ token = SkipMultiLineComment();
214
+ } else if (c_ == '=') {
215
+ // ASSIGN_DIV
216
+ Advance();
217
+ token = Token::ASSIGN_DIV;
218
+ } else {
219
+ // DIV
220
+ token = Token::DIV;
221
+ }
222
+ break;
223
+
224
+ case '&':
225
+ // && &= &
226
+ Advance();
227
+ if (c_ == '&') {
228
+ Advance();
229
+ token = Token::LOGICAL_AND;
230
+ } else if (c_ == '=') {
231
+ Advance();
232
+ token = Token::ASSIGN_BIT_AND;
233
+ } else {
234
+ token = Token::BIT_AND;
235
+ }
236
+ break;
237
+
238
+ case '|':
239
+ // || |= |
240
+ Advance();
241
+ if (c_ == '|') {
242
+ Advance();
243
+ token = Token::LOGICAL_OR;
244
+ } else if (c_ == '=') {
245
+ Advance();
246
+ token = Token::ASSIGN_BIT_OR;
247
+ } else {
248
+ token = Token::BIT_OR;
249
+ }
250
+ break;
251
+
252
+ case '^':
253
+ // ^
254
+ Advance();
255
+ token = Token::BIT_XOR;
256
+ break;
257
+
258
+ case '.':
259
+ // . Number
260
+ Advance();
261
+ if (Chars::IsDecimalDigit(c_)) {
262
+ // float number parse
263
+ token = ScanNumber(true);
264
+ } else {
265
+ token = Token::PERIOD;
266
+ }
267
+ break;
268
+
269
+ case ':':
270
+ Advance();
271
+ token = Token::COLON;
272
+ break;
273
+
274
+ case ';':
275
+ Advance();
276
+ token = Token::SEMICOLON;
277
+ break;
278
+
279
+ case ',':
280
+ Advance();
281
+ token = Token::COMMA;
282
+ break;
283
+
284
+ case '(':
285
+ Advance();
286
+ token = Token::LPAREN;
287
+ break;
288
+
289
+ case ')':
290
+ Advance();
291
+ token = Token::RPAREN;
292
+ break;
293
+
294
+ case '[':
295
+ Advance();
296
+ token = Token::LBRACK;
297
+ break;
298
+
299
+ case ']':
300
+ Advance();
301
+ token = Token::RBRACK;
302
+ break;
303
+
304
+ case '{':
305
+ Advance();
306
+ token = Token::LBRACE;
307
+ break;
308
+
309
+ case '}':
310
+ Advance();
311
+ token = Token::RBRACE;
312
+ break;
313
+
314
+ case '?':
315
+ Advance();
316
+ token = Token::CONDITIONAL;
317
+ break;
318
+
319
+ case '~':
320
+ Advance();
321
+ token = Token::BIT_NOT;
322
+ break;
323
+
324
+ case '#':
325
+ // #!
326
+ // skip shebang as single line comment
327
+ if (pos_ == 1) {
328
+ assert(line_number_ == 1);
329
+ Advance();
330
+ if (c_ == '!') {
331
+ // shebang
332
+ has_shebang_ = true;
333
+ token = SkipSingleLineComment();
334
+ break;
335
+ }
336
+ PushBack();
337
+ }
338
+
339
+ default:
340
+ if (Chars::IsIdentifierStart(c_)) {
341
+ token = ScanIdentifier(type);
342
+ } else if (Chars::IsDecimalDigit(c_)) {
343
+ token = ScanNumber(false);
344
+ } else if (Chars::IsLineTerminator(c_)) {
345
+ SkipLineTerminator();
346
+ has_line_terminator_before_next_ = true;
347
+ token = Token::NOT_FOUND;
348
+ } else if (c_ < 0) {
349
+ // EOS
350
+ token = Token::EOS;
351
+ } else {
352
+ token = Token::ILLEGAL;
353
+ }
354
+ break;
355
+ }
356
+ } while (token == Token::NOT_FOUND);
357
+ location_.end_position_ = pos();
358
+ return token;
359
+ }
360
+
361
+ inline const std::vector<uc16>& Buffer() const {  // uc16 scratch buffer filled by ScanIdentifier / ScanString / ScanRegExpLiteral / ScanRegExpFlags
362
+ return buffer16_;
363
+ }
364
+
365
+ inline const std::vector<char>& Buffer8() const {  // char scratch buffer filled by ScanNumber with the literal's text
366
+ return buffer8_;
367
+ }
368
+
369
+ inline const double& Numeric() const {  // value computed by the most recent ScanNumber
370
+ return numeric_;
371
+ }
372
+
373
+ inline State NumericType() const {  // kind of the last numeric literal; asserts that a number was scanned last
374
+ assert(type_ == DECIMAL ||
375
+ type_ == HEX ||
376
+ type_ == OCTAL);
377
+ return type_;
378
+ }
379
+
380
+ inline State StringEscapeType() const {  // escape kind of the last string literal; asserts that a string was scanned last
381
+ assert(type_ == NONE ||
382
+ type_ == ESCAPE ||
383
+ type_ == OCTAL);
384
+ return type_;
385
+ }
386
+
387
+ inline bool has_line_terminator_before_next() const {  // true if a line terminator was skipped during the last Next() call
388
+ return has_line_terminator_before_next_;
389
+ }
390
+
391
+ std::size_t line_number() const {  // starts at 1; incremented by SkipLineTerminator
392
+ return line_number_;
393
+ }
394
+
395
+ const std::string& filename() const {  // forwarded from the underlying source
396
+ return source_->filename();
397
+ }
398
+
399
+ std::size_t pos() const {  // index of the next character Advance() will read, i.e. one past the character in c_
400
+ return pos_;
401
+ }
402
+
403
+ inline BasicSource* source() const {  // the source pointer given to the constructor
404
+ return source_;
405
+ }
406
+
407
+ inline Location location() const {  // copy of the span recorded by the last Next() call
408
+ return location_;
409
+ }
410
+
411
+ bool ScanRegExpLiteral(bool contains_eq) {
412
+ bool character = false;
413
+ buffer16_.clear();
414
+ if (contains_eq) {
415
+ Record16('=');
416
+ }
417
+ while (c_ != '/' || character) {
418
+ // invalid RegExp pattern
419
+ if (Chars::IsLineTerminator(c_) || c_ < 0) {
420
+ return false;
421
+ }
422
+ if (c_ == '\\') {
423
+ // escape
424
+ Record16Advance();
425
+ if (Chars::IsLineTerminator(c_) || c_ < 0) {
426
+ return false;
427
+ }
428
+ Record16Advance();
429
+ } else {
430
+ if (c_ == '[') {
431
+ character = true;
432
+ } else if (c_ == ']') {
433
+ character = false;
434
+ }
435
+ Record16Advance();
436
+ }
437
+ }
438
+ Advance();
439
+ return true;
440
+ }
441
+
442
+ bool ScanRegExpFlags() {
443
+ buffer16_.clear();
444
+ uc16 uc;
445
+ while (Chars::IsIdentifierPart(c_)) {
446
+ if (c_ == '\\') {
447
+ Advance();
448
+ if (c_ != 'u') {
449
+ return false;
450
+ }
451
+ Advance();
452
+ uc = ScanHexEscape('u', 4);
453
+ if (uc == '\\') {
454
+ return false;
455
+ }
456
+ Record16(uc);
457
+ } else {
458
+ Record16Advance();
459
+ }
460
+ }
461
+ return true;
462
+ }
463
+
464
+ private:
465
+ static const std::size_t kInitialReadBufferCapacity = 32;
466
+
467
+ void Initialize() {  // Primes the lookahead: loads the first source character (or -1 if the source is empty) into c_.
468
+ Advance();
469
+ }
470
+
471
+ inline void Advance() {
472
+ if (pos_ == end_) {
473
+ c_ = -1;
474
+ } else {
475
+ c_ = source_->Get(pos_++);
476
+ }
477
+ }
478
+ inline void Record8() {
479
+ buffer8_.push_back(static_cast<char>(c_));
480
+ }
481
+ inline void Record8(const int ch) {
482
+ buffer8_.push_back(static_cast<char>(ch));
483
+ }
484
+ inline void Record16() { buffer16_.push_back(c_); }
485
+ inline void Record16(const int ch) { buffer16_.push_back(ch); }
486
+ inline void Record8Advance() {
487
+ Record8();
488
+ Advance();
489
+ }
490
+ inline void Record16Advance() {
491
+ Record16();
492
+ Advance();
493
+ }
494
+
495
+ void PushBack() {
496
+ if (pos_ < 2) {
497
+ c_ = -1;
498
+ } else {
499
+ c_ = source_->Get(pos_-2);
500
+ --pos_;
501
+ }
502
+ }
503
+
504
+ inline Token::Type IsMatch(char const * keyword,
505
+ std::size_t len,
506
+ Token::Type guess, bool strict) const {
507
+ if (!strict) {
508
+ return Token::IDENTIFIER;
509
+ }
510
+ std::vector<uc16>::const_iterator it = buffer16_.begin();
511
+ do {
512
+ if (*it++ != *keyword++) {
513
+ return Token::IDENTIFIER;
514
+ }
515
+ } while (--len);
516
+ return guess;
517
+ }
518
+
519
+ inline Token::Type IsMatch(char const * keyword,
520
+ std::size_t len,
521
+ Token::Type guess) const {
522
+ std::vector<uc16>::const_iterator it = buffer16_.begin();
523
+ do {
524
+ if (*it++ != *keyword++) {
525
+ return Token::IDENTIFIER;
526
+ }
527
+ } while (--len);
528
+ return guess;
529
+ }
530
+
531
+ Token::Type SkipSingleLineComment() {
532
+ Advance();
533
+ // see ECMA-262 section 7.4
534
+ while (c_ >= 0 && !Chars::IsLineTerminator(c_)) {
535
+ Advance();
536
+ }
537
+ return Token::NOT_FOUND;
538
+ }
539
+
540
+ Token::Type SkipMultiLineComment() {
541
+ Advance();
542
+ // remember previous ch
543
+ uc16 ch;
544
+ while (c_ >= 0) {
545
+ ch = c_;
546
+ Advance();
547
+ if (ch == '*' && c_ == '/') {
548
+ c_ = ' ';
549
+ return Token::NOT_FOUND;
550
+ } else if (Chars::IsLineTerminator(c_)) {
551
+ // see ECMA-262 section 7.4
552
+ SkipLineTerminator();
553
+ has_line_terminator_before_next_ = true;
554
+ ch = '\n';
555
+ }
556
+ }
557
+ return Token::ILLEGAL;
558
+ }
559
+
560
+ Token::Type ScanHtmlComment() {
561
+ Advance();
562
+ if (c_ == '-') {
563
+ // <!-
564
+ Advance();
565
+ if (c_ == '-') {
566
+ // <!--
567
+ return SkipSingleLineComment();
568
+ }
569
+ PushBack();
570
+ }
571
+ // <! is LT and NOT
572
+ PushBack();
573
+ return Token::LT;
574
+ }
575
+
576
+ Token::Type ScanMagicComment() {
577
+ Advance();
578
+ // see ECMA-262 section 7.4
579
+ while (c_ >= 0 && !Chars::IsLineTerminator(c_)) {
580
+ Advance();
581
+ }
582
+ return Token::NOT_FOUND;
583
+ }
584
+
585
+ Token::Type ScanIdentifier(int type) {
586
+ Token::Type token = Token::IDENTIFIER;
587
+ uc16 uc;
588
+
589
+ buffer16_.clear();
590
+
591
+ if (c_ == '\\') {
592
+ Advance();
593
+ if (c_ != 'u') {
594
+ return Token::ILLEGAL;
595
+ }
596
+ Advance();
597
+ uc = ScanHexEscape('u', 4);
598
+ if (uc == '\\' || !Chars::IsIdentifierStart(uc)) {
599
+ return Token::ILLEGAL;
600
+ }
601
+ Record16(uc);
602
+ } else {
603
+ Record16Advance();
604
+ }
605
+
606
+ while (Chars::IsIdentifierPart(c_)) {
607
+ if (c_ == '\\') {
608
+ Advance();
609
+ if (c_ != 'u') {
610
+ return Token::ILLEGAL;
611
+ }
612
+ Advance();
613
+ uc = ScanHexEscape('u', 4);
614
+ if (uc == '\\' || !Chars::IsIdentifierPart(uc)) {
615
+ return Token::ILLEGAL;
616
+ }
617
+ Record16(uc);
618
+ } else {
619
+ Record16Advance();
620
+ }
621
+ }
622
+
623
+ if (type & kIdentifyReservedWords) {
624
+ token = DetectKeyword(type & kStrict);
625
+ } else if (type & kIgnoreReservedWordsAndIdentifyGetterOrSetter) {
626
+ token = DetectGetOrSet();
627
+ }
628
+
629
+ return token;
630
+ }
631
+
632
+ // detect which Identifier is Keyword, FutureReservedWord or not
633
+ // Keyword and FutureReservedWord are defined in ECMA-262 5th.
634
+ //
635
+ // Some words such as :
636
+ // int, short, boolean, byte, long, char, float, double, abstract, volatile,
637
+ // transient, final, throws, goto, native, synchronized
638
+ // were defined as FutureReservedWord in ECMA-262 3rd, but not in 5th.
639
+ // So, DetectKeyword interprets them as Identifier.
640
+ Token::Type DetectKeyword(bool strict) const {
641
+ const std::size_t len = buffer16_.size();
642
+ Token::Type token = Token::IDENTIFIER;
643
+ switch (len) {
644
+ case 2:
645
+ // if in do
646
+ if (buffer16_[0] == 'i') {
647
+ if (buffer16_[1] == 'f') {
648
+ token = Token::IF;
649
+ } else if (buffer16_[1] == 'n') {
650
+ token = Token::IN;
651
+ }
652
+ } else if (buffer16_[0] == 'd' && buffer16_[1] == 'o') {
653
+ // do
654
+ token = Token::DO;
655
+ }
656
+ break;
657
+ case 3:
658
+ // for var int new try let
659
+ switch (buffer16_[2]) {
660
+ case 't':
661
+ if (buffer16_[0] == 'l' && buffer16_[1] == 'e' && strict) {
662
+ // let
663
+ token = Token::LET;
664
+ } else if (buffer16_[0] == 'i' && buffer16_[1] == 'n') {
665
+ // int (removed)
666
+ // token = Token::INT;
667
+ }
668
+ break;
669
+ case 'r':
670
+ // for var
671
+ if (buffer16_[0] == 'f' && buffer16_[1] == 'o') {
672
+ // for
673
+ token = Token::FOR;
674
+ } else if (buffer16_[0] == 'v' && buffer16_[1] == 'a') {
675
+ // var
676
+ token = Token::VAR;
677
+ }
678
+ break;
679
+ case 'y':
680
+ // try
681
+ if (buffer16_[0] == 't' && buffer16_[1] == 'r') {
682
+ token = Token::TRY;
683
+ }
684
+ break;
685
+ case 'w':
686
+ // new
687
+ if (buffer16_[0] == 'n' && buffer16_[1] == 'e') {
688
+ token = Token::NEW;
689
+ }
690
+ break;
691
+ }
692
+ break;
693
+ case 4:
694
+ // else case true byte null this
695
+ // void with long enum char goto
696
+ // number 3 character is most duplicated
697
+ switch (buffer16_[3]) {
698
+ case 'e':
699
+ // else case true byte
700
+ if (buffer16_[2] == 's') {
701
+ if (buffer16_[0] == 'e' && buffer16_[1] == 'l') {
702
+ // else
703
+ token = Token::ELSE;
704
+ } else if (buffer16_[0] == 'c' && buffer16_[1] == 'a') {
705
+ // case
706
+ token = Token::CASE;
707
+ }
708
+ } else if (buffer16_[0] == 't' &&
709
+ buffer16_[1] == 'r' && buffer16_[2] == 'u') {
710
+ // true
711
+ token = Token::TRUE_LITERAL;
712
+ } else if (buffer16_[0] == 'b' &&
713
+ buffer16_[1] == 'y' && buffer16_[2] == 't') {
714
+ // byte (removed)
715
+ // token = Token::BYTE;
716
+ }
717
+ break;
718
+ case 'l':
719
+ // null
720
+ if (buffer16_[0] == 'n' &&
721
+ buffer16_[1] == 'u' && buffer16_[2] == 'l') {
722
+ token = Token::NULL_LITERAL;
723
+ }
724
+ break;
725
+ case 's':
726
+ // this
727
+ if (buffer16_[0] == 't' &&
728
+ buffer16_[1] == 'h' && buffer16_[2] == 'i') {
729
+ token = Token::THIS;
730
+ }
731
+ break;
732
+ case 'd':
733
+ // void
734
+ if (buffer16_[0] == 'v' &&
735
+ buffer16_[1] == 'o' && buffer16_[2] == 'i') {
736
+ token = Token::VOID;
737
+ }
738
+ break;
739
+ case 'h':
740
+ // with
741
+ if (buffer16_[0] == 'w' &&
742
+ buffer16_[1] == 'i' && buffer16_[2] == 't') {
743
+ token = Token::WITH;
744
+ }
745
+ break;
746
+ case 'g':
747
+ // long (removed)
748
+ if (buffer16_[0] == 'l' &&
749
+ buffer16_[1] == 'o' && buffer16_[2] == 'n') {
750
+ // token = Token::LONG;
751
+ }
752
+ break;
753
+ case 'm':
754
+ // enum
755
+ if (buffer16_[0] == 'e' &&
756
+ buffer16_[1] == 'n' && buffer16_[2] == 'u') {
757
+ token = Token::ENUM;
758
+ }
759
+ break;
760
+ case 'r':
761
+ // char (removed)
762
+ if (buffer16_[0] == 'c' &&
763
+ buffer16_[1] == 'h' && buffer16_[2] == 'a') {
764
+ // token = Token::CHAR;
765
+ }
766
+ break;
767
+ case 'o':
768
+ // goto (removed)
769
+ if (buffer16_[0] == 'g' &&
770
+ buffer16_[1] == 'o' && buffer16_[2] == 't') {
771
+ // token = Token::GOTO;
772
+ }
773
+ break;
774
+ }
775
+ break;
776
+ case 5:
777
+ // break final float catch super while
778
+ // throw short class const false yield
779
+ // number 3 character is most duplicated
780
+ switch (buffer16_[3]) {
781
+ case 'a':
782
+ // break final float
783
+ if (buffer16_[0] == 'b' && buffer16_[1] == 'r' &&
784
+ buffer16_[2] == 'e' && buffer16_[4] == 'k') {
785
+ // break
786
+ token = Token::BREAK;
787
+ } else if (buffer16_[0] == 'f') {
788
+ if (buffer16_[1] == 'i' &&
789
+ buffer16_[2] == 'n' && buffer16_[4] == 'l') {
790
+ // final (removed)
791
+ // token = Token::FINAL;
792
+ } else if (buffer16_[1] == 'l' &&
793
+ buffer16_[2] == 'o' && buffer16_[4] == 't') {
794
+ // float (removed)
795
+ // token = Token::FLOAT;
796
+ }
797
+ }
798
+ break;
799
+ case 'c':
800
+ if (buffer16_[0] == 'c' && buffer16_[1] == 'a' &&
801
+ buffer16_[2] == 't' && buffer16_[4] == 'h') {
802
+ // catch
803
+ token = Token::CATCH;
804
+ }
805
+ break;
806
+ case 'e':
807
+ if (buffer16_[0] == 's' && buffer16_[1] == 'u' &&
808
+ buffer16_[2] == 'p' && buffer16_[4] == 'r') {
809
+ // super
810
+ token = Token::SUPER;
811
+ }
812
+ break;
813
+ case 'l':
814
+ if (buffer16_[0] == 'w' && buffer16_[1] == 'h' &&
815
+ buffer16_[2] == 'i' && buffer16_[4] == 'e') {
816
+ // while
817
+ token = Token::WHILE;
818
+ } else if (strict &&
819
+ buffer16_[0] == 'y' && buffer16_[1] == 'i' &&
820
+ buffer16_[2] == 'e' && buffer16_[4] == 'd') {
821
+ // yield
822
+ token = Token::YIELD;
823
+ }
824
+ break;
825
+ case 'o':
826
+ if (buffer16_[0] == 't' && buffer16_[1] == 'h' &&
827
+ buffer16_[2] == 'r' && buffer16_[4] == 'w') {
828
+ // throw
829
+ token = Token::THROW;
830
+ }
831
+ break;
832
+ case 'r':
833
+ if (buffer16_[0] == 's' && buffer16_[1] == 'h' &&
834
+ buffer16_[2] == 'o' && buffer16_[4] == 't') {
835
+ // short (removed)
836
+ // token = Token::SHORT;
837
+ }
838
+ break;
839
+ case 's':
840
+ // class const false
841
+ if (buffer16_[0] == 'c') {
842
+ if (buffer16_[1] == 'l' &&
843
+ buffer16_[2] == 'a' && buffer16_[4] == 's') {
844
+ // class
845
+ token = Token::CLASS;
846
+ } else if (buffer16_[1] == 'o' &&
847
+ buffer16_[2] == 'n' && buffer16_[4] == 't') {
848
+ // const
849
+ token = Token::CONST;
850
+ }
851
+ } else if (buffer16_[0] == 'f' && buffer16_[1] == 'a' &&
852
+ buffer16_[2] == 'l' && buffer16_[4] == 'e') {
853
+ // false
854
+ token = Token::FALSE_LITERAL;
855
+ }
856
+ break;
857
+ }
858
+ break;
859
+ case 6:
860
+ // double delete export import native
861
+ // public return static switch typeof throws
862
+ // number 0 character is most duplicated
863
+ switch (buffer16_[0]) {
864
+ case 'd':
865
+ // double delete
866
+ if (buffer16_[5] == 'e' &&
867
+ buffer16_[4] == 'l' && buffer16_[3] == 'b' &&
868
+ buffer16_[2] == 'u' && buffer16_[1] == 'o') {
869
+ // double
870
+ // token = Token::DOUBLE;
871
+ } else if (buffer16_[5] == 'e' &&
872
+ buffer16_[4] == 't' && buffer16_[3] == 'e' &&
873
+ buffer16_[2] == 'l' && buffer16_[1] == 'e') {
874
+ // delete
875
+ token = Token::DELETE;
876
+ }
877
+ break;
878
+ case 'e':
879
+ // export
880
+ token = IsMatch("export", len, Token::EXPORT);
881
+ break;
882
+ case 'i':
883
+ // import
884
+ token = IsMatch("import", len, Token::IMPORT);
885
+ break;
886
+ case 'n':
887
+ // native (removed)
888
+ // token = IsMatch("native", len, Token::NATIVE);
889
+ break;
890
+ case 'p':
891
+ // public
892
+ token = IsMatch("public", len, Token::PUBLIC, strict);
893
+ break;
894
+ case 'r':
895
+ // return
896
+ token = IsMatch("return", len, Token::RETURN);
897
+ break;
898
+ case 's':
899
+ // switch static
900
+ if (buffer16_[1] == 'w' &&
901
+ buffer16_[2] == 'i' && buffer16_[3] == 't' &&
902
+ buffer16_[4] == 'c' && buffer16_[5] == 'h') {
903
+ // switch
904
+ token = Token::SWITCH;
905
+ } else if (strict &&
906
+ buffer16_[1] == 't' &&
907
+ buffer16_[2] == 'a' && buffer16_[3] == 't' &&
908
+ buffer16_[4] == 'i' && buffer16_[5] == 'c') {
909
+ // static
910
+ token = Token::STATIC;
911
+ }
912
+ break;
913
+ case 't':
914
+ // typeof throws
915
+ if (buffer16_[5] == 'f' &&
916
+ buffer16_[4] == 'o' && buffer16_[3] == 'e' &&
917
+ buffer16_[2] == 'p' && buffer16_[1] == 'y') {
918
+ // typeof
919
+ token = Token::TYPEOF;
920
+ } else if (buffer16_[5] == 's' &&
921
+ buffer16_[4] == 'w' && buffer16_[3] == 'o' &&
922
+ buffer16_[2] == 'r' && buffer16_[1] == 'h') {
923
+ // throws (removed)
924
+ // token = Token::THROWS;
925
+ }
926
+ break;
927
+ }
928
+ break;
929
+ case 7:
930
+ // boolean default extends finally package private
931
+ // number 0 character is most duplicated
932
+ switch (buffer16_[0]) {
933
+ case 'b':
934
+ // boolean (removed)
935
+ // token = IsMatch("boolean", len, Token::BOOLEAN);
936
+ break;
937
+ case 'd':
938
+ token = IsMatch("default", len, Token::DEFAULT);
939
+ break;
940
+ case 'e':
941
+ token = IsMatch("extends", len, Token::EXTENDS);
942
+ break;
943
+ case 'f':
944
+ token = IsMatch("finally", len, Token::FINALLY);
945
+ break;
946
+ case 'p':
947
+ if (buffer16_[1] == 'a') {
948
+ token = IsMatch("package", len, Token::PACKAGE, strict);
949
+ } else if (buffer16_[1] == 'r') {
950
+ token = IsMatch("private", len, Token::PRIVATE, strict);
951
+ }
952
+ break;
953
+ }
954
+ break;
955
+ case 8:
956
+ // debugger continue abstract volatile function
957
+ // number 4 character is most duplicated
958
+ switch (buffer16_[4]) {
959
+ case 'g':
960
+ token = IsMatch("debugger", len, Token::DEBUGGER);
961
+ break;
962
+ case 'i':
963
+ token = IsMatch("continue", len, Token::CONTINUE);
964
+ break;
965
+ case 'r':
966
+ // abstract (removed)
967
+ // token = IsMatch("abstract", len, Token::ABSTRACT);
968
+ break;
969
+ case 't':
970
+ if (buffer16_[1] == 'o') {
971
+ // token = IsMatch("volatile", len, Token::VOLATILE);
972
+ } else if (buffer16_[1] == 'u') {
973
+ token = IsMatch("function", len, Token::FUNCTION);
974
+ }
975
+ break;
976
+ }
977
+ break;
978
+ case 9:
979
+ // interface protected transient
980
+ if (buffer16_[1] == 'n') {
981
+ token = IsMatch("interface", len, Token::INTERFACE, strict);
982
+ } else if (buffer16_[1] == 'r') {
983
+ if (buffer16_[0] == 'p') {
984
+ token = IsMatch("protected", len, Token::PROTECTED, strict);
985
+ } else if (buffer16_[0] == 't') {
986
+ // transient (removed)
987
+ // token = IsMatch("transient", len, Token::TRANSIENT);
988
+ }
989
+ }
990
+ break;
991
+ case 10:
992
+ // instanceof implements
993
+ if (buffer16_[1] == 'n') {
994
+ token = IsMatch("instanceof", len, Token::INSTANCEOF);
995
+ } else if (buffer16_[1] == 'm') {
996
+ token = IsMatch("implements", len, Token::IMPLEMENTS, strict);
997
+ }
998
+ break;
999
+ case 12:
1000
+ // synchronized (removed)
1001
+ // token = IsMatch("synchronized", len, Token::SYNCHRONIZED);
1002
+ token = Token::IDENTIFIER;
1003
+ break;
1004
+ }
1005
+ return token;
1006
+ }
1007
+
1008
+ Token::Type DetectGetOrSet() const {
1009
+ if (buffer16_.size() == 3) {
1010
+ if (buffer16_[1] == 'e' && buffer16_[2] == 't') {
1011
+ if (buffer16_[0] == 'g') {
1012
+ return Token::GET;
1013
+ } else if (buffer16_[0] == 's') {
1014
+ return Token::SET;
1015
+ }
1016
+ }
1017
+ }
1018
+ return Token::IDENTIFIER;
1019
+ }
1020
+
1021
+ Token::Type ScanString() {
1022
+ type_ = NONE;
1023
+ const uc16 quote = c_;
1024
+ buffer16_.clear();
1025
+ Advance();
1026
+ while (c_ != quote && c_ >= 0 && !Chars::IsLineTerminator(c_)) {
1027
+ if (c_ == '\\') {
1028
+ Advance();
1029
+ // escape sequence
1030
+ if (c_ < 0) return Token::ILLEGAL;
1031
+ if (type_ == NONE) {
1032
+ type_ = ESCAPE;
1033
+ }
1034
+ ScanEscape();
1035
+ } else {
1036
+ Record16Advance();
1037
+ }
1038
+ }
1039
+ if (c_ != quote) {
1040
+ // not closed
1041
+ return Token::ILLEGAL;
1042
+ }
1043
+ Advance();
1044
+
1045
+ return Token::STRING;
1046
+ }
1047
+
1048
+ void ScanEscape() {
1049
+ if (Chars::IsLineTerminator(c_)) {
1050
+ SkipLineTerminator();
1051
+ return;
1052
+ }
1053
+ switch (c_) {
1054
+ case '\'':
1055
+ case '"' :
1056
+ case '\\':
1057
+ Record16Advance();
1058
+ break;
1059
+ case 'b' :
1060
+ Record16('\b');
1061
+ Advance();
1062
+ break;
1063
+ case 'f' :
1064
+ Record16('\f');
1065
+ Advance();
1066
+ break;
1067
+ case 'n' :
1068
+ Record16('\n');
1069
+ Advance();
1070
+ break;
1071
+ case 'r' :
1072
+ Record16('\r');
1073
+ Advance();
1074
+ break;
1075
+ case 't' :
1076
+ Record16('\t');
1077
+ Advance();
1078
+ break;
1079
+ case 'u' :
1080
+ Advance();
1081
+ Record16(ScanHexEscape('u', 4));
1082
+ break;
1083
+ case 'v' :
1084
+ Record16('\v');
1085
+ Advance();
1086
+ break;
1087
+ case 'x' :
1088
+ Advance();
1089
+ Record16(ScanHexEscape('x', 2));
1090
+ break;
1091
+ case '0' :
1092
+ case '1' :
1093
+ case '2' :
1094
+ case '3' :
1095
+ case '4' :
1096
+ case '5' :
1097
+ case '6' :
1098
+ case '7' :
1099
+ if (type_ != OCTAL) {
1100
+ type_ = OCTAL;
1101
+ }
1102
+ Record16(ScanOctalEscape());
1103
+ break;
1104
+
1105
+ default:
1106
+ Record16Advance();
1107
+ break;
1108
+ }
1109
+ }
1110
+
1111
+ Token::Type ScanNumber(const bool period) {
1112
+ buffer8_.clear();
1113
+ State type = DECIMAL;
1114
+ if (period) {
1115
+ Record8('0');
1116
+ Record8('.');
1117
+ ScanDecimalDigits();
1118
+ } else {
1119
+ if (c_ == '0') {
1120
+ // 0x (hex) or 0 (octal)
1121
+ Record8Advance();
1122
+ if (c_ == 'x' || c_ == 'X') {
1123
+ // 0x (hex)
1124
+ type = HEX;
1125
+ Record8Advance();
1126
+ if (!Chars::IsHexDigit(c_)) {
1127
+ return Token::ILLEGAL;
1128
+ }
1129
+ while (Chars::IsHexDigit(c_)) {
1130
+ Record8Advance();
1131
+ }
1132
+ } else if (Chars::IsOctalDigit(c_)) {
1133
+ // 0 (octal)
1134
+ // octal number cannot convert with strtod
1135
+ type = OCTAL;
1136
+ Record8Advance();
1137
+ while (true) {
1138
+ if (c_ == '8' || c_ == '9') {
1139
+ // not octal digits
1140
+ type = DECIMAL;
1141
+ break;
1142
+ }
1143
+ if (c_ < '0' || '7' < c_) {
1144
+ break;
1145
+ }
1146
+ Record8Advance();
1147
+ }
1148
+ }
1149
+ }
1150
+ if (type == DECIMAL) {
1151
+ ScanDecimalDigits();
1152
+ if (c_ == '.') {
1153
+ Record8Advance();
1154
+ ScanDecimalDigits();
1155
+ }
1156
+ }
1157
+ }
1158
+
1159
+ // exponent part
1160
+ if (c_ == 'e' || c_ == 'E') {
1161
+ if (type != DECIMAL) {
1162
+ return Token::ILLEGAL;
1163
+ }
1164
+ Record8Advance();
1165
+ if (c_ == '+' || c_ == '-') {
1166
+ Record8Advance();
1167
+ }
1168
+ // more than 1 decimal digit required
1169
+ if (!Chars::IsDecimalDigit(c_)) {
1170
+ return Token::ILLEGAL;
1171
+ }
1172
+ ScanDecimalDigits();
1173
+ }
1174
+
1175
+ // see ECMA-262 section 7.8.3
1176
+ // "immediately following a NumericLiteral must not be an IdentifierStart or
1177
+ // DecimalDigit."
1178
+ if (Chars::IsDecimalDigit(c_) || Chars::IsIdentifierStart(c_)) {
1179
+ return Token::ILLEGAL;
1180
+ }
1181
+
1182
+ if (type == OCTAL) {
1183
+ double val = 0;
1184
+ for (std::vector<char>::const_iterator it = buffer8_.begin(),
1185
+ last = buffer8_.end(); it != last; ++it) {
1186
+ val = val * 8 + (*it - '0');
1187
+ }
1188
+ numeric_ = val;
1189
+ } else {
1190
+ Record8('\0'); // Null Terminated String
1191
+ numeric_ = std::strtod(buffer8_.data(), NULL);
1192
+ }
1193
+ type_ = type;
1194
+ return Token::NUMBER;
1195
+ }
1196
+
1197
+ uc16 ScanOctalEscape() {
1198
+ uc16 res = 0;
1199
+ for (int i = 0; i < 3; ++i) {
1200
+ const int d = OctalValue(c_);
1201
+ if (d < 0) {
1202
+ break;
1203
+ }
1204
+ const int t = res * 8 + d;
1205
+ if (t > 255) {
1206
+ break;
1207
+ }
1208
+ res = t;
1209
+ Advance();
1210
+ }
1211
+ return res;
1212
+ }
1213
+
1214
+ uc16 ScanHexEscape(uc16 c, int len) {
1215
+ uc16 res = 0;
1216
+ for (int i = 0; i < len; ++i) {
1217
+ const int d = HexValue(c_);
1218
+ if (d < 0) {
1219
+ for (int j = i - 1; j >= 0; --j) {
1220
+ PushBack();
1221
+ }
1222
+ return c;
1223
+ }
1224
+ res = res * 16 + d;
1225
+ Advance();
1226
+ }
1227
+ return res;
1228
+ }
1229
+
1230
// Returns the numeric value of the octal digit |c| ('0'..'7'), or -1 when
// |c| is not an octal digit.
//
// The upper bound is '7': '8' is not an octal digit and must be rejected so
// that an escape such as "\18" stops after the '1'.
//
// Static because the function does not depend on any lexer state.
static inline int OctalValue(const int c) {
  if ('0' <= c && c <= '7') {
    return c - '0';
  }
  return -1;
}
1236
+
1237
// Returns the numeric value (0..15) of the hex digit |c|, accepting both
// lower- and upper-case letters, or -1 when |c| is not a hex digit.
//
// Static because the function does not depend on any lexer state.
static inline int HexValue(const int c) {
  if ('0' <= c && c <= '9') {
    return c - '0';
  }
  if ('a' <= c && c <= 'f') {
    return c - 'a' + 10;
  }
  if ('A' <= c && c <= 'F') {
    return c - 'A' + 10;
  }
  return -1;
}
1249
+
1250
+ void ScanDecimalDigits() {
1251
+ while (Chars::IsDecimalDigit(c_)) {
1252
+ Record8Advance();
1253
+ }
1254
+ }
1255
+
1256
+ void SkipLineTerminator() {
1257
+ const uc16 c = c_;
1258
+ Advance();
1259
+ if (c + c_ == '\n' + '\r') {
1260
+ Advance();
1261
+ }
1262
+ ++line_number_;
1263
+ }
1264
+
1265
+ BasicSource* source_;  // input being scanned; set by the constructor and never deleted in this class
1266
+ std::vector<char> buffer8_;  // text of the numeric literal being scanned (see ScanNumber)
1267
+ std::vector<uc16> buffer16_;  // text of the identifier / string / RegExp being scanned
1268
+ double numeric_;  // value of the last numeric literal
1269
+ State type_;  // kind of the last string or numeric literal
1270
+ std::size_t pos_;  // index of the next character Advance() will read
1271
+ const std::size_t end_;  // source_->size(), captured at construction
1272
+ bool has_line_terminator_before_next_;  // reset at the start of every Next(); set when a line terminator is skipped
1273
+ bool has_shebang_;  // set when the source starts with "#!"
1274
+ int c_;  // current lookahead character, or -1 at end of input
1275
+ std::size_t line_number_;  // starts at 1; incremented by SkipLineTerminator
1276
+ Location location_;  // begin/end positions recorded by the last Next()
1277
+ };
1278
+
1279
+
1280
+ } } // namespace iv::core
1281
+ #endif // _IV_LEXER_H_