tova 0.3.4 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,16 @@ import { TokenType, Keywords, Token } from './tokens.js';
3
3
  export class Lexer {
4
4
  static MAX_INTERPOLATION_DEPTH = 64;
5
5
 
6
+ // Pre-compiled regex constants (avoid re-compilation in hot loops)
7
+ static UNICODE_LETTER_RE = /\p{Letter}/u;
8
+ static UNICODE_ALPHANUM_RE = /[\p{Letter}\p{Number}\p{Mark}]/u;
9
+ static HEX_DIGIT_RE = /[0-9a-fA-F_]/;
10
+ static BINARY_DIGIT_RE = /[01_]/;
11
+ static OCTAL_DIGIT_RE = /[0-7_]/;
12
+ static REGEX_FLAG_RE = /[gimsuydv]/;
13
+ static REGEX_START_RE = /[\s\/*=]/;
14
+ static JSX_CF_KEYWORDS = new Set(['if', 'for', 'elif', 'else', 'match']);
15
+
6
16
  constructor(source, filename = '<stdin>', lineOffset = 0, columnOffset = 0, _depth = 0) {
7
17
  this.source = source;
8
18
  this.filename = filename;
@@ -20,6 +30,22 @@ export class Lexer {
20
30
  this._jsxExprDepth = 0; // brace depth for {expr} inside JSX
21
31
  this._jsxCF = null; // null | { paren: 0, brace: 0, keyword? } — control flow state
22
32
  this._matchBlockDepth = 0; // brace depth for match body inside JSX
33
+ this._subLexer = null; // reusable sub-lexer for string interpolation
34
+ }
35
+
36
+ reset(source, lineOffset, columnOffset) {
37
+ this.source = source;
38
+ this.tokens = [];
39
+ this.pos = 0;
40
+ this.line = 1 + lineOffset;
41
+ this.column = 1 + columnOffset;
42
+ this.length = source.length;
43
+ this._jsxStack = [];
44
+ this._jsxTagMode = null;
45
+ this._jsxSelfClosing = false;
46
+ this._jsxExprDepth = 0;
47
+ this._jsxCF = null;
48
+ this._matchBlockDepth = 0;
23
49
  }
24
50
 
25
51
  error(message) {
@@ -62,14 +88,14 @@ export class Lexer {
62
88
  isAlpha(ch) {
63
89
  if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch === '_') return true;
64
90
  // Unicode letter support
65
- if (ch > '\x7f') return /\p{Letter}/u.test(ch);
91
+ if (ch > '\x7f') return Lexer.UNICODE_LETTER_RE.test(ch);
66
92
  return false;
67
93
  }
68
94
 
69
95
  isAlphaNumeric(ch) {
70
96
  if (this.isAlpha(ch) || this.isDigit(ch)) return true;
71
97
  // Unicode continue characters (combining marks, etc.)
72
- if (ch > '\x7f') return /[\p{Letter}\p{Number}\p{Mark}]/u.test(ch);
98
+ if (ch > '\x7f') return Lexer.UNICODE_ALPHANUM_RE.test(ch);
73
99
  return false;
74
100
  }
75
101
 
@@ -77,16 +103,27 @@ export class Lexer {
77
103
  return ch === ' ' || ch === '\t' || ch === '\r';
78
104
  }
79
105
 
106
+ _processEscape(esc) {
107
+ switch (esc) {
108
+ case 'n': return '\n';
109
+ case 't': return '\t';
110
+ case 'r': return '\r';
111
+ case '\\': return '\\';
112
+ case '"': return '"';
113
+ case "'": return "'";
114
+ case '{': return '{';
115
+ case '}': return '}';
116
+ default: return '\\' + esc;
117
+ }
118
+ }
119
+
80
120
  _isJSXStart() {
81
121
  const nextCh = this.peek();
82
122
  if (!this.isAlpha(nextCh)) return false;
83
123
  // Check the token BEFORE < (LESS was already pushed, so it's at length-2)
84
124
  const prev = this.tokens.length > 1 ? this.tokens[this.tokens.length - 2] : null;
85
125
  if (!prev) return true;
86
- const valueTypes = [TokenType.IDENTIFIER, TokenType.NUMBER, TokenType.STRING,
87
- TokenType.STRING_TEMPLATE, TokenType.RPAREN, TokenType.RBRACKET, TokenType.RBRACE,
88
- TokenType.TRUE, TokenType.FALSE, TokenType.NIL];
89
- return !valueTypes.includes(prev.type);
126
+ return !Lexer.VALUE_TOKEN_TYPES.has(prev.type);
90
127
  }
91
128
 
92
129
  tokenize() {
@@ -153,17 +190,12 @@ export class Lexer {
153
190
  // Negative list: if previous token ends an expression (produces a value),
154
191
  // then / is division. Otherwise, / starts a regex.
155
192
  // This is simpler and more robust — new token types default to regex context.
156
- const divisionContextTokens = [
157
- TokenType.IDENTIFIER, TokenType.NUMBER, TokenType.STRING, TokenType.STRING_TEMPLATE,
158
- TokenType.TRUE, TokenType.FALSE, TokenType.NIL,
159
- TokenType.RPAREN, TokenType.RBRACKET, TokenType.RBRACE,
160
- ];
161
- if (prev && !divisionContextTokens.includes(prev.type)) {
193
+ if (prev && !Lexer.VALUE_TOKEN_TYPES.has(prev.type)) {
162
194
  this.scanRegex();
163
195
  return;
164
196
  }
165
197
  // At start of file (no prev token), treat / as regex if followed by a non-space, non-special char
166
- if (!prev && this.pos + 1 < this.length && !/[\s\/*=]/.test(this.peek(1))) {
198
+ if (!prev && this.pos + 1 < this.length && !Lexer.REGEX_START_RE.test(this.peek(1))) {
167
199
  this.scanRegex();
168
200
  return;
169
201
  }
@@ -224,11 +256,10 @@ export class Lexer {
224
256
  }
225
257
  // Check if next non-ws starts a keyword (if/for/elif/else)
226
258
  if (this.isAlpha(nextNonWs)) {
227
- let word = '', wp = pp;
228
- while (wp < this.length && this.isAlphaNumeric(this.source[wp])) {
229
- word += this.source[wp]; wp++;
230
- }
231
- if (['if', 'for', 'elif', 'else', 'match'].includes(word)) {
259
+ let wp = pp;
260
+ while (wp < this.length && this.isAlphaNumeric(this.source[wp])) wp++;
261
+ const word = this.source.substring(pp, wp);
262
+ if (Lexer.JSX_CF_KEYWORDS.has(word)) {
232
263
  while (this.pos < pp) this.advance();
233
264
  return;
234
265
  }
@@ -258,11 +289,10 @@ export class Lexer {
258
289
 
259
290
  // Check for JSX control flow keywords: if, for, elif, else, match
260
291
  if (this.isAlpha(ch)) {
261
- let word = '', peekPos = this.pos;
262
- while (peekPos < this.length && this.isAlphaNumeric(this.source[peekPos])) {
263
- word += this.source[peekPos]; peekPos++;
264
- }
265
- if (['if', 'for', 'elif', 'else', 'match'].includes(word)) {
292
+ let peekPos = this.pos;
293
+ while (peekPos < this.length && this.isAlphaNumeric(this.source[peekPos])) peekPos++;
294
+ const word = this.source.substring(this.pos, peekPos);
295
+ if (Lexer.JSX_CF_KEYWORDS.has(word)) {
266
296
  this.scanIdentifier();
267
297
  // After keyword, enter control flow mode for normal scanning
268
298
  this._jsxCF = { paren: 0, brace: 0, keyword: word };
@@ -281,12 +311,14 @@ export class Lexer {
281
311
  const ch = this.peek();
282
312
  if (ch === '<' || ch === '{' || ch === '"' || ch === "'") break;
283
313
  // Stop at keywords if, for, elif, else preceded by whitespace
284
- if (this.isAlpha(ch) && text.length > 0 && /\s$/.test(text)) {
285
- let word = '', pp = this.pos;
286
- while (pp < this.length && this.isAlphaNumeric(this.source[pp])) {
287
- word += this.source[pp]; pp++;
314
+ if (this.isAlpha(ch) && text.length > 0) {
315
+ const lastCh = text[text.length - 1];
316
+ if (lastCh === ' ' || lastCh === '\t' || lastCh === '\n' || lastCh === '\r') {
317
+ let pp = this.pos;
318
+ while (pp < this.length && this.isAlphaNumeric(this.source[pp])) pp++;
319
+ const word = this.source.substring(this.pos, pp);
320
+ if (Lexer.JSX_CF_KEYWORDS.has(word)) break;
288
321
  }
289
- if (['if', 'for', 'elif', 'else', 'match'].includes(word)) break;
290
322
  }
291
323
  text += this.advance();
292
324
  }
@@ -351,7 +383,7 @@ export class Lexer {
351
383
  if (next === 'x' || next === 'X') {
352
384
  this.advance(); // 0
353
385
  this.advance(); // x
354
- while (this.pos < this.length && /[0-9a-fA-F_]/.test(this.peek())) {
386
+ while (this.pos < this.length && Lexer.HEX_DIGIT_RE.test(this.peek())) {
355
387
  const ch = this.advance();
356
388
  if (ch !== '_') value += ch;
357
389
  }
@@ -362,7 +394,7 @@ export class Lexer {
362
394
  if (next === 'b' || next === 'B') {
363
395
  this.advance(); // 0
364
396
  this.advance(); // b
365
- while (this.pos < this.length && /[01_]/.test(this.peek())) {
397
+ while (this.pos < this.length && Lexer.BINARY_DIGIT_RE.test(this.peek())) {
366
398
  const ch = this.advance();
367
399
  if (ch !== '_') value += ch;
368
400
  }
@@ -373,7 +405,7 @@ export class Lexer {
373
405
  if (next === 'o' || next === 'O') {
374
406
  this.advance(); // 0
375
407
  this.advance(); // o
376
- while (this.pos < this.length && /[0-7_]/.test(this.peek())) {
408
+ while (this.pos < this.length && Lexer.OCTAL_DIGIT_RE.test(this.peek())) {
377
409
  const ch = this.advance();
378
410
  if (ch !== '_') value += ch;
379
411
  }
@@ -383,41 +415,56 @@ export class Lexer {
383
415
  }
384
416
  }
385
417
 
386
- // Decimal
387
- while (this.pos < this.length && (this.isDigit(this.peek()) || this.peek() === '_')) {
388
- const ch = this.advance();
389
- if (ch !== '_') value += ch;
418
+ // Fast path: scan decimal number using index advancement (no string concat)
419
+ // Handles digits, underscores, decimal point, and exponent
420
+ const numStart = this.pos;
421
+ let hasUnderscore = false;
422
+ while (this.pos < this.length) {
423
+ const ch = this.source[this.pos];
424
+ if (ch >= '0' && ch <= '9') { this.pos++; this.column++; }
425
+ else if (ch === '_') { hasUnderscore = true; this.pos++; this.column++; }
426
+ else break;
390
427
  }
391
428
 
392
429
  // Decimal point — only consume if followed by a digit or underscore (not e.g. 15.minutes)
393
- if (this.peek() === '.' && this.peek(1) !== '.' && (this.isDigit(this.peek(1)) || this.peek(1) === '_')) {
394
- value += this.advance(); // .
395
- while (this.pos < this.length && (this.isDigit(this.peek()) || this.peek() === '_')) {
396
- const ch = this.advance();
397
- if (ch !== '_') value += ch;
430
+ if (this.pos < this.length && this.source[this.pos] === '.') {
431
+ const next = this.pos + 1 < this.length ? this.source[this.pos + 1] : '';
432
+ if (next !== '.' && ((next >= '0' && next <= '9') || next === '_')) {
433
+ this.pos++; this.column++; // .
434
+ while (this.pos < this.length) {
435
+ const ch = this.source[this.pos];
436
+ if (ch >= '0' && ch <= '9') { this.pos++; this.column++; }
437
+ else if (ch === '_') { hasUnderscore = true; this.pos++; this.column++; }
438
+ else break;
439
+ }
398
440
  }
399
441
  }
400
442
 
401
443
  // Exponent
402
- if (this.peek() === 'e' || this.peek() === 'E') {
403
- const savedPos = this.pos;
404
- const savedCol = this.column;
405
- let expPart = this.advance(); // consume 'e'/'E'
406
- if (this.peek() === '+' || this.peek() === '-') {
407
- expPart += this.advance();
408
- }
409
- if (this.pos < this.length && this.isDigit(this.peek())) {
410
- value += expPart;
411
- while (this.pos < this.length && this.isDigit(this.peek())) {
412
- value += this.advance();
444
+ if (this.pos < this.length) {
445
+ const ech = this.source[this.pos];
446
+ if (ech === 'e' || ech === 'E') {
447
+ const savedPos = this.pos;
448
+ const savedCol = this.column;
449
+ this.pos++; this.column++;
450
+ if (this.pos < this.length && (this.source[this.pos] === '+' || this.source[this.pos] === '-')) {
451
+ this.pos++; this.column++;
452
+ }
453
+ if (this.pos < this.length && this.source[this.pos] >= '0' && this.source[this.pos] <= '9') {
454
+ while (this.pos < this.length && this.source[this.pos] >= '0' && this.source[this.pos] <= '9') {
455
+ this.pos++; this.column++;
456
+ }
457
+ } else {
458
+ // No digits after exponent — backtrack
459
+ this.pos = savedPos;
460
+ this.column = savedCol;
413
461
  }
414
- } else {
415
- // No digits after exponent — backtrack, treat 'e' as separate token
416
- this.pos = savedPos;
417
- this.column = savedCol;
418
462
  }
419
463
  }
420
464
 
465
+ let numStr = this.source.substring(numStart, this.pos);
466
+ if (hasUnderscore) numStr = numStr.replace(/_/g, '');
467
+ value = numStr;
421
468
  this.tokens.push(new Token(TokenType.NUMBER, parseFloat(value), startLine, startCol));
422
469
  }
423
470
 
@@ -436,17 +483,7 @@ export class Lexer {
436
483
  if (this.pos >= this.length) {
437
484
  this.error('Unterminated string');
438
485
  }
439
- const esc = this.advance();
440
- switch (esc) {
441
- case 'n': current += '\n'; break;
442
- case 't': current += '\t'; break;
443
- case 'r': current += '\r'; break;
444
- case '\\': current += '\\'; break;
445
- case '"': current += '"'; break;
446
- case '{': current += '{'; break;
447
- case '}': current += '}'; break;
448
- default: current += '\\' + esc;
449
- }
486
+ current += this._processEscape(this.advance());
450
487
  continue;
451
488
  }
452
489
 
@@ -508,8 +545,12 @@ export class Lexer {
508
545
  if (this._depth + 1 > Lexer.MAX_INTERPOLATION_DEPTH) {
509
546
  this.error('String interpolation nested too deeply (max ' + Lexer.MAX_INTERPOLATION_DEPTH + ' levels)');
510
547
  }
511
- const subLexer = new Lexer(exprSource, this.filename, exprStartLine, exprStartCol, this._depth + 1);
512
- const exprTokens = subLexer.tokenize();
548
+ if (!this._subLexer) {
549
+ this._subLexer = new Lexer(exprSource, this.filename, exprStartLine, exprStartCol, this._depth + 1);
550
+ } else {
551
+ this._subLexer.reset(exprSource, exprStartLine, exprStartCol);
552
+ }
553
+ const exprTokens = this._subLexer.tokenize();
513
554
  // Remove the EOF token
514
555
  exprTokens.pop();
515
556
 
@@ -566,17 +607,7 @@ export class Lexer {
566
607
  if (this.pos >= this.length) {
567
608
  this.error('Unterminated multiline string');
568
609
  }
569
- const esc = this.advance();
570
- switch (esc) {
571
- case 'n': current += '\n'; break;
572
- case 't': current += '\t'; break;
573
- case 'r': current += '\r'; break;
574
- case '\\': current += '\\'; break;
575
- case '"': current += '"'; break;
576
- case '{': current += '{'; break;
577
- case '}': current += '}'; break;
578
- default: current += '\\' + esc;
579
- }
610
+ current += this._processEscape(this.advance());
580
611
  continue;
581
612
  }
582
613
 
@@ -591,30 +622,30 @@ export class Lexer {
591
622
  const exprStartLine = this.line - 1;
592
623
  const exprStartCol = this.column - 1;
593
624
  let depth = 1;
594
- let exprSource = '';
625
+ const exprParts = [];
595
626
  while (this.pos < this.length && depth > 0) {
596
627
  const ch = this.peek();
597
628
  if (ch === '"' || ch === "'" || ch === '`') {
598
629
  const quote = ch;
599
- exprSource += this.advance();
630
+ exprParts.push(this.advance());
600
631
  let strDepth = 0;
601
632
  while (this.pos < this.length) {
602
633
  if (this.peek() === '\\') {
603
- exprSource += this.advance();
604
- if (this.pos < this.length) exprSource += this.advance();
634
+ exprParts.push(this.advance());
635
+ if (this.pos < this.length) exprParts.push(this.advance());
605
636
  } else if (quote === '"' && this.peek() === '{') {
606
637
  strDepth++;
607
- exprSource += this.advance();
638
+ exprParts.push(this.advance());
608
639
  } else if (quote === '"' && this.peek() === '}' && strDepth > 0) {
609
640
  strDepth--;
610
- exprSource += this.advance();
641
+ exprParts.push(this.advance());
611
642
  } else if (this.peek() === quote && strDepth === 0) {
612
643
  break;
613
644
  } else {
614
- exprSource += this.advance();
645
+ exprParts.push(this.advance());
615
646
  }
616
647
  }
617
- if (this.pos < this.length) exprSource += this.advance();
648
+ if (this.pos < this.length) exprParts.push(this.advance());
618
649
  continue;
619
650
  }
620
651
  if (ch === '{') depth++;
@@ -622,8 +653,9 @@ export class Lexer {
622
653
  depth--;
623
654
  if (depth === 0) break;
624
655
  }
625
- exprSource += this.advance();
656
+ exprParts.push(this.advance());
626
657
  }
658
+ const exprSource = exprParts.join('');
627
659
 
628
660
  if (this.peek() !== '}') {
629
661
  this.error('Unterminated string interpolation in multiline string');
@@ -633,8 +665,12 @@ export class Lexer {
633
665
  if (this._depth + 1 > Lexer.MAX_INTERPOLATION_DEPTH) {
634
666
  this.error('String interpolation nested too deeply (max ' + Lexer.MAX_INTERPOLATION_DEPTH + ' levels)');
635
667
  }
636
- const subLexer = new Lexer(exprSource, this.filename, exprStartLine, exprStartCol, this._depth + 1);
637
- const exprTokens = subLexer.tokenize();
668
+ if (!this._subLexer) {
669
+ this._subLexer = new Lexer(exprSource, this.filename, exprStartLine, exprStartCol, this._depth + 1);
670
+ } else {
671
+ this._subLexer.reset(exprSource, exprStartLine, exprStartCol);
672
+ }
673
+ const exprTokens = this._subLexer.tokenize();
638
674
  exprTokens.pop();
639
675
 
640
676
  parts.push({ type: 'expr', tokens: exprTokens, source: exprSource });
@@ -760,15 +796,7 @@ export class Lexer {
760
796
  if (this.pos >= this.length) {
761
797
  this.error('Unterminated string');
762
798
  }
763
- const esc = this.advance();
764
- switch (esc) {
765
- case 'n': value += '\n'; break;
766
- case 't': value += '\t'; break;
767
- case 'r': value += '\r'; break;
768
- case '\\': value += '\\'; break;
769
- case "'": value += "'"; break;
770
- default: value += '\\' + esc;
771
- }
799
+ value += this._processEscape(this.advance());
772
800
  } else {
773
801
  value += this.advance();
774
802
  }
@@ -821,7 +849,7 @@ export class Lexer {
821
849
 
822
850
  // Read flags
823
851
  let flags = '';
824
- while (this.pos < this.length && /[gimsuydv]/.test(this.peek())) {
852
+ while (this.pos < this.length && Lexer.REGEX_FLAG_RE.test(this.peek())) {
825
853
  flags += this.advance();
826
854
  }
827
855
 
@@ -831,19 +859,31 @@ export class Lexer {
831
859
  scanIdentifier() {
832
860
  const startLine = this.line;
833
861
  const startCol = this.column;
834
- let value = '';
862
+ const startPos = this.pos;
835
863
 
836
- while (this.pos < this.length && this.isAlphaNumeric(this.peek())) {
837
- value += this.advance();
864
+ // Fast path: scan ASCII identifier using index advancement (no string concat)
865
+ while (this.pos < this.length) {
866
+ const ch = this.source[this.pos];
867
+ if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch === '_' || (ch >= '0' && ch <= '9')) {
868
+ this.pos++;
869
+ this.column++;
870
+ } else if (ch > '\x7f' && Lexer.UNICODE_ALPHANUM_RE.test(ch)) {
871
+ this.pos++;
872
+ this.column++;
873
+ } else {
874
+ break;
875
+ }
838
876
  }
877
+ const value = this.source.substring(startPos, this.pos);
839
878
 
840
879
  // Raw string: r"no\escapes"
841
880
  if (value === 'r' && this.pos < this.length && this.peek() === '"') {
842
881
  this.advance(); // opening "
843
- let raw = '';
882
+ const rawParts = [];
844
883
  while (this.pos < this.length && this.peek() !== '"') {
845
- raw += this.advance();
884
+ rawParts.push(this.advance());
846
885
  }
886
+ const raw = rawParts.join('');
847
887
  if (this.pos >= this.length) {
848
888
  this.error('Unterminated raw string');
849
889
  }
@@ -1049,10 +1089,7 @@ export class Lexer {
1049
1089
  this._jsxTagMode = 'open';
1050
1090
  } else {
1051
1091
  const prev = this.tokens.length > 1 ? this.tokens[this.tokens.length - 2] : null;
1052
- const valueTypes = [TokenType.IDENTIFIER, TokenType.NUMBER, TokenType.STRING,
1053
- TokenType.STRING_TEMPLATE, TokenType.RPAREN, TokenType.RBRACKET, TokenType.RBRACE,
1054
- TokenType.TRUE, TokenType.FALSE, TokenType.NIL];
1055
- if (!prev || !valueTypes.includes(prev.type)) {
1092
+ if (!prev || !Lexer.VALUE_TOKEN_TYPES.has(prev.type)) {
1056
1093
  this._jsxTagMode = 'open';
1057
1094
  }
1058
1095
  }
@@ -1139,8 +1176,19 @@ export class Lexer {
1139
1176
  }
1140
1177
  break;
1141
1178
 
1179
+ case '@':
1180
+ this.tokens.push(new Token(TokenType.AT, '@', startLine, startCol));
1181
+ break;
1182
+
1142
1183
  default:
1143
1184
  this.error(`Unexpected character: '${ch}'`);
1144
1185
  }
1145
1186
  }
1146
1187
  }
1188
+
1189
+ // Initialize static Set after class definition (depends on TokenType)
1190
+ Lexer.VALUE_TOKEN_TYPES = new Set([
1191
+ TokenType.IDENTIFIER, TokenType.NUMBER, TokenType.STRING,
1192
+ TokenType.STRING_TEMPLATE, TokenType.RPAREN, TokenType.RBRACKET, TokenType.RBRACE,
1193
+ TokenType.TRUE, TokenType.FALSE, TokenType.NIL
1194
+ ]);
@@ -157,6 +157,9 @@ export const TokenType = {
157
157
  // Regex
158
158
  REGEX: 'REGEX', // /pattern/flags
159
159
 
160
+ // Decorators
161
+ AT: 'AT', // @
162
+
160
163
  // Special
161
164
  EOF: 'EOF',
162
165
  DOCSTRING: 'DOCSTRING', // /// comment