english-lang 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,167 @@
1
+ "use strict";
2
+ // ============================================================
3
+ // Lexer.ts — Tokenizes .eng source into an INDENT-aware stream
4
+ //
5
+ // Token stream rules:
6
+ // - Blank lines and comment lines (--) produce NO tokens
7
+ // - Each non-blank line ends with NEWLINE
8
+ // - Indentation increases produce INDENT before the line tokens
9
+ // - Indentation decreases produce one or more DEDENT before the line tokens
10
+ // - Tabs are normalized to 4 spaces
11
+ // ============================================================
12
+ Object.defineProperty(exports, "__esModule", { value: true });
13
+ exports.Lexer = void 0;
14
/** Number of columns a tab counts for, both in indentation and in `col`. */
const TAB_WIDTH = 4;

/** Punctuation characters that each form a complete token on their own. */
const SINGLE_CHAR_TOKENS = new Map([
    [':', 'COLON'],
    ['(', 'LPAREN'],
    [')', 'RPAREN'],
    [',', 'COMMA'],
    ['.', 'DOT'],
]);

/** Matches a character that may start a WORD. */
const WORD_START = /[a-zA-Z_]/;

/** Matches a character that may continue a WORD. */
const WORD_PART = /[a-zA-Z0-9_]/;

/**
 * True for the ASCII digits '0'..'9'.
 * Safe to call with `undefined` (reading past the end of a string).
 */
const isDigit = (ch) => ch >= '0' && ch <= '9';

/** True for the two characters treated as inline whitespace. */
const isBlank = (ch) => ch === ' ' || ch === '\t';

/**
 * Line-oriented, indentation-aware tokenizer.
 *
 * Tokens are plain objects `{ type, value, line, col }`. Blank lines and
 * lines whose first non-blank characters are `--` produce no tokens; every
 * other line produces optional INDENT/DEDENT tokens, the line's own tokens,
 * and a closing NEWLINE. The stream always ends with EOF.
 */
class Lexer {
    /**
     * @param {string} source - Full text to tokenize.
     */
    constructor(source) {
        this.tokens = [];
        // Stack of active indentation widths; the base level 0 is never popped.
        this.indentStack = [0];
        this.source = source;
        // Accept both LF and CRLF line endings. Splitting on '\n' alone leaves
        // a trailing '\r' on every line, which makes blank lines look
        // non-blank and emit a spurious NEWLINE.
        this.lines = source.split(/\r?\n/);
    }

    /**
     * Tokenizes the whole source.
     *
     * @returns {Array<{type: string, value: string, line: number, col: number}>}
     *   The accumulated token list (the same array as `this.tokens`).
     */
    tokenize() {
        this.lines.forEach((text, idx) => this.processLine(text, idx + 1));

        // Close every indentation level that is still open at end of input.
        while (this.indentStack.length > 1) {
            this.indentStack.pop();
            this.push('DEDENT', '', this.lines.length, 1);
        }
        this.push('EOF', '', this.lines.length + 1, 1);
        return this.tokens;
    }

    // ── Line processing ───────────────────────────────────────

    /**
     * Handles one physical line: measures indentation, emits INDENT/DEDENT,
     * tokenizes the remaining content and appends NEWLINE.
     *
     * @param {string} raw - The line without its line terminator.
     * @param {number} lineNum - 1-based line number recorded on the tokens.
     */
    processLine(raw, lineNum) {
        // Measure leading indentation; a tab counts as TAB_WIDTH columns.
        let indent = 0;
        let i = 0;
        while (i < raw.length && isBlank(raw[i])) {
            indent += raw[i] === '\t' ? TAB_WIDTH : 1;
            i++;
        }
        const content = raw.slice(i);

        // Blank lines and full-line comments neither emit tokens nor affect
        // the indentation stack.
        if (content === '' || content.startsWith('--')) {
            return;
        }

        const current = this.indentStack[this.indentStack.length - 1];
        if (indent > current) {
            this.indentStack.push(indent);
            this.push('INDENT', '', lineNum, 1);
        } else if (indent < current) {
            // Pop until the top of the stack is no wider than this line.
            // NOTE: if `indent` matches no earlier level it is not pushed, so
            // the line is treated as belonging to the next shallower level.
            while (this.indentStack.length > 1 &&
                this.indentStack[this.indentStack.length - 1] > indent) {
                this.indentStack.pop();
                this.push('DEDENT', '', lineNum, 1);
            }
        }

        // `i + 1` is the 1-based character position of the first content char.
        this.tokenizeContent(content, lineNum, i + 1);
        this.push('NEWLINE', '\n', lineNum, raw.length + 1);
    }

    // ── Character-level tokenizer ─────────────────────────────

    /**
     * Scans the non-indentation part of a line and pushes its tokens.
     *
     * @param {string} content - Line text with leading indentation removed.
     * @param {number} lineNum - 1-based line number recorded on the tokens.
     * @param {number} startCol - Column assigned to `content[0]`.
     */
    tokenizeContent(content, lineNum, startCol) {
        let i = 0;
        let col = startCol;

        while (i < content.length) {
            const ch = content[i];
            const next = content[i + 1];

            // Inline whitespace: no token, but a tab advances col by TAB_WIDTH.
            if (isBlank(ch)) {
                col += ch === '\t' ? TAB_WIDTH : 1;
                i++;
                continue;
            }

            // Trailing comment: `--` discards the rest of the line.
            if (ch === '-' && next === '-') {
                break;
            }

            // Arrow: ->
            if (ch === '-' && next === '>') {
                this.push('ARROW', '->', lineNum, col);
                i += 2;
                col += 2;
                continue;
            }

            // String literal: "..." with no escape sequences. An unterminated
            // string runs to the end of the line.
            if (ch === '"') {
                let end = i + 1;
                while (end < content.length && content[end] !== '"') {
                    end++;
                }
                this.push('STRING', content.slice(i + 1, end), lineNum, col);
                col += end - i + 1;
                i = end + 1;
                continue;
            }

            // Possessive: 's not immediately followed by another word character.
            if (ch === '\'' && next === 's' &&
                (i + 2 >= content.length || !WORD_PART.test(content[i + 2]))) {
                this.push('APOSTROPHE_S', "'s", lineNum, col);
                i += 2;
                col += 2;
                continue;
            }

            // Number: digits with an optional fractional part. A leading minus
            // is part of the number only at the start of the content or right
            // after whitespace.
            const isNegative = ch === '-' && isDigit(next) &&
                (i === 0 || isBlank(content[i - 1]));
            if (isDigit(ch) || isNegative) {
                let end = isNegative ? i + 1 : i;
                while (isDigit(content[end])) {
                    end++;
                }
                // Take a '.' only when a digit follows, and only once, so that
                // a sentence-ending period ("... to 5.") is still emitted as
                // DOT and "1.2.3" is not swallowed as one malformed number.
                if (content[end] === '.' && isDigit(content[end + 1])) {
                    end++;
                    while (isDigit(content[end])) {
                        end++;
                    }
                }
                this.push('NUMBER', content.slice(i, end), lineNum, col);
                col += end - i;
                i = end;
                continue;
            }

            // Single-character punctuation.
            const punctuation = SINGLE_CHAR_TOKENS.get(ch);
            if (punctuation !== undefined) {
                this.push(punctuation, ch, lineNum, col);
                i++;
                col++;
                continue;
            }

            // Word: identifier or keyword.
            if (WORD_START.test(ch)) {
                let end = i;
                while (end < content.length && WORD_PART.test(content[end])) {
                    end++;
                }
                this.push('WORD', content.slice(i, end), lineNum, col);
                col += end - i;
                i = end;
                continue;
            }

            // Any other character is skipped without producing a token.
            i++;
            col++;
        }
    }

    // ── Helpers ────────────────────────────────────────────────

    /**
     * Appends one token to `this.tokens`.
     *
     * @param {string} type - Token type, e.g. 'WORD' or 'NEWLINE'.
     * @param {string} value - Token text ('' for INDENT, DEDENT and EOF).
     * @param {number} line - 1-based line number.
     * @param {number} col - Column recorded for the token.
     */
    push(type, value, line, col) {
        this.tokens.push({ type, value, line, col });
    }
}
167
+ exports.Lexer = Lexer;