@rip-lang/schema 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lexer.js ADDED
@@ -0,0 +1,438 @@
1
+ // ==========================================================================
2
+ // Schema Lexer — Tokenizer for Rip Schema Files
3
+ // ==========================================================================
4
+ //
5
+ // Tokenizes schema source into a stream of tagged tokens with
6
+ // indentation-based INDENT/OUTDENT for block structure.
7
+ //
8
+ // Design principles:
9
+ // - Every token carries .loc (location: r, c, n)
10
+ // - Indentation tracked during tokenization
11
+ // - Keywords use @ prefix for definitions and directives
12
+ // - # is a comment when preceded by whitespace, a modifier otherwise
13
+ // - Zero dependencies
14
+ //
15
+ // Token format:
16
+ // { type, value, loc }
17
+ // type — token tag string (IDENTIFIER, MODEL, etc.)
18
+ // value — parsed value (string, number, boolean)
19
+ // loc — { r: row, c: col, n: length }
20
+ //
21
+ // Author: Steve Shreeve <steve.shreeve@gmail.com>
22
+ // Date: January 2026
23
+ // ==========================================================================
24
+
25
+ // ==========================================================================
26
+ // Keyword Maps
27
+ // ==========================================================================
28
+
29
+ // Keywords that require @ prefix (definition and directive keywords)
30
// Maps the word following `@` to its token type. Both snake_case and camelCase
// spellings of a directive map to the same token type.
//
// The table has a null prototype so that lookups with arbitrary user-supplied
// words (e.g. `@constructor`, `@toString`) return undefined instead of an
// inherited Object.prototype member.
let AT_KEYWORDS = Object.assign(Object.create(null), {
  // Definitions
  'enum': 'ENUM',
  'type': 'TYPE',
  'model': 'MODEL',
  'mixin': 'MIXIN',
  'widget': 'WIDGET',
  'form': 'FORM',
  'state': 'STATE',
  'import': 'IMPORT',

  // Directives (both snake_case and camelCase accepted)
  'timestamps': 'TIMESTAMPS',
  'softDelete': 'SOFT_DELETE',
  'soft_delete': 'SOFT_DELETE',
  'include': 'INCLUDE',
  'computed': 'COMPUTED',
  'validate': 'VALIDATE',
  'index': 'INDEX',
  'pattern': 'PATTERN',
  'belongs_to': 'BELONGS_TO',
  'belongsTo': 'BELONGS_TO',
  'has_one': 'HAS_ONE',
  'hasOne': 'HAS_ONE',
  'has_many': 'HAS_MANY',
  'hasMany': 'HAS_MANY',
  'one': 'ONE',
  'many': 'MANY',
  'link': 'LINK',
  'events': 'EVENTS',
  'actions': 'ACTIONS',
});
62
+
63
+ // Regular keywords (no @ prefix needed)
64
// Maps bare words (no @ prefix) to their token type; any word not listed here
// is lexed as an IDENTIFIER.
//
// The table has a null prototype so that identifiers which happen to share a
// name with an Object.prototype member (`constructor`, `toString`, `valueOf`,
// ...) are not mistaken for keywords.
let KEYWORDS = Object.assign(Object.create(null), {
  'true': 'BOOL',
  'false': 'BOOL',
  'null': 'NULL',
  'undefined': 'UNDEFINED',
  'is': 'IS',
  'isnt': 'ISNT',
  'not': 'NOT',
  'and': 'AND',
  'or': 'OR',
});
75
+
76
+ // ==========================================================================
77
+ // Regex Patterns
78
+ // ==========================================================================
79
+
80
// All patterns are anchored with ^ and are matched against the not-yet-consumed
// remainder of the input.

// Identifier: a letter, `_` or `$`, followed by letters, digits, `_` or `$`.
let IDENTIFIER_RE = /^[a-zA-Z_$][a-zA-Z0-9_$]*/;

// Number: optional leading minus, then hex (0x), binary (0b), octal (0o) or a
// decimal with optional fraction and exponent. Case-insensitive.
let NUMBER_RE = /^-?(?:0x[\da-f]+|0b[01]+|0o[0-7]+|\d*\.?\d+(?:e[+-]?\d+)?)/i;

// Quoted strings with backslash escapes. A raw newline inside the quotes is
// accepted; a backslash immediately followed by a newline is not.
let STRING_DOUBLE_RE = /^"(?:[^"\\]|\\.)*"/;
let STRING_SINGLE_RE = /^'(?:[^'\\]|\\.)*'/;

// Regex literal: /body/flags with a non-empty body. A raw newline is excluded
// from the body so that a stray `/` can never swallow the following lines
// (which would silently desynchronise row/column tracking).
let REGEX_RE = /^\/(?:[^\/\\\n]|\\.)+\/[gimsuy]*/;

// Horizontal whitespace: any whitespace character except a newline.
let WHITESPACE_RE = /^[^\n\S]+/;

// A `#` and everything after it up to (not including) the end of the line.
let COMMENT_RE = /^#.*/;

// A single line feed.
let NEWLINE_RE = /^\n/;

// Leading spaces/tabs of a line; always matches (possibly the empty string).
let INDENT_RE = /^[ \t]*/;

// After indentation: an optional comment followed by a newline or end of input.
// Group 1 is the comment (if any), group 2 is "\n" or "" at end of input.
let BLANK_LINE_RE = /^(#[^\n]*)?(\n|$)/;
90
+
91
+ // ==========================================================================
92
+ // Single-character token map
93
+ // ==========================================================================
94
+
95
// Every single-character token uses the character itself as its token type,
// so the table is derived from the list of accepted characters.
let SINGLE_CHARS = Object.fromEntries(
  Array.from(':,.()[]{}!#?+-*/<>=&|^', (ch) => [ch, ch]),
);
103
+
104
+ // Multi-character operators (checked before single chars)
105
// Operators longer than one character, in the order they are tried. Each
// operator is emitted with itself as the token type.
let MULTI_OPS = '... ?. -> == != <= >= && || ??'.split(' ');
106
+
107
+ // ==========================================================================
108
+ // Helpers
109
+ // ==========================================================================
110
+
111
/**
 * Throw a SyntaxError annotated with a source location.
 *
 * @param {string} message - Human-readable description of the problem.
 * @param {{r?: number, c?: number, n?: number}} [loc] - Row, column and
 *   length of the offending span; defaults to row 0, column 0, length 1.
 * @throws {SyntaxError} Always; the error carries a `location` property
 *   of the form `{r, c, n}`.
 */
function syntaxError(message, {r = 0, c = 0, n = 1} = {}) {
  throw Object.assign(new SyntaxError(message), { location: {r, c, n} });
}
116
+
117
+ // ==========================================================================
118
+ // Schema Lexer
119
+ // ==========================================================================
120
+
121
/**
 * Look up `key` in `table`, ignoring anything inherited from the prototype
 * chain, so that words such as `constructor` or `toString` are never mistaken
 * for table entries.
 *
 * @param {object} table - Lookup table.
 * @param {string} key - Word to look up.
 * @returns {*} The table's own value for `key`, or undefined.
 */
function ownValue(table, key) {
  return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
}

/**
 * Convert the text of a numeric literal into a number.
 *
 * parseFloat() stops at the radix letter, so "0x1F", "0b101" and "0o17" would
 * all yield 0. Prefixed literals are therefore converted with Number(), which
 * understands the 0x / 0b / 0o prefixes but not a leading sign, so the sign is
 * handled separately.
 *
 * @param {string} text - Literal text, optionally starting with `-`.
 * @returns {number} The numeric value.
 */
function parseNumericLiteral(text) {
  const negative = text[0] === '-';
  const digits = negative ? text.slice(1) : text;
  const magnitude = /^0[xbo]/i.test(digits) ? Number(digits) : parseFloat(digits);
  return negative ? -magnitude : magnitude;
}

// Token types after which a `/` starts a regex literal rather than being
// emitted as a single '/' token.
const REGEX_PRECEDING_TYPES = new Set([
  ':', ',', '(', '[', '{', '=', '!', '->', '&&', '||', '??', 'TERMINATOR',
  'INDENT', 'PATTERN', 'RETURN',
]);

/**
 * Tokenizer for schema source text.
 *
 * setInput() tokenizes the whole input eagerly into `this.tokens`; lex() then
 * hands the tokens out one at a time and mirrors the current token into the
 * parser-facing fields `text`, `len`, `line`, `loc` and `match`.
 *
 * Each token has the shape `{ type, value, loc: { r, c, n } }`.
 */
export class SchemaLexer {
  constructor() {
    this.input = '';
    this.pos = 0;
    this._row = 0;
    this._col = 0;
    this.indentStack = [0];
    this.tokens = [];
    this.tokenIndex = 0;

    // Parser-facing state (set by lex())
    this.text = '';
    this.line = 0;
    this.len = 0;
    this.loc = {};
    this.match = '';
  }

  // --------------------------------------------------------------------------
  // Input
  // --------------------------------------------------------------------------

  /**
   * Reset the lexer and tokenize `input` immediately.
   *
   * @param {string} input - Schema source text.
   * @param {object} [ctx] - Stored on `this.ctx`; not read by the lexer itself.
   * @throws {SyntaxError} When the input contains a character that does not
   *   start any token.
   */
  setInput(input, ctx = {}) {
    this.input = input;
    this.pos = 0;
    this._row = 0;
    this._col = 0;
    this.indentStack = [0];
    this.tokens = [];
    this.tokenIndex = 0;
    this.ctx = ctx;

    this._tokenize();
  }

  // --------------------------------------------------------------------------
  // Token creation
  // --------------------------------------------------------------------------

  /**
   * Build a token, append it to `tokens` and return it.
   *
   * @param {Array} tokens - Array the token is appended to.
   * @param {string} type - Token type.
   * @param {*} value - Parsed value.
   * @param {number} r - Row.
   * @param {number} c - Column.
   * @param {number} n - Length.
   * @returns {{type: string, value: *, loc: {r: number, c: number, n: number}}}
   */
  _emit(tokens, type, value, r, c, n) {
    const token = { type, value, loc: {r, c, n} };
    tokens.push(token);
    return token;
  }

  // --------------------------------------------------------------------------
  // Tokenizer
  // --------------------------------------------------------------------------

  /**
   * Tokenize `this.input` into `this.tokens`, emitting INDENT / OUTDENT for
   * indentation changes and a TERMINATOR at line ends. Consecutive
   * TERMINATORs are collapsed into one, so blank lines do not change the
   * token stream.
   *
   * @throws {SyntaxError} On a character that does not start any token.
   */
  _tokenize() {
    const input = this.input;
    const tokens = this.tokens;
    const indentStack = [0];
    let pos = 0;
    let row = 0;
    let col = 0;
    let atLineStart = true;
    // Type of the last token that is not TERMINATOR / INDENT / OUTDENT.
    let lastSignificant = null;
    // Type of the last token of any kind.
    let lastEmitted = null;

    const emit = (type, value, r, c, n) => {
      this._emit(tokens, type, value, r, c, n);
      lastEmitted = type;
      if (type !== 'TERMINATOR' && type !== 'INDENT' && type !== 'OUTDENT') {
        lastSignificant = type;
      }
    };

    // A TERMINATOR is only meaningful after real content and never directly
    // after another TERMINATOR.
    const emitTerminator = (r, c, n) => {
      if (lastSignificant && lastEmitted !== 'TERMINATOR') {
        emit('TERMINATOR', '\n', r, c, n);
      }
    };

    while (pos < input.length) {
      const remaining = input.slice(pos);
      const startRow = row;
      const startCol = col;
      let match;

      // --- Newlines ---
      if (remaining.match(NEWLINE_RE)) {
        pos += 1;
        row += 1;
        col = 0;
        atLineStart = true;
        emitTerminator(startRow, startCol, 1);
        continue;
      }

      // --- Indentation at line start ---
      if (atLineStart) {
        const indent = remaining.match(INDENT_RE)[0].length;
        pos += indent;
        col += indent;
        atLineStart = false;

        const currentIndent = indentStack[indentStack.length - 1];
        const blankMatch = input.slice(pos).match(BLANK_LINE_RE);

        // Blank or comment-only line: skipped without emitting content
        if (blankMatch) {
          // A comment-only line at column 0 still closes every open block
          if (indent === 0 && currentIndent > 0) {
            while (indentStack.length > 1) {
              indentStack.pop();
              emit('OUTDENT', indent, row, 0, indent);
            }
            emitTerminator(row, 0, 0);
          }

          pos += blankMatch[0].length;
          if (blankMatch[2] === '\n') {
            row += 1;
            col = 0;
            atLineStart = true;
          }
          continue;
        }

        // Real content: translate the indentation change into tokens
        if (indent > currentIndent) {
          indentStack.push(indent);
          emit('INDENT', indent, row, 0, indent);
        } else if (indent < currentIndent) {
          while (indentStack.length > 1 && indentStack[indentStack.length - 1] > indent) {
            indentStack.pop();
            emit('OUTDENT', indent, row, 0, indent);
          }
          emitTerminator(row, 0, 0);
        }
        continue;
      }

      // --- Whitespace (mid-line) ---
      if ((match = remaining.match(WHITESPACE_RE))) {
        pos += match[0].length;
        col += match[0].length;
        continue;
      }

      // --- Comments (#) ---
      // # is a comment when preceded by whitespace or at the very start of the
      // input; otherwise it falls through and becomes a '#' modifier token.
      if (remaining[0] === '#') {
        const prevChar = input[pos - 1];
        if (!prevChar || /\s/.test(prevChar)) {
          match = remaining.match(COMMENT_RE);
          pos += match[0].length;
          col += match[0].length;
          continue;
        }
      }

      // --- @ keywords/directives ---
      if (remaining[0] === '@') {
        pos += 1;
        col += 1;
        const identMatch = input.slice(pos).match(IDENTIFIER_RE);
        const tokenType = identMatch ? ownValue(AT_KEYWORDS, identMatch[0]) : undefined;
        if (tokenType) {
          const word = identMatch[0];
          pos += word.length;
          col += word.length;
          emit(tokenType, word, startRow, startCol, word.length + 1);
        } else {
          // Bare @, or @ followed by a non-keyword identifier (the identifier
          // itself is lexed on the next iteration)
          emit('@', '@', startRow, startCol, 1);
        }
        continue;
      }

      // --- Multi-character operators (tried before single characters) ---
      const multiOp = MULTI_OPS.find((op) => remaining.startsWith(op));
      if (multiOp) {
        pos += multiOp.length;
        col += multiOp.length;
        emit(multiOp, multiOp, startRow, startCol, multiOp.length);
        continue;
      }

      // --- Identifiers and keywords ---
      if ((match = remaining.match(IDENTIFIER_RE))) {
        const word = match[0];
        pos += word.length;
        col += word.length;

        const tokenType = ownValue(KEYWORDS, word) || 'IDENTIFIER';
        const value = (tokenType === 'BOOL') ? (word === 'true') : word;
        emit(tokenType, value, startRow, startCol, word.length);
        continue;
      }

      // --- Numbers ---
      if ((match = remaining.match(NUMBER_RE))) {
        const text = match[0];
        pos += text.length;
        col += text.length;
        emit('NUMBER', parseNumericLiteral(text), startRow, startCol, text.length);
        continue;
      }

      // --- Strings ---
      if ((match = remaining.match(STRING_DOUBLE_RE) || remaining.match(STRING_SINGLE_RE))) {
        const str = match[0];
        pos += str.length;

        // A string may span lines; keep row/col in sync with its end
        const newlines = (str.match(/\n/g) || []).length;
        if (newlines > 0) {
          row += newlines;
          col = str.length - str.lastIndexOf('\n') - 1;
        } else {
          col += str.length;
        }

        // Remove quotes and unescape
        const value = str.slice(1, -1).replace(/\\(.)/g, (_, c) => {
          switch (c) {
            case 'n': return '\n';
            case 't': return '\t';
            case 'r': return '\r';
            case '\\': return '\\';
            case '"': return '"';
            case "'": return "'";
            default: return c;
          }
        });
        emit('STRING', value, startRow, startCol, str.length);
        continue;
      }

      // --- Regex literals ---
      // Decided on the last emitted token (not the last significant one) so
      // that a regex at the start of a line is recognised.
      const canBeRegex = !lastSignificant || REGEX_PRECEDING_TYPES.has(lastEmitted);

      if (canBeRegex && (match = remaining.match(REGEX_RE))) {
        const regex = match[0];
        pos += regex.length;
        col += regex.length;
        emit('REGEX', regex, startRow, startCol, regex.length);
        continue;
      }

      // --- Single-character tokens ---
      const char = remaining[0];
      pos += 1;
      col += 1;

      if (SINGLE_CHARS[char]) {
        emit(SINGLE_CHARS[char], char, startRow, startCol, 1);
        continue;
      }

      // Unknown character
      syntaxError(`unexpected character '${char}'`, {r: row, c: col - 1, n: 1});
    }

    // Close remaining indents
    while (indentStack.length > 1) {
      indentStack.pop();
      emit('OUTDENT', 0, row, col, 0);
    }

    // Final terminator
    emitTerminator(row, col, 0);
  }

  // --------------------------------------------------------------------------
  // Parser interface — returns one token at a time
  // --------------------------------------------------------------------------

  /**
   * Advance to the next token and expose it through `text`, `len`, `line`,
   * `loc` and `match`.
   *
   * @returns {string|false} The token type, or false once all tokens have
   *   been consumed.
   */
  lex() {
    if (this.tokenIndex >= this.tokens.length) {
      return false; // EOF
    }

    const token = this.tokens[this.tokenIndex++];
    this.text = token.value;
    // Non-string values (numbers, booleans) report a length of 1
    this.len = typeof token.value === 'string' ? token.value.length : 1;
    this.line = token.loc.r;
    this.loc = token.loc;
    this.match = String(token.value);

    return token.type;
  }

  // --------------------------------------------------------------------------
  // Error display
  // --------------------------------------------------------------------------

  /**
   * Render the source line of the current token with a caret under the
   * token's starting column.
   *
   * @returns {string} Two lines: the source line and the caret line.
   */
  showPosition() {
    const lines = this.input.split('\n');
    const currentLine = lines[this.line] || '';
    const col = this.loc?.c || 0;
    const pointer = ' '.repeat(col) + '^';
    return `${currentLine}\n${pointer}`;
  }
}

export default SchemaLexer;