utkrisht 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+
2
+
3
+ aaa bbb, ccc = 10
4
+
@@ -0,0 +1,15 @@
1
/**
 * Reserved words of the language.
 *
 * The lexer looks identifiers up in this set; a match is emitted as a token
 * whose type is the word with its first letter capitalised instead of a
 * plain "Identifier" token.
 */
export const keywords = new Set([
    // Error handling
    "try", "fix",
    // Branching and looping
    "when", "else", "loop", "with",
    // Boolean literals
    "right", "wrong",
    // Modules
    "import", "export",
    // Control transfer
    "exit", "stop", "skip",
]);
@@ -0,0 +1,602 @@
1
+ import { error } from "./logger.js";
2
+ import { keywords } from "./keywords.js";
3
+
4
/**
 * Creates the mutable state object shared by all lexing routines.
 *
 * @param {string} source - Program text to tokenize.
 * @returns {{source: string, position: number, line: number, sourceLength: number, indentWidth: number, nestingDepth: number}}
 *   Fresh lexer state positioned at the first character of line 1.
 */
export function createLexer(source) {
    const state = {
        source,
        // Index of the next unread character in `source`
        position: 0,
        // 1-based line number used in tokens and error reports
        line: 1,
        sourceLength: source.length,
        // Spaces per indentation level; 0 until the first indented line is seen
        indentWidth: 0,
        // Current indentation level; tells the lexer how many Dedent tokens
        // to emit when returning to outer scopes
        nestingDepth: 0,
    };

    return state;
}
16
+
17
/**
 * Tells whether the lexer has consumed the whole source.
 *
 * @param {{position: number, sourceLength: number}} lexer - Lexer state.
 * @returns {boolean} True when `position` is at or beyond `sourceLength`.
 */
function isAtEnd(lexer) {
    const { position, sourceLength } = lexer;
    return position >= sourceLength;
}
20
+
21
/**
 * Tests the character at the lexer's current position without consuming it.
 *
 * @param {object} lexer - Lexer state (`source`, `position`, `sourceLength`).
 * @param {string|function(string): *} expected - Either the exact character
 *   to compare against, or a predicate that receives the current character.
 * @returns {*} False at end of input; otherwise the predicate's result, or
 *   the boolean outcome of the strict comparison.
 */
function isCurrentCharacter(lexer, expected) {
    if (isAtEnd(lexer)) {
        return false;
    }

    const current = lexer.source[lexer.position];
    return typeof expected === "function" ? expected(current) : current === expected;
}
32
+
33
/**
 * Tells whether `character` is exactly one ASCII decimal digit.
 *
 * A range comparison is used instead of `"0123456789".includes(...)`, because
 * `includes` is a substring test: it also accepts the empty string and
 * multi-character runs such as "12".
 *
 * @param {*} character - Value to classify; normally a one-character string.
 * @returns {boolean} True only for the strings "0" through "9".
 */
function isDigit(character) {
    return typeof character === "string"
        && character.length === 1
        && character >= "0"
        && character <= "9";
}
36
+
37
/**
 * Tells whether `character` is exactly one ASCII lowercase letter.
 *
 * A range comparison is used instead of a substring lookup, which would also
 * accept the empty string and runs such as "abc".
 *
 * @param {*} character - Value to classify; normally a one-character string.
 * @returns {boolean} True only for the strings "a" through "z".
 */
function isSmallAlphabet(character) {
    return typeof character === "string"
        && character.length === 1
        && character >= "a"
        && character <= "z";
}
40
+
41
/**
 * Tells whether `character` is exactly one ASCII uppercase letter.
 *
 * A range comparison is used instead of a substring lookup, which would also
 * accept the empty string and runs such as "ABC".
 *
 * @param {*} character - Value to classify; normally a one-character string.
 * @returns {boolean} True only for the strings "A" through "Z".
 */
function isBigAlphabet(character) {
    return typeof character === "string"
        && character.length === 1
        && character >= "A"
        && character <= "Z";
}
44
+
45
/**
 * Tells whether `character` may appear inside an identifier: a lowercase
 * letter, a digit, or a hyphen.
 *
 * @param {string} character - Character to classify.
 * @returns {boolean} True when the character is an identifier character.
 */
function isAlphaNumeric(character) {
    if (isSmallAlphabet(character)) {
        return true;
    }
    if (isDigit(character)) {
        return true;
    }
    // Hyphens are accepted so identifiers can be written in kebab-case
    return character === "-";
}
48
+
49
+
50
+
51
/**
 * Lexes the string literal whose opening quote is at `lexer.position`.
 *
 * A quote followed only by spaces and a line break (LF or CRLF) opens a
 * multiline string; anything else is a single-line string.
 *
 * @param {object} utkrisht - Compiler state passed through to `error`.
 * @param {object} lexer - Lexer state; `position`, `line` and possibly
 *   `indentWidth` are advanced/updated.
 * @returns {{type: string, lexeme: string, line: number}|undefined}
 *   A "StringLiteral" token attributed to the line of the opening quote, or
 *   `undefined` after an error has been reported.
 */
function lexString(utkrisht, lexer) {
    const stringStartLine = lexer.line;
    // Skip opening quote
    lexer.position++;

    // Look ahead for "spaces followed by a line break"
    let lookahead = lexer.position;
    while (lexer.source[lookahead] === " ") {
        lookahead++;
    }
    // The rest of the lexer accepts CRLF line endings, so accept them here too
    if (lexer.source[lookahead] === "\r" && lexer.source[lookahead + 1] === "\n") {
        lookahead++;
    }

    if (lexer.source[lookahead] !== "\n") {
        return lexSingleLineString(utkrisht, lexer, stringStartLine);
    }

    // Skip the spaces and the first line break
    lexer.position = lookahead + 1;
    lexer.line++;
    return lexMultilineString(utkrisht, lexer, stringStartLine);
}

// Reads up to the closing quote on the same line; `lexer.position` is just after the opening quote.
function lexSingleLineString(utkrisht, lexer, stringStartLine) {
    const start = lexer.position;

    while (!isCurrentCharacter(lexer, '"')) {
        if (isAtEnd(lexer)) {
            error(utkrisht, "Unterminated string", stringStartLine);
            return undefined;
        }

        if (lexer.source[lexer.position] === "\n") {
            error(utkrisht, "Single line strings cannot contain a new line.", stringStartLine);
            return undefined;
        }

        lexer.position++;
    }

    const lexeme = lexer.source.slice(start, lexer.position);

    // Skip closing quote
    lexer.position++;

    return { type: "StringLiteral", lexeme, line: stringStartLine };
}

// Reads indented content lines until a line holding only the closing quote; `lexer.position` is at the first content line.
function lexMultilineString(utkrisht, lexer, stringStartLine) {
    const lines = [];

    // No indentation seen yet in this file: infer the width from the first content line
    if (lexer.indentWidth === 0) {
        let probe = lexer.position;
        while (lexer.source[probe] === " ") {
            lexer.indentWidth++;
            probe++;
        }
    }

    // Indentation the closing quote must have (same level as the enclosing code)
    const requiredClosingQuoteOffset = lexer.nestingDepth * lexer.indentWidth;
    // Indentation stripped from every content line (one level deeper)
    const requiredContentOffset = requiredClosingQuoteOffset + lexer.indentWidth;

    while (true) {
        if (isAtEnd(lexer)) {
            error(utkrisht, "Unterminated multiline string", stringStartLine);
            return undefined;
        }

        // A line whose first non-space character is a quote is the closing line
        let closingCandidate = lexer.position;
        while (lexer.source[closingCandidate] === " ") {
            closingCandidate++;
        }

        if (lexer.source[closingCandidate] === '"') {
            const closingQuoteOffset = closingCandidate - lexer.position;
            if (closingQuoteOffset !== requiredClosingQuoteOffset) {
                error(utkrisht, "Closing quote indentation must match the block level.", lexer.line);
                return undefined;
            }
            // Move past the quote
            lexer.position = closingCandidate + 1;
            break;
        }

        // Validate leading spaces for the current content line
        let spaceCount = 0;
        while (spaceCount < requiredContentOffset && lexer.source[lexer.position + spaceCount] === " ") {
            spaceCount++;
        }

        if (spaceCount < requiredContentOffset) {
            error(utkrisht, `Insufficient indentation for multiline string. Expected ${requiredContentOffset} spaces.`, lexer.line);
            return undefined;
        }

        // Capture from after the required indentation until the end of the line
        lexer.position += requiredContentOffset;
        const lineStart = lexer.position;
        while (!isAtEnd(lexer) && lexer.source[lexer.position] !== "\n") {
            if (lexer.source[lexer.position] === '"') {
                error(utkrisht, "Closing quote must be on its own line for multiline strings.", lexer.line);
                return undefined;
            }
            lexer.position++;
        }

        // Keep the "\r" of a CRLF terminator out of the string content
        let lineEnd = lexer.position;
        if (lineEnd > lineStart && lexer.source[lineEnd] === "\n" && lexer.source[lineEnd - 1] === "\r") {
            lineEnd--;
        }
        lines.push(lexer.source.slice(lineStart, lineEnd));

        if (isCurrentCharacter(lexer, "\n")) {
            lexer.line++;
            lexer.position++;
        }
    }

    return { type: "StringLiteral", lexeme: lines.join("\n"), line: stringStartLine };
}
170
+
171
/**
 * Lexes a numeric literal starting at `lexer.position` (the first digit; any
 * sign has already been consumed by the caller).
 *
 * A "." is treated as a decimal point only when a digit follows it, so input
 * such as `1.x` yields the number `1` and leaves the dot to be lexed as its
 * own token instead of producing the lexeme `1.`.
 *
 * @param {object} lexer - Lexer state; `position` is advanced past the number.
 * @param {boolean} [isNegative=false] - When true, "-" is prepended to the lexeme.
 * @returns {{type: string, lexeme: string, line: number}} A "NumericLiteral" token.
 */
function lexNumber(lexer, isNegative = false) {
    const numberStartPosition = lexer.position;

    // Integer part
    while (isCurrentCharacter(lexer, isDigit)) {
        lexer.position++;
    }

    // Optional fractional part: the dot must be followed by at least one digit
    if (isCurrentCharacter(lexer, ".") && isDigit(lexer.source[lexer.position + 1])) {
        lexer.position++;
        while (isCurrentCharacter(lexer, isDigit)) {
            lexer.position++;
        }
    }

    const digits = lexer.source.slice(numberStartPosition, lexer.position);
    const sign = isNegative ? "-" : "";

    return { type: "NumericLiteral", lexeme: sign + digits, line: lexer.line };
}
188
+
189
+
190
+
191
/**
 * Lexes an identifier or keyword starting at `lexer.position`.
 *
 * @param {object} lexer - Lexer state; `position` is advanced past the word.
 * @returns {{type: string, lexeme: string, line: number}} An "Identifier"
 *   token, or, when the word is in `keywords`, a token whose type is the word
 *   with its first letter capitalised (for example "when" gives "When").
 */
function lexIdentifier(lexer) {
    const start = lexer.position;
    while (isCurrentCharacter(lexer, isAlphaNumeric)) {
        lexer.position += 1;
    }

    const lexeme = lexer.source.slice(start, lexer.position);
    const type = keywords.has(lexeme)
        ? lexeme[0].toUpperCase() + lexeme.slice(1)
        : "Identifier";

    return { type, lexeme, line: lexer.line };
}
207
+
208
+
209
+
210
/**
 * Consumes the "\n" at `lexer.position` plus the leading spaces of the next
 * line, and turns the change of indentation into layout tokens.
 *
 * @param {object} utkrisht - Compiler state passed through to `error`.
 * @param {object} lexer - Lexer state; `position`, `line`, `indentWidth` and
 *   `nestingDepth` may all be updated.
 * @returns {object|object[]|undefined}
 *   - `undefined` when the next line is blank, a comment, the end of input,
 *     or an indentation error was reported;
 *   - an "Indent" token when the next line is one level deeper;
 *   - an array of "Dedent" tokens, one per level closed;
 *   - a "NewLine" token when the indentation level is unchanged.
 */
function lexNewLine(utkrisht, lexer) {
    // A NewLine token is attributed to the line it terminates
    const terminatedLine = lexer.line;
    lexer.line += 1;
    lexer.position += 1;

    // isCurrentCharacter is already false at end of input
    let leadingSpaces = 0;
    while (isCurrentCharacter(lexer, " ")) {
        leadingSpaces += 1;
        lexer.position += 1;
    }

    // Blank line: leave the "\n" for the next call
    if (isCurrentCharacter(lexer, "\n")) {
        return undefined;
    }

    // Blank line ending in CRLF: step over the "\r" and leave the "\n"
    if (isCurrentCharacter(lexer, "\r")) {
        lexer.position += 1;
        if (!isCurrentCharacter(lexer, "\n")) {
            error(utkrisht, "Carriage return must be followed by a NewLine character.", lexer.line);
        }
        return undefined;
    }

    // Trailing newline at end of file, or a line holding only a comment
    if (isAtEnd(lexer) || isCurrentCharacter(lexer, "#")) {
        return undefined;
    }

    // The first indented line of the file fixes the indentation width
    if (lexer.indentWidth === 0 && leadingSpaces > 0) {
        lexer.indentWidth = leadingSpaces;
    }

    // With no width established yet, the line is at the margin (level 0)
    const indentLevel = lexer.indentWidth === 0 ? 0 : leadingSpaces / lexer.indentWidth;

    // Indentation must be a whole multiple of `indentWidth`
    if (!Number.isInteger(indentLevel)) {
        error(utkrisht, `Invalid indentation level. Please indent your code consistently with ${lexer.indentWidth} spaces.`, lexer.line);
        return undefined;
    }

    // Same level: the newline acts as a statement terminator
    if (indentLevel === lexer.nestingDepth) {
        return { type: "NewLine", lexeme: "\n", line: terminatedLine };
    }

    // Deeper: only one level at a time may be opened
    if (indentLevel > lexer.nestingDepth) {
        if (indentLevel > lexer.nestingDepth + 1) {
            error(utkrisht, "Cannot indent multiple levels at once.", lexer.line);
            return undefined;
        }

        lexer.nestingDepth += 1;
        return { type: "Indent", lexeme: "++++", line: lexer.line };
    }

    // Shallower: close every level down to the new one
    const dedents = [];
    while (indentLevel < lexer.nestingDepth) {
        lexer.nestingDepth -= 1;
        dedents.push({ type: "Dedent", lexeme: "----", line: lexer.line });
    }
    return dedents;
}
287
+
288
/**
 * Skips a comment: advances `lexer.position` up to, but not past, the next
 * "\n" or the end of input.
 *
 * @param {object} lexer - Lexer state; only `position` is changed.
 * @returns {undefined}
 */
function lexComent(lexer) {
    while (!(isAtEnd(lexer) || isCurrentCharacter(lexer, "\n"))) {
        lexer.position += 1;
    }
}
293
+
294
/**
 * Lexes a "," and swallows the layout that follows it, so that a list may
 * continue on the next line.
 *
 * @param {object} utkrisht - Compiler state passed through to `error`.
 * @param {object} lexer - Lexer state positioned on the comma.
 * @returns {object[]} The "Comma" token, optionally followed by the Indent
 *   or Dedent tokens produced by a following line break.
 */
function lexComma(utkrisht, lexer) {
    const tokens = [{ type: "Comma", lexeme: ",", line: lexer.line }];
    lexer.position++;

    return lexTrailingLineBreaks(utkrisht, lexer, tokens);
}


/**
 * Lexes an opening bracket and swallows the layout that follows it, so that
 * the bracket's content may start on the next line.
 *
 * @param {object} utkrisht - Compiler state passed through to `error`.
 * @param {object} lexer - Lexer state positioned on the bracket.
 * @param {string} type - Token type to emit for the bracket.
 * @param {string} character - The bracket character, used as the lexeme.
 * @returns {object[]} The bracket token, optionally followed by the Indent
 *   or Dedent tokens produced by a following line break.
 */
function lexOpenBracket(utkrisht, lexer, type, character) {
    const tokens = [{ type, lexeme: character, line: lexer.line }];
    lexer.position++;

    return lexTrailingLineBreaks(utkrisht, lexer, tokens);
}

// Skips spaces, comments and line breaks after a continuation token, appending
// any Indent/Dedent tokens to `tokens` while dropping NewLine terminators.
function lexTrailingLineBreaks(utkrisht, lexer, tokens) {
    while (true) {
        // Ignore spaces and comments on the current line
        while (true) {
            if (isCurrentCharacter(lexer, " ")) {
                lexer.position++;
            } else if (isCurrentCharacter(lexer, "#")) {
                lexComent(lexer);
            } else {
                break;
            }
        }

        if (isCurrentCharacter(lexer, "\r")) {
            lexer.position++;
            if (!isCurrentCharacter(lexer, "\n")) {
                error(utkrisht, "Carriage return must be followed by a NewLine character.", lexer.line);
            }
        }

        if (!isCurrentCharacter(lexer, "\n")) {
            break;
        }

        const whiteSpaceTokens = lexNewLine(utkrisht, lexer);

        // Dedents arrive as an array; a change of level ends the continuation
        if (Array.isArray(whiteSpaceTokens)) {
            tokens.push(...whiteSpaceTokens);
            break;
        }

        // An Indent also ends it. `undefined` (blank or comment line) and
        // NewLine are ignored, and scanning resumes on the following line.
        if (whiteSpaceTokens !== undefined && whiteSpaceTokens.type !== "NewLine") {
            tokens.push(whiteSpaceTokens);
            break;
        }
    }

    return tokens;
}
402
+
403
// Opening brackets: lexed together with the layout that follows them.
const OPEN_BRACKET_TOKEN_TYPES = new Map([
    ["(", "LeftRoundBracket"],
    ["[", "LeftSquareBracket"],
    ["{", "LeftCurlyBracket"],
]);

// Characters that always form a one-character token on their own.
const SINGLE_CHARACTER_TOKEN_TYPES = new Map([
    [")", "RightRoundBracket"],
    ["]", "RightSquareBracket"],
    ["}", "RightCurlyBracket"],
    [".", "Dot"],
    [":", "Colon"],
    ["~", "Tilde"],
    ["=", "Equal"],
    ["<", "LessThan"],
    [">", "MoreThan"],
    ["@", "At"],
    ["$", "Dollar"],
    ["&", "And"],
    ["*", "Asterisk"],
    ["/", "Slash"],
    ["|", "Bar"],
    ["\\", "BackSlash"],
]);

/**
 * Lexes whatever starts at `lexer.position` and advances past it.
 *
 * @param {object} utkrisht - Compiler state passed through to `error`.
 * @param {object} lexer - Lexer state; must not be at end of input.
 * @returns {object|object[]|undefined} A single token, an array of tokens
 *   (brackets, commas and dedents may yield several), or `undefined` when
 *   the input produced no token (spaces, comments, blank lines, errors).
 */
function lexToken(utkrisht, lexer) {
    const character = lexer.source[lexer.position];

    const openBracketType = OPEN_BRACKET_TOKEN_TYPES.get(character);
    if (openBracketType !== undefined) {
        return lexOpenBracket(utkrisht, lexer, openBracketType, character);
    }

    const singleCharacterType = SINGLE_CHARACTER_TOKEN_TYPES.get(character);
    if (singleCharacterType !== undefined) {
        lexer.position++;
        return { type: singleCharacterType, lexeme: character, line: lexer.line };
    }

    switch (character) {
        case ",":
            return lexComma(utkrisht, lexer);
        case "#":
            lexComent(lexer);
            return undefined;
        // A sign immediately followed by a digit is folded into the number,
        // so `1-2` lexes as two numeric literals.
        // NOTE(review): confirm the parser expects this.
        case "+":
            lexer.position++;
            if (isCurrentCharacter(lexer, isDigit)) {
                return lexNumber(lexer);
            }
            return { type: "Plus", lexeme: character, line: lexer.line };
        case "-":
            lexer.position++;
            if (isCurrentCharacter(lexer, isDigit)) {
                return lexNumber(lexer, /* isNegative */ true);
            }
            return { type: "Minus", lexeme: character, line: lexer.line };
        case " ":
            lexer.position++;
            return undefined;
        case "\r":
            lexer.position++;
            if (isCurrentCharacter(lexer, "\n")) {
                return lexNewLine(utkrisht, lexer);
            }
            // Only the "\r" is discarded; the character after it is still
            // lexed on the next call instead of being silently dropped.
            error(utkrisht, "Carriage return must be followed by a NewLine character.", lexer.line);
            return undefined;
        case "\n":
            return lexNewLine(utkrisht, lexer);
        case "\t":
            error(utkrisht, "Utkrisht does not support tabs for indentation. Please use spaces.", lexer.line);
            lexer.position++;
            return undefined;
        case "!":
            lexer.position++;

            if (isCurrentCharacter(lexer, "=")) {
                lexer.position++;
                return { type: "ExclamationMarkEqual", lexeme: "!=", line: lexer.line };
            }
            if (isCurrentCharacter(lexer, "<")) {
                lexer.position++;
                return { type: "ExclamationMarkLessThan", lexeme: "!<", line: lexer.line };
            }
            if (isCurrentCharacter(lexer, ">")) {
                lexer.position++;
                return { type: "ExclamationMarkMoreThan", lexeme: "!>", line: lexer.line };
            }
            return { type: "ExclamationMark", lexeme: character, line: lexer.line };
        case '"':
            return lexString(utkrisht, lexer);
        default:
            if (isDigit(character)) {
                return lexNumber(lexer);
            }
            if (isSmallAlphabet(character)) {
                return lexIdentifier(lexer);
            }
            if (isBigAlphabet(character)) {
                error(utkrisht, "Big Letters are not allowed in identifiers", lexer.line);
                lexer.position++;
                return undefined;
            }
            error(utkrisht, "Invalid character `" + character + "`", lexer.line);
            lexer.position++;
            return undefined;
    }
}
535
+
536
+
537
/**
 * Tokenizes the whole source held by `lexer`.
 *
 * Leading blank lines and comments are skipped first. The remaining input is
 * lexed token by token, every still-open indentation level is closed with a
 * Dedent token, and an EndOfFile token is appended.
 *
 * @param {object} utkrisht - Compiler state passed through to `error`.
 * @param {object} lexer - Lexer state, normally fresh from `createLexer`.
 * @returns {object[]} The token stream, always ending with an "EndOfFile" token.
 */
export function lex(utkrisht, lexer) {
    const tokens = [];

    // Skip blank lines and comments before the first line of code, tracking
    // how far that first line is indented.
    let leadingSpaces = 0;
    while (!isAtEnd(lexer)) {
        if (isCurrentCharacter(lexer, " ")) {
            leadingSpaces++;
            lexer.position++;
        } else if (isCurrentCharacter(lexer, "\n")) {
            lexer.line++;
            lexer.position++;
            leadingSpaces = 0;
        } else if (isCurrentCharacter(lexer, "\r")) {
            lexer.position++;
            if (isCurrentCharacter(lexer, "\n")) {
                lexer.line++;
                lexer.position++;
                leadingSpaces = 0;
            } else {
                // Only the "\r" is discarded; the character after it is
                // examined by the next iteration instead of being skipped.
                error(utkrisht, "Carriage return must be followed by a NewLine character.", lexer.line);
            }
        } else if (isCurrentCharacter(lexer, "#")) {
            leadingSpaces = 0;
            while (!isAtEnd(lexer) && !isCurrentCharacter(lexer, "\n")) {
                lexer.position++;
            }
        } else {
            break;
        }
    }

    // The first line of code must start at the margin
    if (leadingSpaces !== 0) {
        error(utkrisht, "Invalid Indentation at the start of the file", lexer.line);
    }

    while (!isAtEnd(lexer)) {
        const token = lexToken(utkrisht, lexer);
        if (token === undefined) {
            continue;
        }
        if (Array.isArray(token)) {
            tokens.push(...token);
        } else {
            tokens.push(token);
        }
    }

    // Close every indentation level that is still open
    while (lexer.nestingDepth > 0) {
        tokens.push({ type: "Dedent", lexeme: "----", line: lexer.line });
        lexer.nestingDepth--;
    }

    // Add the last token, i.e. EndOfFile
    tokens.push({ type: "EndOfFile", line: lexer.line });

    return tokens;
}
595
+
596
+
597
+ // import { createUtkrisht } from "./utkrisht.js";
598
+
599
+ // const utkrisht = createUtkrisht();
600
+ // const lexer = createLexer("110 + +")
601
+
602
+ // console.log(JSON.stringify(lex(utkrisht, lexer), null, 4));
@@ -0,0 +1,11 @@
1
+
2
/**
 * Reports a compile error on stderr and marks the compiler state as failed.
 *
 * @param {object} utkrisht - Compiler state; its `hadError` flag is set to true.
 * @param {string} message - Human-readable description of the problem.
 * @param {number} line - Source line the error refers to.
 * @returns {undefined}
 */
export function error(utkrisht, message, line) {
    // ANSI escape sequences: red foreground, then reset
    const red = "\x1b[31m";
    const reset = "\x1b[0m";

    const heading = `${red}Error on line ${line}${reset}`;
    console.error(`${heading}: ${message}`);

    utkrisht.hadError = true;
}
9
+
10
+
11
+