tengwar-0.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tengwar/__init__.py +20 -0
- tengwar/__main__.py +8 -0
- tengwar/ast_nodes.py +351 -0
- tengwar/binary_ast.py +654 -0
- tengwar/errors.py +43 -0
- tengwar/interpreter.py +1845 -0
- tengwar/lexer.py +483 -0
- tengwar/mcp_server.py +496 -0
- tengwar/parser.py +603 -0
- tengwar/repl.py +152 -0
- tengwar/vm.py +425 -0
- tengwar-0.3.1.dist-info/METADATA +202 -0
- tengwar-0.3.1.dist-info/RECORD +17 -0
- tengwar-0.3.1.dist-info/WHEEL +5 -0
- tengwar-0.3.1.dist-info/entry_points.txt +2 -0
- tengwar-0.3.1.dist-info/licenses/LICENSE +21 -0
- tengwar-0.3.1.dist-info/top_level.txt +1 -0
tengwar/lexer.py
ADDED
@@ -0,0 +1,483 @@
"""
TENGWAR Lexer

Tokenizes TENGWAR source code. Handles both Unicode operators (primary form)
and ASCII fallbacks for environments with limited Unicode support.

Token efficiency is paramount: every token carries maximum semantic weight.
"""
from enum import Enum, auto
from dataclasses import dataclass
from typing import List
from .errors import LexError


class TokenType(Enum):
    # Delimiters
    LPAREN = auto()      # (
    RPAREN = auto()      # )
    LANGLE = auto()      # ⟨ or <|
    RANGLE = auto()      # ⟩ or |>
    LBRACKET = auto()    # ⟦ or [|
    RBRACKET = auto()    # ⟧ or |]
    LBRACE = auto()      # {
    RBRACE = auto()      # }

    # Keywords (single Unicode chars)
    LAMBDA = auto()      # λ or \
    ARROW = auto()       # → or ->
    COND = auto()        # ?
    MATCH = auto()       # ~
    SEQ = auto()         # >>
    PARALLEL = auto()    # ∥ or ||
    MODULE = auto()      # □ or []
    DEFINE = auto()      # :=
    RECURSE = auto()     # ↺ or @rec
    TYPE = auto()        # τ or :t
    PROOF = auto()       # ⊢ or |-
    EFFECT = auto()      # ⊕ or +>
    MUTATE = auto()      # μ or !
    UNIT = auto()        # ∅ or ()
    IMPORT = auto()      # ⇐ or <=

    # Operators
    PLUS = auto()        # +
    MINUS = auto()       # -
    STAR = auto()        # *
    SLASH = auto()       # /
    PERCENT = auto()     # %
    EQ = auto()          # =
    NEQ = auto()         # !=
    LT = auto()          # <
    GT = auto()          # >
    LTE = auto()         # <=
    GTE = auto()         # >=
    AND = auto()         # &
    OR = auto()          # |
    NOT = auto()         # !
    DOT = auto()         # .
    COLON = auto()       # :

    # Literals
    INT = auto()
    FLOAT = auto()
    STRING = auto()
    TEMPLATE = auto()    # $"...{expr}..." template string
    TRUE = auto()
    FALSE = auto()

    # Identifiers
    SYMBOL = auto()      # regular name
    HASH_ID = auto()     # #abc
    ADDR_REF = auto()    # @abc

    # Special
    COMMENT = auto()     # ;;
    UNDERSCORE = auto()  # _ (wildcard)
    EOF = auto()


@dataclass
class Token:
    type: TokenType
    value: str
    line: int
    col: int

    def __repr__(self):
        return f"Token({self.type.name}, {self.value!r}, {self.line}:{self.col})"


# Unicode → ASCII fallback mapping
UNICODE_MAP = {
    'λ': TokenType.LAMBDA,
    '≡': TokenType.DEFINE,
    '→': TokenType.ARROW,
    '↺': TokenType.RECURSE,
    'τ': TokenType.TYPE,
    '⊢': TokenType.PROOF,
    '⊕': TokenType.EFFECT,
    'μ': TokenType.MUTATE,
    '∅': TokenType.UNIT,
    '∥': TokenType.PARALLEL,
    '□': TokenType.MODULE,
    '⇐': TokenType.IMPORT,
    '⟨': TokenType.LANGLE,
    '⟩': TokenType.RANGLE,
    '⟦': TokenType.LBRACKET,
    '⟧': TokenType.RBRACKET,
}

KEYWORDS = {
    'true': TokenType.TRUE,
    'false': TokenType.FALSE,
    # ASCII aliases for Unicode operators
    'fn': TokenType.LAMBDA,
    'def': TokenType.DEFINE,
    'if': TokenType.COND,
    'match': TokenType.MATCH,
    'rec': TokenType.RECURSE,
    'do': TokenType.SEQ,
    'par': TokenType.PARALLEL,
    'nil': TokenType.UNIT,
    'mod': TokenType.MODULE,
    # Note: let, pipe, throw, catch, try are handled as symbols
    # by the parser since they need custom parse rules
}

SINGLE_CHAR = {
    '(': TokenType.LPAREN,
    ')': TokenType.RPAREN,
    '{': TokenType.LBRACE,
    '}': TokenType.RBRACE,
    '?': TokenType.COND,
    '~': TokenType.MATCH,
    '+': TokenType.PLUS,
    '*': TokenType.STAR,
    '/': TokenType.SLASH,
    '%': TokenType.PERCENT,
    '=': TokenType.EQ,
    '&': TokenType.AND,
    '.': TokenType.DOT,
    ':': TokenType.COLON,
    '_': TokenType.UNDERSCORE,
}


class Lexer:
    def __init__(self, source: str):
        self.source = source
        self.pos = 0
        self.line = 1
        self.col = 1
        self.tokens: List[Token] = []

    def peek(self) -> str:
        if self.pos >= len(self.source):
            return '\0'
        return self.source[self.pos]

    def peek_ahead(self, n: int = 1) -> str:
        pos = self.pos + n
        if pos >= len(self.source):
            return '\0'
        return self.source[pos]

    def advance(self) -> str:
        ch = self.source[self.pos]
        self.pos += 1
        if ch == '\n':
            self.line += 1
            self.col = 1
        else:
            self.col += 1
        return ch

    def add_token(self, type: TokenType, value: str, line: int, col: int):
        self.tokens.append(Token(type, value, line, col))

    def skip_whitespace(self):
        while self.pos < len(self.source) and self.source[self.pos] in ' \t\n\r':
            self.advance()

    def read_string(self) -> str:
        """Read a string literal (double-quoted)"""
        result = []
        self.advance()  # skip opening "
        while self.pos < len(self.source):
            ch = self.advance()
            if ch == '"':
                return ''.join(result)
            elif ch == '\\':
                if self.pos < len(self.source):
                    esc = self.advance()
                    escape_map = {'n': '\n', 't': '\t', 'r': '\r', '\\': '\\', '"': '"'}
                    result.append(escape_map.get(esc, esc))
            else:
                result.append(ch)
        raise LexError("Unterminated string literal", self.line, self.col)

    def read_number(self) -> Token:
        """Read integer or float literal"""
        start_line, start_col = self.line, self.col
        result = []
        is_float = False
        is_neg = False

        if self.peek() == '-':
            result.append(self.advance())
            is_neg = True

        while self.pos < len(self.source) and (self.source[self.pos].isdigit() or self.source[self.pos] == '.'):
            ch = self.advance()
            if ch == '.':
                if is_float:
                    raise LexError("Multiple decimal points in number", self.line, self.col)
                is_float = True
            result.append(ch)

        # Handle scientific notation
        if self.pos < len(self.source) and self.source[self.pos] in ('e', 'E'):
            is_float = True
            result.append(self.advance())
            if self.pos < len(self.source) and self.source[self.pos] in ('+', '-'):
                result.append(self.advance())
            while self.pos < len(self.source) and self.source[self.pos].isdigit():
                result.append(self.advance())

        value = ''.join(result)
        if is_float:
            return Token(TokenType.FLOAT, value, start_line, start_col)
        else:
            return Token(TokenType.INT, value, start_line, start_col)

    def read_symbol(self) -> str:
        """Read a symbol/identifier"""
        result = []
        valid_chars = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_.-')
        while self.pos < len(self.source) and self.source[self.pos] in valid_chars:
            result.append(self.advance())
        # Allow trailing ? or ! for predicates and mutators (e.g., empty?, set!)
        if self.pos < len(self.source) and self.source[self.pos] in ('?', '!'):
            result.append(self.advance())
        return ''.join(result)

    def read_comment(self) -> str:
        """Read comment until end of line"""
        result = []
        while self.pos < len(self.source) and self.source[self.pos] != '\n':
            result.append(self.advance())
        return ''.join(result).strip()

    def tokenize(self) -> List[Token]:
        """Tokenize the full source into a list of tokens"""
        while self.pos < len(self.source):
            self.skip_whitespace()
            if self.pos >= len(self.source):
                break

            ch = self.source[self.pos]
            start_line, start_col = self.line, self.col

            # Unicode operators (primary form)
            if ch in UNICODE_MAP:
                self.advance()
                self.add_token(UNICODE_MAP[ch], ch, start_line, start_col)
                continue

            # Comments: ;;
            if ch == ';' and self.peek_ahead() == ';':
                self.advance()
                self.advance()
                comment = self.read_comment()
                self.add_token(TokenType.COMMENT, comment, start_line, start_col)
                continue

            # Single ; is also a comment
            if ch == ';':
                self.advance()
                comment = self.read_comment()
                self.add_token(TokenType.COMMENT, comment, start_line, start_col)
                continue

            # Two-character operators (ASCII fallbacks and combos)
            if self.pos + 1 < len(self.source):
                two = self.source[self.pos:self.pos+2]
                if two == '->':
                    self.advance(); self.advance()
                    self.add_token(TokenType.ARROW, '→', start_line, start_col)
                    continue
                elif two == '>>':
                    self.advance(); self.advance()
                    self.add_token(TokenType.SEQ, '>>', start_line, start_col)
                    continue
                elif two == ':=':
                    self.advance(); self.advance()
                    self.add_token(TokenType.DEFINE, ':=', start_line, start_col)
                    continue
                elif two == '||':
                    self.advance(); self.advance()
                    self.add_token(TokenType.PARALLEL, '∥', start_line, start_col)
                    continue
                elif two == '|-':
                    self.advance(); self.advance()
                    self.add_token(TokenType.PROOF, '⊢', start_line, start_col)
                    continue
                elif two == '+>':
                    self.advance(); self.advance()
                    self.add_token(TokenType.EFFECT, '⊕', start_line, start_col)
                    continue
                elif two == '!=':
                    self.advance(); self.advance()
                    self.add_token(TokenType.NEQ, '!=', start_line, start_col)
                    continue
                elif two == '<=':
                    self.advance(); self.advance()
                    self.add_token(TokenType.LTE, '<=', start_line, start_col)
                    continue
                elif two == '>=':
                    self.advance(); self.advance()
                    self.add_token(TokenType.GTE, '>=', start_line, start_col)
                    continue
                elif two == '<|':
                    self.advance(); self.advance()
                    self.add_token(TokenType.LANGLE, '⟨', start_line, start_col)
                    continue
                elif two == '|>':
                    self.advance(); self.advance()
                    self.add_token(TokenType.RANGLE, '⟩', start_line, start_col)
                    continue
                elif two == '[|':
                    self.advance(); self.advance()
                    self.add_token(TokenType.LBRACKET, '⟦', start_line, start_col)
                    continue
                elif two == '|]':
                    self.advance(); self.advance()
                    self.add_token(TokenType.RBRACKET, '⟧', start_line, start_col)
                    continue

            # Backslash as lambda
            if ch == '\\':
                self.advance()
                self.add_token(TokenType.LAMBDA, 'λ', start_line, start_col)
                continue

            # Single-character tokens
            if ch in SINGLE_CHAR:
                # Special case: _ followed by . should be read as symbol for dot-indexing
                if ch == '_' and self.pos + 1 < len(self.source) and self.source[self.pos + 1] == '.':
                    name = self.read_symbol()
                    self.add_token(TokenType.SYMBOL, name, start_line, start_col)
                    continue
                self.advance()
                self.add_token(SINGLE_CHAR[ch], ch, start_line, start_col)
                continue

            # Less-than / Greater-than (single char, after checking two-char combos)
            if ch == '<':
                self.advance()
                self.add_token(TokenType.LT, '<', start_line, start_col)
                continue
            if ch == '>':
                self.advance()
                self.add_token(TokenType.GT, '>', start_line, start_col)
                continue

            # Pipe (single, after checking ||)
            if ch == '|':
                self.advance()
                self.add_token(TokenType.OR, '|', start_line, start_col)
                continue

            # Bang (single, after checking !=)
            if ch == '!':
                self.advance()
                self.add_token(TokenType.NOT, '!', start_line, start_col)
                continue

            # Hash identifiers: #abc
            if ch == '#':
                self.advance()
                name = self.read_symbol()
                if not name:
                    raise LexError("Expected identifier after #", start_line, start_col)
                self.add_token(TokenType.HASH_ID, '#' + name, start_line, start_col)
                continue

            # Address references: @abc
            if ch == '@':
                self.advance()
                name = self.read_symbol()
                if not name:
                    raise LexError("Expected identifier after @", start_line, start_col)
                self.add_token(TokenType.ADDR_REF, '@' + name, start_line, start_col)
                continue

            # Template strings: $"Hi {name}! Age: {(+ age 1)}"
            if ch == '$' and self.pos + 1 < len(self.source) and self.source[self.pos + 1] == '"':
                self.advance()  # consume $
                self.advance()  # consume "
                fmt_parts = []     # literal string parts
                expr_sources = []  # expression source strings
                current = []
                while self.pos < len(self.source) and self.source[self.pos] != '"':
                    c = self.source[self.pos]
                    if c == '\\':
                        self.advance()
                        escape_map = {'n': '\n', 't': '\t', 'r': '\r', '\\': '\\', '"': '"', '{': '{', '}': '}'}
                        if self.pos < len(self.source):
                            current.append(escape_map.get(self.source[self.pos], self.source[self.pos]))
                            self.advance()
                    elif c == '{':
                        # Start of expression
                        fmt_parts.append(''.join(current))
                        current = []
                        self.advance()  # consume {
                        depth = 1
                        expr_chars = []
                        while self.pos < len(self.source) and depth > 0:
                            ec = self.source[self.pos]
                            if ec == '{': depth += 1
                            elif ec == '}': depth -= 1
                            if depth > 0:
                                expr_chars.append(ec)
                            self.advance()
                        expr_sources.append(''.join(expr_chars))
                    else:
                        current.append(c)
                        self.advance()
                fmt_parts.append(''.join(current))
                if self.pos < len(self.source):
                    self.advance()  # consume closing "
                # Build format string with {} placeholders
                fmt_str = '{}'.join(fmt_parts)
                # Encode as: fmt_str\x00expr1\x00expr2\x00...
                value = '\x00'.join([fmt_str] + expr_sources)
                self.add_token(TokenType.TEMPLATE, value, start_line, start_col)
                continue

            # String literals
            if ch == '"':
                value = self.read_string()
                self.add_token(TokenType.STRING, value, start_line, start_col)
                continue

            # Numbers (including negative)
            if ch.isdigit():
                tok = self.read_number()
                self.tokens.append(tok)
                continue

            # Minus: could be negative number or operator
            if ch == '-':
                # Treat as negative number if followed by digit AND
                # not immediately after ( (where it would be the operator in (- a b))
                if (self.peek_ahead().isdigit() and
                        (not self.tokens or
                         self.tokens[-1].type != TokenType.LPAREN)):
                    tok = self.read_number()
                    self.tokens.append(tok)
                    continue
                else:
                    self.advance()
                    self.add_token(TokenType.MINUS, '-', start_line, start_col)
                    continue

            # Symbols / keywords
            if ch.isalpha() or ch == '_':
                name = self.read_symbol()
                if name in KEYWORDS:
                    self.add_token(KEYWORDS[name], name, start_line, start_col)
                else:
                    self.add_token(TokenType.SYMBOL, name, start_line, start_col)
                continue

            raise LexError(f"Unexpected character: {ch!r}", start_line, start_col)

        self.add_token(TokenType.EOF, '', self.line, self.col)
        return self.tokens


def tokenize(source: str) -> List[Token]:
    """Convenience function to tokenize a source string"""
    return Lexer(source).tokenize()
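
For orientation, a minimal usage sketch of the lexer above (not part of the published wheel). It assumes the tengwar package from this diff is importable, e.g. after installing tengwar 0.3.1; tokenize() and TokenType come straight from lexer.py. It exercises two behaviors visible in the code: ASCII fallbacks normalize to the Unicode token values, and '-' is lexed as an operator right after '(' but folds into a negative INT elsewhere.

    from tengwar.lexer import tokenize, TokenType

    unicode_form = '(λ (x) → (+ x 1))'
    ascii_form = '(\\ (x) -> (+ x 1))'

    # '\' and '->' are emitted with the Unicode values 'λ' and '→',
    # so the two streams differ only in line/column positions.
    u_toks = tokenize(unicode_form)
    a_toks = tokenize(ascii_form)
    assert [(t.type, t.value) for t in u_toks] == [(t.type, t.value) for t in a_toks]

    # '-' immediately after '(' is the subtraction operator...
    ops = tokenize('(- 5 2)')
    assert ops[1].type == TokenType.MINUS

    # ...while '-3' in argument position lexes as a single INT token.
    neg = tokenize('(f -3)')
    assert neg[2].type == TokenType.INT and neg[2].value == '-3'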