tengwar 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tengwar/lexer.py ADDED
@@ -0,0 +1,483 @@
+ """
+ TENGWAR Lexer
+
+ Tokenizes TENGWAR source code. Handles both Unicode operators (primary form)
+ and ASCII fallbacks for environments with limited Unicode support.
+
+ Token efficiency is paramount: every token carries maximum semantic weight.
+ """
+ from enum import Enum, auto
+ from dataclasses import dataclass
+ from typing import List
+ from .errors import LexError
+
+
+ class TokenType(Enum):
+     # Delimiters
+     LPAREN = auto()      # (
+     RPAREN = auto()      # )
+     LANGLE = auto()      # ⟨ or <|
+     RANGLE = auto()      # ⟩ or |>
+     LBRACKET = auto()    # ⟦ or [|
+     RBRACKET = auto()    # ⟧ or |]
+     LBRACE = auto()      # {
+     RBRACE = auto()      # }
+
+     # Keywords (single Unicode chars; see KEYWORDS for the word aliases)
+     LAMBDA = auto()      # λ, \, or fn
+     ARROW = auto()       # → or ->
+     COND = auto()        # ? or if
+     MATCH = auto()       # ~ or match
+     SEQ = auto()         # >> or do
+     PARALLEL = auto()    # ∥, ||, or par
+     MODULE = auto()      # □ or mod
+     DEFINE = auto()      # ≡, :=, or def
+     RECURSE = auto()     # ↺ or rec
+     TYPE = auto()        # τ
+     PROOF = auto()       # ⊢ or |-
+     EFFECT = auto()      # ⊕ or +>
+     MUTATE = auto()      # μ (a bare ! lexes as NOT)
+     UNIT = auto()        # ∅ or nil
+     IMPORT = auto()      # ⇐ (the digraph <= lexes as LTE)
+
+     # Operators
+     PLUS = auto()        # +
+     MINUS = auto()       # -
+     STAR = auto()        # *
+     SLASH = auto()       # /
+     PERCENT = auto()     # %
+     EQ = auto()          # =
+     NEQ = auto()         # !=
+     LT = auto()          # <
+     GT = auto()          # >
+     LTE = auto()         # <=
+     GTE = auto()         # >=
+     AND = auto()         # &
+     OR = auto()          # |
+     NOT = auto()         # !
+     DOT = auto()         # .
+     COLON = auto()       # :
+
+     # Literals
+     INT = auto()
+     FLOAT = auto()
+     STRING = auto()
+     TEMPLATE = auto()    # $"...{expr}..." template string
+     TRUE = auto()
+     FALSE = auto()
+
+     # Identifiers
+     SYMBOL = auto()      # regular name
+     HASH_ID = auto()     # #abc
+     ADDR_REF = auto()    # @abc
+
+     # Special
+     COMMENT = auto()     # ;; or ;
+     UNDERSCORE = auto()  # _ (wildcard)
+     EOF = auto()
+
+
+ @dataclass
+ class Token:
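+     """One lexeme: its token type, source text, and 1-based line/column."""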
+     type: TokenType
+     value: str
+     line: int
+     col: int
+
+     def __repr__(self):
+         return f"Token({self.type.name}, {self.value!r}, {self.line}:{self.col})"
+
+
+ # Unicode operator → token type (the primary form; checked first in tokenize())
+ UNICODE_MAP = {
+     'λ': TokenType.LAMBDA,
+     '≡': TokenType.DEFINE,
+     '→': TokenType.ARROW,
+     '↺': TokenType.RECURSE,
+     'τ': TokenType.TYPE,
+     '⊢': TokenType.PROOF,
+     '⊕': TokenType.EFFECT,
+     'μ': TokenType.MUTATE,
+     '∅': TokenType.UNIT,
+     '∥': TokenType.PARALLEL,
+     '□': TokenType.MODULE,
+     '⇐': TokenType.IMPORT,
+     '⟨': TokenType.LANGLE,
+     '⟩': TokenType.RANGLE,
+     '⟦': TokenType.LBRACKET,
+     '⟧': TokenType.RBRACKET,
+ }
+
+ KEYWORDS = {
+     'true': TokenType.TRUE,
+     'false': TokenType.FALSE,
+     # ASCII aliases for Unicode operators
+     'fn': TokenType.LAMBDA,
+     'def': TokenType.DEFINE,
+     'if': TokenType.COND,
+     'match': TokenType.MATCH,
+     'rec': TokenType.RECURSE,
+     'do': TokenType.SEQ,
+     'par': TokenType.PARALLEL,
+     'nil': TokenType.UNIT,
+     'mod': TokenType.MODULE,
+     # Note: let, pipe, throw, catch, try are handled as symbols
+     # by the parser since they need custom parse rules
+ }
+
+ SINGLE_CHAR = {
+     '(': TokenType.LPAREN,
+     ')': TokenType.RPAREN,
+     '{': TokenType.LBRACE,
+     '}': TokenType.RBRACE,
+     '?': TokenType.COND,
+     '~': TokenType.MATCH,
+     '+': TokenType.PLUS,
+     '*': TokenType.STAR,
+     '/': TokenType.SLASH,
+     '%': TokenType.PERCENT,
+     '=': TokenType.EQ,
+     '&': TokenType.AND,
+     '.': TokenType.DOT,
+     ':': TokenType.COLON,
+     '_': TokenType.UNDERSCORE,
+ }
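+ # Characters absent from SINGLE_CHAR ('-', '<', '>', '|', '!', '\', ';',
+ # '#', '@', '$', '"') get bespoke handling in Lexer.tokenize() below.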
+
+
+ class Lexer:
+     def __init__(self, source: str):
+         self.source = source
+         self.pos = 0
+         self.line = 1
+         self.col = 1
+         self.tokens: List[Token] = []
+
+     def peek(self) -> str:
+         if self.pos >= len(self.source):
+             return '\0'
+         return self.source[self.pos]
+
+     def peek_ahead(self, n: int = 1) -> str:
+         pos = self.pos + n
+         if pos >= len(self.source):
+             return '\0'
+         return self.source[pos]
+
+     def advance(self) -> str:
+         ch = self.source[self.pos]
+         self.pos += 1
+         if ch == '\n':
+             self.line += 1
+             self.col = 1
+         else:
+             self.col += 1
+         return ch
+
+     def add_token(self, type: TokenType, value: str, line: int, col: int):
+         self.tokens.append(Token(type, value, line, col))
+
+     def skip_whitespace(self):
+         while self.pos < len(self.source) and self.source[self.pos] in ' \t\n\r':
+             self.advance()
+
+     def read_string(self) -> str:
+         """Read a string literal (double-quoted)"""
+         result = []
+         self.advance()  # skip opening "
+         while self.pos < len(self.source):
+             ch = self.advance()
+             if ch == '"':
+                 return ''.join(result)
+             elif ch == '\\':
+                 if self.pos < len(self.source):
+                     esc = self.advance()
+                     escape_map = {'n': '\n', 't': '\t', 'r': '\r', '\\': '\\', '"': '"'}
+                     result.append(escape_map.get(esc, esc))
+             else:
+                 result.append(ch)
+         raise LexError("Unterminated string literal", self.line, self.col)
+
+     def read_number(self) -> Token:
+         """Read integer or float literal"""
+         start_line, start_col = self.line, self.col
+         result = []
+         is_float = False
+
+         if self.peek() == '-':
+             result.append(self.advance())
+
+         while self.pos < len(self.source) and (self.source[self.pos].isdigit() or self.source[self.pos] == '.'):
+             ch = self.advance()
+             if ch == '.':
+                 if is_float:
+                     raise LexError("Multiple decimal points in number", self.line, self.col)
+                 is_float = True
+             result.append(ch)
+
+         # Handle scientific notation
+         if self.pos < len(self.source) and self.source[self.pos] in ('e', 'E'):
+             is_float = True
+             result.append(self.advance())
+             if self.pos < len(self.source) and self.source[self.pos] in ('+', '-'):
+                 result.append(self.advance())
+             while self.pos < len(self.source) and self.source[self.pos].isdigit():
+                 result.append(self.advance())
+
+         value = ''.join(result)
+         if is_float:
+             return Token(TokenType.FLOAT, value, start_line, start_col)
+         else:
+             return Token(TokenType.INT, value, start_line, start_col)
+
+     def read_symbol(self) -> str:
+         """Read a symbol/identifier"""
+         result = []
+         valid_chars = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_.-')
+         while self.pos < len(self.source) and self.source[self.pos] in valid_chars:
+             result.append(self.advance())
+         # Allow trailing ? or ! for predicates and mutators (e.g., empty?, set!)
+         if self.pos < len(self.source) and self.source[self.pos] in ('?', '!'):
+             result.append(self.advance())
+         return ''.join(result)
+
+     def read_comment(self) -> str:
+         """Read comment until end of line"""
+         result = []
+         while self.pos < len(self.source) and self.source[self.pos] != '\n':
+             result.append(self.advance())
+         return ''.join(result).strip()
+
+     def tokenize(self) -> List[Token]:
+         """Tokenize the full source into a list of tokens"""
+         while self.pos < len(self.source):
+             self.skip_whitespace()
+             if self.pos >= len(self.source):
+                 break
+
+             ch = self.source[self.pos]
+             start_line, start_col = self.line, self.col
+
+             # Unicode operators (primary form)
+             if ch in UNICODE_MAP:
+                 self.advance()
+                 self.add_token(UNICODE_MAP[ch], ch, start_line, start_col)
+                 continue
+
+             # Comments: ;;
+             if ch == ';' and self.peek_ahead() == ';':
+                 self.advance()
+                 self.advance()
+                 comment = self.read_comment()
+                 self.add_token(TokenType.COMMENT, comment, start_line, start_col)
+                 continue
+
+             # A single ; also starts a comment
+             if ch == ';':
+                 self.advance()
+                 comment = self.read_comment()
+                 self.add_token(TokenType.COMMENT, comment, start_line, start_col)
+                 continue
+
+             # Two-character operators (ASCII fallbacks and combos)
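+             # Most digraphs are normalized to their Unicode form in the token
+             # value ('->' → '→', '||' → '∥', '|-' → '⊢', '+>' → '⊕',
+             # '<|'/'|>' → '⟨'/'⟩', '[|'/'|]' → '⟦'/'⟧'); '>>', ':=' and the
+             # comparison digraphs keep their ASCII spelling.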
+             if self.pos + 1 < len(self.source):
+                 two = self.source[self.pos:self.pos+2]
+                 if two == '->':
+                     self.advance(); self.advance()
+                     self.add_token(TokenType.ARROW, '→', start_line, start_col)
+                     continue
+                 elif two == '>>':
+                     self.advance(); self.advance()
+                     self.add_token(TokenType.SEQ, '>>', start_line, start_col)
+                     continue
+                 elif two == ':=':
+                     self.advance(); self.advance()
+                     self.add_token(TokenType.DEFINE, ':=', start_line, start_col)
+                     continue
+                 elif two == '||':
+                     self.advance(); self.advance()
+                     self.add_token(TokenType.PARALLEL, '∥', start_line, start_col)
+                     continue
+                 elif two == '|-':
+                     self.advance(); self.advance()
+                     self.add_token(TokenType.PROOF, '⊢', start_line, start_col)
+                     continue
+                 elif two == '+>':
+                     self.advance(); self.advance()
+                     self.add_token(TokenType.EFFECT, '⊕', start_line, start_col)
+                     continue
+                 elif two == '!=':
+                     self.advance(); self.advance()
+                     self.add_token(TokenType.NEQ, '!=', start_line, start_col)
+                     continue
+                 elif two == '<=':
+                     self.advance(); self.advance()
+                     self.add_token(TokenType.LTE, '<=', start_line, start_col)
+                     continue
+                 elif two == '>=':
+                     self.advance(); self.advance()
+                     self.add_token(TokenType.GTE, '>=', start_line, start_col)
+                     continue
+                 elif two == '<|':
+                     self.advance(); self.advance()
+                     self.add_token(TokenType.LANGLE, '⟨', start_line, start_col)
+                     continue
+                 elif two == '|>':
+                     self.advance(); self.advance()
+                     self.add_token(TokenType.RANGLE, '⟩', start_line, start_col)
+                     continue
+                 elif two == '[|':
+                     self.advance(); self.advance()
+                     self.add_token(TokenType.LBRACKET, '⟦', start_line, start_col)
+                     continue
+                 elif two == '|]':
+                     self.advance(); self.advance()
+                     self.add_token(TokenType.RBRACKET, '⟧', start_line, start_col)
+                     continue
+
+             # Backslash as lambda
+             if ch == '\\':
+                 self.advance()
+                 self.add_token(TokenType.LAMBDA, 'λ', start_line, start_col)
+                 continue
+
+             # Single-character tokens
+             if ch in SINGLE_CHAR:
+                 # Special case: _ followed by . is read as a symbol for dot-indexing
+                 if ch == '_' and self.pos + 1 < len(self.source) and self.source[self.pos + 1] == '.':
+                     name = self.read_symbol()
+                     self.add_token(TokenType.SYMBOL, name, start_line, start_col)
+                     continue
+                 self.advance()
+                 self.add_token(SINGLE_CHAR[ch], ch, start_line, start_col)
+                 continue
+
+             # Less-than / Greater-than (single char, after checking two-char combos)
+             if ch == '<':
+                 self.advance()
+                 self.add_token(TokenType.LT, '<', start_line, start_col)
+                 continue
+             if ch == '>':
+                 self.advance()
+                 self.add_token(TokenType.GT, '>', start_line, start_col)
+                 continue
+
+             # Pipe (single, after checking ||)
+             if ch == '|':
+                 self.advance()
+                 self.add_token(TokenType.OR, '|', start_line, start_col)
+                 continue
+
+             # Bang (single, after checking !=)
+             if ch == '!':
+                 self.advance()
+                 self.add_token(TokenType.NOT, '!', start_line, start_col)
+                 continue
+
+             # Hash identifiers: #abc
+             if ch == '#':
+                 self.advance()
+                 name = self.read_symbol()
+                 if not name:
+                     raise LexError("Expected identifier after #", start_line, start_col)
+                 self.add_token(TokenType.HASH_ID, '#' + name, start_line, start_col)
+                 continue
+
+             # Address references: @abc
+             if ch == '@':
+                 self.advance()
+                 name = self.read_symbol()
+                 if not name:
+                     raise LexError("Expected identifier after @", start_line, start_col)
+                 self.add_token(TokenType.ADDR_REF, '@' + name, start_line, start_col)
+                 continue
+
+             # Template strings: $"Hi {name}! Age: {(+ age 1)}"
+             if ch == '$' and self.pos + 1 < len(self.source) and self.source[self.pos + 1] == '"':
+                 self.advance()  # consume $
+                 self.advance()  # consume "
+                 fmt_parts = []     # literal string parts
+                 expr_sources = []  # expression source strings
+                 current = []
+                 while self.pos < len(self.source) and self.source[self.pos] != '"':
+                     c = self.source[self.pos]
+                     if c == '\\':
+                         self.advance()
+                         escape_map = {'n': '\n', 't': '\t', 'r': '\r', '\\': '\\', '"': '"', '{': '{', '}': '}'}
+                         if self.pos < len(self.source):
+                             current.append(escape_map.get(self.source[self.pos], self.source[self.pos]))
+                             self.advance()
+                     elif c == '{':
+                         # Start of expression
+                         fmt_parts.append(''.join(current))
+                         current = []
+                         self.advance()  # consume {
+                         depth = 1
+                         expr_chars = []
+                         while self.pos < len(self.source) and depth > 0:
+                             ec = self.source[self.pos]
+                             if ec == '{': depth += 1
+                             elif ec == '}': depth -= 1
+                             if depth > 0:
+                                 expr_chars.append(ec)
+                             self.advance()
+                         expr_sources.append(''.join(expr_chars))
+                     else:
+                         current.append(c)
+                         self.advance()
+                 fmt_parts.append(''.join(current))
+                 if self.pos < len(self.source):
+                     self.advance()  # consume closing "
+                 else:
+                     raise LexError("Unterminated template string", self.line, self.col)
+                 # Build format string with {} placeholders
+                 fmt_str = '{}'.join(fmt_parts)
+                 # Encode as: fmt_str\x00expr1\x00expr2\x00...
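+                 # e.g. $"Hi {name}!" encodes as 'Hi {}!\x00name'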
+                 value = '\x00'.join([fmt_str] + expr_sources)
+                 self.add_token(TokenType.TEMPLATE, value, start_line, start_col)
+                 continue
+
+             # String literals
+             if ch == '"':
+                 value = self.read_string()
+                 self.add_token(TokenType.STRING, value, start_line, start_col)
+                 continue
+
+             # Numbers (including negative)
+             if ch.isdigit():
+                 tok = self.read_number()
+                 self.tokens.append(tok)
+                 continue
+
+             # Minus: could be a negative number or the operator
+             if ch == '-':
+                 # Treat as a negative number if followed by a digit AND not
+                 # immediately after ( (where it would be the operator in (- a b))
+                 if (self.peek_ahead().isdigit() and
+                         (not self.tokens or
+                          self.tokens[-1].type != TokenType.LPAREN)):
+                     tok = self.read_number()
+                     self.tokens.append(tok)
+                     continue
+                 else:
+                     self.advance()
+                     self.add_token(TokenType.MINUS, '-', start_line, start_col)
+                     continue
+
+             # Symbols / keywords
+             if ch.isalpha() or ch == '_':
+                 name = self.read_symbol()
+                 if name in KEYWORDS:
+                     self.add_token(KEYWORDS[name], name, start_line, start_col)
+                 else:
+                     self.add_token(TokenType.SYMBOL, name, start_line, start_col)
+                 continue
+
+             raise LexError(f"Unexpected character: {ch!r}", start_line, start_col)
+
+         self.add_token(TokenType.EOF, '', self.line, self.col)
+         return self.tokens
+
+
+ def tokenize(source: str) -> List[Token]:
+     """Convenience function to tokenize a source string"""
+     return Lexer(source).tokenize()
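
A minimal usage sketch (an editor's illustration against the code above, not part of the published wheel; it assumes the package is installed so that tengwar.lexer is importable):

    from tengwar.lexer import TokenType, tokenize

    tokens = tokenize('fn(x) -> (+ x 1)')
    for tok in tokens:
        print(tok)  # Token(LAMBDA, 'fn', 1:1), Token(LPAREN, '(', 1:3), ...
    assert tokens[-1].type == TokenType.EOF

Because the lexer normalizes ASCII fallbacks, 'λ(x) → (+ x 1)' produces the same sequence of token types.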