sutra-dev 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sutra_compiler/__init__.py +49 -0
- sutra_compiler/__main__.py +514 -0
- sutra_compiler/ast_nodes.py +553 -0
- sutra_compiler/codegen.py +1811 -0
- sutra_compiler/codegen_base.py +2436 -0
- sutra_compiler/codegen_pytorch.py +1472 -0
- sutra_compiler/diagnostics.py +145 -0
- sutra_compiler/inliner.py +581 -0
- sutra_compiler/lexer.py +821 -0
- sutra_compiler/parser.py +2112 -0
- sutra_compiler/review.py +322 -0
- sutra_compiler/simplify.py +1046 -0
- sutra_compiler/simplify_egglog.py +674 -0
- sutra_compiler/stdlib/axons.su +53 -0
- sutra_compiler/stdlib/embed.su +48 -0
- sutra_compiler/stdlib/javascript_object.su +18 -0
- sutra_compiler/stdlib/logic.su +202 -0
- sutra_compiler/stdlib/math.su +12 -0
- sutra_compiler/stdlib/memory.su +82 -0
- sutra_compiler/stdlib/numbers.su +99 -0
- sutra_compiler/stdlib/rotation.su +83 -0
- sutra_compiler/stdlib/similarity.su +97 -0
- sutra_compiler/stdlib/strings.su +56 -0
- sutra_compiler/stdlib/tensor.su +82 -0
- sutra_compiler/stdlib/vectors.su +119 -0
- sutra_compiler/stdlib_loader.py +219 -0
- sutra_compiler/sutradb_embedded.py +273 -0
- sutra_compiler/trace.py +135 -0
- sutra_compiler/validator.py +552 -0
- sutra_compiler/workspace.py +655 -0
- sutra_dev-0.2.0.dist-info/METADATA +80 -0
- sutra_dev-0.2.0.dist-info/RECORD +36 -0
- sutra_dev-0.2.0.dist-info/WHEEL +5 -0
- sutra_dev-0.2.0.dist-info/entry_points.txt +2 -0
- sutra_dev-0.2.0.dist-info/licenses/LICENSE +201 -0
- sutra_dev-0.2.0.dist-info/top_level.txt +1 -0
sutra_compiler/lexer.py
ADDED
|
@@ -0,0 +1,821 @@
|
|
|
1
|
+
"""Lexer for the Sutra language.
|
|
2
|
+
|
|
3
|
+
Produces a flat list of tokens from source text. The lexer is
|
|
4
|
+
intentionally forgiving: unknown characters become `TokenKind.UNKNOWN`
|
|
5
|
+
with a diagnostic attached rather than aborting, so the parser still
|
|
6
|
+
sees a usable stream.
|
|
7
|
+
|
|
8
|
+
Language features handled:
|
|
9
|
+
|
|
10
|
+
- Comment forms: `//` line, `/* */` block, `///` doc line, `#` line.
|
|
11
|
+
Block comments are NOT nested (matches C).
|
|
12
|
+
- String literals: regular `"..."` and interpolated `$"... {expr} ..."`.
|
|
13
|
+
Interpolated strings become a flat sequence:
|
|
14
|
+
STRING_INTERP_START STRING_LIT_CHUNK INTERP_OPEN
|
|
15
|
+
...tokens for expr...
|
|
16
|
+
INTERP_CLOSE STRING_LIT_CHUNK STRING_INTERP_END
|
|
17
|
+
That lets the parser walk inside `{...}` with the full expression
|
|
18
|
+
grammar and still know we're inside a string.
|
|
19
|
+
- Numeric literals: integer and decimal, optionally followed by the imaginary-unit suffix `i` (`5i`, `3.14i`); no hex/exponent yet.
|
|
20
|
+
- Identifiers and keywords.
|
|
21
|
+
- Multi-character operators: `==`, `!=`, `<=`, `>=`, `&&`, `||`,
|
|
22
|
+
`++`, `--`, `+=`, `-=`, `*=`, `/=`, `=>`, `->`, `::`, `|>`.
|
|
23
|
+
(`|>` is lexed so we can flag it explicitly; the spec forbids it.)
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from dataclasses import dataclass
|
|
29
|
+
from enum import Enum, auto
|
|
30
|
+
from typing import List, Optional
|
|
31
|
+
|
|
32
|
+
from .diagnostics import (
|
|
33
|
+
DiagnosticBag,
|
|
34
|
+
SourcePosition,
|
|
35
|
+
SourceSpan,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class TokenKind(Enum):
    """Every kind of token the Sutra lexer (or parser) can name.

    Values are assigned with ``auto()``, so they depend on declaration
    order; append new members rather than reordering existing ones if
    the numeric values are ever persisted.
    """

    # ---- structural ----
    LBRACE = auto()  # {
    RBRACE = auto()  # }
    LPAREN = auto()  # (
    RPAREN = auto()  # )
    LBRACKET = auto()  # [
    RBRACKET = auto()  # ]
    SEMICOLON = auto()  # ;
    COMMA = auto()  # ,
    DOT = auto()  # .
    COLON = auto()  # :

    # ---- operators ----
    PLUS = auto()  # +
    MINUS = auto()  # -
    STAR = auto()  # *
    SLASH = auto()  # /
    PERCENT = auto()  # %
    BANG = auto()  # !
    TILDE = auto()  # ~ (alternative NOT)
    QUESTION = auto()  # ?
    ASSIGN = auto()  # =
    EQ = auto()  # ==
    NEQ = auto()  # !=
    LT = auto()  # <
    GT = auto()  # >
    LE = auto()  # <=
    GE = auto()  # >=
    AND = auto()  # && (a single `&` also lexes to AND)
    OR = auto()  # || (a single `|` also lexes to OR)
    # NOTE: the lexer in this file never emits BIT_AND / BIT_OR: single
    # `&` and `|` are mapped to AND / OR in `Lexer._scan_operator`.
    BIT_AND = auto()  # & (not produced by the lexer; see note above)
    BIT_OR = auto()  # | (not produced by the lexer; see note above)
    BIT_XOR = auto()  # ^
    PLUS_PLUS = auto()  # ++
    MINUS_MINUS = auto()  # --
    PLUS_ASSIGN = auto()  # +=
    MINUS_ASSIGN = auto()  # -=
    STAR_ASSIGN = auto()  # *=
    SLASH_ASSIGN = auto()  # /=
    ARROW = auto()  # ->
    FAT_ARROW = auto()  # =>
    PIPE_FORWARD = auto()  # |> (spec says: not supported)
    DOUBLE_COLON = auto()  # ::

    # ---- literals ----
    INT_LIT = auto()
    FLOAT_LIT = auto()
    IMAG_LIT = auto()  # imaginary-unit suffix: 5i, 3.14i
    CHAR_LIT = auto()  # single-quoted char literal 'a'
    STRING_LIT = auto()  # plain "..." literal
    STRING_INTERP_START = auto()  # opening $" of interpolated string
    STRING_INTERP_END = auto()  # closing " of interpolated string
    STRING_LIT_CHUNK = auto()  # literal text chunk inside interp string
    INTERP_OPEN = auto()  # { inside interpolated string
    INTERP_CLOSE = auto()  # } inside interpolated string
    TRUE = auto()
    FALSE = auto()
    KW_UNKNOWN = auto()  # the `unknown` literal — truth-axis neutral
    KW_WAIT = auto()  # the `wait` literal — explicit deferred init

    # ---- identifiers / keywords ----
    IDENT = auto()
    KW_FUNCTION = auto()
    KW_METHOD = auto()
    KW_STATIC = auto()
    KW_PUBLIC = auto()
    KW_PRIVATE = auto()
    KW_VAR = auto()
    KW_CONST = auto()
    # `role` has no entry in KEYWORDS (it is contextual), so the lexer
    # itself never emits KW_ROLE; it produces IDENT for `role`.
    KW_ROLE = auto()
    KW_RETURN = auto()
    KW_IF = auto()
    KW_ELSE = auto()
    KW_WHILE = auto()
    KW_FOR = auto()
    KW_FOREACH = auto()
    KW_IN = auto()
    KW_DO = auto()
    KW_LOOP = auto()
    KW_DO_WHILE = auto()
    KW_WHILE_LOOP = auto()
    KW_ITERATIVE_LOOP = auto()
    KW_FOREACH_LOOP = auto()
    # `pass <exprs>;` — tail-recursive yield in a loop body. Required to
    # provide one expression per state parameter; the condition is
    # re-evaluated automatically against the new state. The `replace`
    # keyword takes the place of an expression to mean "keep this
    # parameter's input value across the recurrence."
    KW_PASS = auto()
    KW_REPLACE = auto()
    # Note: `element` (the foreach_loop's current-array-value reference)
    # and `iterator` (the iterative_loop's tick number) are CONTEXTUAL
    # — they lex as plain IDENT tokens and the codegen recognizes
    # them specially in the identifier translation path. They are not
    # hard keywords so they don't break unrelated `element` / `iterator`
    # variable names elsewhere in user code.
    KW_AS = auto()
    KW_TRY = auto()
    KW_CATCH = auto()
    KW_THIS = auto()
    KW_OPERATOR = auto()
    KW_NEW = auto()
    KW_IMPLICIT = auto()
    # `intrinsic` — declares a function whose body lives in the runtime
    # (no Sutra-level body). Used by stdlib files for leaf primitives
    # like `dot`, `sqrt`, `tanh`, `make_truth`, `embed` that can't be
    # expressed in Sutra arithmetic. Calls compile to `_VSA.<name>(...)`.
    KW_INTRINSIC = auto()
    # Logical-connective keyword operators. The lexer in this file does
    # NOT emit these kinds and does no lowercasing: the words are not in
    # KEYWORDS, so `Lexer._scan_ident` produces IDENT for them. Per the
    # comments on `_LOGIC_KEYWORD_NAMES`, the parser matches the IDENT
    # lexeme case-insensitively in expression positions.
    # NOTE(review): whether any other module still uses KW_LOGICAL_* is
    # not visible from this file — confirm before removing.
    # Intended mapping to the stdlib functions the symbolic forms
    # (`!`, `&&`, `||`, etc.) lower to:
    #   not / NOT   -> logical_not   (symbolic: ! ~)
    #   and / AND   -> logical_and   (symbolic: && &)
    #   nand        -> logical_nand
    #   or / OR     -> logical_or    (symbolic: || |)
    #   xor         -> logical_xor
    #   xnor / iff  -> logical_xnor
    KW_LOGICAL_NOT = auto()
    KW_LOGICAL_AND = auto()
    KW_LOGICAL_OR = auto()
    KW_LOGICAL_NAND = auto()
    KW_LOGICAL_XOR = auto()
    KW_LOGICAL_XNOR = auto()
    # `class Name extends Parent { ... }` — user-defined ontology
    # class. MVP scope is empty bodies + single inheritance; the
    # extends-chain must bottom out at a primitive class. See
    # docs/ontology.md.
    KW_CLASS = auto()
    KW_EXTENDS = auto()
    KW_SLOT = auto()

    # ---- special ----
    EOF = auto()  # sentinel appended by `Lexer.tokenize`
    UNKNOWN = auto()  # unrecognized character (a diagnostic is also reported)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# Hard keywords: lexemes that `Lexer._scan_ident` maps to a dedicated
# TokenKind instead of IDENT. Lookup is exact-match (case-sensitive).
KEYWORDS = {
    "function": TokenKind.KW_FUNCTION,
    "method": TokenKind.KW_METHOD,
    "static": TokenKind.KW_STATIC,
    "public": TokenKind.KW_PUBLIC,
    "private": TokenKind.KW_PRIVATE,
    "var": TokenKind.KW_VAR,
    "const": TokenKind.KW_CONST,
    # "role" is a CONTEXTUAL keyword — not in the lexer's hard-keyword
    # map so `vector role` parameters and `role` identifiers keep
    # parsing. The parser recognizes `role X = ...;` at statement-start
    # by checking the IDENT lexeme + lookahead. See parser.py.
    "return": TokenKind.KW_RETURN,
    "if": TokenKind.KW_IF,
    "else": TokenKind.KW_ELSE,
    "while": TokenKind.KW_WHILE,
    "for": TokenKind.KW_FOR,
    "foreach": TokenKind.KW_FOREACH,
    "in": TokenKind.KW_IN,
    "do": TokenKind.KW_DO,
    "loop": TokenKind.KW_LOOP,
    "do_while": TokenKind.KW_DO_WHILE,
    "while_loop": TokenKind.KW_WHILE_LOOP,
    "iterative_loop": TokenKind.KW_ITERATIVE_LOOP,
    "foreach_loop": TokenKind.KW_FOREACH_LOOP,
    "pass": TokenKind.KW_PASS,
    "replace": TokenKind.KW_REPLACE,
    "as": TokenKind.KW_AS,
    "try": TokenKind.KW_TRY,
    "catch": TokenKind.KW_CATCH,
    "this": TokenKind.KW_THIS,
    "operator": TokenKind.KW_OPERATOR,
    "new": TokenKind.KW_NEW,
    "implicit": TokenKind.KW_IMPLICIT,
    "intrinsic": TokenKind.KW_INTRINSIC,
    "class": TokenKind.KW_CLASS,
    "extends": TokenKind.KW_EXTENDS,
    "slot": TokenKind.KW_SLOT,
    "true": TokenKind.TRUE,
    "false": TokenKind.FALSE,
    # `unknown` — the neutral point on the truth axis (0.0 between
    # true and false). The first-class three-valued value, and a
    # readability win over `trit t = 0`. `unk` is a short alias
    # that gets the same token — both forms are fine to write.
    "unknown": TokenKind.KW_UNKNOWN,
    "unk": TokenKind.KW_UNKNOWN,
    # `wait` — explicit deferred-initializer marker. Only legal in a
    # var-decl initializer position (`int i = wait;`). Tells the
    # compiler "I'm declaring this name now, an assignment will
    # follow before any read." The validator enforces definite
    # assignment; the codegen emits zero-of-type at the declaration
    # site and the later assignment overrides it.
    "wait": TokenKind.KW_WAIT,
}

# Primitive type names. They are ordinary identifiers at the lexer
# level (none of them appears in KEYWORDS) - the parser treats them as
# types in type positions.
#
# `permutation` is a vector at the substrate level (a fixed ±1
# mask) but it's a distinct compile-time type: the operations on
# it (compose, invert, act on a vector) are different from the
# operations on a plain vector.
#
# `map` is a built-in generic collection type, written as
# `map<K, V>` in type position. It's listed here so the validator
# doesn't flag it as a user-defined class name subject to
# casing-drift checks, and so that the spec treats it as a primitive
# container alongside `tuple`.
PRIMITIVE_TYPE_NAMES = {
    "scalar",
    "vector",
    "matrix",
    "tuple",
    "string",
    "bool",
    "fuzzy",
    "void",
    "permutation",
    "map",
    "char",
    "int",
    # Three-valued primitive class. Same truth-axis storage as
    # `fuzzy`; the difference is compile-time tagging + the
    # three-way polarizer in defuzzification, which preserves the
    # neutral point instead of collapsing it.
    "trit",
    # Complex numbers — real+imaginary pair on synthetic[AXIS_REAL]
    # and synthetic[AXIS_IMAG]. Every numeric value is implicitly
    # on the complex plane; the `complex` type tag is compile-time
    # metadata for type-hygiene purposes. `5i` / `5 + 5i` literals
    # already emit make_complex calls; the type lets the programmer
    # declare the intent at the slot level.
    "complex",
}

# Logical-connective keywords. CONTEXTUAL — these names lex as
# IDENT so user identifiers like `Iff`, `Nand`, `XorTable` keep
# parsing. The parser checks IDENT lexemes against this map (after
# lowercasing) only in expression positions, where they then become
# operators. Maps lowercased lexeme -> the logical-op string the
# inliner lowers to. Symbolic equivalents (`!`, `~`, `&&`, `&`,
# `||`, `|`) come through dedicated tokens, not this map.
# Nothing in the lexer itself reads this table.
_LOGIC_KEYWORD_NAMES = {
    "not": "!",  # unary
    "and": "&&",  # binary
    "or": "||",  # binary
    "nand": "nand",  # binary
    "xor": "xor",  # binary
    "xnor": "xnor",  # binary
    "iff": "xnor",  # binary, alias for xnor
}

# Contextual keywords: identifiers with special meaning in expressions
# but which are still legal bareword identifiers in other positions.
# The lexer emits IDENT for all of them; this set is not consulted by
# any code in this file.
CONTEXTUAL_KEYWORDS = {
    "defuzzy",
    "embed",
    "unsafeCast",
    "unsafeOverride",
}
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
@dataclass
class Token:
    """One lexical token: its kind, raw source text, and source span."""

    kind: TokenKind
    lexeme: str
    span: SourceSpan
    # Interpreted payload for literal tokens (decoded string text, the
    # int/float number, the char's code point); None for everything
    # else. Kept as a plain Python object to simplify later lowering.
    value: object = None

    def __repr__(self) -> str:  # pragma: no cover - debug aid
        template = "Token({}, {!r}, {})"
        return template.format(self.kind.name, self.lexeme, self.span.start)
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
class Lexer:
|
|
315
|
+
"""Tokenize Sutra source into a flat list.
|
|
316
|
+
|
|
317
|
+
Call `tokenize()` and then consume `tokens` and `diagnostics`.
|
|
318
|
+
"""
|
|
319
|
+
|
|
320
|
+
def __init__(self, source: str, *, file: Optional[str] = None) -> None:
    """Prepare a lexer over *source*; *file* is passed to the diagnostic bag."""
    # Inputs.
    self.source = source
    self.file = file

    # Outputs, filled in by `tokenize()`.
    self.tokens: List[Token] = []
    self.diagnostics = DiagnosticBag(file=file)

    # Cursor: 0-based offset into `source`, 1-based line and column.
    self._pos = 0
    self._line = 1
    self._col = 1

    # One int per currently-open interpolated string. 0 means we are
    # in the literal text of that string; a positive value is the
    # brace depth inside one of its `{...}` regions, counted so that
    # only the matching `}` returns us to string mode.
    self._interp_stack: List[int] = []
|
|
333
|
+
|
|
334
|
+
# ---- public API -------------------------------------------------------
|
|
335
|
+
|
|
336
|
+
def tokenize(self) -> List[Token]:
    """Lex the whole source and return the token list.

    The returned list is `self.tokens` and always ends with an EOF
    token. Problems are reported through `self.diagnostics`.
    """
    while not self._at_end():
        if self._interp_stack and self._interp_stack[-1] == 0:
            # We are inside the literal part of an interpolated
            # string (not within `{...}`): keep scanning its body.
            self._scan_interp_body()
        else:
            self._scan_token()

    # An interpolated string can still be open here: either `$"` was
    # the last thing in the file, or the input ended inside a `{...}`
    # region. `_scan_interp_body` only reports the case where it is
    # itself running when the input ends, so report the rest here.
    if self._interp_stack:
        self.diagnostics.error(
            "unterminated interpolated string literal",
            self._span(self._snapshot()),
            code="SUT0002",
        )
        self._interp_stack.clear()

    self._emit(TokenKind.EOF, "", self._pos, self._pos)
    return self.tokens
|
|
347
|
+
|
|
348
|
+
# ---- position bookkeeping --------------------------------------------
|
|
349
|
+
|
|
350
|
+
def _at_end(self) -> bool:
|
|
351
|
+
return self._pos >= len(self.source)
|
|
352
|
+
|
|
353
|
+
def _peek(self, offset: int = 0) -> str:
|
|
354
|
+
idx = self._pos + offset
|
|
355
|
+
if idx >= len(self.source):
|
|
356
|
+
return ""
|
|
357
|
+
return self.source[idx]
|
|
358
|
+
|
|
359
|
+
def _advance(self) -> str:
|
|
360
|
+
ch = self.source[self._pos]
|
|
361
|
+
self._pos += 1
|
|
362
|
+
if ch == "\n":
|
|
363
|
+
self._line += 1
|
|
364
|
+
self._col = 1
|
|
365
|
+
else:
|
|
366
|
+
self._col += 1
|
|
367
|
+
return ch
|
|
368
|
+
|
|
369
|
+
def _position_at(self, offset: int) -> SourcePosition:
|
|
370
|
+
# Walk from 0 to offset to get accurate line/col. Only called
|
|
371
|
+
# for token starts/ends on the main path, so we use a cheap
|
|
372
|
+
# incremental tracker instead: line/col are maintained by
|
|
373
|
+
# `_advance`. For span starts we snapshot before scanning.
|
|
374
|
+
raise NotImplementedError("Use _snapshot / _make_span instead")
|
|
375
|
+
|
|
376
|
+
def _snapshot(self) -> SourcePosition:
    """Capture the cursor's current line, column and offset."""
    line, column, offset = self._line, self._col, self._pos
    return SourcePosition(line=line, column=column, offset=offset)
|
|
378
|
+
|
|
379
|
+
def _span(self, start: SourcePosition) -> SourceSpan:
    """Build a span running from *start* to the current cursor position."""
    end = self._snapshot()
    return SourceSpan(start=start, end=end)
|
|
381
|
+
|
|
382
|
+
# ---- token emission ---------------------------------------------------
|
|
383
|
+
|
|
384
|
+
def _emit(
    self,
    kind: TokenKind,
    lexeme: str,
    start_offset: int,
    end_offset: int,
    *,
    value: object = None,
) -> None:
    """Append a zero-width token located at the current line/column.

    Used only for the EOF sentinel. Ordinary tokens go through
    `_emit_tok`, which builds a real span from a start snapshot.
    `start_offset` is accepted but not used: both ends of the span
    are placed at `end_offset`.
    """
    here = SourcePosition(line=self._line, column=self._col, offset=end_offset)
    token = Token(
        kind=kind,
        lexeme=lexeme,
        span=SourceSpan(start=here, end=here),
        value=value,
    )
    self.tokens.append(token)
|
|
401
|
+
|
|
402
|
+
def _emit_tok(
    self,
    kind: TokenKind,
    lexeme: str,
    start: SourcePosition,
    *,
    value: object = None,
) -> None:
    """Append a token spanning from *start* to the current cursor."""
    token = Token(kind=kind, lexeme=lexeme, span=self._span(start), value=value)
    self.tokens.append(token)
|
|
412
|
+
|
|
413
|
+
# ---- main scanner -----------------------------------------------------
|
|
414
|
+
|
|
415
|
+
def _scan_token(self) -> None:
    """Skip whitespace, then dispatch on the next character(s).

    Hands off to the comment, string, char, number, identifier or
    operator scanner. Comments consume input without emitting a token.
    """
    # Whitespace, newlines included, only separates tokens.
    while self._peek() in (" ", "\t", "\r", "\n"):
        self._advance()
    if self._at_end():
        return

    start = self._snapshot()
    lead = self._peek()
    follow = self._peek(1)

    if lead == "#" or (lead == "/" and follow == "/"):
        # `//`, `///` and `#` all run to the end of the line.
        self._scan_line_comment()
    elif lead == "/" and follow == "*":
        self._scan_block_comment(start)
    elif lead == '"':
        self._scan_plain_string(start)
    elif lead == "$" and follow == '"':
        self._scan_interp_string_open(start)
    elif lead == "'":
        self._scan_char(start)
    elif lead.isdigit():
        self._scan_number(start)
    elif lead == "_" or lead.isalpha():
        self._scan_ident(start)
    else:
        # Operators, punctuation, and anything unrecognized.
        self._scan_operator(start)
|
|
459
|
+
|
|
460
|
+
# ---- comments ---------------------------------------------------------
|
|
461
|
+
|
|
462
|
+
def _scan_line_comment(self) -> None:
|
|
463
|
+
while not self._at_end() and self._peek() != "\n":
|
|
464
|
+
self._advance()
|
|
465
|
+
|
|
466
|
+
def _scan_block_comment(self, start: SourcePosition) -> None:
|
|
467
|
+
# Consume "/*"
|
|
468
|
+
self._advance()
|
|
469
|
+
self._advance()
|
|
470
|
+
while not self._at_end():
|
|
471
|
+
if self._peek() == "*" and self._peek(1) == "/":
|
|
472
|
+
self._advance()
|
|
473
|
+
self._advance()
|
|
474
|
+
return
|
|
475
|
+
self._advance()
|
|
476
|
+
# Unterminated
|
|
477
|
+
self.diagnostics.error(
|
|
478
|
+
"unterminated block comment",
|
|
479
|
+
self._span(start),
|
|
480
|
+
code="SUT0001",
|
|
481
|
+
hint="add `*/` to close the comment",
|
|
482
|
+
)
|
|
483
|
+
|
|
484
|
+
# ---- strings ----------------------------------------------------------
|
|
485
|
+
|
|
486
|
+
def _scan_plain_string(self, start: SourcePosition) -> None:
    """Scan a plain `"..."` literal and emit one STRING_LIT token.

    The token's `value` is the decoded text (escapes resolved through
    `_interpret_escape`); its lexeme is the raw source slice. A token
    is emitted even when the literal is unterminated, so the parser
    keeps a usable stream.

    Exactly one SUT0002 diagnostic is reported for an unterminated
    literal. A raw newline ends the literal and is left unconsumed,
    so it is not part of the lexeme or the diagnostic span. A `"`
    following that newline is never taken as this literal's closing
    quote; it starts the next token instead.
    """
    self._advance()  # opening "
    buf: List[str] = []
    problem: Optional[str] = None
    while True:
        if self._at_end():
            problem = "unterminated string literal"
            break
        ch = self._peek()
        if ch == '"':
            self._advance()  # closing "
            break
        if ch == "\n":
            problem = "unterminated string literal (newline before closing quote)"
            break
        self._advance()
        if ch != "\\":
            buf.append(ch)
        elif not self._at_end():
            # A backslash at the very end of input is dropped; the
            # next loop iteration reports the literal as unterminated.
            buf.append(self._interpret_escape(self._advance()))

    if problem is not None:
        self.diagnostics.error(problem, self._span(start), code="SUT0002")

    lexeme = self.source[start.offset:self._pos]
    self._emit_tok(TokenKind.STRING_LIT, lexeme, start, value="".join(buf))
|
|
517
|
+
|
|
518
|
+
def _scan_interp_string_open(self, start: SourcePosition) -> None:
    """Consume `$"` and switch the lexer into interpolated-string mode.

    Emits STRING_INTERP_START and pushes a 0 onto `_interp_stack`.
    While the top entry is 0, `tokenize` calls `_scan_interp_body`
    until the string is closed.
    """
    for _expected in ("$", '"'):
        self._advance()
    self._emit_tok(TokenKind.STRING_INTERP_START, "$\"", start)
    self._interp_stack.append(0)
|
|
526
|
+
|
|
527
|
+
def _scan_interp_body(self) -> None:
    """Scan inside an interpolated string, outside `{...}` regions.

    Collects literal text until one of:

    * `"` — emit the pending STRING_LIT_CHUNK (if any), then
      STRING_INTERP_END, and pop the interpolation state;
    * `{` — emit the pending chunk (if any), then INTERP_OPEN, and set
      the brace depth to 1 so the main scanner lexes the expression;
    * a raw newline or end of input — report a single SUT0002
      "unterminated" diagnostic and pop the state so `tokenize` does
      not loop. The newline is left unconsumed and the partial text
      is discarded.
    """
    buf_start = self._snapshot()
    buf: List[str] = []

    def flush_chunk() -> None:
        # Emit the literal text gathered so far; empty chunks are skipped.
        if buf:
            lexeme = self.source[buf_start.offset:self._pos]
            self._emit_tok(
                TokenKind.STRING_LIT_CHUNK, lexeme, buf_start,
                value="".join(buf),
            )

    while not self._at_end():
        ch = self._peek()
        if ch == '"':
            # End of the interpolated string.
            flush_chunk()
            close_start = self._snapshot()
            self._advance()
            self._emit_tok(TokenKind.STRING_INTERP_END, "\"", close_start)
            self._interp_stack.pop()
            return
        if ch == "{":
            # Enter interpolation mode.
            flush_chunk()
            open_start = self._snapshot()
            self._advance()
            self._emit_tok(TokenKind.INTERP_OPEN, "{", open_start)
            # Depth 1: we are now inside one `{` of this string.
            self._interp_stack[-1] = 1
            return
        if ch == "\n":
            # Unterminated; the report below covers this case too,
            # so it is not reported here as well.
            break
        self._advance()
        if ch != "\\":
            buf.append(ch)
        elif not self._at_end():
            buf.append(self._interpret_escape(self._advance()))

    # Newline or end of input before the closing quote.
    self.diagnostics.error(
        "unterminated interpolated string literal",
        self._span(buf_start),
        code="SUT0002",
    )
    # Pop so we don't loop.
    if self._interp_stack:
        self._interp_stack.pop()
|
|
585
|
+
|
|
586
|
+
def _scan_char(self, start: SourcePosition) -> None:
    """Scan a single-quoted character literal: `'a'`, `'\\n'`, `'\\''`.

    Called once the dispatcher has seen the opening `'`. Escapes are
    decoded with `_interpret_escape`, as in string literals. The
    CHAR_LIT token's value is the character's code point. Malformed
    literals (empty, cut off by a newline or end of input, or missing
    the closing quote) report SUT0003 and still emit a CHAR_LIT so the
    parser keeps making progress. The value is 0 in those cases,
    except that a missing closing quote keeps the code point already
    read.
    """

    def report(message: str, **extra: str) -> None:
        self.diagnostics.error(
            message, self._span(start), code="SUT0003", **extra
        )

    def finish(code_point: int) -> None:
        text = self.source[start.offset:self._pos]
        self._emit_tok(TokenKind.CHAR_LIT, text, start, value=code_point)

    self._advance()  # opening '

    # `_peek()` is "" at end of input, so this covers both `''` and a
    # lone `'` at the very end of the source.
    if self._peek() in ("", "'"):
        report(
            "empty character literal",
            hint="a character literal must contain exactly one character",
        )
        if self._peek() == "'":
            self._advance()
        finish(0)
        return

    first = self._advance()
    if first == "\n":
        report("unterminated character literal (newline before closing quote)")
        finish(0)
        return

    if first == "\\":
        if self._at_end():
            report("unterminated character literal")
            finish(0)
            return
        code_point = ord(self._interpret_escape(self._advance()))
    else:
        code_point = ord(first)

    if self._peek() == "'":
        self._advance()  # closing '
    else:
        report("unterminated character literal (expected closing `'`)")
    finish(code_point)
|
|
646
|
+
|
|
647
|
+
def _interpret_escape(self, ch: str) -> str:
|
|
648
|
+
mapping = {
|
|
649
|
+
"n": "\n",
|
|
650
|
+
"t": "\t",
|
|
651
|
+
"r": "\r",
|
|
652
|
+
"\\": "\\",
|
|
653
|
+
"\"": "\"",
|
|
654
|
+
"'": "'",
|
|
655
|
+
"0": "\0",
|
|
656
|
+
"{": "{",
|
|
657
|
+
"}": "}",
|
|
658
|
+
"$": "$",
|
|
659
|
+
}
|
|
660
|
+
return mapping.get(ch, ch)
|
|
661
|
+
|
|
662
|
+
# ---- numbers ----------------------------------------------------------
|
|
663
|
+
|
|
664
|
+
def _scan_number(self, start: SourcePosition) -> None:
    """Scan an integer, decimal, or imaginary literal.

    Emits INT_LIT (value `int`), FLOAT_LIT (value `float`), or
    IMAG_LIT (value: the magnitude as `float`). A `.` only starts a
    fraction when a digit follows it, so `1.foo` lexes as INT_LIT,
    DOT, IDENT.

    Digits are recognized with `str.isdecimal()`, not `str.isdigit()`.
    `isdigit()` also accepts characters such as superscript `²`, which
    `int()` and `float()` reject with ValueError; that would crash
    the lexer instead of producing a diagnostic. If the dispatcher
    sent such a character here, it is reported and emitted as UNKNOWN,
    the same way `_scan_operator` treats unexpected characters.
    """
    while self._peek().isdecimal():
        self._advance()

    if self._pos == start.offset:
        # Digit-like for `isdigit()` but not convertible to a number.
        bad = self._advance()
        self.diagnostics.error(
            f"unexpected character {bad!r}",
            self._span(start),
            code="SUT0003",
        )
        self._emit_tok(TokenKind.UNKNOWN, bad, start)
        return

    is_float = self._peek() == "." and self._peek(1).isdecimal()
    if is_float:
        self._advance()  # the `.`
        while self._peek().isdecimal():
            self._advance()

    # Imaginary-unit suffix: `5i`, `3.14i`. Only binds when the
    # character AFTER the `i` is not an identifier continuation —
    # so `5i` → IMAG_LIT(5) but `5index` → INT_LIT(5) + IDENT("index")
    # and the bare variable name `i` still lexes as IDENT.
    if self._peek() == "i":
        nxt = self._peek(1)
        if nxt == "" or not (nxt.isalnum() or nxt == "_"):
            self._advance()  # consume the `i`
            lexeme = self.source[start.offset:self._pos]
            # Magnitude is the numeric part without the trailing `i`.
            magnitude = float(lexeme[:-1])
            self._emit_tok(TokenKind.IMAG_LIT, lexeme, start, value=magnitude)
            return

    lexeme = self.source[start.offset:self._pos]
    if is_float:
        self._emit_tok(TokenKind.FLOAT_LIT, lexeme, start, value=float(lexeme))
    else:
        self._emit_tok(TokenKind.INT_LIT, lexeme, start, value=int(lexeme))
|
|
694
|
+
|
|
695
|
+
# ---- identifiers ------------------------------------------------------
|
|
696
|
+
|
|
697
|
+
def _scan_ident(self, start: SourcePosition) -> None:
    """Scan an identifier and emit it as a keyword token or IDENT.

    The lexeme is looked up in KEYWORDS with an exact, case-sensitive
    match; anything not listed there becomes IDENT.
    """
    while True:
        ch = self._peek()  # "" at end of input, which stops the loop
        if not (ch == "_" or ch.isalnum()):
            break
        self._advance()

    lexeme = self.source[start.offset:self._pos]
    # The logical-connective words (`not`, `and`, `or`, `nand`,
    # `xor`, `xnor`, `iff`) are CONTEXTUAL: they are not in KEYWORDS,
    # so they come out as IDENT and do not shadow user identifiers
    # like `Iff` or `Nand`. The parser recognizes them as operators
    # in expression positions by checking the IDENT lexeme
    # (case-insensitively). See the module-level _LOGIC_KEYWORD_NAMES.
    kind = KEYWORDS[lexeme] if lexeme in KEYWORDS else TokenKind.IDENT
    self._emit_tok(kind, lexeme, start)
|
|
713
|
+
|
|
714
|
+
# ---- operators --------------------------------------------------------
|
|
715
|
+
|
|
716
|
+
def _scan_operator(self, start: SourcePosition) -> None:
    """Scan one operator or punctuation token.

    Two-character operators are tried first, then single characters.
    A character matching neither reports SUT0003 and is emitted as
    UNKNOWN. Inside the `{...}` region of an interpolated string, `{`
    and `}` also update the brace depth; the `}` that brings the depth
    back to zero is re-tagged as INTERP_CLOSE.
    """
    first = self._advance()
    pair = first + self._peek()  # just `first` at end of input

    digraphs = {
        "==": TokenKind.EQ,
        "!=": TokenKind.NEQ,
        "<=": TokenKind.LE,
        ">=": TokenKind.GE,
        "&&": TokenKind.AND,
        "||": TokenKind.OR,
        "++": TokenKind.PLUS_PLUS,
        "--": TokenKind.MINUS_MINUS,
        "+=": TokenKind.PLUS_ASSIGN,
        "-=": TokenKind.MINUS_ASSIGN,
        "*=": TokenKind.STAR_ASSIGN,
        "/=": TokenKind.SLASH_ASSIGN,
        "->": TokenKind.ARROW,
        "=>": TokenKind.FAT_ARROW,
        "|>": TokenKind.PIPE_FORWARD,
        "::": TokenKind.DOUBLE_COLON,
    }
    wide = digraphs.get(pair)
    if wide is not None:
        self._advance()
        self._emit_tok(wide, self.source[start.offset:self._pos], start)
        return

    monographs = {
        "{": TokenKind.LBRACE,
        "}": TokenKind.RBRACE,
        "(": TokenKind.LPAREN,
        ")": TokenKind.RPAREN,
        "[": TokenKind.LBRACKET,
        "]": TokenKind.RBRACKET,
        ";": TokenKind.SEMICOLON,
        ",": TokenKind.COMMA,
        ".": TokenKind.DOT,
        ":": TokenKind.COLON,
        "+": TokenKind.PLUS,
        "-": TokenKind.MINUS,
        "*": TokenKind.STAR,
        "/": TokenKind.SLASH,
        "%": TokenKind.PERCENT,
        "!": TokenKind.BANG,
        "?": TokenKind.QUESTION,
        "=": TokenKind.ASSIGN,
        "<": TokenKind.LT,
        ">": TokenKind.GT,
        "~": TokenKind.TILDE,
        # Single `&` and `|` are logical, not bitwise — Sutra has
        # no bits to flip. They lex to the same kinds as `&&` and
        # `||` so the parser and inliner treat them uniformly.
        "&": TokenKind.AND,
        "|": TokenKind.OR,
        "^": TokenKind.BIT_XOR,
    }
    if first not in monographs:
        self.diagnostics.error(
            f"unexpected character {first!r}",
            self._span(start),
            code="SUT0003",
        )
        self._emit_tok(TokenKind.UNKNOWN, first, start)
        return

    kind = monographs[first]
    self._emit_tok(kind, first, start)

    # Brace counting only matters inside the `{ expr }` region of an
    # interpolated string, i.e. when the top depth is positive.
    if not self._interp_stack or self._interp_stack[-1] <= 0:
        return
    if kind is TokenKind.LBRACE:
        self._interp_stack[-1] += 1
        return
    if kind is not TokenKind.RBRACE:
        return

    self._interp_stack[-1] -= 1
    if self._interp_stack[-1] != 0:
        return

    # This `}` closes the interpolation: swap the RBRACE we just
    # emitted for INTERP_CLOSE so the parser knows we're back in
    # string mode.
    closing = self.tokens.pop()
    self.tokens.append(
        Token(
            kind=TokenKind.INTERP_CLOSE,
            lexeme=closing.lexeme,
            span=closing.span,
        )
    )
|