sutra-dev 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,821 @@
1
+ """Lexer for the Sutra language.
2
+
3
+ Produces a flat list of tokens from source text. The lexer is
4
+ intentionally forgiving: unknown characters become `TokenKind.UNKNOWN`
5
+ with a diagnostic attached rather than aborting, so the parser still
6
+ sees a usable stream.
7
+
8
+ Language features handled:
9
+
10
+ - Comment forms: `//` line, `/* */` block, `///` doc line, `#` line.
11
+ Block comments are NOT nested (matches C).
12
+ - String literals: regular `"..."` and interpolated `$"... {expr} ..."`.
13
+ Interpolated strings become a flat sequence:
14
+ STRING_INTERP_START STRING_LIT_CHUNK INTERP_OPEN
15
+ ...tokens for expr...
16
+ INTERP_CLOSE STRING_LIT_CHUNK STRING_INTERP_END
17
+ That lets the parser walk inside `{...}` with the full expression
18
+ grammar and still know we're inside a string.
19
+ - Numeric literals: integer, decimal, and imaginary-suffixed (`5i`, `3.14i`); no hex/exponent yet.
20
+ - Identifiers and keywords.
21
+ - Multi-character operators: `==`, `!=`, `<=`, `>=`, `&&`, `||`,
22
+ `++`, `--`, `+=`, `-=`, `*=`, `/=`, `=>`, `->`, `::`, `|>`.
23
+ (`|>` is lexed so we can flag it explicitly; the spec forbids it.)
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ from dataclasses import dataclass
29
+ from enum import Enum, auto
30
+ from typing import List, Optional
31
+
32
+ from .diagnostics import (
33
+ DiagnosticBag,
34
+ SourcePosition,
35
+ SourceSpan,
36
+ )
37
+
38
+
39
class TokenKind(Enum):
    """Token categories for the Sutra language.

    Values come from `auto()`, so they are assigned in declaration order.
    Some members are declared but never emitted by the lexer in this
    file; those are called out in the comments next to them.
    """

    # ---- structural ----
    LBRACE = auto()        # {
    RBRACE = auto()        # }
    LPAREN = auto()        # (
    RPAREN = auto()        # )
    LBRACKET = auto()      # [
    RBRACKET = auto()      # ]
    SEMICOLON = auto()     # ;
    COMMA = auto()         # ,
    DOT = auto()           # .
    COLON = auto()         # :

    # ---- operators ----
    PLUS = auto()          # +
    MINUS = auto()         # -
    STAR = auto()          # *
    SLASH = auto()         # /
    PERCENT = auto()       # %
    BANG = auto()          # !
    TILDE = auto()         # ~  (alternative NOT)
    QUESTION = auto()      # ?
    ASSIGN = auto()        # =
    EQ = auto()            # ==
    NEQ = auto()           # !=
    LT = auto()            # <
    GT = auto()            # >
    LE = auto()            # <=
    GE = auto()            # >=
    AND = auto()           # &&  (a single `&` also lexes to this kind)
    OR = auto()            # ||  (a single `|` also lexes to this kind)
    BIT_AND = auto()       # &   (declared, but the lexer emits AND for `&`)
    BIT_OR = auto()        # |   (declared, but the lexer emits OR for `|`)
    BIT_XOR = auto()       # ^
    PLUS_PLUS = auto()     # ++
    MINUS_MINUS = auto()   # --
    PLUS_ASSIGN = auto()   # +=
    MINUS_ASSIGN = auto()  # -=
    STAR_ASSIGN = auto()   # *=
    SLASH_ASSIGN = auto()  # /=
    ARROW = auto()         # ->
    FAT_ARROW = auto()     # =>
    PIPE_FORWARD = auto()  # |>  (spec says: not supported)
    DOUBLE_COLON = auto()  # ::

    # ---- literals ----
    INT_LIT = auto()
    FLOAT_LIT = auto()
    IMAG_LIT = auto()             # imaginary-unit suffix: 5i, 3.14i
    CHAR_LIT = auto()             # single-quoted char literal 'a'
    STRING_LIT = auto()           # plain "..." literal
    STRING_INTERP_START = auto()  # opening $" of interpolated string
    STRING_INTERP_END = auto()    # closing " of interpolated string
    STRING_LIT_CHUNK = auto()     # literal text chunk inside interp string
    INTERP_OPEN = auto()          # { inside interpolated string
    INTERP_CLOSE = auto()         # } inside interpolated string
    TRUE = auto()
    FALSE = auto()
    KW_UNKNOWN = auto()           # the `unknown` literal — truth-axis neutral
    KW_WAIT = auto()              # the `wait` literal — explicit deferred init

    # ---- identifiers / keywords ----
    IDENT = auto()
    KW_FUNCTION = auto()
    KW_METHOD = auto()
    KW_STATIC = auto()
    KW_PUBLIC = auto()
    KW_PRIVATE = auto()
    KW_VAR = auto()
    KW_CONST = auto()
    # `role` is deliberately absent from the KEYWORDS map, so the lexer
    # never emits KW_ROLE; the word reaches the parser as an IDENT.
    KW_ROLE = auto()
    KW_RETURN = auto()
    KW_IF = auto()
    KW_ELSE = auto()
    KW_WHILE = auto()
    KW_FOR = auto()
    KW_FOREACH = auto()
    KW_IN = auto()
    KW_DO = auto()
    KW_LOOP = auto()
    KW_DO_WHILE = auto()
    KW_WHILE_LOOP = auto()
    KW_ITERATIVE_LOOP = auto()
    KW_FOREACH_LOOP = auto()
    # `pass <exprs>;` — tail-recursive yield in a loop body. Required to
    # provide one expression per state parameter; the condition is
    # re-evaluated automatically against the new state. The `replace`
    # keyword takes the place of an expression to mean "keep this
    # parameter's input value across the recurrence."
    KW_PASS = auto()
    KW_REPLACE = auto()
    # Note: `element` (the foreach_loop's current-array-value reference)
    # and `iterator` (the iterative_loop's tick number) are CONTEXTUAL
    # — they parse as plain IDENT tokens and the codegen recognizes
    # them specially in the identifier translation path. They are not
    # hard keywords so they don't break unrelated `element` / `iterator`
    # variable names elsewhere in user code.
    KW_AS = auto()
    KW_TRY = auto()
    KW_CATCH = auto()
    KW_THIS = auto()
    KW_OPERATOR = auto()
    KW_NEW = auto()
    KW_IMPLICIT = auto()
    # `intrinsic` — declares a function whose body lives in the runtime
    # (no Sutra-level body). Used by stdlib files for leaf primitives
    # like `dot`, `sqrt`, `tanh`, `make_truth`, `embed` that can't be
    # expressed in Sutra arithmetic. Calls compile to `_VSA.<name>(...)`.
    KW_INTRINSIC = auto()
    # Logical-connective keyword operators. NOTE: the lexer in this file
    # does NOT emit these kinds and does not lowercase anything — the
    # connective words are absent from KEYWORDS and come out of
    # `_scan_ident` as IDENT. Per the comments on `_LOGIC_KEYWORD_NAMES`,
    # the parser matches them case-insensitively in expression positions.
    # Intended mapping to the stdlib functions that the symbolic forms
    # (`!`, `&&`, `||`, etc.) lower to:
    #   not / NOT  -> logical_not   (symbolic: ! ~)
    #   and / AND  -> logical_and   (symbolic: && &)
    #   nand       -> logical_nand
    #   or / OR    -> logical_or    (symbolic: || |)
    #   xor        -> logical_xor
    #   xnor / iff -> logical_xnor
    KW_LOGICAL_NOT = auto()
    KW_LOGICAL_AND = auto()
    KW_LOGICAL_OR = auto()
    KW_LOGICAL_NAND = auto()
    KW_LOGICAL_XOR = auto()
    KW_LOGICAL_XNOR = auto()
    # `class Name extends Parent { ... }` — user-defined ontology
    # class. MVP scope is empty bodies + single inheritance; the
    # extends-chain must bottom out at a primitive class. See
    # docs/ontology.md.
    KW_CLASS = auto()
    KW_EXTENDS = auto()
    KW_SLOT = auto()

    # ---- special ----
    EOF = auto()      # end-of-input sentinel appended by `Lexer.tokenize`
    UNKNOWN = auto()  # unrecognised character (a diagnostic is also reported)
175
+
176
+
177
# Hard keywords: exact, case-sensitive lexeme -> dedicated TokenKind.
# `Lexer._scan_ident` looks every identifier-shaped lexeme up here; any
# lexeme that is not a key is emitted as TokenKind.IDENT.
KEYWORDS = {
    "function": TokenKind.KW_FUNCTION,
    "method": TokenKind.KW_METHOD,
    "static": TokenKind.KW_STATIC,
    "public": TokenKind.KW_PUBLIC,
    "private": TokenKind.KW_PRIVATE,
    "var": TokenKind.KW_VAR,
    "const": TokenKind.KW_CONST,
    # "role" is a CONTEXTUAL keyword — not in the lexer's hard-keyword
    # map so `vector role` parameters and `role` identifiers keep
    # parsing. The parser recognizes `role X = ...;` at statement-start
    # by checking the IDENT lexeme + lookahead. See parser.py.
    "return": TokenKind.KW_RETURN,
    "if": TokenKind.KW_IF,
    "else": TokenKind.KW_ELSE,
    "while": TokenKind.KW_WHILE,
    "for": TokenKind.KW_FOR,
    "foreach": TokenKind.KW_FOREACH,
    "in": TokenKind.KW_IN,
    "do": TokenKind.KW_DO,
    "loop": TokenKind.KW_LOOP,
    "do_while": TokenKind.KW_DO_WHILE,
    "while_loop": TokenKind.KW_WHILE_LOOP,
    "iterative_loop": TokenKind.KW_ITERATIVE_LOOP,
    "foreach_loop": TokenKind.KW_FOREACH_LOOP,
    "pass": TokenKind.KW_PASS,
    "replace": TokenKind.KW_REPLACE,
    "as": TokenKind.KW_AS,
    "try": TokenKind.KW_TRY,
    "catch": TokenKind.KW_CATCH,
    "this": TokenKind.KW_THIS,
    "operator": TokenKind.KW_OPERATOR,
    "new": TokenKind.KW_NEW,
    "implicit": TokenKind.KW_IMPLICIT,
    "intrinsic": TokenKind.KW_INTRINSIC,
    "class": TokenKind.KW_CLASS,
    "extends": TokenKind.KW_EXTENDS,
    "slot": TokenKind.KW_SLOT,
    "true": TokenKind.TRUE,
    "false": TokenKind.FALSE,
    # `unknown` — the neutral point on the truth axis (0.0 between
    # true and false). The first-class three-valued value, and a
    # readability win over `trit t = 0`. `unk` is a short alias
    # that gets the same token — both forms are fine to write.
    "unknown": TokenKind.KW_UNKNOWN,
    "unk": TokenKind.KW_UNKNOWN,
    # `wait` — explicit deferred-initializer marker. Only legal in a
    # var-decl initializer position (`int i = wait;`). Tells the
    # compiler "I'm declaring this name now, an assignment will
    # follow before any read." The validator enforces definite
    # assignment; the codegen emits zero-of-type at the declaration
    # site and the later assignment overrides it.
    "wait": TokenKind.KW_WAIT,
}
232
+
233
# Primitive type names. They are ordinary identifiers at the lexer
# level - the parser treats them as types in type positions. The lexer
# in this file never consults this set; these names lex as IDENT.
#
# `permutation` is a vector at the substrate level (a fixed ±1
# mask) but it's a distinct compile-time type: the operations on
# it (compose, invert, act on a vector) are different from the
# operations on a plain vector.
#
# `map` is a built-in generic collection type, written as
# `map<K, V>` in type position. It's listed here so the validator
# doesn't flag it as a user-defined class name subject to
# casing-drift checks, and so that the spec treats it as a primitive
# container alongside `tuple`.
PRIMITIVE_TYPE_NAMES = {
    "scalar",
    "vector",
    "matrix",
    "tuple",
    "string",
    "bool",
    "fuzzy",
    "void",
    "permutation",
    "map",
    "char",
    "int",
    # Three-valued primitive class. Same truth-axis storage as
    # `fuzzy`; the difference is compile-time tagging + the
    # three-way polarizer in defuzzification, which preserves the
    # neutral point instead of collapsing it.
    "trit",
    # Complex numbers — real+imaginary pair on synthetic[AXIS_REAL]
    # and synthetic[AXIS_IMAG]. Every numeric value is implicitly
    # on the complex plane; the `complex` type tag is compile-time
    # metadata for type-hygiene purposes. `5i` / `5 + 5i` literals
    # already emit make_complex calls; the type lets the programmer
    # declare the intent at the slot level.
    "complex",
}
272
+
273
# Logical-connective keywords. CONTEXTUAL — these names lex as
# IDENT so user identifiers like `Iff`, `Nand`, `XorTable` keep
# parsing. The parser checks IDENT lexemes against this map (after
# lowercasing) only in expression positions, where they then become
# operators. Maps lowercased lexeme -> the logical-op string the
# inliner lowers to. Symbolic equivalents (`!`, `~`, `&&`, `&`,
# `||`, `|`) come through dedicated tokens, not this map.
# The lexer in this file does not read this map itself.
_LOGIC_KEYWORD_NAMES = {
    "not": "!",       # unary
    "and": "&&",      # binary
    "or": "||",       # binary
    "nand": "nand",   # binary
    "xor": "xor",     # binary
    "xnor": "xnor",   # binary
    "iff": "xnor",    # binary, alias for xnor
}
289
+
290
# Contextual keywords: identifiers with special meaning in expressions
# but which are still legal bareword identifiers in other positions.
# They are not in KEYWORDS, so the lexer emits them as plain IDENT
# tokens; the lexer in this file does not read this set itself.
CONTEXTUAL_KEYWORDS = {
    "defuzzy",
    "embed",
    "unsafeCast",
    "unsafeOverride",
}
298
+
299
+
300
@dataclass
class Token:
    """One lexed token: its kind, exact source text, and source span."""

    # Category of the token.
    kind: TokenKind
    # Source text of the token as the lexer recorded it.
    lexeme: str
    # Start/end positions of the token in the source.
    span: SourceSpan
    # Interpreted value for literal tokens, stored as a plain Python
    # object to ease later lowering. At present the parser only relies
    # on it for strings. Defaults to None for non-literal tokens.
    value: object = None

    def __repr__(self) -> str:  # pragma: no cover - debug aid
        """Return a compact debug form: kind name, lexeme, start position."""
        return "Token({}, {!r}, {})".format(
            self.kind.name, self.lexeme, self.span.start
        )
312
+
313
+
314
class Lexer:
    """Tokenize Sutra source text into a flat list of `Token` objects.

    Construct with the source, call `tokenize()`, then read `tokens` and
    `diagnostics`. Lexing never aborts: malformed input is reported
    through `diagnostics` and a best-effort token is still emitted so the
    parser sees a usable stream.

    Attributes:
        source: The text being lexed.
        file: Optional file name, forwarded to the diagnostic bag.
        diagnostics: `DiagnosticBag` that collects lexical errors.
        tokens: Tokens emitted so far; ends with an EOF token once
            `tokenize()` has returned.
    """

    # Two-character operators keyed by their exact spelling. Checked
    # before the single-character table so `==` never lexes as `=` `=`.
    _TWO_CHAR_OPS = {
        "==": TokenKind.EQ,
        "!=": TokenKind.NEQ,
        "<=": TokenKind.LE,
        ">=": TokenKind.GE,
        "&&": TokenKind.AND,
        "||": TokenKind.OR,
        "++": TokenKind.PLUS_PLUS,
        "--": TokenKind.MINUS_MINUS,
        "+=": TokenKind.PLUS_ASSIGN,
        "-=": TokenKind.MINUS_ASSIGN,
        "*=": TokenKind.STAR_ASSIGN,
        "/=": TokenKind.SLASH_ASSIGN,
        "->": TokenKind.ARROW,
        "=>": TokenKind.FAT_ARROW,
        "|>": TokenKind.PIPE_FORWARD,
        "::": TokenKind.DOUBLE_COLON,
    }

    # Single-character operators / punctuation.
    _ONE_CHAR_OPS = {
        "{": TokenKind.LBRACE,
        "}": TokenKind.RBRACE,
        "(": TokenKind.LPAREN,
        ")": TokenKind.RPAREN,
        "[": TokenKind.LBRACKET,
        "]": TokenKind.RBRACKET,
        ";": TokenKind.SEMICOLON,
        ",": TokenKind.COMMA,
        ".": TokenKind.DOT,
        ":": TokenKind.COLON,
        "+": TokenKind.PLUS,
        "-": TokenKind.MINUS,
        "*": TokenKind.STAR,
        "/": TokenKind.SLASH,
        "%": TokenKind.PERCENT,
        "!": TokenKind.BANG,
        "?": TokenKind.QUESTION,
        "=": TokenKind.ASSIGN,
        "<": TokenKind.LT,
        ">": TokenKind.GT,
        "~": TokenKind.TILDE,
        # Single `&` and `|` are logical, not bitwise — Sutra has
        # no bits to flip. They lex to the same kinds as `&&` and
        # `||` so the parser and inliner treat them uniformly.
        "&": TokenKind.AND,
        "|": TokenKind.OR,
        "^": TokenKind.BIT_XOR,
    }

    # Character after a backslash -> the character it denotes. Escapes
    # not listed here decode to the escaped character itself.
    _ESCAPES = {
        "n": "\n",
        "t": "\t",
        "r": "\r",
        "\\": "\\",
        "\"": "\"",
        "'": "'",
        "0": "\0",
        "{": "{",
        "}": "}",
        "$": "$",
    }

    def __init__(self, source: str, *, file: Optional[str] = None) -> None:
        """Prepare a lexer over `source`; `file` only labels diagnostics."""
        self.source = source
        self.file = file
        self.diagnostics = DiagnosticBag(file=file)
        self.tokens: List[Token] = []
        self._pos = 0   # offset of the next unread character
        self._line = 1  # 1-based line of the next unread character
        self._col = 1   # 1-based column of the next unread character
        # One int per currently-open interpolated string (innermost
        # last). 0 means we are scanning the string's literal text; a
        # positive value means we are inside a `{...}` expression and
        # counts the braces still open, so only the matching `}`
        # returns us to string mode.
        self._interp_stack: List[int] = []

    # ---- public API -------------------------------------------------------

    def tokenize(self) -> List[Token]:
        """Lex the whole source and return `self.tokens`, ending with EOF."""
        while not self._at_end():
            if self._interp_stack and self._interp_stack[-1] == 0:
                # Inside the literal part of an interpolated string
                # (not within `{...}`): keep scanning the string body.
                self._scan_interp_body()
            else:
                self._scan_token()
        self._emit(TokenKind.EOF, "", self._pos, self._pos)
        return self.tokens

    # ---- position bookkeeping --------------------------------------------

    def _at_end(self) -> bool:
        """Return True once every character has been consumed."""
        return self._pos >= len(self.source)

    def _peek(self, offset: int = 0) -> str:
        """Return the character `offset` ahead, or "" past the end."""
        idx = self._pos + offset
        if idx >= len(self.source):
            return ""
        return self.source[idx]

    def _advance(self) -> str:
        """Consume and return one character, updating line/column."""
        ch = self.source[self._pos]
        self._pos += 1
        if ch == "\n":
            self._line += 1
            self._col = 1
        else:
            self._col += 1
        return ch

    def _position_at(self, offset: int) -> SourcePosition:
        """Unsupported: always raises NotImplementedError.

        Line/column are tracked incrementally by `_advance`, so positions
        are taken with `_snapshot` before scanning rather than recomputed
        from an offset.
        """
        raise NotImplementedError("Use _snapshot / _make_span instead")

    def _snapshot(self) -> SourcePosition:
        """Return the current line/column/offset as a SourcePosition."""
        return SourcePosition(line=self._line, column=self._col, offset=self._pos)

    def _span(self, start: SourcePosition) -> SourceSpan:
        """Return the span from `start` to the current position."""
        return SourceSpan(start=start, end=self._snapshot())

    # ---- token emission ---------------------------------------------------

    def _emit(
        self,
        kind: TokenKind,
        lexeme: str,
        start_offset: int,
        end_offset: int,
        *,
        value: object = None,
    ) -> None:
        """Append a zero-width token at the current line/column.

        Kept for the EOF sentinel only; every other token goes through
        `_emit_tok`, which takes a real start snapshot. `start_offset`
        is accepted for signature compatibility but is not used.
        """
        pos = SourcePosition(line=self._line, column=self._col, offset=end_offset)
        span = SourceSpan(start=pos, end=pos)
        self.tokens.append(Token(kind=kind, lexeme=lexeme, span=span, value=value))

    def _emit_tok(
        self,
        kind: TokenKind,
        lexeme: str,
        start: SourcePosition,
        *,
        value: object = None,
    ) -> None:
        """Append a token spanning from `start` to the current position."""
        span = self._span(start)
        self.tokens.append(Token(kind=kind, lexeme=lexeme, span=span, value=value))

    # ---- main scanner -----------------------------------------------------

    def _scan_token(self) -> None:
        """Skip whitespace, then scan one token (or one comment)."""
        # Newlines are plain whitespace here; `_advance` keeps the
        # line/column counters right while we skip them.
        while not self._at_end() and self._peek() in " \t\r\n":
            self._advance()
        if self._at_end():
            return

        start = self._snapshot()
        ch = self._peek()
        nxt = self._peek(1)

        # Comments: `//` (also covers `///`), `/* */`, and `#`.
        if ch == "#" or (ch == "/" and nxt == "/"):
            self._scan_line_comment()
        elif ch == "/" and nxt == "*":
            self._scan_block_comment(start)
        # Strings and character literals.
        elif ch == '"':
            self._scan_plain_string(start)
        elif ch == "$" and nxt == '"':
            self._scan_interp_string_open(start)
        elif ch == "'":
            self._scan_char(start)
        # Numbers. `isdecimal()` rather than `isdigit()`: `isdigit()` is
        # also true for characters such as `²` that int()/float()
        # reject, which used to crash the lexer with ValueError. Such
        # characters now fall through to the UNKNOWN-token path.
        elif ch.isdecimal():
            self._scan_number(start)
        # Identifiers / keywords.
        elif ch == "_" or ch.isalpha():
            self._scan_ident(start)
        # Operators, punctuation, and anything unrecognised.
        else:
            self._scan_operator(start)

    # ---- comments ---------------------------------------------------------

    def _scan_line_comment(self) -> None:
        """Consume up to, but not including, the next newline."""
        while not self._at_end() and self._peek() != "\n":
            self._advance()

    def _scan_block_comment(self, start: SourcePosition) -> None:
        """Consume a non-nesting `/* ... */` comment; report if unclosed."""
        self._advance()  # /
        self._advance()  # *
        while not self._at_end():
            if self._peek() == "*" and self._peek(1) == "/":
                self._advance()
                self._advance()
                return
            self._advance()
        self.diagnostics.error(
            "unterminated block comment",
            self._span(start),
            code="SUT0001",
            hint="add `*/` to close the comment",
        )

    # ---- strings ----------------------------------------------------------

    def _scan_plain_string(self, start: SourcePosition) -> None:
        """Scan a `"..."` literal and emit STRING_LIT with the decoded text.

        An unterminated literal (raw newline or end of input before the
        closing quote) produces exactly one SUT0002 diagnostic, and the
        token is still emitted with whatever text was decoded so far.
        """
        self._advance()  # opening "
        buf: List[str] = []
        closed = False
        reported = False
        while not self._at_end():
            ch = self._advance()
            if ch == '"':
                closed = True
                break
            if ch == "\\":
                if self._at_end():
                    break
                buf.append(self._interpret_escape(self._advance()))
            elif ch == "\n":
                self.diagnostics.error(
                    "unterminated string literal (newline before closing quote)",
                    self._span(start),
                    code="SUT0002",
                )
                # Stop here: do not report a second time below, and do
                # not treat a quote on the next line as our closer.
                reported = True
                break
            else:
                buf.append(ch)
        if not closed and not reported:
            self.diagnostics.error(
                "unterminated string literal",
                self._span(start),
                code="SUT0002",
            )
        lexeme = self.source[start.offset:self._pos]
        self._emit_tok(
            TokenKind.STRING_LIT, lexeme, start, value="".join(buf)
        )

    def _scan_interp_string_open(self, start: SourcePosition) -> None:
        """Consume `$"`, emit STRING_INTERP_START, and enter string mode.

        Pushing 0 onto `_interp_stack` makes `tokenize` call
        `_scan_interp_body` until the string is closed.
        """
        self._advance()  # $
        self._advance()  # "
        self._emit_tok(TokenKind.STRING_INTERP_START, "$\"", start)
        self._interp_stack.append(0)

    def _flush_interp_chunk(self, buf: List[str], buf_start: SourcePosition) -> None:
        """Emit the pending literal text as STRING_LIT_CHUNK, if any."""
        if not buf:
            return
        lexeme = self.source[buf_start.offset:self._pos]
        self._emit_tok(
            TokenKind.STRING_LIT_CHUNK, lexeme, buf_start, value="".join(buf)
        )

    def _scan_interp_body(self) -> None:
        """Scan inside an interpolated string, outside `{...}` regions.

        Returns after emitting the closing quote (STRING_INTERP_END) or an
        opening brace (INTERP_OPEN). On a raw newline or end of input the
        string is abandoned with a single SUT0002 diagnostic.
        """
        buf_start = self._snapshot()
        buf: List[str] = []
        while not self._at_end():
            ch = self._peek()
            if ch == '"':
                # End of the interpolated string.
                self._flush_interp_chunk(buf, buf_start)
                close_start = self._snapshot()
                self._advance()
                self._emit_tok(TokenKind.STRING_INTERP_END, "\"", close_start)
                self._interp_stack.pop()
                return
            if ch == "{":
                # Emit any pending chunk, then enter interpolation mode.
                self._flush_interp_chunk(buf, buf_start)
                open_start = self._snapshot()
                self._advance()
                self._emit_tok(TokenKind.INTERP_OPEN, "{", open_start)
                # One brace is now open inside this string.
                self._interp_stack[-1] = 1
                return
            if ch == "\n":
                # Unterminated; the newline itself is left unconsumed.
                break
            self._advance()
            if ch == "\\":
                if self._at_end():
                    break
                buf.append(self._interpret_escape(self._advance()))
            else:
                buf.append(ch)
        # Reached a raw newline or end of input without a closing quote.
        # Reported once for either cause.
        self.diagnostics.error(
            "unterminated interpolated string literal",
            self._span(buf_start),
            code="SUT0002",
        )
        # Leave string mode so the main loop does not re-enter this scan.
        if self._interp_stack:
            self._interp_stack.pop()

    def _finish_char(self, start: SourcePosition, value: int) -> None:
        """Emit CHAR_LIT covering everything consumed since `start`."""
        lexeme = self.source[start.offset:self._pos]
        self._emit_tok(TokenKind.CHAR_LIT, lexeme, start, value=value)

    def _scan_char(self, start: SourcePosition) -> None:
        """Scan a single-quoted character literal such as `'a'`.

        Runs after the dispatcher sees a leading `'`. Escapes are decoded
        by `_interpret_escape`, the same as in string literals. The token
        value is the character's code point. Empty and unterminated
        literals report SUT0003 and still emit CHAR_LIT (value 0 when no
        character was read) so the parser keeps making progress.
        """
        self._advance()  # opening '
        if self._at_end() or self._peek() == "'":
            self.diagnostics.error(
                "empty character literal",
                self._span(start),
                code="SUT0003",
                hint="a character literal must contain exactly one character",
            )
            if not self._at_end() and self._peek() == "'":
                self._advance()
            self._finish_char(start, 0)
            return

        ch = self._advance()
        if ch == "\\":
            if self._at_end():
                self.diagnostics.error(
                    "unterminated character literal",
                    self._span(start),
                    code="SUT0003",
                )
                self._finish_char(start, 0)
                return
            value = ord(self._interpret_escape(self._advance()))
        elif ch == "\n":
            self.diagnostics.error(
                "unterminated character literal (newline before closing quote)",
                self._span(start),
                code="SUT0003",
            )
            self._finish_char(start, 0)
            return
        else:
            value = ord(ch)

        if not self._at_end() and self._peek() == "'":
            self._advance()
        else:
            self.diagnostics.error(
                "unterminated character literal (expected closing `'`)",
                self._span(start),
                code="SUT0003",
            )
        self._finish_char(start, value)

    def _interpret_escape(self, ch: str) -> str:
        """Decode the character following a backslash.

        Unrecognised escapes decode to the character itself.
        """
        return self._ESCAPES.get(ch, ch)

    # ---- numbers ----------------------------------------------------------

    def _consume_decimal_digits(self) -> None:
        """Consume a (possibly empty) run of decimal digits."""
        while not self._at_end() and self._peek().isdecimal():
            self._advance()

    def _scan_number(self, start: SourcePosition) -> None:
        """Scan an integer, decimal, or imaginary-suffixed literal.

        Emits INT_LIT (int value), FLOAT_LIT (float value), or IMAG_LIT
        (float magnitude, without the trailing `i`).
        """
        # Only `isdecimal()` characters are accepted: that is exactly the
        # set int()/float() can parse, so the conversions below cannot
        # raise on characters such as `²` (which `isdigit()` would admit).
        self._consume_decimal_digits()
        is_float = False
        if self._peek() == "." and self._peek(1).isdecimal():
            is_float = True
            self._advance()  # the `.`
            self._consume_decimal_digits()
        # Imaginary-unit suffix: `5i`, `3.14i`. Only binds when the
        # character AFTER the `i` is not an identifier continuation —
        # so `5i` → IMAG_LIT(5) but `5index` → INT_LIT(5) + IDENT("index")
        # and the bare variable name `i` still lexes as IDENT.
        if self._peek() == "i":
            after = self._peek(1)  # "" at end of input
            if not (after.isalnum() or after == "_"):
                self._advance()  # consume the `i`
                lexeme = self.source[start.offset:self._pos]
                self._emit_tok(
                    TokenKind.IMAG_LIT, lexeme, start, value=float(lexeme[:-1])
                )
                return
        lexeme = self.source[start.offset:self._pos]
        if is_float:
            self._emit_tok(TokenKind.FLOAT_LIT, lexeme, start, value=float(lexeme))
        else:
            self._emit_tok(TokenKind.INT_LIT, lexeme, start, value=int(lexeme))

    # ---- identifiers ------------------------------------------------------

    def _scan_ident(self, start: SourcePosition) -> None:
        """Scan an identifier and emit its keyword kind, or IDENT."""
        while not self._at_end():
            ch = self._peek()
            if not (ch == "_" or ch.isalnum()):
                break
            self._advance()
        lexeme = self.source[start.offset:self._pos]
        # The logical-connective keywords (`not`, `and`, `or`, `nand`,
        # `xor`, `xnor`, `iff`) are CONTEXTUAL — they are not in
        # KEYWORDS, so they emit as IDENT and don't shadow user
        # identifiers like `Iff` or `Nand`. The parser recognizes them
        # as operators in expression positions; see the module-level
        # `_LOGIC_KEYWORD_NAMES` table.
        self._emit_tok(KEYWORDS.get(lexeme, TokenKind.IDENT), lexeme, start)

    # ---- operators --------------------------------------------------------

    def _scan_operator(self, start: SourcePosition) -> None:
        """Scan an operator or punctuation token.

        Two-character operators win over single-character ones. A
        character in neither table reports SUT0003 and is emitted as
        UNKNOWN. Inside an interpolation's `{...}` region, braces are
        counted and the matching `}` is emitted as INTERP_CLOSE.
        """
        ch = self._advance()

        # At end of input `_peek()` is "", so the key is one character
        # long and cannot match a two-character entry.
        pair_kind = self._TWO_CHAR_OPS.get(ch + self._peek())
        if pair_kind is not None:
            self._advance()
            self._emit_tok(pair_kind, self.source[start.offset:self._pos], start)
            return

        kind = self._ONE_CHAR_OPS.get(ch)
        if kind is None:
            self.diagnostics.error(
                f"unexpected character {ch!r}",
                self._span(start),
                code="SUT0003",
            )
            self._emit_tok(TokenKind.UNKNOWN, ch, start)
            return
        self._emit_tok(kind, ch, start)

        # Brace counting inside interpolated strings. When we see `{`
        # or `}` inside a `{ expr }` region of an interpolated string,
        # we adjust the depth counter. A matching close returns control
        # to the string body.
        if not self._interp_stack or self._interp_stack[-1] <= 0:
            return
        if kind is TokenKind.LBRACE:
            self._interp_stack[-1] += 1
        elif kind is TokenKind.RBRACE:
            self._interp_stack[-1] -= 1
            if self._interp_stack[-1] == 0:
                # Replace the just-emitted RBRACE with INTERP_CLOSE so
                # the parser knows we're back in string mode.
                closing = self.tokens.pop()
                self.tokens.append(
                    Token(
                        kind=TokenKind.INTERP_CLOSE,
                        lexeme=closing.lexeme,
                        span=closing.span,
                    )
                )