techscript 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
techscript/lexer.py ADDED
@@ -0,0 +1,336 @@
1
+ """TechScript Lexer — converts raw source text into a stream of tokens.
2
+
3
+ Handles:
4
+ * number literals (int, float, hex, binary, octal, scientific, underscore sep)
5
+ * string literals (single/double/triple-quoted, escape sequences)
6
+ * f-strings with ``{expr}`` interpolation markers
7
+ * identifiers and reserved keywords
8
+ * all operators and delimiters defined in ``tokens.py``
9
+ * Python-style INDENT / DEDENT tracking (NOTE: not yet implemented —
+   no INDENT/DEDENT tokens are emitted by this version of the lexer)
10
+ * single-line ``#`` and block ``#[ … ]#`` comments
11
+ * tab rejection with a friendly error
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from techscript.tokens import Token, TokenType, KEYWORDS
17
+ from techscript.errors import LexerError
18
+
19
+
20
class Lexer:
    """Tokenise a TechScript source string.

    Single-pass scanner: :meth:`tokenize` walks the source left to right,
    dispatching on the current character to specialised ``_read_*``
    helpers, and always terminates the stream with an EOF token.  All
    failures are reported as :class:`LexerError` carrying the current
    line/column.
    """

    def __init__(self, source: str, filename: str = "<stdin>") -> None:
        self.source = source
        self.filename = filename
        self.pos = 0      # absolute character offset into ``source``
        self.line = 1     # 1-based line of the character at ``pos``
        self.column = 1   # 1-based column of the character at ``pos``
        self.tokens: list[Token] = []

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _peek(self, offset: int = 0) -> str | None:
        """Return the character ``offset`` chars ahead of ``pos``, or None at EOF."""
        idx = self.pos + offset
        return self.source[idx] if idx < len(self.source) else None

    def _advance(self) -> str:
        """Consume and return the current character, updating line/column."""
        ch = self.source[self.pos]
        self.pos += 1
        if ch == "\n":
            self.line += 1
            self.column = 1
        else:
            self.column += 1
        return ch

    def _match(self, expected: str) -> bool:
        """Consume the current character iff it equals ``expected``."""
        if self.pos < len(self.source) and self.source[self.pos] == expected:
            self._advance()
            return True
        return False

    def _add(self, tt: TokenType, value: str, *, line: int | None = None, col: int | None = None) -> None:
        """Append a token.

        ``line``/``col`` override the *current* position so multi-character
        tokens can report where they started rather than where they ended.
        """
        self.tokens.append(Token(tt, value, line or self.line, col or self.column))

    def _error(self, msg: str) -> LexerError:
        """Build (but do not raise) a LexerError at the current position."""
        return LexerError(msg, line=self.line, column=self.column)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def tokenize(self) -> list[Token]:
        """Run the tokeniser and return the full token list (EOF-terminated).

        Raises:
            LexerError: on unexpected characters, tabs, unterminated
                strings/comments, or malformed number literals.
        """
        while self.pos < len(self.source):
            ch = self._peek()

            # Tabs are rejected with a friendly error, per the module
            # contract ("tab rejection") — previously they were silently
            # skipped, which desynchronised column reporting.
            if ch == "\t":
                raise self._error("Tabs are not allowed; use spaces instead")

            # Insignificant whitespace.
            if ch in (" ", "\r"):
                self._advance()
                continue

            # Newline — emitted as a token (statement separator).
            if ch == "\n":
                self._add(TokenType.NEWLINE, "\\n")
                self._advance()
                continue

            # Comments: '# ...' single-line or '#[ ... ]#' block.
            if ch == "#":
                self._skip_comment()
                continue

            # Number literals.
            if ch is not None and ch.isdigit():
                self._read_number()
                continue

            # Plain strings.
            if ch in ('"', "'"):
                self._read_string(ch)
                continue

            # f-strings: f"..." / f'...'.
            if ch == "f" and self._peek(1) in ('"', "'"):
                self._advance()  # skip 'f'
                self._read_fstring(self._peek())  # type: ignore[arg-type]
                continue

            # Raw strings: r"..." / r'...' (escape processing disabled).
            if ch == "r" and self._peek(1) in ('"', "'"):
                self._advance()  # skip 'r'
                self._read_string(self._peek(), raw=True)  # type: ignore[arg-type]
                continue

            # Identifiers and keywords.
            if ch is not None and (ch.isalpha() or ch == "_"):
                self._read_identifier()
                continue

            # Everything else: operators / delimiters (or a lex error).
            self._read_symbol()

        self._add(TokenType.EOF, "")
        return self.tokens

    # ------------------------------------------------------------------
    # Comments
    # ------------------------------------------------------------------

    def _skip_comment(self) -> None:
        """Skip a '#' single-line comment or a '#[ ... ]#' block comment.

        Raises:
            LexerError: if a block comment is never closed.
        """
        self._advance()  # skip '#'
        # Block comment #[ … ]#
        if self._peek() == "[":
            self._advance()  # skip '['
            while self.pos < len(self.source):
                if self.source[self.pos] == "]" and self._peek(1) == "#":
                    self._advance()  # ]
                    self._advance()  # #
                    return
                self._advance()
            raise self._error("Unterminated block comment (missing ]#)")
        # Single-line comment: consume to (not including) the newline so
        # the NEWLINE token is still emitted by tokenize().
        while self.pos < len(self.source) and self.source[self.pos] != "\n":
            self._advance()

    # ------------------------------------------------------------------
    # Numbers
    # ------------------------------------------------------------------

    # Legal digits for each base prefix (underscores are separators).
    _BASE_DIGITS = {
        "x": "0123456789abcdefABCDEF",
        "b": "01",
        "o": "01234567",
    }

    def _exponent_follows(self) -> bool:
        """Return True when the 'e'/'E' at ``pos`` begins a real exponent.

        The marker must be followed by a digit, or a sign and a digit —
        this stops e.g. ``123exit`` from swallowing the 'e' into a
        malformed number token.
        """
        nxt = self._peek(1)
        if nxt is None:
            return False
        if nxt.isdigit():
            return True
        return nxt in ("+", "-") and (self._peek(2) or "").isdigit()

    def _read_number(self) -> None:
        """Lex an int or float literal starting at the current position.

        Supports hex/binary/octal prefixes, underscore separators, a
        fractional part (without eating the ``..`` range operator), and
        scientific notation.
        """
        start_col = self.column
        num = ""

        # Hex / binary / octal: 0x… 0b… 0o… (case-insensitive prefix).
        if self._peek() == "0" and self._peek(1) in ("x", "X", "b", "B", "o", "O"):
            num += self._advance()      # '0'
            prefix = self._advance()    # x / b / o
            num += prefix
            allowed = self._BASE_DIGITS[prefix.lower()]
            digits = ""
            while self.pos < len(self.source) and (self.source[self.pos].isalnum() or self.source[self.pos] == "_"):
                ch = self._advance()
                if ch == "_":
                    continue
                # Previously invalid digits (e.g. 0xZZ) were accepted
                # silently and only blew up in later stages.
                if ch not in allowed:
                    raise self._error(f"Invalid digit '{ch}' in '{num}' literal")
                digits += ch
            if not digits:
                raise self._error(f"Missing digits after numeric prefix '{num}'")
            self._add(TokenType.NUMBER_INT, num + digits, col=start_col)
            return

        is_float = False
        while self.pos < len(self.source) and (self.source[self.pos].isdigit() or self.source[self.pos] == "_"):
            ch = self._advance()
            if ch != "_":
                num += ch

        # Fractional part — only when a digit follows the '.', so the
        # range operators '..' / '..=' are left for _read_symbol.
        nxt = self._peek(1)
        if (
            self.pos < len(self.source)
            and self.source[self.pos] == "."
            and nxt is not None
            and nxt != "."
            and nxt.isdigit()
        ):
            is_float = True
            num += self._advance()  # '.'
            while self.pos < len(self.source) and (self.source[self.pos].isdigit() or self.source[self.pos] == "_"):
                ch = self._advance()
                if ch != "_":
                    num += ch

        # Scientific notation: 1e5, 2.5E-3 — only with a real exponent.
        if self.pos < len(self.source) and self.source[self.pos] in ("e", "E") and self._exponent_follows():
            is_float = True
            num += self._advance()
            if self.pos < len(self.source) and self.source[self.pos] in ("+", "-"):
                num += self._advance()
            while self.pos < len(self.source) and self.source[self.pos].isdigit():
                num += self._advance()

        tt = TokenType.NUMBER_FLOAT if is_float else TokenType.NUMBER_INT
        self._add(tt, num, col=start_col)

    # ------------------------------------------------------------------
    # Strings
    # ------------------------------------------------------------------

    def _read_string(self, quote: str, *, raw: bool = False) -> None:
        """Lex a single- or triple-quoted string; ``raw`` disables escapes.

        Raises:
            LexerError: on an unterminated literal, or a bare newline
                inside a single-line string.
        """
        start_line = self.line
        start_col = self.column
        self._advance()  # opening quote
        result = ""

        # Triple-quoted (multi-line) form.
        if self._peek() == quote and self._peek(1) == quote:
            self._advance()
            self._advance()
            while self.pos < len(self.source):
                if self.source[self.pos] == quote and self._peek(1) == quote and self._peek(2) == quote:
                    self._advance(); self._advance(); self._advance()
                    # Report the token at its *opening* line, not wherever
                    # the closing quotes happen to fall.
                    self._add(TokenType.STRING, result, line=start_line, col=start_col)
                    return
                if not raw and self.source[self.pos] == "\\":
                    result += self._read_escape()
                else:
                    result += self._advance()
            raise self._error("Unterminated triple-quoted string")

        # Single-line form.
        while self.pos < len(self.source) and self.source[self.pos] != quote:
            if self.source[self.pos] == "\n":
                raise self._error("Unterminated string (use triple quotes for multi-line)")
            if not raw and self.source[self.pos] == "\\":
                result += self._read_escape()
            else:
                result += self._advance()

        if self.pos >= len(self.source):
            raise self._error("Unterminated string")

        self._advance()  # closing quote
        self._add(TokenType.STRING, result, line=start_line, col=start_col)

    def _read_fstring(self, quote: str) -> None:
        """Lex an f-string body; ``{expr}`` markers are kept verbatim for
        the parser, but backslash escapes are decoded exactly like normal
        strings — previously an escaped quote (f"a\\"b") terminated the
        literal early and mis-lexed the remainder of the line.
        """
        start_col = self.column
        self._advance()  # opening quote
        result = ""
        while self.pos < len(self.source) and self.source[self.pos] != quote:
            if self.source[self.pos] == "\n":
                raise self._error("Unterminated f-string")
            if self.source[self.pos] == "\\":
                result += self._read_escape()
            else:
                result += self._advance()
        if self.pos >= len(self.source):
            raise self._error("Unterminated f-string")
        self._advance()  # closing quote
        self._add(TokenType.FSTRING, result, col=start_col)

    def _read_escape(self) -> str:
        """Consume a backslash escape and return its decoded character.

        Unknown escapes are preserved verbatim (backslash + char), matching
        the original behaviour; a trailing backslash at EOF yields '\\\\'.
        """
        self._advance()  # backslash
        if self.pos >= len(self.source):
            return "\\"
        ch = self._advance()
        return {"n": "\n", "t": "\t", "r": "\r", "\\": "\\", "'": "'", '"': '"', "0": "\0"}.get(ch, "\\" + ch)

    # ------------------------------------------------------------------
    # Identifiers / keywords
    # ------------------------------------------------------------------

    def _read_identifier(self) -> None:
        """Lex an identifier, classifying reserved words and literals
        (true/false/none) into their dedicated token types."""
        start_col = self.column
        name = ""
        while self.pos < len(self.source) and (self.source[self.pos].isalnum() or self.source[self.pos] == "_"):
            name += self._advance()

        if name == "true":
            self._add(TokenType.BOOL_TRUE, name, col=start_col)
        elif name == "false":
            self._add(TokenType.BOOL_FALSE, name, col=start_col)
        elif name == "none":
            self._add(TokenType.NONE, name, col=start_col)
        elif name in KEYWORDS:
            self._add(TokenType.KEYWORD, name, col=start_col)
        else:
            self._add(TokenType.IDENTIFIER, name, col=start_col)

    # ------------------------------------------------------------------
    # Operators / delimiters
    # ------------------------------------------------------------------

    def _read_symbol(self) -> None:
        """Lex an operator or delimiter, longest match first.

        Raises:
            LexerError: when the character starts no known symbol.
        """
        start_col = self.column
        ch = self._advance()

        # --- multi-char operators with dedicated handling ---

        # ** , *= , *
        if ch == "*":
            if self._match("*"):
                self._add(TokenType.POWER, "**", col=start_col); return
            if self._match("="):
                self._add(TokenType.STAR_ASSIGN, "*=", col=start_col); return
            self._add(TokenType.STAR, "*", col=start_col); return

        # ... , ..= , .. , .
        if ch == ".":
            if self._peek() == ".":
                self._advance()
                if self._match("."):
                    self._add(TokenType.SPREAD, "...", col=start_col); return
                if self._match("="):
                    self._add(TokenType.DOTDOT_EQUAL, "..=", col=start_col); return
                self._add(TokenType.DOTDOT, "..", col=start_col); return
            self._add(TokenType.DOT, ".", col=start_col); return

        # Two-char operators: first char -> {second char -> token type}.
        _two = {
            "=": {"=": TokenType.EQUAL, ">": TokenType.ARROW},
            "!": {"=": TokenType.NOT_EQUAL},
            "<": {"=": TokenType.LESS_EQUAL},
            ">": {"=": TokenType.GREATER_EQUAL},
            "+": {"=": TokenType.PLUS_ASSIGN},
            "-": {"=": TokenType.MINUS_ASSIGN},
            "/": {"/": TokenType.DOUBLE_SLASH, "=": TokenType.SLASH_ASSIGN},
            "|": {">": TokenType.PIPE},
            "?": {".": TokenType.OPTIONAL_CHAIN, "?": TokenType.NULLISH},
        }

        if ch in _two and self.pos < len(self.source):
            nxt = self.source[self.pos]
            if nxt in _two[ch]:
                self._advance()
                self._add(_two[ch][nxt], ch + nxt, col=start_col)
                return

        # Single-char fallbacks (tried only after multi-char forms).
        _single = {
            "+": TokenType.PLUS, "-": TokenType.MINUS,
            "/": TokenType.SLASH, "%": TokenType.PERCENT,
            "(": TokenType.LPAREN, ")": TokenType.RPAREN,
            "[": TokenType.LBRACKET, "]": TokenType.RBRACKET,
            "{": TokenType.LBRACE, "}": TokenType.RBRACE,
            ",": TokenType.COMMA, ":": TokenType.COLON,
            "@": TokenType.AT, "?": TokenType.QUESTION,
            "=": TokenType.ASSIGN, "<": TokenType.LESS,
            ">": TokenType.GREATER,
        }

        if ch in _single:
            self._add(_single[ch], ch, col=start_col)
            return

        raise self._error(f"Unexpected character: '{ch}'")