multilingualprogramming 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- multilingualprogramming/__init__.py +74 -0
- multilingualprogramming/__main__.py +194 -0
- multilingualprogramming/codegen/__init__.py +12 -0
- multilingualprogramming/codegen/executor.py +215 -0
- multilingualprogramming/codegen/python_generator.py +592 -0
- multilingualprogramming/codegen/repl.py +489 -0
- multilingualprogramming/codegen/runtime_builtins.py +308 -0
- multilingualprogramming/core/__init__.py +12 -0
- multilingualprogramming/core/ir.py +29 -0
- multilingualprogramming/core/lowering.py +24 -0
- multilingualprogramming/datetime/__init__.py +11 -0
- multilingualprogramming/datetime/date_parser.py +190 -0
- multilingualprogramming/datetime/mp_date.py +210 -0
- multilingualprogramming/datetime/mp_datetime.py +153 -0
- multilingualprogramming/datetime/mp_time.py +147 -0
- multilingualprogramming/datetime/resource_loader.py +18 -0
- multilingualprogramming/exceptions.py +158 -0
- multilingualprogramming/imports.py +150 -0
- multilingualprogramming/keyword/__init__.py +13 -0
- multilingualprogramming/keyword/keyword_registry.py +249 -0
- multilingualprogramming/keyword/keyword_validator.py +59 -0
- multilingualprogramming/keyword/language_pack_validator.py +110 -0
- multilingualprogramming/lexer/__init__.py +11 -0
- multilingualprogramming/lexer/lexer.py +570 -0
- multilingualprogramming/lexer/source_reader.py +91 -0
- multilingualprogramming/lexer/token.py +54 -0
- multilingualprogramming/lexer/token_types.py +38 -0
- multilingualprogramming/numeral/__init__.py +11 -0
- multilingualprogramming/numeral/abstract_numeral.py +232 -0
- multilingualprogramming/numeral/complex_numeral.py +190 -0
- multilingualprogramming/numeral/fraction_numeral.py +165 -0
- multilingualprogramming/numeral/mp_numeral.py +243 -0
- multilingualprogramming/numeral/numeral_converter.py +151 -0
- multilingualprogramming/numeral/roman_numeral.py +301 -0
- multilingualprogramming/numeral/unicode_numeral.py +292 -0
- multilingualprogramming/parser/__init__.py +28 -0
- multilingualprogramming/parser/ast_nodes.py +459 -0
- multilingualprogramming/parser/ast_printer.py +677 -0
- multilingualprogramming/parser/error_messages.py +75 -0
- multilingualprogramming/parser/parser.py +1796 -0
- multilingualprogramming/parser/semantic_analyzer.py +689 -0
- multilingualprogramming/parser/surface_normalizer.py +282 -0
- multilingualprogramming/resources/datetime/eras.json +23 -0
- multilingualprogramming/resources/datetime/formats.json +32 -0
- multilingualprogramming/resources/datetime/months.json +150 -0
- multilingualprogramming/resources/datetime/weekdays.json +90 -0
- multilingualprogramming/resources/parser/error_messages.json +310 -0
- multilingualprogramming/resources/repl/commands.json +636 -0
- multilingualprogramming/resources/usm/builtins_aliases.json +731 -0
- multilingualprogramming/resources/usm/keywords.json +1063 -0
- multilingualprogramming/resources/usm/operators.json +532 -0
- multilingualprogramming/resources/usm/schema.json +34 -0
- multilingualprogramming/resources/usm/surface_patterns.json +1523 -0
- multilingualprogramming/unicode_string.py +140 -0
- multilingualprogramming/version.py +9 -0
- multilingualprogramming-0.2.0.dist-info/METADATA +350 -0
- multilingualprogramming-0.2.0.dist-info/RECORD +61 -0
- multilingualprogramming-0.2.0.dist-info/WHEEL +5 -0
- multilingualprogramming-0.2.0.dist-info/entry_points.txt +3 -0
- multilingualprogramming-0.2.0.dist-info/licenses/LICENSE +674 -0
- multilingualprogramming-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,570 @@
|
|
|
1
|
+
#
|
|
2
|
+
# SPDX-FileCopyrightText: 2024 John Samuel <johnsamuelwrites@gmail.com>
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
5
|
+
#
|
|
6
|
+
"""Multilingual lexer that tokenizes mixed-script source code."""
|
|
7
|
+
import json
|
|
8
|
+
import unicodedata
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from multilingualprogramming.lexer.token_types import TokenType
|
|
11
|
+
from multilingualprogramming.lexer.token import Token
|
|
12
|
+
from multilingualprogramming.lexer.source_reader import SourceReader
|
|
13
|
+
from multilingualprogramming.keyword.keyword_registry import KeywordRegistry
|
|
14
|
+
from multilingualprogramming.exceptions import UnexpectedTokenError
|
|
15
|
+
# Operator characters and multi-character operators
|
|
16
|
+
_DEFAULT_SINGLE_OPERATORS = set("+-*/%<>=!&|^~")
|
|
17
|
+
_DEFAULT_MULTI_OPERATORS = {
|
|
18
|
+
"**", "//", "==", "!=", "<=", ">=", "<<", ">>",
|
|
19
|
+
"+=", "-=", "*=", "/=", "->", ":=",
|
|
20
|
+
"**=", "//=", "%=", "&=", "|=", "^=", "<<=", ">>=",
|
|
21
|
+
}
|
|
22
|
+
# Unicode operator alternatives
|
|
23
|
+
_DEFAULT_UNICODE_OPERATORS = {
|
|
24
|
+
"\u00d7": "*", # ×
|
|
25
|
+
"\u00f7": "/", # ÷
|
|
26
|
+
"\u2212": "-", # −
|
|
27
|
+
"\u2260": "!=", # ≠
|
|
28
|
+
"\u2264": "<=", # ≤
|
|
29
|
+
"\u2265": ">=", # ≥
|
|
30
|
+
"\u2192": "->", # →
|
|
31
|
+
}
|
|
32
|
+
_DEFAULT_DELIMITERS = set("()[]{},:;.@")
|
|
33
|
+
# Unicode delimiter alternatives
|
|
34
|
+
_DEFAULT_UNICODE_DELIMITERS = {
|
|
35
|
+
"\uff08": "(", "\uff09": ")", # fullwidth parens
|
|
36
|
+
"\uff3b": "[", "\uff3d": "]", # fullwidth brackets
|
|
37
|
+
"\uff5b": "{", "\uff5d": "}", # fullwidth braces
|
|
38
|
+
"\uff0c": ",", "\u060c": ",", # fullwidth/Arabic comma
|
|
39
|
+
"\uff1a": ":", # fullwidth colon
|
|
40
|
+
"\uff1b": ";", "\u061b": ";", # fullwidth/Arabic semicolon
|
|
41
|
+
}
|
|
42
|
+
# String delimiter pairs: (open, close)
|
|
43
|
+
STRING_PAIRS = {
|
|
44
|
+
'"': '"',
|
|
45
|
+
"'": "'",
|
|
46
|
+
"\u300c": "\u300d", # 「」 CJK corner brackets
|
|
47
|
+
"\u00ab": "\u00bb", # «» guillemets
|
|
48
|
+
"\u201c": "\u201d", # "" smart double quotes
|
|
49
|
+
"\u2018": "\u2019", # '' smart single quotes
|
|
50
|
+
}
|
|
51
|
+
# Date literal delimiters
|
|
52
|
+
DATE_OPEN = "\u3014" # 〔
|
|
53
|
+
DATE_CLOSE = "\u3015" # 〕
|
|
54
|
+
def _load_operator_config():
    """
    Build the operator and delimiter tables used by the lexer.

    Starts from copies of the built-in defaults and extends them with the
    entries found in ``resources/usm/operators.json`` (located two
    directories above this module). If that file cannot be opened or
    parsed, the untouched defaults are returned.

    Returns:
        tuple: ``(single_ops, multi_ops, unicode_ops, delimiters,
        unicode_delims, date_open, date_close)``
    """
    single_ops = set(_DEFAULT_SINGLE_OPERATORS)
    multi_ops = set(_DEFAULT_MULTI_OPERATORS)
    unicode_ops = dict(_DEFAULT_UNICODE_OPERATORS)
    delimiters = set(_DEFAULT_DELIMITERS)
    unicode_delims = dict(_DEFAULT_UNICODE_DELIMITERS)
    date_open, date_close = DATE_OPEN, DATE_CLOSE

    package_root = Path(__file__).resolve().parent.parent
    json_path = package_root / "resources" / "usm" / "operators.json"
    try:
        with open(json_path, "r", encoding="utf-8-sig") as stream:
            config = json.load(stream)
    except Exception:
        # Best effort: any failure to read the file means "use the defaults".
        return (
            single_ops, multi_ops, unicode_ops,
            delimiters, unicode_delims, date_open, date_close,
        )

    # Operator sections: the first symbol of an entry is its canonical
    # spelling; every symbol is registered by length, and each Unicode
    # alternative maps back to the canonical spelling.
    for section_name in ("arithmetic", "comparison", "assignment", "bitwise"):
        for entry in config.get(section_name, {}).values():
            spellings = entry.get("symbols", [])
            if not spellings:
                continue
            primary = spellings[0]
            for spelling in spellings:
                bucket = multi_ops if len(spelling) > 1 else single_ops
                bucket.add(spelling)
            for alternate in entry.get("unicode_alt", []):
                unicode_ops[alternate] = primary

    # Delimiter section: ARROW is registered as an operator, DATE_OPEN and
    # DATE_CLOSE replace the date-literal brackets, everything else is a
    # plain delimiter.
    for delimiter_name, entry in config.get("delimiters", {}).items():
        spellings = entry.get("symbols", [])
        if not spellings:
            continue
        primary = spellings[0]
        if delimiter_name == "DATE_OPEN":
            date_open = primary
        elif delimiter_name == "DATE_CLOSE":
            date_close = primary
        else:
            if delimiter_name == "ARROW":
                multi_ops.add(primary)
                alias_table = unicode_ops
            else:
                delimiters.add(primary)
                alias_table = unicode_delims
            for alternate in entry.get("unicode_alt", []):
                alias_table[alternate] = primary

    return (
        single_ops, multi_ops, unicode_ops,
        delimiters, unicode_delims, date_open, date_close,
    )
|
|
112
|
+
# Effective lexer tables, computed once at import time. DATE_OPEN and
# DATE_CLOSE are rebound here to the values returned by the loader.
(
    SINGLE_OPERATORS,
    MULTI_OPERATORS,
    UNICODE_OPERATORS,
    DELIMITERS,
    UNICODE_DELIMITERS,
    DATE_OPEN,
    DATE_CLOSE,
) = _load_operator_config()
|
|
114
|
+
def _is_identifier_start(char):
|
|
115
|
+
"""Check if a character can start an identifier."""
|
|
116
|
+
if not char:
|
|
117
|
+
return False
|
|
118
|
+
cat = unicodedata.category(char)
|
|
119
|
+
# Lu=uppercase, Ll=lowercase, Lt=titlecase, Lm=modifier, Lo=other letter
|
|
120
|
+
# Mn=nonspacing mark (e.g., Devanagari vowel signs that start conjuncts)
|
|
121
|
+
return cat.startswith("L") or cat in ("Mn", "Mc") or char == "_"
|
|
122
|
+
def _is_identifier_part(char):
|
|
123
|
+
"""Check if a character can be part of an identifier."""
|
|
124
|
+
if not char:
|
|
125
|
+
return False
|
|
126
|
+
cat = unicodedata.category(char)
|
|
127
|
+
# Include combining marks (Mn=nonspacing, Mc=spacing combining)
|
|
128
|
+
# needed for Devanagari, Arabic, and other complex scripts
|
|
129
|
+
return (cat.startswith("L") or cat == "Nd"
|
|
130
|
+
or cat in ("Mn", "Mc") or char == "_")
|
|
131
|
+
def _is_digit(char):
|
|
132
|
+
"""Check if a character is a Unicode decimal digit."""
|
|
133
|
+
if not char:
|
|
134
|
+
return False
|
|
135
|
+
return unicodedata.category(char) == "Nd"
|
|
136
|
+
def _is_hex_digit(char):
|
|
137
|
+
"""Check if a character is an ASCII hexadecimal digit."""
|
|
138
|
+
return char.isdigit() or char.lower() in "abcdef"
|
|
139
|
+
# pylint: disable=too-few-public-methods
class Lexer:
    """
    Tokenizes multilingual source code.

    Recognizes keywords in any of the 10 pilot languages,
    Unicode identifiers, multilingual numerals, multilingual
    string literals, and operators (including Unicode alternatives).

    The source is consumed through a ``SourceReader``; tokens are
    accumulated in ``self.tokens`` and Python-style block structure is
    reported with INDENT / DEDENT tokens.
    """

    # Upper bound on the number of space-separated words tried when
    # matching a multi-word keyword phrase.
    _MAX_KEYWORD_WORDS = 3

    def __init__(self, source, language=None):
        """
        Initialize the lexer.

        Parameters:
            source (str): Source code to tokenize
            language (str): If given, only this language's keywords
                are recognized. If None, every language supported by the
                keyword registry is tried, and ``tokenize`` sets
                ``self.language`` from the keywords it encountered.
        """
        self.reader = SourceReader(source)
        self.language = language
        self.registry = KeywordRegistry()
        self.tokens = []
        # Stack of active indentation widths; the base level 0 is never popped.
        self._indent_stack = [0]
        # True until the first significant character of a line is reached.
        self._at_line_start = True
        # Keyword phrases seen so far, used for language auto-detection.
        self._detected_keywords = []

    def _reader_state(self):
        """Snapshot current reader state as ``(pos, line, column)``."""
        return (self.reader.pos, self.reader.line, self.reader.column)

    def _restore_reader_state(self, state):
        """Restore a reader state previously returned by ``_reader_state``."""
        self.reader.pos, self.reader.line, self.reader.column = state

    def _match_keyword(self, text):
        """
        Look up ``text`` in the keyword registry.

        When ``self.language`` is set only that language is consulted;
        otherwise the registry's supported languages are tried in the
        order the registry returns them and the first hit wins.

        Returns:
            tuple | None: ``(concept, language)`` if text is a keyword,
            else None
        """
        if self.language is not None:
            candidates = [self.language]
        else:
            candidates = self.registry.get_supported_languages()
        for candidate in candidates:
            if self.registry.is_keyword(text, candidate):
                return (self.registry.get_concept(text, candidate), candidate)
        return None

    # pylint: disable=too-many-branches,too-many-statements
    def tokenize(self):
        """
        Tokenize the entire source string.

        After the last token, one DEDENT is emitted for every indentation
        level still open, followed by an EOF token. If no language was
        given and at least one keyword was seen, ``self.language`` is set
        from the registry's ``detect_language`` result.

        Returns:
            list[Token]: List of tokens

        Raises:
            UnexpectedTokenError: On a character that starts no known
                token, or on an unterminated string / f-string / date
                literal.
        """
        while not self.reader.is_at_end():
            self._skip_spaces()
            if self.reader.is_at_end():
                break
            char = self.reader.peek()

            # Newline
            if char == "\n":
                self._read_newline()
                continue

            # Comment
            if char == "#":
                self._read_comment()
                continue

            # Handle indentation at start of line
            if self._at_line_start:
                self._handle_indentation()
                self._at_line_start = False
                if self.reader.is_at_end():
                    break
                char = self.reader.peek()
                if char in ("\n", "#"):
                    continue

            # F-string literals: f"..." or f'...'
            if char in ("f", "F") and self.reader.peek_ahead(1) in ('"', "'"):
                self._read_fstring()
                continue

            # String literals (check triple-quoted first)
            if (char in ('"', "'")
                    and self.reader.peek_ahead(1) == char
                    and self.reader.peek_ahead(2) == char):
                self._read_triple_string(char)
                continue

            # String literals
            if char in STRING_PAIRS:
                self._read_string(char)
                continue

            # Date literals
            if char == DATE_OPEN:
                self._read_date_literal()
                continue

            # Numerals (start with any Unicode decimal digit; a leading
            # minus sign is tokenized separately as an operator)
            if _is_digit(char):
                self._read_numeral()
                continue

            # Identifiers and keywords
            if _is_identifier_start(char):
                self._read_identifier_or_keyword()
                continue

            # Operators (Unicode): emitted with their ASCII spelling
            if char in UNICODE_OPERATORS:
                line, col = self.reader.line, self.reader.column
                self.reader.advance()
                self.tokens.append(Token(
                    TokenType.OPERATOR, UNICODE_OPERATORS[char], line, col
                ))
                continue

            # Operators (ASCII)
            if char in SINGLE_OPERATORS:
                self._read_operator()
                continue

            # Walrus operator uses ':' prefix, which is also a delimiter.
            if char == ":" and self.reader.peek_ahead(1) == "=":
                line, col = self.reader.line, self.reader.column
                self.reader.advance()
                self.reader.advance()
                self.tokens.append(Token(TokenType.OPERATOR, ":=", line, col))
                continue

            # Delimiters (Unicode): emitted with their ASCII spelling
            if char in UNICODE_DELIMITERS:
                line, col = self.reader.line, self.reader.column
                self.reader.advance()
                self.tokens.append(Token(
                    TokenType.DELIMITER, UNICODE_DELIMITERS[char], line, col
                ))
                continue

            # Delimiters (ASCII)
            if char in DELIMITERS:
                line, col = self.reader.line, self.reader.column
                self.reader.advance()
                self.tokens.append(Token(TokenType.DELIMITER, char, line, col))
                continue

            # Whitespace (spaces/tabs normally handled above; also skips
            # the carriage return of CRLF line endings)
            if char in (" ", "\t", "\r"):
                self.reader.advance()
                continue

            # Unknown character
            raise UnexpectedTokenError(
                repr(char), self.reader.line, self.reader.column
            )

        # Emit remaining DEDENTs
        while len(self._indent_stack) > 1:
            self._indent_stack.pop()
            self.tokens.append(Token(
                TokenType.DEDENT, "", self.reader.line, self.reader.column
            ))

        self.tokens.append(Token(
            TokenType.EOF, "", self.reader.line, self.reader.column
        ))

        # Auto-detect language if not set
        if self.language is None and self._detected_keywords:
            self.language = self.registry.detect_language(
                self._detected_keywords
            )
        return self.tokens

    def _skip_spaces(self):
        """Skip spaces and tabs (not newlines), except at the start of a line."""
        if self._at_line_start:
            return  # Don't skip — indentation matters
        while not self.reader.is_at_end() and self.reader.peek() in (" ", "\t"):
            self.reader.advance()

    def _read_newline(self):
        """Read a newline and emit NEWLINE token."""
        line, col = self.reader.line, self.reader.column
        self.reader.advance()
        # The token value is the two-character text backslash + "n".
        self.tokens.append(Token(TokenType.NEWLINE, "\\n", line, col))
        self._at_line_start = True

    def _read_comment(self):
        """Read a comment (# to end of line) and emit a COMMENT token."""
        line, col = self.reader.line, self.reader.column
        pieces = []
        while not self.reader.is_at_end() and self.reader.peek() != "\n":
            pieces.append(self.reader.advance())
        self.tokens.append(
            Token(TokenType.COMMENT, "".join(pieces), line, col)
        )

    def _handle_indentation(self):
        """
        Handle Python-style indentation.

        Consumes the leading spaces and tabs of the current line (a tab
        counts as 4 columns) and compares the width with the top of the
        indentation stack, emitting one INDENT or as many DEDENT tokens as
        levels are closed. Lines holding no code never change the
        indentation level.
        """
        line, col = self.reader.line, self.reader.column
        indent = 0
        while not self.reader.is_at_end() and self.reader.peek() in (" ", "\t"):
            indent += 4 if self.reader.advance() == "\t" else 1

        # Skip blank lines and comment-only lines. peek() returns "" at end
        # of input, so trailing whitespace on the last line is blank too;
        # without that check it produced a spurious INDENT/DEDENT pair.
        upcoming = self.reader.peek()
        if upcoming in ("", "\n", "#"):
            return
        # A CRLF-terminated blank line: the carriage return is followed by
        # the newline (or by the end of input).
        if upcoming == "\r" and self.reader.peek_ahead(1) in ("", "\n"):
            return

        current = self._indent_stack[-1]
        if indent > current:
            self._indent_stack.append(indent)
            self.tokens.append(Token(TokenType.INDENT, "", line, col))
        elif indent < current:
            # NOTE(review): a dedent to a width that matches no enclosing
            # level is accepted silently — confirm whether that is intended.
            while self._indent_stack and self._indent_stack[-1] > indent:
                self._indent_stack.pop()
                self.tokens.append(Token(TokenType.DEDENT, "", line, col))

    def _read_digit_run(self):
        """Consume and return a run of Unicode decimal digits and underscores."""
        pieces = []
        while not self.reader.is_at_end():
            char = self.reader.peek()
            if not (_is_digit(char) or char == "_"):
                break
            pieces.append(self.reader.advance())
        return "".join(pieces)

    def _read_numeral(self):
        """
        Read numeral token (decimal, base-prefixed, scientific).

        The caller has already confirmed that the current character is a
        digit. The raw source text is stored as the token value; no numeric
        conversion happens here.
        """
        line, col = self.reader.line, self.reader.column
        text = self.reader.advance()  # first digit already confirmed

        # Base-prefixed numerals: 0x..., 0o..., 0b...
        if text == "0" and not self.reader.is_at_end():
            base = self.reader.peek().lower()
            if base in ("x", "o", "b"):
                text += self.reader.advance()
                while not self.reader.is_at_end():
                    char = self.reader.peek()
                    if base == "x":
                        valid = _is_hex_digit(char) or char == "_"
                    elif base == "o":
                        valid = char in "01234567_"
                    else:
                        valid = char in "01_"
                    if not valid:
                        break
                    text += self.reader.advance()
                self.tokens.append(Token(TokenType.NUMERAL, text, line, col))
                return

        # Remaining integer digits
        text += self._read_digit_run()

        # Fractional part: the dot is consumed even if no digit follows it
        if not self.reader.is_at_end() and self.reader.peek() == ".":
            text += self.reader.advance()
            text += self._read_digit_run()

        # Scientific notation (ASCII e/E), only when a digit follows the
        # optional sign; otherwise the "e" is left for the next token
        if not self.reader.is_at_end() and self.reader.peek() in ("e", "E"):
            sign = self.reader.peek_ahead(1)
            has_sign = sign in ("+", "-")
            first_digit = self.reader.peek_ahead(2) if has_sign else sign
            if first_digit and _is_digit(first_digit):
                text += self.reader.advance()  # e/E
                if has_sign:
                    text += self.reader.advance()
                text += self._read_digit_run()

        self.tokens.append(Token(TokenType.NUMERAL, text, line, col))

    def _read_word(self):
        """Consume and return a run of identifier-part characters."""
        pieces = []
        while (not self.reader.is_at_end()
               and _is_identifier_part(self.reader.peek())):
            pieces.append(self.reader.advance())
        return "".join(pieces)

    def _read_identifier_or_keyword(self):
        """
        Read an identifier or keyword token.

        After the first word, up to ``_MAX_KEYWORD_WORDS - 1`` further
        words separated by spaces/tabs are read speculatively, and each
        growing phrase (words joined by a single space) is checked against
        the keyword registry. The longest phrase that is a keyword wins and
        becomes a KEYWORD token; if none matches, the reader is rewound to
        the end of the first word, which becomes an IDENTIFIER token.
        """
        line, col = self.reader.line, self.reader.column
        text = self._read_word()

        first_word_end = self._reader_state()
        words = [text]
        best_match = None

        initial_match = self._match_keyword(text)
        if initial_match is not None:
            best_match = (
                text, initial_match[0], initial_match[1], first_word_end
            )

        for _ in range(self._MAX_KEYWORD_WORDS - 1):
            before_gap = self._reader_state()
            saw_gap = False
            while (not self.reader.is_at_end()
                   and self.reader.peek() in (" ", "\t")):
                saw_gap = True
                self.reader.advance()

            # Stop extending unless a gap is followed by another word.
            if (not saw_gap
                    or self.reader.is_at_end()
                    or not _is_identifier_start(self.reader.peek())):
                self._restore_reader_state(before_gap)
                break

            words.append(self._read_word())
            phrase = " ".join(words)
            phrase_match = self._match_keyword(phrase)
            if phrase_match is not None:
                best_match = (
                    phrase,
                    phrase_match[0],
                    phrase_match[1],
                    self._reader_state(),
                )

        if best_match is not None:
            phrase, concept, language, end_state = best_match
            # Rewind past any speculative words beyond the matched phrase.
            self._restore_reader_state(end_state)
            self._detected_keywords.append(phrase)
            self.tokens.append(Token(
                TokenType.KEYWORD, phrase, line, col,
                concept=concept, language=language
            ))
            return

        self._restore_reader_state(first_word_end)
        self.tokens.append(Token(TokenType.IDENTIFIER, text, line, col))

    # pylint: disable=too-many-arguments
    def _read_quoted(self, closer, token_type, error_message, line, col,
                     allow_escapes):
        """
        Consume literal text up to ``closer`` and emit one token.

        The opening delimiter must already be consumed. With
        ``allow_escapes`` a backslash and the character after it are copied
        verbatim, so an escaped quote does not end the literal.

        Raises:
            UnexpectedTokenError: If the input ends before ``closer``.
        """
        pieces = []
        while not self.reader.is_at_end():
            at_closer = all(
                self.reader.peek_ahead(offset) == expected
                for offset, expected in enumerate(closer)
            )
            if at_closer:
                for _ in closer:
                    self.reader.advance()
                self.tokens.append(
                    Token(token_type, "".join(pieces), line, col)
                )
                return
            if allow_escapes and self.reader.peek() == "\\":
                pieces.append(self.reader.advance())  # the backslash
                pieces.append(self.reader.advance())  # escaped char ("" at end)
            else:
                pieces.append(self.reader.advance())
        raise UnexpectedTokenError(error_message, line, col)

    def _read_fstring(self):
        """Read an f-string literal: f"text {expr} text"."""
        line, col = self.reader.line, self.reader.column
        self.reader.advance()  # consume 'f'
        quote_char = self.reader.advance()  # consume opening quote
        self._read_quoted(
            quote_char, TokenType.FSTRING,
            "Unterminated f-string literal",
            line, col, quote_char in ('"', "'"),
        )

    def _read_triple_string(self, quote_char):
        """Read a triple-quoted string literal (\"\"\"...\"\"\" or '''...''')."""
        line, col = self.reader.line, self.reader.column
        # Consume the three opening quotes
        for _ in range(3):
            self.reader.advance()
        self._read_quoted(
            quote_char * 3, TokenType.STRING,
            "Unterminated triple-quoted string literal",
            line, col, quote_char in ('"', "'"),
        )

    def _read_string(self, open_char):
        """
        Read a string literal.

        The closing delimiter is looked up in STRING_PAIRS; backslash
        escapes are honoured only when it is an ASCII quote.
        """
        line, col = self.reader.line, self.reader.column
        close_char = STRING_PAIRS[open_char]
        self.reader.advance()  # consume opening quote
        self._read_quoted(
            close_char, TokenType.STRING,
            "Unterminated string literal",
            line, col, close_char in ('"', "'"),
        )

    def _read_date_literal(self):
        """Read a date literal enclosed in 〔 and 〕."""
        line, col = self.reader.line, self.reader.column
        self.reader.advance()  # consume 〔
        pieces = []
        while not self.reader.is_at_end():
            if self.reader.peek() == DATE_CLOSE:
                self.reader.advance()
                self.tokens.append(Token(
                    TokenType.DATE_LITERAL, "".join(pieces), line, col
                ))
                return
            pieces.append(self.reader.advance())
        raise UnexpectedTokenError(
            "Unterminated date literal",
            line, col
        )

    def _read_operator(self):
        """
        Read an operator token, checking for multi-character operators.

        A three-character operator (e.g. **=, //=, <<=, >>=) is only tried
        when its first two characters already form a known operator.
        """
        line, col = self.reader.line, self.reader.column
        char = self.reader.advance()
        if not self.reader.is_at_end():
            two_char = char + self.reader.peek()
            if two_char in MULTI_OPERATORS:
                # peek_ahead returns "" past the end, leaving a 2-char text
                three_char = two_char + self.reader.peek_ahead(1)
                if len(three_char) == 3 and three_char in MULTI_OPERATORS:
                    self.reader.advance()  # consume second char
                    self.reader.advance()  # consume third char
                    self.tokens.append(Token(
                        TokenType.OPERATOR, three_char, line, col
                    ))
                    return
                self.reader.advance()
                self.tokens.append(Token(
                    TokenType.OPERATOR, two_char, line, col
                ))
                return
        self.tokens.append(Token(TokenType.OPERATOR, char, line, col))
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
#
|
|
2
|
+
# SPDX-FileCopyrightText: 2024 John Samuel <johnsamuelwrites@gmail.com>
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Source reader with position tracking for the lexer."""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SourceReader:
    """
    Wraps a source string with character-by-character reading
    and position tracking.

    ``pos`` is the 0-based index of the next character to read; ``line``
    and ``column`` are the 1-based coordinates of that character.
    """

    def __init__(self, source):
        """
        Initialize the reader.

        Parameters:
            source (str): The source code string
        """
        self.source = source
        self.pos = 0
        self.line = 1
        self.column = 1

    def peek(self):
        """
        Look at the current character without consuming it.

        Returns:
            str: Current character, or empty string if at end
        """
        if self.is_at_end():
            return ""
        return self.source[self.pos]

    def peek_ahead(self, offset=1):
        """
        Look ahead by offset characters without consuming.

        Parameters:
            offset (int): Distance from the current position; may be
                negative to look back.

        Returns:
            str: Character at offset, or empty string if the position
            falls outside the source (before the start or beyond the end)
        """
        idx = self.pos + offset
        # A negative index must not wrap around to the end of the string.
        if idx < 0 or idx >= len(self.source):
            return ""
        return self.source[idx]

    def advance(self):
        """
        Consume and return the current character.

        A newline moves the position to column 1 of the next line; any
        other character moves one column to the right.

        Returns:
            str: The consumed character, or empty string if at end
        """
        if self.is_at_end():
            return ""
        char = self.source[self.pos]
        self.pos += 1
        if char == "\n":
            self.line += 1
            self.column = 1
        else:
            self.column += 1
        return char

    def match(self, expected):
        """
        Consume the current character if it matches expected.

        Parameters:
            expected (str): The expected character

        Returns:
            bool: True if matched and consumed
        """
        if self.is_at_end() or self.source[self.pos] != expected:
            return False
        self.advance()
        return True

    def is_at_end(self):
        """
        Check if we've reached the end of the source.

        Returns:
            bool: True if at end
        """
        return self.pos >= len(self.source)
|