multilingualprogramming 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. multilingualprogramming/__init__.py +74 -0
  2. multilingualprogramming/__main__.py +194 -0
  3. multilingualprogramming/codegen/__init__.py +12 -0
  4. multilingualprogramming/codegen/executor.py +215 -0
  5. multilingualprogramming/codegen/python_generator.py +592 -0
  6. multilingualprogramming/codegen/repl.py +489 -0
  7. multilingualprogramming/codegen/runtime_builtins.py +308 -0
  8. multilingualprogramming/core/__init__.py +12 -0
  9. multilingualprogramming/core/ir.py +29 -0
  10. multilingualprogramming/core/lowering.py +24 -0
  11. multilingualprogramming/datetime/__init__.py +11 -0
  12. multilingualprogramming/datetime/date_parser.py +190 -0
  13. multilingualprogramming/datetime/mp_date.py +210 -0
  14. multilingualprogramming/datetime/mp_datetime.py +153 -0
  15. multilingualprogramming/datetime/mp_time.py +147 -0
  16. multilingualprogramming/datetime/resource_loader.py +18 -0
  17. multilingualprogramming/exceptions.py +158 -0
  18. multilingualprogramming/imports.py +150 -0
  19. multilingualprogramming/keyword/__init__.py +13 -0
  20. multilingualprogramming/keyword/keyword_registry.py +249 -0
  21. multilingualprogramming/keyword/keyword_validator.py +59 -0
  22. multilingualprogramming/keyword/language_pack_validator.py +110 -0
  23. multilingualprogramming/lexer/__init__.py +11 -0
  24. multilingualprogramming/lexer/lexer.py +570 -0
  25. multilingualprogramming/lexer/source_reader.py +91 -0
  26. multilingualprogramming/lexer/token.py +54 -0
  27. multilingualprogramming/lexer/token_types.py +38 -0
  28. multilingualprogramming/numeral/__init__.py +11 -0
  29. multilingualprogramming/numeral/abstract_numeral.py +232 -0
  30. multilingualprogramming/numeral/complex_numeral.py +190 -0
  31. multilingualprogramming/numeral/fraction_numeral.py +165 -0
  32. multilingualprogramming/numeral/mp_numeral.py +243 -0
  33. multilingualprogramming/numeral/numeral_converter.py +151 -0
  34. multilingualprogramming/numeral/roman_numeral.py +301 -0
  35. multilingualprogramming/numeral/unicode_numeral.py +292 -0
  36. multilingualprogramming/parser/__init__.py +28 -0
  37. multilingualprogramming/parser/ast_nodes.py +459 -0
  38. multilingualprogramming/parser/ast_printer.py +677 -0
  39. multilingualprogramming/parser/error_messages.py +75 -0
  40. multilingualprogramming/parser/parser.py +1796 -0
  41. multilingualprogramming/parser/semantic_analyzer.py +689 -0
  42. multilingualprogramming/parser/surface_normalizer.py +282 -0
  43. multilingualprogramming/resources/datetime/eras.json +23 -0
  44. multilingualprogramming/resources/datetime/formats.json +32 -0
  45. multilingualprogramming/resources/datetime/months.json +150 -0
  46. multilingualprogramming/resources/datetime/weekdays.json +90 -0
  47. multilingualprogramming/resources/parser/error_messages.json +310 -0
  48. multilingualprogramming/resources/repl/commands.json +636 -0
  49. multilingualprogramming/resources/usm/builtins_aliases.json +731 -0
  50. multilingualprogramming/resources/usm/keywords.json +1063 -0
  51. multilingualprogramming/resources/usm/operators.json +532 -0
  52. multilingualprogramming/resources/usm/schema.json +34 -0
  53. multilingualprogramming/resources/usm/surface_patterns.json +1523 -0
  54. multilingualprogramming/unicode_string.py +140 -0
  55. multilingualprogramming/version.py +9 -0
  56. multilingualprogramming-0.2.0.dist-info/METADATA +350 -0
  57. multilingualprogramming-0.2.0.dist-info/RECORD +61 -0
  58. multilingualprogramming-0.2.0.dist-info/WHEEL +5 -0
  59. multilingualprogramming-0.2.0.dist-info/entry_points.txt +3 -0
  60. multilingualprogramming-0.2.0.dist-info/licenses/LICENSE +674 -0
  61. multilingualprogramming-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,570 @@
1
+ #
2
+ # SPDX-FileCopyrightText: 2024 John Samuel <johnsamuelwrites@gmail.com>
3
+ #
4
+ # SPDX-License-Identifier: GPL-3.0-or-later
5
+ #
6
+ """Multilingual lexer that tokenizes mixed-script source code."""
7
+ import json
8
+ import unicodedata
9
+ from pathlib import Path
10
+ from multilingualprogramming.lexer.token_types import TokenType
11
+ from multilingualprogramming.lexer.token import Token
12
+ from multilingualprogramming.lexer.source_reader import SourceReader
13
+ from multilingualprogramming.keyword.keyword_registry import KeywordRegistry
14
+ from multilingualprogramming.exceptions import UnexpectedTokenError
15
# Built-in operator/delimiter tables. _load_operator_config() starts from
# copies of these and extends them with whatever operators.json provides,
# so they also act as the fallback when that file cannot be read.

# Characters that can start (or entirely form) an ASCII operator.
_DEFAULT_SINGLE_OPERATORS = set("+-*/%<>=!&|^~")
# Two- and three-character ASCII operators recognised by the lexer.
_DEFAULT_MULTI_OPERATORS = {
    "**", "//", "==", "!=", "<=", ">=", "<<", ">>",
    "+=", "-=", "*=", "/=", "->", ":=",
    "**=", "//=", "%=", "&=", "|=", "^=", "<<=", ">>=",
}
# Unicode operator alternatives, mapped to the ASCII operator they stand for
_DEFAULT_UNICODE_OPERATORS = {
    "\u00d7": "*",   # ×
    "\u00f7": "/",   # ÷
    "\u2212": "-",   # −
    "\u2260": "!=",  # ≠
    "\u2264": "<=",  # ≤
    "\u2265": ">=",  # ≥
    "\u2192": "->",  # →
}
# Single-character ASCII delimiters.
_DEFAULT_DELIMITERS = set("()[]{},:;.@")
# Unicode delimiter alternatives, mapped to the ASCII delimiter they stand for
_DEFAULT_UNICODE_DELIMITERS = {
    "\uff08": "(", "\uff09": ")",  # fullwidth parens
    "\uff3b": "[", "\uff3d": "]",  # fullwidth brackets
    "\uff5b": "{", "\uff5d": "}",  # fullwidth braces
    "\uff0c": ",", "\u060c": ",",  # fullwidth/Arabic comma
    "\uff1a": ":",                 # fullwidth colon
    "\uff1b": ";", "\u061b": ";",  # fullwidth/Arabic semicolon
}
# String delimiter pairs, keyed by opening character: {open: close}
STRING_PAIRS = {
    '"': '"',
    "'": "'",
    "\u300c": "\u300d",  # 「」 CJK corner brackets
    "\u00ab": "\u00bb",  # «» guillemets
    "\u201c": "\u201d",  # “” smart double quotes
    "\u2018": "\u2019",  # ‘’ smart single quotes
}
# Date literal delimiters (defaults; both names are rebound further down
# from the values returned by _load_operator_config()).
DATE_OPEN = "\u3014"  # 〔
DATE_CLOSE = "\u3015"  # 〕
54
def _load_operator_config():
    """
    Load operator and delimiter tables from operators.json.

    The built-in default tables are copied and then extended with the
    entries found in ``resources/usm/operators.json`` (located relative to
    this package). Loading is best-effort: if the file is missing,
    unreadable, not valid JSON, or its top-level value is not a JSON
    object, the unmodified defaults are returned.

    Returns:
        tuple: ``(single_ops, multi_ops, unicode_ops, delimiters,
        unicode_delims, date_open, date_close)`` where the first five
        items are a set, set, dict, set and dict respectively and the
        last two are the date literal delimiter strings.
    """
    single_ops = set(_DEFAULT_SINGLE_OPERATORS)
    multi_ops = set(_DEFAULT_MULTI_OPERATORS)
    unicode_ops = dict(_DEFAULT_UNICODE_OPERATORS)
    delimiters = set(_DEFAULT_DELIMITERS)
    unicode_delims = dict(_DEFAULT_UNICODE_DELIMITERS)
    date_open = DATE_OPEN
    date_close = DATE_CLOSE
    config_path = (
        Path(__file__).resolve().parent.parent
        / "resources" / "usm" / "operators.json"
    )
    try:
        with open(config_path, "r", encoding="utf-8-sig") as handle:
            data = json.load(handle)
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        data = None
    if not isinstance(data, dict):
        # Nothing usable was loaded (including a JSON root that is not an
        # object, which would otherwise fail on .get() below).
        return (
            single_ops, multi_ops, unicode_ops,
            delimiters, unicode_delims, date_open, date_close,
        )
    for section in ("arithmetic", "comparison", "assignment", "bitwise"):
        entries = data.get(section, {})
        for meta in entries.values():
            symbols = meta.get("symbols", [])
            if not symbols:
                continue
            # The first listed symbol is the canonical spelling that the
            # Unicode alternatives are normalised to.
            canonical = symbols[0]
            for symbol in symbols:
                if len(symbol) > 1:
                    multi_ops.add(symbol)
                else:
                    single_ops.add(symbol)
            for alt in meta.get("unicode_alt", []):
                unicode_ops[alt] = canonical
    for name, meta in data.get("delimiters", {}).items():
        symbols = meta.get("symbols", [])
        if not symbols:
            continue
        canonical = symbols[0]
        if name == "ARROW":
            # Listed under delimiters in the JSON but lexed as an operator.
            multi_ops.add(canonical)
            for alt in meta.get("unicode_alt", []):
                unicode_ops[alt] = canonical
            continue
        if name == "DATE_OPEN":
            date_open = canonical
            continue
        if name == "DATE_CLOSE":
            date_close = canonical
            continue
        delimiters.add(canonical)
        for alt in meta.get("unicode_alt", []):
            unicode_delims[alt] = canonical
    return (
        single_ops, multi_ops, unicode_ops,
        delimiters, unicode_delims, date_open, date_close,
    )
112
# Effective lexer tables, built once at import time. Note that DATE_OPEN and
# DATE_CLOSE are rebound here, replacing the default values assigned above.
(SINGLE_OPERATORS, MULTI_OPERATORS, UNICODE_OPERATORS, DELIMITERS,
 UNICODE_DELIMITERS, DATE_OPEN, DATE_CLOSE) = _load_operator_config()
114
+ def _is_identifier_start(char):
115
+ """Check if a character can start an identifier."""
116
+ if not char:
117
+ return False
118
+ cat = unicodedata.category(char)
119
+ # Lu=uppercase, Ll=lowercase, Lt=titlecase, Lm=modifier, Lo=other letter
120
+ # Mn=nonspacing mark (e.g., Devanagari vowel signs that start conjuncts)
121
+ return cat.startswith("L") or cat in ("Mn", "Mc") or char == "_"
122
+ def _is_identifier_part(char):
123
+ """Check if a character can be part of an identifier."""
124
+ if not char:
125
+ return False
126
+ cat = unicodedata.category(char)
127
+ # Include combining marks (Mn=nonspacing, Mc=spacing combining)
128
+ # needed for Devanagari, Arabic, and other complex scripts
129
+ return (cat.startswith("L") or cat == "Nd"
130
+ or cat in ("Mn", "Mc") or char == "_")
131
+ def _is_digit(char):
132
+ """Check if a character is a Unicode decimal digit."""
133
+ if not char:
134
+ return False
135
+ return unicodedata.category(char) == "Nd"
136
+ def _is_hex_digit(char):
137
+ """Check if a character is an ASCII hexadecimal digit."""
138
+ return char.isdigit() or char.lower() in "abcdef"
139
+ # pylint: disable=too-few-public-methods
140
+ class Lexer:
141
+ """
142
+ Tokenizes multilingual source code.
143
+ Recognizes keywords in any of the 10 pilot languages,
144
+ Unicode identifiers, multilingual numerals, multilingual
145
+ string literals, and operators (including Unicode alternatives).
146
+ """
147
+ _MAX_KEYWORD_WORDS = 3
148
+
149
def __init__(self, source, language=None):
    """
    Set up a lexer over ``source``.

    Parameters:
        source (str): Source code to tokenize
        language (str): Restrict keyword recognition to this language.
            When None, every supported language is tried and the
            language is detected after tokenizing.
    """
    # Input and keyword lookup
    self.reader = SourceReader(source)
    self.registry = KeywordRegistry()
    self.language = language
    # Output
    self.tokens = []
    self._detected_keywords = []
    # Indentation tracking: the stack holds the open indentation widths,
    # with column 0 always at the bottom.
    self._at_line_start = True
    self._indent_stack = [0]
164
+
165
+ def _reader_state(self):
166
+ """Snapshot current reader state."""
167
+ return (self.reader.pos, self.reader.line, self.reader.column)
168
+
169
+ def _restore_reader_state(self, state):
170
+ """Restore a previously saved reader state."""
171
+ self.reader.pos, self.reader.line, self.reader.column = state
172
+
173
+ def _match_keyword(self, text):
174
+ """Return (concept, language) if text is a keyword, else None."""
175
+ if self.language is not None:
176
+ if self.registry.is_keyword(text, self.language):
177
+ return (self.registry.get_concept(text, self.language), self.language)
178
+ return None
179
+
180
+ for try_lang in self.registry.get_supported_languages():
181
+ if self.registry.is_keyword(text, try_lang):
182
+ return (self.registry.get_concept(text, try_lang), try_lang)
183
+ return None
184
+ # pylint: disable=too-many-branches,too-many-statements
185
def tokenize(self):
    """
    Tokenize the entire source string.

    Each loop iteration looks at the current character and hands off to
    the first matching reader. The order of the checks matters: f-string
    and triple-quoted prefixes are tested before plain strings, and the
    ``:=`` operator before the ``:`` delimiter.

    After the loop, one DEDENT is emitted for every indentation level
    still open, followed by a final EOF token. If no language was given
    and at least one keyword was seen, ``self.language`` is set from the
    registry's language detection.

    Returns:
        list[Token]: List of tokens (the same list as ``self.tokens``)

    Raises:
        UnexpectedTokenError: On a character no rule recognizes, or when
            one of the literal readers reaches end of input unterminated
    """
    while not self.reader.is_at_end():
        # No-op at the start of a line, where leading whitespace is
        # left for _handle_indentation().
        self._skip_spaces()
        if self.reader.is_at_end():
            break
        char = self.reader.peek()
        # Newline
        if char == "\n":
            self._read_newline()
            continue
        # Comment
        if char == "#":
            self._read_comment()
            continue
        # Handle indentation at start of line
        if self._at_line_start:
            self._handle_indentation()
            self._at_line_start = False
            if self.reader.is_at_end():
                break
            # Indentation was consumed; re-read the current character.
            char = self.reader.peek()
            if char in ("\n", "#"):
                # Blank or comment-only line: handled by the next iteration.
                continue
        # F-string literals: f"..." or f'...'
        if char in ('f', 'F') and self.reader.peek_ahead(1) in ('"', "'"):
            self._read_fstring()
            continue
        # String literals (check triple-quoted first)
        if char in ('"', "'") and self.reader.peek_ahead(1) == char \
                and self.reader.peek_ahead(2) == char:
            self._read_triple_string(char)
            continue
        # String literals
        if char in STRING_PAIRS:
            self._read_string(char)
            continue
        # Date literals
        if char == DATE_OPEN:
            self._read_date_literal()
            continue
        # Numerals (any Unicode decimal digit, ASCII included). A leading
        # sign is not part of the numeral; it is lexed as an operator.
        if _is_digit(char):
            self._read_numeral()
            continue
        # Identifiers and keywords
        if _is_identifier_start(char):
            self._read_identifier_or_keyword()
            continue
        # Operators (Unicode) - emitted with their ASCII equivalent as value
        if char in UNICODE_OPERATORS:
            line, col = self.reader.line, self.reader.column
            self.reader.advance()
            self.tokens.append(Token(
                TokenType.OPERATOR, UNICODE_OPERATORS[char],
                line, col
            ))
            continue
        # Operators (ASCII)
        if char in SINGLE_OPERATORS:
            self._read_operator()
            continue
        # Walrus operator uses ':' prefix, which is also a delimiter.
        if char == ":" and self.reader.peek_ahead(1) == "=":
            line, col = self.reader.line, self.reader.column
            self.reader.advance()
            self.reader.advance()
            self.tokens.append(Token(
                TokenType.OPERATOR, ":=", line, col
            ))
            continue
        # Delimiters (Unicode) - emitted with their ASCII equivalent as value
        if char in UNICODE_DELIMITERS:
            line, col = self.reader.line, self.reader.column
            self.reader.advance()
            self.tokens.append(Token(
                TokenType.DELIMITER, UNICODE_DELIMITERS[char],
                line, col
            ))
            continue
        # Delimiters (ASCII)
        if char in DELIMITERS:
            line, col = self.reader.line, self.reader.column
            self.reader.advance()
            self.tokens.append(Token(
                TokenType.DELIMITER, char, line, col
            ))
            continue
        # Leftover whitespace: carriage returns, plus any space/tab that
        # reaches this point. Skipped without emitting a token.
        if char in (" ", "\t", "\r"):
            self.reader.advance()
            continue
        # Unknown character
        raise UnexpectedTokenError(
            repr(char), self.reader.line, self.reader.column
        )
    # Emit remaining DEDENTs
    while len(self._indent_stack) > 1:
        self._indent_stack.pop()
        self.tokens.append(Token(
            TokenType.DEDENT, "", self.reader.line, self.reader.column
        ))
    self.tokens.append(Token(
        TokenType.EOF, "", self.reader.line, self.reader.column
    ))
    # Auto-detect language if not set
    if self.language is None and self._detected_keywords:
        self.language = self.registry.detect_language(
            self._detected_keywords
        )
    return self.tokens
300
+ def _skip_spaces(self):
301
+ """Skip spaces and tabs (not newlines)."""
302
+ while not self.reader.is_at_end() and self.reader.peek() in (" ", "\t"):
303
+ if self._at_line_start:
304
+ break # Don't skip — indentation matters
305
+ self.reader.advance()
306
def _read_newline(self):
    """Consume one newline, emit a NEWLINE token and flag a new line start."""
    start = (self.reader.line, self.reader.column)
    self.reader.advance()
    # The token value is the two-character escape text, not a raw newline.
    newline_token = Token(TokenType.NEWLINE, "\\n", start[0], start[1])
    self.tokens.append(newline_token)
    self._at_line_start = True
312
def _read_comment(self):
    """
    Read a comment running from ``#`` to the end of the line.

    The terminating newline is left in the input; the COMMENT token's
    value includes the leading ``#``.
    """
    start_line, start_col = self.reader.line, self.reader.column
    pieces = []
    while not self.reader.is_at_end() and self.reader.peek() != "\n":
        pieces.append(self.reader.advance())
    self.tokens.append(
        Token(TokenType.COMMENT, "".join(pieces), start_line, start_col)
    )
319
+ def _handle_indentation(self):
320
+ """Handle Python-style indentation."""
321
+ line, col = self.reader.line, self.reader.column
322
+ indent = 0
323
+ while not self.reader.is_at_end() and self.reader.peek() in (" ", "\t"):
324
+ char = self.reader.advance()
325
+ if char == "\t":
326
+ indent += 4 # Tab = 4 spaces
327
+ else:
328
+ indent += 1
329
+ # Skip blank lines and comment-only lines
330
+ if not self.reader.is_at_end() and self.reader.peek() in ("\n", "#"):
331
+ return
332
+ current = self._indent_stack[-1]
333
+ if indent > current:
334
+ self._indent_stack.append(indent)
335
+ self.tokens.append(Token(TokenType.INDENT, "", line, col))
336
+ elif indent < current:
337
+ while self._indent_stack and self._indent_stack[-1] > indent:
338
+ self._indent_stack.pop()
339
+ self.tokens.append(Token(TokenType.DEDENT, "", line, col))
340
def _read_numeral(self):
    """
    Read one NUMERAL token.

    Two shapes are recognized: a base-prefixed literal (``0x``, ``0o``,
    ``0b``, prefix letter in either case) followed by the digits valid
    for that base, or a decimal run with an optional fractional part and
    an optional ASCII ``e``/``E`` exponent. Underscores are accepted
    wherever digits are. The caller has already checked that the current
    character is a digit.
    """
    reader = self.reader
    line, col = reader.line, reader.column

    def take_while(accepts):
        """Consume characters while ``accepts`` holds; return them joined."""
        taken = []
        while not reader.is_at_end() and accepts(reader.peek()):
            taken.append(reader.advance())
        return "".join(taken)

    def decimal_char(candidate):
        return _is_digit(candidate) or candidate == "_"

    text = reader.advance()  # first digit already confirmed

    # Base-prefixed numerals: 0x..., 0o..., 0b...
    if text == "0" and not reader.is_at_end():
        prefix = reader.peek()
        base_char = {
            "x": lambda candidate: _is_hex_digit(candidate) or candidate == "_",
            "o": lambda candidate: candidate in "01234567_",
            "b": lambda candidate: candidate in "01_",
        }.get(prefix.lower())
        if base_char is not None:
            text += reader.advance()
            text += take_while(base_char)
            self.tokens.append(Token(TokenType.NUMERAL, text, line, col))
            return

    # Integer part
    text += take_while(decimal_char)

    # Fractional part
    if not reader.is_at_end() and reader.peek() == ".":
        text += reader.advance()
        text += take_while(decimal_char)

    # Scientific notation (ASCII e/E), taken only when a digit follows
    # the optional sign; otherwise the 'e' is left for the next token.
    if not reader.is_at_end() and reader.peek() in ("e", "E"):
        after_marker = reader.peek_ahead(1)
        signed = after_marker in ("+", "-")
        leading_digit = reader.peek_ahead(2) if signed else after_marker
        if leading_digit and _is_digit(leading_digit):
            text += reader.advance()  # e/E
            if signed:
                text += reader.advance()
            text += take_while(decimal_char)

    self.tokens.append(Token(TokenType.NUMERAL, text, line, col))
395
def _read_identifier_or_keyword(self):
    """
    Read an identifier or keyword token.

    Keywords may span several words (up to ``_MAX_KEYWORD_WORDS``). After
    reading the first word, the method speculatively reads further words
    separated by spaces/tabs and remembers the longest phrase that the
    registry recognizes as a keyword. The reader is then rewound to the
    end of that phrase, or to the end of the first word when nothing
    matched, in which case an IDENTIFIER token is emitted.
    """
    line, col = self.reader.line, self.reader.column
    text = ""
    while not self.reader.is_at_end() and _is_identifier_part(self.reader.peek()):
        text += self.reader.advance()

    # Fallback position if no longer phrase turns out to be a keyword.
    first_word_end = self._reader_state()
    words = [text]
    # (phrase, concept, language, reader state at end of phrase) or None
    best_match = None

    initial_match = self._match_keyword(text)
    if initial_match is not None:
        best_match = (text, initial_match[0], initial_match[1], first_word_end)

    # Try extending the phrase one word at a time.
    for _ in range(self._MAX_KEYWORD_WORDS - 1):
        before_gap = self._reader_state()
        saw_gap = False
        while not self.reader.is_at_end() and self.reader.peek() in (" ", "\t"):
            saw_gap = True
            self.reader.advance()
        # No separating whitespace, or nothing after it: stop extending.
        if (not saw_gap) or self.reader.is_at_end():
            self._restore_reader_state(before_gap)
            break
        # The next token is not a word: stop extending.
        if not _is_identifier_start(self.reader.peek()):
            self._restore_reader_state(before_gap)
            break

        next_word = ""
        while (not self.reader.is_at_end()
               and _is_identifier_part(self.reader.peek())):
            next_word += self.reader.advance()

        words.append(next_word)
        # Words are joined by a single space whatever the width of the
        # gap in the source.
        phrase = " ".join(words)
        phrase_match = self._match_keyword(phrase)
        if phrase_match is not None:
            # A longer match replaces any shorter one found earlier.
            best_match = (
                phrase,
                phrase_match[0],
                phrase_match[1],
                self._reader_state(),
            )

    if best_match is not None:
        phrase, concept, language, end_state = best_match
        # Give back any words read past the end of the matched phrase.
        self._restore_reader_state(end_state)
        self._detected_keywords.append(phrase)
        self.tokens.append(Token(
            TokenType.KEYWORD, phrase, line, col,
            concept=concept, language=language
        ))
        return

    # Not a keyword: only the first word belongs to this token.
    self._restore_reader_state(first_word_end)
    self.tokens.append(Token(TokenType.IDENTIFIER, text, line, col))
451
def _read_fstring(self):
    """
    Read an f-string literal such as ``f"text {expr} text"``.

    The body is stored verbatim in an FSTRING token: backslash escapes
    are kept as written and the ``{...}`` parts are not interpreted here.

    Raises:
        UnexpectedTokenError: If the input ends before the closing quote
    """
    start_line, start_col = self.reader.line, self.reader.column
    self.reader.advance()  # consume 'f'
    quote_char = self.reader.advance()  # consume opening quote
    escapes_enabled = quote_char in ('"', "'")
    pieces = []
    while not self.reader.is_at_end():
        current = self.reader.advance()
        if current == quote_char:
            self.tokens.append(Token(
                TokenType.FSTRING, "".join(pieces), start_line, start_col
            ))
            return
        if escapes_enabled and current == "\\":
            # Keep the backslash together with the character it escapes
            # so an escaped quote does not end the literal.
            pieces.append("\\" + self.reader.advance())
        else:
            pieces.append(current)
    raise UnexpectedTokenError(
        "Unterminated f-string literal",
        start_line, start_col
    )
475
def _read_triple_string(self, quote_char):
    """
    Read a triple-quoted string literal.

    The literal opens and closes with three copies of ``quote_char``.
    The body is stored verbatim (escapes kept as written) in a STRING
    token.

    Raises:
        UnexpectedTokenError: If the input ends before the closing quotes
    """
    start_line, start_col = self.reader.line, self.reader.column
    escapes_enabled = quote_char in ('"', "'")
    # Consume the three opening quotes
    for _ in range(3):
        self.reader.advance()
    pieces = []
    while not self.reader.is_at_end():
        current = self.reader.peek()
        closing = (
            current == quote_char
            and self.reader.peek_ahead(1) == quote_char
            and self.reader.peek_ahead(2) == quote_char
        )
        if closing:
            # Consume the three closing quotes
            for _ in range(3):
                self.reader.advance()
            self.tokens.append(Token(
                TokenType.STRING, "".join(pieces), start_line, start_col
            ))
            return
        if escapes_enabled and current == "\\":
            self.reader.advance()  # consume backslash
            pieces.append("\\" + self.reader.advance())
        else:
            pieces.append(self.reader.advance())
    raise UnexpectedTokenError(
        "Unterminated triple-quoted string literal",
        start_line, start_col
    )
505
def _read_string(self, open_char):
    """
    Read a string literal opened by ``open_char``.

    The closing character comes from STRING_PAIRS. Backslash escapes are
    honoured only for the ASCII quote styles; in every case the body is
    stored verbatim in a STRING token.

    Raises:
        UnexpectedTokenError: If the input ends before the closing quote
    """
    start_line, start_col = self.reader.line, self.reader.column
    closer = STRING_PAIRS[open_char]
    escapes_enabled = closer in ('"', "'")
    self.reader.advance()  # consume opening quote
    pieces = []
    while not self.reader.is_at_end():
        current = self.reader.advance()
        if current == closer:
            self.tokens.append(Token(
                TokenType.STRING, "".join(pieces), start_line, start_col
            ))
            return
        if escapes_enabled and current == "\\":
            # Backslash plus the escaped character are kept as a pair.
            pieces.append("\\" + self.reader.advance())
        else:
            pieces.append(current)
    # Unterminated string
    raise UnexpectedTokenError(
        "Unterminated string literal",
        start_line, start_col
    )
530
def _read_date_literal(self):
    """
    Read a date literal delimited by DATE_OPEN and DATE_CLOSE (〔 〕 by default).

    Everything between the delimiters is stored unparsed in a
    DATE_LITERAL token.

    Raises:
        UnexpectedTokenError: If the input ends before DATE_CLOSE
    """
    start_line, start_col = self.reader.line, self.reader.column
    self.reader.advance()  # consume the opening delimiter
    pieces = []
    while not self.reader.is_at_end():
        current = self.reader.advance()
        if current == DATE_CLOSE:
            self.tokens.append(Token(
                TokenType.DATE_LITERAL, "".join(pieces), start_line, start_col
            ))
            return
        pieces.append(current)
    raise UnexpectedTokenError(
        "Unterminated date literal",
        start_line, start_col
    )
548
def _read_operator(self):
    """
    Read one ASCII operator token, preferring the longest spelling.

    A three-character operator is only considered when its first two
    characters already form a known operator (e.g. ``**`` before ``**=``).
    """
    line, col = self.reader.line, self.reader.column
    lexeme = self.reader.advance()
    if not self.reader.is_at_end():
        pair = lexeme + self.reader.peek()
        if pair in MULTI_OPERATORS:
            third = self.reader.peek_ahead(1)
            triple = pair + third if third else ""
            if len(triple) == 3 and triple in MULTI_OPERATORS:
                lexeme = triple
            else:
                lexeme = pair
    # The first character is already consumed; take the rest of the lexeme.
    for _ in range(len(lexeme) - 1):
        self.reader.advance()
    self.tokens.append(Token(TokenType.OPERATOR, lexeme, line, col))
@@ -0,0 +1,91 @@
1
+ #
2
+ # SPDX-FileCopyrightText: 2024 John Samuel <johnsamuelwrites@gmail.com>
3
+ #
4
+ # SPDX-License-Identifier: GPL-3.0-or-later
5
+ #
6
+
7
+ """Source reader with position tracking for the lexer."""
8
+
9
+
10
class SourceReader:
    """
    Wraps a source string with character-by-character reading
    and position tracking.

    ``pos`` is the 0-based index of the next character to read; ``line``
    and ``column`` are the 1-based position of that character.
    """

    def __init__(self, source):
        """
        Initialize the reader.

        Parameters:
            source (str): The source code string
        """
        self.source = source
        self.pos = 0
        self.line = 1
        self.column = 1

    def peek(self):
        """
        Look at the current character without consuming it.

        Returns:
            str: Current character, or empty string if at end
        """
        if self.is_at_end():
            return ""
        return self.source[self.pos]

    def peek_ahead(self, offset=1):
        """
        Look ahead by offset characters without consuming.

        Parameters:
            offset (int): Distance from the current position. Negative
                values look behind.

        Returns:
            str: Character at offset, or empty string if the target lies
            outside the source (past the end or before the start)
        """
        idx = self.pos + offset
        # idx < 0 must be rejected explicitly: a negative index would
        # otherwise wrap around and return a character from the end.
        if idx < 0 or idx >= len(self.source):
            return ""
        return self.source[idx]

    def advance(self):
        """
        Consume and return the current character.

        A newline moves the position to column 1 of the next line; any
        other character moves one column to the right.

        Returns:
            str: The consumed character, or empty string if at end
        """
        if self.is_at_end():
            return ""
        char = self.source[self.pos]
        self.pos += 1
        if char == "\n":
            self.line += 1
            self.column = 1
        else:
            self.column += 1
        return char

    def match(self, expected):
        """
        Consume the current character if it matches expected.

        Parameters:
            expected (str): The expected character

        Returns:
            bool: True if matched and consumed
        """
        if self.is_at_end() or self.source[self.pos] != expected:
            return False
        self.advance()
        return True

    def is_at_end(self):
        """
        Check if we've reached the end of the source.

        Returns:
            bool: True if at end
        """
        return self.pos >= len(self.source)