minecraft-datapack-language 15.4.28-py3-none-any.whl → 15.4.30-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- minecraft_datapack_language/__init__.py +23 -2
- minecraft_datapack_language/_version.py +2 -2
- minecraft_datapack_language/ast_nodes.py +87 -59
- minecraft_datapack_language/cli.py +276 -139
- minecraft_datapack_language/mdl_compiler.py +470 -0
- minecraft_datapack_language/mdl_errors.py +14 -0
- minecraft_datapack_language/mdl_lexer.py +624 -0
- minecraft_datapack_language/mdl_parser.py +573 -0
- minecraft_datapack_language-15.4.30.dist-info/METADATA +266 -0
- minecraft_datapack_language-15.4.30.dist-info/RECORD +17 -0
- minecraft_datapack_language/cli_build.py +0 -1292
- minecraft_datapack_language/cli_check.py +0 -155
- minecraft_datapack_language/cli_colors.py +0 -264
- minecraft_datapack_language/cli_help.py +0 -508
- minecraft_datapack_language/cli_new.py +0 -300
- minecraft_datapack_language/cli_utils.py +0 -276
- minecraft_datapack_language/expression_processor.py +0 -352
- minecraft_datapack_language/linter.py +0 -409
- minecraft_datapack_language/mdl_lexer_js.py +0 -754
- minecraft_datapack_language/mdl_parser_js.py +0 -1049
- minecraft_datapack_language/pack.py +0 -758
- minecraft_datapack_language-15.4.28.dist-info/METADATA +0 -1274
- minecraft_datapack_language-15.4.28.dist-info/RECORD +0 -25
- {minecraft_datapack_language-15.4.28.dist-info → minecraft_datapack_language-15.4.30.dist-info}/WHEEL +0 -0
- {minecraft_datapack_language-15.4.28.dist-info → minecraft_datapack_language-15.4.30.dist-info}/entry_points.txt +0 -0
- {minecraft_datapack_language-15.4.28.dist-info → minecraft_datapack_language-15.4.30.dist-info}/licenses/LICENSE +0 -0
- {minecraft_datapack_language-15.4.28.dist-info → minecraft_datapack_language-15.4.30.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ minecraft_datapack_language/mdl_lexer.py
@@ -0,0 +1,624 @@
+"""
+MDL Lexer - Clean, extensible lexer for Minecraft Datapack Language
+Fully supports the language specification defined in language-reference.md
+"""
+
+import re
+from dataclasses import dataclass
+from typing import List, Optional, Dict, Any
+from .mdl_errors import MDLLexerError
+
+
+@dataclass
+class Token:
+    """Represents a single token in the MDL language."""
+    type: str
+    value: str
+    line: int
+    column: int
+
+    def __repr__(self) -> str:
+        return f"Token({self.type}, '{self.value}', line={self.line}, col={self.column})"
+
+
+class TokenType:
+    """All possible token types in the MDL language."""
+
+    # Keywords (Reserved Words)
+    PACK = "PACK"
+    NAMESPACE = "NAMESPACE"
+    FUNCTION = "FUNCTION"
+    VAR = "VAR"
+    NUM = "NUM"
+    IF = "IF"
+    ELSE = "ELSE"
+    WHILE = "WHILE"
+    ON_LOAD = "ON_LOAD"
+    ON_TICK = "ON_TICK"
+    EXEC = "EXEC"
+    TAG = "TAG"
+
+    # Tag Types (Resource Categories)
+    RECIPE = "RECIPE"
+    LOOT_TABLE = "LOOT_TABLE"
+    ADVANCEMENT = "ADVANCEMENT"
+    ITEM_MODIFIER = "ITEM_MODIFIER"
+    PREDICATE = "PREDICATE"
+    STRUCTURE = "STRUCTURE"
+
+    # Operators
+    PLUS = "PLUS"                    # +
+    MINUS = "MINUS"                  # -
+    MULTIPLY = "MULTIPLY"            # *
+    DIVIDE = "DIVIDE"                # /
+    ASSIGN = "ASSIGN"                # =
+    EQUAL = "EQUAL"                  # ==
+    NOT_EQUAL = "NOT_EQUAL"          # !=
+    GREATER = "GREATER"              # >
+    LESS = "LESS"                    # <
+    GREATER_EQUAL = "GREATER_EQUAL"  # >=
+    LESS_EQUAL = "LESS_EQUAL"        # <=
+
+    # Delimiters
+    SEMICOLON = "SEMICOLON"  # ;
+    COMMA = "COMMA"          # ,
+    COLON = "COLON"          # :
+
+    # Brackets and Braces
+    LPAREN = "LPAREN"      # (
+    RPAREN = "RPAREN"      # )
+    LBRACE = "LBRACE"      # {
+    RBRACE = "RBRACE"      # }
+    LBRACKET = "LBRACKET"  # [
+    RBRACKET = "RBRACKET"  # ]
+    LANGLE = "LANGLE"      # < (for scope syntax)
+    RANGLE = "RANGLE"      # > (for scope syntax)
+
+    # Special Tokens
+    DOLLAR = "DOLLAR"            # $ (variable substitution)
+    QUOTE = "QUOTE"              # " (string literal delimiter)
+    EXCLAMATION = "EXCLAMATION"  # ! (for raw blocks)
+    RANGE = "RANGE"              # .. (range operator)
+
+    # Literals
+    IDENTIFIER = "IDENTIFIER"  # Variable names, function names, etc.
+    NUMBER = "NUMBER"          # Numbers (integers and floats)
+
+    # Special
+    NEWLINE = "NEWLINE"
+    EOF = "EOF"
+    COMMENT = "COMMENT"          # Comments (ignored during parsing)
+    RAW_CONTENT = "RAW_CONTENT"  # Raw content inside raw blocks
+
+
+class MDLLexer:
+    """
+    Clean, extensible lexer for the MDL language.
+
+    Features:
+    - Full support for all language constructs defined in the spec
+    - Clean, readable code structure
+    - Easy to extend with new token types
+    - Comprehensive error handling
+    - Efficient tokenization with minimal memory usage
+    """
+
+    def __init__(self, source_file: str = None):
+        self.source_file = source_file
+        self.reset()
+
+    def reset(self):
+        """Reset the lexer state."""
+        self.tokens = []
+        self.current = 0
+        self.start = 0
+        self.line = 1
+        self.column = 1
+        self.in_raw_mode = False
+        self.source = ""
+
+    def lex(self, source: str) -> List[Token]:
+        """
+        Lex the source code into tokens.
+
+        Args:
+            source: The source code string to tokenize
+
+        Returns:
+            List of Token objects representing the source code
+
+        Raises:
+            MDLLexerError: If there's a lexical error in the source code
+        """
+        self.reset()
+        self.source = source
+
+        while self.current < len(source):
+            self.start = self.current
+            self._scan_token()
+
+        # Add EOF token
+        self.tokens.append(Token(TokenType.EOF, "", self.line, self.column))
+        return self.tokens
+
+    def _scan_token(self):
+        """Scan a single token from the source."""
+        if self.current >= len(self.source):
+            return
+
+        char = self.source[self.current]
+
+        # Handle raw mode
+        if self.in_raw_mode:
+            self._scan_raw_text()
+            return
+
+        # Handle whitespace and newlines
+        if char.isspace():
+            self._scan_whitespace()
+            return
+
+        # Handle comments
+        if char == '/' and self._peek(1) == '/':
+            self._scan_single_line_comment()
+            return
+
+        if char == '/' and self._peek(1) == '*':
+            self._scan_multi_line_comment()
+            return
+
+        # Handle strings (quotes)
+        if char == '"':
+            self._scan_string()
+            return
+
+        # Handle raw block markers
+        if char == '$' and self._peek(1) == '!' and self._peek(2) == 'r':
+            if self._peek(3) == 'a' and self._peek(4) == 'w':
+                self._scan_raw_block_start()
+                return
+
+
+
+        # Handle variable substitution
+        if char == '$':
+            self._scan_variable_substitution()
+            return
+
+        # Handle numbers
+        if char.isdigit():
+            self._scan_number()
+            return
+
+        # Handle identifiers and keywords
+        if char.isalpha() or char == '_':
+            self._scan_identifier()
+            return
+
+        # Handle @ selectors (like @s, @a, @e[type=armor_stand])
+        if char == '@':
+            self._scan_selector()
+            return
+
+        # Handle scope selectors (<@s>, <@a[team=red]>, etc.)
+        if char == '<':
+            # Check if this is a scope selector (followed by @ or identifier)
+            if (self.current + 1 < len(self.source) and
+                    (self.source[self.current + 1] == '@' or
+                     self.source[self.current + 1].isalpha() or
+                     self.source[self.current + 1] == '_')):
+                self._scan_scope_selector()
+                return
+            # Otherwise, treat as LESS operator (handled by _scan_operator_or_delimiter)
+
+        # Handle operators and delimiters
+        self._scan_operator_or_delimiter()
+
+    def _scan_whitespace(self):
+        """Scan whitespace characters."""
+        while (self.current < len(self.source) and
+               self.source[self.current].isspace()):
+            char = self.source[self.current]
+            if char == '\n':
+                self.line += 1
+                self.column = 1
+            else:
+                self.column += 1
+            self.current += 1
+
+    def _scan_single_line_comment(self):
+        """Scan a single-line comment (// ...)."""
+        # Skip //
+        self.current += 2
+        self.column += 2
+
+        # Scan until end of line or end of source
+        while (self.current < len(self.source) and
+               self.source[self.current] != '\n'):
+            self.current += 1
+            self.column += 1
+
+        # Comments are ignored - no token generated
+
+    def _scan_multi_line_comment(self):
+        """Scan a multi-line comment (/* ... */)."""
+        # Skip /*
+        self.current += 2
+        self.column += 2
+
+        # Scan until we find */
+        while (self.current < len(self.source) - 1):
+            if (self.source[self.current] == '*' and
+                    self.source[self.current + 1] == '/'):
+                self.current += 2
+                self.column += 2
+                return
+
+            if self.source[self.current] == '\n':
+                self.line += 1
+                self.column = 1
+            else:
+                self.column += 1
+            self.current += 1
+
+        # Unterminated comment
+        self._error("Unterminated multi-line comment", "Add */ to close the comment")
+
+    def _scan_string(self):
+        """Scan a string literal (quoted text)."""
+        # Skip opening quote
+        self.current += 1
+        self.column += 1
+
+        start_line = self.line
+        start_column = self.column
+
+        # Scan until closing quote
+        while (self.current < len(self.source) and
+               self.source[self.current] != '"'):
+            if self.source[self.current] == '\n':
+                self._error("Unterminated string literal", "Add a closing quote")
+
+            if self.source[self.current] == '\\' and self.current + 1 < len(self.source):
+                # Handle escape sequences
+                self.current += 2
+                self.column += 2
+            else:
+                self.current += 1
+                self.column += 1
+
+        if self.current >= len(self.source):
+            self._error("Unterminated string literal at end of file", "Add a closing quote")
+
+        # Include closing quote
+        self.current += 1
+        self.column += 1
+
+        # Generate QUOTE token for the opening quote
+        self.tokens.append(Token(TokenType.QUOTE, '"', start_line, start_column))
+
+        # Generate IDENTIFIER token for the string content
+        string_content = self.source[self.start + 1:self.current - 1]
+        self.tokens.append(Token(TokenType.IDENTIFIER, string_content, start_line, start_column + 1))
+
+        # Generate QUOTE token for the closing quote
+        self.tokens.append(Token(TokenType.QUOTE, '"', self.line, self.column - 1))
+
+    def _scan_raw_block_start(self):
+        """Scan the start of a raw block ($!raw)."""
+        # Consume $!raw
+        self.current += 5
+        self.column += 5
+
+        # Generate tokens: $ ! raw
+        self.tokens.append(Token(TokenType.DOLLAR, "$", self.line, self.column - 5))
+        self.tokens.append(Token(TokenType.EXCLAMATION, "!", self.line, self.column - 4))
+        self.tokens.append(Token(TokenType.IDENTIFIER, "raw", self.line, self.column - 3))
+
+        self.in_raw_mode = True
+
+
+
+    def _scan_raw_text(self):
+        """Scan raw text inside a raw block."""
+        # Remember where the raw content starts
+        content_start = self.current
+
+        # Consume all characters until we find raw!$
+        while self.current < len(self.source) - 4:
+            if (self.source[self.current:self.current + 5] == 'raw!$'):
+                # Found the end marker - extract the content
+                raw_content = self.source[content_start:self.current]
+
+                # Generate a single RAW_CONTENT token with all the content
+                self.tokens.append(Token(TokenType.RAW_CONTENT, raw_content, self.line, self.column))
+
+                # Consume the end marker and exit raw mode
+                self.current += 5
+                self.column += 5
+                self.in_raw_mode = False
+
+                # Generate tokens for the end marker: raw ! $
+                self.tokens.append(Token(TokenType.IDENTIFIER, "raw", self.line, self.column - 5))
+                self.tokens.append(Token(TokenType.EXCLAMATION, "!", self.line, self.column - 2))
+                self.tokens.append(Token(TokenType.DOLLAR, "$", self.line, self.column - 1))
+                return
+
+            if self.source[self.current] == '\n':
+                self.line += 1
+                self.column = 1
+            else:
+                self.column += 1
+            self.current += 1
+
+        # If we didn't find the end marker, it's an error
+        if self.current >= len(self.source) - 4:
+            self._error("Unterminated raw block", "Add 'raw!$' to close the raw block")
+
+    def _scan_variable_substitution(self):
+        """Scan variable substitution ($variable<scope>$)."""
+        # Skip opening $
+        self.current += 1
+        self.column += 1
+
+        # Generate DOLLAR token
+        self.tokens.append(Token(TokenType.DOLLAR, "$", self.line, self.column - 1))
+
+        # Scan variable name (start from current position, not from start)
+        self.start = self.current
+        self._scan_identifier()
+
+        # Check for scope selector
+        if (self.current < len(self.source) and
+                self.source[self.current] == '<'):
+            self._scan_scope_selector()
+
+        # Check for closing $
+        if (self.current < len(self.source) and
+                self.source[self.current] == '$'):
+            self.current += 1
+            self.column += 1
+            self.tokens.append(Token(TokenType.DOLLAR, "$", self.line, self.column - 1))
+        else:
+            self._error("Unterminated variable substitution", "Add $ to close the variable substitution")
+
+    def _scan_selector(self):
+        """Scan a selector (@s, @a, @e[type=armor_stand], etc.)."""
+        # Consume @
+        self.current += 1
+        self.column += 1
+
+        # Scan selector identifier
+        self._scan_identifier()
+
+        # Check for bracket parameters
+        if (self.current < len(self.source) and
+                self.source[self.current] == '['):
+            self._scan_selector_parameters()
+
+    def _scan_selector_parameters(self):
+        """Scan selector parameters in brackets."""
+        # Consume [
+        self.current += 1
+        self.column += 1
+
+        # Generate LBRACKET token
+        self.tokens.append(Token(TokenType.LBRACKET, "[", self.line, self.column - 1))
+
+        # Remember where the parameters start (after the opening [)
+        param_start = self.current
+
+        # Scan until we find the matching ]
+        bracket_count = 1
+        while (self.current < len(self.source) and bracket_count > 0):
+            if self.source[self.current] == '[':
+                bracket_count += 1
+            elif self.source[self.current] == ']':
+                bracket_count -= 1
+
+            if bracket_count > 0:
+                if self.source[self.current] == '\n':
+                    self.line += 1
+                    self.column = 1
+                else:
+                    self.column += 1
+                self.current += 1
+
+        if bracket_count == 0:
+            # Successfully found closing ]
+            # Generate IDENTIFIER token for the entire parameter content
+            param_content = self.source[param_start:self.current]
+            self.tokens.append(Token(TokenType.IDENTIFIER, param_content, self.line, self.column - len(param_content)))
+
+            # Generate RBRACKET token
+            self.current += 1
+            self.column += 1
+            self.tokens.append(Token(TokenType.RBRACKET, "]", self.line, self.column - 1))
+        else:
+            # Unterminated selector parameters
+            self._error("Unterminated selector parameters", "Add ] to close the selector parameters")
+
+    def _scan_scope_selector(self):
+        """Scan a scope selector (<@s>, <@a[team=red]>, etc.)."""
+        # Consume <
+        self.current += 1
+        self.column += 1
+
+        # Generate LANGLE token
+        self.tokens.append(Token(TokenType.LANGLE, "<", self.line, self.column - 1))
+
+        # Scan selector content - this could be @s, @a[team=red], etc.
+        if (self.current < len(self.source) and
+                self.source[self.current] == '@'):
+            # Handle @ selector - start from current position
+            self.start = self.current
+            self._scan_selector()
+        else:
+            # Handle other identifier - start from current position
+            self.start = self.current
+            self._scan_identifier()
+
+        # Consume >
+        if (self.current < len(self.source) and
+                self.source[self.current] == '>'):
+            self.current += 1
+            self.column += 1
+            self.tokens.append(Token(TokenType.RANGLE, ">", self.line, self.column - 1))
+        else:
+            self._error("Unterminated scope selector", "Add > to close the scope selector")
+
+    def _scan_number(self):
+        """Scan a number literal."""
+        # Scan integer part
+        while (self.current < len(self.source) and
+               self.source[self.current].isdigit()):
+            self.current += 1
+            self.column += 1
+
+        # Check for decimal point
+        if (self.current < len(self.source) and
+                self.source[self.current] == '.' and
+                self.current + 1 < len(self.source) and
+                self.source[self.current + 1].isdigit()):
+            self.current += 1  # consume decimal point
+            self.column += 1
+
+            # Scan fractional part
+            while (self.current < len(self.source) and
+                   self.source[self.current].isdigit()):
+                self.current += 1
+                self.column += 1
+
+        number_text = self.source[self.start:self.current]
+        self.tokens.append(Token(TokenType.NUMBER, number_text, self.line, self.column - len(number_text)))
+
+    def _scan_identifier(self):
+        """Scan an identifier or keyword."""
+        # Scan identifier characters
+        while (self.current < len(self.source) and
+               (self.source[self.current].isalnum() or
+                self.source[self.current] == '_')):
+            self.current += 1
+            self.column += 1
+
+        identifier_text = self.source[self.start:self.current]
+
+        # Check if it's a keyword
+        token_type = self._get_keyword_type(identifier_text)
+
+        self.tokens.append(Token(token_type, identifier_text, self.line, self.column - len(identifier_text)))
+
+    def _scan_operator_or_delimiter(self):
+        """Scan operators and delimiters."""
+        char = self.source[self.current]
+
+        # Handle two-character operators first
+        if self.current + 1 < len(self.source):
+            two_char = self.source[self.current:self.current + 2]
+
+            if two_char in ['==', '!=', '>=', '<=', '..']:
+                self.current += 2
+                self.column += 2
+
+                token_type = {
+                    '==': TokenType.EQUAL,
+                    '!=': TokenType.NOT_EQUAL,
+                    '>=': TokenType.GREATER_EQUAL,
+                    '<=': TokenType.LESS_EQUAL,
+                    '..': TokenType.RANGE
+                }[two_char]
+
+                self.tokens.append(Token(token_type, two_char, self.line, self.column - 2))
+                return
+
+        # Handle single-character operators and delimiters
+        token_map = {
+            '+': TokenType.PLUS,
+            '-': TokenType.MINUS,
+            '*': TokenType.MULTIPLY,
+            '/': TokenType.DIVIDE,
+            '=': TokenType.ASSIGN,
+            '>': TokenType.GREATER,
+            '<': TokenType.LESS,
+            ';': TokenType.SEMICOLON,
+            ',': TokenType.COMMA,
+            ':': TokenType.COLON,
+            '(': TokenType.LPAREN,
+            ')': TokenType.RPAREN,
+            '{': TokenType.LBRACE,
+            '}': TokenType.RBRACE,
+            '[': TokenType.LBRACKET,
+            ']': TokenType.RBRACKET
+        }
+
+        if char in token_map:
+            self.current += 1
+            self.column += 1
+            self.tokens.append(Token(token_map[char], char, self.line, self.column - 1))
+        else:
+            # Unknown character
+            self._error(f"Unknown character '{char}'", f"Remove or replace the character '{char}'")
+
+    def _get_keyword_type(self, text: str) -> str:
+        """Get the token type for a keyword."""
+        keyword_map = {
+            # Keywords
+            'pack': TokenType.PACK,
+            'namespace': TokenType.NAMESPACE,
+            'function': TokenType.FUNCTION,
+            'var': TokenType.VAR,
+            'num': TokenType.NUM,
+            'if': TokenType.IF,
+            'else': TokenType.ELSE,
+            'while': TokenType.WHILE,
+            'on_load': TokenType.ON_LOAD,
+            'on_tick': TokenType.ON_TICK,
+            'exec': TokenType.EXEC,
+            'tag': TokenType.TAG,
+
+            # Tag types
+            'recipe': TokenType.RECIPE,
+            'loot_table': TokenType.LOOT_TABLE,
+            'advancement': TokenType.ADVANCEMENT,
+            'item_modifier': TokenType.ITEM_MODIFIER,
+            'predicate': TokenType.PREDICATE,
+            'structure': TokenType.STRUCTURE
+        }
+
+        return keyword_map.get(text.lower(), TokenType.IDENTIFIER)
+
+    def _peek(self, offset: int) -> Optional[str]:
+        """Peek ahead in the source without consuming characters."""
+        if self.current + offset < len(self.source):
+            return self.source[self.current + offset]
+        return None
+
+    def _error(self, message: str, suggestion: str):
+        """Raise a lexer error with context information."""
+        # Get the current line content for better error reporting
+        lines = self.source.split('\n')
+        line_content = ""
+        if self.line - 1 < len(lines):
+            line_content = lines[self.line - 1]
+
+        raise MDLLexerError(
+            message=message,
+            file_path=self.source_file,
+            line=self.line,
+            column=self.column,
+            line_content=line_content,
+            suggestion=suggestion
+        )
+
+    def get_token_summary(self) -> Dict[str, Any]:
+        """Get a summary of the tokenization results."""
+        token_counts = {}
+        for token in self.tokens:
+            if token.type != TokenType.EOF:
+                token_counts[token.type] = token_counts.get(token.type, 0) + 1
+
+        return {
+            'total_tokens': len(self.tokens),
+            'token_counts': token_counts,
+            'lines_processed': self.line
+        }
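
For orientation, here is a minimal sketch of how the new single-file lexer might be driven after installing 15.4.30. The import paths, class names, and method signatures come from the diff above; the MDL snippet and the expected tokens are illustrative assumptions, not verified output from the package.

    from minecraft_datapack_language.mdl_lexer import MDLLexer
    from minecraft_datapack_language.mdl_errors import MDLLexerError

    # Illustrative MDL snippet (assumed syntax): a scoped variable declaration.
    source = 'var num counter<@s> = 0;'

    lexer = MDLLexer(source_file="example.mdl")
    try:
        tokens = lexer.lex(source)
        for token in tokens:
            print(token)  # e.g. Token(VAR, 'var', line=1, col=1)
        print(lexer.get_token_summary())
    except MDLLexerError as err:
        # Lexer errors carry line/column context plus a fix suggestion.
        print(err)

Reading the lexer above, this snippet should yield VAR, NUM, and IDENTIFIER tokens, a LANGLE/IDENTIFIER/RANGLE group for the <@s> scope, then ASSIGN, NUMBER, SEMICOLON, and a trailing EOF.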