machine-dialect 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- machine_dialect/__main__.py +667 -0
- machine_dialect/agent/__init__.py +5 -0
- machine_dialect/agent/agent.py +360 -0
- machine_dialect/ast/__init__.py +95 -0
- machine_dialect/ast/ast_node.py +35 -0
- machine_dialect/ast/call_expression.py +82 -0
- machine_dialect/ast/dict_extraction.py +60 -0
- machine_dialect/ast/expressions.py +439 -0
- machine_dialect/ast/literals.py +309 -0
- machine_dialect/ast/program.py +35 -0
- machine_dialect/ast/statements.py +1433 -0
- machine_dialect/ast/tests/test_ast_string_representation.py +62 -0
- machine_dialect/ast/tests/test_boolean_literal.py +29 -0
- machine_dialect/ast/tests/test_collection_hir.py +138 -0
- machine_dialect/ast/tests/test_define_statement.py +142 -0
- machine_dialect/ast/tests/test_desugar.py +541 -0
- machine_dialect/ast/tests/test_foreach_desugar.py +245 -0
- machine_dialect/cfg/__init__.py +6 -0
- machine_dialect/cfg/config.py +156 -0
- machine_dialect/cfg/examples.py +221 -0
- machine_dialect/cfg/generate_with_ai.py +187 -0
- machine_dialect/cfg/openai_generation.py +200 -0
- machine_dialect/cfg/parser.py +94 -0
- machine_dialect/cfg/tests/__init__.py +1 -0
- machine_dialect/cfg/tests/test_cfg_parser.py +252 -0
- machine_dialect/cfg/tests/test_config.py +188 -0
- machine_dialect/cfg/tests/test_examples.py +391 -0
- machine_dialect/cfg/tests/test_generate_with_ai.py +354 -0
- machine_dialect/cfg/tests/test_openai_generation.py +256 -0
- machine_dialect/codegen/__init__.py +5 -0
- machine_dialect/codegen/bytecode_module.py +89 -0
- machine_dialect/codegen/bytecode_serializer.py +300 -0
- machine_dialect/codegen/opcodes.py +101 -0
- machine_dialect/codegen/register_codegen.py +1996 -0
- machine_dialect/codegen/symtab.py +208 -0
- machine_dialect/codegen/tests/__init__.py +1 -0
- machine_dialect/codegen/tests/test_array_operations_codegen.py +295 -0
- machine_dialect/codegen/tests/test_bytecode_serializer.py +185 -0
- machine_dialect/codegen/tests/test_register_codegen_ssa.py +324 -0
- machine_dialect/codegen/tests/test_symtab.py +418 -0
- machine_dialect/codegen/vm_serializer.py +621 -0
- machine_dialect/compiler/__init__.py +18 -0
- machine_dialect/compiler/compiler.py +197 -0
- machine_dialect/compiler/config.py +149 -0
- machine_dialect/compiler/context.py +149 -0
- machine_dialect/compiler/phases/__init__.py +19 -0
- machine_dialect/compiler/phases/bytecode_optimization.py +90 -0
- machine_dialect/compiler/phases/codegen.py +40 -0
- machine_dialect/compiler/phases/hir_generation.py +39 -0
- machine_dialect/compiler/phases/mir_generation.py +86 -0
- machine_dialect/compiler/phases/optimization.py +110 -0
- machine_dialect/compiler/phases/parsing.py +39 -0
- machine_dialect/compiler/pipeline.py +143 -0
- machine_dialect/compiler/tests/__init__.py +1 -0
- machine_dialect/compiler/tests/test_compiler.py +568 -0
- machine_dialect/compiler/vm_runner.py +173 -0
- machine_dialect/errors/__init__.py +32 -0
- machine_dialect/errors/exceptions.py +369 -0
- machine_dialect/errors/messages.py +82 -0
- machine_dialect/errors/tests/__init__.py +0 -0
- machine_dialect/errors/tests/test_expected_token_errors.py +188 -0
- machine_dialect/errors/tests/test_name_errors.py +118 -0
- machine_dialect/helpers/__init__.py +0 -0
- machine_dialect/helpers/stopwords.py +225 -0
- machine_dialect/helpers/validators.py +30 -0
- machine_dialect/lexer/__init__.py +9 -0
- machine_dialect/lexer/constants.py +23 -0
- machine_dialect/lexer/lexer.py +907 -0
- machine_dialect/lexer/tests/__init__.py +0 -0
- machine_dialect/lexer/tests/helpers.py +86 -0
- machine_dialect/lexer/tests/test_apostrophe_identifiers.py +122 -0
- machine_dialect/lexer/tests/test_backtick_identifiers.py +140 -0
- machine_dialect/lexer/tests/test_boolean_literals.py +108 -0
- machine_dialect/lexer/tests/test_case_insensitive_keywords.py +188 -0
- machine_dialect/lexer/tests/test_comments.py +200 -0
- machine_dialect/lexer/tests/test_double_asterisk_keywords.py +127 -0
- machine_dialect/lexer/tests/test_lexer_position.py +113 -0
- machine_dialect/lexer/tests/test_list_tokens.py +282 -0
- machine_dialect/lexer/tests/test_stopwords.py +80 -0
- machine_dialect/lexer/tests/test_strict_equality.py +129 -0
- machine_dialect/lexer/tests/test_token.py +41 -0
- machine_dialect/lexer/tests/test_tokenization.py +294 -0
- machine_dialect/lexer/tests/test_underscore_literals.py +343 -0
- machine_dialect/lexer/tests/test_url_literals.py +169 -0
- machine_dialect/lexer/tokens.py +487 -0
- machine_dialect/linter/__init__.py +10 -0
- machine_dialect/linter/__main__.py +144 -0
- machine_dialect/linter/linter.py +154 -0
- machine_dialect/linter/rules/__init__.py +8 -0
- machine_dialect/linter/rules/base.py +112 -0
- machine_dialect/linter/rules/statement_termination.py +99 -0
- machine_dialect/linter/tests/__init__.py +1 -0
- machine_dialect/linter/tests/mdrules/__init__.py +0 -0
- machine_dialect/linter/tests/mdrules/test_md101_statement_termination.py +181 -0
- machine_dialect/linter/tests/test_linter.py +81 -0
- machine_dialect/linter/tests/test_rules.py +110 -0
- machine_dialect/linter/tests/test_violations.py +71 -0
- machine_dialect/linter/violations.py +51 -0
- machine_dialect/mir/__init__.py +69 -0
- machine_dialect/mir/analyses/__init__.py +20 -0
- machine_dialect/mir/analyses/alias_analysis.py +315 -0
- machine_dialect/mir/analyses/dominance_analysis.py +49 -0
- machine_dialect/mir/analyses/escape_analysis.py +286 -0
- machine_dialect/mir/analyses/loop_analysis.py +272 -0
- machine_dialect/mir/analyses/tests/test_type_analysis.py +736 -0
- machine_dialect/mir/analyses/type_analysis.py +448 -0
- machine_dialect/mir/analyses/use_def_chains.py +232 -0
- machine_dialect/mir/basic_block.py +385 -0
- machine_dialect/mir/dataflow.py +445 -0
- machine_dialect/mir/debug_info.py +208 -0
- machine_dialect/mir/hir_to_mir.py +1738 -0
- machine_dialect/mir/mir_dumper.py +366 -0
- machine_dialect/mir/mir_function.py +167 -0
- machine_dialect/mir/mir_instructions.py +1877 -0
- machine_dialect/mir/mir_interpreter.py +556 -0
- machine_dialect/mir/mir_module.py +225 -0
- machine_dialect/mir/mir_printer.py +480 -0
- machine_dialect/mir/mir_transformer.py +410 -0
- machine_dialect/mir/mir_types.py +367 -0
- machine_dialect/mir/mir_validation.py +455 -0
- machine_dialect/mir/mir_values.py +268 -0
- machine_dialect/mir/optimization_config.py +233 -0
- machine_dialect/mir/optimization_pass.py +251 -0
- machine_dialect/mir/optimization_pipeline.py +355 -0
- machine_dialect/mir/optimizations/__init__.py +84 -0
- machine_dialect/mir/optimizations/algebraic_simplification.py +733 -0
- machine_dialect/mir/optimizations/branch_prediction.py +372 -0
- machine_dialect/mir/optimizations/constant_propagation.py +634 -0
- machine_dialect/mir/optimizations/cse.py +398 -0
- machine_dialect/mir/optimizations/dce.py +288 -0
- machine_dialect/mir/optimizations/inlining.py +551 -0
- machine_dialect/mir/optimizations/jump_threading.py +487 -0
- machine_dialect/mir/optimizations/licm.py +405 -0
- machine_dialect/mir/optimizations/loop_unrolling.py +366 -0
- machine_dialect/mir/optimizations/strength_reduction.py +422 -0
- machine_dialect/mir/optimizations/tail_call.py +207 -0
- machine_dialect/mir/optimizations/tests/test_loop_unrolling.py +483 -0
- machine_dialect/mir/optimizations/type_narrowing.py +397 -0
- machine_dialect/mir/optimizations/type_specialization.py +447 -0
- machine_dialect/mir/optimizations/type_specific.py +906 -0
- machine_dialect/mir/optimize_mir.py +89 -0
- machine_dialect/mir/pass_manager.py +391 -0
- machine_dialect/mir/profiling/__init__.py +26 -0
- machine_dialect/mir/profiling/profile_collector.py +318 -0
- machine_dialect/mir/profiling/profile_data.py +372 -0
- machine_dialect/mir/profiling/profile_reader.py +272 -0
- machine_dialect/mir/profiling/profile_writer.py +226 -0
- machine_dialect/mir/register_allocation.py +302 -0
- machine_dialect/mir/reporting/__init__.py +17 -0
- machine_dialect/mir/reporting/optimization_reporter.py +314 -0
- machine_dialect/mir/reporting/report_formatter.py +289 -0
- machine_dialect/mir/ssa_construction.py +342 -0
- machine_dialect/mir/tests/__init__.py +1 -0
- machine_dialect/mir/tests/test_algebraic_associativity.py +204 -0
- machine_dialect/mir/tests/test_algebraic_complex_patterns.py +221 -0
- machine_dialect/mir/tests/test_algebraic_division.py +126 -0
- machine_dialect/mir/tests/test_algebraic_simplification.py +863 -0
- machine_dialect/mir/tests/test_basic_block.py +425 -0
- machine_dialect/mir/tests/test_branch_prediction.py +459 -0
- machine_dialect/mir/tests/test_call_lowering.py +168 -0
- machine_dialect/mir/tests/test_collection_lowering.py +604 -0
- machine_dialect/mir/tests/test_cross_block_constant_propagation.py +255 -0
- machine_dialect/mir/tests/test_custom_passes.py +166 -0
- machine_dialect/mir/tests/test_debug_info.py +285 -0
- machine_dialect/mir/tests/test_dict_extraction_lowering.py +192 -0
- machine_dialect/mir/tests/test_dictionary_lowering.py +299 -0
- machine_dialect/mir/tests/test_double_negation.py +231 -0
- machine_dialect/mir/tests/test_escape_analysis.py +233 -0
- machine_dialect/mir/tests/test_hir_to_mir.py +465 -0
- machine_dialect/mir/tests/test_hir_to_mir_complete.py +389 -0
- machine_dialect/mir/tests/test_hir_to_mir_simple.py +130 -0
- machine_dialect/mir/tests/test_inlining.py +435 -0
- machine_dialect/mir/tests/test_licm.py +472 -0
- machine_dialect/mir/tests/test_mir_dumper.py +313 -0
- machine_dialect/mir/tests/test_mir_instructions.py +445 -0
- machine_dialect/mir/tests/test_mir_module.py +860 -0
- machine_dialect/mir/tests/test_mir_printer.py +387 -0
- machine_dialect/mir/tests/test_mir_types.py +123 -0
- machine_dialect/mir/tests/test_mir_types_enhanced.py +132 -0
- machine_dialect/mir/tests/test_mir_validation.py +378 -0
- machine_dialect/mir/tests/test_mir_values.py +168 -0
- machine_dialect/mir/tests/test_one_based_indexing.py +202 -0
- machine_dialect/mir/tests/test_optimization_helpers.py +60 -0
- machine_dialect/mir/tests/test_optimization_pipeline.py +554 -0
- machine_dialect/mir/tests/test_optimization_reporter.py +318 -0
- machine_dialect/mir/tests/test_pass_manager.py +294 -0
- machine_dialect/mir/tests/test_pass_registration.py +64 -0
- machine_dialect/mir/tests/test_profiling.py +356 -0
- machine_dialect/mir/tests/test_register_allocation.py +307 -0
- machine_dialect/mir/tests/test_report_formatters.py +372 -0
- machine_dialect/mir/tests/test_ssa_construction.py +433 -0
- machine_dialect/mir/tests/test_tail_call.py +236 -0
- machine_dialect/mir/tests/test_type_annotated_instructions.py +192 -0
- machine_dialect/mir/tests/test_type_narrowing.py +277 -0
- machine_dialect/mir/tests/test_type_specialization.py +421 -0
- machine_dialect/mir/tests/test_type_specific_optimization.py +545 -0
- machine_dialect/mir/tests/test_type_specific_optimization_advanced.py +382 -0
- machine_dialect/mir/type_inference.py +368 -0
- machine_dialect/parser/__init__.py +12 -0
- machine_dialect/parser/enums.py +45 -0
- machine_dialect/parser/parser.py +3655 -0
- machine_dialect/parser/protocols.py +11 -0
- machine_dialect/parser/symbol_table.py +169 -0
- machine_dialect/parser/tests/__init__.py +0 -0
- machine_dialect/parser/tests/helper_functions.py +193 -0
- machine_dialect/parser/tests/test_action_statements.py +334 -0
- machine_dialect/parser/tests/test_boolean_literal_expressions.py +152 -0
- machine_dialect/parser/tests/test_call_statements.py +154 -0
- machine_dialect/parser/tests/test_call_statements_errors.py +187 -0
- machine_dialect/parser/tests/test_collection_mutations.py +264 -0
- machine_dialect/parser/tests/test_conditional_expressions.py +343 -0
- machine_dialect/parser/tests/test_define_integration.py +468 -0
- machine_dialect/parser/tests/test_define_statements.py +311 -0
- machine_dialect/parser/tests/test_dict_extraction.py +115 -0
- machine_dialect/parser/tests/test_empty_literal.py +155 -0
- machine_dialect/parser/tests/test_float_literal_expressions.py +163 -0
- machine_dialect/parser/tests/test_identifier_expressions.py +57 -0
- machine_dialect/parser/tests/test_if_empty_block.py +61 -0
- machine_dialect/parser/tests/test_if_statements.py +299 -0
- machine_dialect/parser/tests/test_illegal_tokens.py +86 -0
- machine_dialect/parser/tests/test_infix_expressions.py +680 -0
- machine_dialect/parser/tests/test_integer_literal_expressions.py +137 -0
- machine_dialect/parser/tests/test_interaction_statements.py +269 -0
- machine_dialect/parser/tests/test_list_literals.py +277 -0
- machine_dialect/parser/tests/test_no_none_in_ast.py +94 -0
- machine_dialect/parser/tests/test_panic_mode_recovery.py +171 -0
- machine_dialect/parser/tests/test_parse_errors.py +114 -0
- machine_dialect/parser/tests/test_possessive_syntax.py +182 -0
- machine_dialect/parser/tests/test_prefix_expressions.py +415 -0
- machine_dialect/parser/tests/test_program.py +13 -0
- machine_dialect/parser/tests/test_return_statements.py +89 -0
- machine_dialect/parser/tests/test_set_statements.py +152 -0
- machine_dialect/parser/tests/test_strict_equality.py +258 -0
- machine_dialect/parser/tests/test_symbol_table.py +217 -0
- machine_dialect/parser/tests/test_url_literal_expressions.py +209 -0
- machine_dialect/parser/tests/test_utility_statements.py +423 -0
- machine_dialect/parser/token_buffer.py +159 -0
- machine_dialect/repl/__init__.py +3 -0
- machine_dialect/repl/repl.py +426 -0
- machine_dialect/repl/tests/__init__.py +0 -0
- machine_dialect/repl/tests/test_repl.py +606 -0
- machine_dialect/semantic/__init__.py +12 -0
- machine_dialect/semantic/analyzer.py +906 -0
- machine_dialect/semantic/error_messages.py +189 -0
- machine_dialect/semantic/tests/__init__.py +1 -0
- machine_dialect/semantic/tests/test_analyzer.py +364 -0
- machine_dialect/semantic/tests/test_error_messages.py +104 -0
- machine_dialect/tests/edge_cases/__init__.py +10 -0
- machine_dialect/tests/edge_cases/test_boundary_access.py +256 -0
- machine_dialect/tests/edge_cases/test_empty_collections.py +166 -0
- machine_dialect/tests/edge_cases/test_invalid_operations.py +243 -0
- machine_dialect/tests/edge_cases/test_named_list_edge_cases.py +295 -0
- machine_dialect/tests/edge_cases/test_nested_structures.py +313 -0
- machine_dialect/tests/edge_cases/test_type_mixing.py +277 -0
- machine_dialect/tests/integration/test_array_operations_emulation.py +248 -0
- machine_dialect/tests/integration/test_list_compilation.py +395 -0
- machine_dialect/tests/integration/test_lists_and_dictionaries.py +322 -0
- machine_dialect/type_checking/__init__.py +21 -0
- machine_dialect/type_checking/tests/__init__.py +1 -0
- machine_dialect/type_checking/tests/test_type_system.py +230 -0
- machine_dialect/type_checking/type_system.py +270 -0
- machine_dialect-0.1.0a1.dist-info/METADATA +128 -0
- machine_dialect-0.1.0a1.dist-info/RECORD +268 -0
- machine_dialect-0.1.0a1.dist-info/WHEEL +5 -0
- machine_dialect-0.1.0a1.dist-info/entry_points.txt +3 -0
- machine_dialect-0.1.0a1.dist-info/licenses/LICENSE +201 -0
- machine_dialect-0.1.0a1.dist-info/top_level.txt +2 -0
- machine_dialect_vm/__init__.pyi +15 -0
@@ -0,0 +1,907 @@
|
|
1
|
+
"""Streaming lexer implementation for Machine Dialect™.
|
2
|
+
|
3
|
+
This module provides a Lexer class that generates tokens one at a time
|
4
|
+
instead of all at once, enabling memory-efficient parsing of large files.
|
5
|
+
"""
|
6
|
+
|
7
|
+
from machine_dialect.helpers.validators import is_valid_url
|
8
|
+
from machine_dialect.lexer.constants import CHAR_TO_TOKEN_MAP
|
9
|
+
from machine_dialect.lexer.tokens import Token, TokenType, lookup_tag_token, lookup_token_type
|
10
|
+
|
11
|
+
|
12
|
+
class Lexer:
|
13
|
+
"""Streaming lexer for Machine Dialect™ language.
|
14
|
+
|
15
|
+
Generates tokens one at a time from the source code.
|
16
|
+
"""
|
17
|
+
|
18
|
+
def __init__(self, source: str) -> None:
|
19
|
+
"""Initialize the lexer with source code.
|
20
|
+
|
21
|
+
Args:
|
22
|
+
source: The source code to tokenize.
|
23
|
+
"""
|
24
|
+
self.source = source
|
25
|
+
self.position = 0
|
26
|
+
self.line = 1
|
27
|
+
self.column = 1
|
28
|
+
self.current_char: str | None = self.source[0] if source else None
|
29
|
+
self.in_summary_comment = False
|
30
|
+
|
31
|
+
@property
|
32
|
+
def at_line_start(self) -> bool:
|
33
|
+
"""Check if we're at the start of a logical line.
|
34
|
+
|
35
|
+
A logical line start means we're at column 1 or only have whitespace
|
36
|
+
and block markers (>) before current position on this line.
|
37
|
+
|
38
|
+
Returns:
|
39
|
+
True if we're at the start of a logical line.
|
40
|
+
"""
|
41
|
+
if self.column == 1:
|
42
|
+
return True
|
43
|
+
|
44
|
+
# Check if we only have whitespace or block markers before current position on this line
|
45
|
+
# Find the start of the current line
|
46
|
+
line_start = self.position - (self.column - 1)
|
47
|
+
for i in range(line_start, self.position):
|
48
|
+
if i < len(self.source):
|
49
|
+
char = self.source[i]
|
50
|
+
if not char.isspace() and char != ">":
|
51
|
+
return False
|
52
|
+
return True
|
53
|
+
|
54
|
+
def advance(self) -> None:
|
55
|
+
"""Move to the next character in the source."""
|
56
|
+
if self.current_char == "\n":
|
57
|
+
self.line += 1
|
58
|
+
self.column = 1
|
59
|
+
else:
|
60
|
+
self.column += 1
|
61
|
+
|
62
|
+
self.position += 1
|
63
|
+
if self.position >= len(self.source):
|
64
|
+
self.current_char = None
|
65
|
+
else:
|
66
|
+
self.current_char = self.source[self.position]
|
67
|
+
|
68
|
+
def _restore_position(self, pos: int) -> None:
|
69
|
+
"""Restore position and recalculate column.
|
70
|
+
|
71
|
+
Args:
|
72
|
+
pos: The position to restore to.
|
73
|
+
"""
|
74
|
+
self.position = pos
|
75
|
+
self.current_char = self.source[pos] if pos < len(self.source) else None
|
76
|
+
|
77
|
+
# Recalculate column by counting from start of current line
|
78
|
+
line_start = pos
|
79
|
+
while line_start > 0 and self.source[line_start - 1] != "\n":
|
80
|
+
line_start -= 1
|
81
|
+
self.column = pos - line_start + 1
|
82
|
+
|
83
|
+
def peek(self, offset: int = 1) -> str | None:
|
84
|
+
"""Look ahead at a character without consuming it.
|
85
|
+
|
86
|
+
Args:
|
87
|
+
offset: How many characters ahead to look.
|
88
|
+
|
89
|
+
Returns:
|
90
|
+
The character at the offset, or None if out of bounds.
|
91
|
+
"""
|
92
|
+
peek_pos = self.position + offset
|
93
|
+
if peek_pos >= len(self.source):
|
94
|
+
return None
|
95
|
+
return self.source[peek_pos]
|
96
|
+
|
97
|
+
def skip_whitespace(self) -> None:
|
98
|
+
"""Skip whitespace characters."""
|
99
|
+
while self.current_char and self.current_char.isspace():
|
100
|
+
self.advance()
|
101
|
+
|
102
|
+
def read_number(self) -> tuple[str, bool, int, int]:
|
103
|
+
"""Read a number literal.
|
104
|
+
|
105
|
+
Returns:
|
106
|
+
Tuple of (literal, is_float, line, column).
|
107
|
+
"""
|
108
|
+
start_pos = self.position
|
109
|
+
start_line = self.line
|
110
|
+
start_column = self.column
|
111
|
+
has_dot = False
|
112
|
+
|
113
|
+
while self.current_char and (self.current_char.isdigit() or self.current_char == "."):
|
114
|
+
if self.current_char == ".":
|
115
|
+
# Only allow one decimal point
|
116
|
+
if has_dot:
|
117
|
+
break
|
118
|
+
# Check if next character is a digit
|
119
|
+
next_char = self.peek()
|
120
|
+
if not next_char or not next_char.isdigit():
|
121
|
+
break
|
122
|
+
has_dot = True
|
123
|
+
self.advance()
|
124
|
+
|
125
|
+
return self.source[start_pos : self.position], has_dot, start_line, start_column
|
126
|
+
|
127
|
+
def read_identifier(self) -> tuple[str, int, int]:
|
128
|
+
"""Read an identifier.
|
129
|
+
|
130
|
+
Returns:
|
131
|
+
Tuple of (identifier, line, column).
|
132
|
+
"""
|
133
|
+
start_pos = self.position
|
134
|
+
start_line = self.line
|
135
|
+
start_column = self.column
|
136
|
+
while self.current_char and (self.current_char.isalnum() or self.current_char == "_"):
|
137
|
+
self.advance()
|
138
|
+
|
139
|
+
# Check for contractions like 't or 's
|
140
|
+
peek_char = self.peek()
|
141
|
+
if self.current_char == "'" and peek_char and peek_char.isalpha():
|
142
|
+
self.advance() # Skip apostrophe
|
143
|
+
while self.current_char and self.current_char.isalpha():
|
144
|
+
self.advance()
|
145
|
+
|
146
|
+
return self.source[start_pos : self.position], start_line, start_column
|
147
|
+
|
148
|
+
def read_string(self) -> tuple[str, int, int]:
|
149
|
+
"""Read a string literal.
|
150
|
+
|
151
|
+
Returns:
|
152
|
+
Tuple of (string_literal, line, column).
|
153
|
+
"""
|
154
|
+
start_pos = self.position
|
155
|
+
start_line = self.line
|
156
|
+
start_column = self.column
|
157
|
+
quote_char = self.current_char
|
158
|
+
self.advance() # Skip opening quote
|
159
|
+
|
160
|
+
while self.current_char and self.current_char != quote_char:
|
161
|
+
if self.current_char == "\\":
|
162
|
+
self.advance() # Skip escape character
|
163
|
+
if self.current_char:
|
164
|
+
self.advance() # Skip escaped character
|
165
|
+
else:
|
166
|
+
self.advance()
|
167
|
+
|
168
|
+
if self.current_char == quote_char:
|
169
|
+
self.advance() # Skip closing quote
|
170
|
+
|
171
|
+
return self.source[start_pos : self.position], start_line, start_column
|
172
|
+
|
173
|
+
def read_triple_backtick_string(self) -> tuple[str, int, int]:
|
174
|
+
"""Read a triple backtick string.
|
175
|
+
|
176
|
+
Returns:
|
177
|
+
Tuple of (string_content, line, column).
|
178
|
+
"""
|
179
|
+
start_line = self.line
|
180
|
+
start_column = self.column
|
181
|
+
|
182
|
+
# Skip the three backticks
|
183
|
+
self.advance() # First backtick
|
184
|
+
self.advance() # Second backtick
|
185
|
+
self.advance() # Third backtick
|
186
|
+
|
187
|
+
# Read until we find three closing backticks
|
188
|
+
content_start = self.position
|
189
|
+
while self.current_char:
|
190
|
+
if self.current_char == "`" and self.peek() == "`" and self.peek(2) == "`":
|
191
|
+
content = self.source[content_start : self.position]
|
192
|
+
# Skip the closing backticks
|
193
|
+
self.advance()
|
194
|
+
self.advance()
|
195
|
+
self.advance()
|
196
|
+
return content, start_line, start_column
|
197
|
+
self.advance()
|
198
|
+
|
199
|
+
# Unclosed triple backtick string
|
200
|
+
content = self.source[content_start : self.position]
|
201
|
+
return content, start_line, start_column
|
202
|
+
|
203
|
+
    def check_multi_word_keyword(self, first_word: str, line: int, pos: int) -> tuple[str | None, int]:
        """Try to extend *first_word* into the longest multi-word keyword.

        Starting from the cursor (which sits just after *first_word*), this
        repeatedly skips whitespace and reads further words, asking
        `lookup_token_type` whether the accumulated phrase maps to a real
        keyword token. The longest phrase that does wins; on success the
        cursor is left right after that phrase, otherwise it is fully
        restored to where it was on entry.

        Args:
            first_word: The word the caller already consumed.
            line: Line number of the first word.
                NOTE(review): currently unused in the body — confirm intent.
            pos: Column position of the first word.
                NOTE(review): currently unused as well.

        Returns:
            Tuple of (multi_word_keyword, end_position) if found,
            otherwise (None, restored_position).
        """
        # Snapshot the full cursor state so we can rewind on failure.
        saved_position = self.position
        saved_line = self.line
        saved_column = self.column
        saved_char = self.current_char

        words = [first_word]
        longest_match = None
        # Cursor state corresponding to the best (longest) match seen so far.
        longest_match_position = self.position
        longest_match_line = self.line
        longest_match_column = self.column
        longest_match_char = self.current_char

        # Greedily read word after word, remembering every prefix that forms
        # a valid keyword; the last one recorded is the longest.
        while True:
            # Skip whitespace between words.
            start_whitespace = self.position
            self.skip_whitespace()

            # If no whitespace was skipped, the words are not separated, so a
            # multi-word keyword is impossible from here.
            if self.position == start_whitespace:
                break

            # The next token must start with a letter to be a keyword word.
            if not self.current_char or not self.current_char.isalpha():
                break

            next_word, _, _ = self.read_identifier()
            if not next_word:
                break

            words.append(next_word)
            potential_keyword = " ".join(words)

            # Ask the token table whether the phrase is a genuine keyword.
            token_type, _ = lookup_token_type(potential_keyword)
            # Only accept actual keywords, not just any valid identifier.
            if token_type not in (TokenType.MISC_ILLEGAL, TokenType.MISC_IDENT, TokenType.MISC_STOPWORD):
                # Found a valid multi-word keyword; record it and the cursor
                # position immediately after it.
                longest_match = potential_keyword
                longest_match_position = self.position
                longest_match_line = self.line
                longest_match_column = self.column
                longest_match_char = self.current_char

        if longest_match:
            # Leave the cursor right after the longest matching phrase.
            self.position = longest_match_position
            self.line = longest_match_line
            self.column = longest_match_column
            self.current_char = longest_match_char
            return longest_match, self.position
        else:
            # No multi-word keyword found; rewind to the entry state.
            self.position = saved_position
            self.line = saved_line
            self.column = saved_column
            self.current_char = saved_char
            return None, self.position
|
273
|
+
|
274
|
+
    def read_double_asterisk_keyword(self) -> tuple[str, TokenType, int, int] | None:
        """Read a **keyword** (possibly multi-word) wrapped in double asterisks.

        On success the cursor is left after the closing ``**``; on any
        failure the cursor is fully restored and None is returned.

        Returns:
            Tuple of (canonical_literal, token_type, line, column) or None
            if the text at the cursor is not a valid wrapped keyword.
        """
        start_pos = self.position
        start_line = self.line
        start_column = self.column

        # Skip first two asterisks.
        self.advance()  # First *
        self.advance()  # Second *

        # A keyword must start with a letter; anything else means this was
        # not a double-asterisk keyword at all.
        if not self.current_char or not self.current_char.isalpha():
            # Restore position.
            self.position = start_pos
            self.line = start_line
            self.column = start_column
            self.current_char = self.source[self.position] if self.position < len(self.source) else None
            return None

        # Read the keyword (can be multi-word).
        words = []

        while True:
            # Read a single alphabetic word.
            if not self.current_char or not self.current_char.isalpha():
                break

            word_start = self.position
            while self.current_char and self.current_char.isalpha():
                self.advance()
            words.append(self.source[word_start : self.position])

            # Check if there's a space and another word after it.
            if self.current_char == " ":
                # Peek ahead to see if there's another word or the closing **.
                saved_pos = self.position
                saved_line = self.line
                saved_column = self.column
                saved_char = self.current_char

                self.advance()  # Skip space

                if self.current_char == "*" and self.peek() == "*":
                    # It's the closing **, restore and break.
                    # NOTE(review): after this restore the cursor sits on the
                    # space, so the closing-** check below fails and the call
                    # returns None — "word **" with a space before the closing
                    # marker is effectively rejected; confirm this is intended.
                    self.position = saved_pos
                    self.line = saved_line
                    self.column = saved_column
                    self.current_char = saved_char
                    break
                elif self.current_char and self.current_char.isalpha():
                    # Another word follows, continue accumulating.
                    continue
                else:
                    # Not a word, restore and break.
                    self.position = saved_pos
                    self.line = saved_line
                    self.column = saved_column
                    self.current_char = saved_char
                    break
            else:
                break

        keyword = " ".join(words) if words else ""

        # Check for the closing double asterisk.
        if self.current_char == "*" and self.peek() == "*":
            self.advance()  # First closing *
            self.advance()  # Second closing *

            # Check if it's a valid keyword.
            # NOTE(review): redundant local import — lookup_token_type is
            # already imported at module level.
            from machine_dialect.lexer.tokens import lookup_token_type

            token_type, canonical = lookup_token_type(keyword)

            # Only accept actual keywords, not identifiers, stopwords, or boolean literals.
            if token_type not in (
                TokenType.MISC_ILLEGAL,
                TokenType.MISC_IDENT,
                TokenType.MISC_STOPWORD,
                TokenType.LIT_YES,
                TokenType.LIT_NO,
            ):
                return canonical, token_type, start_line, start_column

        # Not a valid double-asterisk keyword, restore position.
        self.position = start_pos
        self.line = start_line
        self.column = start_column
        self.current_char = self.source[self.position] if self.position < len(self.source) else None
        return None
|
368
|
+
|
369
|
+
def read_tag_token(self) -> tuple[str, TokenType, int, int] | None:
|
370
|
+
"""Read a tag token like <summary>, </summary>, <details>, </details>.
|
371
|
+
|
372
|
+
Returns:
|
373
|
+
Tuple of (literal, token_type, line, column) or None if not a valid tag.
|
374
|
+
"""
|
375
|
+
start_pos = self.position
|
376
|
+
start_line = self.line
|
377
|
+
start_column = self.column
|
378
|
+
|
379
|
+
# Must start with '<'
|
380
|
+
if self.current_char != "<":
|
381
|
+
return None
|
382
|
+
|
383
|
+
self.advance() # Skip '<'
|
384
|
+
|
385
|
+
# Check for closing tag
|
386
|
+
is_closing = False
|
387
|
+
if self.current_char == "/":
|
388
|
+
is_closing = True
|
389
|
+
self.advance() # Skip '/'
|
390
|
+
|
391
|
+
# Read the tag name
|
392
|
+
tag_name_start = self.position
|
393
|
+
while self.current_char and self.current_char.isalpha():
|
394
|
+
self.advance()
|
395
|
+
|
396
|
+
tag_name = self.source[tag_name_start : self.position]
|
397
|
+
|
398
|
+
# Must end with '>'
|
399
|
+
if self.current_char != ">":
|
400
|
+
# Not a valid tag, restore position
|
401
|
+
self.position = start_pos
|
402
|
+
self.line = start_line
|
403
|
+
self.column = start_column
|
404
|
+
self.current_char = self.source[self.position] if self.position < len(self.source) else None
|
405
|
+
return None
|
406
|
+
|
407
|
+
self.advance() # Skip '>'
|
408
|
+
|
409
|
+
# Construct the full tag literal
|
410
|
+
if is_closing:
|
411
|
+
tag_literal = f"</{tag_name}>"
|
412
|
+
else:
|
413
|
+
tag_literal = f"<{tag_name}>"
|
414
|
+
|
415
|
+
# Check if it's a valid tag token
|
416
|
+
token_type, canonical_literal = lookup_tag_token(tag_literal)
|
417
|
+
if token_type:
|
418
|
+
return canonical_literal, token_type, start_line, start_column
|
419
|
+
|
420
|
+
# Not a recognized tag, restore position
|
421
|
+
self.position = start_pos
|
422
|
+
self.line = start_line
|
423
|
+
self.column = start_column
|
424
|
+
self.current_char = self.source[self.position] if self.position < len(self.source) else None
|
425
|
+
return None
|
426
|
+
|
427
|
+
def read_comment_content(self) -> tuple[str, int, int]:
|
428
|
+
"""Read comment content until </summary> tag is found.
|
429
|
+
|
430
|
+
Returns:
|
431
|
+
Tuple of (comment_content, line, column).
|
432
|
+
"""
|
433
|
+
start_line = self.line
|
434
|
+
start_column = self.column
|
435
|
+
content_start = self.position
|
436
|
+
|
437
|
+
while self.current_char:
|
438
|
+
# Look for potential closing tag
|
439
|
+
if self.current_char == "<":
|
440
|
+
# Save position before checking
|
441
|
+
saved_pos = self.position
|
442
|
+
saved_line = self.line
|
443
|
+
saved_column = self.column
|
444
|
+
saved_char = self.current_char
|
445
|
+
|
446
|
+
# Check if it's </summary>
|
447
|
+
self.advance() # Skip '<'
|
448
|
+
if self.current_char == "/":
|
449
|
+
self.advance() # Skip '/'
|
450
|
+
# Check for "summary"
|
451
|
+
tag_start = self.position
|
452
|
+
while self.current_char and self.current_char.isalpha():
|
453
|
+
self.advance()
|
454
|
+
tag_name = self.source[tag_start : self.position]
|
455
|
+
|
456
|
+
if tag_name.lower() == "summary" and self.current_char == ">":
|
457
|
+
# Found closing tag, restore to before the tag
|
458
|
+
self.position = saved_pos
|
459
|
+
self.line = saved_line
|
460
|
+
self.column = saved_column
|
461
|
+
self.current_char = saved_char
|
462
|
+
# Return the content before the closing tag
|
463
|
+
content = self.source[content_start:saved_pos]
|
464
|
+
return content, start_line, start_column
|
465
|
+
|
466
|
+
# Not a closing summary tag, restore and continue
|
467
|
+
self.position = saved_pos
|
468
|
+
self.line = saved_line
|
469
|
+
self.column = saved_column
|
470
|
+
self.current_char = saved_char
|
471
|
+
|
472
|
+
self.advance()
|
473
|
+
|
474
|
+
# No closing tag found, return content up to EOF
|
475
|
+
content = self.source[content_start : self.position]
|
476
|
+
return content, start_line, start_column
|
477
|
+
|
478
|
+
def read_underscore_literal(self) -> tuple[str, TokenType, int, int] | None:
    """Read an underscore-wrapped literal.

    Handles three families of wrapped literals, with the lexer positioned
    on the opening underscore when called:

    - numbers: ``_42_``, ``_-1.5_``, ``_.5_`` (normalized to ``0.5``)
    - strings: ``_"text"_`` / ``_'text'_`` (classified as URL or text)
    - keywords: ``_true_``/``_false_``/``_yes_``/``_no_``/``_empty_``

    Returns:
        Tuple of (literal, token_type, line, column) or None if not a valid literal.

    Note:
        On most failures the lexer position is restored to the opening
        underscore. However, for some malformed inputs (extra trailing
        underscores after a number, an unclosed string) the consumed
        characters are deliberately NOT restored — the caller detects the
        advanced position and emits an illegal token for the consumed span.
    """
    # Remember the state of the opening underscore for restore paths.
    start_pos = self.position
    start_line = self.line
    start_column = self.column

    self.advance()  # Skip first underscore

    # For underscore literals, report the position after the underscore
    # NOTE(review): despite the comment above, this is the column of the
    # underscore itself, not the character after it — confirm intent.
    literal_column = start_column

    # Check what comes after the underscore
    if not self.current_char:
        # EOF immediately after the underscore: restore position
        self.position = start_pos
        self.line = start_line
        self.column = start_column
        self.current_char = self.source[self.position] if self.position < len(self.source) else None
        return None

    # Check for negative sign before number
    has_minus = False
    if self.current_char == "-":
        has_minus = True
        self.advance()  # Skip minus sign

        # Check what comes after the minus
        if not self.current_char:
            # EOF after "_-": restore position
            self.position = start_pos
            self.line = start_line
            self.column = start_column
            self.current_char = self.source[self.position] if self.position < len(self.source) else None
            return None

    # Try different literal types
    next_char = self.peek()
    if self.current_char.isdigit() or (self.current_char == "." and next_char and next_char.isdigit()):
        # Number literal
        literal, is_float, _, _ = self.read_number()

        # Normalize decimal-only floats (e.g., ".5" -> "0.5")
        if is_float and literal.startswith("."):
            literal = "0" + literal

        # Add minus sign if present
        if has_minus:
            literal = "-" + literal

        # Check for closing underscore
        if self.current_char == "_":
            self.advance()

            # Check for extra trailing underscores (invalid pattern)
            if self.current_char == "_":
                # Multiple trailing underscores - this is invalid.
                # Don't restore position, let the caller handle the illegal token
                return None

            # Return canonical form without underscores
            token_type = TokenType.LIT_FLOAT if is_float else TokenType.LIT_WHOLE_NUMBER
            return literal, token_type, start_line, literal_column
        # No closing underscore: fall through to the restore at the bottom.
    elif self.current_char in ('"', "'"):
        # String literal - but minus sign is not valid before strings
        if has_minus:
            # Restore position
            self.position = start_pos
            self.line = start_line
            self.column = start_column
            self.current_char = self.source[self.position] if self.position < len(self.source) else None
            return None
        quote_char = self.current_char
        self.advance()  # Skip opening quote

        string_content_start = self.position
        while self.current_char and self.current_char != quote_char:
            if self.current_char == "\\":
                # Skip the backslash and the escaped character as a pair.
                self.advance()
                if self.current_char:
                    self.advance()
            else:
                self.advance()

        if self.current_char == quote_char:
            self.advance()  # Skip closing quote

            # Check for closing underscore
            if self.current_char == "_":
                self.advance()
                # Get string content without quotes: position is now past
                # the closing quote AND closing underscore, hence -2.
                string_content = self.source[string_content_start : self.position - 2]
                # Return canonical form with quotes but without underscores
                full_literal = f"{quote_char}{string_content}{quote_char}"

                # Check if it's a URL
                url_to_check = string_content
                token_type = TokenType.LIT_URL if is_valid_url(url_to_check) else TokenType.LIT_TEXT
                return full_literal, token_type, start_line, literal_column
            # Closed quote but no closing underscore: fall through to restore.
        else:
            # String is unclosed - this is a malformed underscore literal.
            # Don't restore position, we've already consumed the content.
            # Return None to indicate it's invalid, but keep the consumed position
            return None
    elif self.current_char.isalpha():
        # Read alphabetic characters only (no underscores) for potential boolean literal
        # Minus sign is not valid before boolean literals
        if has_minus:
            # Restore position
            self.position = start_pos
            self.line = start_line
            self.column = start_column
            self.current_char = self.source[self.position] if self.position < len(self.source) else None
            return None
        ident_start = self.position
        while self.current_char and self.current_char.isalpha():
            self.advance()

        literal = self.source[ident_start : self.position]

        # Check for closing underscore
        if self.current_char == "_":
            # Check if it's a boolean or empty literal
            if literal.lower() in ("true", "false", "yes", "no", "empty"):
                self.advance()  # Consume the closing underscore
                # Use canonical form for the literal (without underscores)
                if literal.lower() == "empty":
                    return "empty", TokenType.KW_EMPTY, start_line, literal_column
                else:
                    # Map Yes/No to True/False
                    is_true = literal.lower() in ("true", "yes")
                    canonical_literal = "Yes" if is_true else "No"
                    token_type = TokenType.LIT_YES if is_true else TokenType.LIT_NO
                    return canonical_literal, token_type, start_line, literal_column

    # Not a valid underscore-wrapped literal, restore position
    # (This also handles the case where we have a minus sign but no valid literal follows)
    self.position = start_pos
    self.line = start_line
    self.column = start_column
    self.current_char = self.source[self.position] if self.position < len(self.source) else None
    return None
|
623
|
+
|
624
|
+
def next_token(self, in_block: bool = False, in_list_context: bool = False) -> Token:
    """Get the next token from the source.

    Dispatch order matters and is load-bearing: pending summary-comment
    content is emitted first, then tags (``<summary>`` etc.), then
    underscore-wrapped literals (and their illegal variants), then ``**``
    keywords/operator, numbers, identifiers/keywords, strings, backticked
    identifiers, list-context dashes, and finally single-character tokens.
    Anything unrecognized becomes a MISC_ILLEGAL token.

    Args:
        in_block: Whether we're currently parsing inside a block (currently unused).
        in_list_context: Whether we're in a list definition context (after Set ... to:).

    Returns:
        The next token, or an EOF token if no more tokens are available.
    """
    # If we're in a summary comment, read the comment content.
    # The flag is one-shot: it was set when the opening tag was tokenized.
    if self.in_summary_comment:
        self.in_summary_comment = False
        # Don't skip whitespace - it's part of the comment
        # If we're at EOF, don't create a comment
        if self.current_char is None:
            return Token(TokenType.MISC_EOF, "", self.line, self.column)
        content, line, pos = self.read_comment_content()
        return Token(TokenType.MISC_COMMENT, content, line, pos)

    # Skip whitespace
    self.skip_whitespace()

    # Check if we've reached the end
    if self.current_char is None:
        return Token(TokenType.MISC_EOF, "", self.line, self.column)

    # Save position for token
    token_line = self.line
    token_column = self.column

    # Check for tag tokens (<summary>, </summary>, <details>, </details>)
    if self.current_char == "<":
        tag_result = self.read_tag_token()
        if tag_result:
            literal, token_type, line, pos = tag_result
            # If we just read a summary start tag, set flag for next token
            if token_type == TokenType.TAG_SUMMARY_START:
                self.in_summary_comment = True
            return Token(token_type, literal, line, pos)
        # Not a tag: fall through so "<" can be handled as an operator below.

    # Check for underscore-wrapped literals
    if self.current_char == "_":
        start_pos = self.position
        literal_result = self.read_underscore_literal()
        if literal_result:
            literal, token_type, line, pos = literal_result
            return Token(token_type, literal, line, pos)

        # If read_underscore_literal returned None and consumed characters
        # we have an invalid pattern
        if self.position > start_pos:
            # We've consumed some characters - it's an invalid pattern
            # Continue consuming any remaining underscores
            while self.current_char == "_":
                self.advance()
            illegal_literal = self.source[start_pos : self.position]
            return Token(TokenType.MISC_ILLEGAL, illegal_literal, token_line, token_column)

        # Check if this is an incomplete underscore pattern (e.g. "_12"
        # with no closing underscore): consume it as one illegal token.
        next_char = self.peek()
        next_next_char = self.peek(2)
        if next_char and (
            next_char.isdigit() or (next_char == "." and next_next_char is not None and next_next_char.isdigit())
        ):
            # Invalid underscore pattern
            self.advance()  # Skip underscore

            # Read the number part
            if self.current_char == "." or (self.current_char and self.current_char.isdigit()):
                self.read_number()

            # Consume trailing underscores
            while self.current_char == "_":
                self.advance()

            illegal_literal = self.source[start_pos : self.position]
            return Token(TokenType.MISC_ILLEGAL, illegal_literal, token_line, token_column)

    # Check for double-asterisk wrapped keywords or operator
    if self.current_char == "*" and self.peek() == "*":
        asterisk_result = self.read_double_asterisk_keyword()
        if asterisk_result:
            literal, token_type, line, pos = asterisk_result
            return Token(token_type, literal, line, pos)
        else:
            # Not a wrapped keyword, treat as ** operator
            self.advance()  # First *
            self.advance()  # Second *
            return Token(TokenType.OP_TWO_STARS, "**", token_line, token_column)

    # Numbers (a bare "." only starts a number when a digit follows)
    next_char = self.peek()
    if self.current_char.isdigit() or (self.current_char == "." and next_char and next_char.isdigit()):
        literal, is_float, _, _ = self.read_number()

        # Check for invalid trailing underscore (e.g. "12_")
        if self.current_char == "_":
            start_pos = self.position - len(literal)
            self.advance()
            illegal_literal = self.source[start_pos : self.position]
            return Token(TokenType.MISC_ILLEGAL, illegal_literal, token_line, token_column)

        # Prepend "0" to literals starting with "."
        if literal.startswith("."):
            literal = "0" + literal

        token_type = TokenType.LIT_FLOAT if is_float else TokenType.LIT_WHOLE_NUMBER
        return Token(token_type, literal, token_line, token_column)

    # Identifiers and keywords
    if self.current_char.isalpha() or self.current_char == "_":
        # Handle multiple underscores followed by number (e.g. "__12"):
        # probe ahead without mutating lexer state first.
        if self.current_char == "_":
            underscore_count = 0
            temp_pos = self.position
            while temp_pos < len(self.source) and self.source[temp_pos] == "_":
                underscore_count += 1
                temp_pos += 1

            if temp_pos < len(self.source) and (
                self.source[temp_pos].isdigit()
                or (
                    self.source[temp_pos] == "."
                    and temp_pos + 1 < len(self.source)
                    and self.source[temp_pos + 1].isdigit()
                )
            ):
                if underscore_count > 1:
                    # Invalid pattern: consume underscores + number + trailing
                    # underscores as one illegal token.
                    start_pos = self.position
                    for _ in range(underscore_count):
                        self.advance()

                    self.read_number()

                    while self.current_char == "_":
                        self.advance()

                    illegal_literal = self.source[start_pos : self.position]
                    return Token(TokenType.MISC_ILLEGAL, illegal_literal, token_line, token_column)

        # Read identifier
        literal, _, _ = self.read_identifier()

        # Special check for "Yes/No" type keyword
        if (
            literal is not None
            and literal.lower() == "yes"
            and self.current_char == "/"
            and self.peek() is not None
            and self.peek().lower() == "n"  # type: ignore[union-attr]
            and self.peek(2) is not None
            and self.peek(2).lower() == "o"  # type: ignore[union-attr]
        ):
            # Consume "/No"
            self.advance()  # Skip '/'
            self.advance()  # Skip 'N' or 'n'
            self.advance()  # Skip 'o' or 'O'
            # Return the Yes/No keyword token
            return Token(TokenType.KW_YES_NO, "Yes/No", token_line, token_column)

        # Check for multi-word keywords
        multi_word, _ = self.check_multi_word_keyword(literal, token_line, token_column)
        if multi_word:
            token_type, canonical_literal = lookup_token_type(multi_word)
            return Token(token_type, canonical_literal, token_line, token_column)

        # Single word keyword or identifier
        token_type, canonical_literal = lookup_token_type(literal)
        return Token(token_type, canonical_literal, token_line, token_column)

    # Strings
    if self.current_char in ('"', "'"):
        literal, _, _ = self.read_string()

        # Check if it's a URL (strip the surrounding quotes first)
        url_to_check = literal[1:-1] if len(literal) > 2 else literal
        token_type = TokenType.LIT_URL if is_valid_url(url_to_check) else TokenType.LIT_TEXT
        return Token(token_type, literal, token_line, token_column)

    # Backticks
    if self.current_char == "`":
        # Check for triple backticks
        if self.peek() == "`" and self.peek(2) == "`":
            literal, _, _ = self.read_triple_backtick_string()
            return Token(TokenType.LIT_TRIPLE_BACKTICK, literal, token_line, token_column)

        # Single backtick identifier
        start_pos = self.position
        self.advance()  # Skip opening backtick

        # For backtick identifiers:
        # - If backtick is at position 1, report position 1
        # - Otherwise, report position after the backtick
        identifier_column = token_column if token_column == 1 else self.column
        identifier_start = self.position
        while self.current_char and self.current_char != "`":
            self.advance()

        identifier = self.source[identifier_start : self.position]

        if self.current_char == "`" and identifier:
            from machine_dialect.lexer.tokens import is_valid_identifier

            if is_valid_identifier(identifier):
                self.advance()  # Skip closing backtick
                token_type, canonical_literal = lookup_token_type(identifier)

                # Keywords, stopwords, and boolean literals in backticks become identifiers
                # Backticks force the content to be treated as an identifier
                from machine_dialect.lexer.tokens import TokenMetaType

                if (
                    token_type == TokenType.MISC_STOPWORD
                    or token_type.meta_type == TokenMetaType.KW
                    or token_type in (TokenType.LIT_YES, TokenType.LIT_NO)
                ):
                    token_type = TokenType.MISC_IDENT
                    canonical_literal = identifier

                if token_type != TokenType.MISC_ILLEGAL:
                    # Check if this identifier is followed by 's for possessive
                    # This allows us to handle `person`'s name patterns
                    if self.current_char == "'" and self.peek() == "s":
                        # Skip the apostrophe and 's'
                        self.advance()  # Skip '
                        self.advance()  # Skip 's'
                        # Return a special token that indicates possessive access
                        # The literal includes the identifier for context
                        return Token(TokenType.PUNCT_APOSTROPHE_S, canonical_literal, token_line, identifier_column)
                    return Token(token_type, canonical_literal, token_line, identifier_column)

        # Invalid backtick usage: rewind to the opening backtick...
        self._restore_position(start_pos)

        # ...then emit just that single backtick as illegal.
        self.advance()
        return Token(TokenType.MISC_ILLEGAL, "`", token_line, token_column)

    # Check for dash at line start in list context
    if self.current_char == "-" and self.at_line_start and in_list_context:
        # In list context, dash at line start is a list marker
        self.advance()
        return Token(TokenType.PUNCT_DASH, "-", token_line, token_column)

    # Single character tokens (operators, delimiters, punctuation)
    if self.current_char in CHAR_TO_TOKEN_MAP:
        char = self.current_char
        self.advance()

        # Check for multi-character operators
        if char == "<" and self.current_char == "=":
            self.advance()
            return Token(TokenType.OP_LTE, "<=", token_line, token_column)
        elif char == ">" and self.current_char == "=":
            self.advance()
            return Token(TokenType.OP_GTE, ">=", token_line, token_column)
        elif char == "#":
            # Check for ##, ###, or #### (a lone "#" falls through below)
            if self.current_char == "#":
                self.advance()
                if self.current_char == "#":
                    self.advance()
                    if self.current_char == "#":
                        self.advance()
                        return Token(TokenType.PUNCT_HASH_QUAD, "####", token_line, token_column)
                    return Token(TokenType.PUNCT_HASH_TRIPLE, "###", token_line, token_column)
                return Token(TokenType.PUNCT_HASH_DOUBLE, "##", token_line, token_column)
        elif char == "-":
            # Check for --- (frontmatter delimiter)
            if self.current_char == "-" and self.peek() == "-":
                self.advance()  # Second dash
                self.advance()  # Third dash
                return Token(TokenType.PUNCT_FRONTMATTER, "---", token_line, token_column)

        # Single character token
        token_type = CHAR_TO_TOKEN_MAP[char]
        return Token(token_type, char, token_line, token_column)

    # Unknown character - illegal token
    char = self.current_char
    self.advance()
    return Token(TokenType.MISC_ILLEGAL, char, token_line, token_column)
|