machine_dialect-0.1.0a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268)
  1. machine_dialect/__main__.py +667 -0
  2. machine_dialect/agent/__init__.py +5 -0
  3. machine_dialect/agent/agent.py +360 -0
  4. machine_dialect/ast/__init__.py +95 -0
  5. machine_dialect/ast/ast_node.py +35 -0
  6. machine_dialect/ast/call_expression.py +82 -0
  7. machine_dialect/ast/dict_extraction.py +60 -0
  8. machine_dialect/ast/expressions.py +439 -0
  9. machine_dialect/ast/literals.py +309 -0
  10. machine_dialect/ast/program.py +35 -0
  11. machine_dialect/ast/statements.py +1433 -0
  12. machine_dialect/ast/tests/test_ast_string_representation.py +62 -0
  13. machine_dialect/ast/tests/test_boolean_literal.py +29 -0
  14. machine_dialect/ast/tests/test_collection_hir.py +138 -0
  15. machine_dialect/ast/tests/test_define_statement.py +142 -0
  16. machine_dialect/ast/tests/test_desugar.py +541 -0
  17. machine_dialect/ast/tests/test_foreach_desugar.py +245 -0
  18. machine_dialect/cfg/__init__.py +6 -0
  19. machine_dialect/cfg/config.py +156 -0
  20. machine_dialect/cfg/examples.py +221 -0
  21. machine_dialect/cfg/generate_with_ai.py +187 -0
  22. machine_dialect/cfg/openai_generation.py +200 -0
  23. machine_dialect/cfg/parser.py +94 -0
  24. machine_dialect/cfg/tests/__init__.py +1 -0
  25. machine_dialect/cfg/tests/test_cfg_parser.py +252 -0
  26. machine_dialect/cfg/tests/test_config.py +188 -0
  27. machine_dialect/cfg/tests/test_examples.py +391 -0
  28. machine_dialect/cfg/tests/test_generate_with_ai.py +354 -0
  29. machine_dialect/cfg/tests/test_openai_generation.py +256 -0
  30. machine_dialect/codegen/__init__.py +5 -0
  31. machine_dialect/codegen/bytecode_module.py +89 -0
  32. machine_dialect/codegen/bytecode_serializer.py +300 -0
  33. machine_dialect/codegen/opcodes.py +101 -0
  34. machine_dialect/codegen/register_codegen.py +1996 -0
  35. machine_dialect/codegen/symtab.py +208 -0
  36. machine_dialect/codegen/tests/__init__.py +1 -0
  37. machine_dialect/codegen/tests/test_array_operations_codegen.py +295 -0
  38. machine_dialect/codegen/tests/test_bytecode_serializer.py +185 -0
  39. machine_dialect/codegen/tests/test_register_codegen_ssa.py +324 -0
  40. machine_dialect/codegen/tests/test_symtab.py +418 -0
  41. machine_dialect/codegen/vm_serializer.py +621 -0
  42. machine_dialect/compiler/__init__.py +18 -0
  43. machine_dialect/compiler/compiler.py +197 -0
  44. machine_dialect/compiler/config.py +149 -0
  45. machine_dialect/compiler/context.py +149 -0
  46. machine_dialect/compiler/phases/__init__.py +19 -0
  47. machine_dialect/compiler/phases/bytecode_optimization.py +90 -0
  48. machine_dialect/compiler/phases/codegen.py +40 -0
  49. machine_dialect/compiler/phases/hir_generation.py +39 -0
  50. machine_dialect/compiler/phases/mir_generation.py +86 -0
  51. machine_dialect/compiler/phases/optimization.py +110 -0
  52. machine_dialect/compiler/phases/parsing.py +39 -0
  53. machine_dialect/compiler/pipeline.py +143 -0
  54. machine_dialect/compiler/tests/__init__.py +1 -0
  55. machine_dialect/compiler/tests/test_compiler.py +568 -0
  56. machine_dialect/compiler/vm_runner.py +173 -0
  57. machine_dialect/errors/__init__.py +32 -0
  58. machine_dialect/errors/exceptions.py +369 -0
  59. machine_dialect/errors/messages.py +82 -0
  60. machine_dialect/errors/tests/__init__.py +0 -0
  61. machine_dialect/errors/tests/test_expected_token_errors.py +188 -0
  62. machine_dialect/errors/tests/test_name_errors.py +118 -0
  63. machine_dialect/helpers/__init__.py +0 -0
  64. machine_dialect/helpers/stopwords.py +225 -0
  65. machine_dialect/helpers/validators.py +30 -0
  66. machine_dialect/lexer/__init__.py +9 -0
  67. machine_dialect/lexer/constants.py +23 -0
  68. machine_dialect/lexer/lexer.py +907 -0
  69. machine_dialect/lexer/tests/__init__.py +0 -0
  70. machine_dialect/lexer/tests/helpers.py +86 -0
  71. machine_dialect/lexer/tests/test_apostrophe_identifiers.py +122 -0
  72. machine_dialect/lexer/tests/test_backtick_identifiers.py +140 -0
  73. machine_dialect/lexer/tests/test_boolean_literals.py +108 -0
  74. machine_dialect/lexer/tests/test_case_insensitive_keywords.py +188 -0
  75. machine_dialect/lexer/tests/test_comments.py +200 -0
  76. machine_dialect/lexer/tests/test_double_asterisk_keywords.py +127 -0
  77. machine_dialect/lexer/tests/test_lexer_position.py +113 -0
  78. machine_dialect/lexer/tests/test_list_tokens.py +282 -0
  79. machine_dialect/lexer/tests/test_stopwords.py +80 -0
  80. machine_dialect/lexer/tests/test_strict_equality.py +129 -0
  81. machine_dialect/lexer/tests/test_token.py +41 -0
  82. machine_dialect/lexer/tests/test_tokenization.py +294 -0
  83. machine_dialect/lexer/tests/test_underscore_literals.py +343 -0
  84. machine_dialect/lexer/tests/test_url_literals.py +169 -0
  85. machine_dialect/lexer/tokens.py +487 -0
  86. machine_dialect/linter/__init__.py +10 -0
  87. machine_dialect/linter/__main__.py +144 -0
  88. machine_dialect/linter/linter.py +154 -0
  89. machine_dialect/linter/rules/__init__.py +8 -0
  90. machine_dialect/linter/rules/base.py +112 -0
  91. machine_dialect/linter/rules/statement_termination.py +99 -0
  92. machine_dialect/linter/tests/__init__.py +1 -0
  93. machine_dialect/linter/tests/mdrules/__init__.py +0 -0
  94. machine_dialect/linter/tests/mdrules/test_md101_statement_termination.py +181 -0
  95. machine_dialect/linter/tests/test_linter.py +81 -0
  96. machine_dialect/linter/tests/test_rules.py +110 -0
  97. machine_dialect/linter/tests/test_violations.py +71 -0
  98. machine_dialect/linter/violations.py +51 -0
  99. machine_dialect/mir/__init__.py +69 -0
  100. machine_dialect/mir/analyses/__init__.py +20 -0
  101. machine_dialect/mir/analyses/alias_analysis.py +315 -0
  102. machine_dialect/mir/analyses/dominance_analysis.py +49 -0
  103. machine_dialect/mir/analyses/escape_analysis.py +286 -0
  104. machine_dialect/mir/analyses/loop_analysis.py +272 -0
  105. machine_dialect/mir/analyses/tests/test_type_analysis.py +736 -0
  106. machine_dialect/mir/analyses/type_analysis.py +448 -0
  107. machine_dialect/mir/analyses/use_def_chains.py +232 -0
  108. machine_dialect/mir/basic_block.py +385 -0
  109. machine_dialect/mir/dataflow.py +445 -0
  110. machine_dialect/mir/debug_info.py +208 -0
  111. machine_dialect/mir/hir_to_mir.py +1738 -0
  112. machine_dialect/mir/mir_dumper.py +366 -0
  113. machine_dialect/mir/mir_function.py +167 -0
  114. machine_dialect/mir/mir_instructions.py +1877 -0
  115. machine_dialect/mir/mir_interpreter.py +556 -0
  116. machine_dialect/mir/mir_module.py +225 -0
  117. machine_dialect/mir/mir_printer.py +480 -0
  118. machine_dialect/mir/mir_transformer.py +410 -0
  119. machine_dialect/mir/mir_types.py +367 -0
  120. machine_dialect/mir/mir_validation.py +455 -0
  121. machine_dialect/mir/mir_values.py +268 -0
  122. machine_dialect/mir/optimization_config.py +233 -0
  123. machine_dialect/mir/optimization_pass.py +251 -0
  124. machine_dialect/mir/optimization_pipeline.py +355 -0
  125. machine_dialect/mir/optimizations/__init__.py +84 -0
  126. machine_dialect/mir/optimizations/algebraic_simplification.py +733 -0
  127. machine_dialect/mir/optimizations/branch_prediction.py +372 -0
  128. machine_dialect/mir/optimizations/constant_propagation.py +634 -0
  129. machine_dialect/mir/optimizations/cse.py +398 -0
  130. machine_dialect/mir/optimizations/dce.py +288 -0
  131. machine_dialect/mir/optimizations/inlining.py +551 -0
  132. machine_dialect/mir/optimizations/jump_threading.py +487 -0
  133. machine_dialect/mir/optimizations/licm.py +405 -0
  134. machine_dialect/mir/optimizations/loop_unrolling.py +366 -0
  135. machine_dialect/mir/optimizations/strength_reduction.py +422 -0
  136. machine_dialect/mir/optimizations/tail_call.py +207 -0
  137. machine_dialect/mir/optimizations/tests/test_loop_unrolling.py +483 -0
  138. machine_dialect/mir/optimizations/type_narrowing.py +397 -0
  139. machine_dialect/mir/optimizations/type_specialization.py +447 -0
  140. machine_dialect/mir/optimizations/type_specific.py +906 -0
  141. machine_dialect/mir/optimize_mir.py +89 -0
  142. machine_dialect/mir/pass_manager.py +391 -0
  143. machine_dialect/mir/profiling/__init__.py +26 -0
  144. machine_dialect/mir/profiling/profile_collector.py +318 -0
  145. machine_dialect/mir/profiling/profile_data.py +372 -0
  146. machine_dialect/mir/profiling/profile_reader.py +272 -0
  147. machine_dialect/mir/profiling/profile_writer.py +226 -0
  148. machine_dialect/mir/register_allocation.py +302 -0
  149. machine_dialect/mir/reporting/__init__.py +17 -0
  150. machine_dialect/mir/reporting/optimization_reporter.py +314 -0
  151. machine_dialect/mir/reporting/report_formatter.py +289 -0
  152. machine_dialect/mir/ssa_construction.py +342 -0
  153. machine_dialect/mir/tests/__init__.py +1 -0
  154. machine_dialect/mir/tests/test_algebraic_associativity.py +204 -0
  155. machine_dialect/mir/tests/test_algebraic_complex_patterns.py +221 -0
  156. machine_dialect/mir/tests/test_algebraic_division.py +126 -0
  157. machine_dialect/mir/tests/test_algebraic_simplification.py +863 -0
  158. machine_dialect/mir/tests/test_basic_block.py +425 -0
  159. machine_dialect/mir/tests/test_branch_prediction.py +459 -0
  160. machine_dialect/mir/tests/test_call_lowering.py +168 -0
  161. machine_dialect/mir/tests/test_collection_lowering.py +604 -0
  162. machine_dialect/mir/tests/test_cross_block_constant_propagation.py +255 -0
  163. machine_dialect/mir/tests/test_custom_passes.py +166 -0
  164. machine_dialect/mir/tests/test_debug_info.py +285 -0
  165. machine_dialect/mir/tests/test_dict_extraction_lowering.py +192 -0
  166. machine_dialect/mir/tests/test_dictionary_lowering.py +299 -0
  167. machine_dialect/mir/tests/test_double_negation.py +231 -0
  168. machine_dialect/mir/tests/test_escape_analysis.py +233 -0
  169. machine_dialect/mir/tests/test_hir_to_mir.py +465 -0
  170. machine_dialect/mir/tests/test_hir_to_mir_complete.py +389 -0
  171. machine_dialect/mir/tests/test_hir_to_mir_simple.py +130 -0
  172. machine_dialect/mir/tests/test_inlining.py +435 -0
  173. machine_dialect/mir/tests/test_licm.py +472 -0
  174. machine_dialect/mir/tests/test_mir_dumper.py +313 -0
  175. machine_dialect/mir/tests/test_mir_instructions.py +445 -0
  176. machine_dialect/mir/tests/test_mir_module.py +860 -0
  177. machine_dialect/mir/tests/test_mir_printer.py +387 -0
  178. machine_dialect/mir/tests/test_mir_types.py +123 -0
  179. machine_dialect/mir/tests/test_mir_types_enhanced.py +132 -0
  180. machine_dialect/mir/tests/test_mir_validation.py +378 -0
  181. machine_dialect/mir/tests/test_mir_values.py +168 -0
  182. machine_dialect/mir/tests/test_one_based_indexing.py +202 -0
  183. machine_dialect/mir/tests/test_optimization_helpers.py +60 -0
  184. machine_dialect/mir/tests/test_optimization_pipeline.py +554 -0
  185. machine_dialect/mir/tests/test_optimization_reporter.py +318 -0
  186. machine_dialect/mir/tests/test_pass_manager.py +294 -0
  187. machine_dialect/mir/tests/test_pass_registration.py +64 -0
  188. machine_dialect/mir/tests/test_profiling.py +356 -0
  189. machine_dialect/mir/tests/test_register_allocation.py +307 -0
  190. machine_dialect/mir/tests/test_report_formatters.py +372 -0
  191. machine_dialect/mir/tests/test_ssa_construction.py +433 -0
  192. machine_dialect/mir/tests/test_tail_call.py +236 -0
  193. machine_dialect/mir/tests/test_type_annotated_instructions.py +192 -0
  194. machine_dialect/mir/tests/test_type_narrowing.py +277 -0
  195. machine_dialect/mir/tests/test_type_specialization.py +421 -0
  196. machine_dialect/mir/tests/test_type_specific_optimization.py +545 -0
  197. machine_dialect/mir/tests/test_type_specific_optimization_advanced.py +382 -0
  198. machine_dialect/mir/type_inference.py +368 -0
  199. machine_dialect/parser/__init__.py +12 -0
  200. machine_dialect/parser/enums.py +45 -0
  201. machine_dialect/parser/parser.py +3655 -0
  202. machine_dialect/parser/protocols.py +11 -0
  203. machine_dialect/parser/symbol_table.py +169 -0
  204. machine_dialect/parser/tests/__init__.py +0 -0
  205. machine_dialect/parser/tests/helper_functions.py +193 -0
  206. machine_dialect/parser/tests/test_action_statements.py +334 -0
  207. machine_dialect/parser/tests/test_boolean_literal_expressions.py +152 -0
  208. machine_dialect/parser/tests/test_call_statements.py +154 -0
  209. machine_dialect/parser/tests/test_call_statements_errors.py +187 -0
  210. machine_dialect/parser/tests/test_collection_mutations.py +264 -0
  211. machine_dialect/parser/tests/test_conditional_expressions.py +343 -0
  212. machine_dialect/parser/tests/test_define_integration.py +468 -0
  213. machine_dialect/parser/tests/test_define_statements.py +311 -0
  214. machine_dialect/parser/tests/test_dict_extraction.py +115 -0
  215. machine_dialect/parser/tests/test_empty_literal.py +155 -0
  216. machine_dialect/parser/tests/test_float_literal_expressions.py +163 -0
  217. machine_dialect/parser/tests/test_identifier_expressions.py +57 -0
  218. machine_dialect/parser/tests/test_if_empty_block.py +61 -0
  219. machine_dialect/parser/tests/test_if_statements.py +299 -0
  220. machine_dialect/parser/tests/test_illegal_tokens.py +86 -0
  221. machine_dialect/parser/tests/test_infix_expressions.py +680 -0
  222. machine_dialect/parser/tests/test_integer_literal_expressions.py +137 -0
  223. machine_dialect/parser/tests/test_interaction_statements.py +269 -0
  224. machine_dialect/parser/tests/test_list_literals.py +277 -0
  225. machine_dialect/parser/tests/test_no_none_in_ast.py +94 -0
  226. machine_dialect/parser/tests/test_panic_mode_recovery.py +171 -0
  227. machine_dialect/parser/tests/test_parse_errors.py +114 -0
  228. machine_dialect/parser/tests/test_possessive_syntax.py +182 -0
  229. machine_dialect/parser/tests/test_prefix_expressions.py +415 -0
  230. machine_dialect/parser/tests/test_program.py +13 -0
  231. machine_dialect/parser/tests/test_return_statements.py +89 -0
  232. machine_dialect/parser/tests/test_set_statements.py +152 -0
  233. machine_dialect/parser/tests/test_strict_equality.py +258 -0
  234. machine_dialect/parser/tests/test_symbol_table.py +217 -0
  235. machine_dialect/parser/tests/test_url_literal_expressions.py +209 -0
  236. machine_dialect/parser/tests/test_utility_statements.py +423 -0
  237. machine_dialect/parser/token_buffer.py +159 -0
  238. machine_dialect/repl/__init__.py +3 -0
  239. machine_dialect/repl/repl.py +426 -0
  240. machine_dialect/repl/tests/__init__.py +0 -0
  241. machine_dialect/repl/tests/test_repl.py +606 -0
  242. machine_dialect/semantic/__init__.py +12 -0
  243. machine_dialect/semantic/analyzer.py +906 -0
  244. machine_dialect/semantic/error_messages.py +189 -0
  245. machine_dialect/semantic/tests/__init__.py +1 -0
  246. machine_dialect/semantic/tests/test_analyzer.py +364 -0
  247. machine_dialect/semantic/tests/test_error_messages.py +104 -0
  248. machine_dialect/tests/edge_cases/__init__.py +10 -0
  249. machine_dialect/tests/edge_cases/test_boundary_access.py +256 -0
  250. machine_dialect/tests/edge_cases/test_empty_collections.py +166 -0
  251. machine_dialect/tests/edge_cases/test_invalid_operations.py +243 -0
  252. machine_dialect/tests/edge_cases/test_named_list_edge_cases.py +295 -0
  253. machine_dialect/tests/edge_cases/test_nested_structures.py +313 -0
  254. machine_dialect/tests/edge_cases/test_type_mixing.py +277 -0
  255. machine_dialect/tests/integration/test_array_operations_emulation.py +248 -0
  256. machine_dialect/tests/integration/test_list_compilation.py +395 -0
  257. machine_dialect/tests/integration/test_lists_and_dictionaries.py +322 -0
  258. machine_dialect/type_checking/__init__.py +21 -0
  259. machine_dialect/type_checking/tests/__init__.py +1 -0
  260. machine_dialect/type_checking/tests/test_type_system.py +230 -0
  261. machine_dialect/type_checking/type_system.py +270 -0
  262. machine_dialect-0.1.0a1.dist-info/METADATA +128 -0
  263. machine_dialect-0.1.0a1.dist-info/RECORD +268 -0
  264. machine_dialect-0.1.0a1.dist-info/WHEEL +5 -0
  265. machine_dialect-0.1.0a1.dist-info/entry_points.txt +3 -0
  266. machine_dialect-0.1.0a1.dist-info/licenses/LICENSE +201 -0
  267. machine_dialect-0.1.0a1.dist-info/top_level.txt +2 -0
  268. machine_dialect_vm/__init__.pyi +15 -0
machine_dialect/parser/parser.py
@@ -0,0 +1,3655 @@
+ # mypy: disable-error-code="comparison-overlap"
+
+ import re
+ from collections.abc import Callable
+ from copy import copy
+
+ from machine_dialect.ast import (
+     ActionStatement,
+     Arguments,
+     BlockStatement,
+     CallExpression,
+     CallStatement,
+     CollectionAccessExpression,
+     ConditionalExpression,
+     DefineStatement,
+     EmptyLiteral,
+     ErrorExpression,
+     ErrorStatement,
+     Expression,
+     ExpressionStatement,
+     FloatLiteral,
+     ForEachStatement,
+     Identifier,
+     IfStatement,
+     InfixExpression,
+     InteractionStatement,
+     NamedListLiteral,
+     OrderedListLiteral,
+     Output,
+     Parameter,
+     PrefixExpression,
+     Program,
+     ReturnStatement,
+     SayStatement,
+     SetStatement,
+     Statement,
+     StringLiteral,
+     UnorderedListLiteral,
+     URLLiteral,
+     UtilityStatement,
+     WhileStatement,
+     WholeNumberLiteral,
+     YesNoLiteral,
+ )
+ from machine_dialect.errors.exceptions import MDBaseException, MDNameError, MDSyntaxError, MDTypeError
+ from machine_dialect.errors.messages import (
+     EMPTY_ELSE_BLOCK,
+     EMPTY_FOR_EACH_BODY,
+     EMPTY_IF_CONSEQUENCE,
+     EMPTY_WHILE_BODY,
+     EXPECTED_DETAILS_CLOSE,
+     EXPECTED_EXPRESSION,
+     EXPECTED_FUNCTION_NAME,
+     EXPECTED_IDENTIFIER_AFTER,
+     EXPECTED_TOKEN,
+     EXPECTED_TOKEN_AFTER,
+     ILLEGAL_TOKEN,
+     INVALID_ARGUMENT_VALUE,
+     INVALID_FLOAT_LITERAL,
+     INVALID_INTEGER_LITERAL,
+     INVALID_TYPE_NAME,
+     MISSING_COMMA_BETWEEN_ARGS,
+     MISSING_DEPTH_TRANSITION,
+     NAME_UNDEFINED,
+     NO_PARSE_FUNCTION,
+     UNEXPECTED_BLOCK_DEPTH,
+     UNEXPECTED_STATEMENT,
+     UNEXPECTED_TOKEN_AT_START,
+     UNHANDLED_OPERATION,
+     VARIABLE_ALREADY_DEFINED,
+     VARIABLE_NOT_DEFINED,
+     ErrorTemplate,
+ )
+ from machine_dialect.lexer import Lexer
+ from machine_dialect.lexer.tokens import Token, TokenType
+ from machine_dialect.parser import Precedence
+ from machine_dialect.parser.protocols import (
+     InfixParseFuncs,
+     PostfixParseFuncs,
+     PrefixParseFuncs,
+ )
+ from machine_dialect.parser.symbol_table import SymbolTable
+ from machine_dialect.parser.token_buffer import TokenBuffer
+ from machine_dialect.type_checking import TypeSpec, check_type_compatibility, get_type_from_value
+
+ PRECEDENCES: dict[TokenType, Precedence] = {
+     # Ternary conditional
+     TokenType.KW_IF: Precedence.TERNARY,
+     # Logical operators
+     TokenType.KW_OR: Precedence.LOGICAL_OR,
+     TokenType.KW_AND: Precedence.LOGICAL_AND,
+     # Comparison operators
+     TokenType.OP_EQ: Precedence.REL_SYM_COMP,
+     TokenType.OP_NOT_EQ: Precedence.REL_SYM_COMP,
+     TokenType.OP_STRICT_EQ: Precedence.REL_SYM_COMP,
+     TokenType.OP_STRICT_NOT_EQ: Precedence.REL_SYM_COMP,
+     TokenType.OP_LT: Precedence.REL_ASYM_COMP,
+     TokenType.OP_GT: Precedence.REL_ASYM_COMP,
+     TokenType.OP_LTE: Precedence.REL_ASYM_COMP,
+     TokenType.OP_GTE: Precedence.REL_ASYM_COMP,
+     # Arithmetic operators
+     TokenType.OP_PLUS: Precedence.MATH_ADD_SUB,
+     TokenType.OP_MINUS: Precedence.MATH_ADD_SUB,
+     TokenType.OP_STAR: Precedence.MATH_PROD_DIV_MOD,
+     TokenType.OP_DIVISION: Precedence.MATH_PROD_DIV_MOD,
+     TokenType.OP_CARET: Precedence.MATH_EXPONENT,
+     # Dictionary extraction operators (postfix-like)
+     TokenType.OP_THE_NAMES_OF: Precedence.UNARY_POST_OPERATOR,
+     TokenType.OP_THE_CONTENTS_OF: Precedence.UNARY_POST_OPERATOR,
+ }
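# Editor's note (illustrative; not part of the package diff): assuming the
# Precedence enum orders MATH_PROD_DIV_MOD above MATH_ADD_SUB, as the names
# suggest, this table makes a hypothetical source expression `1 + 2 * 3`
# parse as `1 + (2 * 3)`: the Pratt loop in `_parse_expression` below only
# consumes the next infix operator while `precedence < self._peek_precedence()`.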
+
+ TYPING_MAP: dict[TokenType, str] = {
+     TokenType.KW_TEXT: "Text",
+     TokenType.KW_WHOLE_NUMBER: "Whole Number",
+     TokenType.KW_FLOAT: "Float",
+     TokenType.KW_NUMBER: "Number",
+     TokenType.KW_YES_NO: "Yes/No",
+     TokenType.KW_URL: "URL",
+     TokenType.KW_DATE: "Date",
+     TokenType.KW_DATETIME: "DateTime",
+     TokenType.KW_TIME: "Time",
+     TokenType.KW_LIST: "List",
+     TokenType.KW_ORDERED_LIST: "Ordered List",
+     TokenType.KW_UNORDERED_LIST: "Unordered List",
+     TokenType.KW_NAMED_LIST: "Named List",
+     TokenType.KW_EMPTY: "Empty",
+ }
+
+ __all__ = ["Parser"]
+
+
+ class Parser:
+     """Parser for Machine Dialect™ language.
+
+     Transforms source code into an Abstract Syntax Tree (AST) by first
+     tokenizing it with the lexer and then parsing the tokens.
+     Also collects any lexical errors from the tokenizer.
+
+     Attributes:
+         errors (list[MDBaseException]): List of errors encountered during parsing,
+             including lexical errors from the tokenizer.
+     """
+
+     def __init__(self) -> None:
+         """Initialize the parser."""
+         self._current_token: Token | None = None
+         self._peek_token: Token | None = None
+         self._token_buffer: TokenBuffer | None = None
+         self._errors: list[MDBaseException] = []
+         self._panic_count = 0  # Track panic-mode recoveries
+         self._block_depth = 0  # Track if we're inside block statements
+         self._symbol_table: SymbolTable = SymbolTable()  # Track variable definitions
+
+         self._prefix_parse_funcs: PrefixParseFuncs = self._register_prefix_funcs()
+         self._infix_parse_funcs: InfixParseFuncs = self._register_infix_funcs()
+         self._postfix_parse_funcs: PostfixParseFuncs = self._register_postfix_funcs()
+
+     def parse(self, source: str, as_hir: bool = False, check_semantics: bool = True) -> Program:
+         """Parse the source code into an AST.
+
+         Args:
+             source: The source code to parse.
+             as_hir: If True, return a HIR (high-level intermediate representation).
+             check_semantics: If True, perform semantic analysis.
+
+         Returns:
+             The root Program node of the AST.
+
+         Note:
+             Any errors encountered during parsing are added to the
+             errors attribute. The parser attempts to continue parsing
+             even after encountering errors using panic-mode recovery.
+         """
+         # Reset parser state for new parse
+         self._reset_state()
+
+         # Create lexer and token buffer for streaming
+         lexer = Lexer(source)
+         self._token_buffer = TokenBuffer(lexer)
+
+         # Initialize token pointers
+         self._advance_tokens()
+         self._advance_tokens()
+
+         # Skip frontmatter if present
+         self._skip_frontmatter()
+
+         # Parse the program
+         program: Program = Program(statements=[])
+
+         assert self._current_token is not None
+         while self._current_token.type != TokenType.MISC_EOF and self._panic_count < 20:
+             # Skip standalone periods
+             if self._current_token.type == TokenType.PUNCT_PERIOD:
+                 self._advance_tokens()
+                 continue
+
+             # Save the token position before parsing
+             token_before = self._current_token
+
+             statement = self._parse_statement()
+             program.statements.append(statement)
+
+             # If we haven't advanced past the token we started with, we need to advance.
+             # This happens when expression parsing leaves us at the last token.
+             if self._current_token == token_before:
+                 self._advance_tokens()
+             # After parsing a statement, skip any trailing period
+             elif self._current_token and self._current_token.type == TokenType.PUNCT_PERIOD:
+                 self._advance_tokens()
+
+         # Perform semantic analysis if requested
+         if check_semantics and not self._errors:
+             from machine_dialect.semantic.analyzer import SemanticAnalyzer
+
+             analyzer = SemanticAnalyzer()
+             program, semantic_errors = analyzer.analyze(program)
+             self._errors.extend(semantic_errors)
+
+         return program.desugar() if as_hir else program
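# Editor's note (illustrative; not part of the package diff): a minimal usage
# sketch of the API defined above. Only Parser(), parse(), has_errors() and
# the errors property come from this file; the source snippet is a
# hypothetical Machine Dialect™ program.
#
#     from machine_dialect.parser.parser import Parser
#
#     parser = Parser()
#     program = parser.parse("Set `x` to _5_.", check_semantics=True)
#     if parser.has_errors():
#         for error in parser.errors:
#             print(error)
#     else:
#         hir = parser.parse("Set `x` to _5_.", as_hir=True)  # desugared form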
+
+     def _reset_state(self) -> None:
+         """Reset the parser state for a new parse."""
+         self._current_token = None
+         self._peek_token = None
+         self._token_buffer = None
+         self._errors = []
+         self._panic_count = 0
+         self._block_depth = 0
+         self._symbol_table = SymbolTable()  # Reset symbol table for new parse
+
+     def has_errors(self) -> bool:
+         """Check if any errors were encountered during parsing.
+
+         Returns:
+             True if there are any errors, False otherwise.
+         """
+         return len(self._errors) > 0
+
+     @property
+     def errors(self) -> list[MDBaseException]:
+         """Get the list of errors encountered during parsing.
+
+         This includes both lexical errors from the tokenizer and syntax errors
+         from the parser. Errors are collected in the order they were encountered.
+
+         Returns:
+             List of MDBaseException instances representing all errors found
+             during lexical analysis and parsing.
+         """
+         return copy(self._errors)
+
+     def _skip_frontmatter(self) -> None:
+         """Skip YAML frontmatter section if present at the beginning of the document.
+
+         Frontmatter starts with --- and ends with another --- on its own line.
+         Everything between these delimiters is skipped.
+         """
+         # Check if we're at the beginning with a frontmatter delimiter
+         if self._current_token and self._current_token.type == TokenType.PUNCT_FRONTMATTER:
+             # Skip tokens until we find the closing frontmatter delimiter
+             self._advance_tokens()
+
+             while self._current_token and self._current_token.type != TokenType.MISC_EOF:
+                 if self._current_token.type == TokenType.PUNCT_FRONTMATTER:
+                     # Found closing delimiter, skip it and exit
+                     self._advance_tokens()
+                     break
+                 # Skip any token that's not a closing frontmatter delimiter
+                 self._advance_tokens()
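# Editor's note (illustrative; not part of the package diff): the kind of
# input _skip_frontmatter handles. Everything between the two `---`
# delimiters is discarded before parsing begins (field names hypothetical):
#
#     ---
#     title: example program
#     ---
#     Set `x` to _1_.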
+
+     def _advance_tokens(self) -> None:
+         """Advance to the next token in the stream.
+
+         Moves the peek token to current token and reads the next token
+         into peek token from the buffer. Automatically skips MISC_STOPWORD tokens.
+         """
+         self._current_token = self._peek_token
+
+         # Skip any stopword tokens
+         if self._token_buffer:
+             while True:
+                 self._peek_token = self._token_buffer.current()
+                 if self._peek_token is None:
+                     self._peek_token = Token(TokenType.MISC_EOF, "", line=1, position=1)
+                     break
+
+                 self._token_buffer.advance()
+
+                 # Skip stopwords and backslashes
+                 if self._peek_token.type not in (TokenType.MISC_STOPWORD, TokenType.PUNCT_BACKSLASH):
+                     break
+         else:
+             # No buffer available
+             self._peek_token = Token(TokenType.MISC_EOF, "", line=1, position=1)
+
+     def _current_precedence(self) -> Precedence:
+         """Get the precedence of the current token.
+
+         Returns:
+             The precedence level of the current token, or LOWEST if not found.
+         """
+         assert self._current_token is not None
+         return PRECEDENCES.get(self._current_token.type, Precedence.LOWEST)
+
+     def _peek_precedence(self) -> Precedence:
+         """Get the precedence of the peek token.
+
+         Returns:
+             The precedence level of the peek token, or LOWEST if not found.
+         """
+         assert self._peek_token is not None
+         return PRECEDENCES.get(self._peek_token.type, Precedence.LOWEST)
+
+     def _panic_recovery(self, stop_at: list[TokenType] | None = None, stop_at_types: bool = False) -> list[Token]:
+         """Unified error recovery: skip tokens until finding a synchronization point.
+
+         Args:
+             stop_at: Token types to stop at (default: [PERIOD, EOF]).
+             stop_at_types: If True, also stop at type keywords.
+
+         Returns:
+             List of tokens that were skipped during recovery.
+         """
+         self._panic_count += 1  # Always increment to prevent infinite loops
+
+         if stop_at is None:
+             stop_at = [TokenType.PUNCT_PERIOD, TokenType.MISC_EOF]
+
+         skipped_tokens = []
+
+         # Skip tokens until finding a synchronization point
+         while self._peek_token is not None and self._peek_token.type not in stop_at:
+             self._advance_tokens()
+             if self._current_token is not None:
+                 # Check if we should stop at type keywords
+                 if stop_at_types and self._is_type_token(self._current_token.type):
+                     break
+                 skipped_tokens.append(self._current_token)
+
+         # Advance one more token to move past the last error token.
+         # This prevents the main loop from trying to parse the last token again.
+         if self._current_token is not None and self._current_token.type != TokenType.MISC_EOF:
+             self._advance_tokens()
+
+         return skipped_tokens
+
+     def _report_error_and_recover(
+         self,
+         template: ErrorTemplate,
+         error_type: str = "syntax",
+         expected_token: TokenType | None = None,
+         recovery_tokens: list[TokenType] | None = None,
+         recovery_to_types: bool = False,
+         skip_recovery: bool = False,
+         is_expression: bool = False,
+         **kwargs: str,
+     ) -> ErrorStatement | ErrorExpression:
+         """Unified error handling: always adds the error to the list and returns an error node.
+
+         Args:
+             template: ErrorTemplate with the error message.
+             error_type: Type of error - "syntax", "name", or "type".
+             expected_token: Expected token type (for syntax errors).
+             recovery_tokens: Specific tokens to recover to.
+             recovery_to_types: If True, recover to type keywords.
+             skip_recovery: If True, skip error recovery.
+             is_expression: If True, return ErrorExpression instead of ErrorStatement.
+             **kwargs: Template substitution parameters.
+
+         Returns:
+             ErrorStatement or ErrorExpression with consistent error reporting.
+         """
+         # Determine which token to use for error reporting.
+         # If we have an expected_token, we're likely checking peek_token;
+         # otherwise, use current_token.
+         if expected_token and self._peek_token:
+             token = self._peek_token or Token(TokenType.MISC_EOF, "", 0, 0)
+         else:
+             token = self._current_token or Token(TokenType.MISC_EOF, "", 0, 0)
+
+         # Create appropriate error type
+         error: MDBaseException
+         if error_type == "name":
+             error = MDNameError(message=template, line=token.line, column=token.position, **kwargs)
+         elif error_type == "type":
+             error = MDTypeError(message=template, line=token.line, column=token.position, **kwargs)
+         else:  # syntax
+             # Special case: if we expected an identifier and got something else,
+             # check if it's an illegal token (syntax error) or something else (name error).
+             if expected_token == TokenType.MISC_IDENT and token.type != TokenType.MISC_IDENT:
+                 # If it's an illegal token, it's a syntax error
+                 if token.type == TokenType.MISC_ILLEGAL:
+                     error = MDSyntaxError(
+                         message=ILLEGAL_TOKEN,
+                         line=token.line,
+                         column=token.position,
+                         token=token.literal,
+                     )
+                 else:
+                     # Otherwise, it's still a name error (e.g., keyword used as identifier)
+                     from machine_dialect.errors.messages import ILLEGAL_CHARACTER
+
+                     # Get a human-readable name for the expected token
+                     expected_name = "identifier"
+                     error = MDNameError(
+                         message=ILLEGAL_CHARACTER,
+                         line=token.line,
+                         column=token.position,
+                         expected=expected_name,
+                         character=token.literal,
+                     )
+             else:
+                 error = MDSyntaxError(message=template, line=token.line, column=token.position, **kwargs)
+
+         # ALWAYS add the error to ensure consistency
+         self._errors.append(error)
+
+         # Perform recovery unless explicitly skipped
+         skipped = []
+         if not skip_recovery:
+             skipped = self._panic_recovery(stop_at=recovery_tokens, stop_at_types=recovery_to_types)
+
+         # Get formatted message
+         formatted_message = template.format(**kwargs) if kwargs else template.substitute()
+
+         # Return appropriate error node type
+         if is_expression:
+             return ErrorExpression(token=token, message=formatted_message)
+         else:
+             return ErrorStatement(token=token, skipped_tokens=skipped, message=formatted_message)
+
+     def _expect_token(
+         self,
+         token_type: TokenType,
+         context: str | None = None,
+         error_message: str | None = None,
+         error_node: ErrorExpression | None = None,
+     ) -> ErrorStatement | ErrorExpression | None:
+         """Unified token expectation with automatic error handling.
+
+         Args:
+             token_type: Expected token type.
+             context: Context for error message (e.g., "after 'Set'").
+             error_message: Full custom error message (overrides auto-generated).
+             error_node: Custom ErrorExpression to return (for expression contexts).
+
+         Returns:
+             None if the token matches (advances and continues),
+             the given ErrorExpression if error_node is provided and the token doesn't match,
+             or an ErrorStatement if the token doesn't match (with appropriate recovery).
+         """
+         if self._peek_token and self._peek_token.type == token_type:
+             self._advance_tokens()
+             return None  # Success, continue parsing
+
+         # Token doesn't match - use appropriate error template
+         from machine_dialect.errors.messages import EXPECTED_TOKEN, EXPECTED_TOKEN_AFTER
+
+         if context:
+             # Use "Expected X after Y" template
+             token_name = token_type.name.lower().replace("_", " ")
+             template = EXPECTED_TOKEN_AFTER
+             kwargs = {"expected": token_name, "after": context}
+         else:
+             # Use simple "Expected X" template
+             token_name = token_type.name.lower().replace("_", " ")
+             template = EXPECTED_TOKEN
+             # Get the actual token type that was found
+             actual_token = self._peek_token if self._peek_token else None
+             got_type = actual_token.type.name if actual_token else "EOF"
+             kwargs = {"token": token_name, "got_token_type": got_type}
+
+         # Special handling for period (statement terminator)
+         skip_recovery = token_type == TokenType.PUNCT_PERIOD
+
+         # Always use _report_error_and_recover for error reporting
+         error_statement = self._report_error_and_recover(
+             template=template,
+             expected_token=token_type,
+             skip_recovery=skip_recovery or error_node is not None,  # Skip recovery for expressions too
+             error_type="syntax",
+             recovery_tokens=None,
+             recovery_to_types=False,
+             is_expression=False,
+             **kwargs,
+         )
+
+         # If a custom error node was provided, return it instead of the ErrorStatement
+         if error_node is not None:
+             return error_node
+
+         return error_statement
+
+     def _parse_expression(self, precedence: Precedence = Precedence.LOWEST) -> Expression:
+         """Parse an expression with a given precedence level.
+
+         Args:
+             precedence: The minimum precedence level to parse. Defaults to LOWEST.
+
+         Returns:
+             An Expression AST node if successful, or an ErrorExpression if parsing fails.
+         """
+         assert self._current_token is not None
+
+         # Handle illegal tokens
+         if self._current_token.type == TokenType.MISC_ILLEGAL:
+             # Report as syntax error, not name error
+             result = self._report_error_and_recover(
+                 template=ILLEGAL_TOKEN,
+                 error_type="syntax",
+                 token=self._current_token.literal,
+                 skip_recovery=True,  # Don't recover - let caller handle advancement
+                 is_expression=True,
+             )
+             assert isinstance(result, ErrorExpression)
+             return result
+
+         if self._current_token.type not in self._prefix_parse_funcs:
+             # No prefix parse function: determine which error template to use
+             # and its parameters.
+             if self._current_token.type in self._infix_parse_funcs:
+                 # An infix operator appeared at the start of an expression
+                 error_expr = self._report_error_and_recover(
+                     template=UNEXPECTED_TOKEN_AT_START,
+                     token=self._current_token.literal,
+                     skip_recovery=True,
+                     is_expression=True,
+                 )
+             elif self._current_token.type == TokenType.MISC_EOF:
+                 error_expr = self._report_error_and_recover(
+                     template=EXPECTED_EXPRESSION,
+                     got="<end-of-file>",
+                     skip_recovery=True,
+                     is_expression=True,
+                 )
+             else:
+                 error_expr = self._report_error_and_recover(
+                     template=NO_PARSE_FUNCTION,
+                     literal=self._current_token.literal,
+                     skip_recovery=True,
+                     is_expression=True,
+                 )
+
+             # Advance past the problematic token so we can continue parsing
+             if self._current_token.type != TokenType.MISC_EOF:
+                 self._advance_tokens()
+             assert isinstance(error_expr, ErrorExpression)
+             return error_expr
+
+         prefix_parse_fn = self._prefix_parse_funcs[self._current_token.type]
+
+         left_expression = prefix_parse_fn()
+
+         # Handle infix operators
+         assert self._peek_token is not None
+         while self._peek_token.type != TokenType.PUNCT_PERIOD and precedence < self._peek_precedence():
+             if self._peek_token.type not in self._infix_parse_funcs:
+                 return left_expression
+
+             self._advance_tokens()
+
+             assert self._current_token is not None
+             infix_parse_fn = self._infix_parse_funcs[self._current_token.type]
+             left_expression = infix_parse_fn(left_expression)
+
+             assert self._peek_token is not None
+
+         return left_expression
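# Editor's note (illustrative; not part of the package diff): how the loop
# above would handle a hypothetical `1 + 2 * 3`. The prefix function parses
# `1`; `+` has precedence MATH_ADD_SUB, which exceeds LOWEST, so its infix
# function is called with left = 1. Assuming that infix function re-enters
# _parse_expression with its own precedence (the usual Pratt convention; the
# registration helpers are defined later in this file), the higher-precedence
# `*` folds `2 * 3` before `+` completes, yielding
# InfixExpression('+', 1, InfixExpression('*', 2, 3)).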
+
+     def _parse_expression_statement(self) -> ExpressionStatement | ErrorStatement:
+         assert self._current_token is not None
+
+         expression = self._parse_expression()
+
+         expression_statement = ExpressionStatement(
+             token=self._current_token,
+             expression=expression,
+         )
+
+         # Require a trailing period if not at EOF or if we're in a block
+         assert self._peek_token is not None
+         if self._peek_token.type != TokenType.MISC_EOF or self._block_depth > 0:
+             if error := self._expect_token(TokenType.PUNCT_PERIOD):
+                 assert isinstance(error, ErrorStatement)
+                 return error
+
+         # Advance past the last token of the expression;
+         # expression parsing leaves us at the last token, not after it.
+         self._advance_tokens()
+
+         return expression_statement
+
+     def _parse_float_literal(self) -> FloatLiteral | ErrorExpression:
+         assert self._current_token is not None
+
+         # The lexer has already validated and cleaned the literal,
+         # so we can directly parse it as a float.
+         try:
+             value = float(self._current_token.literal)
+         except ValueError:
+             # This shouldn't happen if the lexer is working correctly
+             result = self._report_error_and_recover(
+                 template=INVALID_FLOAT_LITERAL,
+                 literal=self._current_token.literal,
+                 skip_recovery=True,
+                 is_expression=True,
+             )
+             assert isinstance(result, ErrorExpression)
+             return result
+
+         return FloatLiteral(
+             token=self._current_token,
+             value=value,
+         )
+
+     def _parse_identifier(self) -> Identifier:
+         assert self._current_token is not None
+
+         return Identifier(
+             token=self._current_token,
+             value=self._current_token.literal,
+         )
+
+     def _parse_identifier_or_keyword_as_identifier(self) -> Identifier | None:
+         """Parse an identifier, accepting keywords as identifiers when appropriate.
+
+         This is useful when a keyword appears where we expect an identifier,
+         like variable names that happen to match keywords.
+
+         Returns:
+             An Identifier AST node, or None if the current token can't be used as an identifier.
+         """
+         if not self._current_token:
+             return None
+
+         # Accept actual identifiers
+         if self._current_token.type == TokenType.MISC_IDENT:
+             return self._parse_identifier()
+
+         # Also accept any keyword that has a literal value as an identifier.
+         # This allows using words like "items", "first", etc. as variable names.
+         if self._current_token.literal:
+             # Create an identifier from the keyword's literal
+             return Identifier(
+                 token=self._current_token,
+                 value=self._current_token.literal,
+             )
+
+         return None
+
+     def _parse_integer_literal(self) -> WholeNumberLiteral | ErrorExpression:
+         assert self._current_token is not None
+
+         # The lexer has already validated and cleaned the literal,
+         # so we can directly parse it as an integer.
+         try:
+             value = int(self._current_token.literal)
+         except ValueError:
+             # This shouldn't happen if the lexer is working correctly
+             result = self._report_error_and_recover(
+                 template=INVALID_INTEGER_LITERAL,
+                 literal=self._current_token.literal,
+                 skip_recovery=True,
+                 is_expression=True,
+             )
+             assert isinstance(result, ErrorExpression)
+             return result
+
+         return WholeNumberLiteral(
+             token=self._current_token,
+             value=value,
+         )
+
+     def _parse_boolean_literal(self) -> YesNoLiteral:
+         """Parse a boolean literal.
+
+         Returns:
+             A YesNoLiteral AST node.
+
+         Note:
+             The lexer has already validated and provided the canonical
+             representation of the boolean literal ("True" or "False").
+         """
+         assert self._current_token is not None
+
+         # Determine the boolean value based on the token type
+         value = self._current_token.type == TokenType.LIT_YES
+
+         return YesNoLiteral(
+             token=self._current_token,
+             value=value,
+         )
+
+     def _parse_empty_literal(self) -> EmptyLiteral:
+         """Parse an empty literal.
+
+         Returns:
+             An EmptyLiteral AST node.
+         """
+         assert self._current_token is not None
+
+         return EmptyLiteral(
+             token=self._current_token,
+         )
+
+     def _parse_string_literal(self) -> StringLiteral:
+         """Parse a string literal.
+
+         Returns:
+             A StringLiteral AST node.
+         """
+         assert self._current_token is not None
+
+         # Extract the actual string value without quotes
+         literal = self._current_token.literal
+         if literal.startswith('"') and literal.endswith('"'):
+             value = literal[1:-1]
+         elif literal.startswith("'") and literal.endswith("'"):
+             value = literal[1:-1]
+         else:
+             # Fallback if no quotes found
+             value = literal
+
+         return StringLiteral(
+             token=self._current_token,
+             value=value,
+         )
+
+     def _parse_list_literal(self) -> Expression:
+         """Parse a list literal (unordered, ordered, or named).
+
+         Called when the current token is the first list item marker after "Set x to:".
+         Determines the list type based on the first item marker and
+         delegates to the appropriate specialized parser.
+
+         Returns:
+             UnorderedListLiteral, OrderedListLiteral, or NamedListLiteral
+         """
+         # SetStatement has already advanced past the colon to the first list item.
+         # The current token should be the first list item marker (dash, number, or EOF for an empty list).
+         if not self._current_token:
+             return ErrorExpression(
+                 token=Token(TokenType.MISC_EOF, "", 0, 0),
+                 message="Unexpected EOF while parsing list",
+             )
+
+         # Save the starting token for error reporting
+         list_token = self._current_token
+
+         # List context should already be set by SetStatement
+
+         # Check for empty list (no items)
+         current_type = self._current_token.type if self._current_token else None
+         if current_type in (None, TokenType.MISC_EOF) or (
+             # Also check if we hit a statement terminator or new statement
+             current_type in (TokenType.PUNCT_PERIOD, TokenType.KW_SET, TokenType.KW_DEFINE)
+         ):
+             # Empty list - default to unordered
+             return UnorderedListLiteral(token=list_token, elements=[])
+
+         # Skip any stopwords that might appear
+         while self._current_token and self._current_token.type in (TokenType.MISC_STOPWORD,):
+             self._advance_tokens()
+
+         # current_token should now be the first list item marker (dash or number)
+         current_type = self._current_token.type if self._current_token else None
+
+         # Look at what type of list this is
+         if current_type == TokenType.PUNCT_DASH:
+             # Check if it's a named list by looking for the pattern: dash, key, colon.
+             # Named lists have entries like: - _"key"_: value
+
+             # Use the token buffer to peek ahead without advancing
+             is_named_list = False
+
+             # We're at the dash, peek_token is the key
+             if self._peek_token:
+                 # Check the token after the key using the buffer
+                 if self._token_buffer:
+                     # The buffer's current token is the token after our peek_token
+                     colon_after_key = self._token_buffer.current()
+
+                     # Check if we have the pattern: dash, key, colon
+                     if colon_after_key and colon_after_key.type == TokenType.PUNCT_COLON:
+                         # Check if peek_token (the key) is valid for a named list
+                         peek_type = self._peek_token.type
+                         if peek_type in (
+                             TokenType.LIT_TEXT,
+                             TokenType.MISC_IDENT,
+                             TokenType.KW_NAME,
+                             TokenType.KW_CONTENT,
+                         ):
+                             is_named_list = True
+                         elif self._peek_token.literal and self._peek_token.literal.lower() in (
+                             "age",
+                             "active",
+                             "profession",
+                         ):
+                             is_named_list = True
+
+             result: Expression
+             if is_named_list:
+                 result = self._parse_named_list_literal(list_token)
+             else:
+                 # Not a named list, it's an unordered list
+                 result = self._parse_unordered_list_literal(list_token)
+
+         # Check if it's an ordered list (number followed by period)
+         elif (
+             current_type == TokenType.LIT_WHOLE_NUMBER
+             and self._peek_token
+             and self._peek_token.type == TokenType.PUNCT_PERIOD
+         ):
+             result = self._parse_ordered_list_literal(list_token)
+         else:
+             # Invalid list format - return error expression
+             result = ErrorExpression(
+                 token=self._current_token or list_token,
+                 message=(
+                     f"Expected list item marker (dash or number), got "
+                     f"{self._current_token.type if self._current_token else 'EOF'}"
+                 ),
+             )
+
+         return result
+
+     def _parse_unordered_list_literal(self, list_token: Token) -> UnorderedListLiteral:
+         """Parse an unordered list (dash-prefixed items).
+
+         Args:
+             list_token: The token marking the start of the list
+
+         Returns:
+             UnorderedListLiteral with parsed items
+         """
+         items: list[Expression] = []
+
+         # The current token may not be at the dash if we came from lookahead;
+         # the loop below simply stops when no dash is found.
+         while True:
+             # Check if we're at a dash (list item marker)
+             if not self._current_token:
+                 break
+             token_type = self._current_token.type
+             if token_type != TokenType.PUNCT_DASH:
+                 break
+
+             # Move past dash
+             self._advance_tokens()
+
+             # Parse the item expression
+             item = self._parse_expression(Precedence.LOWEST)
+             items.append(item)
+
+             # After parsing the expression, advance to check for the period
+             self._advance_tokens()
+
+             # Each list item must end with a period
+             if self._current_token and self._current_token.type == TokenType.PUNCT_PERIOD:
+                 # Good, we have the required period.
+                 # Now check if there's another list item.
+                 if self._peek_token and self._peek_token.type == TokenType.PUNCT_DASH:
+                     # There's another list item, advance to it
+                     self._advance_tokens()
+                 else:
+                     # No more list items - we're done
+                     break
+             else:
+                 # Missing period after list item - use unified error handling
+                 from machine_dialect.errors.messages import EXPECTED_TOKEN_AFTER
+
+                 self._report_error_and_recover(
+                     template=EXPECTED_TOKEN_AFTER,
+                     expected_token=TokenType.PUNCT_PERIOD,
+                     skip_recovery=True,  # We'll handle recovery manually below
+                     expected="period",
+                     after="list item",
+                 )
+
+                 # Check if we're at another dash (next item) or done
+                 if self._current_token and self._current_token.type == TokenType.PUNCT_DASH:
+                     # Continue with next item despite missing period
+                     continue
+                 else:
+                     # No more items
+                     break
+
+         return UnorderedListLiteral(token=list_token, elements=items)
+
+     def _parse_ordered_list_literal(self, list_token: Token) -> OrderedListLiteral:
+         """Parse an ordered list (numbered items like 1., 2., etc).
+
+         Args:
+             list_token: The token marking the start of the list
+
+         Returns:
+             OrderedListLiteral with parsed items
+         """
+         items: list[Expression] = []
+
+         while True:
+             # Check if we're at a number (ordered list item marker)
+             if not self._current_token:
+                 break
+             token_type = self._current_token.type
+             if token_type != TokenType.LIT_WHOLE_NUMBER:
+                 break
+
+             # Skip the number
+             self._advance_tokens()
+
+             # Check for period after number (this is the list marker period, e.g., "1.")
+             if not self._current_token or self._current_token.type != TokenType.PUNCT_PERIOD:
+                 break
+
+             # Move past the list marker period
+             self._advance_tokens()
+
+             # Parse the item expression
+             item = self._parse_expression(Precedence.LOWEST)
+             items.append(item)
+
+             # After parsing expression, advance to check for item-terminating period
+             self._advance_tokens()
+
+             # Each list item must end with a period
+             if self._current_token and self._current_token.type == TokenType.PUNCT_PERIOD:
+                 # Good, we have the required period
+                 # Check if there's another list item
+                 if self._peek_token and self._peek_token.type == TokenType.LIT_WHOLE_NUMBER:
+                     # There's another list item, advance to it
+                     self._advance_tokens()
+                 else:
+                     # No more list items - we're done
+                     break
+             else:
+                 # Missing period after list item - use unified error handling
+                 from machine_dialect.errors.messages import EXPECTED_TOKEN_AFTER
+
+                 self._report_error_and_recover(
+                     template=EXPECTED_TOKEN_AFTER,
+                     expected_token=TokenType.PUNCT_PERIOD,
+                     skip_recovery=True,  # We'll handle recovery manually below
+                     expected="period",
+                     after="list item",
+                 )
+
+                 # Check if we're at another number (next item) or done
+                 if self._current_token and self._current_token.type == TokenType.LIT_WHOLE_NUMBER:
+                     # Continue with next item despite missing period
+                     continue
+                 else:
+                     # No more items
+                     break
+
+         return OrderedListLiteral(token=list_token, elements=items)
+
+     def _parse_named_list_literal(self, list_token: Token) -> NamedListLiteral:
+         """Parse a named list (dictionary with key:value pairs).
+
+         Format:
+             - key1: value1
+             - key2: value2
+
+         Args:
+             list_token: The token marking the start of the list
+
+         Returns:
+             NamedListLiteral with parsed key-value pairs
+         """
+         entries: list[tuple[str, Expression]] = []
+
+         # Parse entries while we have dash-prefixed lines
+         while True:
+             # Check if we're at a dash (named list item marker)
+             if not self._current_token:
+                 break
+             token_type = self._current_token.type
+             if token_type != TokenType.PUNCT_DASH:
+                 break
+
+             # Move past the dash
+             self._advance_tokens()
+
+             # Parse the key (MUST be a string literal)
+             key = ""
+             current_type_after_dash: TokenType | None = self._current_token.type if self._current_token else None
+             if current_type_after_dash == TokenType.LIT_TEXT:
+                 key = self._current_token.literal.strip('"')
+                 self._advance_tokens()
+             else:
+                 # Invalid key - named lists require string literal keys only
+                 if self._current_token:
+                     from machine_dialect.errors.messages import INVALID_NAMED_LIST_KEY
+
+                     self._report_error_and_recover(
+                         template=INVALID_NAMED_LIST_KEY,
+                         error_type="type",
+                         literal=self._current_token.literal,
+                         recovery_tokens=[TokenType.PUNCT_DASH, TokenType.MISC_EOF],
+                     )
+                 else:
+                     self._panic_recovery(stop_at=[TokenType.PUNCT_DASH, TokenType.MISC_EOF])
+                 continue
+
+             # Expect colon
+             current_type_for_colon: TokenType | None = self._current_token.type if self._current_token else None
+             if current_type_for_colon != TokenType.PUNCT_COLON:
+                 # Missing colon, this might be an unordered list item.
+                 # Add error and try to continue.
+                 entries.append(
+                     (key, ErrorExpression(token=self._current_token or list_token, message="Expected colon after key"))
+                 )
+                 self._panic_recovery(stop_at=[TokenType.PUNCT_DASH, TokenType.MISC_EOF])
+                 continue
+
+             self._advance_tokens()  # Move past colon
+
+             # Parse the value expression
+             value = self._parse_expression(Precedence.LOWEST)
+             if not value:
+                 entries.append(
+                     (
+                         key,
+                         ErrorExpression(token=self._current_token or list_token, message="Expected value after colon"),
+                     )
+                 )
+                 self._panic_recovery(stop_at=[TokenType.PUNCT_DASH, TokenType.MISC_EOF])
+                 continue
+
+             # After parsing expression, advance to check for period
+             self._advance_tokens()
+
+             # Each named list entry must end with a period
+             if self._current_token and self._current_token.type == TokenType.PUNCT_PERIOD:
+                 # Good, we have the required period
+                 entries.append((key, value))
+                 # Check if there's another entry
+                 if self._peek_token and self._peek_token.type == TokenType.PUNCT_DASH:
+                     # There's another entry, advance to it
+                     self._advance_tokens()
+                 else:
+                     # No more entries - we're done
+                     break
+             else:
+                 # Missing period after entry - add error but include the entry
+                 from machine_dialect.errors.messages import EXPECTED_TOKEN_AFTER
+
+                 self._report_error_and_recover(
+                     template=EXPECTED_TOKEN_AFTER,
+                     expected_token=TokenType.PUNCT_PERIOD,
+                     skip_recovery=True,  # We'll handle recovery manually below
+                     expected="period",
+                     after="named list entry",
+                 )
+                 entries.append((key, value))
+
+                 # Check if we're at another dash (next entry) or done
+                 if self._current_token and self._current_token.type == TokenType.PUNCT_DASH:
+                     # Continue with next entry despite missing period
+                     continue
+                 else:
+                     # No more entries
+                     break
+
+         return NamedListLiteral(token=list_token, entries=entries)
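# Editor's note (illustrative; not part of the package diff): the three list
# forms the methods above accept, reconstructed from their docstrings
# (variable names and values hypothetical):
#
#     Set `tags` to:
#     - _"red"_.
#     - _"blue"_.
#
#     Set `steps` to:
#     1. _"mix"_.
#     2. _"bake"_.
#
#     Set `person` to:
#     - _"name"_: _"Ada"_.
#     - _"age"_: _36_.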
1075
+
1076
+ def _parse_url_literal(self) -> URLLiteral:
1077
+ """Parse a URL literal.
1078
+
1079
+ Returns:
1080
+ A URLLiteral AST node.
1081
+ """
1082
+ assert self._current_token is not None
1083
+
1084
+ # Extract the actual URL value without quotes (like string literals)
1085
+ literal = self._current_token.literal
1086
+ if literal.startswith('"') and literal.endswith('"'):
1087
+ value = literal[1:-1]
1088
+ elif literal.startswith("'") and literal.endswith("'"):
1089
+ value = literal[1:-1]
1090
+ else:
1091
+ # Fallback if no quotes found
1092
+ value = literal
1093
+
1094
+ return URLLiteral(
1095
+ token=self._current_token,
1096
+ value=value,
1097
+ )
1098
+
1099
+     def _parse_prefix_expression(self) -> PrefixExpression:
+         """Parse a prefix expression.
+
+         Prefix expressions consist of a prefix operator followed by an expression.
+         Examples: -42, not True, --5, not not False
+
+         Returns:
+             A PrefixExpression AST node with its right-hand operand parsed.
+         """
+         assert self._current_token is not None
+
+         # Create the prefix expression with the operator
+         expression = PrefixExpression(
+             token=self._current_token,
+             operator=self._current_token.literal,
+         )
+
+         # Advance past the operator
+         self._advance_tokens()
+
+         # Parse the right-hand expression with appropriate precedence
+         # All unary operators (including 'not') have high precedence
+         expression.right = self._parse_expression(Precedence.UNARY_SIMPLIFIED)
+
+         return expression
+
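+     # Illustrative sketch (editorial comment, not part of the original source):
+     # because the operand is parsed at UNARY_SIMPLIFIED precedence, prefix
+     # operators bind tighter than infix ones and nest naturally:
+     #
+     #     "not not False"  ->  PrefixExpression("not", PrefixExpression("not", False))
+     #     "-5 + 3"         ->  InfixExpression(PrefixExpression("-", 5), "+", 3)
+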
1125
+     def _parse_ordinal_list_access(self) -> Expression:
+         """Parse ordinal list access: '[the] first item of list'.
+
+         Handles both forms:
+         - 'the first item of list' (with optional 'the')
+         - 'first item of list' (without 'the')
+
+         Returns:
+             A CollectionAccessExpression for ordinal access.
+         """
+         assert self._current_token is not None
+
+         # Check if we're starting with an ordinal directly or with 'the' (stopword)
+         if self._current_token.type == TokenType.MISC_STOPWORD and self._current_token.literal.lower() == "the":
+             # Skip optional 'the'
+             self._advance_tokens()
+
+         # Now we should have an ordinal (first, second, third, last)
+         if self._current_token is None or self._current_token.type not in [
+             TokenType.KW_FIRST,
+             TokenType.KW_SECOND,
+             TokenType.KW_THIRD,
+             TokenType.KW_LAST,
+         ]:
+             # Not a valid ordinal access pattern
+             return ErrorExpression(
+                 token=self._current_token or Token(TokenType.MISC_ILLEGAL, "", 0, 0),
+                 message="Not a valid ordinal access pattern",
+             )
+
+         ordinal_token = self._current_token
+         ordinal = self._current_token.literal
+
+         # Skip ordinal
+         self._advance_tokens()
+
+         # Expect 'item'
+         if self._current_token is None or self._current_token.type != TokenType.KW_ITEM:
+             msg = f"Expected 'item' after ordinal, got {self._current_token.type if self._current_token else 'EOF'}"
+             return ErrorExpression(token=self._current_token or ordinal_token, message=msg)
+
+         # Skip 'item'
+         self._advance_tokens()
+
+         # Expect 'of' - check the new current token after advancing
+         current = self._current_token
+         if current is None or current.type != TokenType.KW_OF:
+             msg = f"Expected 'of' after 'item', got {self._current_token.type if self._current_token else 'EOF'}"
+             return ErrorExpression(token=self._current_token or ordinal_token, message=msg)
+
+         # Skip 'of'
+         self._advance_tokens()
+
+         # Parse the collection expression
+         collection = self._parse_expression(Precedence.LOWEST)
+
+         return CollectionAccessExpression(
+             token=ordinal_token, collection=collection, accessor=ordinal, access_type="ordinal"
+         )
+
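+     # Illustrative sketch (editorial comment, not part of the original source):
+     # the phrase "the first item of `scores`" yields, roughly,
+     #
+     #     CollectionAccessExpression(collection=Identifier("scores"),
+     #                                accessor="first", access_type="ordinal")
+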
1185
+     def _parse_stopword_expression(self) -> Expression:
+         """Parse expressions that start with stopwords.
+
+         Handles:
+         - 'the' for list access patterns (the first/second/third/last item of)
+
+         Returns:
+             An appropriate expression, or an ErrorExpression if the stopword
+             does not begin a valid pattern.
+         """
+         assert self._current_token is not None
+
+         # Check if it's 'the' which might start various patterns
+         if self._current_token.literal.lower() == "the":
+             # Look ahead to see if it's followed by an ordinal
+             if self._peek_token and self._peek_token.type in [
+                 TokenType.KW_FIRST,
+                 TokenType.KW_SECOND,
+                 TokenType.KW_THIRD,
+                 TokenType.KW_LAST,
+             ]:
+                 return self._parse_ordinal_list_access()
+
+         # Otherwise, stopwords aren't valid expression starters
+         return ErrorExpression(
+             token=self._current_token,
+             message=f"Unexpected stopword '{self._current_token.literal}' at start of expression",
+         )
+
1213
+     def _parse_dict_extraction_prefix(self) -> Expression:
+         """Parse dictionary extraction as a prefix operator.
+
+         Examples:
+             the names of `person` -> DictExtraction(dictionary=person, extract_type="names")
+             the contents of `config` -> DictExtraction(dictionary=config, extract_type="contents")
+
+         Returns:
+             A DictExtraction expression.
+         """
+         assert self._current_token is not None
+         operator_token = self._current_token
+
+         # Determine extraction type based on operator
+         if operator_token.type == TokenType.OP_THE_NAMES_OF:
+             extract_type = "names"
+         elif operator_token.type == TokenType.OP_THE_CONTENTS_OF:
+             extract_type = "contents"
+         else:
+             msg = f"Unknown dictionary extraction operator: {operator_token.type}"
+             return ErrorExpression(token=operator_token, message=msg)
+
+         # Skip the operator
+         self._advance_tokens()
+
+         # Parse the dictionary expression
+         dictionary = self._parse_expression(Precedence.UNARY_POST_OPERATOR)
+
+         if dictionary is None:
+             msg = "Expected dictionary expression after extraction operator"
+             return ErrorExpression(token=self._current_token or operator_token, message=msg)
+
+         # Import here to avoid circular dependency
+         from machine_dialect.ast.dict_extraction import DictExtraction
+
+         return DictExtraction(token=operator_token, dictionary=dictionary, extract_type=extract_type)
+
1250
+     def _parse_dict_extraction_infix(self, dictionary: Expression) -> Expression:
+         """Parse dictionary extraction as an infix operator.
+
+         Examples:
+             `person` the names of -> DictExtraction(dictionary=person, extract_type="names")
+             `config` the contents of -> DictExtraction(dictionary=config, extract_type="contents")
+
+         Args:
+             dictionary: The dictionary expression to extract from.
+
+         Returns:
+             A DictExtraction expression.
+         """
+         assert self._current_token is not None
+         operator_token = self._current_token
+
+         # Determine extraction type based on operator
+         if operator_token.type == TokenType.OP_THE_NAMES_OF:
+             extract_type = "names"
+         elif operator_token.type == TokenType.OP_THE_CONTENTS_OF:
+             extract_type = "contents"
+         else:
+             msg = f"Unknown dictionary extraction operator: {operator_token.type}"
+             return ErrorExpression(token=operator_token, message=msg)
+
+         # Import here to avoid circular dependency
+         from machine_dialect.ast.dict_extraction import DictExtraction
+
+         return DictExtraction(token=operator_token, dictionary=dictionary, extract_type=extract_type)
+
1280
+     # TODO: Refactor this function to an infix expression
+     def _parse_possessive_access(self) -> Expression:
+         """Parse possessive property access: `person`'s _"name"_.
+
+         When the lexer sees `person`'s, it emits a PUNCT_APOSTROPHE_S token
+         with the identifier as the literal. We then need to parse the property name
+         as a string literal.
+
+         Returns:
+             A CollectionAccessExpression for property access.
+         """
+         assert self._current_token is not None
+         assert self._current_token.type == TokenType.PUNCT_APOSTROPHE_S
+
+         # The literal contains the identifier name (e.g., "person")
+         dict_name = self._current_token.literal
+         token = self._current_token
+
+         # Create an identifier for the dictionary
+         dict_identifier = Identifier(Token(TokenType.MISC_IDENT, dict_name, token.line, token.position), dict_name)
+
+         # Skip the possessive token
+         self._advance_tokens()
+
+         # Now we expect a string literal for the property name
+         # Note: after _advance_tokens(), current_token has changed from PUNCT_APOSTROPHE_S
+         if self._current_token is None or self._current_token.type != TokenType.LIT_TEXT:
+             msg = (
+                 "Expected string literal for property name after possessive, got "
+                 f"{self._current_token.type if self._current_token else 'EOF'}"
+             )
+             return ErrorExpression(token=self._current_token or token, message=msg)
+
+         # Extract the property name from the string literal (remove quotes)
+         property_literal = self._current_token.literal
+         # Remove quotes from the literal
+         if property_literal.startswith('"') and property_literal.endswith('"'):
+             property_name = property_literal[1:-1]
+         elif property_literal.startswith("'") and property_literal.endswith("'"):
+             property_name = property_literal[1:-1]
+         else:
+             property_name = property_literal
+
+         # Note: We do NOT advance past the property name here.
+         # Expression parsers should leave current_token AT the last token of the expression,
+         # not after it. The caller will advance when needed.
+
+         # Create a collection access expression with property access type
+         return CollectionAccessExpression(
+             token=token, collection=dict_identifier, accessor=property_name, access_type="property"
+         )
+
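+     # Illustrative sketch (editorial comment, not part of the original source):
+     # for the input `person`'s _"name"_ the lexer emits
+     # PUNCT_APOSTROPHE_S("person"), and this method produces, roughly,
+     #
+     #     CollectionAccessExpression(collection=Identifier("person"),
+     #                                accessor="name", access_type="property")
+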
1332
+     def _parse_numeric_list_access(self) -> Expression:
+         """Parse numeric list access: 'item _5_ of list'.
+
+         Returns:
+             A CollectionAccessExpression for numeric access.
+         """
+         assert self._current_token is not None
+         assert self._current_token.type == TokenType.KW_ITEM
+
+         item_token = self._current_token
+
+         # Skip 'item'
+         self._advance_tokens()
+
+         # Expect a number literal - check the new current token after advancing
+         current = self._current_token
+         if current is None or current.type != TokenType.LIT_WHOLE_NUMBER:
+             msg = f"Expected number after 'item', got {self._current_token.type if self._current_token else 'EOF'}"
+             return ErrorExpression(token=self._current_token or item_token, message=msg)
+
+         # Get the index (one-based in Machine Dialect™)
+         index = int(self._current_token.literal)
+
+         # Skip number
+         self._advance_tokens()
+
+         # Expect 'of' - check the new current token after advancing
+         current = self._current_token
+         if current is None or current.type != TokenType.KW_OF:
+             msg = f"Expected 'of' after number, got {self._current_token.type if self._current_token else 'EOF'}"
+             return ErrorExpression(token=self._current_token or item_token, message=msg)
+
+         # Skip 'of'
+         self._advance_tokens()
+
+         # Parse the collection expression
+         collection = self._parse_expression(Precedence.LOWEST)
+
+         return CollectionAccessExpression(
+             token=item_token, collection=collection, accessor=index, access_type="numeric"
+         )
+
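+     # Illustrative sketch (editorial comment, not part of the original source):
+     # indices are one-based, so "item _5_ of `queue`" maps to, roughly,
+     #
+     #     CollectionAccessExpression(collection=Identifier("queue"),
+     #                                accessor=5, access_type="numeric")
+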
1374
+     def _parse_infix_expression(self, left: Expression) -> InfixExpression:
+         """Parse an infix expression.
+
+         Infix expressions consist of a left expression, an infix operator, and a
+         right expression. Examples: 5 + 3, x == y, a and b.
+
+         Args:
+             left: The left-hand expression that was already parsed.
+
+         Returns:
+             An InfixExpression AST node.
+         """
+         assert self._current_token is not None
+
+         # Map token type to operator string
+         operator_map = {
+             TokenType.OP_PLUS: "+",
+             TokenType.OP_MINUS: "-",
+             TokenType.OP_STAR: "*",
+             TokenType.OP_DIVISION: "/",
+             TokenType.OP_EQ: "equals",
+             TokenType.OP_NOT_EQ: "is not",
+             TokenType.OP_STRICT_EQ: "is strictly equal to",
+             TokenType.OP_STRICT_NOT_EQ: "is not strictly equal to",
+             TokenType.OP_LT: "<",
+             TokenType.OP_GT: ">",
+             TokenType.OP_LTE: "<=",
+             TokenType.OP_GTE: ">=",
+             TokenType.KW_AND: "and",
+             TokenType.KW_OR: "or",
+         }
+
+         # Get the operator string
+         operator = operator_map.get(self._current_token.type, self._current_token.literal)
+
+         # Create the infix expression with the operator and left operand
+         expression = InfixExpression(
+             token=self._current_token,
+             operator=operator,
+             left=left,
+         )
+
+         # Get the precedence of this operator
+         precedence = self._current_precedence()
+
+         # Advance past the operator
+         self._advance_tokens()
+
+         # Parse the right-hand expression
+         expression.right = self._parse_expression(precedence)
+
+         return expression
+
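+     # Illustrative sketch (editorial comment, not part of the original source):
+     # this is the Pratt-style step where the right operand is parsed at the
+     # operator's own precedence. Assuming the usual table where '*' binds
+     # tighter than '+', "1 + 2 * 3" groups as
+     #
+     #     InfixExpression(1, "+", InfixExpression(2, "*", 3))
+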
1427
+     def _parse_grouped_expression(self) -> Expression:
+         """Parse a grouped expression (expression in parentheses).
+
+         Grouped expressions are expressions wrapped in parentheses, which
+         can be used to override operator precedence.
+
+         Returns:
+             The expression inside the parentheses, or an ErrorExpression if the
+             closing parenthesis is missing.
+         """
+         # Advance past the opening parenthesis
+         self._advance_tokens()
+
+         # Parse the inner expression
+         expression = self._parse_expression(Precedence.LOWEST)
+
+         # Expect closing parenthesis
+         assert self._current_token is not None
+         if error := self._expect_token(
+             TokenType.DELIM_RPAREN,
+             error_node=ErrorExpression(token=self._current_token, message="Expected closing parenthesis"),
+         ):
+             assert isinstance(error, ErrorExpression)
+             return error
+
+         return expression
+
1453
+     def _parse_conditional_expression(self, consequence: Expression) -> ConditionalExpression:
+         """Parse a conditional (ternary) expression.
+
+         Formats supported (the separator before the else clause may be a comma
+         or a semicolon):
+         - consequence if condition, else alternative
+         - consequence if condition, otherwise alternative
+         - consequence when condition, else alternative
+         - consequence when condition, otherwise alternative
+
+         Args:
+             consequence: The expression to return if condition is true.
+
+         Returns:
+             A ConditionalExpression node.
+         """
+         assert self._current_token is not None
+         # Create the conditional expression with the consequence
+         expression = ConditionalExpression(token=self._current_token, consequence=consequence)
+
+         # Move past 'if' or 'when'
+         self._advance_tokens()
+
+         # Parse the condition with TERNARY precedence to stop at comma
+         expression.condition = self._parse_expression(Precedence.TERNARY)
+
+         # After parsing the condition, we need to advance to the next token
+         # _parse_expression leaves us at the last token of the parsed expression
+         self._advance_tokens()
+
+         # Check for comma or semicolon before 'else'/'otherwise'
+         if self._current_token and self._current_token.type in (TokenType.PUNCT_COMMA, TokenType.PUNCT_SEMICOLON):
+             self._advance_tokens()  # Move past comma/semicolon
+
+         # Expect 'else' or 'otherwise' (both map to KW_ELSE)
+         if not self._current_token or self._current_token.type != TokenType.KW_ELSE:
+             return expression  # Return incomplete expression if no else clause
+
+         # Move past 'else' or 'otherwise'
+         self._advance_tokens()
+
+         # Parse the alternative expression
+         expression.alternative = self._parse_expression(Precedence.LOWEST)
+
+         return expression
+
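+     # Illustrative sketch (editorial comment, not part of the original source):
+     # for "_1_ if `flag`, else _2_" the resulting node is, roughly,
+     #
+     #     ConditionalExpression(consequence=1, condition=Identifier("flag"),
+     #                           alternative=2)
+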
1506
+     def _parse_define_statement(self) -> DefineStatement | ErrorStatement:
+         """Parse a Define statement.
+
+         Grammar:
+             define_statement ::= "Define" identifier "as" type_spec
+                                  ["(" "default" ":" expression ")"] "."
+
+         Examples:
+             Define `x` as Whole Number.
+             Define `name` as Text (default: _"Unknown"_).
+             Define `value` as Whole Number or Text.
+
+         Returns:
+             DefineStatement on success, ErrorStatement on parse error.
+         """
+         statement_token = self._current_token
+         assert statement_token is not None
+
+         # Move past "Define" to get to the identifier
+         self._advance_tokens()
+
+         # Check if we have an identifier
+         if not self._current_token or self._current_token.type != TokenType.MISC_IDENT:
+             # Report error and get recovery result
+             error_stmt = self._report_error_and_recover(
+                 template=EXPECTED_IDENTIFIER_AFTER,
+                 expected_token=TokenType.MISC_IDENT,
+                 what="variable",
+                 after="'Define'",
+                 recovery_tokens=[TokenType.KW_AS, TokenType.PUNCT_PERIOD, TokenType.MISC_EOF],
+             )
+
+             # Try to continue parsing if we recovered at 'as' keyword
+             if self._current_token and self._current_token.type == TokenType.KW_AS:
+                 # Found 'as', try to continue parsing from here
+                 name = Identifier(statement_token, "<error>")  # Placeholder name
+                 self._advance_tokens()  # Skip 'as'
+                 type_spec = self._parse_type_spec()
+                 if type_spec:
+                     return DefineStatement(statement_token, name, type_spec, None)
+
+             assert isinstance(error_stmt, ErrorStatement)
+             return error_stmt
+
+         # Parse the identifier
+         name = self._parse_identifier()
+
+         # Move past the identifier
+         self._advance_tokens()
+
+         # Skip any stopwords between identifier and "as"
+         while self._current_token and self._current_token.type == TokenType.MISC_STOPWORD:
+             self._advance_tokens()
+
+         # Expect "as" keyword - we should be at "as" now
+         # Re-check current_token to help MyPy's type narrowing
+         if self._current_token is None or self._current_token.type != TokenType.KW_AS:
+             # Report error with recovery to type keywords
+             error_stmt = self._report_error_and_recover(
+                 template=EXPECTED_TOKEN_AFTER,
+                 expected_token=TokenType.KW_AS,
+                 expected="'as'",
+                 after=f"variable name '{name.value}'",
+                 recovery_to_types=True,
+             )
+
+             # If we recovered at a type keyword, try to continue parsing
+             if self._current_token and self._is_type_token(self._current_token.type):
+                 # Found a type, try to continue parsing
+                 type_spec = self._parse_type_spec()
+                 if type_spec:
+                     # Still register the variable even with syntax error
+                     self._register_variable_definition(
+                         name.value, type_spec, statement_token.line, statement_token.position
+                     )
+                     return DefineStatement(statement_token, name, type_spec, None)
+
+             # Need additional recovery if we didn't find a type
+             if not isinstance(error_stmt, ErrorStatement):
+                 # This shouldn't happen, but handle it just in case
+                 skipped = self._panic_recovery()
+                 return ErrorStatement(
+                     token=statement_token,
+                     skipped_tokens=skipped,
+                     message=f"Expected 'as' after variable name '{name.value}'",
+                 )
+             return error_stmt
+
+         # Move past "as"
+         self._advance_tokens()
+
+         # Parse type specification
+         type_spec = self._parse_type_spec()
+         if not type_spec:
+             # Get the invalid type name token
+             invalid_name = self._current_token.literal if self._current_token else "unknown"
+             # Generate valid types list from TYPING_MAP
+             valid_types = list(TYPING_MAP.values())
+             return self._report_error_and_recover(
+                 template=INVALID_TYPE_NAME,
+                 name=invalid_name,
+                 valid_types=", ".join(valid_types),
+                 expected_token=TokenType.KW_TEXT,  # Use TEXT as representative type
+                 expected="type name",
+                 after="'as'",
+             )
+
+         # Optional: (default: value) clause
+         initial_value = None
+         if self._current_token and self._current_token.type == TokenType.DELIM_LPAREN:
+             self._advance_tokens()  # Move past "("
+
+             # Expect "default" - we should be at "default" now
+             if not self._current_token or self._current_token.type != TokenType.KW_DEFAULT:
+                 # Report error and handle recovery to closing paren
+                 error_stmt = self._report_error_and_recover(
+                     template=EXPECTED_TOKEN_AFTER,
+                     expected_token=TokenType.KW_DEFAULT,
+                     expected="'default'",
+                     after="'('",
+                     recovery_tokens=[TokenType.DELIM_RPAREN, TokenType.PUNCT_PERIOD, TokenType.MISC_EOF],
+                 )
+                 # If we found the closing paren, advance past it
+                 if self._current_token and self._current_token.type == TokenType.DELIM_RPAREN:
+                     self._advance_tokens()
+                 return error_stmt
+
+             # Move past "default"
+             self._advance_tokens()
+
+             # Expect ":" - we should be at ":"
+             if not self._current_token or self._current_token.type != TokenType.PUNCT_COLON:
+                 # Report error and handle recovery to closing paren
+                 error_stmt = self._report_error_and_recover(
+                     template=EXPECTED_TOKEN_AFTER,
+                     expected_token=TokenType.PUNCT_COLON,
+                     expected="':'",
+                     after="'default'",
+                     recovery_tokens=[TokenType.DELIM_RPAREN, TokenType.PUNCT_PERIOD, TokenType.MISC_EOF],
+                 )
+                 # If we found the closing paren, advance past it
+                 if self._current_token and self._current_token.type == TokenType.DELIM_RPAREN:
+                     self._advance_tokens()
+                 return error_stmt
+
+             # Move past ":"
+             self._advance_tokens()
+
+             # Parse the default value expression
+             initial_value = self._parse_expression(Precedence.LOWEST)
+
+             # Expect ")" - check if we're at the closing paren
+             if self._peek_token and self._peek_token.type != TokenType.DELIM_RPAREN:
+                 # Report error but don't return - continue to create the statement
+                 self._report_error_and_recover(
+                     template=EXPECTED_TOKEN_AFTER,
+                     expected_token=TokenType.DELIM_RPAREN,
+                     expected="')'",
+                     after="default value",
+                     skip_recovery=True,  # Don't recover, continue processing
+                 )
+             elif self._peek_token:
+                 self._advance_tokens()  # Move to ")"
+                 self._advance_tokens()  # Skip ")"
+
+         # Check for period at statement end (optional for now)
+         if self._peek_token and self._peek_token.type == TokenType.PUNCT_PERIOD:
+             self._advance_tokens()  # Move to period
+
+         # Register the variable definition in the symbol table
+         self._register_variable_definition(name.value, type_spec, statement_token.line, statement_token.position)
+
+         return DefineStatement(statement_token, name, type_spec, initial_value)
+
1680
+     def _parse_type_spec(self) -> list[str]:
+         """Parse type specification, handling union types.
+
+         Grammar:
+             type_spec ::= type_name ["or" type_name]*
+             type_name ::= "Text" | "Whole Number" | "Float" | "Number" | "Yes/No"
+                         | "URL" | "Date" | "DateTime" | "Time" | "List" | "Empty"
+
+         Examples:
+             Whole Number -> ["Whole Number"]
+             Whole Number or Text -> ["Whole Number", "Text"]
+             Number or Yes/No or Empty -> ["Number", "Yes/No", "Empty"]
+
+         Returns:
+             List of type names, empty list if no valid type found.
+         """
+         types = []
+
+         # Parse first type
+         type_name = self._parse_type_name()
+         if type_name:
+             types.append(type_name)
+         else:
+             return types  # Return empty list if no type found
+
+         # Parse additional types with "or" (for union types)
+         while self._current_token and self._current_token.type == TokenType.KW_OR:
+             self._advance_tokens()  # Skip "or"
+
+             type_name = self._parse_type_name()
+             if type_name:
+                 types.append(type_name)
+             else:
+                 # If we don't find a type after "or", that's an error
+                 self._report_error_and_recover(
+                     template=EXPECTED_TOKEN_AFTER,
+                     expected_token=TokenType.KW_TEXT,  # Use TEXT as representative
+                     expected="type name",
+                     after="'or'",
+                     skip_recovery=True,  # Continue with what we have
+                 )
+                 break
+
+         return types
+
1725
+     def _parse_type_name(self) -> str | None:
+         """Parse a single type name.
+
+         Only handles keyword-based types as specified in the grammar.
+
+         Returns:
+             The type name as a string, or None if the current token is not a type.
+         """
+         if not self._current_token:
+             return None
+
+         if self._current_token.type in TYPING_MAP:
+             type_name = TYPING_MAP[self._current_token.type]
+             self._advance_tokens()
+             return type_name
+
+         return None
+
1743
+     def _parse_set_statement(self) -> SetStatement | ErrorStatement | Statement:
+         """Parse a Set statement.
+
+         Expects:
+         - Set `identifier` to expression
+         - Set the second item of `list` to expression
+         - Set item _5_ of `list` to expression
+
+         Returns:
+             A SetStatement AST node if successful, ErrorStatement if parsing fails.
+         """
+         from machine_dialect.ast.statements import CollectionMutationStatement
+
+         assert self._current_token is not None
+         statement_token = self._current_token  # Save the 'Set' token
+
+         # Check for collection item assignment patterns
+         # We need to handle both "Set the first item of" and "Set item _1_ of"
+         if self._peek_token:
+             # Since stopwords are auto-skipped, after "Set" we might be directly at
+             # "first"/"second" etc. if the user wrote "Set the first...", because "the" gets skipped
+             if self._peek_token.type in (
+                 TokenType.KW_FIRST,
+                 TokenType.KW_SECOND,
+                 TokenType.KW_THIRD,
+                 TokenType.KW_LAST,
+             ):
+                 # Pattern: Set [the] [ordinal] item of `list` to value
+                 # "the" was already skipped if present
+                 self._advance_tokens()  # Move to ordinal
+
+                 # We're now at the ordinal
+                 if not self._current_token or self._current_token.type not in (
+                     TokenType.KW_FIRST,
+                     TokenType.KW_SECOND,
+                     TokenType.KW_THIRD,
+                     TokenType.KW_LAST,
+                 ):
+                     return ErrorStatement(
+                         token=statement_token, message="Expected ordinal (first, second, third, last)"
+                     )
+
+                 ordinal = self._current_token.literal.lower()
+                 self._advance_tokens()  # Move past ordinal
+
+                 # Expect "item"
+                 if self._current_token and self._current_token.type == TokenType.KW_ITEM:
+                     self._advance_tokens()  # Move past 'item'
+
+                 # Expect "of"
+                 if self._current_token and self._current_token.type == TokenType.KW_OF:
+                     self._advance_tokens()  # Move past 'of'
+
+                 # Parse collection identifier
+                 collection = self._parse_identifier_or_keyword_as_identifier()
+                 if not collection:
+                     error_stmt = self._report_error_and_recover(
+                         template=EXPECTED_TOKEN,
+                         expected_token=TokenType.MISC_IDENT,
+                         token="collection identifier",
+                         got_token_type=self._current_token.type.name if self._current_token else "EOF",
+                     )
+                     assert isinstance(error_stmt, ErrorStatement)
+                     return error_stmt
+                 self._advance_tokens()
+
+                 # Expect "to"
+                 if self._current_token and self._current_token.type == TokenType.KW_TO:
+                     self._advance_tokens()
+
+                 # Parse the value
+                 value = self._parse_expression(Precedence.LOWEST)
+                 self._advance_tokens()
+
+                 # Expect period
+                 if self._current_token and self._current_token.type != TokenType.PUNCT_PERIOD:
+                     if error := self._expect_token(TokenType.PUNCT_PERIOD):
+                         assert isinstance(error, ErrorStatement)
+                         return error
+
+                 return CollectionMutationStatement(
+                     token=statement_token,
+                     operation="set",
+                     collection=collection,
+                     value=value,
+                     position=ordinal,
+                     position_type="ordinal",
+                 )
+
+             elif self._peek_token.type == TokenType.KW_ITEM:
+                 # Pattern: Set item _5_ of `list` to value
+                 self._advance_tokens()  # Move to 'item'
+                 self._advance_tokens()  # Move past 'item'
+
+                 # Parse numeric index
+                 index = self._parse_expression(Precedence.LOWEST)
+                 self._advance_tokens()
+
+                 # Expect "of"
+                 if self._current_token and self._current_token.type == TokenType.KW_OF:
+                     self._advance_tokens()
+
+                 # Parse collection identifier
+                 collection = self._parse_identifier_or_keyword_as_identifier()
+                 if not collection:
+                     error_stmt = self._report_error_and_recover(
+                         template=EXPECTED_TOKEN,
+                         expected_token=TokenType.MISC_IDENT,
+                         token="collection identifier",
+                         got_token_type=self._current_token.type.name if self._current_token else "EOF",
+                     )
+                     assert isinstance(error_stmt, ErrorStatement)
+                     return error_stmt
+                 self._advance_tokens()
+
+                 # Expect "to"
+                 if self._current_token and self._current_token.type == TokenType.KW_TO:
+                     self._advance_tokens()
+
+                 # Parse the value
+                 value = self._parse_expression(Precedence.LOWEST)
+                 self._advance_tokens()
+
+                 # Expect period
+                 if self._current_token and self._current_token.type != TokenType.PUNCT_PERIOD:
+                     if error := self._expect_token(TokenType.PUNCT_PERIOD):
+                         assert isinstance(error, ErrorStatement)
+                         return error
+
+                 return CollectionMutationStatement(
+                     token=statement_token,
+                     operation="set",
+                     collection=collection,
+                     value=value,
+                     position=index,
+                     position_type="numeric",
+                 )
+
+         # Normal Set statement: Set `identifier` to expression
+         set_statement = SetStatement(token=statement_token)
+
+         # Expect identifier (which may have come from backticks)
+         if error := self._expect_token(TokenType.MISC_IDENT, "'Set'"):
+             assert isinstance(error, ErrorStatement)
+             return error
+
+         # Use the identifier value directly (backticks already stripped by lexer)
+         set_statement.name = self._parse_identifier()
+
+         # Variables MUST be defined before use - no exceptions
+         variable_defined = self._check_variable_defined(
+             set_statement.name.value, set_statement.name.token.line, set_statement.name.token.position
+         )
+
+         # Check for "to" or "using" keyword
+         assert self._peek_token is not None
+         skip_advance_and_period = False  # Set when a branch already leaves us positioned correctly
+         if self._peek_token.type == TokenType.KW_TO:
+             # Standard assignment: Set x to value
+             self._advance_tokens()  # Move to 'to'
+
+             # Check if this is a list definition (colon after 'to')
+             # After advancing, peek_token is now the next token
+             next_token_type: TokenType | None = self._peek_token.type if self._peek_token else None
+             if next_token_type == TokenType.PUNCT_COLON:
+                 # This will be a list - set context NOW before advancing
+                 # This ensures the dash tokens after the colon are properly tokenized
+                 if self._token_buffer:
+                     self._token_buffer.set_list_context(True)
+
+                 self._advance_tokens()  # Move past 'to' to the colon
+
+                 # Advance past the colon to get to the first list item
+                 self._advance_tokens()
+
+                 # Parse the list - current token should now be the first list item marker
+                 set_statement.value = self._parse_list_literal()
+
+                 # Disable list context after parsing
+                 if self._token_buffer:
+                     self._token_buffer.set_list_context(False)
+
+                 # After parsing a list, we're already properly positioned
+                 # (either at EOF, a period, or the next statement),
+                 # so skip the advance and period check below
+                 skip_advance_and_period = True
+             else:
+                 # Not a list, advance past 'to' and parse expression normally
+                 self._advance_tokens()  # Move past 'to'
+
+                 # Check for "blank" keyword for empty collections
+                 if self._current_token and self._current_token.type == TokenType.KW_BLANK:
+                     from machine_dialect.ast import BlankLiteral
+
+                     set_statement.value = BlankLiteral(self._current_token)
+                     # Don't advance here - let the normal flow handle it
+                 else:
+                     # Parse the value expression normally
+                     set_statement.value = self._parse_expression()
+
+         elif self._peek_token.type == TokenType.KW_USING:
+             # Function call assignment: Set x using function_name
+             self._advance_tokens()  # Move to 'using'
+             self._advance_tokens()  # Move past 'using'
+             # Parse a function call (similar to Use statement but returns the value)
+             func_call = self._parse_function_call_expression()
+             # CallExpression is an Expression, so this is valid
+             set_statement.value = func_call
+             # Note: _parse_function_call_expression already leaves us at the period,
+             # so we'll skip the advance_tokens() call below for this branch
+             skip_advance_and_period = True
+         else:
+             # Report the error using unified error handling
+             assert self._peek_token is not None
+             error_stmt = self._report_error_and_recover(
+                 template=EXPECTED_TOKEN,
+                 expected_token=TokenType.KW_TO,
+                 token="'to' or 'using' keyword",
+                 got_token_type=self._peek_token.type.name if self._peek_token else "EOF",
+             )
+             assert isinstance(error_stmt, ErrorStatement)
+             return error_stmt
+
+         # Advance past the last token of the expression
+         # Expression parsing leaves us at the last token, not after it
+         # BUT: some branches above already leave us at the period, so skip this
+         if not skip_advance_and_period:
+             self._advance_tokens()
+
+         # Type-check the assignment if the variable is defined
+         if variable_defined and set_statement.value and not isinstance(set_statement.value, ErrorExpression):
+             self._validate_assignment_type(
+                 set_statement.name.value,
+                 set_statement.value,
+                 set_statement.name.token.line,
+                 set_statement.name.token.position,
+             )
+
+         # If the expression failed, skip to synchronization point
+         if isinstance(set_statement.value, ErrorExpression):
+             # Skip remaining tokens until we're at a period or EOF
+             while self._current_token is not None and self._current_token.type not in (
+                 TokenType.PUNCT_PERIOD,
+                 TokenType.MISC_EOF,
+             ):
+                 self._advance_tokens()
+
+         # Require trailing period if not at EOF or if we're in a block
+         # But if we're already at a period (after error recovery), don't expect another
+         assert self._peek_token is not None
+         if self._current_token and self._current_token.type == TokenType.PUNCT_PERIOD:
+             # Already at period, no need to expect one
+             pass
+         elif self._peek_token.type != TokenType.MISC_EOF or self._block_depth > 0:
+             if error := self._expect_token(TokenType.PUNCT_PERIOD):
+                 assert isinstance(error, ErrorStatement)
+                 return error
+
+         return set_statement
+
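+     # Illustrative sketch (editorial comment, not part of the original source)
+     # of the three shapes this method can return:
+     #
+     #     Set `x` to _5_.                      -> SetStatement(name=x, value=5)
+     #     Set the second item of `xs` to _5_.  -> CollectionMutationStatement(operation="set", position="second")
+     #     Set `x` using `make_x` with _1_.     -> SetStatement(name=x, value=CallExpression(...))
+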
2003
+     def _parse_return_statement(self) -> ReturnStatement | ErrorStatement:
+         """Parse a return statement.
+
+         Expects: give back expression or gives back expression
+
+         Returns:
+             A ReturnStatement AST node.
+         """
+         assert self._current_token is not None
+         return_statement = ReturnStatement(token=self._current_token)
+
+         # Advance past "give back" or "gives back"
+         self._advance_tokens()
+
+         # Parse the return value expression
+         return_statement.return_value = self._parse_expression()
+
+         # Advance past the last token of the expression
+         # Expression parsing leaves us at the last token, not after it
+         self._advance_tokens()
+
+         # If the expression failed, don't require a period since we're already in error recovery
+         if not isinstance(return_statement.return_value, ErrorExpression):
+             # Require trailing period if not at EOF or if we're in a block
+             # But if we're already at a period (after error recovery), don't expect another
+             assert self._peek_token is not None
+             if self._current_token and self._current_token.type == TokenType.PUNCT_PERIOD:
+                 # Already at period, no need to expect one
+                 pass
+             elif self._peek_token.type != TokenType.MISC_EOF or self._block_depth > 0:
+                 if error := self._expect_token(TokenType.PUNCT_PERIOD):
+                     assert isinstance(error, ErrorStatement)
+                     return error
+
+         return return_statement
+
2039
+     def _parse_say_statement(self) -> SayStatement | ErrorStatement:
+         """Parse a Say or Tell statement.
+
+         Syntax: Say <expression>. or Tell <expression>.
+
+         Returns:
+             A SayStatement AST node.
+         """
+         assert self._current_token is not None
+         assert self._current_token.type in (TokenType.KW_SAY, TokenType.KW_TELL)
+
+         statement_token = self._current_token
+
+         # Move past 'Say' or 'Tell'
+         self._advance_tokens()
+
+         # Parse the expression to output
+         expression = self._parse_expression(Precedence.LOWEST)
+
+         # Create the Say statement
+         say_statement = SayStatement(statement_token, expression)
+
+         # Expect a period at the end
+         if self._peek_token and self._peek_token.type == TokenType.PUNCT_PERIOD:
+             self._advance_tokens()
+         # But if we're already at a period (after error recovery), don't expect another
+         elif self._current_token and self._current_token.type != TokenType.PUNCT_PERIOD:
+             if error := self._expect_token(TokenType.PUNCT_PERIOD):
+                 assert isinstance(error, ErrorStatement)
+                 return error
+
+         return say_statement
+
2072
+     def _parse_collection_mutation_statement(self) -> Statement:
+         """Parse a collection mutation statement.
+
+         Handles:
+         - Add _value_ to `list`.
+         - Remove _value_ from `list`.
+         - Insert _value_ at position _3_ in `list`.
+         - Clear `list`.
+         - Update "key" in `dict` to _value_.
+
+         Note: "Set the second item of `list` to _value_." and
+         "Set item _5_ of `list` to _value_." are handled by _parse_set_statement,
+         which emits the corresponding CollectionMutationStatement.
+
+         Returns:
+             A CollectionMutationStatement AST node.
+         """
+         from machine_dialect.ast.statements import CollectionMutationStatement
+
+         assert self._current_token is not None
+         start_token = self._current_token
+         operation = start_token.literal.lower()
+
+         # Move past the operation keyword
+         self._advance_tokens()
+
+         if operation == "add":
+             # Two syntaxes:
+             # 1. Add _value_ to `list`. (for arrays)
+             # 2. Add "key" to `dict` with value _value_. (for named lists)
+
+             # Parse the first value/key
+             first_value = self._parse_expression(Precedence.LOWEST)
+             self._advance_tokens()
+
+             # Skip "to"
+             if self._current_token and self._current_token.type == TokenType.KW_TO:
+                 self._advance_tokens()
+
+             # Parse the collection
+             collection = self._parse_identifier_or_keyword_as_identifier()
+             if not collection:
+                 error_stmt = self._report_error_and_recover(
+                     template=EXPECTED_IDENTIFIER_AFTER,
+                     expected_token=TokenType.MISC_IDENT,
+                     what="collection",
+                     after="'Add ... to'",
+                 )
+                 assert isinstance(error_stmt, ErrorStatement)
+                 return error_stmt
+             self._advance_tokens()
+
+             # Check if this is dictionary syntax (with value)
+             current_token_type = self._current_token.type if self._current_token else None
+             if current_token_type == TokenType.KW_WITH:
+                 self._advance_tokens()
+
+                 # Skip "value" if present
+                 if self._current_token and self._current_token.type == TokenType.KW_VALUE:
+                     self._advance_tokens()
+
+                 # Parse the actual value
+                 dict_value = self._parse_expression(Precedence.LOWEST)
+                 self._advance_tokens()
+
+                 # Expect period
+                 if self._current_token and self._current_token.type != TokenType.PUNCT_PERIOD:
+                     if error := self._expect_token(TokenType.PUNCT_PERIOD):
+                         assert isinstance(error, ErrorStatement)
+                         return error
+
+                 return CollectionMutationStatement(
+                     token=start_token,
+                     operation="add",
+                     collection=collection,
+                     value=dict_value,
+                     position=first_value,  # The key
+                     position_type="key",
+                 )
+             else:
+                 # Regular array syntax
+                 # Expect period
+                 if self._current_token and self._current_token.type != TokenType.PUNCT_PERIOD:
+                     if error := self._expect_token(TokenType.PUNCT_PERIOD):
+                         assert isinstance(error, ErrorStatement)
+                         return error
+
+                 return CollectionMutationStatement(
+                     token=start_token,
+                     operation="add",
+                     collection=collection,
+                     value=first_value,
+                 )
+
+         elif operation == "remove":
+             # Two syntaxes:
+             # 1. Remove _value_ from `list`. (for arrays - removes by value)
+             # 2. Remove "key" from `dict`. (for named lists - removes by key)
+             # Note: The semantic analyzer will determine which one based on collection type
+
+             value = self._parse_expression(Precedence.LOWEST)
+             self._advance_tokens()
+
+             # Skip "from"
+             if self._current_token and self._current_token.type == TokenType.KW_FROM:
+                 self._advance_tokens()
+
+             # Parse the collection
+             collection = self._parse_identifier_or_keyword_as_identifier()
+             if not collection:
+                 return ErrorStatement(
+                     token=start_token, message="Expected collection identifier after 'Remove ... from'"
+                 )
+             self._advance_tokens()
+
+             # Expect period
+             if self._current_token and self._current_token.type != TokenType.PUNCT_PERIOD:
+                 if error := self._expect_token(TokenType.PUNCT_PERIOD):
+                     assert isinstance(error, ErrorStatement)
+                     return error
+
+             return CollectionMutationStatement(
+                 token=start_token,
+                 operation="remove",
+                 collection=collection,
+                 value=value,
+             )
+
+         elif operation == "insert":
+             # Insert _value_ at position _3_ in `list`.
+             value = self._parse_expression(Precedence.LOWEST)
+             self._advance_tokens()
+
+             # Skip "at"
+             if self._current_token and self._current_token.literal and self._current_token.literal.lower() == "at":
+                 self._advance_tokens()
+
+             # Skip "position" if present
+             if self._current_token and self._current_token.literal.lower() == "position":
+                 self._advance_tokens()
+
+             # Parse the position (should be a number)
+             position = self._parse_expression(Precedence.LOWEST)
+             self._advance_tokens()
+
+             # Skip "in"
+             if self._current_token and self._current_token.type == TokenType.KW_IN:
+                 self._advance_tokens()
+
+             # Parse the collection
+             collection = self._parse_identifier_or_keyword_as_identifier()
+             if not collection:
+                 return ErrorStatement(
+                     token=start_token, message="Expected collection identifier after 'Insert ... at position ... in'"
+                 )
+             self._advance_tokens()
+
+             # Expect period
+             if self._current_token and self._current_token.type != TokenType.PUNCT_PERIOD:
+                 if error := self._expect_token(TokenType.PUNCT_PERIOD):
+                     assert isinstance(error, ErrorStatement)
+                     return error
+
+             return CollectionMutationStatement(
+                 token=start_token,
+                 operation="insert",
+                 collection=collection,
+                 value=value,
+                 position=position,
+                 position_type="numeric",
+             )
+
+         elif operation == "clear":
+             # Clear `collection`.
+             # Parse the collection identifier
+             collection = self._parse_identifier_or_keyword_as_identifier()
+             if not collection:
+                 error_stmt = self._report_error_and_recover(
+                     template=EXPECTED_IDENTIFIER_AFTER,
+                     expected_token=TokenType.MISC_IDENT,
+                     what="collection",
+                     after="'Clear'",
+                 )
+                 assert isinstance(error_stmt, ErrorStatement)
+                 return error_stmt
+
+             # Advance past the identifier to check for period
+             self._advance_tokens()
+
+             # Expect period
+             if self._current_token and self._current_token.type != TokenType.PUNCT_PERIOD:
+                 if error := self._expect_token(TokenType.PUNCT_PERIOD):
+                     assert isinstance(error, ErrorStatement)
+                     return error
+
+             return CollectionMutationStatement(
+                 token=start_token,
+                 operation="clear",
+                 collection=collection,
+             )
+
+         elif operation == "update":
+             # Update "key" in `dict` to _value_.
+             # Parse the key (should be a string literal)
+             key = self._parse_expression(Precedence.LOWEST)
+             self._advance_tokens()
+
+             # Skip "in"
+             if self._current_token and self._current_token.type == TokenType.KW_IN:
+                 self._advance_tokens()
+
+             # Parse the collection
+             collection = self._parse_identifier_or_keyword_as_identifier()
+             if not collection:
+                 error_stmt = self._report_error_and_recover(
+                     template=EXPECTED_IDENTIFIER_AFTER,
+                     expected_token=TokenType.MISC_IDENT,
+                     what="collection",
+                     after="'Update ... in'",
+                 )
+                 assert isinstance(error_stmt, ErrorStatement)
+                 return error_stmt
+             self._advance_tokens()
+
+             # Skip "to"
+             if self._current_token and self._current_token.type == TokenType.KW_TO:
+                 self._advance_tokens()
+
+             # Parse the value
+             value = self._parse_expression(Precedence.LOWEST)
+             self._advance_tokens()
+
+             # Expect period
+             if self._current_token and self._current_token.type != TokenType.PUNCT_PERIOD:
+                 if error := self._expect_token(TokenType.PUNCT_PERIOD):
+                     assert isinstance(error, ErrorStatement)
+                     return error
+
+             return CollectionMutationStatement(
+                 token=start_token,
+                 operation="update",
+                 collection=collection,
+                 value=value,
+                 position=key,  # Using position field to store the key
+                 position_type="key",
+             )
+
+         # Should not reach here
+         error_stmt = self._report_error_and_recover(
+             template=UNHANDLED_OPERATION, what="collection mutation", operation=operation
+         )
+         assert isinstance(error_stmt, ErrorStatement)
+         return error_stmt
+
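+     # Illustrative sketch (editorial comment, not part of the original source):
+     # dictionary operations reuse the `position` field for the key, e.g.
+     #
+     #     Update "port" in `config` to _8080_.
+     #       -> CollectionMutationStatement(operation="update", collection=config,
+     #                                      value=8080, position="port", position_type="key")
+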
2323
+     def _parse_function_call_expression(self) -> Expression:
+         """Parse a function call as an expression (for use with 'using' in Set statements).
+
+         Syntax: function_name [with <arguments>] or function_name [where <named arguments>].
+
+         Returns:
+             A CallExpression AST node that will be evaluated as an expression.
+         """
+         assert self._current_token is not None
+
+         # Parse the function name (must be an identifier in backticks)
+         if self._current_token and self._current_token.type == TokenType.MISC_IDENT:
+             function_name = Identifier(self._current_token, self._current_token.literal)
+             call_token = self._current_token
+             self._advance_tokens()
+         else:
+             # Error: expected identifier for function name
+             result = self._report_error_and_recover(
+                 template=EXPECTED_FUNCTION_NAME,
+                 token_type=str(self._current_token.type) if self._current_token else "EOF",
+                 skip_recovery=True,
+                 is_expression=True,
+             )
+             assert isinstance(result, ErrorExpression)
+             return result
+
+         # Create the CallExpression
+         call_expression = CallExpression(token=call_token, function_name=function_name)
+
+         # Check for arguments
+         if self._current_token and self._current_token.type == TokenType.KW_WITH:
+             # Positional arguments
+             with_token = self._current_token
+             self._advance_tokens()
+             call_expression.arguments = self._parse_positional_arguments(with_token)
+         elif self._current_token and self._current_token.type == TokenType.KW_WHERE:
+             # Named arguments
+             where_token = self._current_token
+             self._advance_tokens()
+             call_expression.arguments = self._parse_named_arguments(where_token)
+
+         return call_expression
+
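+     # Illustrative sketch (editorial comment, not part of the original source):
+     #
+     #     Set `total` using `sum_prices` with `cart`, _0.2_.
+     #       -> SetStatement(value=CallExpression(function_name=sum_prices,
+     #                                            arguments=Arguments(positional=[cart, 0.2])))
+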
2366
+     def _parse_call_statement(self) -> CallStatement | ErrorStatement:
+         """Parse a Use statement.
+
+         Syntax: use <function> [with <arguments>] or use <function> [where <named arguments>].
+
+         Returns:
+             A CallStatement AST node.
+         """
+         assert self._current_token is not None
+         assert self._current_token.type == TokenType.KW_USE
+
+         statement_token = self._current_token
+
+         # Move past 'use'
+         self._advance_tokens()
+
+         # Parse the function name (must be an identifier in backticks)
+         if self._current_token and self._current_token.type == TokenType.MISC_IDENT:
+             function_name = Identifier(self._current_token, self._current_token.literal)
+             self._advance_tokens()
+         else:
+             # Record error for missing or invalid function name
+             error_token = self._current_token or Token(TokenType.MISC_EOF, "", 0, 0)
+             self._report_error_and_recover(
+                 template=EXPECTED_FUNCTION_NAME,
+                 token_type=str(error_token.type),
+                 skip_recovery=True,  # Continue parsing to find more errors
+             )
+             function_name = None
+
+         # Check for 'with' or 'where' keyword for arguments
+         arguments: Arguments | None = None
+         if self._current_token and self._current_token.type == TokenType.KW_WITH:
+             # 'with' is for positional arguments
+             with_token = self._current_token
+             self._advance_tokens()  # Move past 'with'
+
+             # Parse positional arguments
+             arguments = self._parse_positional_arguments(with_token)
+
+         elif self._current_token and self._current_token.type == TokenType.KW_WHERE:
+             # 'where' is for named arguments
+             where_token = self._current_token
+             self._advance_tokens()  # Move past 'where'
+
+             # Parse named arguments
+             arguments = self._parse_named_arguments(where_token)
+
+         # Create the Call statement
+         call_statement = CallStatement(statement_token, function_name, arguments)
+
+         # Expect a period at the end
+         if self._peek_token and self._peek_token.type == TokenType.PUNCT_PERIOD:
+             self._advance_tokens()
+         # But if we're already at a period (after error recovery), don't expect another
+         elif self._current_token and self._current_token.type != TokenType.PUNCT_PERIOD:
+             if error := self._expect_token(TokenType.PUNCT_PERIOD):
+                 assert isinstance(error, ErrorStatement)
+                 return error
+
+         return call_statement
+
2428
+     def _parse_argument_value(self) -> Expression | None:
+         """Parse a single argument value (literal or identifier).
+
+         Returns:
+             The parsed expression or None if invalid.
+         """
+         if not self._current_token:
+             return None
+
+         token = self._current_token
+
+         # Parse based on token type
+         value: Expression | None = None
+         if token.type == TokenType.MISC_IDENT:
+             # Identifier
+             value = Identifier(token, token.literal)
+             self._advance_tokens()
+             return value
+         elif token.type == TokenType.LIT_WHOLE_NUMBER:
+             # Integer literal
+             int_value = self._parse_integer_literal()
+             self._advance_tokens()
+             return int_value
+         elif token.type == TokenType.LIT_FLOAT:
+             # Float literal
+             float_value = self._parse_float_literal()
+             self._advance_tokens()
+             return float_value
+         elif token.type == TokenType.LIT_TEXT:
+             # String literal
+             str_value = self._parse_string_literal()
+             self._advance_tokens()
+             return str_value
+         elif token.type == TokenType.LIT_URL:
+             # URL literal
+             url_value = self._parse_url_literal()
+             self._advance_tokens()
+             return url_value
+         elif token.type in (TokenType.LIT_YES, TokenType.LIT_NO):
+             # Boolean literal
+             bool_value = self._parse_boolean_literal()
+             self._advance_tokens()
+             return bool_value
+         elif token.type == TokenType.KW_EMPTY:
+             # Empty literal
+             empty_value = self._parse_empty_literal()
+             self._advance_tokens()
+             return empty_value
+         else:
+             # Unknown token type for argument
+             self._report_error_and_recover(
+                 template=INVALID_ARGUMENT_VALUE,
+                 literal=token.literal,
+                 skip_recovery=True,  # We'll handle advancement manually
+             )
+             self._advance_tokens()  # Skip the invalid token
+             return None
+
2486
+     def _parse_positional_arguments(self, with_token: Token) -> Arguments:
+         """Parse positional arguments after 'with' keyword.
+
+         Syntax: with _value1_, _value2_
+
+         Returns:
+             An Arguments AST node with positional arguments.
+         """
+         arguments = Arguments(with_token)
+
+         while self._current_token and self._current_token.type not in (
+             TokenType.PUNCT_PERIOD,
+             TokenType.MISC_EOF,
+         ):
+             # Parse argument value
+             value = self._parse_argument_value()
+             if value:
+                 arguments.positional.append(value)
+
+             # Check for comma (optional)
+             if self._current_token and self._current_token.type == TokenType.PUNCT_COMMA:
+                 self._advance_tokens()
+             # Check for 'and' (optional)
+             elif self._current_token and self._current_token.type == TokenType.KW_AND:
+                 self._advance_tokens()
+             # If no comma or 'and', and we're not at the end, check if another argument follows
+             elif self._current_token and self._current_token.type not in (
+                 TokenType.PUNCT_PERIOD,
+                 TokenType.MISC_EOF,
+             ):
+                 # Check if this looks like another argument (identifier or literal)
+                 if self._current_token.type in (
+                     TokenType.MISC_IDENT,
+                     TokenType.LIT_WHOLE_NUMBER,
+                     TokenType.LIT_FLOAT,
+                     TokenType.LIT_TEXT,
+                     TokenType.LIT_YES,
+                     TokenType.LIT_NO,
+                     TokenType.KW_EMPTY,
+                 ):
+                     # Report error but continue parsing (error recovery)
+                     self._report_error_and_recover(
+                         template=MISSING_COMMA_BETWEEN_ARGS,
+                         skip_recovery=True,  # Continue parsing the next argument
+                     )
+                     continue
+                 else:
+                     # Not an argument, stop parsing
+                     break
+
+         return arguments
+
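+     # Illustrative sketch (editorial comment, not part of the original source):
+     # separators may be commas or 'and', so "with _1_, _2_ and `x`" collects
+     # arguments.positional == [1, 2, Identifier("x")]; a missing separator
+     # between two argument-like tokens is reported but parsing continues.
+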
2538
+     def _parse_named_arguments(self, where_token: Token) -> Arguments:
+         """Parse named arguments after 'where' keyword.
+
+         Syntax: where `param1` is _value1_, `param2` is _value2_
+
+         Returns:
+             An Arguments AST node with named arguments.
+         """
+         arguments = Arguments(where_token)
+
+         while self._current_token and self._current_token.type not in (
+             TokenType.PUNCT_PERIOD,
+             TokenType.MISC_EOF,
+         ):
+             # Parse parameter name (should be an identifier in backticks)
+             name_expr: Identifier | None = None
+             if self._current_token and self._current_token.type == TokenType.MISC_IDENT:
+                 name_expr = Identifier(self._current_token, self._current_token.literal)
+                 self._advance_tokens()
+             else:
+                 # Error: expected identifier
+                 self._report_error_and_recover(
+                     template=EXPECTED_TOKEN,
+                     expected_token=TokenType.MISC_IDENT,
+                     skip_recovery=True,
+                     token="parameter name",
+                     got_token_type=self._current_token.type.name if self._current_token else "EOF",
+                 )
+                 break
+
+             # Expect 'is' keyword - mypy doesn't realize _advance_tokens() changes _current_token
+             assert self._current_token is not None  # Help mypy understand
+             if self._current_token.type == TokenType.KW_IS:
+                 self._advance_tokens()
+             else:
+                 self._report_error_and_recover(
+                     template=EXPECTED_TOKEN_AFTER,
+                     expected_token=TokenType.KW_IS,
+                     skip_recovery=True,
+                     expected="'is' keyword",
+                     after="parameter name",
+                 )
+                 break
+
+             # Parse the value
+             value = self._parse_argument_value()
+
+             # Add to named arguments if both name and value are valid
+             if name_expr and value:
+                 arguments.named.append((name_expr, value))
+
+             # Check for comma (optional)
+             if self._current_token and self._current_token.type == TokenType.PUNCT_COMMA:
+                 self._advance_tokens()
+             # Check for 'and' (optional)
+             elif self._current_token and self._current_token.type == TokenType.KW_AND:
+                 self._advance_tokens()
+             # If no comma or 'and', and we're not at the end, break
+             elif self._current_token and self._current_token.type not in (
+                 TokenType.PUNCT_PERIOD,
+                 TokenType.MISC_EOF,
+             ):
+                 break
+
+         return arguments
+
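+     # Illustrative sketch (editorial comment, not part of the original source):
+     #
+     #     use `greet` where `name` is _"Ada"_, `times` is _3_.
+     #       -> arguments.named == [(Identifier("name"), "Ada"),
+     #                              (Identifier("times"), 3)]
+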
2604
+     def _parse_if_statement(self) -> IfStatement | ErrorStatement:
+         """Parse an if statement with block statements.
+
+         Expects: if/when/whenever <condition> [then]: <block> [else/otherwise: <block>]
+
+         Returns:
+             An IfStatement AST node.
+         """
+         assert self._current_token is not None
+         if_statement = IfStatement(token=self._current_token)
+
+         # Advance past 'if', 'when', or 'whenever'
+         self._advance_tokens()
+
+         # Parse the condition expression
+         if_statement.condition = self._parse_expression(Precedence.LOWEST)
+
+         # Check for optional comma before 'then'
+         if self._peek_token and self._peek_token.type == TokenType.PUNCT_COMMA:
+             self._advance_tokens()  # Skip the comma
+
+         # Check for optional 'then' keyword
+         if self._peek_token and self._peek_token.type == TokenType.KW_THEN:
+             self._advance_tokens()  # Move to 'then'
+
+         # Expect colon
+         if error := self._expect_token(TokenType.PUNCT_COLON, "if condition"):
+             assert isinstance(error, ErrorStatement)
+             return error
+
+         # Parse the consequence block
+         # If we're inside a block, nested if statements should have deeper blocks
+         expected_depth = self._block_depth + 1
+         if_statement.consequence = self._parse_block_statement(expected_depth)
+
+         # Check if the consequence block is empty - this is an error
+         if not if_statement.consequence or len(if_statement.consequence.statements) == 0:
+             self._report_error_and_recover(
+                 template=EMPTY_IF_CONSEQUENCE,
+                 skip_recovery=True,  # No recovery needed, continue parsing
+             )
+
+         # Check for else/otherwise clause
+         if self._current_token and self._current_token.type == TokenType.KW_ELSE:
+             # Check if next token is colon
+             if self._peek_token and self._peek_token.type == TokenType.PUNCT_COLON:
+                 self._advance_tokens()  # Move past else to colon
+                 self._advance_tokens()  # Move past colon
+             else:
+                 # No colon after else, return without alternative
+                 return if_statement
+
+             # Parse the alternative block
+             if_statement.alternative = self._parse_block_statement(expected_depth)
+
+             # Check if the alternative block is empty - this is also an error
+             if not if_statement.alternative or len(if_statement.alternative.statements) == 0:
+                 self._report_error_and_recover(
+                     template=EMPTY_ELSE_BLOCK,
+                     skip_recovery=True,  # No recovery needed, continue parsing
+                 )
+         elif self._block_depth == 0:
+             # No else clause and we're at top level (not inside a block)
+             # Check if we're at a '>' token that was part of the block we just parsed
+             # If so, don't rewind as it would re-parse block content
+             if (
+                 self._current_token
+                 and self._current_token.type == TokenType.OP_GT
+                 and if_statement.consequence
+                 and if_statement.consequence.depth > 0
+             ):
+                 # We're at a '>' that was part of the block, don't rewind
+                 pass
+             elif self._current_token and self._current_token.type != TokenType.MISC_EOF:
+                 # With streaming, we can't back up tokens
+                 # The block parsing should have left us in the right position
+                 pass
+
+         return if_statement
+
2684
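An illustrative conditional in the shape this method describes; the `>` block markers and the `Set`/`Give back` statements appear in examples elsewhere in this file, while the exact condition syntax is an assumption:

    If `count` then:
    > Give back `count`.
    Else:
    > Set `count` to _1_.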
+    def _parse_while_statement(self) -> WhileStatement | ErrorStatement:
+        """Parse a while loop statement.
+
+        Expects: while <condition>: <block>
+
+        Returns:
+            A WhileStatement AST node.
+        """
+        assert self._current_token is not None
+        while_statement = WhileStatement(token=self._current_token)
+
+        # Advance past 'while'
+        self._advance_tokens()
+
+        # Parse the condition expression
+        while_statement.condition = self._parse_expression(Precedence.LOWEST)
+
+        # Expect colon
+        if error := self._expect_token(TokenType.PUNCT_COLON, "while condition"):
+            assert isinstance(error, ErrorStatement)
+            return error
+
+        # Parse the body block
+        expected_depth = self._block_depth + 1
+        while_statement.body = self._parse_block_statement(expected_depth)
+
+        # Check if the body block is empty - this is an error
+        if not while_statement.body or len(while_statement.body.statements) == 0:
+            self._report_error_and_recover(
+                template=EMPTY_WHILE_BODY,
+                skip_recovery=True,  # No recovery needed, continue parsing
+            )
+
+        return while_statement
+
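An illustrative loop in the documented `while <condition>: <block>` shape (the comparison and increment syntax are assumptions based on the operator and statement tables later in this diff):

    While `count` < _10_:
    > Set `count` to `count` + _1_.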
+    def _parse_for_each_statement(self) -> ForEachStatement | ErrorStatement:
+        """Parse a for-each loop statement.
+
+        Expects: for each <item> in <collection>: <block>
+
+        Returns:
+            A ForEachStatement AST node.
+        """
+        assert self._current_token is not None
+        for_statement = ForEachStatement(token=self._current_token)
+
+        # Advance past 'for'
+        self._advance_tokens()
+
+        # Expect 'each'
+        if self._current_token and self._current_token.type != TokenType.KW_EACH:
+            if error := self._expect_token(TokenType.KW_EACH, "'for' keyword"):
+                assert isinstance(error, ErrorStatement)
+                return error
+        self._advance_tokens()  # Move past 'each'
+
+        # Parse the loop variable (item)
+        for_statement.item = self._parse_identifier_or_keyword_as_identifier()
+        if not for_statement.item:
+            error_stmt = self._report_error_and_recover(
+                template=EXPECTED_IDENTIFIER_AFTER,
+                context="'each'",
+                skip_recovery=True,
+            )
+            assert isinstance(error_stmt, ErrorStatement)
+            return error_stmt
+        self._advance_tokens()
+
+        # Expect 'in'
+        if self._current_token and self._current_token.type != TokenType.KW_IN:
+            if error := self._expect_token(TokenType.KW_IN, "loop variable"):
+                assert isinstance(error, ErrorStatement)
+                return error
+        self._advance_tokens()  # Move past 'in'
+
+        # Parse the collection expression
+        for_statement.collection = self._parse_expression(Precedence.LOWEST)
+
+        # Expect colon
+        if error := self._expect_token(TokenType.PUNCT_COLON, "for-each header"):
+            assert isinstance(error, ErrorStatement)
+            return error
+
+        # Parse the body block
+        expected_depth = self._block_depth + 1
+        for_statement.body = self._parse_block_statement(expected_depth)
+
+        # Check if the body block is empty - this is an error
+        if not for_statement.body or len(for_statement.body.statements) == 0:
+            self._report_error_and_recover(
+                template=EMPTY_FOR_EACH_BODY,
+                skip_recovery=True,  # No recovery needed, continue parsing
+            )
+
+        return for_statement
+
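An illustrative loop in the documented `for each <item> in <collection>` shape (the `Say` keyword is registered in the statement table later in this diff; the rest of the surface form is an assumption):

    For each `item` in `shopping list`:
    > Say `item`.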
+    def _parse_action_interaction_or_utility(
+        self,
+    ) -> ActionStatement | InteractionStatement | UtilityStatement | ErrorStatement:
+        """Parse an Action, Interaction, or Utility statement.
+
+        Expected format:
+            ### **Action**: `name`
+            or
+            ### **Interaction**: `name`
+            or
+            ### **Utility**: `name`
+
+            <details>
+            <summary>Description</summary>
+            > statements
+            </details>
+
+        Returns:
+            ActionStatement, InteractionStatement, or UtilityStatement node, or ErrorStatement if parsing fails.
+        """
+        assert self._current_token is not None
+        assert self._current_token.type == TokenType.PUNCT_HASH_TRIPLE
+
+        # Save the ### token for the statement
+        # hash_token = self._current_token  # Currently unused, but may be needed for error reporting
+
+        # Move past ###
+        self._advance_tokens()
+
+        # Expect **Action**, **Interaction**, or **Utility** (wrapped keyword)
+        if not self._current_token or self._current_token.type not in (
+            TokenType.KW_ACTION,
+            TokenType.KW_INTERACTION,
+            TokenType.KW_UTILITY,
+        ):
+            error_stmt = self._report_error_and_recover(
+                template=EXPECTED_TOKEN_AFTER, expected="**Action**, **Interaction**, or **Utility**", after="###"
+            )
+            assert isinstance(error_stmt, ErrorStatement)
+            return error_stmt
+
+        statement_type = self._current_token.type
+        keyword_token = self._current_token
+
+        # Move past Action/Interaction/Utility keyword
+        self._advance_tokens()
+
+        # Expect colon - should be at current position
+        if not self._current_token or self._current_token.type != TokenType.PUNCT_COLON:
+            error_stmt = self._report_error_and_recover(
+                template=EXPECTED_TOKEN_AFTER,
+                expected_token=TokenType.PUNCT_COLON,
+                expected="':'",
+                after="Action/Interaction/Utility",
+            )
+            assert isinstance(error_stmt, ErrorStatement)
+            return error_stmt
+
+        # Move past colon
+        self._advance_tokens()
+
+        # Expect backtick-wrapped name
+        if not self._current_token or self._current_token.type != TokenType.MISC_IDENT:
+            return self._report_error_and_recover(
+                template=EXPECTED_TOKEN, expected_token=TokenType.MISC_IDENT, token="identifier in backticks for name"
+            )
+
+        name = Identifier(self._current_token, self._current_token.literal)
+        self._advance_tokens()
+
+        # Now expect <details> tag - should be at current position
+        if not self._current_token or self._current_token.type != TokenType.TAG_DETAILS_START:
+            return self._report_error_and_recover(
+                template=EXPECTED_TOKEN,
+                token="<details> tag",
+                got_token_type=self._current_token.type.name if self._current_token else "EOF",
+            )
+
+        # Move past <details>
+        self._advance_tokens()
+
+        # Check for <summary> tag and extract description
+        description = ""
+        if self._current_token and self._current_token.type == TokenType.TAG_SUMMARY_START:
+            self._advance_tokens()
+            # The next token should be a comment with the description
+            if self._current_token and self._current_token.type == TokenType.MISC_COMMENT:
+                description = self._current_token.literal
+                self._advance_tokens()
+            # Expect </summary>
+            if self._current_token and self._current_token.type == TokenType.TAG_SUMMARY_END:
+                self._advance_tokens()
+
+        # Parse the body (block of statements with > prefix)
+        body = self._parse_block_statement()
+
+        # Expect </details> tag - should be at current position after block parsing
+        if self._current_token and self._current_token.type == TokenType.TAG_DETAILS_END:
+            self._advance_tokens()
+        else:
+            # If we're not at </details>, something went wrong with block parsing.
+            # Create an error but don't panic-recover.
+            if self._current_token:
+                # Check if this is likely a missing depth transition issue
+                if self._current_token.type == TokenType.KW_RETURN and self._block_depth > 0:
+                    # This looks like a "Give back" statement after nested blocks;
+                    # the user likely forgot to add a transition line
+                    nested_depth = ">" * (self._block_depth + 1)  # The depth they were at (e.g., >>)
+                    parent_depth = ">" * self._block_depth  # The depth they need to transition to (e.g., >)
+                    self._report_error_and_recover(
+                        template=MISSING_DEPTH_TRANSITION,
+                        nested_depth=nested_depth,
+                        parent_depth=parent_depth,
+                        token_type=self._current_token.type.name,
+                        skip_recovery=True,  # Continue parsing
+                    )
+                else:
+                    self._report_error_and_recover(
+                        template=EXPECTED_DETAILS_CLOSE,
+                        token_type=self._current_token.type.name,
+                        skip_recovery=True,  # Continue parsing
+                    )
+
+        # Check for parameter sections (#### Inputs: and #### Outputs:)
+        inputs: list[Parameter] = []
+        outputs: list[Output] = []
+
+        # Check if we have #### for parameter sections
+        if self._current_token and self._current_token.type == TokenType.PUNCT_HASH_QUAD:
+            # Parse parameter sections
+            inputs, outputs = self._parse_parameter_sections()
+
+        # Create and return the appropriate statement
+        if statement_type == TokenType.KW_ACTION:
+            return ActionStatement(
+                keyword_token, name, inputs=inputs, outputs=outputs, body=body, description=description
+            )
+        elif statement_type == TokenType.KW_INTERACTION:
+            return InteractionStatement(
+                keyword_token, name, inputs=inputs, outputs=outputs, body=body, description=description
+            )
+        elif statement_type == TokenType.KW_UTILITY:
+            return UtilityStatement(
+                keyword_token, name, inputs=inputs, outputs=outputs, body=body, description=description
+            )
+        else:
+            # This should never happen since we check for valid types above
+            return self._report_error_and_recover(template=UNEXPECTED_STATEMENT, type=statement_type)
+
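Putting the pieces together, a complete definition in the documented format; the section and parameter shapes come from the docstrings in this file, while the body statement and types are illustrative:

    ### **Utility**: `add-numbers`

    <details>
    <summary>Adds two numbers.</summary>

    > Give back `a` + `b`.

    </details>

    #### Inputs:

    - `a` **as** Whole Number (required)
    - `b` **as** Whole Number (optional, default: _0_)

    #### Outputs:

    - `result` **as** Whole Number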
+    def _parse_block_statement(self, expected_depth: int = 1) -> BlockStatement:
+        """Parse a block of statements marked by '>' symbols.
+
+        A block contains statements that start with one or more '>' symbols.
+        The number of '>' symbols determines the depth of the block.
+        The block ends when we encounter a statement with fewer '>' symbols
+        or a statement without '>' symbols.
+
+        Args:
+            expected_depth: The expected depth for this block (number of '>' symbols).
+                Defaults to 1 for top-level blocks.
+
+        Returns:
+            A BlockStatement AST node.
+        """
+        assert self._current_token is not None
+        block_token = self._current_token
+        block = BlockStatement(token=block_token, depth=expected_depth)
+
+        # Track that we're entering a block
+        self._block_depth += 1
+
+        # Tell the token buffer we're in a block
+        if self._token_buffer:
+            self._token_buffer.set_block_context(True)
+
+        # If we're at a colon, it's the start of a block - advance past it
+        if self._current_token.type == TokenType.PUNCT_COLON:
+            self._advance_tokens()
+
+        # Parse statements in the block
+        while self._current_token and self._current_token.type != TokenType.MISC_EOF:
+            # Note: With streaming tokens, we can't save/restore positions
+
+            # Count the depth at the start of the current line
+            current_depth = 0
+            original_line = self._current_token.line if self._current_token else 0
+
+            # Check if we're at '>' tokens
+            if self._current_token.type == TokenType.OP_GT:
+                # Count '>' tokens only on the current line
+                current_line = self._current_token.line
+                while (
+                    self._current_token
+                    and self._current_token.type == TokenType.OP_GT
+                    and self._current_token.line == current_line
+                ):
+                    current_depth += 1
+                    self._advance_tokens()
+
+            # Check depth against expected depth
+            if current_depth == 0:
+                # No '>' means we've exited the block
+                break
+            elif current_depth < expected_depth:
+                # We've exited the block due to lower depth
+
+                # TODO: Fix bug where statements after nested if blocks (e.g., after `> >`)
+                # are not properly parsed as part of the parent block. Currently an
+                # empty `>` line is required after the nested block for parsing to
+                # continue correctly.
+                # Broken today:
+                #     > If condition then:
+                #     > > Give back value.
+                #     > Set `var` to _1_.  # This line may not be parsed correctly
+                # Workaround that parses today:
+                #     > If condition then:
+                #     > > Give back value.
+                #     >
+                #     > Set `var` to _1_.  # Parsed correctly with the empty `>` line
+                # Both forms should eventually parse without the extra `>` line.
+
+                # We've already consumed the '>' tokens while counting depth.
+                # The parent block needs to handle this line's content,
+                # but first check if this is an empty line (only '>').
+                if self._current_token and self._current_token.line != original_line:
+                    # Empty line - we consumed all tokens on the line.
+                    # Just break and let the parent continue from the next line.
+                    break
+                else:
+                    # Not empty - there's content after the '>'.
+                    # With streaming, we can't back up - the tokens are already consumed.
+                    # This means nested blocks need special handling.
+                    break
+            elif current_depth > expected_depth:
+                # Nested block or error - for now treat as error
+                self._report_error_and_recover(
+                    template=UNEXPECTED_BLOCK_DEPTH,
+                    expected=str(expected_depth),
+                    actual=str(current_depth),
+                    skip_recovery=True,  # We'll handle recovery manually
+                )
+                # Skip to next line
+                while self._current_token and self._current_token.type not in (
+                    TokenType.PUNCT_PERIOD,
+                    TokenType.MISC_EOF,
+                    TokenType.OP_GT,
+                ):
+                    self._advance_tokens()
+                continue
+
+            # After the depth check, check if this was an empty line (just '>' with no content).
+            # An empty line is when we counted '>' but are no longer on the same line.
+            if current_depth > 0 and self._current_token and self._current_token.line != original_line:
+                # The line only had '>' markers, skip to next line
+                continue
+
+            # Check for tokens that would indicate we've left the block
+            if self._current_token and self._current_token.type in (
+                TokenType.MISC_EOF,
+                TokenType.KW_ELSE,  # 'else' would be outside the block
+            ):
+                break  # We've exited the block
+
+            # Parse the statement
+            statement = self._parse_statement()
+            block.statements.append(statement)
+
+            # Skip the period if present
+            if self._current_token and self._current_token.type == TokenType.PUNCT_PERIOD:
+                self._advance_tokens()
+
+        # Track that we're exiting a block
+        self._block_depth -= 1
+
+        # Tell the token buffer we're no longer in a block
+        if self._token_buffer:
+            self._token_buffer.set_block_context(self._block_depth > 0)
+
+        return block
+
+    def _parse_statement(self) -> Statement:
+        """Parse a single statement.
+
+        Determines the statement type based on the current token and
+        delegates to the appropriate parsing method.
+
+        Returns:
+            A Statement AST node (may be an ErrorStatement if parsing fails).
+        """
+        assert self._current_token is not None
+
+        stmt_funcs = self._register_statement_functions()
+        if self._current_token.type in stmt_funcs:
+            return stmt_funcs[self._current_token.type]()
+        else:
+            return self._parse_expression_statement()
+
+    def _register_infix_funcs(self) -> InfixParseFuncs:
+        """Register infix parsing functions for each token type.
+
+        Infix parsing functions handle expressions where an operator appears
+        between operands (e.g., "1 + 2", "a * b"). Each function takes the
+        left-hand expression as an argument and returns the complete expression.
+
+        The parser uses these functions when it encounters a token in the middle
+        of an expression. For example, when parsing "1 + 2", after parsing "1",
+        the parser sees "+" and calls the registered infix function for PLUS,
+        passing "1" as the left operand.
+
+        Returns:
+            Dictionary mapping TokenType to InfixParseFunc callbacks.
+            Each callback signature: (left: Expression) -> Optional[Expression]
+
+        Example:
+            Representative entries from the table below:
+                TokenType.OP_PLUS: self._parse_infix_expression,
+                TokenType.OP_STAR: self._parse_infix_expression,
+                TokenType.KW_IF: self._parse_conditional_expression,
+        """
+        return {
+            # Arithmetic operators
+            TokenType.OP_PLUS: self._parse_infix_expression,
+            TokenType.OP_MINUS: self._parse_infix_expression,
+            TokenType.OP_STAR: self._parse_infix_expression,
+            TokenType.OP_DIVISION: self._parse_infix_expression,
+            TokenType.OP_CARET: self._parse_infix_expression,
+            # Comparison operators
+            TokenType.OP_EQ: self._parse_infix_expression,
+            TokenType.OP_NOT_EQ: self._parse_infix_expression,
+            TokenType.OP_STRICT_EQ: self._parse_infix_expression,
+            TokenType.OP_STRICT_NOT_EQ: self._parse_infix_expression,
+            TokenType.OP_LT: self._parse_infix_expression,
+            TokenType.OP_GT: self._parse_infix_expression,
+            TokenType.OP_LTE: self._parse_infix_expression,
+            TokenType.OP_GTE: self._parse_infix_expression,
+            # Logical operators
+            TokenType.KW_AND: self._parse_infix_expression,
+            TokenType.KW_OR: self._parse_infix_expression,
+            # Conditional/ternary expressions
+            TokenType.KW_IF: self._parse_conditional_expression,
+            # Dictionary extraction operators
+            TokenType.OP_THE_NAMES_OF: self._parse_dict_extraction_infix,
+            TokenType.OP_THE_CONTENTS_OF: self._parse_dict_extraction_infix,
+        }
+
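Together with the prefix and postfix tables below, this is the classic Pratt (precedence-climbing) layout. A minimal, self-contained sketch of the technique, using simplified stand-in tokens rather than this package's Token/TokenType API:

    # Illustrative Pratt loop: a prefix step produces the left expression, then
    # infix entries extend it while the next operator binds at least as tightly.
    PRECEDENCE = {"+": 1, "-": 1, "*": 2, "/": 2}

    def parse_expression(tokens: list[str], min_prec: int = 0):
        left = tokens.pop(0)  # prefix position: a literal parses itself here
        while tokens and tokens[0] in PRECEDENCE and PRECEDENCE[tokens[0]] >= min_prec:
            op = tokens.pop(0)  # infix position: operator between operands
            right = parse_expression(tokens, PRECEDENCE[op] + 1)
            left = (op, left, right)
        return left

    assert parse_expression(["1", "+", "2", "*", "3"]) == ("+", "1", ("*", "2", "3"))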
+    def _register_prefix_funcs(self) -> PrefixParseFuncs:
+        """Register prefix parsing functions for each token type.
+
+        Prefix parsing functions handle expressions that start with a specific
+        token type. This includes literals (numbers, strings), identifiers,
+        prefix operators (e.g., "-5", "not true"), and grouped expressions
+        (parentheses).
+
+        The parser calls these functions when it encounters a token at the
+        beginning of an expression. For example, when parsing "-5", the parser
+        sees "-" and calls the registered prefix function for MINUS.
+
+        Returns:
+            Dictionary mapping TokenType to PrefixParseFunc callbacks.
+            Each callback signature: () -> Optional[Expression]
+
+        Example:
+            Representative entries from the table below:
+                TokenType.MISC_IDENT: self._parse_identifier,
+                TokenType.LIT_TEXT: self._parse_string_literal,
+                TokenType.OP_MINUS: self._parse_prefix_expression,
+                TokenType.DELIM_LPAREN: self._parse_grouped_expression,
+        """
+        return {
+            TokenType.MISC_IDENT: self._parse_identifier,
+            TokenType.LIT_WHOLE_NUMBER: self._parse_integer_literal,
+            TokenType.LIT_FLOAT: self._parse_float_literal,
+            TokenType.LIT_TEXT: self._parse_string_literal,
+            TokenType.LIT_URL: self._parse_url_literal,
+            TokenType.LIT_YES: self._parse_boolean_literal,
+            TokenType.LIT_NO: self._parse_boolean_literal,
+            TokenType.KW_EMPTY: self._parse_empty_literal,
+            TokenType.OP_MINUS: self._parse_prefix_expression,
+            TokenType.KW_NEGATION: self._parse_prefix_expression,
+            TokenType.DELIM_LPAREN: self._parse_grouped_expression,
+            # List access patterns
+            TokenType.KW_FIRST: self._parse_ordinal_list_access,
+            TokenType.KW_SECOND: self._parse_ordinal_list_access,
+            TokenType.KW_THIRD: self._parse_ordinal_list_access,
+            TokenType.KW_LAST: self._parse_ordinal_list_access,
+            TokenType.KW_ITEM: self._parse_numeric_list_access,
+            # Handle 'the' stopword for list access
+            TokenType.MISC_STOPWORD: self._parse_stopword_expression,
+            # Handle possessive syntax
+            TokenType.PUNCT_APOSTROPHE_S: self._parse_possessive_access,
+            # Dictionary extraction operators can also be prefix
+            TokenType.OP_THE_NAMES_OF: self._parse_dict_extraction_prefix,
+            TokenType.OP_THE_CONTENTS_OF: self._parse_dict_extraction_prefix,
+        }
+
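The list-access entries above target ordinal (`first`, `second`, `third`, `last`) and numeric (`item`) phrases; illustratively, and with the exact surface syntax an assumption beyond the registered keywords:

    Set `winner` to the first item of `players`.
    Set `runner-up` to item _2_ of `players`.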
+    @staticmethod
+    def _register_postfix_funcs() -> PostfixParseFuncs:
+        """Register postfix parsing functions for each token type.
+
+        Postfix parsing functions handle expressions where an operator appears
+        after the operand (e.g., "i++", "factorial!", array indexing "arr[0]").
+        Each function takes the left-hand expression as an argument and returns
+        the complete expression.
+
+        The parser uses these functions when it encounters a token after a
+        complete expression. For example, when parsing "i++", after parsing "i",
+        the parser sees "++" and calls the registered postfix function for
+        INCREMENT, passing "i" as the operand.
+
+        Returns:
+            Dictionary mapping TokenType to PostfixParseFunc callbacks.
+            Each callback signature: (left: Expression) -> Optional[Expression]
+
+        Example:
+            When implemented, might look like:
+                return {
+                    TokenType.OP_INCREMENT: self._parse_postfix_expression,
+                    TokenType.OP_DECREMENT: self._parse_postfix_expression,
+                    TokenType.OP_FACTORIAL: self._parse_postfix_expression,
+                    TokenType.DELIM_LBRACKET: self._parse_index_expression,
+                    TokenType.PUNCT_QUESTION: self._parse_ternary_expression,
+                }
+        """
+        return {}
+
+    def _register_statement_functions(self) -> dict[TokenType, Callable[[], Statement]]:
+        """Register statement parsing functions for each token type."""
+        return {
+            TokenType.KW_DEFINE: self._parse_define_statement,
+            TokenType.KW_SET: self._parse_set_statement,
+            TokenType.KW_RETURN: self._parse_return_statement,
+            TokenType.KW_IF: self._parse_if_statement,
+            TokenType.KW_WHILE: self._parse_while_statement,
+            TokenType.KW_FOR: self._parse_for_each_statement,
+            TokenType.KW_SAY: self._parse_say_statement,
+            TokenType.KW_TELL: self._parse_say_statement,  # Tell is an alias for Say
+            TokenType.KW_USE: self._parse_call_statement,
+            TokenType.PUNCT_HASH_TRIPLE: self._parse_action_interaction_or_utility,
+            TokenType.KW_ADD: self._parse_collection_mutation_statement,
+            TokenType.KW_REMOVE: self._parse_collection_mutation_statement,
+            TokenType.KW_INSERT: self._parse_collection_mutation_statement,
+            TokenType.KW_CLEAR: self._parse_collection_mutation_statement,
+            TokenType.KW_UPDATE: self._parse_collection_mutation_statement,
+            # Note: KW_EMPTY is not registered here because "empty" as a literal is more common
+            # than "Empty `collection`" as a statement. Standalone "empty" will be parsed as an expression.
+        }
+
+    def _parse_parameter_sections(self) -> tuple[list[Parameter], list[Output]]:
+        """Parse parameter sections (#### Inputs: and #### Outputs:).
+
+        Returns:
+            A tuple of (inputs, outputs) - inputs are Parameters, outputs are Outputs.
+        """
+        inputs: list[Parameter] = []
+        outputs: list[Output] = []
+
+        while self._current_token and self._current_token.type == TokenType.PUNCT_HASH_QUAD:
+            # Move past ####
+            self._advance_tokens()
+
+            # Check if it's Inputs or Outputs
+            current = self._current_token
+            if current:
+                if current.type == TokenType.KW_INPUTS:
+                    # Move past "Inputs"
+                    self._advance_tokens()
+
+                    # Expect colon
+                    colon_token = self._current_token
+                    if colon_token and colon_token.type == TokenType.PUNCT_COLON:
+                        self._advance_tokens()
+
+                    # Parse input parameters (lines starting with -)
+                    inputs = self._parse_parameter_list()
+
+                elif current.type == TokenType.KW_OUTPUTS:
+                    # Move past "Outputs"
+                    self._advance_tokens()
+
+                    # Expect colon
+                    colon_token2 = self._current_token
+                    if colon_token2 and colon_token2.type == TokenType.PUNCT_COLON:
+                        self._advance_tokens()
+
+                    # Parse output list (different format from inputs)
+                    outputs = self._parse_output_list()
+
+                else:
+                    # Not a parameter section, break
+                    break
+
+        return inputs, outputs
+
+    def _parse_parameter_list(self) -> list[Parameter]:
+        """Parse a list of parameters (lines starting with -).
+
+        Expected format:
+            - `name` **as** Type (required)
+            - `name` **as** Type (optional, default: value)
+
+        Returns:
+            List of Parameter objects.
+        """
+        parameters: list[Parameter] = []
+
+        while self._current_token and self._current_token.type == TokenType.OP_MINUS:
+            # Move past -
+            self._advance_tokens()
+
+            # Parse single parameter
+            param = self._parse_parameter()
+            if param:
+                parameters.append(param)
+
+        return parameters
+
+    def _parse_parameter(self) -> Parameter | None:
+        """Parse a single parameter.
+
+        Expected format:
+            `name` **as** Type (required)
+            `name` **as** Type (optional, default: value)
+
+        Returns:
+            A Parameter object or None if parsing fails.
+        """
+        if not self._current_token:
+            return None
+
+        # Save starting token for error reporting
+        start_token = self._current_token
+
+        # Expect identifier in backticks
+        if self._current_token.type != TokenType.MISC_IDENT:
+            return None
+
+        name = Identifier(self._current_token, self._current_token.literal)
+        self._advance_tokens()
+
+        # Expect "as" keyword
+        current = self._current_token
+        if not current or current.type != TokenType.KW_AS:
+            return None
+        self._advance_tokens()
+
+        # Parse type name (could be multi-word like "Whole Number")
+        type_name = self._parse_type_name()
+        if not type_name:
+            return None
+
+        # Default values
+        is_required = True
+        default_value: Expression | None = None
+
+        # Check for (required) or (optional, default: value) or (default: value)
+        # TODO: In the future, remove support for explicit required/optional keywords
+        # and make it fully implicit based on presence of default value
+        paren_token = self._current_token
+        if paren_token and paren_token.type == TokenType.DELIM_LPAREN:
+            self._advance_tokens()
+
+            status_token = self._current_token
+            if status_token:
+                if status_token.type == TokenType.KW_OPTIONAL:
+                    is_required = False
+                    self._advance_tokens()
+
+                    # Check for default value
+                    comma_token = self._current_token
+                    if comma_token and comma_token.type == TokenType.PUNCT_COMMA:
+                        self._advance_tokens()
+
+                        # Expect "default"
+                        default_token = self._current_token
+                        if default_token and default_token.type == TokenType.KW_DEFAULT:
+                            self._advance_tokens()
+
+                            # Expect colon
+                            colon_check = self._current_token
+                            if colon_check and colon_check.type == TokenType.PUNCT_COLON:
+                                self._advance_tokens()
+
+                            # Parse the default value expression
+                            default_value = self._parse_expression(Precedence.LOWEST)
+
+                            # After parsing expression, advance past it
+                            self._advance_tokens()
+
+                elif status_token.type == TokenType.KW_DEFAULT:
+                    # Handle (default: value) without explicit optional/required;
+                    # infer that presence of a default means optional
+                    is_required = False
+                    self._advance_tokens()
+
+                    # Expect colon
+                    colon_check = self._current_token
+                    if colon_check and colon_check.type == TokenType.PUNCT_COLON:
+                        self._advance_tokens()
+
+                    # Parse the default value expression
+                    default_value = self._parse_expression(Precedence.LOWEST)
+
+                    # After parsing expression, advance past it
+                    self._advance_tokens()
+
+                elif status_token.type == TokenType.KW_REQUIRED:
+                    is_required = True
+                    self._advance_tokens()
+                else:
+                    # Unknown token, keep as required
+                    is_required = True
+                    self._advance_tokens()
+
+            # Expect closing paren
+            rparen_token = self._current_token
+            if rparen_token and rparen_token.type == TokenType.DELIM_RPAREN:
+                self._advance_tokens()
+
+        return Parameter(
+            token=start_token,
+            name=name,
+            type_name=type_name,
+            is_required=is_required,
+            default_value=default_value,
+        )
+
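The three annotation forms accepted above, and the required/optional state each implies (types and values illustrative):

    - `retries` **as** Whole Number (required)                   # required
    - `retries` **as** Whole Number (optional, default: _3_)     # optional
    - `retries` **as** Whole Number (default: _3_)               # optional (inferred)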
+    def _parse_output_list(self) -> list[Output]:
+        """Parse a list of outputs (lines starting with -).
+
+        Expected format for outputs:
+            - `name` **as** Type
+            - `name` **as** Type (default: value)
+
+        Returns:
+            List of Output objects.
+        """
+        outputs: list[Output] = []
+
+        while self._current_token and self._current_token.type == TokenType.OP_MINUS:
+            # Move past -
+            self._advance_tokens()
+
+            # Parse single output
+            output = self._parse_output()
+            if output:
+                outputs.append(output)
+
+        return outputs
+
+    def _parse_output(self) -> Output | None:
+        """Parse a single output.
+
+        Expected formats:
+            - Returns Type (simple format)
+            - `name` **as** Type
+            - `name` **as** Type (default: value)
+
+        Note: Outputs don't have required/optional, only optional defaults.
+
+        Returns:
+            An Output object or None if parsing fails.
+        """
+        if not self._current_token:
+            return None
+
+        # Save starting token for error reporting
+        start_token = self._current_token
+
+        # Check for simple "Returns Type" format
+        if self._current_token.type == TokenType.MISC_IDENT and self._current_token.literal.lower() == "returns":
+            self._advance_tokens()
+
+            # Parse type name
+            type_name = self._parse_type_name()
+            if type_name:
+                # Create a simple output with no specific name
+                return Output(
+                    token=start_token,
+                    name=Identifier(start_token, "return_value"),  # Default name
+                    type_name=type_name,
+                    default_value=EmptyLiteral(start_token),
+                )
+            else:
+                # Failed to parse a type name; give up on this output
+                # (with streaming tokens there is no position to restore)
+                return None
+
+        # Otherwise expect identifier in backticks for named output
+        if self._current_token.type != TokenType.MISC_IDENT:
+            return None
+
+        name = Identifier(self._current_token, self._current_token.literal)
+        self._advance_tokens()
+
+        # Expect "as" keyword
+        current = self._current_token
+        if not current or current.type != TokenType.KW_AS:
+            return None
+        self._advance_tokens()
+
+        # Parse type name (could be multi-word like "Whole Number")
+        type_name = self._parse_type_name()
+        if not type_name:
+            return None
+
+        # Default value (optional)
+        default_value: Expression | None = None
+
+        # Check for (default: value)
+        paren_token = self._current_token
+        if paren_token and paren_token.type == TokenType.DELIM_LPAREN:
+            self._advance_tokens()
+
+            # Expect "default"
+            default_token = self._current_token
+            if default_token and default_token.type == TokenType.KW_DEFAULT:
+                self._advance_tokens()
+
+                # Expect colon
+                colon_check = self._current_token
+                if colon_check and colon_check.type == TokenType.PUNCT_COLON:
+                    self._advance_tokens()
+
+                # Parse the default value expression
+                default_value = self._parse_expression(Precedence.LOWEST)
+
+                # After parsing expression, advance past it
+                self._advance_tokens()
+
+            # Expect closing paren
+            rparen_token = self._current_token
+            if rparen_token and rparen_token.type == TokenType.DELIM_RPAREN:
+                self._advance_tokens()
+
+        # If no default value was specified, use Empty as the default
+        if default_value is None:
+            default_value = EmptyLiteral(start_token)
+
+        return Output(
+            token=start_token,
+            name=name,
+            type_name=type_name,
+            default_value=default_value,
+        )
+
+    def _register_variable_definition(self, name: str, type_spec: list[str], line: int, position: int) -> None:
+        """Register a variable definition in the symbol table.
+
+        Args:
+            name: Variable name
+            type_spec: List of allowed types
+            line: Line number
+            position: Column position
+        """
+        try:
+            self._symbol_table.define(name, type_spec, line, position)
+        except NameError as e:
+            # The error message contains info about redefinition
+            if "already defined" in str(e):
+                # Extract the original definition line from the error message
+                match = re.search(r"line (\d+)", str(e))
+                original_line = int(match.group(1)) if match else line
+                self._report_error_and_recover(
+                    template=VARIABLE_ALREADY_DEFINED,
+                    error_type="name",
+                    name=name,
+                    original_line=str(original_line),
+                    skip_recovery=True,  # No recovery needed for semantic errors
+                )
+            else:
+                # Fallback for other NameError cases
+                self._report_error_and_recover(
+                    template=NAME_UNDEFINED,
+                    error_type="name",
+                    name=name,
+                    skip_recovery=True,  # No recovery needed for semantic errors
+                )
+
+    def _check_variable_defined(self, name: str, line: int, position: int) -> bool:
+        """Check if a variable is defined.
+
+        Args:
+            name: Variable name
+            line: Line number for error reporting
+            position: Column position for error reporting
+
+        Returns:
+            True if defined, False otherwise
+        """
+        info = self._symbol_table.lookup(name)
+        if not info:
+            self._report_error_and_recover(
+                template=VARIABLE_NOT_DEFINED,
+                error_type="name",
+                name=name,
+                skip_recovery=True,  # No recovery needed for semantic errors
+            )
+            return False
+        return True
+
+    def _validate_assignment_type(self, variable_name: str, value: Expression, line: int, position: int) -> bool:
+        """Validate that an assignment value matches the variable's type.
+
+        Args:
+            variable_name: Name of the variable being assigned to
+            value: The expression being assigned
+            line: Line number for error reporting
+            position: Column position for error reporting
+
+        Returns:
+            True if the type is valid, False otherwise
+        """
+        # Look up the variable's type specification
+        var_info = self._symbol_table.lookup(variable_name)
+        if not var_info:
+            # Variable not defined - already reported elsewhere
+            return False
+
+        # Determine the type of the value being assigned
+        value_type = get_type_from_value(value)
+        if value_type is None:
+            # Can't determine type - allow assignment for now;
+            # this might be a function call or complex expression
+            return True
+
+        # Check if the value's type is compatible with the variable's type spec
+        type_spec = TypeSpec(var_info.type_spec)
+        is_compatible, _error_msg = check_type_compatibility(value_type, type_spec)
+
+        if not is_compatible:
+            # Create a detailed error message
+            from machine_dialect.errors.messages import ASSIGNMENT_TYPE_MISMATCH
+            from machine_dialect.type_checking import TYPE_DISPLAY_NAMES
+
+            actual_type_name = TYPE_DISPLAY_NAMES.get(value_type, "unknown")
+            self._report_error_and_recover(
+                template=ASSIGNMENT_TYPE_MISMATCH,
+                error_type="type",  # This is a type error
+                variable=variable_name,
+                expected_type=str(type_spec),
+                actual_type=actual_type_name,
+                skip_recovery=True,  # No recovery needed for semantic errors
+            )
+            return False
+
+        # Mark the variable as initialized on successful type check
+        self._symbol_table.mark_initialized(variable_name)
+        return True
+
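The compatibility check above delegates to TypeSpec and check_type_compatibility from machine_dialect.type_checking, which are defined elsewhere in this diff. A minimal stand-in sketch of the idea; treating "Number" as accepting both Whole Number and Float is an assumption:

    # Illustrative stand-in, not this package's API: a value type matches a
    # type spec when it appears in the (possibly widened) set of allowed types.
    def is_compatible(value_type: str, allowed: list[str]) -> bool:
        widened = set(allowed)
        if "Number" in widened:
            widened |= {"Whole Number", "Float"}  # assumed widening rule
        return value_type in widened

    assert is_compatible("Float", ["Number"])
    assert not is_compatible("Text", ["Whole Number"])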
+    def _is_type_token(self, token_type: TokenType) -> bool:
+        """Check if a token type represents a type keyword.
+
+        Args:
+            token_type: The token type to check
+
+        Returns:
+            True if it's a type keyword, False otherwise
+        """
+        return token_type in {
+            TokenType.KW_TEXT,
+            TokenType.KW_WHOLE_NUMBER,
+            TokenType.KW_FLOAT,
+            TokenType.KW_NUMBER,
+            TokenType.KW_YES_NO,
+            TokenType.KW_URL,
+            TokenType.KW_DATE,
+            TokenType.KW_DATETIME,
+            TokenType.KW_TIME,
+            TokenType.KW_LIST,
+            TokenType.KW_EMPTY,
+        }