machine-dialect 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268) hide show
  1. machine_dialect/__main__.py +667 -0
  2. machine_dialect/agent/__init__.py +5 -0
  3. machine_dialect/agent/agent.py +360 -0
  4. machine_dialect/ast/__init__.py +95 -0
  5. machine_dialect/ast/ast_node.py +35 -0
  6. machine_dialect/ast/call_expression.py +82 -0
  7. machine_dialect/ast/dict_extraction.py +60 -0
  8. machine_dialect/ast/expressions.py +439 -0
  9. machine_dialect/ast/literals.py +309 -0
  10. machine_dialect/ast/program.py +35 -0
  11. machine_dialect/ast/statements.py +1433 -0
  12. machine_dialect/ast/tests/test_ast_string_representation.py +62 -0
  13. machine_dialect/ast/tests/test_boolean_literal.py +29 -0
  14. machine_dialect/ast/tests/test_collection_hir.py +138 -0
  15. machine_dialect/ast/tests/test_define_statement.py +142 -0
  16. machine_dialect/ast/tests/test_desugar.py +541 -0
  17. machine_dialect/ast/tests/test_foreach_desugar.py +245 -0
  18. machine_dialect/cfg/__init__.py +6 -0
  19. machine_dialect/cfg/config.py +156 -0
  20. machine_dialect/cfg/examples.py +221 -0
  21. machine_dialect/cfg/generate_with_ai.py +187 -0
  22. machine_dialect/cfg/openai_generation.py +200 -0
  23. machine_dialect/cfg/parser.py +94 -0
  24. machine_dialect/cfg/tests/__init__.py +1 -0
  25. machine_dialect/cfg/tests/test_cfg_parser.py +252 -0
  26. machine_dialect/cfg/tests/test_config.py +188 -0
  27. machine_dialect/cfg/tests/test_examples.py +391 -0
  28. machine_dialect/cfg/tests/test_generate_with_ai.py +354 -0
  29. machine_dialect/cfg/tests/test_openai_generation.py +256 -0
  30. machine_dialect/codegen/__init__.py +5 -0
  31. machine_dialect/codegen/bytecode_module.py +89 -0
  32. machine_dialect/codegen/bytecode_serializer.py +300 -0
  33. machine_dialect/codegen/opcodes.py +101 -0
  34. machine_dialect/codegen/register_codegen.py +1996 -0
  35. machine_dialect/codegen/symtab.py +208 -0
  36. machine_dialect/codegen/tests/__init__.py +1 -0
  37. machine_dialect/codegen/tests/test_array_operations_codegen.py +295 -0
  38. machine_dialect/codegen/tests/test_bytecode_serializer.py +185 -0
  39. machine_dialect/codegen/tests/test_register_codegen_ssa.py +324 -0
  40. machine_dialect/codegen/tests/test_symtab.py +418 -0
  41. machine_dialect/codegen/vm_serializer.py +621 -0
  42. machine_dialect/compiler/__init__.py +18 -0
  43. machine_dialect/compiler/compiler.py +197 -0
  44. machine_dialect/compiler/config.py +149 -0
  45. machine_dialect/compiler/context.py +149 -0
  46. machine_dialect/compiler/phases/__init__.py +19 -0
  47. machine_dialect/compiler/phases/bytecode_optimization.py +90 -0
  48. machine_dialect/compiler/phases/codegen.py +40 -0
  49. machine_dialect/compiler/phases/hir_generation.py +39 -0
  50. machine_dialect/compiler/phases/mir_generation.py +86 -0
  51. machine_dialect/compiler/phases/optimization.py +110 -0
  52. machine_dialect/compiler/phases/parsing.py +39 -0
  53. machine_dialect/compiler/pipeline.py +143 -0
  54. machine_dialect/compiler/tests/__init__.py +1 -0
  55. machine_dialect/compiler/tests/test_compiler.py +568 -0
  56. machine_dialect/compiler/vm_runner.py +173 -0
  57. machine_dialect/errors/__init__.py +32 -0
  58. machine_dialect/errors/exceptions.py +369 -0
  59. machine_dialect/errors/messages.py +82 -0
  60. machine_dialect/errors/tests/__init__.py +0 -0
  61. machine_dialect/errors/tests/test_expected_token_errors.py +188 -0
  62. machine_dialect/errors/tests/test_name_errors.py +118 -0
  63. machine_dialect/helpers/__init__.py +0 -0
  64. machine_dialect/helpers/stopwords.py +225 -0
  65. machine_dialect/helpers/validators.py +30 -0
  66. machine_dialect/lexer/__init__.py +9 -0
  67. machine_dialect/lexer/constants.py +23 -0
  68. machine_dialect/lexer/lexer.py +907 -0
  69. machine_dialect/lexer/tests/__init__.py +0 -0
  70. machine_dialect/lexer/tests/helpers.py +86 -0
  71. machine_dialect/lexer/tests/test_apostrophe_identifiers.py +122 -0
  72. machine_dialect/lexer/tests/test_backtick_identifiers.py +140 -0
  73. machine_dialect/lexer/tests/test_boolean_literals.py +108 -0
  74. machine_dialect/lexer/tests/test_case_insensitive_keywords.py +188 -0
  75. machine_dialect/lexer/tests/test_comments.py +200 -0
  76. machine_dialect/lexer/tests/test_double_asterisk_keywords.py +127 -0
  77. machine_dialect/lexer/tests/test_lexer_position.py +113 -0
  78. machine_dialect/lexer/tests/test_list_tokens.py +282 -0
  79. machine_dialect/lexer/tests/test_stopwords.py +80 -0
  80. machine_dialect/lexer/tests/test_strict_equality.py +129 -0
  81. machine_dialect/lexer/tests/test_token.py +41 -0
  82. machine_dialect/lexer/tests/test_tokenization.py +294 -0
  83. machine_dialect/lexer/tests/test_underscore_literals.py +343 -0
  84. machine_dialect/lexer/tests/test_url_literals.py +169 -0
  85. machine_dialect/lexer/tokens.py +487 -0
  86. machine_dialect/linter/__init__.py +10 -0
  87. machine_dialect/linter/__main__.py +144 -0
  88. machine_dialect/linter/linter.py +154 -0
  89. machine_dialect/linter/rules/__init__.py +8 -0
  90. machine_dialect/linter/rules/base.py +112 -0
  91. machine_dialect/linter/rules/statement_termination.py +99 -0
  92. machine_dialect/linter/tests/__init__.py +1 -0
  93. machine_dialect/linter/tests/mdrules/__init__.py +0 -0
  94. machine_dialect/linter/tests/mdrules/test_md101_statement_termination.py +181 -0
  95. machine_dialect/linter/tests/test_linter.py +81 -0
  96. machine_dialect/linter/tests/test_rules.py +110 -0
  97. machine_dialect/linter/tests/test_violations.py +71 -0
  98. machine_dialect/linter/violations.py +51 -0
  99. machine_dialect/mir/__init__.py +69 -0
  100. machine_dialect/mir/analyses/__init__.py +20 -0
  101. machine_dialect/mir/analyses/alias_analysis.py +315 -0
  102. machine_dialect/mir/analyses/dominance_analysis.py +49 -0
  103. machine_dialect/mir/analyses/escape_analysis.py +286 -0
  104. machine_dialect/mir/analyses/loop_analysis.py +272 -0
  105. machine_dialect/mir/analyses/tests/test_type_analysis.py +736 -0
  106. machine_dialect/mir/analyses/type_analysis.py +448 -0
  107. machine_dialect/mir/analyses/use_def_chains.py +232 -0
  108. machine_dialect/mir/basic_block.py +385 -0
  109. machine_dialect/mir/dataflow.py +445 -0
  110. machine_dialect/mir/debug_info.py +208 -0
  111. machine_dialect/mir/hir_to_mir.py +1738 -0
  112. machine_dialect/mir/mir_dumper.py +366 -0
  113. machine_dialect/mir/mir_function.py +167 -0
  114. machine_dialect/mir/mir_instructions.py +1877 -0
  115. machine_dialect/mir/mir_interpreter.py +556 -0
  116. machine_dialect/mir/mir_module.py +225 -0
  117. machine_dialect/mir/mir_printer.py +480 -0
  118. machine_dialect/mir/mir_transformer.py +410 -0
  119. machine_dialect/mir/mir_types.py +367 -0
  120. machine_dialect/mir/mir_validation.py +455 -0
  121. machine_dialect/mir/mir_values.py +268 -0
  122. machine_dialect/mir/optimization_config.py +233 -0
  123. machine_dialect/mir/optimization_pass.py +251 -0
  124. machine_dialect/mir/optimization_pipeline.py +355 -0
  125. machine_dialect/mir/optimizations/__init__.py +84 -0
  126. machine_dialect/mir/optimizations/algebraic_simplification.py +733 -0
  127. machine_dialect/mir/optimizations/branch_prediction.py +372 -0
  128. machine_dialect/mir/optimizations/constant_propagation.py +634 -0
  129. machine_dialect/mir/optimizations/cse.py +398 -0
  130. machine_dialect/mir/optimizations/dce.py +288 -0
  131. machine_dialect/mir/optimizations/inlining.py +551 -0
  132. machine_dialect/mir/optimizations/jump_threading.py +487 -0
  133. machine_dialect/mir/optimizations/licm.py +405 -0
  134. machine_dialect/mir/optimizations/loop_unrolling.py +366 -0
  135. machine_dialect/mir/optimizations/strength_reduction.py +422 -0
  136. machine_dialect/mir/optimizations/tail_call.py +207 -0
  137. machine_dialect/mir/optimizations/tests/test_loop_unrolling.py +483 -0
  138. machine_dialect/mir/optimizations/type_narrowing.py +397 -0
  139. machine_dialect/mir/optimizations/type_specialization.py +447 -0
  140. machine_dialect/mir/optimizations/type_specific.py +906 -0
  141. machine_dialect/mir/optimize_mir.py +89 -0
  142. machine_dialect/mir/pass_manager.py +391 -0
  143. machine_dialect/mir/profiling/__init__.py +26 -0
  144. machine_dialect/mir/profiling/profile_collector.py +318 -0
  145. machine_dialect/mir/profiling/profile_data.py +372 -0
  146. machine_dialect/mir/profiling/profile_reader.py +272 -0
  147. machine_dialect/mir/profiling/profile_writer.py +226 -0
  148. machine_dialect/mir/register_allocation.py +302 -0
  149. machine_dialect/mir/reporting/__init__.py +17 -0
  150. machine_dialect/mir/reporting/optimization_reporter.py +314 -0
  151. machine_dialect/mir/reporting/report_formatter.py +289 -0
  152. machine_dialect/mir/ssa_construction.py +342 -0
  153. machine_dialect/mir/tests/__init__.py +1 -0
  154. machine_dialect/mir/tests/test_algebraic_associativity.py +204 -0
  155. machine_dialect/mir/tests/test_algebraic_complex_patterns.py +221 -0
  156. machine_dialect/mir/tests/test_algebraic_division.py +126 -0
  157. machine_dialect/mir/tests/test_algebraic_simplification.py +863 -0
  158. machine_dialect/mir/tests/test_basic_block.py +425 -0
  159. machine_dialect/mir/tests/test_branch_prediction.py +459 -0
  160. machine_dialect/mir/tests/test_call_lowering.py +168 -0
  161. machine_dialect/mir/tests/test_collection_lowering.py +604 -0
  162. machine_dialect/mir/tests/test_cross_block_constant_propagation.py +255 -0
  163. machine_dialect/mir/tests/test_custom_passes.py +166 -0
  164. machine_dialect/mir/tests/test_debug_info.py +285 -0
  165. machine_dialect/mir/tests/test_dict_extraction_lowering.py +192 -0
  166. machine_dialect/mir/tests/test_dictionary_lowering.py +299 -0
  167. machine_dialect/mir/tests/test_double_negation.py +231 -0
  168. machine_dialect/mir/tests/test_escape_analysis.py +233 -0
  169. machine_dialect/mir/tests/test_hir_to_mir.py +465 -0
  170. machine_dialect/mir/tests/test_hir_to_mir_complete.py +389 -0
  171. machine_dialect/mir/tests/test_hir_to_mir_simple.py +130 -0
  172. machine_dialect/mir/tests/test_inlining.py +435 -0
  173. machine_dialect/mir/tests/test_licm.py +472 -0
  174. machine_dialect/mir/tests/test_mir_dumper.py +313 -0
  175. machine_dialect/mir/tests/test_mir_instructions.py +445 -0
  176. machine_dialect/mir/tests/test_mir_module.py +860 -0
  177. machine_dialect/mir/tests/test_mir_printer.py +387 -0
  178. machine_dialect/mir/tests/test_mir_types.py +123 -0
  179. machine_dialect/mir/tests/test_mir_types_enhanced.py +132 -0
  180. machine_dialect/mir/tests/test_mir_validation.py +378 -0
  181. machine_dialect/mir/tests/test_mir_values.py +168 -0
  182. machine_dialect/mir/tests/test_one_based_indexing.py +202 -0
  183. machine_dialect/mir/tests/test_optimization_helpers.py +60 -0
  184. machine_dialect/mir/tests/test_optimization_pipeline.py +554 -0
  185. machine_dialect/mir/tests/test_optimization_reporter.py +318 -0
  186. machine_dialect/mir/tests/test_pass_manager.py +294 -0
  187. machine_dialect/mir/tests/test_pass_registration.py +64 -0
  188. machine_dialect/mir/tests/test_profiling.py +356 -0
  189. machine_dialect/mir/tests/test_register_allocation.py +307 -0
  190. machine_dialect/mir/tests/test_report_formatters.py +372 -0
  191. machine_dialect/mir/tests/test_ssa_construction.py +433 -0
  192. machine_dialect/mir/tests/test_tail_call.py +236 -0
  193. machine_dialect/mir/tests/test_type_annotated_instructions.py +192 -0
  194. machine_dialect/mir/tests/test_type_narrowing.py +277 -0
  195. machine_dialect/mir/tests/test_type_specialization.py +421 -0
  196. machine_dialect/mir/tests/test_type_specific_optimization.py +545 -0
  197. machine_dialect/mir/tests/test_type_specific_optimization_advanced.py +382 -0
  198. machine_dialect/mir/type_inference.py +368 -0
  199. machine_dialect/parser/__init__.py +12 -0
  200. machine_dialect/parser/enums.py +45 -0
  201. machine_dialect/parser/parser.py +3655 -0
  202. machine_dialect/parser/protocols.py +11 -0
  203. machine_dialect/parser/symbol_table.py +169 -0
  204. machine_dialect/parser/tests/__init__.py +0 -0
  205. machine_dialect/parser/tests/helper_functions.py +193 -0
  206. machine_dialect/parser/tests/test_action_statements.py +334 -0
  207. machine_dialect/parser/tests/test_boolean_literal_expressions.py +152 -0
  208. machine_dialect/parser/tests/test_call_statements.py +154 -0
  209. machine_dialect/parser/tests/test_call_statements_errors.py +187 -0
  210. machine_dialect/parser/tests/test_collection_mutations.py +264 -0
  211. machine_dialect/parser/tests/test_conditional_expressions.py +343 -0
  212. machine_dialect/parser/tests/test_define_integration.py +468 -0
  213. machine_dialect/parser/tests/test_define_statements.py +311 -0
  214. machine_dialect/parser/tests/test_dict_extraction.py +115 -0
  215. machine_dialect/parser/tests/test_empty_literal.py +155 -0
  216. machine_dialect/parser/tests/test_float_literal_expressions.py +163 -0
  217. machine_dialect/parser/tests/test_identifier_expressions.py +57 -0
  218. machine_dialect/parser/tests/test_if_empty_block.py +61 -0
  219. machine_dialect/parser/tests/test_if_statements.py +299 -0
  220. machine_dialect/parser/tests/test_illegal_tokens.py +86 -0
  221. machine_dialect/parser/tests/test_infix_expressions.py +680 -0
  222. machine_dialect/parser/tests/test_integer_literal_expressions.py +137 -0
  223. machine_dialect/parser/tests/test_interaction_statements.py +269 -0
  224. machine_dialect/parser/tests/test_list_literals.py +277 -0
  225. machine_dialect/parser/tests/test_no_none_in_ast.py +94 -0
  226. machine_dialect/parser/tests/test_panic_mode_recovery.py +171 -0
  227. machine_dialect/parser/tests/test_parse_errors.py +114 -0
  228. machine_dialect/parser/tests/test_possessive_syntax.py +182 -0
  229. machine_dialect/parser/tests/test_prefix_expressions.py +415 -0
  230. machine_dialect/parser/tests/test_program.py +13 -0
  231. machine_dialect/parser/tests/test_return_statements.py +89 -0
  232. machine_dialect/parser/tests/test_set_statements.py +152 -0
  233. machine_dialect/parser/tests/test_strict_equality.py +258 -0
  234. machine_dialect/parser/tests/test_symbol_table.py +217 -0
  235. machine_dialect/parser/tests/test_url_literal_expressions.py +209 -0
  236. machine_dialect/parser/tests/test_utility_statements.py +423 -0
  237. machine_dialect/parser/token_buffer.py +159 -0
  238. machine_dialect/repl/__init__.py +3 -0
  239. machine_dialect/repl/repl.py +426 -0
  240. machine_dialect/repl/tests/__init__.py +0 -0
  241. machine_dialect/repl/tests/test_repl.py +606 -0
  242. machine_dialect/semantic/__init__.py +12 -0
  243. machine_dialect/semantic/analyzer.py +906 -0
  244. machine_dialect/semantic/error_messages.py +189 -0
  245. machine_dialect/semantic/tests/__init__.py +1 -0
  246. machine_dialect/semantic/tests/test_analyzer.py +364 -0
  247. machine_dialect/semantic/tests/test_error_messages.py +104 -0
  248. machine_dialect/tests/edge_cases/__init__.py +10 -0
  249. machine_dialect/tests/edge_cases/test_boundary_access.py +256 -0
  250. machine_dialect/tests/edge_cases/test_empty_collections.py +166 -0
  251. machine_dialect/tests/edge_cases/test_invalid_operations.py +243 -0
  252. machine_dialect/tests/edge_cases/test_named_list_edge_cases.py +295 -0
  253. machine_dialect/tests/edge_cases/test_nested_structures.py +313 -0
  254. machine_dialect/tests/edge_cases/test_type_mixing.py +277 -0
  255. machine_dialect/tests/integration/test_array_operations_emulation.py +248 -0
  256. machine_dialect/tests/integration/test_list_compilation.py +395 -0
  257. machine_dialect/tests/integration/test_lists_and_dictionaries.py +322 -0
  258. machine_dialect/type_checking/__init__.py +21 -0
  259. machine_dialect/type_checking/tests/__init__.py +1 -0
  260. machine_dialect/type_checking/tests/test_type_system.py +230 -0
  261. machine_dialect/type_checking/type_system.py +270 -0
  262. machine_dialect-0.1.0a1.dist-info/METADATA +128 -0
  263. machine_dialect-0.1.0a1.dist-info/RECORD +268 -0
  264. machine_dialect-0.1.0a1.dist-info/WHEEL +5 -0
  265. machine_dialect-0.1.0a1.dist-info/entry_points.txt +3 -0
  266. machine_dialect-0.1.0a1.dist-info/licenses/LICENSE +201 -0
  267. machine_dialect-0.1.0a1.dist-info/top_level.txt +2 -0
  268. machine_dialect_vm/__init__.pyi +15 -0
@@ -0,0 +1,621 @@
1
+ """Proper bytecode serializer for the Rust VM with constant pool remapping.
2
+
3
+ This serializer correctly handles individual instruction parsing and remaps
4
+ constant indices when merging multiple chunks into a single module.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import struct
10
+ from dataclasses import dataclass, field
11
+ from io import BytesIO
12
+ from typing import Any, BinaryIO
13
+
14
+ from machine_dialect.codegen.bytecode_module import BytecodeModule, ConstantTag
15
+ from machine_dialect.codegen.opcodes import Opcode
16
+
17
+ # =============================================================================
18
+ # Deduplication and Mapping Support
19
+ # =============================================================================
20
+
21
+
22
@dataclass
class DeduplicationStats:
    """Counters describing the outcome of constant-pool deduplication."""

    # Constants seen across all chunks before merging.
    original_count: int
    # Entries remaining in the merged (global) pool.
    deduped_count: int
    # Sum of the estimated sizes of every constant that was dropped as a duplicate.
    bytes_saved: int
    # Dedup key -> chunk index recorded for each repeated occurrence of that key.
    duplicate_chains: dict[tuple[Any, ...], list[int]] = field(default_factory=dict)
30
+
31
+
32
@dataclass
class ConstantMapping:
    """Result of merging per-chunk constant pools into one global pool."""

    # One dict per chunk, in chunk order: local constant index -> global index.
    chunk_mappings: list[dict[int, int]]

    # The merged pool of (tag, value) pairs after deduplication.
    global_constants: list[tuple[ConstantTag, Any]]

    # Deduplication counters, kept for debugging/reporting.
    stats: DeduplicationStats
44
+
45
+
46
+ # =============================================================================
47
+ # Error Handling
48
+ # =============================================================================
49
+
50
+
51
class RemappingError(Exception):
    """Root of the exception hierarchy raised during constant remapping."""
55
+
56
+
57
class InvalidBytecodeError(RemappingError):
    """Signals malformed bytecode, optionally locating it by offset and chunk."""

    def __init__(self, message: str, offset: int | None = None, chunk_idx: int | None = None):
        # Each location suffix is only added when the corresponding value is known.
        where = "" if offset is None else f" at offset {offset:#x}"
        which = "" if chunk_idx is None else f" in chunk {chunk_idx}"
        super().__init__(f"{message}{where}{which}")
        self.offset = offset
        self.chunk_idx = chunk_idx
68
+
69
+
70
class ConstantIndexError(RemappingError):
    """Signals a constant index that falls outside the valid range."""

    def __init__(self, idx: int, max_idx: int, chunk_idx: int | None = None, offset: int | None = None):
        # Location details are appended only when they were supplied.
        which = "" if chunk_idx is None else f" in chunk {chunk_idx}"
        where = "" if offset is None else f" at instruction offset {offset:#x}"
        super().__init__(f"Invalid constant index {idx} (max: {max_idx}){which}{where}")
        self.index = idx
        self.max_index = max_idx
        self.chunk_idx = chunk_idx
        self.offset = offset
83
+
84
+
85
+ # =============================================================================
86
+ # Instruction Format Definitions
87
+ # =============================================================================
88
+
89
+
90
@dataclass
class InstructionFormat:
    """Static layout description of one bytecode instruction."""

    # Numeric opcode byte.
    opcode: int
    # Mnemonic used in messages.
    name: str
    # Whole-instruction size in bytes, opcode included; -1 marks a variable-size instruction.
    size: int
    # True when an operand is an index that must be remapped.
    has_const_operand: bool
    # ``struct`` format characters describing the operands.
    operand_format: str
99
+
100
+
101
+ # Instruction format definitions
102
# Opcode byte -> layout. Each entry is keyed by its own ``opcode`` field so the
# number is written only once per instruction.
INSTRUCTION_FORMATS = {
    spec.opcode: spec
    for spec in (
        # Loads, moves and globals
        InstructionFormat(0x00, "LOAD_CONST_R", 4, True, "BH"),
        InstructionFormat(0x01, "MOVE_R", 3, False, "BB"),
        InstructionFormat(0x02, "LOAD_GLOBAL_R", 4, True, "BH"),
        InstructionFormat(0x03, "STORE_GLOBAL_R", 4, True, "BH"),
        # Arithmetic
        InstructionFormat(0x07, "ADD_R", 4, False, "BBB"),
        InstructionFormat(0x08, "SUB_R", 4, False, "BBB"),
        InstructionFormat(0x09, "MUL_R", 4, False, "BBB"),
        InstructionFormat(0x0A, "DIV_R", 4, False, "BBB"),
        InstructionFormat(0x0B, "MOD_R", 4, False, "BBB"),
        InstructionFormat(0x0C, "NEG_R", 3, False, "BB"),
        # Logical
        InstructionFormat(0x0D, "NOT_R", 3, False, "BB"),
        InstructionFormat(0x0E, "AND_R", 4, False, "BBB"),
        InstructionFormat(0x0F, "OR_R", 4, False, "BBB"),
        # Comparisons
        InstructionFormat(0x10, "EQ_R", 4, False, "BBB"),
        InstructionFormat(0x11, "NEQ_R", 4, False, "BBB"),
        InstructionFormat(0x12, "LT_R", 4, False, "BBB"),
        InstructionFormat(0x13, "GT_R", 4, False, "BBB"),
        InstructionFormat(0x14, "LTE_R", 4, False, "BBB"),
        InstructionFormat(0x15, "GTE_R", 4, False, "BBB"),
        # Control flow
        InstructionFormat(0x16, "JUMP_R", 5, False, "i"),
        InstructionFormat(0x17, "JUMP_IF_R", 6, False, "Bi"),
        InstructionFormat(0x18, "JUMP_IF_NOT_R", 6, False, "Bi"),
        InstructionFormat(0x19, "CALL_R", -1, False, ""),  # Variable size
        InstructionFormat(0x1A, "RETURN_R", -1, False, ""),  # Variable size
        # MIR support
        InstructionFormat(0x1B, "PHI_R", -1, False, ""),  # Variable size
        InstructionFormat(0x1C, "ASSERT_R", 5, True, "BBH"),  # reg + assert_type + msg_idx
        InstructionFormat(0x1D, "SCOPE_ENTER_R", 3, False, "H"),
        InstructionFormat(0x1E, "SCOPE_EXIT_R", 3, False, "H"),
        # String operations
        InstructionFormat(0x1F, "CONCAT_STR_R", 4, False, "BBB"),
        InstructionFormat(0x20, "STR_LEN_R", 3, False, "BB"),
        # Arrays
        InstructionFormat(0x21, "NEW_ARRAY_R", 3, False, "BB"),
        InstructionFormat(0x22, "ARRAY_GET_R", 4, False, "BBB"),
        InstructionFormat(0x23, "ARRAY_SET_R", 4, False, "BBB"),
        InstructionFormat(0x24, "ARRAY_LEN_R", 3, False, "BB"),
        # Debug
        InstructionFormat(0x25, "DEBUG_PRINT", 2, False, "B"),
        InstructionFormat(0x26, "BREAKPOINT", 1, False, ""),
    )
}
148
+
149
+
150
+ # =============================================================================
151
+ # Helper Functions
152
+ # =============================================================================
153
+
154
+
155
class _NegativeZero:
    """Hashable marker that stands in for the float ``-0.0`` in dedup keys."""

    __slots__ = ()

    def __repr__(self) -> str:
        return "-0.0"


_NEGATIVE_ZERO = _NegativeZero()


def make_hashable(value: Any) -> Any:
    """Convert ``value`` into a hashable form usable as a deduplication key.

    Args:
        value: A constant value; scalars, bytes, lists and dicts are handled
            structurally, anything else falls back to ``str(value)``.

    Returns:
        * scalars and ``bytes`` unchanged, except ``-0.0`` which becomes a
          dedicated marker: ``0.0 == -0.0`` and both hash alike, so returning
          the float itself would make the two constants share one pool slot;
        * lists as tuples of converted elements;
        * dicts as a tuple of ``(key, converted value)`` pairs in a
          deterministic order.
    """
    import math  # local import: math is not among this module's top-level imports

    if isinstance(value, float) and value == 0.0 and math.copysign(1.0, value) < 0.0:
        return _NEGATIVE_ZERO
    if value is None or isinstance(value, (int, float, str, bool, bytes)):
        return value
    if isinstance(value, list):
        return tuple(make_hashable(item) for item in value)
    if isinstance(value, dict):
        pairs = [(key, make_hashable(item)) for key, item in value.items()]
        try:
            # Natural key order whenever the keys are mutually comparable.
            ordered = sorted(pairs)
        except TypeError:
            # Mixed key types (e.g. int and str) cannot be compared directly;
            # order them by type name and repr so the result stays deterministic.
            ordered = sorted(pairs, key=lambda pair: (type(pair[0]).__qualname__, repr(pair[0])))
        return tuple(ordered)
    # Fallback: use string representation
    return str(value)
168
+
169
+
170
def build_constant_mapping(module: BytecodeModule) -> ConstantMapping:
    """Merge every chunk's constants into one deduplicated global pool.

    Args:
        module: Module whose ``chunks`` each expose ``constants`` as
            ``(tag, value)`` pairs.

    Returns:
        A ConstantMapping holding, per chunk, the local-to-global index table,
        plus the merged pool and deduplication statistics.
    """
    pool: list[tuple[ConstantTag, Any]] = []
    per_chunk: list[dict[int, int]] = []
    # (tag, hashable value) -> index of the first occurrence in ``pool``
    index_of: dict[tuple[Any, ...], int] = {}
    chains: dict[tuple[Any, ...], list[int]] = {}
    seen = 0
    saved = 0

    for chunk_idx, chunk in enumerate(module.chunks):
        remap: dict[int, int] = {}

        for local_idx, (tag, value) in enumerate(chunk.constants):
            seen += 1
            key = (tag, make_hashable(value))

            global_idx = index_of.get(key)
            if global_idx is None:
                # First time this constant is seen: give it a new global slot.
                global_idx = len(pool)
                pool.append((tag, value))
                index_of[key] = global_idx
            else:
                # Repeat: reuse the existing slot and record the saving.
                saved += estimate_constant_size(tag, value)
                chains.setdefault(key, []).append(chunk_idx)

            remap[local_idx] = global_idx

        per_chunk.append(remap)

    return ConstantMapping(
        chunk_mappings=per_chunk,
        global_constants=pool,
        stats=DeduplicationStats(
            original_count=seen,
            deduped_count=len(pool),
            bytes_saved=saved,
            duplicate_chains=chains,
        ),
    )
218
+
219
+
220
def estimate_constant_size(tag: ConstantTag, value: Any) -> int:
    """Return the estimated number of bytes a constant occupies, tag byte included.

    Args:
        tag: Kind of constant.
        value: The constant's value; only inspected for strings.
    """
    tag_byte = 1

    if tag == ConstantTag.INT:
        return tag_byte + 8  # i64 payload
    if tag == ConstantTag.FLOAT:
        return tag_byte + 8  # f64 payload
    if tag == ConstantTag.STRING:
        encoded = value.encode("utf-8")
        return tag_byte + 4 + len(encoded)  # 4-byte length prefix, then the data
    if tag == ConstantTag.BOOL:
        return tag_byte + 1
    if tag == ConstantTag.EMPTY:
        return tag_byte  # no payload
    # Any other tag is counted as the tag byte alone.
    return tag_byte
233
+
234
+
235
def get_instruction_size(opcode: int, bytecode: bytes, offset: int) -> int:
    """Return the size in bytes of the instruction starting at ``offset``.

    Args:
        opcode: Opcode byte found at ``offset``.
        bytecode: Buffer containing the instruction.
        offset: Position of the opcode byte inside ``bytecode``.

    Returns:
        The fixed size from INSTRUCTION_FORMATS, or for variable-size
        instructions the size derived from their count/flag byte. Opcodes
        missing from the table are reported as a single byte.

        When a variable-size instruction's header is cut off by the end of the
        buffer, the minimum header size is returned. The result then reaches
        past the end of ``bytecode``, so a caller that checks
        ``offset + size > len(bytecode)`` sees the truncation instead of
        treating the lone opcode byte as a complete instruction.
    """
    fmt = INSTRUCTION_FORMATS.get(opcode)
    if not fmt:
        return 1  # Unknown opcode, skip single byte

    if fmt.size > 0:
        return fmt.size

    # Bytes available from the opcode byte to the end of the buffer.
    available = len(bytecode) - offset

    if opcode == Opcode.CALL_R:
        # Format: opcode + func + dst + num_args + args...
        if available < 4:
            return 4
        return 4 + bytecode[offset + 3]

    if opcode == Opcode.PHI_R:
        # Format: opcode + dst + num_sources + (src + block_id) * num_sources
        if available < 3:
            return 3
        return 3 + bytecode[offset + 2] * 3  # Each source is reg(1) + block_id(2)

    if opcode == Opcode.RETURN_R:
        # Format: opcode + has_value + [src]
        if available < 2:
            return 2
        return 3 if bytecode[offset + 1] else 2

    return 1
268
+
269
+
270
+ # =============================================================================
271
+ # Bytecode Remapper
272
+ # =============================================================================
273
+
274
+
275
class BytecodeRemapper:
    """Rewrites constant-index operands in bytecode according to a ConstantMapping."""

    def __init__(self, mapping: ConstantMapping):
        self.mapping = mapping

    def remap_chunk(self, chunk_index: int, bytecode: bytes) -> bytes:
        """Return ``bytecode`` with the constant operands of one chunk translated.

        The input is returned unchanged when the mapping has no table for
        ``chunk_index`` or that table is empty.

        Raises:
            InvalidBytecodeError: An instruction extends past the end of
                ``bytecode``. Errors from ``remap_instruction`` propagate too.
        """
        tables = self.mapping.chunk_mappings
        local_to_global = tables[chunk_index] if chunk_index < len(tables) else None
        if not local_to_global:
            # Nothing to translate for this chunk.
            return bytecode

        out = bytearray()
        total = len(bytecode)
        pos = 0

        while pos < total:
            opcode = bytecode[pos]
            size = get_instruction_size(opcode, bytecode, pos)
            end = pos + size

            if end > total:
                raise InvalidBytecodeError(
                    f"Truncated instruction (opcode {opcode:#x}, expected {size} bytes)",
                    offset=pos,
                    chunk_idx=chunk_index,
                )

            instruction = bytecode[pos:end]
            fmt = INSTRUCTION_FORMATS.get(opcode)

            if fmt and fmt.has_const_operand:
                # Keep the opcode byte, replace the operands with remapped ones.
                out.append(opcode)
                out.extend(
                    self.remap_instruction(
                        opcode, instruction[1:], local_to_global, chunk_idx=chunk_index, offset=pos
                    )
                )
            else:
                out.extend(instruction)

            pos = end

        return bytes(out)

    def remap_instruction(
        self,
        opcode: int,
        operands: bytes,
        chunk_map: dict[int, int],
        chunk_idx: int | None = None,
        offset: int | None = None,
    ) -> bytes:
        """Return the operand bytes of one instruction with its index translated.

        Args:
            opcode: Opcode of the instruction the operands belong to.
            operands: Instruction bytes after the opcode byte.
            chunk_map: Local-to-global index table; indices missing from it
                are left as they are.
            chunk_idx: Chunk number, used only in error reports.
            offset: Instruction offset, used only in error reports.

        Raises:
            InvalidBytecodeError: The operands are shorter than the format needs.
            ConstantIndexError: An ASSERT_R message index lies outside the
                global constant pool after translation.
        """
        if opcode == Opcode.LOAD_CONST_R:
            # Layout: dst_reg(u8) + const_idx(u16, little-endian)
            if len(operands) < 3:
                raise InvalidBytecodeError("LOAD_CONST_R operands too short", offset=offset, chunk_idx=chunk_idx)
            (local_idx,) = struct.unpack_from("<H", operands, 1)
            translated = chunk_map.get(local_idx, local_idx)
            return bytes(operands[:1]) + struct.pack("<H", translated)

        if opcode in (Opcode.LOAD_GLOBAL_R, Opcode.STORE_GLOBAL_R):
            # Layout: reg(u8) + name_idx(u16, little-endian)
            if len(operands) < 3:
                raise InvalidBytecodeError(
                    f"{INSTRUCTION_FORMATS[opcode].name} operands too short", offset=offset, chunk_idx=chunk_idx
                )
            (local_idx,) = struct.unpack_from("<H", operands, 1)
            translated = chunk_map.get(local_idx, local_idx)
            return bytes(operands[:1]) + struct.pack("<H", translated)

        if opcode == Opcode.ASSERT_R:
            # Layout: cond_reg(u8) + assert_type(u8) + msg_idx(u16, little-endian)
            if len(operands) < 4:
                raise InvalidBytecodeError("ASSERT_R operands too short", offset=offset, chunk_idx=chunk_idx)
            (local_idx,) = struct.unpack_from("<H", operands, 2)
            translated = chunk_map.get(local_idx, local_idx)

            # The message index must land inside the merged pool.
            pool_size = len(self.mapping.global_constants)
            if translated >= pool_size:
                raise ConstantIndexError(translated, pool_size - 1, chunk_idx, offset)

            return bytes(operands[:2]) + struct.pack("<H", translated)

        # Other opcodes don't reference constants
        return operands
381
+
382
+
383
def generate_remapping_report(module: BytecodeModule, mapping: ConstantMapping) -> str:
    """Render a human-readable summary of a constant-pool remapping.

    The text lists the deduplication counters, any duplicate constants, and
    for each chunk present in ``module`` its local-to-global index table.

    Args:
        module: Module whose chunk names label the per-chunk sections.
        mapping: Mapping to describe.

    Returns:
        The report as a single newline-joined string.
    """
    stats = mapping.stats
    lines = [
        "=== Constant Pool Remapping Report ===\n",
        f"Original constants: {stats.original_count}",
        f"After deduplication: {stats.deduped_count}",
        f"Bytes saved: {stats.bytes_saved}",
    ]

    if stats.original_count > 0:
        removed = stats.original_count - stats.deduped_count
        reduction = 100 * removed / stats.original_count
        lines.append(f"Reduction: {reduction:.1f}%\n")
    else:
        lines.append("Reduction: N/A (no constants)\n")

    # Duplicate constants, one line per dedup key
    if stats.duplicate_chains:
        lines.append("Duplicate constants found:")
        lines.extend(
            f" {tag}: {val} appears in chunks: {chunks}"
            for (tag, val), chunks in stats.duplicate_chains.items()
        )
        lines.append("")

    # Index tables, limited to chunks that exist in the module
    lines.append("Per-chunk remapping:")
    pool = mapping.global_constants
    for chunk_idx, chunk_map in enumerate(mapping.chunk_mappings):
        if chunk_idx >= len(module.chunks):
            continue
        lines.append(f"\nChunk {chunk_idx} ({module.chunks[chunk_idx].name}):")
        for local, global_idx in sorted(chunk_map.items()):
            # Entries pointing outside the pool are left out of the report.
            if global_idx < len(pool):
                tag, val = pool[global_idx]
                lines.append(f" [{local}] -> [{global_idx}]: {tag.name}: {val}")

    return "\n".join(lines)
419
+
420
+
421
+ # =============================================================================
422
+ # Main Serializer
423
+ # =============================================================================
424
+
425
+
426
class VMBytecodeSerializer:
    """Serializes bytecode modules for the Rust VM with constant remapping.

    Every multi-byte field is written little-endian. The emitted layout is:

    * header (28 bytes): magic ``MDBC``, u32 version, u32 flags and four u32
      section offsets (module name, constants, function table, instructions)
    * module name: u32 length + UTF-8 bytes
    * constants: u32 count, then one tagged entry per constant
    * function table: u32 count, then per function a u32 name length, the
      UTF-8 name and a u32 entry point expressed as an instruction index
    * instructions: u32 instruction count followed by the raw bytecode
    """

    @staticmethod
    def serialize(module: BytecodeModule, debug: bool = False) -> bytes:
        """Serialize a bytecode module to bytes.

        Args:
            module: BytecodeModule to serialize.
            debug: If True, print remapping report.

        Returns:
            Serialized bytecode.
        """
        buffer = BytesIO()
        VMBytecodeSerializer.write_to_stream(module, buffer, debug=debug)
        return buffer.getvalue()

    @staticmethod
    def write_to_stream(module: BytecodeModule, stream: BinaryIO, debug: bool = False) -> None:
        """Write bytecode module to a stream with constant index remapping.

        All sections are encoded in memory before the first write, so the
        header offsets are derived from the exact bytes that get emitted and
        an encoding failure does not leave a partially written stream.

        Args:
            module: BytecodeModule to serialize.
            stream: Binary stream to write to.
            debug: If True, print remapping report.

        Raises:
            RemappingError: If remapping a chunk fails; the message names the
                offending chunk and the original error is chained.
        """
        # Step 1: Build constant mapping with deduplication
        mapping = build_constant_mapping(module)

        # Print debug report if requested
        if debug:
            print(generate_remapping_report(module, mapping))

        # Step 2: Initialize remapper
        remapper = BytecodeRemapper(mapping)

        # Step 3: Remap every chunk, remembering the byte offset it starts at
        all_bytecode = bytearray()
        chunk_offsets: dict[int, int] = {}
        for i, chunk in enumerate(module.chunks):
            chunk_offsets[i] = len(all_bytecode)
            try:
                remapped = remapper.remap_chunk(i, bytes(chunk.bytecode))
            except RemappingError as e:
                # Add module context to error
                raise RemappingError(f"Failed to remap chunk '{chunk.name}': {e}") from e
            all_bytecode.extend(remapped)
        code = bytes(all_bytecode)

        # Step 4: Encode each section exactly once; sizes come from the bytes
        name_bytes = module.name.encode("utf-8")
        name_section = struct.pack("<I", len(name_bytes)) + name_bytes
        const_section = VMBytecodeSerializer._encode_constants(mapping.global_constants)
        func_section = VMBytecodeSerializer._encode_function_table(module.function_table, chunk_offsets, code)
        # The Rust loader expects the number of instructions, not bytes
        instruction_count = VMBytecodeSerializer.count_instructions(code)

        # Step 5: Section offsets, measured from the start of the stream
        header_size = 28  # 4 (magic) + 4 (version) + 4 (flags) + 16 (4 offsets)
        name_offset = header_size
        const_offset = name_offset + len(name_section)
        func_offset = const_offset + len(const_section)
        inst_section_offset = func_offset + len(func_section)

        # Step 6: Emit header followed by the sections in offset order
        stream.write(b"MDBC")  # Magic
        stream.write(struct.pack("<I", 1))  # Version
        stream.write(struct.pack("<I", 0x0001))  # Flags (little-endian)
        stream.write(struct.pack("<IIII", name_offset, const_offset, func_offset, inst_section_offset))
        stream.write(name_section)
        stream.write(const_section)
        stream.write(func_section)
        stream.write(struct.pack("<I", instruction_count))
        stream.write(code)

    @staticmethod
    def _encode_constants(constants: list[tuple]) -> bytes:
        """Encode the deduplicated constant pool section.

        Args:
            constants: ``(tag, value)`` pairs in global index order.

        Returns:
            u32 count followed by one entry per constant: a u8 tag and the
            tag-specific payload.
        """
        out = bytearray(struct.pack("<I", len(constants)))
        for tag, value in constants:
            out += struct.pack("<B", tag)
            if tag == ConstantTag.INT:
                out += struct.pack("<q", value)  # i64
            elif tag == ConstantTag.FLOAT:
                out += struct.pack("<d", value)  # f64
            elif tag == ConstantTag.STRING:
                str_bytes = value.encode("utf-8")
                out += struct.pack("<I", len(str_bytes))  # length + data
                out += str_bytes
            elif tag == ConstantTag.BOOL:
                out += struct.pack("<B", 1 if value else 0)  # u8
            # EMPTY has no data
        return bytes(out)

    @staticmethod
    def _encode_function_table(
        function_table: dict[str, int],
        chunk_offsets: dict[int, int],
        bytecode: bytes,
    ) -> bytes:
        """Encode the function table, converting chunk indices to entry points.

        Args:
            function_table: Function name -> chunk index.
            chunk_offsets: Chunk index -> byte offset of that chunk inside
                ``bytecode``.
            bytecode: Concatenated, remapped bytecode of all chunks.

        Returns:
            u32 count followed by, per function, a u32 name length, the UTF-8
            name and the u32 instruction index of its first instruction.
        """
        out = bytearray(struct.pack("<I", len(function_table)))
        for func_name, chunk_idx in function_table.items():
            func_name_bytes = func_name.encode("utf-8")
            out += struct.pack("<I", len(func_name_bytes))
            out += func_name_bytes
            # A chunk index without a recorded offset falls back to offset 0,
            # i.e. the start of the bytecode.
            byte_offset = chunk_offsets.get(chunk_idx, 0)
            # Convert byte offset to instruction offset by counting the
            # instructions that precede the chunk.
            entry_index = VMBytecodeSerializer.count_instructions(bytecode[:byte_offset])
            out += struct.pack("<I", entry_index)
        return bytes(out)

    @staticmethod
    def count_instructions(bytecode: bytes) -> int:
        """Count the number of instructions in bytecode.

        Args:
            bytecode: Raw bytecode bytes.

        Returns:
            Number of instructions.
        """
        count = 0
        pos = 0
        end = len(bytecode)

        while pos < end:
            # Instruction width is decided by the get_instruction_size helper
            pos += get_instruction_size(bytecode[pos], bytecode, pos)
            count += 1

        return count

    @staticmethod
    def parse_instructions(bytecode: bytes, const_base: int = 0) -> list[bytes]:
        """Parse bytecode into individual instructions.

        DEPRECATED: This method is kept for backward compatibility but
        the new remapping approach is used in write_to_stream.

        Args:
            bytecode: Raw bytecode bytes.
            const_base: Base offset for constant indices.

        Returns:
            List of individual instruction bytes.
        """
        instructions: list[bytes] = []
        pos = 0
        end = len(bytecode)

        while pos < end:
            opcode = bytecode[pos]
            inst_size = get_instruction_size(opcode, bytecode, pos)
            inst = bytecode[pos : pos + inst_size]

            # Legacy remapping for LOAD_CONST_R only: shift the u16 constant
            # index stored at bytes 2-3 of the instruction by const_base.
            if const_base > 0 and opcode == Opcode.LOAD_CONST_R:
                patched = bytearray(inst)
                (old_idx,) = struct.unpack_from("<H", inst, 2)
                struct.pack_into("<H", patched, 2, old_idx + const_base)
                inst = bytes(patched)

            instructions.append(inst)
            pos += inst_size

        return instructions
@@ -0,0 +1,18 @@
1
+ """Compiler module for Machine Dialect™.
2
+
3
+ This module provides the main compilation infrastructure for Machine Dialect™,
4
+ organizing the compilation process into clear phases and providing a unified
5
+ interface for compilation.
6
+ """
7
+
8
+ from machine_dialect.compiler.compiler import Compiler
9
+ from machine_dialect.compiler.config import CompilerConfig
10
+ from machine_dialect.compiler.context import CompilationContext
11
+ from machine_dialect.compiler.pipeline import CompilationPipeline
12
+
13
+ __all__ = [
14
+ "CompilationContext",
15
+ "CompilationPipeline",
16
+ "Compiler",
17
+ "CompilerConfig",
18
+ ]