parsanol 1.2.2 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,252 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'parsanol/native/transformer'
4
+
5
module Parsanol
  module Native
    # Decodes flat u64 arrays from the Rust batch parser into a Ruby AST.
    #
    # The batch format is a stream of tagged u64 words:
    #   - 0x00                   = nil
    #   - 0x01 + value           = bool (0 is false, anything else is true)
    #   - 0x02 + value           = int (signed i64 stored as u64)
    #   - 0x03 + bits            = float (IEEE 754 bits)
    #   - 0x04 + offset + length = input string reference (byte offset/length)
    #   - 0x05 ... 0x06          = array (start ... end)
    #   - 0x07 ... 0x08          = hash (start ... end)
    #   - 0x09 + len + data...   = hash key (only valid inside a hash)
    #   - 0x0A + len + data...   = inline string
    #   - 0x0B + len + data...   = symbol (same payload layout as inline string)
    #   - 0x0C + value           = one value wrapped as [:repetition, value]
    #   - 0x0D + value           = one value wrapped as [:sequence, value]
    #
    # Inline payloads pack `len` bytes into ceil(len / 8) words, lowest byte
    # of each word first.
    module BatchDecoder
      TAG_NIL = 0x00
      TAG_BOOL = 0x01
      TAG_INT = 0x02
      TAG_FLOAT = 0x03
      TAG_STRING = 0x04
      TAG_ARRAY_START = 0x05
      TAG_ARRAY_END = 0x06
      TAG_HASH_START = 0x07
      TAG_HASH_END = 0x08
      TAG_HASH_KEY = 0x09
      TAG_INLINE_STRING = 0x0A
      TAG_SYMBOL = 0x0B
      TAG_REPETITION = 0x0C
      TAG_SEQUENCE = 0x0D

      # Per-call cursor over a batch array.
      #
      # All decoding state lives in an instance of this class rather than on
      # the BatchDecoder module, so nested or concurrent decode calls cannot
      # clobber each other's position and no input is retained after a call.
      class Reader
        # @param data [Array<Integer>] Flat u64 array from batch parser
        # @param input [String] Original input string (for Slice references)
        # @param slice_class [Class] Class instantiated as new(offset, content, input)
        def initialize(data, input, slice_class)
          @data = data
          @input = input
          @slice_class = slice_class
          @pos = 0
        end

        # Decode the value starting at the current position and advance past it.
        #
        # @return [Object] nil, boolean, Integer, Float, String, Symbol, Array,
        #   Hash or an instance of the slice class
        # @raise [RuntimeError] on an unknown tag or when the data ends early
        def decode_value
          tag_pos = @pos
          tag = next_word

          case tag
          when TAG_NIL
            nil
          when TAG_BOOL
            next_word != 0
          when TAG_INT
            to_signed(next_word)
          when TAG_FLOAT
            # Reinterpret the 64 raw bits as a native double.
            [next_word].pack('Q').unpack1('D')
          when TAG_STRING
            offset = next_word
            length = next_word
            build_slice(offset, length)
          when TAG_INLINE_STRING
            read_inline_string
          when TAG_SYMBOL
            read_inline_string.to_sym
          when TAG_REPETITION
            # compact drops a nil payload, leaving just [:repetition].
            [:repetition, decode_value].compact
          when TAG_SEQUENCE
            [:sequence, decode_value].compact
          when TAG_ARRAY_START
            read_array
          when TAG_HASH_START
            read_hash
          else
            raise "Unknown tag: #{tag} at position #{tag_pos}"
          end
        end

        private

        # Fetch the word at the cursor and advance.
        # A nil word means the stream ended in the middle of a value.
        def next_word
          word = @data[@pos]
          raise "Unexpected end of batch data at position #{@pos}" if word.nil?

          @pos += 1
          word
        end

        # Reinterpret a u64 word as a two's-complement signed 64-bit integer.
        def to_signed(word)
          word >= 0x8000_0000_0000_0000 ? word - 0x1_0000_0000_0000_0000 : word
        end

        # Read values until TAG_ARRAY_END, then consume the end tag.
        def read_array
          items = []
          # If the data is truncated, decode_value raises, so this terminates.
          items << decode_value until @data[@pos] == TAG_ARRAY_END
          @pos += 1 # consume TAG_ARRAY_END
          items
        end

        # Read key/value pairs until TAG_HASH_END, then consume the end tag.
        # Keys are symbolized verbatim (original casing is kept); a repeated
        # key overwrites the earlier entry.
        def read_hash
          result = {}
          loop do
            tag = @data[@pos]
            break if tag == TAG_HASH_END
            raise "Expected TAG_HASH_KEY, got #{tag}" unless tag == TAG_HASH_KEY

            @pos += 1 # consume TAG_HASH_KEY
            key = read_inline_string.to_sym
            result[key] = decode_value
          end
          @pos += 1 # consume TAG_HASH_END
          result
        end

        # Read a length word followed by the packed bytes of a string.
        #
        # @return [String] UTF-8 tagged string of exactly `len` bytes
        def read_inline_string
          len = next_word
          word_count = (len + 7) / 8
          words = @data[@pos, word_count]
          if words.nil? || words.length < word_count
            raise "Unexpected end of batch data at position #{@pos} (inline string of #{len} bytes)"
          end

          @pos += word_count
          # 'Q<' emits each word lowest byte first; byteslice trims the padding
          # bytes of the final word.
          words.pack('Q<*').byteslice(0, len).force_encoding('UTF-8')
        end

        # Build a slice for `length` bytes of the input starting at byte `offset`.
        # Content is nil when the offset lies outside the input.
        def build_slice(offset, length)
          content = @input.byteslice(offset, length)&.force_encoding('UTF-8')
          @slice_class.new(offset, content, @input)
        end
      end
      private_constant :Reader

      class << self
        # Decode a flat u64 array into Ruby AST with Slice objects
        #
        # @param data [Array<Integer>] Flat u64 array from batch parser
        # @param input [String] Original input string (for Slice references)
        # @param slice_class [Class] The Slice class to use
        # @return [Object] Ruby AST (Hash, Array, Slice, etc.)
        # @raise [RuntimeError] if the data holds an unknown tag or ends early
        def decode(data, input, slice_class)
          Reader.new(data, input, slice_class).decode_value
        end

        # Decode batch format to Ruby AST and apply transformation.
        #
        # The result is always passed through AstTransformer.transform; batch
        # data is decoded first, any other value is handed over unchanged.
        #
        # @param data [Array<Integer>|Object] Either flat u64 array from batch parser OR
        #   pre-decoded Ruby value from _parse_raw
        # @param input [String] Original input string (for Slice references)
        # @param slice_class [Class] The Slice class to use
        # @param grammar_atom [Parsanol::Atoms::Base] The grammar atom (unused, kept for API compat)
        # @return [Object] Transformed Ruby AST
        def decode_and_flatten(data, input, slice_class, grammar_atom)
          # An array whose first element is an Integer is taken to be batch data.
          # NOTE(review): a bare Integer is also routed through decode, where it
          # is indexed bit by bit - confirm whether callers ever pass one.
          batch = data.is_a?(Integer) || (data.is_a?(Array) && data.first.is_a?(Integer))
          ast = batch ? decode(data, input, slice_class) : data
          AstTransformer.transform(ast)
        end

        # Join consecutive Slice objects in arrays into single Slices
        # This matches what transform_ast does in Rust (join_slices_from_array)
        #
        # Arrays are processed bottom-up. When every non-nil element is a slice
        # and each slice starts where the previous one ends, the array is
        # replaced by one slice covering the whole span. An array with no
        # non-nil elements (including an empty one) becomes nil.
        #
        # @param value [Object] AST value
        # @param slice_class [Class] The Slice class to check for
        # @param input [String] Original input string
        # @return [Object] AST with joined slices
        def join_consecutive_slices(value, slice_class, input)
          case value
          when Array
            processed = value.map { |item| join_consecutive_slices(item, slice_class, input) }
            slices = processed.compact
            joinable = slices.all? { |item| item.is_a?(slice_class) } && slices_consecutive?(slices)
            joinable ? join_slices(slices, slice_class, input) : processed
          when Hash
            value.each_with_object({}) do |(key, item), result|
              result[key] = join_consecutive_slices(item, slice_class, input)
            end
          else
            value
          end
        end

        private

        # True when each slice begins at the byte where its predecessor ends.
        def slices_consecutive?(slices)
          slices.each_cons(2).all? do |left, right|
            left.offset + left.content.bytesize == right.offset
          end
        end

        # Build one slice spanning from the first slice's start to the last
        # slice's end, reading the bytes straight from the input.
        def join_slices(slices, slice_class, input)
          return nil if slices.empty?
          return slices.first if slices.length == 1

          first = slices.first
          last = slices.last
          total_length = last.offset + last.content.bytesize - first.offset
          content = input.byteslice(first.offset, total_length)&.force_encoding('UTF-8')
          slice_class.new(first.offset, content, input)
        end
      end
    end
  end
end