parsanol 1.2.2 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +29 -41
- data/Cargo.toml +8 -2
- data/HISTORY.txt +33 -3
- data/README.adoc +103 -9
- data/ext/parsanol_native/Cargo.toml +9 -6
- data/lib/parsanol/native/batch_decoder.rb +252 -0
- data/lib/parsanol/native/parser.rb +28 -574
- data/lib/parsanol/native/transformer.rb +125 -58
- data/lib/parsanol/native.rb +107 -183
- data/lib/parsanol/parser.rb +2 -6
- data/lib/parsanol/slice.rb +51 -105
- data/lib/parsanol/version.rb +1 -1
- metadata +2 -1
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'parsanol/native/transformer'
|
|
4
|
+
|
|
5
|
+
module Parsanol
  module Native
    # Decodes the flat u64 array produced by the Rust batch parser into a Ruby AST.
    #
    # Every value starts with a tag word followed by a tag-specific payload:
    # - 0x00                    = nil
    # - 0x01 + value            = bool (any non-zero value is true)
    # - 0x02 + value            = int (signed 64-bit value stored as u64)
    # - 0x03 + bits             = float (IEEE 754 bits)
    # - 0x04 + offset + length  = input string reference (byte offset / byte length)
    # - 0x05 ... 0x06           = array (start ... end)
    # - 0x07 ... 0x08           = hash (start ... end)
    # - 0x09 + len + data...    = hash key (only accepted directly inside a hash)
    # - 0x0A + len + data...    = inline string
    # - 0x0B + len + data...    = symbol
    # - 0x0C + value            = [:repetition, value]
    # - 0x0D + value            = [:sequence, value]
    #
    # "len + data..." payloads carry +len+ bytes packed little-endian into
    # ceil(len / 8) u64 words.
    #
    # NOTE(review): 0x0A is part of the documented format, but the decoder does
    # not accept it as a standalone value (it raises "Unknown tag") - confirm
    # whether the Rust side ever emits it.
    module BatchDecoder
      TAG_NIL = 0x00
      TAG_BOOL = 0x01
      TAG_INT = 0x02
      TAG_FLOAT = 0x03
      TAG_STRING = 0x04
      TAG_ARRAY_START = 0x05
      TAG_ARRAY_END = 0x06
      TAG_HASH_START = 0x07
      TAG_HASH_END = 0x08
      TAG_HASH_KEY = 0x09
      TAG_INLINE_STRING = 0x0A
      TAG_SYMBOL = 0x0B
      TAG_REPETITION = 0x0C
      TAG_SEQUENCE = 0x0D

      # Read cursor over one batch. A fresh instance is created for every
      # decode call, so no state is kept on the module itself: nested or
      # concurrent decodes cannot clobber each other's position, and the input
      # and data are not referenced once decoding has finished.
      class Cursor
        # @param data [Array<Integer>] Flat u64 array from batch parser
        # @param input [String] Original input string (for Slice references)
        # @param slice_class [Class] The Slice class to instantiate
        def initialize(data, input, slice_class)
          @data = data
          @input = input
          @slice_class = slice_class
          @pos = 0
        end

        # Decode the value starting at the current position and advance past it.
        #
        # @return [Object] Decoded Ruby value
        # @raise [RuntimeError] on an unknown tag or when the data ends early
        def decode_value
          tag = next_word('tag')

          case tag
          when TAG_NIL
            nil
          when TAG_BOOL
            next_word('bool payload') != 0
          when TAG_INT
            to_signed(next_word('int payload'))
          when TAG_FLOAT
            # Reinterpret the 64 raw bits as an IEEE 754 double.
            [next_word('float payload')].pack('Q').unpack1('D')
          when TAG_STRING
            offset = next_word('string offset')
            length = next_word('string length')
            create_slice(offset, length)
          when TAG_SYMBOL
            # Symbols use the inline string encoding: len, then u64 chunks.
            read_inline_string.to_sym
          when TAG_REPETITION
            # compact drops a nil inner value, leaving just [:repetition].
            [:repetition, decode_value].compact
          when TAG_SEQUENCE
            [:sequence, decode_value].compact
          when TAG_ARRAY_START
            decode_array
          when TAG_HASH_START
            decode_hash
          else
            raise "Unknown tag: #{tag} at position #{@pos - 1}"
          end
        end

        private

        # Return the word at the current position without consuming it.
        # Reading past the end yields nil, which is reported here once instead
        # of surfacing later as a wrong value or an unrelated NoMethodError.
        def peek_word(what)
          word = @data[@pos]
          raise "Truncated batch data: expected #{what} at position #{@pos}" if word.nil?

          word
        end

        # Return the word at the current position and advance past it.
        def next_word(what)
          word = peek_word(what)
          @pos += 1
          word
        end

        # Convert a u64 holding a two's-complement i64 into a signed Integer.
        def to_signed(word)
          word >= 0x8000_0000_0000_0000 ? word - 0x1_0000_0000_0000_0000 : word
        end

        # Decode elements until TAG_ARRAY_END (TAG_ARRAY_START already consumed).
        def decode_array
          items = []
          items << decode_value until peek_word('array element or end') == TAG_ARRAY_END
          @pos += 1 # consume TAG_ARRAY_END
          items
        end

        # Decode key/value pairs until TAG_HASH_END (TAG_HASH_START already
        # consumed). A repeated key overwrites the earlier entry.
        def decode_hash
          result = {}
          loop do
            tag = peek_word('hash key or end')
            break if tag == TAG_HASH_END

            raise "Expected TAG_HASH_KEY, got #{tag}" unless tag == TAG_HASH_KEY

            @pos += 1 # consume TAG_HASH_KEY
            key = read_inline_string

            # Keep original key format (camelCase) for Ruby parser compatibility
            result[key.to_sym] = decode_value
          end
          @pos += 1 # consume TAG_HASH_END
          result
        end

        # Read a length word followed by that many bytes packed into u64 words.
        #
        # @return [String] Decoded string, tagged as UTF-8
        def read_inline_string
          len = next_word('string length')
          words = []
          # Collected one word at a time so a corrupt length fails at the first
          # missing word instead of pre-allocating a huge buffer.
          ((len + 7) / 8).times { words << next_word('string data') }
          # Bytes sit least-significant first in each word; the final word may
          # carry padding, which byteslice trims off.
          words.pack('Q<*').byteslice(0, len).force_encoding('UTF-8')
        end

        # Build a Slice for +length+ bytes of the input starting at byte +offset+.
        # Content is nil when the offset lies outside the input.
        def create_slice(offset, length)
          content = @input.byteslice(offset, length)&.force_encoding('UTF-8')
          @slice_class.new(offset, content, @input)
        end
      end
      private_constant :Cursor

      class << self
        # Decode a flat u64 array into Ruby AST with Slice objects
        #
        # Only the first encoded value is decoded; trailing words are ignored.
        #
        # @param data [Array<Integer>] Flat u64 array from batch parser
        # @param input [String] Original input string (for Slice references)
        # @param slice_class [Class] The Slice class to use; instantiated as
        #   slice_class.new(offset, content, input)
        # @return [Object] Ruby AST (Hash, Array, Slice, etc.)
        # @raise [RuntimeError] on an unknown tag or truncated data
        def decode(data, input, slice_class)
          Cursor.new(data, input, slice_class).decode_value
        end

        # Decode batch format to Ruby AST and apply transformation.
        #
        # The Rust parser produces raw AST that needs transformation to match
        # Ruby parser behavior (merging duplicate keys, etc.)
        #
        # @param data [Array<Integer>|Object] Either flat u64 array from batch parser OR
        #   pre-decoded Ruby value from _parse_raw
        # @param input [String] Original input string (for Slice references)
        # @param slice_class [Class] The Slice class to use
        # @param grammar_atom [Parsanol::Atoms::Base] The grammar atom (unused, kept for API compat)
        # @return [Object] Transformed Ruby AST
        def decode_and_flatten(data, input, slice_class, grammar_atom)
          # NOTE(review): a bare Integer is routed to decode, where Integer#[]
          # yields individual bits rather than words - confirm this is intended.
          batch = data.is_a?(Integer) || (data.is_a?(Array) && data.first.is_a?(Integer))
          # Batch data is decoded first; anything else is already a Ruby value.
          raw_ast = batch ? decode(data, input, slice_class) : data
          AstTransformer.transform(raw_ast)
        end

        # Join consecutive Slice objects in arrays into single Slices
        # This matches what transform_ast does in Rust (join_slices_from_array)
        #
        # Arrays and hash values are processed recursively. An array collapses
        # when, ignoring nils, every element is a slice_class instance and each
        # slice ends exactly where the next begins. An empty or all-nil array
        # collapses to nil and a single-slice array to that slice.
        #
        # @param value [Object] AST value
        # @param slice_class [Class] The Slice class to check for
        # @param input [String] Original input string
        # @return [Object] AST with joined slices
        def join_consecutive_slices(value, slice_class, input)
          case value
          when Array
            processed = value.map { |element| join_consecutive_slices(element, slice_class, input) }
            slices = processed.compact
            joinable = slices.all? { |element| element.is_a?(slice_class) } && slices_consecutive?(slices)
            joinable ? join_slices(slices, slice_class, input) : processed
          when Hash
            value.each_with_object({}) do |(key, element), result|
              result[key] = join_consecutive_slices(element, slice_class, input)
            end
          else
            value
          end
        end

        private

        # True when every slice ends at the byte offset where the next starts
        # (trivially true for zero or one slice).
        def slices_consecutive?(slices)
          slices.each_cons(2).all? do |left, right|
            left.offset + left.content.bytesize == right.offset
          end
        end

        # Build one slice spanning from the first slice's start to the last
        # slice's end, re-reading the bytes from the original input.
        def join_slices(slices, slice_class, input)
          return nil if slices.empty?
          return slices.first if slices.length == 1

          first = slices.first
          last = slices.last
          total_length = last.offset + last.content.bytesize - first.offset
          content = input.byteslice(first.offset, total_length)&.force_encoding('UTF-8')
          slice_class.new(first.offset, content, input)
        end
      end
    end
  end
end
|