parsanol 1.2.2-aarch64-linux → 1.3.2-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/HISTORY.txt +33 -3
- data/README.adoc +103 -9
- data/lib/parsanol/3.2/parsanol_native.so +0 -0
- data/lib/parsanol/3.3/parsanol_native.so +0 -0
- data/lib/parsanol/3.4/parsanol_native.so +0 -0
- data/lib/parsanol/4.0/parsanol_native.so +0 -0
- data/lib/parsanol/native/batch_decoder.rb +252 -0
- data/lib/parsanol/native/parser.rb +28 -574
- data/lib/parsanol/native/transformer.rb +125 -58
- data/lib/parsanol/native.rb +107 -183
- data/lib/parsanol/parser.rb +2 -6
- data/lib/parsanol/slice.rb +51 -105
- data/lib/parsanol/version.rb +1 -1
- metadata +3 -2
|
@@ -5,23 +5,13 @@ require 'digest'
|
|
|
5
5
|
module Parsanol
|
|
6
6
|
module Native
|
|
7
7
|
# Core parsing functionality using Rust native extension
|
|
8
|
-
#
|
|
9
|
-
# Provides three parsing modes:
|
|
10
|
-
# - :ruby - Parse and transform to Parslet-compatible format
|
|
11
|
-
# - :json - Parse and return JSON-serialized AST
|
|
12
|
-
# - :slice - Parse and return raw native format (fastest)
|
|
13
|
-
#
|
|
14
8
|
module Parser
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
GRAMMAR_HASH_CACHE = Hash.new # object_id => hash_key
|
|
18
|
-
GRAMMAR_CACHE = Hash.new # hash_key => grammar_json
|
|
9
|
+
GRAMMAR_HASH_CACHE = Hash.new
|
|
10
|
+
GRAMMAR_CACHE = Hash.new
|
|
19
11
|
|
|
20
12
|
class << self
|
|
21
|
-
# Cached availability check
|
|
22
13
|
@cached_available = nil
|
|
23
14
|
|
|
24
|
-
# Check if native extension is available
|
|
25
15
|
def available?
|
|
26
16
|
return @cached_available unless @cached_available.nil?
|
|
27
17
|
|
|
@@ -33,603 +23,67 @@ module Parsanol
|
|
|
33
23
|
end
|
|
34
24
|
end
|
|
35
25
|
|
|
36
|
-
# Parse
|
|
37
|
-
#
|
|
38
|
-
# @param
|
|
39
|
-
# @param line_cache [Parsanol::Source::LineCache, nil] Optional line cache for position info
|
|
40
|
-
# @return Ruby AST from parsing with Slice objects for strings
|
|
41
|
-
def parse(grammar_json, input, line_cache = nil)
|
|
42
|
-
raise LoadError, 'Native parser not available. Run `rake compile` to build.' unless available?
|
|
43
|
-
|
|
44
|
-
# Build line cache if not provided
|
|
45
|
-
line_cache ||= build_line_cache(input)
|
|
46
|
-
|
|
47
|
-
# Call native parse_batch (returns flat u64 array)
|
|
48
|
-
flat = Parsanol::Native.parse_batch(grammar_json, input)
|
|
49
|
-
# Decode flat array to Ruby AST with Slice objects
|
|
50
|
-
decode_flat(flat, input, line_cache)
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
# Parse a grammar with automatic serialization and caching
|
|
54
|
-
# @param root_atom [Parsanol::Atoms::Base] Root atom of the grammar
|
|
55
|
-
# @param input [String] Input string to parse
|
|
56
|
-
# @param line_cache [Parsanol::Source::LineCache, nil] Optional line cache
|
|
57
|
-
# @return Ruby AST from parsing with Slice objects
|
|
58
|
-
def parse_with_grammar(root_atom, input, line_cache = nil)
|
|
59
|
-
# Extract root atom if a Parser is passed
|
|
60
|
-
root_atom = root_atom.root if root_atom.is_a?(::Parsanol::Parser)
|
|
61
|
-
grammar_json = serialize_grammar(root_atom)
|
|
62
|
-
parse(grammar_json, input, line_cache)
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
# Parse and transform to Parslet-compatible format
|
|
66
|
-
# NOTE: This method now returns Slice objects with position info by default.
|
|
67
|
-
# The name is kept for backward compatibility but it's now the primary parse method.
|
|
68
|
-
# @param root_atom [Parsanol::Atoms::Base] Root atom of the grammar
|
|
69
|
-
# @param input [String] Input string to parse
|
|
70
|
-
# @param line_cache [Parsanol::Source::LineCache, nil] Optional line cache
|
|
71
|
-
# @return Ruby AST in Parslet-compatible format with Slice objects
|
|
72
|
-
def parse_parslet_compatible(root_atom, input, line_cache = nil)
|
|
73
|
-
# Extract root atom if a Parser is passed
|
|
74
|
-
root_atom = root_atom.root if root_atom.is_a?(::Parsanol::Parser)
|
|
75
|
-
raw_ast = parse_with_grammar(root_atom, input, line_cache)
|
|
76
|
-
AstTransformer.transform(raw_ast)
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
# Parse multiple inputs with the same grammar (more efficient)
|
|
80
|
-
# @param root_atom [Parsanol::Atoms::Base] Root atom of the grammar
|
|
81
|
-
# @param inputs [Array<String>] Array of input strings to parse
|
|
82
|
-
# @return [Array] Array of Ruby ASTs with Slice objects
|
|
83
|
-
def parse_batch_inputs(root_atom, inputs)
|
|
84
|
-
# Extract root atom if a Parser is passed
|
|
85
|
-
root_atom = root_atom.root if root_atom.is_a?(::Parsanol::Parser)
|
|
86
|
-
grammar_json = serialize_grammar(root_atom)
|
|
87
|
-
inputs.map { |input| parse(grammar_json, input) }
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
# Parse multiple inputs with transformation
|
|
91
|
-
# @param root_atom [Parsanol::Atoms::Base] Root atom of the grammar
|
|
92
|
-
# @param inputs [Array<String>] Array of input strings to parse
|
|
93
|
-
# @return [Array] Array of transformed Ruby ASTs with Slice objects
|
|
94
|
-
def parse_batch_with_transform(root_atom, inputs)
|
|
95
|
-
# Extract root atom if a Parser is passed
|
|
96
|
-
root_atom = root_atom.root if root_atom.is_a?(::Parsanol::Parser)
|
|
97
|
-
grammar_json = serialize_grammar(root_atom)
|
|
98
|
-
# First parse all inputs, then batch transform
|
|
99
|
-
# This provides better cache locality
|
|
100
|
-
raw_asts = inputs.map { |input| parse(grammar_json, input) }
|
|
101
|
-
AstTransformer.transform_batch(raw_asts)
|
|
102
|
-
end
|
|
103
|
-
|
|
104
|
-
# Parse without transformation (faster for raw AST access)
|
|
105
|
-
# @param root_atom [Parsanol::Atoms::Base] Root atom of the grammar
|
|
26
|
+
# Parse input with a Ruby grammar, returning clean AST.
|
|
27
|
+
#
|
|
28
|
+
# @param grammar [Parsanol::Atoms::Base] Ruby grammar or JSON string
|
|
106
29
|
# @param input [String] Input string to parse
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
root_atom = root_atom.root if root_atom.is_a?(::Parsanol::Parser)
|
|
111
|
-
parse_with_grammar(root_atom, input)
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
# Build a line cache for an input string
|
|
115
|
-
# @param input [String] The input string
|
|
116
|
-
# @return [Parsanol::Source::LineCache] The line cache
|
|
117
|
-
def build_line_cache(input)
|
|
118
|
-
cache = ::Parsanol::Source::LineCache.new
|
|
119
|
-
cache.scan_for_line_endings(0, input)
|
|
120
|
-
cache
|
|
30
|
+
def parse(grammar, input)
|
|
31
|
+
# Delegate to Parsanol::Native.parse for consistency
|
|
32
|
+
Parsanol::Native.parse(grammar, input)
|
|
121
33
|
end
|
|
122
34
|
|
|
123
|
-
# Serialize a grammar to JSON
|
|
124
|
-
# Level 1: object_id => hash_key (avoids grammar traversal)
|
|
125
|
-
# Level 2: hash_key => grammar_json (avoids serialization)
|
|
126
|
-
# @param root_atom [Parsanol::Atoms::Base] Root atom of the grammar
|
|
127
|
-
# @return [String] JSON string
|
|
35
|
+
# Serialize a Ruby grammar to JSON (cached).
|
|
128
36
|
def serialize_grammar(root_atom)
|
|
129
|
-
|
|
37
|
+
root_atom = root_atom.root if root_atom.is_a?(::Parsanol::Parser)
|
|
130
38
|
obj_id = root_atom.object_id
|
|
131
|
-
cache_key = GRAMMAR_HASH_CACHE[obj_id]
|
|
132
|
-
|
|
133
|
-
if cache_key
|
|
134
|
-
# Fast path: already computed hash, check grammar cache
|
|
135
|
-
else
|
|
136
|
-
# Slow path: compute structural hash
|
|
137
|
-
cache_key = grammar_structure_hash(root_atom)
|
|
138
|
-
GRAMMAR_HASH_CACHE[obj_id] = cache_key
|
|
139
|
-
end
|
|
39
|
+
cache_key = GRAMMAR_HASH_CACHE[obj_id] ||= grammar_structure_hash(root_atom)
|
|
140
40
|
GRAMMAR_CACHE[cache_key] ||= GrammarSerializer.serialize(root_atom)
|
|
141
41
|
end
|
|
142
42
|
|
|
143
|
-
# Clear grammar caches (call if grammar changes)
|
|
144
43
|
def clear_cache
|
|
145
44
|
GRAMMAR_HASH_CACHE.clear
|
|
146
45
|
GRAMMAR_CACHE.clear
|
|
147
46
|
end
|
|
148
47
|
|
|
149
|
-
# Get cache statistics
|
|
150
48
|
def cache_stats
|
|
151
49
|
{
|
|
152
50
|
hash_cache_size: GRAMMAR_HASH_CACHE.size,
|
|
153
|
-
grammar_cache_size: GRAMMAR_CACHE.size
|
|
154
|
-
grammar_keys: GRAMMAR_CACHE.keys
|
|
51
|
+
grammar_cache_size: GRAMMAR_CACHE.size
|
|
155
52
|
}
|
|
156
53
|
end
|
|
157
54
|
|
|
158
|
-
# ===== Serialized Mode (JSON Output) =====
|
|
159
|
-
|
|
160
|
-
# Parse input and return JSON string
|
|
161
|
-
# Uses native parsing and serializes the result to JSON
|
|
162
|
-
#
|
|
163
|
-
# @param grammar_json [String] JSON-serialized grammar
|
|
164
|
-
# @param input [String] Input string to parse
|
|
165
|
-
# @return [String] JSON string representing the result
|
|
166
|
-
def parse_to_json(grammar_json, input)
|
|
167
|
-
unless available?
|
|
168
|
-
raise LoadError,
|
|
169
|
-
"Serialized mode requires native extension. " \
|
|
170
|
-
"Run `rake compile` to build the extension."
|
|
171
|
-
end
|
|
172
|
-
|
|
173
|
-
# Parse using native engine and convert result to JSON
|
|
174
|
-
result = parse(grammar_json, input)
|
|
175
|
-
result.to_json
|
|
176
|
-
end
|
|
177
|
-
|
|
178
|
-
# Parse and return direct Ruby objects via FFI
|
|
179
|
-
# Uses ZeroCopy mode - Rust constructs Ruby objects directly via magnus FFI
|
|
180
|
-
# This bypasses the u64 serialization step for maximum performance.
|
|
181
|
-
#
|
|
182
|
-
# Slice information is preserved: InputRef nodes from Rust are returned
|
|
183
|
-
# directly as Parsanol::Slice objects (no intermediate hash conversion needed).
|
|
184
|
-
#
|
|
185
|
-
# @param grammar_json [String] JSON-serialized grammar
|
|
186
|
-
# @param input [String] Input string to parse
|
|
187
|
-
# @param type_map [Hash] Mapping of rule names to Ruby classes (not used in this mode)
|
|
188
|
-
# @return [Object] Direct Ruby object (type depends on grammar)
|
|
189
|
-
def parse_to_objects(grammar_json, input, _type_map = nil)
|
|
190
|
-
unless available?
|
|
191
|
-
raise LoadError,
|
|
192
|
-
"ZeroCopy mode requires native extension. " \
|
|
193
|
-
"Run `rake compile` to build the extension."
|
|
194
|
-
end
|
|
195
|
-
|
|
196
|
-
# Call Rust function that returns Slice objects directly
|
|
197
|
-
# No need to convert - they are already Parsanol::Slice objects
|
|
198
|
-
Parsanol::Native.parse_to_ruby_objects(grammar_json, input)
|
|
199
|
-
end
|
|
200
|
-
|
|
201
|
-
# Recursively convert slice hashes to Parsanol::Slice objects
|
|
202
|
-
# Rust returns { "_slice" => true, "str" => "...", "offset" => N, "length" => N }
|
|
203
|
-
# for InputRef nodes, which we convert to Slice objects preserving position info.
|
|
204
|
-
#
|
|
205
|
-
# @param obj [Object] The object to convert (may be Hash, Array, or leaf value)
|
|
206
|
-
# @param input [String] The original input string (for Slice source reference)
|
|
207
|
-
# @return [Object] The converted object with Slice objects in place of slice hashes
|
|
208
|
-
def convert_slices(obj, input)
|
|
209
|
-
case obj
|
|
210
|
-
when Hash
|
|
211
|
-
# Check if this is a slice marker from Rust
|
|
212
|
-
if obj['_slice'] == true
|
|
213
|
-
Parsanol::Slice.new(obj['offset'], obj['str'])
|
|
214
|
-
else
|
|
215
|
-
# Recursively convert hash values
|
|
216
|
-
obj.transform_values { |v| convert_slices(v, input) }
|
|
217
|
-
end
|
|
218
|
-
when Array
|
|
219
|
-
# Recursively convert array elements
|
|
220
|
-
obj.map { |item| convert_slices(item, input) }
|
|
221
|
-
else
|
|
222
|
-
# Leaf values (strings, integers, etc.) are returned as-is
|
|
223
|
-
obj
|
|
224
|
-
end
|
|
225
|
-
end
|
|
226
|
-
|
|
227
|
-
# ===== Source Location Tracking =====
|
|
228
|
-
|
|
229
|
-
# Parse with source location tracking
|
|
230
|
-
# Returns both the AST and a hash of spans
|
|
231
|
-
#
|
|
232
|
-
# @param grammar_json [String] JSON-serialized grammar
|
|
233
|
-
# @param input [String] Input string to parse
|
|
234
|
-
# @return [Array<(Object, Hash)>] Tuple of [parsed_result, spans_hash]
|
|
235
|
-
def parse_with_spans(grammar_json, input)
|
|
236
|
-
unless available?
|
|
237
|
-
raise LoadError,
|
|
238
|
-
"Source location tracking requires native extension. " \
|
|
239
|
-
"Run `rake compile` to build the extension."
|
|
240
|
-
end
|
|
241
|
-
|
|
242
|
-
_parse_with_spans(grammar_json, input)
|
|
243
|
-
end
|
|
244
|
-
|
|
245
|
-
# Get span for a specific node
|
|
246
|
-
#
|
|
247
|
-
# @param result [Object] Parse result from parse_with_spans
|
|
248
|
-
# @param node_id [Integer] Node identifier
|
|
249
|
-
# @return [Hash] Span information {start: {offset, line, column}, end: {...}}
|
|
250
|
-
def get_span(result, node_id)
|
|
251
|
-
raise LoadError, 'Source location tracking requires native extension.' unless available?
|
|
252
|
-
|
|
253
|
-
_get_span(result, node_id)
|
|
254
|
-
end
|
|
255
|
-
|
|
256
|
-
# ===== Grammar Composition =====
|
|
257
|
-
|
|
258
|
-
# Import another grammar with optional prefix
|
|
259
|
-
#
|
|
260
|
-
# @param builder_json [String] GrammarBuilder JSON
|
|
261
|
-
# @param grammar_json [String] Grammar to import
|
|
262
|
-
# @param prefix [String, nil] Optional prefix for imported rules
|
|
263
|
-
# @return [String] Updated GrammarBuilder JSON
|
|
264
|
-
def grammar_import(builder_json, grammar_json, prefix = nil)
|
|
265
|
-
raise LoadError, 'Grammar composition requires native extension.' unless available?
|
|
266
|
-
|
|
267
|
-
_grammar_import(builder_json, grammar_json, prefix)
|
|
268
|
-
end
|
|
269
|
-
|
|
270
|
-
# Get mutable reference to a rule
|
|
271
|
-
#
|
|
272
|
-
# @param builder_json [String] GrammarBuilder JSON
|
|
273
|
-
# @param rule_name [String] Name of the rule to modify
|
|
274
|
-
# @return [String] Updated GrammarBuilder JSON
|
|
275
|
-
def grammar_rule_mut(builder_json, rule_name)
|
|
276
|
-
raise LoadError, 'Grammar composition requires native extension.' unless available?
|
|
277
|
-
|
|
278
|
-
_grammar_rule_mut(builder_json, rule_name)
|
|
279
|
-
end
|
|
280
|
-
|
|
281
|
-
# ===== Streaming Parser =====
|
|
282
|
-
|
|
283
|
-
# Create a new streaming parser
|
|
284
|
-
#
|
|
285
|
-
# @param grammar_json [String] JSON-serialized grammar
|
|
286
|
-
# @return [Object] Streaming parser instance
|
|
287
|
-
def streaming_parser_new(grammar_json)
|
|
288
|
-
raise LoadError, 'Streaming parser requires native extension.' unless available?
|
|
289
|
-
|
|
290
|
-
_streaming_parser_new(grammar_json)
|
|
291
|
-
end
|
|
292
|
-
|
|
293
|
-
# Add a chunk to the streaming parser
|
|
294
|
-
#
|
|
295
|
-
# @param parser [Object] Streaming parser instance
|
|
296
|
-
# @param chunk [String] Input chunk to add
|
|
297
|
-
# @return [Boolean] True if more chunks needed, false if ready
|
|
298
|
-
def streaming_parser_add_chunk(parser, chunk)
|
|
299
|
-
raise LoadError, 'Streaming parser requires native extension.' unless available?
|
|
300
|
-
|
|
301
|
-
_streaming_parser_add_chunk(parser, chunk)
|
|
302
|
-
end
|
|
303
|
-
|
|
304
|
-
# Parse what we have so far
|
|
305
|
-
#
|
|
306
|
-
# @param parser [Object] Streaming parser instance
|
|
307
|
-
# @return [Object, nil] Parsed result or nil if need more data
|
|
308
|
-
def streaming_parser_parse_chunk(parser)
|
|
309
|
-
raise LoadError, 'Streaming parser requires native extension.' unless available?
|
|
310
|
-
|
|
311
|
-
_streaming_parser_parse_chunk(parser)
|
|
312
|
-
end
|
|
313
|
-
|
|
314
|
-
# ===== Incremental Parser =====
|
|
315
|
-
|
|
316
|
-
# Create a new incremental parser
|
|
317
|
-
#
|
|
318
|
-
# @param grammar_json [String] JSON-serialized grammar
|
|
319
|
-
# @param initial_input [String] Initial input string
|
|
320
|
-
# @return [Object] Incremental parser instance
|
|
321
|
-
def incremental_parser_new(grammar_json, initial_input)
|
|
322
|
-
raise LoadError, 'Incremental parser requires native extension.' unless available?
|
|
323
|
-
|
|
324
|
-
_incremental_parser_new(grammar_json, initial_input)
|
|
325
|
-
end
|
|
326
|
-
|
|
327
|
-
# Apply an edit to the incremental parser
|
|
328
|
-
#
|
|
329
|
-
# @param parser [Object] Incremental parser instance
|
|
330
|
-
# @param start [Integer] Start position of edit
|
|
331
|
-
# @param deleted [Integer] Number of characters deleted
|
|
332
|
-
# @param inserted [String] Text to insert
|
|
333
|
-
# @return [Object] Updated parser state
|
|
334
|
-
def incremental_parser_apply_edit(parser, start, deleted, inserted = '')
|
|
335
|
-
raise LoadError, 'Incremental parser requires native extension.' unless available?
|
|
336
|
-
|
|
337
|
-
_incremental_parser_apply_edit(parser, start, deleted, inserted)
|
|
338
|
-
end
|
|
339
|
-
|
|
340
|
-
# Reparse with changes
|
|
341
|
-
#
|
|
342
|
-
# @param parser [Object] Incremental parser instance
|
|
343
|
-
# @param new_input [String, nil] Optional new input (if not using apply_edit)
|
|
344
|
-
# @return [Object] Parse result
|
|
345
|
-
def incremental_parser_reparse(parser, new_input = nil)
|
|
346
|
-
raise LoadError, 'Incremental parser requires native extension.' unless available?
|
|
347
|
-
|
|
348
|
-
_incremental_parser_reparse(parser, new_input)
|
|
349
|
-
end
|
|
350
|
-
|
|
351
|
-
# ===== Streaming Builder =====
|
|
352
|
-
|
|
353
|
-
# Parse with a streaming builder for maximum performance.
|
|
354
|
-
# The builder receives callbacks as parsing progresses, eliminating
|
|
355
|
-
# intermediate AST construction.
|
|
356
|
-
#
|
|
357
|
-
# @param grammar_json [String] JSON-serialized grammar
|
|
358
|
-
# @param input [String] Input string to parse
|
|
359
|
-
# @param builder [Object] Object including BuilderCallbacks module
|
|
360
|
-
# @return [Object] Result of builder.finish
|
|
361
|
-
def parse_with_builder(grammar_json, input, builder)
|
|
362
|
-
unless available?
|
|
363
|
-
raise LoadError,
|
|
364
|
-
"Streaming builder requires native extension. " \
|
|
365
|
-
"Run `rake compile` to build the extension."
|
|
366
|
-
end
|
|
367
|
-
|
|
368
|
-
_parse_with_builder(grammar_json, input, builder)
|
|
369
|
-
end
|
|
370
|
-
|
|
371
|
-
# ===== Parallel Parsing =====
|
|
372
|
-
|
|
373
|
-
# Parse multiple inputs in parallel using rayon.
|
|
374
|
-
# Provides linear speedup on multi-core systems.
|
|
375
|
-
#
|
|
376
|
-
# @param grammar_json [String] JSON-serialized grammar
|
|
377
|
-
# @param inputs [Array<String>] Array of input strings to parse
|
|
378
|
-
# @param num_threads [Integer, nil] Number of threads (nil = auto-detect)
|
|
379
|
-
# @return [Array<Object>] Array of parse results in same order as inputs
|
|
380
|
-
def parse_batch_parallel(grammar_json, inputs, num_threads: nil)
|
|
381
|
-
unless available?
|
|
382
|
-
raise LoadError,
|
|
383
|
-
"Parallel parsing requires native extension. " \
|
|
384
|
-
"Run `rake compile` to build the extension."
|
|
385
|
-
end
|
|
386
|
-
|
|
387
|
-
_parse_batch_parallel(grammar_json, inputs, num_threads)
|
|
388
|
-
end
|
|
389
|
-
|
|
390
|
-
# ===== Security / Limits =====
|
|
391
|
-
|
|
392
|
-
# Parse with custom limits for untrusted input.
|
|
393
|
-
#
|
|
394
|
-
# @param grammar_json [String] JSON-serialized grammar
|
|
395
|
-
# @param input [String] Input string to parse
|
|
396
|
-
# @param max_input_size [Integer] Maximum input size in bytes (default: 100MB)
|
|
397
|
-
# @param max_recursion_depth [Integer] Maximum recursion depth (default: 1000)
|
|
398
|
-
# @return [Object] Parse result
|
|
399
|
-
def parse_with_limits(grammar_json, input, max_input_size: 100 * 1024 * 1024, max_recursion_depth: 1000)
|
|
400
|
-
unless available?
|
|
401
|
-
raise LoadError,
|
|
402
|
-
"Security limits require native extension. " \
|
|
403
|
-
"Run `rake compile` to build the extension."
|
|
404
|
-
end
|
|
405
|
-
|
|
406
|
-
_parse_with_limits(grammar_json, input, max_input_size, max_recursion_depth)
|
|
407
|
-
end
|
|
408
|
-
|
|
409
|
-
# ===== Debug Tools =====
|
|
410
|
-
|
|
411
|
-
# Parse with tracing enabled for debugging.
|
|
412
|
-
#
|
|
413
|
-
# @param grammar_json [String] JSON-serialized grammar
|
|
414
|
-
# @param input [String] Input string to parse
|
|
415
|
-
# @return [Array<(Object, Array)>] Tuple of [parse_result, trace_events]
|
|
416
|
-
def parse_with_trace(grammar_json, input)
|
|
417
|
-
unless available?
|
|
418
|
-
raise LoadError,
|
|
419
|
-
"Debug tracing requires native extension. " \
|
|
420
|
-
"Run `rake compile` to build the extension."
|
|
421
|
-
end
|
|
422
|
-
|
|
423
|
-
_parse_with_trace(grammar_json, input)
|
|
424
|
-
end
|
|
425
|
-
|
|
426
|
-
# Generate Mermaid diagram for a grammar.
|
|
427
|
-
#
|
|
428
|
-
# @param grammar_json [String] JSON-serialized grammar
|
|
429
|
-
# @return [String] Mermaid diagram source
|
|
430
|
-
def grammar_to_mermaid(grammar_json)
|
|
431
|
-
unless available?
|
|
432
|
-
raise LoadError,
|
|
433
|
-
"Grammar visualization requires native extension. " \
|
|
434
|
-
"Run `rake compile` to build the extension."
|
|
435
|
-
end
|
|
436
|
-
|
|
437
|
-
_grammar_to_mermaid(grammar_json)
|
|
438
|
-
end
|
|
439
|
-
|
|
440
|
-
# Generate GraphViz DOT diagram for a grammar.
|
|
441
|
-
#
|
|
442
|
-
# @param grammar_json [String] JSON-serialized grammar
|
|
443
|
-
# @return [String] GraphViz DOT source
|
|
444
|
-
def grammar_to_dot(grammar_json)
|
|
445
|
-
unless available?
|
|
446
|
-
raise LoadError,
|
|
447
|
-
"Grammar visualization requires native extension. " \
|
|
448
|
-
"Run `rake compile` to build the extension."
|
|
449
|
-
end
|
|
450
|
-
|
|
451
|
-
_grammar_to_dot(grammar_json)
|
|
452
|
-
end
|
|
453
|
-
|
|
454
55
|
private
|
|
455
56
|
|
|
456
|
-
def
|
|
457
|
-
|
|
458
|
-
end
|
|
459
|
-
|
|
460
|
-
def _parse_with_builder(grammar_json, input, builder)
|
|
461
|
-
# Call native Rust function directly - parse_with_builder is exposed
|
|
462
|
-
# from the native extension as a Ruby function
|
|
463
|
-
Parsanol::Native.parse_with_builder(grammar_json, input, builder)
|
|
464
|
-
end
|
|
465
|
-
|
|
466
|
-
def _parse_batch_parallel(grammar_json, inputs, num_threads)
|
|
467
|
-
raise NotImplementedError, 'Native extension method not available'
|
|
468
|
-
end
|
|
469
|
-
|
|
470
|
-
def _parse_with_limits(grammar_json, input, max_input_size, max_recursion_depth)
|
|
471
|
-
raise NotImplementedError, 'Native extension method not available'
|
|
472
|
-
end
|
|
473
|
-
|
|
474
|
-
def _parse_with_trace(grammar_json, input)
|
|
475
|
-
raise NotImplementedError, 'Native extension method not available'
|
|
476
|
-
end
|
|
477
|
-
|
|
478
|
-
def _grammar_to_mermaid(grammar_json)
|
|
479
|
-
raise NotImplementedError, 'Native extension method not available'
|
|
480
|
-
end
|
|
481
|
-
|
|
482
|
-
def _grammar_to_dot(grammar_json)
|
|
483
|
-
raise NotImplementedError, 'Native extension method not available'
|
|
57
|
+
def grammar_structure_hash(atom)
|
|
58
|
+
Digest::MD5.hexdigest(atom_structure(atom).to_s)
|
|
484
59
|
end
|
|
485
60
|
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
# 0x03 = float
|
|
492
|
-
# 0x04 = string_ref (offset, length) - creates Slice with position info
|
|
493
|
-
# 0x05 = array_start
|
|
494
|
-
# 0x06 = array_end
|
|
495
|
-
# 0x07 = hash_start
|
|
496
|
-
# 0x08 = hash_end
|
|
497
|
-
# 0x09 = hash_key (tag, len, key_chunks..., value)
|
|
498
|
-
# 0x0A = inline_string (interned string from arena)
|
|
499
|
-
#
|
|
500
|
-
# @param flat [Array<Integer>] Flat u64 array from native parser
|
|
501
|
-
# @param input [String] Original input string
|
|
502
|
-
# @param line_cache [Parsanol::Source::LineCache, nil] Line cache for position info
|
|
503
|
-
# @return Ruby AST with Slice objects for all string values
|
|
504
|
-
def decode_flat(flat, input, line_cache = nil)
|
|
505
|
-
stack = []
|
|
506
|
-
i = 0
|
|
507
|
-
|
|
508
|
-
while i < flat.length
|
|
509
|
-
tag = flat[i]
|
|
510
|
-
|
|
511
|
-
case tag
|
|
512
|
-
when 0x00 # nil
|
|
513
|
-
stack << nil
|
|
514
|
-
i += 1
|
|
515
|
-
when 0x01 # bool
|
|
516
|
-
stack << (flat[i + 1] != 0)
|
|
517
|
-
i += 2
|
|
518
|
-
when 0x02 # int
|
|
519
|
-
stack << flat[i + 1]
|
|
520
|
-
i += 2
|
|
521
|
-
when 0x03 # float
|
|
522
|
-
# Decode IEEE 754 float from bits
|
|
523
|
-
bits = flat[i + 1]
|
|
524
|
-
float = [bits].pack('Q').unpack1('D')
|
|
525
|
-
stack << float
|
|
526
|
-
i += 2
|
|
527
|
-
when 0x04 # string_ref (from input) - create Slice with position info
|
|
528
|
-
offset = flat[i + 1]
|
|
529
|
-
length = flat[i + 2]
|
|
530
|
-
content = input.byteslice(offset, length)
|
|
531
|
-
# Create Slice with position info - this is the key change
|
|
532
|
-
stack << ::Parsanol::Slice.new(offset, content, line_cache)
|
|
533
|
-
i += 3
|
|
534
|
-
when 0x0A # inline_string (interned string from arena)
|
|
535
|
-
# Format: tag, len, u64 chunks of string bytes
|
|
536
|
-
len = flat[i + 1]
|
|
537
|
-
i += 2
|
|
538
|
-
|
|
539
|
-
# Read string bytes from u64 chunks
|
|
540
|
-
chunks = (len + 7) / 8
|
|
541
|
-
bytes = []
|
|
542
|
-
chunks.times do |j|
|
|
543
|
-
chunk = flat[i + j]
|
|
544
|
-
8.times do |k|
|
|
545
|
-
break if bytes.length >= len
|
|
546
|
-
|
|
547
|
-
bytes << ((chunk >> (k * 8)) & 0xff)
|
|
548
|
-
end
|
|
549
|
-
end
|
|
550
|
-
i += chunks
|
|
551
|
-
|
|
552
|
-
# Inline strings don't have source position, use Slice with offset 0
|
|
553
|
-
content = bytes.pack('C*').force_encoding('UTF-8')
|
|
554
|
-
stack << ::Parsanol::Slice.new(0, content, nil)
|
|
555
|
-
when 0x05 # array_start
|
|
556
|
-
stack << :array_marker
|
|
557
|
-
i += 1
|
|
558
|
-
when 0x06 # array_end
|
|
559
|
-
items = []
|
|
560
|
-
items.unshift(stack.pop) until stack.last == :array_marker
|
|
561
|
-
stack.pop # Remove marker
|
|
562
|
-
stack << items
|
|
563
|
-
i += 1
|
|
564
|
-
when 0x07 # hash_start
|
|
565
|
-
stack << :hash_marker
|
|
566
|
-
i += 1
|
|
567
|
-
when 0x08 # hash_end
|
|
568
|
-
pairs = []
|
|
569
|
-
while stack.last != :hash_marker
|
|
570
|
-
value = stack.pop
|
|
571
|
-
key = stack.pop
|
|
572
|
-
pairs.unshift([key, value])
|
|
573
|
-
end
|
|
574
|
-
stack.pop # Remove marker
|
|
575
|
-
stack << pairs.to_h
|
|
576
|
-
i += 1
|
|
577
|
-
when 0x09 # hash_key
|
|
578
|
-
# Format: tag, len, key_chunks..., then value
|
|
579
|
-
len = flat[i + 1]
|
|
580
|
-
i += 2 # Skip tag and len
|
|
581
|
-
|
|
582
|
-
# Read key bytes from u64 chunks
|
|
583
|
-
chunks = (len + 7) / 8
|
|
584
|
-
key_bytes = []
|
|
585
|
-
chunks.times do |j|
|
|
586
|
-
chunk = flat[i + j]
|
|
587
|
-
8.times do |k|
|
|
588
|
-
break if key_bytes.length >= len
|
|
589
|
-
|
|
590
|
-
key_bytes << ((chunk >> (k * 8)) & 0xff)
|
|
591
|
-
end
|
|
592
|
-
end
|
|
593
|
-
i += chunks
|
|
594
|
-
|
|
595
|
-
key = key_bytes.pack('C*').force_encoding('UTF-8')
|
|
596
|
-
stack << key
|
|
597
|
-
else
|
|
598
|
-
raise "Unknown tag: #{tag} at index #{i}"
|
|
599
|
-
end
|
|
61
|
+
def atom_structure(atom, visited = {})
|
|
62
|
+
# Cycle detection - return a placeholder if we've seen this atom before
|
|
63
|
+
obj_id = atom.object_id
|
|
64
|
+
if visited[obj_id]
|
|
65
|
+
return [:cycle, atom.class.name]
|
|
600
66
|
end
|
|
67
|
+
visited[obj_id] = true
|
|
601
68
|
|
|
602
|
-
stack.first
|
|
603
|
-
end
|
|
604
|
-
|
|
605
|
-
# Compute structural hash of a grammar atom
|
|
606
|
-
# This returns the same hash for grammars with the same structure
|
|
607
|
-
# regardless of whether they are different object instances
|
|
608
|
-
def grammar_structure_hash(atom)
|
|
609
|
-
structure = atom_structure(atom)
|
|
610
|
-
Digest::MD5.hexdigest(structure.to_s)
|
|
611
|
-
end
|
|
612
|
-
|
|
613
|
-
# Recursively build structure representation for hashing
|
|
614
|
-
def atom_structure(atom)
|
|
615
69
|
case atom
|
|
70
|
+
when ::Parsanol::Atoms::Entity
|
|
71
|
+
# Recursively resolve entity to get actual structure for hash
|
|
72
|
+
atom_structure(atom.parslet, visited)
|
|
616
73
|
when ::Parsanol::Atoms::Str
|
|
617
74
|
[:str, atom.str]
|
|
618
75
|
when ::Parsanol::Atoms::Re
|
|
619
76
|
[:re, atom.match]
|
|
620
77
|
when ::Parsanol::Atoms::Sequence
|
|
621
|
-
[:seq, atom.parslets.map { |p| atom_structure(p) }]
|
|
78
|
+
[:seq, atom.parslets.map { |p| atom_structure(p, visited) }]
|
|
622
79
|
when ::Parsanol::Atoms::Alternative
|
|
623
|
-
[:alt, atom.alternatives.map { |p| atom_structure(p) }]
|
|
80
|
+
[:alt, atom.alternatives.map { |p| atom_structure(p, visited) }]
|
|
624
81
|
when ::Parsanol::Atoms::Repetition
|
|
625
|
-
[:rep, atom.min, atom.max, atom_structure(atom.parslet)]
|
|
82
|
+
[:rep, atom.min, atom.max, atom_structure(atom.parslet, visited)]
|
|
626
83
|
when ::Parsanol::Atoms::Named
|
|
627
|
-
[:named, atom.name.to_s, atom_structure(atom.parslet)]
|
|
84
|
+
[:named, atom.name.to_s, atom_structure(atom.parslet, visited)]
|
|
628
85
|
when ::Parsanol::Atoms::Lookahead
|
|
629
|
-
[:lookahead, atom.positive, atom_structure(atom.bound_parslet)]
|
|
630
|
-
when ::Parsanol::Atoms::Entity
|
|
631
|
-
# Entity is a lazy reference - use its name for hashing
|
|
632
|
-
[:entity, atom.name.to_s]
|
|
86
|
+
[:lookahead, atom.positive, atom_structure(atom.bound_parslet, visited)]
|
|
633
87
|
else
|
|
634
88
|
[:unknown, atom.class.name]
|
|
635
89
|
end
|