parsanol 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +546 -0
  3. data/Cargo.toml +9 -0
  4. data/HISTORY.txt +12 -0
  5. data/LICENSE +23 -0
  6. data/README.adoc +487 -0
  7. data/Rakefile +135 -0
  8. data/ext/parsanol_native/Cargo.toml +34 -0
  9. data/ext/parsanol_native/extconf.rb +15 -0
  10. data/ext/parsanol_native/src/lib.rs +17 -0
  11. data/lib/parsanol/ast_visitor.rb +122 -0
  12. data/lib/parsanol/atoms/alternative.rb +122 -0
  13. data/lib/parsanol/atoms/base.rb +202 -0
  14. data/lib/parsanol/atoms/can_flatten.rb +194 -0
  15. data/lib/parsanol/atoms/capture.rb +38 -0
  16. data/lib/parsanol/atoms/context.rb +334 -0
  17. data/lib/parsanol/atoms/context_optimized.rb +38 -0
  18. data/lib/parsanol/atoms/custom.rb +110 -0
  19. data/lib/parsanol/atoms/cut.rb +66 -0
  20. data/lib/parsanol/atoms/dsl.rb +96 -0
  21. data/lib/parsanol/atoms/dynamic.rb +39 -0
  22. data/lib/parsanol/atoms/entity.rb +75 -0
  23. data/lib/parsanol/atoms/ignored.rb +37 -0
  24. data/lib/parsanol/atoms/infix.rb +162 -0
  25. data/lib/parsanol/atoms/lookahead.rb +82 -0
  26. data/lib/parsanol/atoms/named.rb +74 -0
  27. data/lib/parsanol/atoms/re.rb +83 -0
  28. data/lib/parsanol/atoms/repetition.rb +259 -0
  29. data/lib/parsanol/atoms/scope.rb +35 -0
  30. data/lib/parsanol/atoms/sequence.rb +194 -0
  31. data/lib/parsanol/atoms/str.rb +103 -0
  32. data/lib/parsanol/atoms/visitor.rb +91 -0
  33. data/lib/parsanol/atoms.rb +46 -0
  34. data/lib/parsanol/buffer.rb +133 -0
  35. data/lib/parsanol/builder_callbacks.rb +353 -0
  36. data/lib/parsanol/cause.rb +122 -0
  37. data/lib/parsanol/context.rb +39 -0
  38. data/lib/parsanol/convenience.rb +36 -0
  39. data/lib/parsanol/edit_tracker.rb +111 -0
  40. data/lib/parsanol/error_reporter/contextual.rb +99 -0
  41. data/lib/parsanol/error_reporter/deepest.rb +120 -0
  42. data/lib/parsanol/error_reporter/tree.rb +63 -0
  43. data/lib/parsanol/error_reporter.rb +100 -0
  44. data/lib/parsanol/expression/treetop.rb +154 -0
  45. data/lib/parsanol/expression.rb +106 -0
  46. data/lib/parsanol/fast_mode.rb +149 -0
  47. data/lib/parsanol/first_set.rb +79 -0
  48. data/lib/parsanol/grammar_builder.rb +177 -0
  49. data/lib/parsanol/incremental_parser.rb +177 -0
  50. data/lib/parsanol/interval_tree.rb +217 -0
  51. data/lib/parsanol/lazy_result.rb +179 -0
  52. data/lib/parsanol/lexer.rb +144 -0
  53. data/lib/parsanol/mermaid.rb +139 -0
  54. data/lib/parsanol/native/parser.rb +612 -0
  55. data/lib/parsanol/native/serializer.rb +248 -0
  56. data/lib/parsanol/native/transformer.rb +435 -0
  57. data/lib/parsanol/native/types.rb +42 -0
  58. data/lib/parsanol/native.rb +217 -0
  59. data/lib/parsanol/optimizer.rb +85 -0
  60. data/lib/parsanol/optimizers/choice_optimizer.rb +78 -0
  61. data/lib/parsanol/optimizers/cut_inserter.rb +179 -0
  62. data/lib/parsanol/optimizers/lookahead_optimizer.rb +50 -0
  63. data/lib/parsanol/optimizers/quantifier_optimizer.rb +60 -0
  64. data/lib/parsanol/optimizers/sequence_optimizer.rb +97 -0
  65. data/lib/parsanol/options/ruby_transform.rb +107 -0
  66. data/lib/parsanol/options/serialized.rb +94 -0
  67. data/lib/parsanol/options/zero_copy.rb +128 -0
  68. data/lib/parsanol/options.rb +20 -0
  69. data/lib/parsanol/parallel.rb +133 -0
  70. data/lib/parsanol/parser.rb +182 -0
  71. data/lib/parsanol/parslet.rb +151 -0
  72. data/lib/parsanol/pattern/binding.rb +91 -0
  73. data/lib/parsanol/pattern.rb +159 -0
  74. data/lib/parsanol/pool.rb +219 -0
  75. data/lib/parsanol/pools/array_pool.rb +75 -0
  76. data/lib/parsanol/pools/buffer_pool.rb +175 -0
  77. data/lib/parsanol/pools/position_pool.rb +92 -0
  78. data/lib/parsanol/pools/slice_pool.rb +64 -0
  79. data/lib/parsanol/position.rb +94 -0
  80. data/lib/parsanol/resettable.rb +29 -0
  81. data/lib/parsanol/result.rb +46 -0
  82. data/lib/parsanol/result_builder.rb +208 -0
  83. data/lib/parsanol/result_stream.rb +261 -0
  84. data/lib/parsanol/rig/rspec.rb +71 -0
  85. data/lib/parsanol/rope.rb +81 -0
  86. data/lib/parsanol/scope.rb +104 -0
  87. data/lib/parsanol/slice.rb +146 -0
  88. data/lib/parsanol/source/line_cache.rb +109 -0
  89. data/lib/parsanol/source.rb +180 -0
  90. data/lib/parsanol/source_location.rb +167 -0
  91. data/lib/parsanol/streaming_parser.rb +124 -0
  92. data/lib/parsanol/string_view.rb +195 -0
  93. data/lib/parsanol/transform.rb +226 -0
  94. data/lib/parsanol/version.rb +5 -0
  95. data/lib/parsanol/wasm/README.md +80 -0
  96. data/lib/parsanol/wasm/package.json +51 -0
  97. data/lib/parsanol/wasm/parsanol.js +252 -0
  98. data/lib/parsanol/wasm/parslet.d.ts +129 -0
  99. data/lib/parsanol/wasm_parser.rb +240 -0
  100. data/lib/parsanol.rb +280 -0
  101. data/parsanol-ruby.gemspec +67 -0
  102. metadata +293 -0
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Parsanol
4
+ module Native
5
+ # Type tags used in AST serialization
6
+ # These must match the tags used by the Rust parser
7
+ module Types
8
+ # AST node type tags (must match Rust parser output)
9
+ TAG_NIL = 0x00
10
+ TAG_BOOL = 0x01
11
+ TAG_INT = 0x02
12
+ TAG_FLOAT = 0x03
13
+ TAG_STRING_REF = 0x04
14
+ TAG_ARRAY_START = 0x05
15
+ TAG_ARRAY_END = 0x06
16
+ TAG_HASH_START = 0x07
17
+ TAG_HASH_END = 0x08
18
+ TAG_HASH_KEY = 0x09
19
+ TAG_INLINE_STRING = 0x0A
20
+
21
+ # Frozen shared constants for transformer (avoid allocations)
22
+ SEQUENCE_TAG = ':sequence'
23
+ REPETITION_TAG = ':repetition'
24
+ EMPTY_STRING = ''
25
+ EMPTY_ARRAY = [].freeze
26
+ EMPTY_HASH = {}.freeze
27
+ end
28
+
29
+ # Symbol cache to avoid repeated string-to-symbol conversions
30
+ # This is a class variable to share across all transformations
31
+ @@symbol_cache = {}
32
+
33
+ # Convert string key to symbol with caching
34
+ # @param key [String, Symbol] The key to convert
35
+ # @return [Symbol] The symbol version of the key
36
+ def self.cached_symbol(key)
37
+ return key if key.is_a?(Symbol)
38
+
39
+ @@symbol_cache[key] ||= key.to_sym
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,217 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'digest'
5
+
6
+ # Entry point for native parsing functionality
7
+ # Requires the individual components
8
+ require 'parsanol/native/types'
9
+ require 'parsanol/native/parser'
10
+ require 'parsanol/native/transformer'
11
+ require 'parsanol/native/serializer'
12
+
13
+ module Parsanol
14
+ module Native
15
+ VERSION = '0.1.0'
16
+
17
+ class << self
18
+ # Delegate to Parser module
19
+ def available?
20
+ Parser.available?
21
+ end
22
+
23
+ def parse(grammar_json, input)
24
+ Parser.parse(grammar_json, input)
25
+ end
26
+
27
+ def parse_with_grammar(root_atom, input)
28
+ Parser.parse_with_grammar(root_atom, input)
29
+ end
30
+
31
+ def parse_parslet_compatible(root_atom, input)
32
+ Parser.parse_parslet_compatible(root_atom, input)
33
+ end
34
+
35
+ def parse_batch_inputs(root_atom, inputs)
36
+ Parser.parse_batch_inputs(root_atom, inputs)
37
+ end
38
+
39
+ def parse_batch_with_transform(root_atom, inputs)
40
+ Parser.parse_batch_with_transform(root_atom, inputs)
41
+ end
42
+
43
+ def parse_raw(root_atom, input)
44
+ Parser.parse_raw(root_atom, input)
45
+ end
46
+
47
+ def serialize_grammar(root_atom)
48
+ Parser.serialize_grammar(root_atom)
49
+ end
50
+
51
+ def clear_cache
52
+ Parser.clear_cache
53
+ end
54
+
55
+ def cache_stats
56
+ Parser.cache_stats
57
+ end
58
+
59
+ # Serialized Mode (JSON Output)
60
+ def parse_to_json(grammar_json, input)
61
+ Parser.parse_to_json(grammar_json, input)
62
+ end
63
+
64
+ # ZeroCopy Mode (Direct Ruby Objects)
65
+ def parse_to_objects(grammar_json, input, type_map = nil)
66
+ Parser.parse_to_objects(grammar_json, input, type_map)
67
+ end
68
+
69
+ def convert_slices(obj, input)
70
+ Parser.convert_slices(obj, input)
71
+ end
72
+
73
+ # Source Location Tracking
74
+ def parse_with_spans(grammar_json, input)
75
+ Parser.parse_with_spans(grammar_json, input)
76
+ end
77
+
78
+ def get_span(result, node_id)
79
+ Parser.get_span(result, node_id)
80
+ end
81
+
82
+ # Grammar Composition
83
+ def grammar_import(builder_json, grammar_json, prefix = nil)
84
+ Parser.grammar_import(builder_json, grammar_json, prefix)
85
+ end
86
+
87
+ def grammar_rule_mut(builder_json, rule_name)
88
+ Parser.grammar_rule_mut(builder_json, rule_name)
89
+ end
90
+
91
+ # Streaming Parser
92
+ def streaming_parser_new(grammar_json)
93
+ Parser.streaming_parser_new(grammar_json)
94
+ end
95
+
96
+ def streaming_parser_add_chunk(parser, chunk)
97
+ Parser.streaming_parser_add_chunk(parser, chunk)
98
+ end
99
+
100
+ def streaming_parser_parse_chunk(parser)
101
+ Parser.streaming_parser_parse_chunk(parser)
102
+ end
103
+
104
+ # Incremental Parser
105
+ def incremental_parser_new(grammar_json, initial_input)
106
+ Parser.incremental_parser_new(grammar_json, initial_input)
107
+ end
108
+
109
+ def incremental_parser_apply_edit(parser, start, deleted, inserted = '')
110
+ Parser.incremental_parser_apply_edit(parser, start, deleted, inserted)
111
+ end
112
+
113
+ def incremental_parser_reparse(parser, new_input = nil)
114
+ Parser.incremental_parser_reparse(parser, new_input)
115
+ end
116
+
117
+ # Streaming Builder - uses native parse_with_builder directly (exposed from Rust)
118
+ # The native function is exposed directly on Parsanol::Native module
119
+
120
+ # Alias for parse_with_builder (same functionality)
121
+ def parse_with_callback(grammar_json, input, callback)
122
+ parse_with_builder(grammar_json, input, callback)
123
+ end
124
+
125
+ # Parallel Parsing - uses native _parse_batch_parallel (a nil num_threads is passed as 0)
126
+ def parse_batch_parallel(grammar_json, inputs, num_threads: nil)
127
+ _parse_batch_parallel(grammar_json, inputs, num_threads || 0)
128
+ end
129
+
130
+ # Security / Limits - uses native _parse_with_limits
131
+ def parse_with_limits(grammar_json, input, max_input_size: 100 * 1024 * 1024, max_recursion_depth: 1000)
132
+ _parse_with_limits(grammar_json, input, max_input_size, max_recursion_depth)
133
+ end
134
+
135
+ # Debug Tools
136
+ def parse_with_trace(grammar_json, input)
137
+ Parser.parse_with_trace(grammar_json, input)
138
+ end
139
+
140
+ def grammar_to_mermaid(grammar_json)
141
+ Parser.grammar_to_mermaid(grammar_json)
142
+ end
143
+
144
+ def grammar_to_dot(grammar_json)
145
+ Parser.grammar_to_dot(grammar_json)
146
+ end
147
+
148
+ # Legacy internal methods (for backward compatibility)
149
+ def _parse_with_spans(grammar_json, input)
150
+ Parser.send(:_parse_with_spans, grammar_json, input)
151
+ end
152
+
153
+ def _get_span(result, node_id)
154
+ Parser.send(:_get_span, result, node_id)
155
+ end
156
+
157
+ def _grammar_import(builder_json, grammar_json, prefix)
158
+ Parser.send(:_grammar_import, builder_json, grammar_json, prefix)
159
+ end
160
+
161
+ def _grammar_rule_mut(builder_json, rule_name)
162
+ Parser.send(:_grammar_rule_mut, builder_json, rule_name)
163
+ end
164
+
165
+ def _streaming_parser_new(grammar_json)
166
+ Parser.send(:_streaming_parser_new, grammar_json)
167
+ end
168
+
169
+ def _streaming_parser_add_chunk(parser, chunk)
170
+ Parser.send(:_streaming_parser_add_chunk, parser, chunk)
171
+ end
172
+
173
+ def _streaming_parser_parse_chunk(parser)
174
+ Parser.send(:_streaming_parser_parse_chunk, parser)
175
+ end
176
+
177
+ def _incremental_parser_new(grammar_json, initial_input)
178
+ Parser.send(:_incremental_parser_new, grammar_json, initial_input)
179
+ end
180
+
181
+ def _incremental_parser_apply_edit(parser, start, deleted, inserted)
182
+ Parser.send(:_incremental_parser_apply_edit, parser, start, deleted, inserted)
183
+ end
184
+
185
+ def _incremental_parser_reparse(parser, new_input)
186
+ Parser.send(:_incremental_parser_reparse, parser, new_input)
187
+ end
188
+
189
+ def _parse_batch_parallel(grammar_json, inputs, num_threads)
190
+ Parser.send(:_parse_batch_parallel, grammar_json, inputs, num_threads)
191
+ end
192
+
193
+ def _parse_with_limits(grammar_json, input, max_input_size, max_recursion_depth)
194
+ Parser.send(:_parse_with_limits, grammar_json, input, max_input_size, max_recursion_depth)
195
+ end
196
+
197
+ def _parse_with_trace(grammar_json, input)
198
+ Parser.send(:_parse_with_trace, grammar_json, input)
199
+ end
200
+
201
+ def _grammar_to_mermaid(grammar_json)
202
+ Parser.send(:_grammar_to_mermaid, grammar_json)
203
+ end
204
+
205
+ def _grammar_to_dot(grammar_json)
206
+ Parser.send(:_grammar_to_dot, grammar_json)
207
+ end
208
+ end
209
+ end
210
+ end
211
+
212
+ # Attempt to load native extension
213
+ begin
214
+ require 'parsanol/parsanol_native'
215
+ rescue LoadError
216
+ # Native extension not built yet
217
+ end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'ast_visitor'
4
+ require_relative 'optimizers/quantifier_optimizer'
5
+ require_relative 'optimizers/sequence_optimizer'
6
+ require_relative 'optimizers/choice_optimizer'
7
+ require_relative 'optimizers/lookahead_optimizer'
8
+ require_relative 'optimizers/cut_inserter'
9
+
10
+ # Grammar-level optimizations for Parslet parsers
11
+ # These optimizations transform the parser AST to reduce runtime overhead
12
+ # without changing semantics.
13
+ #
14
+ # Architecture:
15
+ # - Uses Visitor pattern for clean separation of traversal and transformation
16
+ # - Each simplification pass is a separate class inheriting from ASTVisitor; CutInserter is a standalone class with its own traversal
17
+ # - Optimizer module provides facade methods for easy access
18
+ module Parsanol
19
+ module Optimizer
20
+ # Simplifies redundant quantifiers in a parslet tree
21
+ # Example: str('a').repeat(1, 1) => str('a')
22
+ # str('a').repeat(0, 1).repeat(0, 1) => str('a').repeat(0, 1)
23
+ #
24
+ # @param parslet [Parsanol::Atoms::Base] parslet to simplify
25
+ # @return [Parsanol::Atoms::Base] simplified parslet
26
+ def self.simplify_quantifiers(parslet)
27
+ Optimizers::QuantifierOptimizer.new.visit(parslet)
28
+ end
29
+
30
+ # Simplifies sequences by flattening and merging adjacent strings
31
+ # Example: str('a') >> str('b') => str('ab')
32
+ # (str('a') >> str('b')) >> str('c') => str('abc')
33
+ #
34
+ # @param parslet [Parsanol::Atoms::Base] parslet to simplify
35
+ # @return [Parsanol::Atoms::Base] simplified parslet
36
+ def self.simplify_sequences(parslet)
37
+ Optimizers::SequenceOptimizer.new.visit(parslet)
38
+ end
39
+
40
+ # Simplifies choice/alternative patterns
41
+ # Example: (A | B) | C => A | B | C
42
+ # A | B | A => A | B
43
+ #
44
+ # @param parslet [Parsanol::Atoms::Base] parslet to simplify
45
+ # @return [Parsanol::Atoms::Base] simplified parslet
46
+ def self.simplify_choices(parslet)
47
+ Optimizers::ChoiceOptimizer.new.visit(parslet)
48
+ end
49
+
50
+ # Simplifies lookahead patterns
51
+ # Example: !(!x) => &x (double negation elimination)
52
+ #
53
+ # @param parslet [Parsanol::Atoms::Base] parslet to simplify
54
+ # @return [Parsanol::Atoms::Base] simplified parslet
55
+ def self.simplify_lookaheads(parslet)
56
+ Optimizers::LookaheadOptimizer.new.visit(parslet)
57
+ end
58
+
59
+ # Automatically insert cut operators where safe (AC-FIRST algorithm)
60
+ # Inserts cuts after deterministic prefixes when alternatives have disjoint FIRST sets
61
+ # This enables mostly constant space usage by allowing aggressive cache eviction
62
+ #
63
+ # Example: str('if') >> x | str('while') >> y
64
+ # => str('if').cut >> x | str('while').cut >> y
65
+ #
66
+ # @param parslet [Parsanol::Atoms::Base] parslet to optimize
67
+ # @return [Parsanol::Atoms::Base] optimized parslet with cuts inserted
68
+ def self.insert_cuts(parslet)
69
+ Optimizers::CutInserter.new.optimize(parslet)
70
+ end
71
+
72
+ # Apply all optimizations in recommended order
73
+ # This is a convenience method that applies all optimizer passes
74
+ #
75
+ # @param parslet [Parsanol::Atoms::Base] parslet to optimize
76
+ # @return [Parsanol::Atoms::Base] fully optimized parslet
77
+ def self.optimize_all(parslet)
78
+ result = simplify_quantifiers(parslet)
79
+ result = simplify_sequences(result)
80
+ result = simplify_choices(result)
81
+ result = simplify_lookaheads(result)
82
+ insert_cuts(result)
83
+ end
84
+ end
85
+ end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../ast_visitor'
4
+
5
+ module Parsanol
6
+ module Optimizers
7
+ # Optimizes alternative/choice patterns in the AST
8
+ # Follows visitor pattern for clean separation of concerns
9
+ #
10
+ # Transformations:
11
+ # - (A | B) | C => A | B | C (flatten nested alternatives)
12
+ # - A | B | A => A | B (remove duplicates)
13
+ # - Alternative(A) => A (unwrap single-element alternatives)
14
+ class ChoiceOptimizer < ASTVisitor
15
+ # Visit an alternative node and apply choice optimizations
16
+ # @param parslet [Parsanol::Atoms::Alternative] alternative to optimize
17
+ # @return [Parsanol::Atoms::Base] optimized parslet
18
+ def visit_alternative(parslet)
19
+ # First optimize children recursively
20
+ new_alternatives = parslet.alternatives.map { |p| visit(p) }
21
+
22
+ # Optimization 1: Flatten nested alternatives
23
+ flattened = flatten_alternatives(new_alternatives)
24
+
25
+ # Optimization 2: Remove duplicate alternatives
26
+ deduplicated = deduplicate_alternatives(flattened)
27
+
28
+ # Optimization 3: Unwrap single-element alternatives
29
+ return deduplicated[0] if deduplicated.size == 1
30
+
31
+ # Reuse the original node if nothing changed; otherwise build a new alternative
32
+ if deduplicated == parslet.alternatives
33
+ parslet
34
+ else
35
+ Parsanol::Atoms::Alternative.new(*deduplicated)
36
+ end
37
+ end
38
+
39
+ private
40
+
41
+ # Flatten nested alternatives into a single level
42
+ # @param alternatives [Array<Parsanol::Atoms::Base>] array of alternatives
43
+ # @return [Array<Parsanol::Atoms::Base>] flattened array
44
+ def flatten_alternatives(alternatives)
45
+ result = []
46
+ alternatives.each do |alt|
47
+ if alt.is_a?(Parsanol::Atoms::Alternative)
48
+ result.concat(alt.alternatives)
49
+ else
50
+ result << alt
51
+ end
52
+ end
53
+ result
54
+ end
55
+
56
+ # Remove duplicate alternatives, treating equal to_s output as structural equality
57
+ # @param alternatives [Array<Parsanol::Atoms::Base>] array of alternatives
58
+ # @return [Array<Parsanol::Atoms::Base>] deduplicated array
59
+ def deduplicate_alternatives(alternatives)
60
+ return alternatives if alternatives.size < 2
61
+
62
+ # Use to_s as proxy for structural equality
63
+ seen = {}
64
+ result = []
65
+
66
+ alternatives.each do |alt|
67
+ key = alt.to_s
68
+ unless seen[key]
69
+ seen[key] = true
70
+ result << alt
71
+ end
72
+ end
73
+
74
+ result
75
+ end
76
+ end
77
+ end
78
+ end
@@ -0,0 +1,179 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Automatic Cut Insertion (AC-FIRST Algorithm)
4
+ #
5
+ # This optimizer implements the AC-FIRST algorithm from Mizushima et al. (2010)
6
+ # to automatically insert cut operators when alternatives have disjoint FIRST sets.
7
+ #
8
+ # When all alternatives in a choice have non-overlapping FIRST sets, we can safely
9
+ # insert a cut after the deterministic prefix, since backtracking will never be
10
+ # needed.
11
+ #
12
+ # Example:
13
+ # str('if') >> condition >> then_clause |
14
+ # str('while') >> condition >> body |
15
+ # str('print') >> expression
16
+ #
17
+ # Becomes:
18
+ # str('if').cut >> condition >> then_clause |
19
+ # str('while').cut >> condition >> body |
20
+ # str('print').cut >> expression
21
+ #
22
+ # Reference: Mizushima et al. (2010) "Packrat Parsers Can Handle Practical
23
+ # Grammars in Mostly Constant Space"
24
+ #
25
+ module Parsanol
26
+ module Optimizers
27
+ class CutInserter
28
+ # Optimize a parslet by inserting cuts where safe
29
+ # Recursively traverses the grammar AST
30
+ #
31
+ # @param parslet [Parsanol::Atoms::Base] The parslet to optimize
32
+ # @return [Parsanol::Atoms::Base] Optimized parslet with cuts inserted
33
+ def optimize(parslet)
34
+ case parslet
35
+ when Parsanol::Atoms::Alternative
36
+ optimize_alternative(parslet)
37
+ when Parsanol::Atoms::Sequence
38
+ optimize_sequence(parslet)
39
+ when Parsanol::Atoms::Repetition
40
+ optimize_repetition(parslet)
41
+ when Parsanol::Atoms::Named
42
+ optimize_named(parslet)
43
+ else
44
+ # Return atom unchanged (Str, Re, Lookahead, etc.)
45
+ parslet
46
+ end
47
+ end
48
+
49
+ private
50
+
51
+ # Optimize an Alternative atom by inserting cuts when all alternatives
52
+ # have disjoint FIRST sets
53
+ def optimize_alternative(alt)
54
+ alternatives = alt.alternatives
55
+ first_sets = alternatives.map(&:first_set)
56
+
57
+ # Only optimize if all FIRST sets are disjoint
58
+ unless Parsanol::FirstSet.all_disjoint?(first_sets)
59
+ # Not safe to insert cuts - return alternatives with recursive optimization
60
+ optimized = alternatives.map { |a| optimize(a) }
61
+ return Parsanol::Atoms::Alternative.new(*optimized)
62
+ end
63
+
64
+ # All FIRST sets are disjoint - safe to insert cuts!
65
+ # Insert cuts after deterministic prefixes
66
+ optimized = alternatives.map do |alternative|
67
+ insert_cut_if_safe(alternative)
68
+ end
69
+
70
+ Parsanol::Atoms::Alternative.new(*optimized)
71
+ end
72
+
73
+ # Optimize a Sequence atom by recursively optimizing its elements
74
+ def optimize_sequence(seq)
75
+ optimized_parslets = seq.parslets.map { |p| optimize(p) }
76
+ Parsanol::Atoms::Sequence.new(*optimized_parslets)
77
+ end
78
+
79
+ # Optimize a Repetition atom by recursively optimizing its parslet
80
+ def optimize_repetition(rep)
81
+ optimized_parslet = optimize(rep.parslet)
82
+ # Create new repetition with same min/max
83
+ # Note: the original tag is not carried over (it's not exposed as a reader), so the default tag is used
84
+ Parsanol::Atoms::Repetition.new(
85
+ optimized_parslet,
86
+ rep.min,
87
+ rep.max
88
+ )
89
+ end
90
+
91
+ # Optimize a Named atom by recursively optimizing its parslet
92
+ def optimize_named(named)
93
+ optimized_parslet = optimize(named.parslet)
94
+ optimized_parslet.as(named.name)
95
+ end
96
+
97
+ # Insert a cut after the deterministic prefix if safe
98
+ # For sequences: find longest prefix without EPSILON
99
+ # For other atoms: cut the whole thing if it doesn't include EPSILON
100
+ def insert_cut_if_safe(parslet)
101
+ # For sequences, find the longest safe prefix
102
+ if parslet.is_a?(Parsanol::Atoms::Sequence)
103
+ prefix_parslets = find_deterministic_prefix(parslet)
104
+ return build_cut_sequence(parslet, prefix_parslets) if prefix_parslets && !prefix_parslets.empty?
105
+ end
106
+
107
+ # For other atoms, cut the whole thing if safe
108
+ return parslet.cut if safe_to_cut?(parslet)
109
+
110
+ # Not safe to cut - recursively optimize and return
111
+ optimize(parslet)
112
+ end
113
+
114
+ # Find the longest deterministic prefix of a sequence
115
+ # A deterministic prefix doesn't include EPSILON in its FIRST set
116
+ #
117
+ # @param sequence [Parsanol::Atoms::Sequence] The sequence to analyze
118
+ # @return [Array<Parsanol::Atoms::Base>, nil] Prefix parslets, or nil if none
119
+ def find_deterministic_prefix(sequence)
120
+ parslets = sequence.parslets
121
+ prefix_length = 0
122
+
123
+ # Find longest prefix where no element can match empty
124
+ parslets.each do |p|
125
+ break if p.first_set.include?(Parsanol::FirstSet::EPSILON)
126
+
127
+ prefix_length += 1
128
+ end
129
+
130
+ prefix_length.positive? ? parslets[0...prefix_length] : nil
131
+ end
132
+
133
+ # Check if it's safe to cut after this parslet
134
+ # Safe if the parslet doesn't have EPSILON in its FIRST set
135
+ # (i.e., it always consumes input)
136
+ def safe_to_cut?(parslet)
137
+ first = parslet.first_set
138
+ # Don't cut if EPSILON is in FIRST set (might not consume)
139
+ # Also don't cut if FIRST set is empty or contains only nil (unknown)
140
+ return false if first.include?(Parsanol::FirstSet::EPSILON)
141
+ return false if first.all?(&:nil?)
142
+
143
+ true
144
+ end
145
+
146
+ # Build a new sequence with a cut after the prefix
147
+ #
148
+ # @param sequence [Parsanol::Atoms::Sequence] Original sequence
149
+ # @param prefix_parslets [Array] Parslets forming the deterministic prefix
150
+ # @return [Parsanol::Atoms::Base] New sequence with cut inserted
151
+ def build_cut_sequence(sequence, prefix_parslets)
152
+ # Recursively optimize prefix parslets
153
+ optimized_prefix = prefix_parslets.map { |p| optimize(p) }
154
+
155
+ # Build prefix (single parslet or sequence)
156
+ prefix = if optimized_prefix.length == 1
157
+ optimized_prefix.first
158
+ else
159
+ Parsanol::Atoms::Sequence.new(*optimized_prefix)
160
+ end
161
+
162
+ # Get remaining parslets after prefix
163
+ remaining = sequence.parslets[prefix_parslets.length..]
164
+
165
+ # Recursively optimize remaining parslets
166
+ optimized_remaining = remaining.map { |p| optimize(p) }
167
+
168
+ # Build final sequence with cut
169
+ if optimized_remaining.empty?
170
+ # Prefix is the entire sequence
171
+ prefix.cut
172
+ else
173
+ # Prefix + cut + remaining
174
+ Parsanol::Atoms::Sequence.new(prefix.cut, *optimized_remaining)
175
+ end
176
+ end
177
+ end
178
+ end
179
+ end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../ast_visitor'
4
+
5
+ module Parsanol
6
+ module Optimizers
7
+ # Optimizes lookahead patterns in the AST
8
+ # Follows visitor pattern for clean separation of concerns
9
+ #
10
+ # Transformations:
11
+ # - !(!x) => &x (double negation elimination)
12
+ # - &(&x) => &x (positive lookahead is idempotent)
13
+ # - !(&x) => !x (negative of positive)
14
+ # - &(!x) => !x (positive of negative)
15
+ class LookaheadOptimizer < ASTVisitor
16
+ # Visit a lookahead node and apply lookahead optimizations
17
+ # @param parslet [Parsanol::Atoms::Lookahead] lookahead to optimize
18
+ # @return [Parsanol::Atoms::Base] optimized parslet
19
+ def visit_lookahead(parslet)
20
+ # First optimize the child
21
+ inner = visit(parslet.bound_parslet)
22
+
23
+ # If inner is also a lookahead, simplify nested lookaheads
24
+ if inner.is_a?(Parsanol::Atoms::Lookahead)
25
+ outer_positive = parslet.positive
26
+ inner_positive = inner.positive
27
+
28
+ # !(!x) => &x (double negation)
29
+ return Parsanol::Atoms::Lookahead.new(inner.bound_parslet, true) if !outer_positive && !inner_positive
30
+
31
+ # &(&x) => &x (idempotent)
32
+ return inner if outer_positive && inner_positive
33
+
34
+ # !(&x) => !x (negative of positive)
35
+ return Parsanol::Atoms::Lookahead.new(inner.bound_parslet, false) if !outer_positive && inner_positive
36
+
37
+ # &(!x) => !x (positive of negative)
38
+ return inner if outer_positive && !inner_positive
39
+ end
40
+
41
+ # Return lookahead with optimized child
42
+ if inner.equal?(parslet.bound_parslet)
43
+ parslet
44
+ else
45
+ Parsanol::Atoms::Lookahead.new(inner, parslet.positive)
46
+ end
47
+ end
48
+ end
49
+ end
50
+ end