pegparse 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rubocop.yml +13 -0
- data/Gemfile +14 -0
- data/LICENSE.txt +21 -0
- data/README.md +133 -0
- data/Rakefile +16 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/lib/pegparse/biop_rule_chain.rb +113 -0
- data/lib/pegparse/borrowed_areas.rb +35 -0
- data/lib/pegparse/line_counter.rb +61 -0
- data/lib/pegparse/parser_base.rb +139 -0
- data/lib/pegparse/parser_context.rb +19 -0
- data/lib/pegparse/parser_core.rb +243 -0
- data/lib/pegparse/parser_errors.rb +97 -0
- data/lib/pegparse/version.rb +5 -0
- data/lib/pegparse.rb +9 -0
- data/pegparse.gemspec +37 -0
- data/samples/bsh_parser.rb +337 -0
- data/samples/calc_parser.rb +55 -0
- data/samples/json_parser.rb +92 -0
- data/samples/xml_parser.rb +182 -0
- metadata +67 -0
@@ -0,0 +1,243 @@
|
|
1
|
+
require_relative "parser_context"
|
2
|
+
|
3
|
+
module Pegparse
  # Parser base class (core mechanism for backtracking).
  #
  # A rule is an instance method that consumes input through #read, #optional,
  # #choice and the other helpers below. A failed match is signalled with
  # `throw :backtrack`, which is caught by #parse, by the block form of #peek
  # and by the block form of #optional.
  class ParserCore
    # start rule symbol used by parse()
    # @return [Symbol]
    attr_accessor :start_rule_symbol

    # @param scanner_or_context [StringScanner, Pegparse::ParserContext, nil]
    #   when nil, the context must be supplied later through #parse
    def initialize(scanner_or_context)
      init_context(scanner_or_context) if scanner_or_context
    end

    # initialize inner state
    # @param scanner_or_context [StringScanner, Pegparse::ParserContext]
    #   an existing context is used as is; anything else is wrapped in a new one
    def init_context(scanner_or_context)
      @context =
        if scanner_or_context.is_a?(Pegparse::ParserContext)
          scanner_or_context
        else
          Pegparse::ParserContext.new(scanner_or_context)
        end
    end

    # Start parse
    # @param scanner_or_context [StringScanner, Pegparse::ParserContext, nil]
    #   replaces the current context when given
    # @param rule [Symbol, nil] start rule; falls back to #start_rule_symbol
    # @return [Object] match result (nil when the start rule backtracked)
    # @raise [ArgumentError] when no context or no start rule is available
    def parse(scanner_or_context = nil, rule: nil)
      raise ArgumentError, "scanner or context is required" if !scanner_or_context && !@context
      raise ArgumentError, "start rule is required" if !rule && !@start_rule_symbol

      init_context(scanner_or_context) if scanner_or_context
      current_start_rule_symbol = rule || @start_rule_symbol

      ret = nil
      catch(:backtrack) do
        ret = __send__(current_start_rule_symbol)
      end
      # Errors collected on abandoned alternatives are dropped once the whole
      # input has been consumed.
      @context.errors.clear_errors if eos?
      ret
    end

    # @return [Boolean] whether the scanner is at the end of input
    def eos?
      @context.scanner.eos?
    end

    # Record a failure at the current position together with the rule stack.
    # @param reason [Object] what was expected (pattern or description)
    def save_error(reason)
      @context.errors.save_error(@context.scanner.pos, @context.rule_stack, reason)
    end

    # Move the scanner back to pos and notify the borrowed-area bookkeeping.
    # @param pos [Integer] scanner position to return to
    def backtrack_position_to(pos)
      @context.scanner.pos = pos
      @context.borrowed_areas.backtracked(pos)
    end

    # parse error info
    # @return [Array] array of meaningful errors. an element should be
    #   [[[line, char], parent reason], [[line, char], child reason]]
    def best_errors
      @context.errors.best_errors.map do |error|
        error.map { |rule| [@context.line_counter.position(rule.pos), rule.reason] }
      end
    end

    # Check whether matching will succeed or not, without consuming input.
    # @param str_or_regexp [String, Regexp, nil] matching (if nil, block will be used)
    # @return [String, Object, nil] matched String, or nil if nothing matched
    #   (if you call with block, return is block's result, nil on backtrack)
    # @raise [ArgumentError] when both or neither of pattern and block are
    #   given, or when the pattern is neither String nor Regexp
    def peek(str_or_regexp = nil, &block)
      raise ArgumentError, "give either a pattern or a block, not both" if str_or_regexp && block
      raise ArgumentError, "a pattern or a block is required" if !str_or_regexp && !block

      scanner = @context.scanner

      if block
        bk_pos = scanner.pos
        ret = nil
        catch(:backtrack) do
          ret = block.call
        end
        # The position is restored whether or not the block matched.
        backtrack_position_to(bk_pos)
        return ret
      end

      matched =
        case str_or_regexp
        when String
          str_or_regexp if scanner.match?(str_or_regexp)
        when Regexp
          size = scanner.match?(str_or_regexp)
          scanner.peek(size) if size
        else
          raise ArgumentError, "pattern must be a String or a Regexp"
        end
      return nil unless matched

      @context.line_counter.memo(scanner.pos, matched)
      # A match whose last byte lies in an area reported by the borrowed-area
      # bookkeeping is treated as no match.
      return nil if @context.borrowed_areas.conflicted_area(scanner.pos + matched.bytesize - 1)

      matched
    end

    # Match with pattern. Backtrack if match failed.
    # @param str_or_regexp [String, Regexp] matching
    # @return [String] match result
    # @raise [ArgumentError] when no pattern is given
    def read(str_or_regexp)
      raise ArgumentError, "a pattern is required" unless str_or_regexp

      ret = peek(str_or_regexp)
      unless ret
        save_error(str_or_regexp)
        backtrack
      end
      @context.scanner.pos += ret.bytesize
      ret
    end

    # Match with pattern or block. Returns nil if match failed.
    # @param str_or_regexp [String, Regexp, nil] matching (if nil, block will be used)
    # @return [String, Object, nil] match result
    # @raise [ArgumentError] when both or neither of pattern and block are given
    def optional(str_or_regexp = nil, &block)
      raise ArgumentError, "give either a pattern or a block, not both" if str_or_regexp && block
      raise ArgumentError, "a pattern or a block is required" if !str_or_regexp && !block

      if block
        bk_pos = @context.scanner.pos
        catch(:backtrack) do
          # The :optional marker lets the error collector see where this
          # optional part started.
          @context.rule_stack.push [bk_pos, :optional]
          begin
            return block.call
          ensure
            @context.rule_stack.pop
          end
        end
        # Only reached when the block backtracked.
        backtrack_position_to(bk_pos)
        return nil
      end

      ret = peek(str_or_regexp)
      @context.scanner.pos += ret.bytesize if ret
      ret
    end

    # Abort the current match; control returns to the nearest enclosing
    # catch(:backtrack).
    def backtrack
      throw :backtrack
    end

    # Replace method_sym with a wrapper that records the rule on the rule
    # stack while the original (kept as original_<name>) runs. Does nothing if
    # original_<name> is already defined.
    # @param method_sym [Symbol] method to wrap
    def self.wrap_with_trace_method(method_sym)
      original_method_sym = :"original_#{method_sym}"
      return if method_defined?(original_method_sym)

      alias_method original_method_sym, method_sym
      # Keyword arguments and the block are forwarded explicitly: on Ruby 3 a
      # bare *args would turn keywords into a positional Hash and drop the block.
      define_method(method_sym) do |*args, **kwargs, &block|
        @context.rule_stack.push [@context.scanner.pos, method_sym]
        begin
          __send__(original_method_sym, *args, **kwargs, &block)
        ensure
          @context.rule_stack.pop
        end
      end
    end

    # Wrap method as nonterminal symbol rule.
    # @param method_sym [Symbol] wrapping method symbol
    # @return [Symbol]
    def self.rule(method_sym)
      wrap_with_trace_method(method_sym)
      method_sym
    end

    ###

    # Try to match some candidates in order. (PEG's choice operator) Backtrack if all match failed.
    # A candidate that returns nil or false counts as failed.
    # @param alter_procs [Array<Proc>] match candidates
    # @return [Object] result of the matched candidate
    def choice(*alter_procs)
      alter_procs.each do |alter_proc|
        ret = optional { alter_proc.call }
        return ret if ret
      end
      backtrack
    end

    # Try to match in loop. Returns [] even if no loop succeeded.
    # The loop ends when the block backtracks or returns nil/false.
    # @return [Array<Object>] array of match results for each loop
    def zero_or_more(&block)
      results = []
      while (val = optional { block.call })
        results << val
      end
      results
    end

    # Try to match in loop. Backtrack if no loop succeeded.
    # @return [Array<Object>] array of match results for each loop
    def one_or_more(&block)
      # The first iteration is not wrapped in optional, so its failure propagates.
      results = [block.call]
      while (val = optional { block.call })
        results << val
      end
      results
    end

    # Temporarily change scanner position to next line (use for here-document).
    # The area consumed by block becomes non-matchable for #peek/#read.
    # @return [Object] block's result
    def borrow_next_line(&block)
      mark_pos = @context.scanner.pos
      borrowed_start_pos = @context.borrowed_areas.borrowed_area_end_pos
      unless borrowed_start_pos
        # Nothing borrowed yet: skip the rest of the current line.
        read(/.*\n/)
        borrowed_start_pos = @context.scanner.pos
      end
      @context.scanner.pos = borrowed_start_pos
      ret = block.call
      borrowed_end_pos = @context.scanner.pos
      @context.scanner.pos = mark_pos
      @context.borrowed_areas.add_area(Pegparse::BorrowedArea.new(
        marker_pos: mark_pos,
        start_pos: borrowed_start_pos,
        end_pos: borrowed_end_pos,
      ))
      ret
    end

    # Match to borrowed area. Backtrack unless a borrowed area starts exactly
    # at the current position.
    # @return [String] the text of the borrowed area
    def borrowed_area
      area = @context.borrowed_areas.conflicted_area(@context.scanner.pos)
      backtrack unless area && area.start_pos == @context.scanner.pos

      ret = @context.scanner.peek(area.end_pos - area.start_pos)
      @context.scanner.pos = area.end_pos
      ret
    end
  end
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
|
2
|
+
module Pegparse
  # One frame of a failure trace: the position where a rule (or the final
  # failed pattern) started, and what it was.
  ParserError = Struct.new(
    :pos,
    :reason,
    keyword_init: true,
  )

  # Ranking key for one stored failure trace: its index in the stored list and
  # the start positions of the :optional frames on its stack.
  ParseErrorLocation = Struct.new(
    :index_in_errors,
    :start_positions_of_optional,
    keyword_init: true,
  )

  # Collects the failures seen at the farthest input position and picks the
  # most meaningful ones.
  class ParserErrors
    def initialize
      @farthest_pos = 0
      @farthest_errors = []
    end

    # just save parsing error
    # Errors before the farthest known position are ignored; an error beyond
    # it discards everything stored so far.
    # @param pos [Integer]
    # @param rule_stack [Array] array of [matching start pos, matching symbol]
    # @param reason [Object]
    def save_error(pos, rule_stack, reason)
      return if pos < @farthest_pos

      @farthest_errors.clear if pos > @farthest_pos
      @farthest_pos = pos
      trace = rule_stack.map do |frame_pos, frame_reason|
        Pegparse::ParserError.new(pos: frame_pos, reason: frame_reason)
      end
      trace << Pegparse::ParserError.new(pos: pos, reason: reason)
      @farthest_errors << trace
    end

    # compare two errors which is better by parsing descent path
    # (the earlier an optional matching starts, the lower its priority)
    # @param a [Pegparse::ParseErrorLocation]
    # @param b [Pegparse::ParseErrorLocation]
    # @return [Integer] negative when a is better, positive when b is better,
    #   0 when both have the same optional start positions
    def compare_optional_memo(a, b)
      a_opts = a.start_positions_of_optional
      b_opts = b.start_positions_of_optional

      [a_opts.size, b_opts.size].min.times do |i|
        # Reversed operands: the later-starting optional ranks first.
        order = b_opts[i] <=> a_opts[i]
        return order unless order.zero?
      end
      # Same common prefix: the trace with fewer optionals ranks first.
      a_opts.size <=> b_opts.size
    end

    # get meaningful errors
    # @return [Array<Array<Pegparse::ParserError>>] for each best trace, its
    #   last two non-:optional frames (parent rule, then the failed match)
    def best_errors
      optional_memos = @farthest_errors.map.with_index do |stack, index|
        Pegparse::ParseErrorLocation.new(
          index_in_errors: index,
          start_positions_of_optional: stack.select { |rule| rule.reason == :optional }.map(&:pos),
        )
      end

      best_memos = []
      optional_memos.each do |memo|
        order = best_memos.empty? ? -1 : compare_optional_memo(memo, best_memos.first)
        if order.negative?
          best_memos = [memo]
        elsif order.zero?
          best_memos << memo
        end
      end

      best_memos.map do |memo|
        @farthest_errors[memo.index_in_errors].reject { |frame| frame.reason == :optional }.last(2)
      end
    end

    # remove all stored errors
    def clear_errors
      @farthest_pos = 0
      @farthest_errors = []
    end
  end
end
|
data/lib/pegparse.rb
ADDED
data/pegparse.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/pegparse/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |gem|
  # Identity
  gem.name = "pegparse"
  gem.version = Pegparse::VERSION
  gem.authors = ["Riki Ishikawa"]
  gem.email = ["riki.ishikawa@gmail.com"]

  # Description shown on the registry
  gem.summary = "library to create recursive descent parser."
  gem.description = "provide base class for PEG like recursive descent parser."
  gem.homepage = "https://github.com/jljse/pegparse"
  gem.license = "MIT"
  gem.required_ruby_version = ">= 3.0.0"

  # gem.metadata["allowed_push_host"] = "TODO: Set to 'https://mygemserver.com'"

  gem.metadata["homepage_uri"] = gem.homepage
  gem.metadata["source_code_uri"] = "https://github.com/jljse/pegparse"
  # gem.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."

  # Package every file tracked by git (listed NUL-separated by
  # `git ls-files -z`), except those under test/, spec/ or features/.
  gem.files = Dir.chdir(File.expand_path(__dir__)) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| path.match(%r{\A(?:test|spec|features)/}) }
  end
  gem.bindir = "exe"
  # Executables are the basenames of the packaged files under exe/.
  gem.executables = gem.files.grep(%r{\Aexe/}) { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  # Uncomment to register a new dependency of your gem
  # gem.add_dependency "example-gem", "~> 1.0"

  # For more information and examples about making a new gem, checkout our
  # guide at: https://bundler.io/guides/creating_gem.html
end
|