ebnf 1.2.0 → 2.0.0

@@ -3,7 +3,9 @@ module EBNF
  autoload :BNF, "ebnf/bnf"
  autoload :LL1, "ebnf/ll1"
  autoload :Parser, "ebnf/parser"
+ autoload :PEG, "ebnf/peg"
  autoload :Rule, "ebnf/rule"
+ autoload :Terminals, "ebnf/terminals"
  autoload :Writer, "ebnf/writer"
  autoload :VERSION, "ebnf/version"
 
@@ -18,6 +20,6 @@ module EBNF
  # @return [EBNF::Base]
  # @raise [Exception] on invalid input
  def self.parse(input, **options)
-   query = ::EBNF::Base.new(input, **options)
+   ::EBNF::Base.new(input, **options)
  end
  end
@@ -2,7 +2,7 @@ require 'strscan'
 
  # Extended Backus-Naur Form (EBNF), the W3C variation, is
  # originally defined in the
- # [W3C XML 1.0 Spec](http://www.w3.org/TR/REC-xml/#sec-notation).
+ # [W3C XML 1.0 Spec](https://www.w3.org/TR/REC-xml/#sec-notation).
  #
  # This version attempts to be less strict than the strict definition
  # to allow for colloquial variations (such as in the Turtle syntax).
@@ -12,8 +12,8 @@ require 'strscan'
  #
  # Comments include the content between '/*' and '*/'
  #
- # @see http://www.w3.org/2000/10/swap/grammar/ebnf2turtle.py
- # @see http://www.w3.org/2000/10/swap/grammar/ebnf2bnf.n3
+ # @see https://www.w3.org/2000/10/swap/grammar/ebnf2turtle.py
+ # @see https://www.w3.org/2000/10/swap/grammar/ebnf2bnf.n3
  #
  # Based on bnf2turtle by Dan Connolly.
  #
@@ -36,7 +36,7 @@ require 'strscan'
  # derived mechanically from the specification.
  #
  #
- # [N3 design note]: http://www.w3.org/DesignIssues/Notation3
+ # [N3 design note]: https://www.w3.org/DesignIssues/Notation3
  #
  # Related Work
  # ------------
@@ -59,12 +59,12 @@ require 'strscan'
  # expression of the grammar in terms of the higher level EBNF
  # constructs.
  #
- # [goal]: http://www.w3.org/2002/02/mid/1086902566.21030.1479.camel@dirk;list=public-cwm-bugs
- # [n3p announcement]: http://lists.w3.org/Archives/Public/public-cwm-talk/2004OctDec/0029.html
- # [Yacker]: http://www.w3.org/1999/02/26-modules/User/Yacker
- # [SPARQL specification]: http://www.w3.org/TR/rdf-sparql-query/
- # [Cwm Release 1.1.0rc1]: http://lists.w3.org/Archives/Public/public-cwm-announce/2005JulSep/0000.html
- # [bnf-rules.n3]: http://www.w3.org/2000/10/swap/grammar/bnf-rules.n3
+ # [goal]: https://www.w3.org/2002/02/mid/1086902566.21030.1479.camel@dirk;list=public-cwm-bugs
+ # [n3p announcement]: https://lists.w3.org/Archives/Public/public-cwm-talk/2004OctDec/0029.html
+ # [Yacker]: https://www.w3.org/1999/02/26-modules/User/Yacker
+ # [SPARQL specification]: https://www.w3.org/TR/rdf-sparql-query/
+ # [Cwm Release 1.1.0rc1]: https://lists.w3.org/Archives/Public/public-cwm-announce/2005JulSep/0000.html
+ # [bnf-rules.n3]: https://www.w3.org/2000/10/swap/grammar/bnf-rules.n3
  #
  # Open Issues and Future Work
  # ---------------------------
@@ -82,8 +82,8 @@ require 'strscan'
  # It would be interesting to corroborate the claim in the SPARQL spec
  # that the grammar is LL(1) with a mechanical proof based on N3 rules.
  #
- # [swap/grammar/bnf]: http://www.w3.org/2000/10/swap/grammar/bnf
- # [bnf2html.n3]: http://www.w3.org/2000/10/swap/grammar/bnf2html.n3
+ # [swap/grammar/bnf]: https://www.w3.org/2000/10/swap/grammar/bnf
+ # [bnf2html.n3]: https://www.w3.org/2000/10/swap/grammar/bnf2html.n3
  #
  # Background
  # ----------
@@ -93,7 +93,7 @@ require 'strscan'
  # of N3 that maps directly to (and from) the standard XML syntax for
  # RDF.
  #
- # [N3 Primer]: http://www.w3.org/2000/10/swap/Primer.html
+ # [N3 Primer]: https://www.w3.org/2000/10/swap/Primer.html
  #
  # @author Gregg Kellogg
  module EBNF
@@ -101,6 +101,7 @@ module EBNF
  include BNF
  include LL1
  include Parser
+ include PEG
 
  # Abstract syntax tree from parse
  #
@@ -116,9 +117,9 @@ module EBNF
  # in S-Expressions (similar to SPARQL SSE)
  #
  # @param [#read, #to_s] input
- # @param [Hash{Symbol => Object}] options
- # @param [Symbol] :format (:ebnf)
+ # @param [Symbol] format (:ebnf)
  #   Format of input, one of :ebnf, or :sxp
+ # @param [Hash{Symbol => Object}] options
  # @option options [Boolean, Array] :debug
  #   Output debug information to an array or $stdout.
  def initialize(input, format: :ebnf, **options)
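
For context, a minimal usage sketch of the revised signature (not part of the diff; the grammar file name and the `to_sxp` call are illustrative assumptions):

    require 'ebnf'

    # Parse an EBNF grammar; format: defaults to :ebnf, :sxp input is also accepted.
    ebnf = EBNF.parse(File.read("grammar.ebnf"), format: :ebnf)
    puts ebnf.to_sxp  # illustrative: dump the parsed rules as S-Expressions
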
@@ -194,26 +195,26 @@ module EBNF
  # Output Ruby parser files
  #
  # @param [IO, StringIO] output
- # @param [String] :grammarFile
- # @param [String] :mod_name ('Branch')
- def to_ruby(output = $stdout, grammarFile: nil, mod_name: 'Branch')
+ # @param [String] grammarFile
+ # @param [String] mod_name ('Meta')
+ def to_ruby(output = $stdout, grammarFile: nil, mod_name: 'Meta', **options)
    unless output == $stdout
-     output.puts "# This file is automatically generated by #{__FILE__}"
-     output.puts "# BRANCH derived from #{grammarFile}" if grammarFile
+     output.puts "# This file is automatically generated by ebnf version #{EBNF::VERSION}"
+     output.puts "# Derived from #{grammarFile}" if grammarFile
      unless self.errors.empty?
-       output.puts "# Note, tables completed with errors, may need to be resolved manually:"
+       output.puts "# Note, grammar has errors, may need to be resolved manually:"
        #output.puts "# #{pp.conflicts.map{|c| c.join("\n# ")}.join("\n# ")}"
      end
      output.puts "module #{mod_name}"
-     output.puts " START = #{self.start.inspect}"
-     output.puts
+     output.puts " START = #{self.start.inspect}\n" if self.start
+   end
+
+   # Either output LL(1) BRANCH tables or rules for PEG parsing
+   if ast.first.is_a?(EBNF::PEG::Rule)
+     to_ruby_peg(output)
+   else
+     to_ruby_ll1(output)
    end
-   self.outputTable(output, 'BRANCH', self.branch, 1)
-   self.outputTable(output, 'TERMINALS', self.terminals, 1)
-   self.outputTable(output, 'FIRST', self.first, 1)
-   self.outputTable(output, 'FOLLOW', self.follow, 1)
-   self.outputTable(output, 'CLEANUP', self.cleanup, 1)
-   self.outputTable(output, 'PASS', [self.pass], 1) if self.pass
    unless output == $stdout
      output.puts "end"
    end
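
A brief sketch of driving the new dispatch between PEG rules and LL(1) tables (illustrative only; the file names are assumptions, and `make_bnf`/`first_follow` are run first so the LL(1) tables exist):

    require 'ebnf'

    ebnf = EBNF.parse(File.read("grammar.ebnf"))
    ebnf.make_bnf                 # rewrite EBNF rules into BNF
    ebnf.first_follow(:ebnf)      # build first/follow metadata for LL(1)
    File.open("meta.rb", "w") do |io|
      ebnf.to_ruby(io, grammarFile: "grammar.ebnf")  # emits "module Meta ... end"
    end
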
@@ -17,32 +17,7 @@ module EBNF
    new_ast += new_rules
  end
 
- # Consolodate equivalent terminal rules
- to_rewrite = {}
- new_ast.select {|r| r.terminal?}.each do |src_rule|
-   new_ast.select {|r| r.terminal?}.each do |dst_rule|
-     if src_rule.equivalent?(dst_rule) && src_rule != dst_rule
-       debug("make_bnf") {"equivalent rules: #{src_rule.inspect} and #{dst_rule.inspect}"}
-       (to_rewrite[src_rule] ||= []) << dst_rule
-     end
-   end
- end
-
- # Replace references to equivalent rules with canonical rule
- to_rewrite.each do |src_rule, dst_rules|
-   dst_rules.each do |dst_rule|
-     new_ast.each do |mod_rule|
-       debug("make_bnf") {"rewrite #{mod_rule.inspect} from #{dst_rule.sym} to #{src_rule.sym}"}
-       mod_rule.rewrite(dst_rule, src_rule)
-     end
-   end
- end
-
- # AST now has just rewritten rules
- compacted_ast = new_ast - to_rewrite.values.flatten.compact
-
- # Sort AST by number
- @ast = compacted_ast
+ @ast = new_ast
  progress("make_bnf") {"End: #{@ast.length} rules"}
  self
  end
@@ -1,4 +1,90 @@
  module EBNF
+ ##
+ # This module extends {EBNF::Base} to create metadata including _branch_, [First/Follow][], and other tables which are used by {EBNF::LL1::Parser} to recognize examples of the associated grammar.
+ #
+ ### Branch Table
+ #
+ # The Branch table is a hash mapping production rules to a hash relating terminals appearing in the input to the sequence of productions to follow when the corresponding input terminal is found. This allows either the `seq` primitive, where all terminals map to the same sequence of productions, or the `alt` primitive, where each terminal may map to a different production.
+ #
+ #     BRANCH = {
+ #       :alt => {
+ #         "(" => [:seq, :_alt_1],
+ #         :ENUM => [:seq, :_alt_1],
+ #         :HEX => [:seq, :_alt_1],
+ #         :O_ENUM => [:seq, :_alt_1],
+ #         :O_RANGE => [:seq, :_alt_1],
+ #         :RANGE => [:seq, :_alt_1],
+ #         :STRING1 => [:seq, :_alt_1],
+ #         :STRING2 => [:seq, :_alt_1],
+ #         :SYMBOL => [:seq, :_alt_1],
+ #       },
+ #       ...
+ #       :declaration => {
+ #         "@pass" => [:pass],
+ #         "@terminals" => ["@terminals"],
+ #       },
+ #       ...
+ #     }
+ #
+ # In this case the `alt` rule, which is `seq ('|' seq)*`, can be taken when any of the specified tokens appears on the input stream. They all cause the same token to be passed to the `seq` rule, followed by `_alt_1`, which handles the `('|' seq)*` portion of the rule after the first sequence is matched.
+ #
+ # The `declaration` rule is `'@terminals' | pass`, using the `alt` primitive to determine the production to run based on the terminal appearing on the input stream. Eventually, a terminal production is found and the token is consumed.
+ #
+ ### First/Follow Table
+ #
+ # The [First/Follow][] table is a hash mapping production rules to the terminals that may precede or follow the rule. For example:
+ #
+ #     FIRST = {
+ #       :alt => [
+ #         :HEX,
+ #         :SYMBOL,
+ #         :ENUM,
+ #         :O_ENUM,
+ #         :RANGE,
+ #         :O_RANGE,
+ #         :STRING1,
+ #         :STRING2,
+ #         "("],
+ #       ...
+ #     }
+ #
+ ### Terminals Table
+ #
+ # This table is a simple list of the terminal productions found in the grammar. For example:
+ #
+ #     TERMINALS = ["(", ")", "-",
+ #       "@pass", "@terminals",
+ #       :ENUM, :HEX, :LHS, :O_ENUM, :O_RANGE, :POSTFIX,
+ #       :RANGE, :STRING1, :STRING2, :SYMBOL, "|"
+ #     ].freeze
+ #
+ ### Cleanup Table
+ #
+ # This table identifies productions which used EBNF rules, which are transformed to BNF for actual parsing. This allows the parser, in some cases, to reproduce *star*, *plus*, and *opt* rule matches. For example:
+ #
+ #     CLEANUP = {
+ #       :_alt_1 => :star,
+ #       :_alt_3 => :merge,
+ #       :_diff_1 => :opt,
+ #       :ebnf => :star,
+ #       :_ebnf_2 => :merge,
+ #       :_postfix_1 => :opt,
+ #       :seq => :plus,
+ #       :_seq_1 => :star,
+ #       :_seq_2 => :merge,
+ #     }.freeze
+ #
+ # In this case the `ebnf` rule was `(declaration | rule)*`. As BNF does not support a star operator, this is decomposed into a set of rules using `alt` and `seq` primitives:
+ #
+ #     ebnf    ::= _empty _ebnf_2
+ #     _ebnf_1 ::= declaration | rule
+ #     _ebnf_2 ::= _ebnf_1 ebnf
+ #     _ebnf_3 ::= ebnf
+ #
+ # The `_empty` production matches an empty string, so allows for no value. `_ebnf_2` matches `declaration | rule` (using the `alt` primitive) followed by `ebnf`, creating a sequence of zero or more `declaration` or `rule` members.
+ #
+ # [First/Follow]: https://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL.281.29_parsing_table
+
  module LL1
  autoload :Lexer, "ebnf/ll1/lexer"
  autoload :Parser, "ebnf/ll1/parser"
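
To make the role of these tables concrete, here is a minimal, hypothetical sketch of a branch-table lookup (this is an illustration only, not the gem's `EBNF::LL1::Parser` implementation; the table excerpt and the `branch_for` name are assumptions):

    # Hypothetical driver showing how a BRANCH entry selects the productions to expand.
    BRANCH_EXCERPT = {
      :alt => {
        "("     => [:seq, :_alt_1],
        :SYMBOL => [:seq, :_alt_1]
      }
    }.freeze

    # Given the current production and the type of the next input token,
    # return the sequence of productions to follow, or fail with a syntax error.
    def branch_for(production, token_type)
      row = BRANCH_EXCERPT.fetch(production) { raise "no branch entries for #{production}" }
      row.fetch(token_type) { raise "unexpected #{token_type.inspect} while parsing #{production}" }
    end

    branch_for(:alt, :SYMBOL)  #=> [:seq, :_alt_1]
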
@@ -51,8 +137,40 @@ module EBNF
  ##
  # Create first/follow for each rule using techniques defined for LL(1) parsers.
  #
+ # This takes rules which have been transformed into BNF and adds first/follow and other information to the rules to allow the generation of metadata tables used for driving a parser.
+ #
+ # Given an initial rule in EBNF:
+ #
+ #     (rule ebnf "1" (star declaration rule))
+ #
+ # The BNF transformation becomes:
+ #
+ #     (rule ebnf "1" (alt _empty _ebnf_2))
+ #     (rule _ebnf_1 "1.1" (alt declaration rule))
+ #     (rule _ebnf_2 "1.2" (seq _ebnf_1 ebnf))
+ #     (rule _ebnf_3 "1.3" (seq ebnf))
+ #
+ # After running this method, the rules are annotated with first/follow and cleanup rules:
+ #
+ #     (rule ebnf "1"
+ #      (start #t)
+ #      (first "@pass" "@terminals" LHS _eps)
+ #      (follow _eof)
+ #      (cleanup star)
+ #      (alt _empty _ebnf_2))
+ #     (rule _ebnf_1 "1.1"
+ #      (first "@pass" "@terminals" LHS)
+ #      (follow "@pass" "@terminals" LHS _eof)
+ #      (alt declaration rule))
+ #     (rule _ebnf_2 "1.2"
+ #      (first "@pass" "@terminals" LHS)
+ #      (follow _eof)
+ #      (cleanup merge)
+ #      (seq _ebnf_1 ebnf))
+ #     (rule _ebnf_3 "1.3" (first "@pass" "@terminals" LHS _eps) (follow _eof) (seq ebnf))
+ #
  # @return [EBNF] self
- # @see http://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL.281.29_parsing_table
+ # @see https://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL.281.29_parsing_table
  # @param [Array<Symbol>] starts
  #   Set of symbols which are start rules
  def first_follow(*starts)
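
As a rough illustration of the "techniques defined for LL(1) parsers" mentioned above, here is a simplified fixed-point computation of FIRST sets over BNF-style rules. This is not the gem's algorithm (it ignores epsilon propagation through sequences); the rule data and names are assumptions for the example:

    # Minimal sketch: compute FIRST sets by iterating until nothing changes.
    # A symbol with no rule of its own is treated as a terminal.
    rules = {
      ebnf:    [[:_empty], [:_ebnf_2]],   # alt => one array per alternative
      _ebnf_1: [[:declaration], [:rule]],
      _ebnf_2: [[:_ebnf_1, :ebnf]]        # seq => a single array
    }

    first = Hash.new { |h, k| h[k] = [] }
    changed = true
    while changed
      changed = false
      rules.each do |lhs, alternatives|
        alternatives.each do |body|
          sym = body.first
          additions = rules.key?(sym) ? first[sym] : [sym]  # terminals contribute themselves
          additions.each do |t|
            next if first[lhs].include?(t)
            first[lhs] << t
            changed = true
          end
        end
      end
    end
    # first[:ebnf] now contains :_empty, :declaration and :rule (via _ebnf_2 and _ebnf_1)
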
@@ -276,6 +394,19 @@ module EBNF
    end
  end
 
+ ##
+ # Output Ruby parser files for LL(1) parsing
+ #
+ # @param [IO, StringIO] output
+ def to_ruby_ll1(output, **options)
+   self.outputTable(output, 'BRANCH', self.branch, 1)
+   self.outputTable(output, 'TERMINALS', self.terminals, 1)
+   self.outputTable(output, 'FIRST', self.first, 1)
+   self.outputTable(output, 'FOLLOW', self.follow, 1)
+   self.outputTable(output, 'CLEANUP', self.cleanup, 1)
+   self.outputTable(output, 'PASS', [self.pass], 1) if self.pass
+ end
+
  private
  def do_production(lhs)
    rule = find_rule(lhs)
@@ -29,7 +29,7 @@ module EBNF::LL1
  #     warn error.inspect
  #   end
  #
- # @see http://en.wikipedia.org/wiki/Lexical_analysis
+ # @see https://en.wikipedia.org/wiki/Lexical_analysis
  class Lexer
  include Enumerable
 
@@ -43,10 +43,10 @@ module EBNF::LL1
    "\\'" => '\'', # \u0027 (apostrophe-quote, single quote mark)
    '\\\\' => '\\' # \u005C (backslash)
  }.freeze
- ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/.freeze # \uXXXX
- ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/.freeze # \UXXXXXXXX
- ECHAR = /\\./ # More liberal unescaping
- UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/.freeze
+ ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/u.freeze # \uXXXX
+ ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/u.freeze # \UXXXXXXXX
+ ECHAR = /\\./u.freeze # More liberal unescaping
+ UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/n.freeze
 
  ##
  # @return [Regexp] defines whitespace, including comments, otherwise whitespace must be explicit in terminals
@@ -59,7 +59,7 @@ module EBNF::LL1
  #
  # @param [String] string
  # @return [String]
- # @see http://www.w3.org/TR/rdf-sparql-query/#codepointEscape
+ # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
  def self.unescape_codepoints(string)
    string = string.dup
    string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding)
@@ -81,7 +81,7 @@ module EBNF::LL1
  #
  # @param [String] input
  # @return [String]
- # @see http://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
+ # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
  def self.unescape_string(input)
    input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
  end
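
For reference, a small illustration of the two unescape helpers documented above (the example strings and expected results are assumptions based on the escape tables shown, not output captured from the gem):

    require 'ebnf'

    # \uXXXX codepoint escapes are replaced by the corresponding characters.
    EBNF::LL1::Lexer.unescape_codepoints("\\u0041\\u0042")  #=> "AB"

    # ECHAR escapes such as \t and \' are replaced via ESCAPE_CHARS.
    EBNF::LL1::Lexer.unescape_string("\\t\\'")              #=> "\t'"
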
@@ -131,7 +131,6 @@ module EBNF::LL1
 
    raise Error, "Terminal patterns not defined" unless @terminals && @terminals.length > 0
 
-   @lineno = 1
    @scanner = Scanner.new(input, **options)
  end
 
@@ -147,12 +146,6 @@ module EBNF::LL1
  # @return [String]
  attr_accessor :input
 
- ##
- # The current line number (zero-based).
- #
- # @return [Integer]
- attr_reader :lineno
-
  ##
  # Returns `true` if the input string is lexically valid.
  #
@@ -194,7 +187,7 @@ module EBNF::LL1
 
 
    @first ||= begin
      {} while !scanner.eos? && skip_whitespace
-     return @scanner = nil if scanner.eos?
+     return nil if scanner.eos?
 
      token = match_token(*types)
@@ -233,7 +226,7 @@ module EBNF::LL1
  # @return [Token]
  def recover(*types)
    until scanner.eos? || tok = match_token(*types)
-     if scanner.skip_until(@whitespace || /\s/m).nil? # Skip past current "token"
+     if scanner.skip_until(@whitespace || /\s+/m).nil? # Skip past current "token"
        # No whitespace at the end, must be at end of string
        scanner.terminate
      else
@@ -243,6 +236,14 @@ module EBNF::LL1
    scanner.unscan if tok
    first
  end
+
+ ##
+ # The current line number (one-based).
+ #
+ # @return [Integer]
+ def lineno
+   scanner.lineno
+ end
  protected
 
  # @return [StringScanner]
@@ -253,9 +254,7 @@ module EBNF::LL1
  def skip_whitespace
    # skip all white space, but keep track of the current line number
    while @whitespace && !scanner.eos?
-     if matched = scanner.scan(@whitespace)
-       @lineno += matched.count("\n")
-     else
+     unless scanner.scan(@whitespace)
        return
      end
    end
@@ -281,7 +280,6 @@ module EBNF::LL1
    if matched = scanner.scan(term.regexp)
      #STDERR.puts " matched #{term.type.inspect}: #{matched.inspect}"
      tok = token(term.type, term.canonicalize(matched))
-     @lineno += matched.count("\n")
      return tok
    end
  end
@@ -372,7 +370,7 @@ module EBNF::LL1
  #   token.type  #=> :LANGTAG
  #   token.value #=> "en"
  #
- # @see http://en.wikipedia.org/wiki/Lexical_analysis#Token
+ # @see https://en.wikipedia.org/wiki/Lexical_analysis#Token
  class Token
  ##
  # The token's symbol type.
@@ -493,7 +491,7 @@ module EBNF::LL1
  #     "invalid token '%' on line 10",
  #     input: query, token: '%', lineno: 9)
  #
- # @see http://ruby-doc.org/core/classes/StandardError.html
+ # @see https://ruby-doc.org/core/classes/StandardError.html
  class Error < StandardError
  ##
  # The input string associated with the error.