ebnf 1.2.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,9 @@ module EBNF
3
3
  autoload :BNF, "ebnf/bnf"
4
4
  autoload :LL1, "ebnf/ll1"
5
5
  autoload :Parser, "ebnf/parser"
6
+ autoload :PEG, "ebnf/peg"
6
7
  autoload :Rule, "ebnf/rule"
8
+ autoload :Terminals,"ebnf/terminals"
7
9
  autoload :Writer, "ebnf/writer"
8
10
  autoload :VERSION, "ebnf/version"
9
11
 
@@ -18,6 +20,6 @@ module EBNF
18
20
  # @return [EBNF::Base]
19
21
  # @raise [Exception] on invalid input
20
22
  def self.parse(input, **options)
21
- query = ::EBNF::Base.new(input, **options)
23
+ ::EBNF::Base.new(input, **options)
22
24
  end
23
25
  end
@@ -2,7 +2,7 @@ require 'strscan'
2
2
 
3
3
  # Extended Bakus-Nour Form (EBNF), being the W3C variation is
4
4
  # originaly defined in the
5
- # [W3C XML 1.0 Spec](http://www.w3.org/TR/REC-xml/#sec-notation).
5
+ # [W3C XML 1.0 Spec](https://www.w3.org/TR/REC-xml/#sec-notation).
6
6
  #
7
7
  # This version attempts to be less strict than the strict definition
8
8
  # to allow for coloquial variations (such as in the Turtle syntax).
@@ -12,8 +12,8 @@ require 'strscan'
12
12
  #
13
13
  # Comments include the content between '/*' and '*/'
14
14
  #
15
- # @see http://www.w3.org/2000/10/swap/grammar/ebnf2turtle.py
16
- # @see http://www.w3.org/2000/10/swap/grammar/ebnf2bnf.n3
15
+ # @see https://www.w3.org/2000/10/swap/grammar/ebnf2turtle.py
16
+ # @see https://www.w3.org/2000/10/swap/grammar/ebnf2bnf.n3
17
17
  #
18
18
  # Based on bnf2turtle by Dan Connolly.
19
19
  #
@@ -36,7 +36,7 @@ require 'strscan'
36
36
  # derived mechanically from the specification.
37
37
  #
38
38
  #
39
- # [N3 design note]: http://www.w3.org/DesignIssues/Notation3
39
+ # [N3 design note]: https://www.w3.org/DesignIssues/Notation3
40
40
  #
41
41
  # Related Work
42
42
  # ------------
@@ -59,12 +59,12 @@ require 'strscan'
59
59
  # expression of the grammar in terms of the higher level EBNF
60
60
  # constructs.
61
61
  #
62
- # [goal]: http://www.w3.org/2002/02/mid/1086902566.21030.1479.camel@dirk;list=public-cwm-bugs
63
- # [n3p announcement]: http://lists.w3.org/Archives/Public/public-cwm-talk/2004OctDec/0029.html
64
- # [Yacker]: http://www.w3.org/1999/02/26-modules/User/Yacker
65
- # [SPARQL specification]: http://www.w3.org/TR/rdf-sparql-query/
66
- # [Cwm Release 1.1.0rc1]: http://lists.w3.org/Archives/Public/public-cwm-announce/2005JulSep/0000.html
67
- # [bnf-rules.n3]: http://www.w3.org/2000/10/swap/grammar/bnf-rules.n3
62
+ # [goal]: https://www.w3.org/2002/02/mid/1086902566.21030.1479.camel@dirk;list=public-cwm-bugs
63
+ # [n3p announcement]: https://lists.w3.org/Archives/Public/public-cwm-talk/2004OctDec/0029.html
64
+ # [Yacker]: https://www.w3.org/1999/02/26-modules/User/Yacker
65
+ # [SPARQL specification]: https://www.w3.org/TR/rdf-sparql-query/
66
+ # [Cwm Release 1.1.0rc1]: https://lists.w3.org/Archives/Public/public-cwm-announce/2005JulSep/0000.html
67
+ # [bnf-rules.n3]: https://www.w3.org/2000/10/swap/grammar/bnf-rules.n3
68
68
  #
69
69
  # Open Issues and Future Work
70
70
  # ---------------------------
@@ -82,8 +82,8 @@ require 'strscan'
82
82
  # It would be interesting to corroborate the claim in the SPARQL spec
83
83
  # that the grammar is LL(1) with a mechanical proof based on N3 rules.
84
84
  #
85
- # [swap/grammar/bnf]: http://www.w3.org/2000/10/swap/grammar/bnf
86
- # [bnf2html.n3]: http://www.w3.org/2000/10/swap/grammar/bnf2html.n3
85
+ # [swap/grammar/bnf]: https://www.w3.org/2000/10/swap/grammar/bnf
86
+ # [bnf2html.n3]: https://www.w3.org/2000/10/swap/grammar/bnf2html.n3
87
87
  #
88
88
  # Background
89
89
  # ----------
@@ -93,7 +93,7 @@ require 'strscan'
93
93
  # of N3 that maps directly to (and from) the standard XML syntax for
94
94
  # RDF.
95
95
  #
96
- # [N3 Primer]: http://www.w3.org/2000/10/swap/Primer.html
96
+ # [N3 Primer]: https://www.w3.org/2000/10/swap/Primer.html
97
97
  #
98
98
  # @author Gregg Kellogg
99
99
  module EBNF
@@ -101,6 +101,7 @@ module EBNF
101
101
  include BNF
102
102
  include LL1
103
103
  include Parser
104
+ include PEG
104
105
 
105
106
  # Abstract syntax tree from parse
106
107
  #
@@ -116,9 +117,9 @@ module EBNF
116
117
  # in S-Expressions (similar to SPARQL SSE)
117
118
  #
118
119
  # @param [#read, #to_s] input
119
- # @param [Hash{Symbol => Object}] options
120
- # @param [Symbol] :format (:ebnf)
120
+ # @param [Symbol] format (:ebnf)
121
121
  # Format of input, one of :ebnf, or :sxp
122
+ # @param [Hash{Symbol => Object}] options
122
123
  # @option options [Boolean, Array] :debug
123
124
  # Output debug information to an array or $stdout.
124
125
  def initialize(input, format: :ebnf, **options)
@@ -194,26 +195,26 @@ module EBNF
194
195
  # Output Ruby parser files
195
196
  #
196
197
  # @param [IO, StringIO] output
197
- # @param [String] :grammarFile
198
- # @param [String] :mod_name ('Branch')
199
- def to_ruby(output = $stdout, grammarFile: nil, mod_name: 'Branch')
198
+ # @param [String] grammarFile
199
+ # @param [String] mod_name ('Meta')
200
+ def to_ruby(output = $stdout, grammarFile: nil, mod_name: 'Meta', **options)
200
201
  unless output == $stdout
201
- output.puts "# This file is automatically generated by #{__FILE__}"
202
- output.puts "# BRANCH derived from #{grammarFile}" if grammarFile
202
+ output.puts "# This file is automatically generated by ebnf version #{EBNF::VERSION}"
203
+ output.puts "# Derived from #{grammarFile}" if grammarFile
203
204
  unless self.errors.empty?
204
- output.puts "# Note, tables completed with errors, may need to be resolved manually:"
205
+ output.puts "# Note, grammar has errors, may need to be resolved manually:"
205
206
  #output.puts "# #{pp.conflicts.map{|c| c.join("\n# ")}.join("\n# ")}"
206
207
  end
207
208
  output.puts "module #{mod_name}"
208
- output.puts " START = #{self.start.inspect}"
209
- output.puts
209
+ output.puts " START = #{self.start.inspect}\n" if self.start
210
+ end
211
+
212
+ # Either output LL(1) BRANCH tables or rules for PEG parsing
213
+ if ast.first.is_a?(EBNF::PEG::Rule)
214
+ to_ruby_peg(output)
215
+ else
216
+ to_ruby_ll1(output)
210
217
  end
211
- self.outputTable(output, 'BRANCH', self.branch, 1)
212
- self.outputTable(output, 'TERMINALS', self.terminals, 1)
213
- self.outputTable(output, 'FIRST', self.first, 1)
214
- self.outputTable(output, 'FOLLOW', self.follow, 1)
215
- self.outputTable(output, 'CLEANUP', self.cleanup, 1)
216
- self.outputTable(output, 'PASS', [self.pass], 1) if self.pass
217
218
  unless output == $stdout
218
219
  output.puts "end"
219
220
  end
@@ -17,32 +17,7 @@ module EBNF
17
17
  new_ast += new_rules
18
18
  end
19
19
 
20
- # Consolodate equivalent terminal rules
21
- to_rewrite = {}
22
- new_ast.select {|r| r.terminal?}.each do |src_rule|
23
- new_ast.select {|r| r.terminal?}.each do |dst_rule|
24
- if src_rule.equivalent?(dst_rule) && src_rule != dst_rule
25
- debug("make_bnf") {"equivalent rules: #{src_rule.inspect} and #{dst_rule.inspect}"}
26
- (to_rewrite[src_rule] ||= []) << dst_rule
27
- end
28
- end
29
- end
30
-
31
- # Replace references to equivalent rules with canonical rule
32
- to_rewrite.each do |src_rule, dst_rules|
33
- dst_rules.each do |dst_rule|
34
- new_ast.each do |mod_rule|
35
- debug("make_bnf") {"rewrite #{mod_rule.inspect} from #{dst_rule.sym} to #{src_rule.sym}"}
36
- mod_rule.rewrite(dst_rule, src_rule)
37
- end
38
- end
39
- end
40
-
41
- # AST now has just rewritten rules
42
- compacted_ast = new_ast - to_rewrite.values.flatten.compact
43
-
44
- # Sort AST by number
45
- @ast = compacted_ast
20
+ @ast = new_ast
46
21
  progress("make_bnf") {"End: #{@ast.length} rules"}
47
22
  self
48
23
  end
@@ -1,4 +1,90 @@
1
1
  module EBNF
2
+ ##
3
+ # This module extends {EBNF::Base} to create metadata including _branch_, [First/Follow][], and other tables which is used by {EBNF::LL1::Parser} to recognize examples of the associated grammar.
4
+ #
5
+ ### Branch Table
6
+ #
7
+ # The Branch table is a hash mapping production rules to a hash relating terminals appearing in input to sequence of productions to follow when the corresponding input terminal is found. This allows either the `seq` primitive, where all terminals map to the same sequence of productions, or the `alt` primitive, where each terminal may map to a different production.
8
+ #
9
+ # BRANCH = {
10
+ # :alt => {
11
+ # "(" => [:seq, :_alt_1],
12
+ # :ENUM => [:seq, :_alt_1],
13
+ # :HEX => [:seq, :_alt_1],
14
+ # :O_ENUM => [:seq, :_alt_1],
15
+ # :O_RANGE => [:seq, :_alt_1],
16
+ # :RANGE => [:seq, :_alt_1],
17
+ # :STRING1 => [:seq, :_alt_1],
18
+ # :STRING2 => [:seq, :_alt_1],
19
+ # :SYMBOL => [:seq, :_alt_1],
20
+ # },
21
+ # ...
22
+ # :declaration => {
23
+ # "@pass" => [:pass],
24
+ # "@terminals" => ["@terminals"],
25
+ # },
26
+ # ...
27
+ # }
28
+ #
29
+ # In this case the `alt` rule is `seq ('|' seq)*` can happen when any of the specified tokens appears on the input stream. The all cause the same token to be passed to the `seq` rule and follow with `_alt_1`, which handles the `('|' seq)*` portion of the rule, after the first sequence is matched.
30
+ #
31
+ # The `declaration` rule is `@terminals' | pass` using the `alt` primitive determining the production to run based on the terminal appearing on the input stream. Eventually, a terminal production is found and the token is consumed.
32
+ #
33
+ ### First/Follow Table
34
+ #
35
+ # The [First/Follow][] table is a hash mapping production rules to the terminals that may proceed or follow the rule. For example:
36
+ #
37
+ # FIRST = {
38
+ # :alt => [
39
+ # :HEX,
40
+ # :SYMBOL,
41
+ # :ENUM,
42
+ # :O_ENUM,
43
+ # :RANGE,
44
+ # :O_RANGE,
45
+ # :STRING1,
46
+ # :STRING2,
47
+ # "("],
48
+ # ...
49
+ # }
50
+ #
51
+ ### Terminals Table
52
+ #
53
+ # This table is a simple list of the terminal productions found in the grammar. For example:
54
+ #
55
+ # TERMINALS = ["(", ")", "-",
56
+ # "@pass", "@terminals",
57
+ # :ENUM, :HEX, :LHS, :O_ENUM, :O_RANGE,:POSTFIX,
58
+ # :RANGE, :STRING1, :STRING2, :SYMBOL,"|"
59
+ # ].freeze
60
+ #
61
+ ### Cleanup Table
62
+ #
63
+ # This table identifies productions which used EBNF rules, which are transformed to BNF for actual parsing. This allows the parser, in some cases, to reproduce *star*, *plus*, and *opt* rule matches. For example:
64
+ #
65
+ # CLEANUP = {
66
+ # :_alt_1 => :star,
67
+ # :_alt_3 => :merge,
68
+ # :_diff_1 => :opt,
69
+ # :ebnf => :star,
70
+ # :_ebnf_2 => :merge,
71
+ # :_postfix_1 => :opt,
72
+ # :seq => :plus,
73
+ # :_seq_1 => :star,
74
+ # :_seq_2 => :merge,
75
+ # }.freeze
76
+ #
77
+ # In this case the `ebnf` rule was `(declaration | rule)*`. As BNF does not support a star operator, this is decomposed into a set of rules using `alt` and `seq` primitives:
78
+ #
79
+ # ebnf ::= _empty _ebnf_2
80
+ # _ebnf_1 ::= declaration | rule
81
+ # _ebnf_2 ::= _ebnf_1 ebnf
82
+ # _ebnf_3 ::= ebnf
83
+ #
84
+ # The `_empty` production matches an empty string, so allows for now value. `_ebnf_2` matches `declaration | rule` (using the `alt` primitive) followed by `ebnf`, creating a sequence of zero or more `declaration` or `alt` members.
85
+ #
86
+ # [First/Follow]: https://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL.281.29_parsing_table
87
+
2
88
  module LL1
3
89
  autoload :Lexer, "ebnf/ll1/lexer"
4
90
  autoload :Parser, "ebnf/ll1/parser"
@@ -51,8 +137,40 @@ module EBNF
51
137
  ##
52
138
  # Create first/follow for each rule using techniques defined for LL(1) parsers.
53
139
  #
140
+ # This takes rules which have transformed into BNF and adds first/follow and otehr information to the rules to allow the generation of metadata tables used for driving a parser.
141
+ #
142
+ # Given an initial rule in EBNF:
143
+ #
144
+ # (rule enbf "1" (star declaration rule))
145
+ #
146
+ # The BNF transformation becomes:
147
+ #
148
+ # (rule ebnf "1" (alt _empty _ebnf_2))
149
+ # (rule _ebnf_1 "1.1" (alt declaration rule))
150
+ # (rule _ebnf_2 "1.2" (seq _ebnf_1 ebnf))
151
+ # (rule _ebnf_3 "1.3" (seq ebnf))
152
+ #
153
+ # After running this method, the rules are annotated with first/follow and cleanup rules:
154
+ #
155
+ # (rule ebnf "1"
156
+ # (start #t)
157
+ # (first "@pass" "@terminals" LHS _eps)
158
+ # (follow _eof)
159
+ # (cleanup star)
160
+ # (alt _empty _ebnf_2))
161
+ # (rule _ebnf_1 "1.1"
162
+ # (first "@pass" "@terminals" LHS)
163
+ # (follow "@pass" "@terminals" LHS _eof)
164
+ # (alt declaration rule))
165
+ # (rule _ebnf_2 "1.2"
166
+ # (first "@pass" "@terminals" LHS)
167
+ # (follow _eof)
168
+ # (cleanup merge)
169
+ # (seq _ebnf_1 ebnf))
170
+ # (rule _ebnf_3 "1.3" (first "@pass" "@terminals" LHS _eps) (follow _eof) (seq ebnf))
171
+ #
54
172
  # @return [EBNF] self
55
- # @see http://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL.281.29_parsing_table
173
+ # @see https://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL.281.29_parsing_table
56
174
  # @param [Array<Symbol>] starts
57
175
  # Set of symbols which are start rules
58
176
  def first_follow(*starts)
@@ -276,6 +394,19 @@ module EBNF
276
394
  end
277
395
  end
278
396
 
397
+ ##
398
+ # Output Ruby parser files for LL(1) parsing
399
+ #
400
+ # @param [IO, StringIO] output
401
+ def to_ruby_ll1(output, **options)
402
+ self.outputTable(output, 'BRANCH', self.branch, 1)
403
+ self.outputTable(output, 'TERMINALS', self.terminals, 1)
404
+ self.outputTable(output, 'FIRST', self.first, 1)
405
+ self.outputTable(output, 'FOLLOW', self.follow, 1)
406
+ self.outputTable(output, 'CLEANUP', self.cleanup, 1)
407
+ self.outputTable(output, 'PASS', [self.pass], 1) if self.pass
408
+ end
409
+
279
410
  private
280
411
  def do_production(lhs)
281
412
  rule = find_rule(lhs)
@@ -29,7 +29,7 @@ module EBNF::LL1
29
29
  # warn error.inspect
30
30
  # end
31
31
  #
32
- # @see http://en.wikipedia.org/wiki/Lexical_analysis
32
+ # @see https://en.wikipedia.org/wiki/Lexical_analysis
33
33
  class Lexer
34
34
  include Enumerable
35
35
 
@@ -43,10 +43,10 @@ module EBNF::LL1
43
43
  "\\'" => '\'', # \u0027 (apostrophe-quote, single quote mark)
44
44
  '\\\\' => '\\' # \u005C (backslash)
45
45
  }.freeze
46
- ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/.freeze # \uXXXX
47
- ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/.freeze # \UXXXXXXXX
48
- ECHAR = /\\./ # More liberal unescaping
49
- UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/.freeze
46
+ ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/u.freeze # \uXXXX
47
+ ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/u.freeze # \UXXXXXXXX
48
+ ECHAR = /\\./u.freeze # More liberal unescaping
49
+ UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/n.freeze
50
50
 
51
51
  ##
52
52
  # @return [Regexp] defines whitespace, including comments, otherwise whitespace must be explicit in terminals
@@ -59,7 +59,7 @@ module EBNF::LL1
59
59
  #
60
60
  # @param [String] string
61
61
  # @return [String]
62
- # @see http://www.w3.org/TR/rdf-sparql-query/#codepointEscape
62
+ # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
63
63
  def self.unescape_codepoints(string)
64
64
  string = string.dup
65
65
  string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding)
@@ -81,7 +81,7 @@ module EBNF::LL1
81
81
  #
82
82
  # @param [String] input
83
83
  # @return [String]
84
- # @see http://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
84
+ # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
85
85
  def self.unescape_string(input)
86
86
  input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
87
87
  end
@@ -131,7 +131,6 @@ module EBNF::LL1
131
131
 
132
132
  raise Error, "Terminal patterns not defined" unless @terminals && @terminals.length > 0
133
133
 
134
- @lineno = 1
135
134
  @scanner = Scanner.new(input, **options)
136
135
  end
137
136
 
@@ -147,12 +146,6 @@ module EBNF::LL1
147
146
  # @return [String]
148
147
  attr_accessor :input
149
148
 
150
- ##
151
- # The current line number (zero-based).
152
- #
153
- # @return [Integer]
154
- attr_reader :lineno
155
-
156
149
  ##
157
150
  # Returns `true` if the input string is lexically valid.
158
151
  #
@@ -194,7 +187,7 @@ module EBNF::LL1
194
187
 
195
188
  @first ||= begin
196
189
  {} while !scanner.eos? && skip_whitespace
197
- return @scanner = nil if scanner.eos?
190
+ return nil if scanner.eos?
198
191
 
199
192
  token = match_token(*types)
200
193
 
@@ -233,7 +226,7 @@ module EBNF::LL1
233
226
  # @return [Token]
234
227
  def recover(*types)
235
228
  until scanner.eos? || tok = match_token(*types)
236
- if scanner.skip_until(@whitespace || /\s/m).nil? # Skip past current "token"
229
+ if scanner.skip_until(@whitespace || /\s+/m).nil? # Skip past current "token"
237
230
  # No whitespace at the end, must be and end of string
238
231
  scanner.terminate
239
232
  else
@@ -243,6 +236,14 @@ module EBNF::LL1
243
236
  scanner.unscan if tok
244
237
  first
245
238
  end
239
+
240
+ ##
241
+ # The current line number (one-based).
242
+ #
243
+ # @return [Integer]
244
+ def lineno
245
+ scanner.lineno
246
+ end
246
247
  protected
247
248
 
248
249
  # @return [StringScanner]
@@ -253,9 +254,7 @@ module EBNF::LL1
253
254
  def skip_whitespace
254
255
  # skip all white space, but keep track of the current line number
255
256
  while @whitespace && !scanner.eos?
256
- if matched = scanner.scan(@whitespace)
257
- @lineno += matched.count("\n")
258
- else
257
+ unless scanner.scan(@whitespace)
259
258
  return
260
259
  end
261
260
  end
@@ -281,7 +280,6 @@ module EBNF::LL1
281
280
  if matched = scanner.scan(term.regexp)
282
281
  #STDERR.puts " matched #{term.type.inspect}: #{matched.inspect}"
283
282
  tok = token(term.type, term.canonicalize(matched))
284
- @lineno += matched.count("\n")
285
283
  return tok
286
284
  end
287
285
  end
@@ -372,7 +370,7 @@ module EBNF::LL1
372
370
  # token.type #=> :LANGTAG
373
371
  # token.value #=> "en"
374
372
  #
375
- # @see http://en.wikipedia.org/wiki/Lexical_analysis#Token
373
+ # @see https://en.wikipedia.org/wiki/Lexical_analysis#Token
376
374
  class Token
377
375
  ##
378
376
  # The token's symbol type.
@@ -493,7 +491,7 @@ module EBNF::LL1
493
491
  # "invalid token '%' on line 10",
494
492
  # input: query, token: '%', lineno: 9)
495
493
  #
496
- # @see http://ruby-doc.org/core/classes/StandardError.html
494
+ # @see https://ruby-doc.org/core/classes/StandardError.html
497
495
  class Error < StandardError
498
496
  ##
499
497
  # The input string associated with the error.