RubyGems - ebnf - Versions diffs - 1.2.0 → 2.0.0 - Mend

ebnf 1.2.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

checksums.yaml +4 -4
data/README.md +160 -185
data/UNLICENSE +1 -1
data/VERSION +1 -1
data/bin/ebnf +6 -3
data/etc/doap.ttl +13 -12
data/etc/ebnf.ebnf +13 -19
data/etc/ebnf.html +205 -239
data/etc/{ebnf.rb → ebnf.ll1.rb} +3 -4
data/etc/ebnf.ll1.sxp +179 -183
data/etc/ebnf.peg.rb +98 -0
data/etc/ebnf.peg.sxp +93 -0
data/etc/ebnf.sxp +37 -41
data/etc/sparql.html +1603 -1751
data/etc/sparql.ll1.sxp +7372 -7372
data/etc/sparql.peg.rb +532 -0
data/etc/sparql.peg.sxp +597 -0
data/etc/sparql.sxp +362 -362
data/etc/turtle.html +465 -517
data/etc/{turtle.rb → turtle.ll1.rb} +3 -4
data/etc/turtle.ll1.sxp +425 -425
data/etc/turtle.peg.rb +182 -0
data/etc/turtle.peg.sxp +199 -0
data/etc/turtle.sxp +101 -101
data/lib/ebnf.rb +3 -1
data/lib/ebnf/base.rb +30 -29
data/lib/ebnf/bnf.rb +1 -26
data/lib/ebnf/ll1.rb +132 -1
data/lib/ebnf/ll1/lexer.rb +20 -22
data/lib/ebnf/ll1/parser.rb +86 -61
data/lib/ebnf/ll1/scanner.rb +83 -50
data/lib/ebnf/peg.rb +39 -0
data/lib/ebnf/peg/parser.rb +535 -0
data/lib/ebnf/peg/rule.rb +222 -0
data/lib/ebnf/rule.rb +118 -55
data/lib/ebnf/terminals.rb +18 -0
data/lib/ebnf/writer.rb +3 -2
metadata +29 -6
data/etc/sparql.rb +0 -45773

data/lib/ebnf.rb CHANGED

@@ -3,7 +3,9 @@ module EBNF
   autoload :BNF,      "ebnf/bnf"
   autoload :LL1,      "ebnf/ll1"
   autoload :Parser,   "ebnf/parser"
+  autoload :PEG,      "ebnf/peg"
   autoload :Rule,     "ebnf/rule"
+  autoload :Terminals,"ebnf/terminals"
   autoload :Writer,   "ebnf/writer"
   autoload :VERSION,  "ebnf/version"
@@ -18,6 +20,6 @@ module EBNF
   # @return [EBNF::Base]
   # @raise  [Exception] on invalid input
   def self.parse(input, **options)
-    query = ::EBNF::Base.new(input, **options)
+    ::EBNF::Base.new(input, **options)
   end
 end

data/lib/ebnf/base.rb CHANGED

@@ -2,7 +2,7 @@ require 'strscan'
 # Extended Backus-Naur Form (EBNF), being the W3C variation is
 # originally defined in the
-# [W3C XML 1.0 Spec](http://www.w3.org/TR/REC-xml/#sec-notation).
+# [W3C XML 1.0 Spec](https://www.w3.org/TR/REC-xml/#sec-notation).
 #
 # This version attempts to be less strict than the strict definition
 # to allow for colloquial variations (such as in the Turtle syntax).
@@ -12,8 +12,8 @@ require 'strscan'
 #
 # Comments include the content between '/*' and '*/'
 #
-# @see http://www.w3.org/2000/10/swap/grammar/ebnf2turtle.py
-# @see http://www.w3.org/2000/10/swap/grammar/ebnf2bnf.n3
+# @see https://www.w3.org/2000/10/swap/grammar/ebnf2turtle.py
+# @see https://www.w3.org/2000/10/swap/grammar/ebnf2bnf.n3
 #
 # Based on bnf2turtle by Dan Connolly.
 #
@@ -36,7 +36,7 @@ require 'strscan'
 # derived mechanically from the specification.
 #
 #
-# [N3 design note]: http://www.w3.org/DesignIssues/Notation3
+# [N3 design note]: https://www.w3.org/DesignIssues/Notation3
 #
 # Related Work
 # ------------
@@ -59,12 +59,12 @@ require 'strscan'
 # expression of the grammar in terms of the higher level EBNF
 # constructs.
 #
-# [goal]: http://www.w3.org/2002/02/mid/1086902566.21030.1479.camel@dirk;list=public-cwm-bugs
-# [n3p announcement]: http://lists.w3.org/Archives/Public/public-cwm-talk/2004OctDec/0029.html
-# [Yacker]: http://www.w3.org/1999/02/26-modules/User/Yacker
-# [SPARQL specification]: http://www.w3.org/TR/rdf-sparql-query/
-# [Cwm Release 1.1.0rc1]: http://lists.w3.org/Archives/Public/public-cwm-announce/2005JulSep/0000.html
-# [bnf-rules.n3]: http://www.w3.org/2000/10/swap/grammar/bnf-rules.n3
+# [goal]: https://www.w3.org/2002/02/mid/1086902566.21030.1479.camel@dirk;list=public-cwm-bugs
+# [n3p announcement]: https://lists.w3.org/Archives/Public/public-cwm-talk/2004OctDec/0029.html
+# [Yacker]: https://www.w3.org/1999/02/26-modules/User/Yacker
+# [SPARQL specification]: https://www.w3.org/TR/rdf-sparql-query/
+# [Cwm Release 1.1.0rc1]: https://lists.w3.org/Archives/Public/public-cwm-announce/2005JulSep/0000.html
+# [bnf-rules.n3]: https://www.w3.org/2000/10/swap/grammar/bnf-rules.n3
 #
 # Open Issues and Future Work
 # ---------------------------
@@ -82,8 +82,8 @@ require 'strscan'
 # It would be interesting to corroborate the claim in the SPARQL spec
 # that the grammar is LL(1) with a mechanical proof based on N3 rules.
 #
-# [swap/grammar/bnf]: http://www.w3.org/2000/10/swap/grammar/bnf
-# [bnf2html.n3]: http://www.w3.org/2000/10/swap/grammar/bnf2html.n3
+# [swap/grammar/bnf]: https://www.w3.org/2000/10/swap/grammar/bnf
+# [bnf2html.n3]: https://www.w3.org/2000/10/swap/grammar/bnf2html.n3
 #
 # Background
 # ----------
@@ -93,7 +93,7 @@ require 'strscan'
 # of N3 that maps directly to (and from) the standard XML syntax for
 # RDF.
 #
-# [N3 Primer]: http://www.w3.org/2000/10/swap/Primer.html
+# [N3 Primer]: https://www.w3.org/2000/10/swap/Primer.html
 #
 # @author Gregg Kellogg
 module EBNF
@@ -101,6 +101,7 @@ module EBNF
     include BNF
     include LL1
     include Parser
+    include PEG
     # Abstract syntax tree from parse
     #
@@ -116,9 +117,9 @@ module EBNF
     # in S-Expressions (similar to SPARQL SSE)
     #
     # @param [#read, #to_s] input
-    # @param [Hash{Symbol => Object}] options
-    # @param [Symbol] :format (:ebnf)
+    # @param [Symbol] format (:ebnf)
     #   Format of input, one of :ebnf, or :sxp
+    # @param [Hash{Symbol => Object}] options
     # @option options [Boolean, Array] :debug
     #   Output debug information to an array or $stdout.
     def initialize(input, format: :ebnf, **options)
@@ -194,26 +195,26 @@ module EBNF
     # Output Ruby parser files
     #
     # @param [IO, StringIO] output
-    # @param [String] :grammarFile
-    # @param [String] :mod_name ('Branch')
-    def to_ruby(output = $stdout, grammarFile: nil, mod_name: 'Branch')
+    # @param [String] grammarFile
+    # @param [String] mod_name ('Meta')
+    def to_ruby(output = $stdout, grammarFile: nil, mod_name: 'Meta', **options)
       unless output == $stdout
-        output.puts "# This file is automatically generated by #{__FILE__}"
-        output.puts "# BRANCH derived from #{grammarFile}" if grammarFile
+        output.puts "# This file is automatically generated by ebnf version #{EBNF::VERSION}"
+        output.puts "# Derived from #{grammarFile}" if grammarFile
         unless self.errors.empty?
-          output.puts "# Note, tables completed with errors, may need to be resolved manually:"
+          output.puts "# Note, grammar has errors, may need to be resolved manually:"
           #output.puts "#   #{pp.conflicts.map{|c| c.join("\n#      ")}.join("\n#   ")}"
         end
         output.puts "module #{mod_name}"
-        output.puts "  START = #{self.start.inspect}"
-        output.puts
+        output.puts "  START = #{self.start.inspect}\n" if self.start
+      end
+      # Either output LL(1) BRANCH tables or rules for PEG parsing
+      if ast.first.is_a?(EBNF::PEG::Rule)
+        to_ruby_peg(output)
+      else
+        to_ruby_ll1(output)
       end
-      self.outputTable(output, 'BRANCH', self.branch, 1)
-      self.outputTable(output, 'TERMINALS', self.terminals, 1)
-      self.outputTable(output, 'FIRST', self.first, 1)
-      self.outputTable(output, 'FOLLOW', self.follow, 1)
-      self.outputTable(output, 'CLEANUP', self.cleanup, 1)
-      self.outputTable(output, 'PASS', [self.pass], 1) if self.pass
       unless output == $stdout
         output.puts "end"
       end

data/lib/ebnf/bnf.rb CHANGED

@@ -17,32 +17,7 @@ module EBNF
         new_ast += new_rules
       end
-      # Consolodate equivalent terminal rules
-      to_rewrite = {}
-      new_ast.select {|r| r.terminal?}.each do |src_rule|
-        new_ast.select {|r| r.terminal?}.each do |dst_rule|
-          if src_rule.equivalent?(dst_rule) && src_rule != dst_rule
-            debug("make_bnf") {"equivalent rules: #{src_rule.inspect} and #{dst_rule.inspect}"}
-            (to_rewrite[src_rule] ||= []) << dst_rule
-          end
-        end
-      end
-      # Replace references to equivalent rules with canonical rule
-      to_rewrite.each do |src_rule, dst_rules|
-        dst_rules.each do |dst_rule|
-          new_ast.each do |mod_rule|
-            debug("make_bnf") {"rewrite #{mod_rule.inspect} from #{dst_rule.sym} to #{src_rule.sym}"}
-            mod_rule.rewrite(dst_rule, src_rule)
-          end
-        end
-      end
-      # AST now has just rewritten rules
-      compacted_ast = new_ast - to_rewrite.values.flatten.compact
-      # Sort AST by number
-      @ast = compacted_ast
+      @ast = new_ast
       progress("make_bnf") {"End: #{@ast.length} rules"}
       self
     end

data/lib/ebnf/ll1.rb CHANGED

@@ -1,4 +1,90 @@
 module EBNF
+  ##
+  # This module extends {EBNF::Base} to create metadata including _branch_, [First/Follow][], and other tables which are used by {EBNF::LL1::Parser} to recognize examples of the associated grammar.
+  #
+  ### Branch Table
+  #
+  #  The Branch table is a hash mapping production rules to a hash relating terminals appearing in input to sequence of productions to follow when the corresponding input terminal is found. This allows either the `seq` primitive, where all terminals map to the same sequence of productions, or the `alt` primitive, where each terminal may map to a different production.
+  #
+  #      BRANCH = {
+  #        :alt => {
+  #          "(" => [:seq, :_alt_1],
+  #          :ENUM => [:seq, :_alt_1],
+  #          :HEX => [:seq, :_alt_1],
+  #          :O_ENUM => [:seq, :_alt_1],
+  #          :O_RANGE => [:seq, :_alt_1],
+  #          :RANGE => [:seq, :_alt_1],
+  #          :STRING1 => [:seq, :_alt_1],
+  #          :STRING2 => [:seq, :_alt_1],
+  #          :SYMBOL => [:seq, :_alt_1],
+  #        },
+  #        ...
+  #        :declaration => {
+  #          "@pass" => [:pass],
+  #          "@terminals" => ["@terminals"],
+  #        },
+  #        ...
+  #      }
+  #
+  #  In this case the `alt` rule, which is `seq ('|' seq)*`, can be matched when any of the specified tokens appears on the input stream. They all cause the same token to be passed to the `seq` rule, followed by `_alt_1`, which handles the `('|' seq)*` portion of the rule after the first sequence is matched.
+  #
+  #  The `declaration` rule is `'@terminals' | pass`, using the `alt` primitive to determine the production to run based on the terminal appearing on the input stream. Eventually, a terminal production is found and the token is consumed.
+  #
+  ### First/Follow Table
+  #
+  #  The [First/Follow][] table is a hash mapping production rules to the terminals that may begin or follow the rule. For example:
+  #
+  #      FIRST = {
+  #        :alt => [
+  #          :HEX,
+  #          :SYMBOL,
+  #          :ENUM,
+  #          :O_ENUM,
+  #          :RANGE,
+  #          :O_RANGE,
+  #          :STRING1,
+  #          :STRING2,
+  #          "("],
+  #        ...
+  #      }
+  #
+  ### Terminals Table
+  #
+  #  This table is a simple list of the terminal productions found in the grammar. For example:
+  #
+  #      TERMINALS = ["(", ")", "-",
+  #        "@pass", "@terminals",
+  #        :ENUM, :HEX, :LHS, :O_ENUM, :O_RANGE,:POSTFIX,
+  #        :RANGE, :STRING1, :STRING2, :SYMBOL,"|"
+  #      ].freeze
+  #
+  ### Cleanup Table
+  #
+  #  This table identifies productions which used EBNF rules, which are transformed to BNF for actual parsing. This allows the parser, in some cases, to reproduce *star*, *plus*, and *opt* rule matches. For example:
+  #
+  #      CLEANUP = {
+  #        :_alt_1 => :star,
+  #        :_alt_3 => :merge,
+  #        :_diff_1 => :opt,
+  #        :ebnf => :star,
+  #        :_ebnf_2 => :merge,
+  #        :_postfix_1 => :opt,
+  #        :seq => :plus,
+  #        :_seq_1 => :star,
+  #        :_seq_2 => :merge,
+  #      }.freeze
+  #
+  #  In this case the `ebnf` rule was `(declaration | rule)*`. As BNF does not support a star operator, this is decomposed into a set of rules using `alt` and `seq` primitives:
+  #
+  #      ebnf    ::= _empty | _ebnf_2
+  #      _ebnf_1 ::= declaration | rule
+  #      _ebnf_2 ::= _ebnf_1 ebnf
+  #      _ebnf_3 ::= ebnf
+  #
+  #  The `_empty` production matches an empty string, so allows for no value. `_ebnf_2` matches `declaration | rule` (using the `alt` primitive) followed by `ebnf`, creating a sequence of zero or more `declaration` or `rule` members.
+  #
+  # [First/Follow]: https://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL.281.29_parsing_table
   module LL1
     autoload :Lexer,    "ebnf/ll1/lexer"
     autoload :Parser,   "ebnf/ll1/parser"
@@ -51,8 +137,40 @@ module EBNF
     ##
     # Create first/follow for each rule using techniques defined for LL(1) parsers.
     #
+    # This takes rules which have been transformed into BNF and adds first/follow and other information to the rules to allow the generation of metadata tables used for driving a parser.
+    #
+    # Given an initial rule in EBNF:
+    #
+    #     (rule ebnf "1" (star (alt declaration rule)))
+    #
+    # The BNF transformation becomes:
+    #
+    #     (rule ebnf "1" (alt _empty _ebnf_2))
+    #     (rule _ebnf_1 "1.1" (alt declaration rule))
+    #     (rule _ebnf_2 "1.2" (seq _ebnf_1 ebnf))
+    #     (rule _ebnf_3 "1.3" (seq ebnf))
+    #
+    # After running this method, the rules are annotated with first/follow and cleanup rules:
+    #
+    #     (rule ebnf "1"
+    #      (start #t)
+    #      (first "@pass" "@terminals" LHS _eps)
+    #      (follow _eof)
+    #      (cleanup star)
+    #      (alt _empty _ebnf_2))
+    #     (rule _ebnf_1 "1.1"
+    #      (first "@pass" "@terminals" LHS)
+    #      (follow "@pass" "@terminals" LHS _eof)
+    #      (alt declaration rule))
+    #     (rule _ebnf_2 "1.2"
+    #      (first "@pass" "@terminals" LHS)
+    #      (follow _eof)
+    #      (cleanup merge)
+    #      (seq _ebnf_1 ebnf))
+    #     (rule _ebnf_3 "1.3" (first "@pass" "@terminals" LHS _eps) (follow _eof) (seq ebnf))
+    #
     # @return [EBNF] self
-    # @see http://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL.281.29_parsing_table
+    # @see https://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL.281.29_parsing_table
     # @param [Array<Symbol>] starts
     #   Set of symbols which are start rules
     def first_follow(*starts)
@@ -276,6 +394,19 @@ module EBNF
       end
     end
+    ##
+    # Output Ruby parser files for LL(1) parsing
+    #
+    # @param [IO, StringIO] output
+    def to_ruby_ll1(output, **options)
+      self.outputTable(output, 'BRANCH', self.branch, 1)
+      self.outputTable(output, 'TERMINALS', self.terminals, 1)
+      self.outputTable(output, 'FIRST', self.first, 1)
+      self.outputTable(output, 'FOLLOW', self.follow, 1)
+      self.outputTable(output, 'CLEANUP', self.cleanup, 1)
+      self.outputTable(output, 'PASS', [self.pass], 1) if self.pass
+    end
     private
     def do_production(lhs)
       rule = find_rule(lhs)

data/lib/ebnf/ll1/lexer.rb CHANGED

@@ -29,7 +29,7 @@ module EBNF::LL1
   #     warn error.inspect
   #   end
   #
-  # @see http://en.wikipedia.org/wiki/Lexical_analysis
+  # @see https://en.wikipedia.org/wiki/Lexical_analysis
   class Lexer
     include Enumerable
@@ -43,10 +43,10 @@ module EBNF::LL1
       "\\'"  => '\'',   # \u0027 (apostrophe-quote, single quote mark)
       '\\\\' => '\\'    # \u005C (backslash)
     }.freeze
-    ESCAPE_CHAR4        = /\\u(?:[0-9A-Fa-f]{4,4})/.freeze    # \uXXXX
-    ESCAPE_CHAR8        = /\\U(?:[0-9A-Fa-f]{8,8})/.freeze    # \UXXXXXXXX
-    ECHAR               = /\\./                               # More liberal unescaping
-    UCHAR               = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/.freeze
+    ESCAPE_CHAR4        = /\\u(?:[0-9A-Fa-f]{4,4})/u.freeze    # \uXXXX
+    ESCAPE_CHAR8        = /\\U(?:[0-9A-Fa-f]{8,8})/u.freeze    # \UXXXXXXXX
+    ECHAR               = /\\./u.freeze                        # More liberal unescaping
+    UCHAR               = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/n.freeze
     ##
     # @return [Regexp] defines whitespace, including comments, otherwise whitespace must be explicit in terminals
@@ -59,7 +59,7 @@ module EBNF::LL1
     #
     # @param  [String] string
     # @return [String]
-    # @see    http://www.w3.org/TR/rdf-sparql-query/#codepointEscape
+    # @see    https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
     def self.unescape_codepoints(string)
       string = string.dup
       string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding)
@@ -81,7 +81,7 @@ module EBNF::LL1
     #
     # @param  [String] input
     # @return [String]
-    # @see    http://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
+    # @see    https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
     def self.unescape_string(input)
       input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
     end
@@ -131,7 +131,6 @@ module EBNF::LL1
       raise Error, "Terminal patterns not defined" unless @terminals && @terminals.length > 0
-      @lineno = 1
       @scanner = Scanner.new(input, **options)
     end
@@ -147,12 +146,6 @@ module EBNF::LL1
     # @return [String]
     attr_accessor :input
-    ##
-    # The current line number (zero-based).
-    #
-    # @return [Integer]
-    attr_reader   :lineno
     ##
     # Returns `true` if the input string is lexically valid.
     #
@@ -194,7 +187,7 @@ module EBNF::LL1
       @first ||= begin
         {} while !scanner.eos? && skip_whitespace
-        return @scanner = nil if scanner.eos?
+        return nil if scanner.eos?
         token = match_token(*types)
@@ -233,7 +226,7 @@ module EBNF::LL1
     # @return [Token]
     def recover(*types)
        until scanner.eos? || tok = match_token(*types)
-        if scanner.skip_until(@whitespace || /\s/m).nil? # Skip past current "token"
+        if scanner.skip_until(@whitespace || /\s+/m).nil? # Skip past current "token"
           # No whitespace at the end, must be at end of string
           scanner.terminate
         else
@@ -243,6 +236,14 @@ module EBNF::LL1
       scanner.unscan if tok
       first
     end
+    ##
+    # The current line number (one-based).
+    #
+    # @return [Integer]
+    def lineno
+      scanner.lineno
+    end
   protected
     # @return [StringScanner]
@@ -253,9 +254,7 @@ module EBNF::LL1
     def skip_whitespace
       # skip all white space (line numbers are tracked by the scanner)
       while @whitespace && !scanner.eos?
-        if matched = scanner.scan(@whitespace)
-          @lineno += matched.count("\n")
-        else
+        unless scanner.scan(@whitespace)
           return
         end
       end
@@ -281,7 +280,6 @@ module EBNF::LL1
         if matched = scanner.scan(term.regexp)
           #STDERR.puts "  matched #{term.type.inspect}: #{matched.inspect}"
           tok = token(term.type, term.canonicalize(matched))
-          @lineno += matched.count("\n")
           return tok
         end
       end
@@ -372,7 +370,7 @@ module EBNF::LL1
     #   token.type   #=> :LANGTAG
     #   token.value  #=> "en"
     #
-    # @see http://en.wikipedia.org/wiki/Lexical_analysis#Token
+    # @see https://en.wikipedia.org/wiki/Lexical_analysis#Token
     class Token
       ##
       # The token's symbol type.
@@ -493,7 +491,7 @@ module EBNF::LL1
     #     "invalid token '%' on line 10",
     #     input: query, token: '%', lineno: 9)
     #
-    # @see http://ruby-doc.org/core/classes/StandardError.html
+    # @see https://ruby-doc.org/core/classes/StandardError.html
     class Error < StandardError
       ##
       # The input string associated with the error.