RubyGems - ebnf - Versions diffs - 1.1.3 → 2.1.2 - Mend

ebnf 1.1.3 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

checksums.yaml +4 -4
data/README.md +221 -198
data/UNLICENSE +1 -1
data/VERSION +1 -1
data/bin/ebnf +40 -21
data/etc/abnf-core.ebnf +52 -0
data/etc/abnf.abnf +121 -0
data/etc/abnf.ebnf +124 -0
data/etc/abnf.sxp +45 -0
data/etc/doap.ttl +23 -15
data/etc/ebnf.ebnf +21 -33
data/etc/ebnf.html +171 -160
data/etc/{ebnf.rb → ebnf.ll1.rb} +30 -107
data/etc/ebnf.ll1.sxp +182 -183
data/etc/ebnf.peg.rb +90 -0
data/etc/ebnf.peg.sxp +84 -0
data/etc/ebnf.sxp +40 -41
data/etc/iso-ebnf.ebnf +140 -0
data/etc/iso-ebnf.isoebnf +138 -0
data/etc/iso-ebnf.sxp +65 -0
data/etc/sparql.ebnf +4 -4
data/etc/sparql.html +1603 -1751
data/etc/sparql.ll1.sxp +7372 -7372
data/etc/sparql.peg.rb +532 -0
data/etc/sparql.peg.sxp +597 -0
data/etc/sparql.sxp +363 -362
data/etc/turtle.ebnf +3 -3
data/etc/turtle.html +465 -517
data/etc/{turtle.rb → turtle.ll1.rb} +3 -4
data/etc/turtle.ll1.sxp +425 -425
data/etc/turtle.peg.rb +182 -0
data/etc/turtle.peg.sxp +199 -0
data/etc/turtle.sxp +103 -101
data/lib/ebnf.rb +7 -2
data/lib/ebnf/abnf.rb +301 -0
data/lib/ebnf/abnf/core.rb +23 -0
data/lib/ebnf/abnf/meta.rb +111 -0
data/lib/ebnf/base.rb +113 -69
data/lib/ebnf/bnf.rb +1 -26
data/lib/ebnf/ebnf/meta.rb +90 -0
data/lib/ebnf/isoebnf.rb +229 -0
data/lib/ebnf/isoebnf/meta.rb +75 -0
data/lib/ebnf/ll1.rb +138 -6
data/lib/ebnf/ll1/lexer.rb +37 -32
data/lib/ebnf/ll1/parser.rb +113 -73
data/lib/ebnf/ll1/scanner.rb +83 -51
data/lib/ebnf/native.rb +320 -0
data/lib/ebnf/parser.rb +285 -302
data/lib/ebnf/peg.rb +39 -0
data/lib/ebnf/peg/parser.rb +561 -0
data/lib/ebnf/peg/rule.rb +250 -0
data/lib/ebnf/rule.rb +443 -148
data/lib/ebnf/terminals.rb +21 -0
data/lib/ebnf/writer.rb +565 -83
metadata +107 -29
data/etc/sparql.rb +0 -45773

data/lib/ebnf/peg/rule.rb ADDED

@@ -0,0 +1,250 @@
+module EBNF::PEG
+  # Behavior for parsing a PEG rule
+  module Rule
+    ##
+    # Initialized by parser when loading rules.
+    # Used for finding rules and invoking elements of the parse process.
+    #
+    # @return [EBNF::PEG::Parser] parser
+    attr_accessor :parser
+    ##
+    # Parse a rule or terminal, invoking callbacks, as appropriate.
+    # If there are `start_production` and/or `production`,
+    # they are invoked with a `prod_data` stack, the input stream and offset.
+    # Otherwise, the results are added as an array value
+    # to a hash indexed by the rule name.
+    #
+    # If matched, the input position is updated and the results returned in a Hash.
+    # If unmatched, the input position and line number are restored to their values on entry.
+    # Either way, the outcome is memoized in `parser.packrat` keyed by rule symbol and starting position.
+    #
+    # * `alt`: returns the value of the matched production or `:unmatched`.
+    # * `diff`: returns the value matched, or `:unmatched`.
+    # * `hex`: returns a string composed of the matched hex character, or `:unmatched`.
+    # * `istr`: returns the string matched, or `:unmatched`.
+    # * `not`: returns `nil` if the production does not match, or `:unmatched` if it does.
+    # * `opt`: returns the value matched, or `nil` if unmatched.
+    # * `plus`: returns an array of the values matched for the specified production, or `:unmatched`, if none are matched. For Terminals, these are concatenated into a single string.
+    # * `range`: returns a string composed of the values matched, or `:unmatched`, if less than `min` are matched.
+    # * `rept`: returns an array of the values matched for the specified production, or `:unmatched`, if none are matched. For Terminals, these are concatenated into a single string.
+    # * `seq`: returns an array composed of single-entry hashes for each matched production indexed by the production name, or `:unmatched` if any production fails to match. For Terminals, returns a string created by concatenating these values. Via option in a `production` or definition, the result can be a single hash with values for each matched production; note that this is not always possible due to the possibility of repeated productions within the sequence.
+    # * `star`: returns an array of the values matched for the specified production. For Terminals, these are concatenated into a single string.
+    #
+    # @param [Scanner] input
+    # @return [Hash{Symbol => Object}, :unmatched] A hash with keys for matched component of the expression. Returns :unmatched if the input does not match the production.
+    # @raise [RuntimeError] if a referenced rule cannot be found, if `diff` is used on a non-terminal, or if the operator is unknown.
+    def parse(input)
+      # Save position and linenumber for backtracking
+      pos, lineno = input.pos, input.lineno
+      parser.packrat[sym] ||= {}
+      # Reuse a result already recorded for this rule at this position
+      if parser.packrat[sym][pos]
+        parser.debug("#{sym}(:memo)", lineno: lineno) { "#{parser.packrat[sym][pos].inspect}(@#{pos})"}
+        input.pos, input.lineno = parser.packrat[sym][pos][:pos], parser.packrat[sym][pos][:lineno]
+        return parser.packrat[sym][pos][:result]
+      end
+      if terminal?
+        # If the terminal is defined with a regular expression,
+        # use that to match the input,
+        # otherwise, fall through and evaluate the terminal's expression below.
+        if regexp = parser.find_terminal_regexp(sym)
+          matched = input.scan(regexp)
+          result = parser.onTerminal(sym, (matched ? matched : :unmatched))
+          # Update furthest failure for strings and terminals
+          parser.update_furthest_failure(input.pos, input.lineno, sym) if result == :unmatched
+          parser.packrat[sym][pos] = {
+            pos: input.pos,
+            lineno: input.lineno,
+            result: result
+          }
+          return parser.packrat[sym][pos][:result]
+        end
+      else
+        eat_whitespace(input)
+      end
+      start_options = parser.onStart(sym)
+      result = case expr.first
+      when :alt
+        # Return the first expression to match.
+        # Result is either :unmatched, or the value of the matching rule
+        alt = :unmatched
+        expr[1..-1].each do |prod|
+          alt = case prod
+          when Symbol
+            rule = parser.find_rule(prod)
+            raise "No rule found for #{prod}" unless rule
+            rule.parse(input)
+          when String
+            input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
+          end
+          if alt == :unmatched
+            # Update furthest failure for strings and terminals
+            parser.update_furthest_failure(input.pos, input.lineno, prod) if prod.is_a?(String) || rule.terminal?
+          else
+            break
+          end
+        end
+        alt
+      when :diff
+        # matches any string that matches A but does not match B.
+        # (Note, this is only used for Terminal rules, non-terminals will use :not)
+        # Report this rule's own symbol; no `prod` local is in scope in this branch.
+        raise "Diff used on non-terminal #{sym}" unless terminal?
+        re1, re2 = Regexp.new(translate_codepoints(expr[1])), Regexp.new(translate_codepoints(expr[2]))
+        matched = input.scan(re1)
+        if !matched || re2.match?(matched)
+          # Update furthest failure for terminals
+          parser.update_furthest_failure(input.pos, input.lineno, sym)
+          :unmatched
+        else
+          matched
+        end
+      when :hex
+        # Matches the given hex character if expression matches the character whose number (code point) in ISO/IEC 10646 is N. The number of leading zeros in the #xN form is insignificant.
+        input.scan(to_regexp) || begin
+          # Update furthest failure for terminals
+          parser.update_furthest_failure(input.pos, input.lineno, expr.last)
+          :unmatched
+        end
+      when :not
+        # matches any string that does not match B.
+        res = case prod = expr[1]
+        when Symbol
+          rule = parser.find_rule(prod)
+          raise "No rule found for #{prod}" unless rule
+          rule.parse(input)
+        when String
+          input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
+        end
+        if res != :unmatched
+          # Update furthest failure for terminals
+          parser.update_furthest_failure(input.pos, input.lineno, sym) if terminal?
+          :unmatched
+        else
+          nil
+        end
+      when :opt
+        # Result is the matched value or nil
+        opt = rept(input, 0, 1, expr[1])
+        # Update furthest failure for strings and terminals
+        parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
+        opt.first
+      when :plus
+        # Result is an array of all expressions while they match,
+        # at least one must match
+        plus = rept(input, 1, '*', expr[1])
+        # Update furthest failure for strings and terminals
+        parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
+        plus.is_a?(Array) && terminal? ? plus.join("") : plus
+      when :range, :istr
+        # Matches the specified character range (or case-insensitive string)
+        input.scan(to_regexp) || begin
+          # Update furthest failure for strings and terminals
+          parser.update_furthest_failure(input.pos, input.lineno, expr[1])
+          :unmatched
+        end
+      when :rept
+        # Result is an array of all expressions while they match,
+        # an empty array if none match
+        rept = rept(input, expr[1], expr[2], expr[3])
+        # Update furthest failure for strings and terminals
+        parser.update_furthest_failure(input.pos, input.lineno, expr[3]) if terminal?
+        rept.is_a?(Array) && terminal? ? rept.join("") : rept
+      when :seq
+        # Evaluate each expression into an array of hashes where each hash contains a key from the associated production and the value is the parsed value of that production. Returns :unmatched if the input does not match the production. Value ordering is ensured by native Hash ordering.
+        seq = expr[1..-1].each_with_object([]) do |prod, accumulator|
+          eat_whitespace(input) unless accumulator.empty? || terminal?
+          res = case prod
+          when Symbol
+            rule = parser.find_rule(prod)
+            raise "No rule found for #{prod}" unless rule
+            rule.parse(input)
+          when String
+            input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
+          end
+          if res == :unmatched
+            # Update furthest failure for strings and terminals
+            parser.update_furthest_failure(input.pos, input.lineno, prod)
+            # `break` with a value makes the whole each_with_object call evaluate to :unmatched
+            break :unmatched
+          end
+          accumulator << {prod.to_sym => res}
+        end
+        if seq == :unmatched
+          :unmatched
+        elsif terminal?
+          seq.map(&:values).compact.join("") # Concat values for terminal production
+        elsif start_options[:as_hash]
+          seq.inject {|memo, h| memo.merge(h)}
+        else
+          seq
+        end
+      when :star
+        # Result is an array of all expressions while they match,
+        # an empty array if none match
+        star = rept(input, 0, '*', expr[1])
+        # Update furthest failure for strings and terminals
+        parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
+        star.is_a?(Array) && terminal? ? star.join("") : star
+      else
+        raise "attempt to parse unknown rule type: #{expr.first}"
+      end
+      # Backtrack: restore the position saved on entry when nothing matched
+      if result == :unmatched
+        input.pos, input.lineno = pos, lineno
+      end
+      result = parser.onFinish(result)
+      (parser.packrat[sym] ||= {})[pos] = {
+        pos: input.pos,
+        lineno: input.lineno,
+        result: result
+      }
+      return parser.packrat[sym][pos][:result]
+    end
+    ##
+    # Repetition, 0-1, 0-n, 1-n, ...
+    #
+    # Note, nil results are removed from the result, but count towards min/max calculations
+    #
+    # @param [Scanner] input
+    # @param [Integer] min
+    # @param [Integer, String] max
+    #   If it is an integer, it stops matching after max entries; `'*'` means no upper bound.
+    # @param [Symbol, String] prod
+    #   A Symbol is looked up as a rule via the parser; a String is matched literally.
+    # @return [:unmatched, Array]
+    #   `:unmatched` if fewer than `min` matches were found, otherwise the matches with `nil` entries removed.
+    # @raise [RuntimeError] if `prod` is a Symbol for which the parser has no rule.
+    def rept(input, min, max, prod)
+      result = []
+      case prod
+      when Symbol
+        rule = parser.find_rule(prod)
+        raise "No rule found for #{prod}" unless rule
+        while (max == '*' || result.length < max) && (res = rule.parse(input)) != :unmatched
+          eat_whitespace(input) unless terminal?
+          result << res
+        end
+      when String
+        # The literal does not change between iterations, so compile it once.
+        literal = Regexp.new(Regexp.quote(prod))
+        # Check the bound before scanning: a successful scan advances the input,
+        # so scanning first would consume one occurrence beyond `max` and drop it.
+        while (max == '*' || result.length < max) && (res = input.scan(literal))
+          eat_whitespace(input) unless terminal?
+          result << res
+        end
+      end
+      result.length < min ? :unmatched : result.compact
+    end
+    ##
+    # Skip over whitespace in the input, as configured on the parser.
+    #
+    # The parser's whitespace definition may be a Regexp, which is skipped
+    # directly, or a Rule, which is parsed with its result discarded.
+    # Any other value leaves the input untouched.
+    #
+    # @param [Scanner] input
+    def eat_whitespace(input)
+      ws = parser.whitespace
+      case ws
+      when Regexp
+        # Eat whitespace before a non-terminal
+        input.skip(ws)
+      when Rule
+        ws.parse(input) # throw away result
+      end
+    end
+  end
+end

data/lib/ebnf/rule.rb CHANGED

@@ -1,15 +1,33 @@
+require 'scanf'
+require 'strscan'
 module EBNF
   # Represent individual parsed rules
   class Rule
-    # Operations which are flattened to seprate rules in to_bnf
+    # Operations which are flattened to separate rules in to_bnf.
     BNF_OPS = %w{
-      alt opt plus seq star
+      alt diff not opt plus rept seq star
     }.map(&:to_sym).freeze
     TERM_OPS = %w{
-      diff hex range
+      hex istr range
     }.map(&:to_sym).freeze
+    # The number of arguments expected per operator. `nil` for unspecified
+    OP_ARGN =       {
+      alt: nil,
+      diff: 2,
+      hex: 1,
+      istr: 1,
+      not: 1,
+      opt: 1,
+      plus: 1,
+      range: 1,
+      rept: 3,
+      seq: nil,
+      star: 1
+    }
     # Symbol of rule
     #
     # @return [Symbol]
@@ -26,7 +44,7 @@ module EBNF
     # Kind of rule
     #
-    # @return [:rule, :terminal, or :pass]
+    # @return [:rule, :terminal, :terminals, or :pass]
     attr_accessor :kind
     # Rule expression
@@ -57,19 +75,38 @@ module EBNF
     # Determines preparation and cleanup rules for reconstituting EBNF ? * + from BNF
     attr_accessor :cleanup
-    # @param [Integer] id
-    # @param [Symbol] sym
+    # @param [Symbol, nil] sym
+    #   `nil` is allowed only for @pass or @terminals
+    # @param [String, nil] id
     # @param [Array] expr
-    # @param [Symbol] :kind
-    # @param [String] :ebnf
-    # @param [Array] :first
-    # @param [Array] :follow
-    # @param [Boolean] :start
-    # @param [Rule] :top_rule
-    # @param [Boolean] :cleanup
+    #   The expression is an internal-representation of an S-Expression with one of the following operators:
+    #
+    #   * `alt` – A list of alternative rules, which are attempted in order. It terminates with the first matching rule, or is terminated as unmatched, if no such rule is found.
+    #   * `diff` – matches any string that matches `A` but does not match `B`.
+    #   * `hex` – A single character represented using the hexadecimal notation `#xnn`.
+    #   * `istr` – A string which matches in a case-insensitive manner, so that `(istr "fOo")` will match either of the strings `"foo"`, `"FOO"` or any other combination.
+    #   * `opt` – An optional rule or terminal. It either results in the matching rule or returns `nil`.
+    #   * `plus` – A sequence of one or more of the matching rule. If there is no such rule, it is terminated as unmatched; otherwise, the result is an array containing all matched input.
+    #   * `range` – A range of characters, possibly repeated, of the form `(range "a-z")`. May also use hexadecimal notation.
+    #   * `rept m n` – A sequence of at least `m` and at most `n` of the matching rule. It will always return an array.
+    #   * `seq` – A sequence of rules or terminals. If any (other than `opt` or `star`) do not parse, the rule is terminated as unmatched.
+    #   * `star` – A sequence of zero or more of the matching rule. It will always return an array.
+    # @param [:rule, :terminal, :terminals, :pass] kind (nil)
+    # @param [String] ebnf (nil)
+    #   When parsing, records the EBNF string used to create the rule.
+    # @param [Array] first (nil)
+    #   Recorded set of terminals that can precede this rule (LL(1))
+    # @param [Array] follow (nil)
+    #   Recorded set of terminals that can follow this rule (LL(1))
+    # @param [Boolean] start (nil)
+    #   Is this the starting rule for the grammar?
+    # @param [Rule] top_rule (nil)
+    #   The top-most rule. All expressed rules are top-rules, derived rules have the original rule as their top-rule.
+    # @param [Boolean] cleanup (nil)
+    #   Records information useful for cleaning up converted :plus, and :star expansions (LL(1)).
     def initialize(sym, id, expr, kind: nil, ebnf: nil, first: nil, follow: nil, start: nil, top_rule: nil, cleanup: nil)
       @sym, @id = sym, id
-      @expr = expr.is_a?(Array) ? expr : [:seq, expr]
+      @expr = expr.is_a?(Array) ? expr : [:seq, expr].compact
       @ebnf, @kind, @first, @follow, @start, @cleanup, @top_rule = ebnf, kind, first, follow, start, cleanup, top_rule
       @top_rule ||= self
       @kind ||= case
@@ -77,21 +114,53 @@ module EBNF
       when !BNF_OPS.include?(@expr.first) then :terminal
       else :rule
       end
+      # Allow @pass and @terminals to not be named
+      @sym ||= :_pass if @kind == :pass
+      @sym ||= :_terminals if @kind == :terminals
+      raise ArgumentError, "Rule sym must be a symbol, was #{@sym.inspect}" unless @sym.is_a?(Symbol)
+      raise ArgumentError, "Rule id must be a string or nil, was #{@id.inspect}" unless (@id || "").is_a?(String)
+      raise ArgumentError, "Rule kind must be one of :rule, :terminal, :terminals, or :pass, was #{@kind.inspect}" unless
+        @kind.is_a?(Symbol) && %w(rule terminal terminals pass).map(&:to_sym).include?(@kind)
+      case @expr.first
+      when :alt
+        raise ArgumentError, "#{@expr.first} operation must have at least one operand, had #{@expr.length - 1}" unless @expr.length > 1
+      when :diff
+        raise ArgumentError, "#{@expr.first} operation must have exactly two operands, had #{@expr.length - 1}" unless @expr.length == 3
+      when :hex, :istr, :not, :opt, :plus, :range, :star
+        raise ArgumentError, "#{@expr.first} operation must have exactly one operand, had #{@expr.length - 1}" unless @expr.length == 2
+      when :rept
+        raise ArgumentError, "#{@expr.first} operation must have exactly three, had #{@expr.length - 1}" unless @expr.length == 4
+        raise ArgumentError, "#{@expr.first} operation must an non-negative integer minimum, was #{@expr[1]}" unless
+          @expr[1].is_a?(Integer) && @expr[1] >= 0
+        raise ArgumentError, "#{@expr.first} operation must an non-negative integer maximum or '*', was #{@expr[2]}" unless
+          @expr[2] == '*' || @expr[2].is_a?(Integer) && @expr[2] >= 0
+      when :seq
+        # It's legal to have a zero-length sequence
+      else
+        raise ArgumentError, "Rule expression must be an array using a known operator, was #{@expr.first}"
+      end
     end
     ##
     # Return a rule from its SXP representation:
     #
     # @example inputs
-    #    (pass (plus (range "#x20\\t\\r\\n")))
+    #    (pass _pass (plus (range "#x20\\t\\r\\n")))
     #    (rule ebnf "1" (star (alt declaration rule)))
-    #    (terminal O_ENUM "17" (seq "[^" (plus CHAR) "]"))
+    #    (terminal R_CHAR "19" (diff CHAR (alt "]" "-")))
     #
-    # Also may have (first ...), (follow ...), or (start #t)
+    # Also may have `(first ...)`, `(follow ...)`, or `(start #t)`.
     #
-    # @param [Array] sxp
+    # @param [String, Array] sxp
     # @return [Rule]
     def self.from_sxp(sxp)
+      if sxp.is_a?(String)
+        require 'sxp' unless defined?(SXP)
+        sxp = SXP.parse(sxp)
+      end
       expr = sxp.detect {|e| e.is_a?(Array) && ![:first, :follow, :start].include?(e.first.to_sym)}
       first = sxp.detect {|e| e.is_a?(Array) && e.first.to_sym == :first}
       first = first[1..-1] if first
@@ -102,26 +171,28 @@ module EBNF
       start = sxp.any? {|e| e.is_a?(Array) && e.first.to_sym == :start}
       sym = sxp[1] if sxp[1].is_a?(Symbol)
       id = sxp[2] if sxp[2].is_a?(String)
-      Rule.new(sym, id, expr, kind: sxp.first, first: first, follow: follow, cleanup: cleanup, start: start)
+      self.new(sym, id, expr, kind: sxp.first, first: first, follow: follow, cleanup: cleanup, start: start)
     end
     # Build a new rule creating a symbol and numbering from the current rule
-    # Symbol and number creation is handled by the top-most rule in such a chain
+    # Symbol and number creation is handled by the top-most rule in such a chain.
     #
     # @param [Array] expr
+    # @param [Symbol] kind (nil)
+    # @param [Hash{Symbol => Symbol}] cleanup (nil)
     # @param [Hash{Symbol => Object}] options
-    # @param [Symbol] :kind
     def build(expr, kind: nil, cleanup: nil, **options)
-      new_sym, new_id = (@top_rule ||self).send(:make_sym_id)
-      Rule.new(new_sym, new_id, expr,
-               kind: kind,
-               ebnf: @ebnf,
-               top_rule: (@top_rule || self),
-               cleanup: cleanup,
-               **options)
+      new_sym, new_id = @top_rule.send(:make_sym_id)
+      self.class.new(new_sym, new_id, expr,
+                     kind: kind,
+                     ebnf: @ebnf,
+                     top_rule: @top_rule,
+                     cleanup: cleanup,
+                     **options)
     end
-    # Return representation for building S-Expressions
+    # Return representation for building S-Expressions.
+    #
     # @return [Array]
     def for_sxp
       elements = [kind, sym]
@@ -137,40 +208,51 @@ module EBNF
     # Return SXP representation of this rule
     # @return [String]
     def to_sxp
+      require 'sxp' unless defined?(SXP)
       for_sxp.to_sxp
     end
     alias_method :to_s, :to_sxp
-    # Serializes this rule to an Turtle
+    # Serializes this rule to an Turtle.
+    #
     # @return [String]
     def to_ttl
       @ebnf.debug("to_ttl") {inspect} if @ebnf
-      comment = orig.to_s.strip.
-        gsub(/"""/, '\"\"\"').
-        gsub("\\", "\\\\").
-        sub(/^\"/, '\"').
-        sub(/\"$/m, '\"')
-      statements = [
-        %{:#{id} rdfs:label "#{id}"; rdf:value "#{sym}";},
-        %{  rdfs:comment #{comment.inspect};},
-      ]
+      statements = [%{:#{sym} rdfs:label "#{sym}";}]
+      if orig
+        comment = orig.to_s.strip.
+          gsub(/"""/, '\"\"\"').
+          gsub("\\", "\\\\").
+          sub(/^\"/, '\"').
+          sub(/\"$/m, '\"')
+        statements << %{  rdfs:comment #{comment.inspect};}
+      end
+      statements << %{  dc:identifier "#{id}";} if id
       statements += ttl_expr(expr, terminal? ? "re" : "g", 1, false)
       "\n" + statements.join("\n")
     end
+    # Return a Ruby representation of this rule: the source text of an
+    # `EBNF::Rule.new` call built from the inspected `sym`, `id` and `expr`.
+    # The `kind:` keyword is included only when the kind is not `:rule`.
+    #
+    # @return [String]
+    def to_ruby
+      arguments = [sym, id, expr].map(&:inspect)
+      arguments << "kind: #{kind.inspect}" unless kind == :rule
+      "EBNF::Rule.new(#{arguments.join(', ')})"
+    end
     ##
     # Transform EBNF rule to BNF rules:
     #
-    #   * Transform (a [n] rule (op1 (op2))) into two rules:
-    #     (a [n] rule (op1 _a_1))
-    #     (_a_1 [n.1] rule (op2))
-    #   * Transform (a rule (opt b)) into (a rule (alt _empty b))
-    #   * Transform (a rule (star b)) into (a rule (alt _empty (seq b a)))
-    #   * Transform (a rule (plus b)) into (a rule (seq b (star b)
+    #   * Transform `(rule a "n" (op1 (op2)))` into two rules:
+    #
+    #         (rule a "n" (op1 _a_1))
+    #         (rule _a_1 "n.1" (op2))
+    #   * Transform `(rule a (opt b))` into `(rule a (alt _empty b))`
+    #   * Transform `(rule a (star b))` into `(rule a (alt _empty (seq b a)))`
+    #   * Transform `(rule a (plus b))` into `(rule a (seq b (star b)`
+    #
+    # Transformation includes information used to re-construct non-transformed.
     #
-    # Transformation includes information used to re-construct non-transformed
     # AST representation
     # @return [Array<Rule>]
     def to_bnf
@@ -197,19 +279,19 @@ module EBNF
         new_rules = new_rules.map {|r| r.to_bnf}.flatten
       elsif expr.first == :opt
         this = dup
-        #   * Transform (a rule (opt b)) into (a rule (alt _empty b))
+        #   * Transform (rule a (opt b)) into (rule a (alt _empty b))
         this.expr = [:alt, :_empty, expr.last]
         this.cleanup = :opt
         new_rules = this.to_bnf
       elsif expr.first == :star
-        #   * Transform (a rule (star b)) into (a rule (alt _empty (seq b a)))
+        #   * Transform (rule a (star b)) into (rule a (alt _empty (seq b a)))
         this = dup
         this.cleanup = :star
         new_rule = this.build([:seq, expr.last, this.sym], cleanup: :merge)
         this.expr = [:alt, :_empty, new_rule.sym]
         new_rules = [this] + new_rule.to_bnf
       elsif expr.first == :plus
-        #   * Transform (a rule (plus b)) into (a rule (seq b (star b)
+        #   * Transform (rule a (plus b)) into (rule a (seq b (star b)
         this = dup
         this.cleanup = :plus
         this.expr = [:seq, expr.last, [:star, expr.last]]
@@ -218,7 +300,7 @@ module EBNF
         # Otherwise, no further transformation necessary
         new_rules << self
       elsif [:diff, :hex, :range].include?(expr.first)
-        # This rules are fine, the just need to be terminals
+        # This rules are fine, they just need to be terminals
         raise "Encountered #{expr.first.inspect}, which is a #{self.kind}, not :terminal" unless self.terminal?
         new_rules << self
       else
@@ -229,89 +311,73 @@ module EBNF
       return new_rules
     end
-    # Return the non-terminals for this rule. For seq, this is the first
-    # non-terminals in the seq. For alt, this is every non-terminal ni the alt
-    # @param [Array<Rule>] ast
-    #   The set of rules, used to turn symbols into rules
+    ##
+    # Transform EBNF rule for PEG:
+    #
+    #   * Transform `(rule a "n" (op1 ... (op2 y) ...z))` into two rules:
+    #
+    #         (rule a "n" (op1 ... _a_1 ... z))
+    #         (rule _a_1 "n.1" (op2 y))
+    #   * Transform `(rule a "n" (diff op1 op2))` into two rules:
+    #
+    #         (rule a "n" (seq _a_1 op1))
+    #         (rule _a_1 "n.1" (not op2))
+    #
     # @return [Array<Rule>]
-    def non_terminals(ast)
-      @non_terms ||= (alt? ? expr[1..-1] : expr[1,1]).map do |sym|
-        case sym
-        when Symbol
-          r = ast.detect {|r| r.sym == sym}
-          r if r && r.rule?
-        else
-          nil
-        end
-      end.compact
-    end
+    def to_peg
+      new_rules = []
-    # Return the terminals for this rule. For seq, this is the first
-    # terminals or strings in the seq. For alt, this is every non-terminal ni the alt
-    # @param [Array<Rule>] ast
-    #   The set of rules, used to turn symbols into rules
-    # @return [Array<Rule>]
-    def terminals(ast)
-      @terms ||= (alt? ? expr[1..-1] : expr[1,1]).map do |sym|
-        case sym
-        when Symbol
-          r = ast.detect {|r| r.sym == sym}
-          r if r && r.terminal?
-        when String
-          sym
-        else
-          nil
+      # Look for rules containing sub-sequences
+      if expr.any? {|e| e.is_a?(Array) && e.first.is_a?(Symbol)}
+        # duplicate ourselves for rewriting
+        this = dup
+        new_rules << this
+        expr.each_with_index do |e, index|
+          next unless e.is_a?(Array) && e.first.is_a?(Symbol)
+          new_rule = build(e)
+          this.expr[index] = new_rule.sym
+          new_rules << new_rule
         end
-      end.compact
-    end
-    # Does this rule start with a sym? It does if expr is that sym,
-    # expr starts with alt and contains that sym, or
-    # expr starts with seq and the next element is that sym
-    # @param [Symbol, class] sym
-    #   Symbol matching any start element, or if it is String, any start element which is a String
-    # @return [Array<Symbol, String>] list of symbol (singular), or strings which are start symbol, or nil if there are none
-    def starts_with?(sym)
-      if seq? && sym === (v = expr.fetch(1, nil))
-        [v]
-      elsif alt? && expr.any? {|e| sym === e}
-        expr.select {|e| sym === e}
+        # Return new rules after recursively applying #to_peg
+        new_rules = new_rules.map {|r| r.to_peg}.flatten
+      elsif expr.first == :diff && !terminal?
+        this = dup
+        new_rule = build([:not, expr[2]])
+        this.expr = [:seq, new_rule.sym, expr[1]]
+        new_rules << this
+        new_rules << new_rule
+      elsif [:hex, :istr, :range].include?(expr.first)
+        # These rules are fine, they just need to be terminals
+        raise "Encountered #{expr.first.inspect}, which is a #{self.kind}, not :terminal" unless self.terminal?
+        new_rules << self
       else
-        nil
+        new_rules << self
       end
+      return new_rules.map {|r| r.extend(EBNF::PEG::Rule)}
     end
-    # Do the firsts of this rule include the empty string?
-    # @return [Boolean]
-    def first_includes_eps?
-      @first && @first.include?(:_eps)
-    end
-    # Add terminal as proceding this rule
-    # @param [Array<Rule, Symbol, String>] terminals
-    # @return [Integer] if number of terminals added
-    def add_first(terminals)
-      @first ||= []
-      terminals = terminals.map {|t| t.is_a?(Rule) ? t.sym : t} - @first
-      @first += terminals
-      terminals.length
-    end
-    # Add terminal as following this rule. Don't add _eps as a follow
+    ##
+    # For :hex, :istr, or :range, create a regular expression.
     #
-    # @param [Array<Rule, Symbol, String>] terminals
-    # @return [Integer] if number of terminals added
-    def add_follow(terminals)
-      # Remove terminals already in follows, and empty string
-      terminals = terminals.map {|t| t.is_a?(Rule) ? t.sym : t} - (@follow || []) - [:_eps]
-      unless terminals.empty?
-        @follow ||= []
-        @follow += terminals
+    # @return [Regexp]
+    def to_regexp
+      case expr.first
+      when :hex
+        Regexp.new(translate_codepoints(expr[1]))
+      when :istr
+        /#{expr.last}/ui
+      when :range
+        Regexp.new("[#{translate_codepoints(expr[1])}]")
+      else
+        raise "Can't turn #{expr.inspect} into a regexp"
       end
-      terminals.length
     end
     # Is this a terminal?
+    #
     # @return [Boolean]
     def terminal?
       kind == :terminal
@@ -339,18 +405,14 @@ module EBNF
       expr.is_a?(Array) && expr.first == :seq
     end
-    # Is this rule of the form (alt ...)?
-    def alt?
-      expr.is_a?(Array) && expr.first == :alt
-    end
     def inspect
       "#<EBNF::Rule:#{object_id} " +
       {sym: sym, id: id, kind: kind, expr: expr}.inspect +
       ">"
     end
-    # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}
+    # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}.
+    #
     # @param [Rule] other
     # @return [Boolean]
     def ==(other)
@@ -359,37 +421,259 @@ module EBNF
       expr  == other.expr
     end
-    # Two rules are equivalent if they have the same {#expr}
+    # Two rules are equivalent if they have the same {#expr}.
+    #
     # @param [Rule] other
     # @return [Boolean]
-    def equivalent?(other)
-      expr  == other.expr
+    def eql?(other)
+      expr == other.expr  # only expr is compared; sym, id and kind are not consulted. NOTE(review): no matching hash override is visible in this chunk
     end
-    # Rewrite the rule substituting src_rule for dst_rule wherever
-    # it is used in the production (first level only).
-    # @param [Rule] src_rule
-    # @param [Rule] dst_rule
-    # @return [Rule]
-    def rewrite(src_rule, dst_rule)
-      case @expr
-      when Array
-        @expr = @expr.map {|e| e == src_rule.sym ? dst_rule.sym : e}
+    # Rules compare using their ids, falling back to their symbols when either rule has no id.
+    def <=>(other)
+      if id && other.id  # both rules carry an id
+        if id == other.id
+          id.to_s <=> other.id.to_s  # identical ids: compare their string forms
+        else
+          id.to_f <=> other.id.to_f  # differing ids are ordered by to_f. NOTE(review): if ids are strings, distinct values such as 1.1 and 1.10 share a to_f value and compare as 0
+        end
       else
-        @expr = dst_rule.sym if @expr == src_rule.sym
+        sym.to_s <=> other.sym.to_s  # at least one id is missing: order by symbol name instead
       end
-      self
     end
-    # Rules compare using their ids
-    def <=>(other)
-      if id.to_i == other.id.to_i
-        id.to_s <=> other.id.to_s
+    ##
+    # Utility function to translate code points of the form '#xN' into ruby unicode characters
+    def translate_codepoints(str)  # returns a new string; gsub leaves the argument untouched
+      str.gsub(/#x\h+/) {|c| c[2..-1].scanf("%x").first.chr(Encoding::UTF_8)}  # drop the #x prefix, parse the hex digits, emit that code point as UTF-8. NOTE(review): String#scanf comes from the scanf library; its require is not visible in this chunk
+    end
+    # Return the non-terminals for this rule.
+    #
+    # * `alt` => this is every non-terminal.
+    # * `diff` => this is every non-terminal.
+    # * `hex` => nil
+    # * `istr` => nil
+    # * `not` => this is the last expression, if any.
+    # * `opt` => this is the last expression, if any.
+    # * `plus` => this is the last expression, if any.
+    # * `range` => nil
+    # * `rept` => this is the last expression, if any.
+    # * `seq` => this is the first expression in the sequence, if any.
+    # * `star` => this is the last expression, if any.
+    #
+    # @param [Array<Rule>] ast
+    #   The set of rules, used to turn symbols into rules
+    # @param [Array<Symbol,String,Array>] expr (@expr)
+    #   The expression to check, defaults to the rule expression.
+    #   Typically, if the expression is recursive, the embedded expression is called recursively.
+    # @return [Array<Rule>]
+    # @note this is used for LL(1) transformation, so rule types are limited
+    def non_terminals(ast, expr = @expr)
+      ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).map do |sym|  # alt/diff: every operand; any other operator: only expr[1]. NOTE(review): for rept, validate! requires expr[1] to be the integer minimum, so its operand is not reached here
+        case sym
+        when Symbol
+          r = ast.detect {|r| r.sym == sym}  # look the symbol up among the supplied rules
+          r if r && r.rule?  # kept only when a rule was found and rule? holds for it
+        when Array
+          non_terminals(ast, sym)  # nested expression: recurse
+        else
+          nil  # strings and any other operand contribute nothing
+        end
+      end.flatten.compact.uniq  # merge nested results, then drop nils and duplicates
+    end
+    # Return the terminals for this rule.
+    #
+    # * `alt` => this is every terminal.
+    # * `diff` => this is every terminal.
+    # * `hex` => nil
+    # * `istr` => nil
+    # * `not` => this is the last expression, if any.
+    # * `opt` => this is the last expression, if any.
+    # * `plus` => this is the last expression, if any.
+    # * `range` => nil
+    # * `rept` => this is the last expression, if any.
+    # * `seq` => this is the first expression in the sequence, if any.
+    # * `star` => this is the last expression, if any.
+    #
+    # @param [Array<Rule>] ast
+    #   The set of rules, used to turn symbols into rules
+    # @param [Array<Symbol,String,Array>] expr (@expr)
+    #   The expression to check, defaults to the rule expression.
+    #   Typically, if the expression is recursive, the embedded expression is called recursively.
+    # @return [Array<Rule, String>]
+    # @note this is used for LL(1) transformation, so rule types are limited
+    def terminals(ast, expr = @expr)
+      ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).map do |sym|  # alt/diff: every operand; any other operator: only expr[1]
+        case sym
+        when Symbol
+          r = ast.detect {|r| r.sym == sym}  # look the symbol up among the supplied rules
+          r if r && r.terminal?  # kept only when a rule was found and it is a terminal
+        when String
+          sym  # literal strings are returned as they are
+        when Array
+          terminals(ast, sym)  # nested expression: recurse
+        end
+      end.flatten.compact.uniq  # merge nested results, then drop nils and duplicates
+    end
+    # Return the symbols used in the rule.
+    #
+    # @param [Array<Symbol,String,Array>] expr (@expr)
+    #   The expression to check, defaults to the rule expression.
+    #   Typically, if the expression is recursive, the embedded expression is called recursively.
+    # @return [Array<Symbol>]
+    def symbols(expr = @expr)
+      expr[1..-1].map do |sym|  # every operand after the operator, whatever the operator is
+        case sym
+        when Symbol
+          sym
+        when Array
+          symbols(sym)  # nested expression: recurse
+        end
+      end.flatten.compact.uniq  # operands that are neither Symbol nor Array yield nil and are removed by compact
+    end
+    ##
+    # The following are used for LL(1) transformation.
+    ##
+    # Does this rule start with `sym`? It does if expr is that sym,
+    # expr starts with alt and contains that sym,
+    # or expr starts with seq and the next element is that sym.
+    #
+    # @param [Symbol, class] sym
+    #   Symbol matching any start element, or if it is String, any start element which is a String
+    # @return [Array<Symbol, String>] list of symbol (singular), or strings which are start symbol, or nil if there are none
+    def starts_with?(sym)
+      if seq? && sym === (v = expr.fetch(1, nil))  # seq: only the first operand is tested; === lets sym be a Symbol or a class such as String
+        [v]
+      elsif alt? && expr.any? {|e| sym === e}  # alt: any matching element qualifies. NOTE(review): the scan covers all of expr, including the :alt operator element itself
+        expr.select {|e| sym === e}
+      else
+        nil  # neither form matched
+      end
+    end
+    ##
+    # Validate the rule, with respect to an AST.
+    #
+    # @param [Array<Rule>] ast
+    #   The set of rules, used to turn symbols into rules
+    # @param [Array<Symbol,String,Array>] expr (@expr)
+    #   The expression to check, defaults to the rule expression.
+    #   Typically, if the expression is recursive, the embedded expression is called recursively.
+    # @raise [SyntaxError]
+    def validate!(ast, expr = @expr)
+      op = expr.first  # the operator is the first element of the expression
+      raise SyntaxError, "Unknown operator: #{op}" unless OP_ARGN.key?(op)  # OP_ARGN is defined outside this chunk
+      raise SyntaxError, "Argument count missmatch on operator #{op}, had #{expr.length - 1} expected #{OP_ARGN[op]}" if
+        OP_ARGN[op] && OP_ARGN[op] != expr.length - 1  # a nil entry skips the operand-count check
+      # alt needs at least one operand; rept needs a valid minimum and maximum
+      if op == :alt
+        raise SyntaxError, "alt operation must have at least one operand, had #{expr.length - 1}" unless expr.length > 1
+      elsif op == :rept
+        raise SyntaxError, "rept operation must an non-negative integer minimum, was #{expr[1]}" unless
+          expr[1].is_a?(Integer) && expr[1] >= 0
+        raise SyntaxError, "rept operation must an non-negative integer maximum or '*', was #{expr[2]}" unless
+          expr[2] == '*' || expr[2].is_a?(Integer) && expr[2] >= 0
+      end
+      case op
+      when :hex
+        raise SyntaxError, "Hex operand must be of form '#xN+': #{sym}" unless expr.last.match?(/^#x\h+$/)
+      when :range
+        str = expr.last.dup
+        str = str[1..-1] if str.start_with?('^')  # a leading caret is skipped before scanning
+        str = str[0..-2] if str.end_with?('-')  # Allowed at end of range
+        scanner = StringScanner.new(str)
+        hex = rchar = in_range = false  # kind of the previous component, and whether a hyphen is pending
+        while !scanner.eos?
+          begin
+            if scanner.scan(Terminals::HEX)
+              raise SyntaxError if in_range && rchar  # a range begun with a plain character cannot end in a hex escape
+              rchar = in_range = false
+              hex = true
+            elsif scanner.scan(Terminals::R_CHAR)
+              raise SyntaxError if in_range && hex  # a range begun with a hex escape cannot end in a plain character
+              hex = in_range = false
+              rchar = true
+            else
+              raise(SyntaxError, "Range contains illegal components at offset #{scanner.pos}: was #{expr.last}")
+            end
+            if scanner.scan(/\-/)
+              raise SyntaxError if in_range  # NOTE(review): in_range was reset to false just above, so this guard looks unreachable
+              in_range = true
+            end
+          rescue SyntaxError  # message-less errors raised above are re-raised with the offset and the offending range
+            raise(SyntaxError, "Range contains illegal components at offset #{scanner.pos}: was #{expr.last}")
+          end
+        end
       else
-        id.to_i <=> other.id.to_i
+        ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).each do |sym|  # alt/diff: every operand; other operators: only expr[1]
+          case sym
+          when Symbol
+            r = ast.detect {|r| r.sym == sym}
+            raise SyntaxError, "No rule found for #{sym}" unless r  # every referenced symbol must name a rule in ast
+          when Array
+            validate!(ast, sym)  # nested expression: validate recursively
+          when String
+            raise SyntaxError, "String must be of the form CHAR*" unless sym.match?(/^#{Terminals::CHAR}*$/)
+          end
+        end
       end
     end
+    ##
+    # Validate the rule, with respect to an AST.
+    #
+    # Uses `#validate!` and catches `SyntaxError`
+    #
+    # @param [Array<Rule>] ast
+    #   The set of rules, used to turn symbols into rules
+    # @return [Boolean]
+    def valid?(ast)
+      validate!(ast)  # raises SyntaxError on the first problem it finds
+      true
+    rescue SyntaxError  # only SyntaxError is turned into false; any other exception propagates
+      false
+    end
+    # Do the firsts of this rule include the empty string?
+    #
+    # @return [Boolean]
+    def first_includes_eps?
+      @first && @first.include?(:_eps)  # evaluates to nil rather than false while @first is unset
+    end
+    # Add terminal as preceding this rule.
+    #
+    # @param [Array<Rule, Symbol, String>] terminals
+    # @return [Integer] the number of terminals added
+    def add_first(terminals)
+      @first ||= []  # created on first use, even if nothing ends up being added
+      terminals = terminals.map {|t| t.is_a?(Rule) ? t.sym : t} - @first  # rules are reduced to their symbols; entries already recorded are dropped
+      @first += terminals  # rebinds @first to a new array holding the additions
+      terminals.length  # count of entries that were new
+    end
+    # Add terminal as following this rule. Don't add _eps as a follow
+    #
+    # @param [Array<Rule, Symbol, String>] terminals
+    # @return [Integer] the number of terminals added
+    def add_follow(terminals)
+      # Remove terminals already in follows, and empty string
+      terminals = terminals.map {|t| t.is_a?(Rule) ? t.sym : t} - (@follow || []) - [:_eps]  # rules are reduced to their symbols first
+      unless terminals.empty?  # @follow stays unset until there is something to add
+        @follow ||= []
+        @follow += terminals  # rebinds @follow to a new array holding the additions
+      end
+      terminals.length  # count of entries that were new, 0 when nothing was added
+    end
     private
     def ttl_expr(expr, pfx, depth, is_obj = true)
       indent = '  ' * depth
@@ -405,17 +689,28 @@ module EBNF
       case op
       when :seq, :alt, :diff
+        # Multiple operands
         statements << %{#{indent}#{bra}#{pfx}:#{op} (}
         expr.each {|a| statements += ttl_expr(a, pfx, depth + 1)}
         statements << %{#{indent} )#{ket}}
-      when :opt, :plus, :star
+      when :opt, :plus, :star, :not
+        # Single operand
         statements << %{#{indent}#{bra}#{pfx}:#{op} }
         statements += ttl_expr(expr.first, pfx, depth + 1)
         statements << %{#{indent} #{ket}} unless ket.empty?
-      when :_empty, :_eps, :_empty
+      when :rept
+        # Three operands (min, max and expr)
+        statements << %{  #{indent}#{pfx}:min #{expr[0].inspect};}
+        statements << %{  #{indent}#{pfx}:max #{expr[1].inspect};}
+        statements << %{#{indent}#{bra}#{pfx}:#{op} }
+        statements += ttl_expr(expr.last, pfx, depth + 1)
+        statements << %{#{indent} #{ket}} unless ket.empty?
+      when :_empty, :_eps
         statements << %{#{indent}"g:#{op.to_s[1..-1]}"}
       when :"'"
         statements << %{#{indent}"#{esc(expr)}"}
+      when :istr
+        statements << %{#{indent}#{bra} re:matches #{expr.first.inspect} #{ket}}
       when :range
         statements << %{#{indent}#{bra} re:matches #{cclass(expr.first).inspect} #{ket}}
       when :hex
@@ -471,7 +766,7 @@ module EBNF
     def make_sym_id(variation = nil)
       @id_seq ||= 0  # per-rule counter, started lazily
       @id_seq += 1  # every call consumes the next sequence number
-      ["_#{@sym}_#{@id_seq}#{variation}".to_sym, "#{@id}.#{@id_seq}#{variation}"]
+      ["_#{@sym}_#{@id_seq}#{variation}".to_sym, ("#{@id}.#{@id_seq}#{variation}" if @id)]  # pair of [derived symbol, derived id]; the id slot is nil when this rule has no @id
     end
   end
 end