RubyGems - ebnf - Versions diffs - 2.0.0 → 2.1.0 - Mend

ebnf 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

data/lib/ebnf/peg.rb CHANGED

@@ -31,7 +31,7 @@ module EBNF
     def to_ruby_peg(output, **options)
       output.puts "  RULES = ["
       ast.each do |rule|
-        output.puts "    " + rule.to_ruby + '.extend(EBNF::PEG::Rule),'
+        output.puts "    " + rule.to_ruby + (rule.is_a?(EBNF::PEG::Rule) ? '.extend(EBNF::PEG::Rule)' : '') + ','
       end
       output.puts "  ]"
     end

data/lib/ebnf/peg/parser.rb CHANGED

@@ -51,6 +51,7 @@ module EBNF::PEG
     # DSL for creating terminals and productions
     module ClassMethods
       def start_handlers; (@start_handlers ||= {}); end
+      def start_options; (@start_hoptions ||= {}); end
       def production_handlers; (@production_handlers ||= {}); end
       def terminal_handlers; (@terminal_handlers ||= {}); end
       def terminal_regexps; (@terminal_regexps ||= {}); end
@@ -97,6 +98,10 @@ module EBNF::PEG
       #
       # @param [Symbol] term
       #   The rule name
+      # @param [Hash{Symbol => Object}] options
+      #   Options which are returned from {Parser#onStart}.
+      # @option options [Boolean] :as_hash (false)
+      #   If the production is a `seq`, causes the value to be represented as a single hash, rather than an array of individual hashes for each sub-production. Note that this is not always advisable due to the possibility of repeated productions within the sequence.
       # @yield [data, block]
       # @yieldparam [Hash] data
       #   A Hash defined for the current production, during :start
@@ -106,8 +111,9 @@ module EBNF::PEG
       #   Block passed to initialization for yielding to calling parser.
       #   Should conform to the yield specs for #initialize
       # Yield to generate a triple
-      def start_production(term, &block)
+      def start_production(term, **options, &block)
         start_handlers[term] = block
+        start_options[term] = options.freeze
       end
       ##
@@ -204,6 +210,7 @@ module EBNF::PEG
       @whitespace = case options[:whitespace]
       when Regexp then options[:whitespace]
       when Symbol then @rules[options[:whitespace]]
+      else options[:whitespace]
       end ||
         @rules.values.detect(&:pass?) ||
         /(?:\s|(?:#[^x][^\n\r]*))+/m.freeze
@@ -329,19 +336,30 @@ module EBNF::PEG
     #   @option options [Integer] :depth
     #     Recursion depth for indenting output
     #   @yieldreturn [String] additional string appended to `message`.
-    def debug(*args)
+    def debug(*args, &block)
       return unless @options[:logger]
       options = args.last.is_a?(Hash) ? args.pop : {}
       lineno = options[:lineno] || (scanner.lineno if scanner)
       level = options.fetch(:level, 0)
       depth = options[:depth] || self.depth
-      args << yield if block_given?
-      @options[:logger].add(level, "[#{lineno}]" + (" " * depth) + args.join(" "))
+      if self.respond_to?(:log_debug)
+        level = [:debug, :info, :warn, :error, :fatal][level]
+        log_debug(*args, **options.merge(level: level, lineno: lineno, depth: depth), &block)
+      elsif @options[:logger].respond_to?(:add)
+        args << yield if block_given?
+        @options[:logger].add(level, "[#{lineno}]" + (" " * depth) + args.join(" "))
+      elsif @options[:logger].respond_to?(:<<)
+        args << yield if block_given?
+        @options[:logger] << "[#{lineno}]" + (" " * depth) + args.join(" ")
+      end
     end
     # Start for production
     # Adds data available during the processing of the production
+    #
+    # @return [Hash] composed of production options. Currently only `as_hash` is supported.
+    # @see ClassMethods#start_production
     def onStart(prod)
       handler = self.class.start_handlers[prod]
       @productions << prod
@@ -367,6 +385,7 @@ module EBNF::PEG
         # explicit start handler
         @prod_data << {}
       end
+      return self.class.start_options.fetch(prod, {}) # any options on this production
     end
     # Finish of production

data/lib/ebnf/peg/rule.rb CHANGED

@@ -18,14 +18,15 @@ module EBNF::PEG
     #
     # If matched, the input position is updated and the results returned in a Hash.
     #
-    # * `alt`: returns the value of the matched production or `:unmatched`
-    # * `diff`: returns the string value matched, or `:unmatched`
+    # * `alt`: returns the value of the matched production or `:unmatched`.
+    # * `diff`: returns the value matched, or `:unmatched`.
     # * `hex`: returns a string composed of the matched hex character, or `:unmatched`.
-    # * `opt`: returns the matched production, or `nil` if unmatched.
-    # * `plus`: returns an array of the matches for the specified production, or `:unmatched`, if none are matched. For Terminals, these are concatenated into a single string.
-    # * `range`: returns a string composed of the character matching the range, or `:unmatched`.
-    # * `seq`: returns an array composed of single-entry hashes for each matched production indexed by the production name, or `:unmatched` if any production fails to match. For Terminals, returns a string created by concatenating these values.
-    # * `star`: returns an array of the matches for the specified production.For Terminals, these are concatenated into a single string.
+    # * `opt`: returns the value matched, or `nil` if unmatched.
+    # * `plus`: returns an array of the values matched for the specified production, or `:unmatched`, if none are matched. For Terminals, these are concatenated into a single string.
+    # * `range`: returns a string composed of the values matched, or `:unmatched`, if less than `min` are matched.
+    # * `seq`: returns an array composed of single-entry hashes for each matched production indexed by the production name, or `:unmatched` if any production fails to match. For Terminals, returns a string created by concatenating these values. Via option in a `production` or definition, the result can be a single hash with values for each matched production; note that this is not always possible due to the possibility of repeated productions within the sequence.
+    # * `star`: returns an array of the values matched for the specified production. For Terminals, these are concatenated into a single string.
+    #
     # @param [Scanner] input
     # @return [Hash{Symbol => Object}, :unmatched] A hash with keys for matched component of the expression. Returns :unmatched if the input does not match the production.
     def parse(input)
@@ -45,7 +46,7 @@ module EBNF::PEG
         # otherwise,
         if regexp = parser.find_terminal_regexp(sym)
           matched = input.scan(regexp)
-          result = (matched ? parser.onTerminal(sym, matched) : :unmatched)
+          result = parser.onTerminal(sym, (matched ? matched : :unmatched))
           # Update furthest failure for strings and terminals
           parser.update_furthest_failure(input.pos, input.lineno, sym) if result == :unmatched
           parser.packrat[sym][pos] = {
@@ -58,7 +59,7 @@ module EBNF::PEG
       else
         eat_whitespace(input)
       end
-      parser.onStart(sym)
+      start_options = parser.onStart(sym)
       result = case expr.first
       when :alt
@@ -84,7 +85,8 @@ module EBNF::PEG
         alt
       when :diff
         # matches any string that matches A but does not match B.
-        # XXX: Should this work for arbitrary rules?
+        # (Note, this is only used for Terminal rules, non-terminals will use :not)
+        raise "Diff used on non-terminal #{prod}" unless terminal?
         re1, re2 = Regexp.new(translate_codepoints(expr[1])), Regexp.new(translate_codepoints(expr[2]))
         matched = input.scan(re1)
         if !matched || re2.match?(matched)
@@ -101,9 +103,9 @@ module EBNF::PEG
           parser.update_furthest_failure(input.pos, input.lineno, expr.last)
           :unmatched
         end
-      when :opt
-        # Always matches
-        opt = case prod = expr[1]
+      when :not
+        # matches any string that does not match B.
+        res = case prod = expr[1]
         when Symbol
           rule = parser.find_rule(prod)
           raise "No rule found for #{prod}" unless rule
@@ -111,35 +113,29 @@ module EBNF::PEG
         when String
           input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
         end
-        if opt == :unmatched
+        if res != :unmatched
           # Update furthest failure for terminals
-          parser.update_furthest_failure(input.pos, input.lineno, prod) if terminal?
-          nil
+          parser.update_furthest_failure(input.pos, input.lineno, sym) if terminal?
+          :unmatched
         else
-          opt
+          nil
         end
+      when :opt
+        # Result is the matched value or nil
+        opt = rept(input, 0, 1, expr[1])
+        # Update furthest failure for strings and terminals
+        parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
+        opt.first
       when :plus
         # Result is an array of all expressions while they match,
         # at least one must match
-        prod, plus = expr[1], []
-        case prod
-        when Symbol
-          rule = parser.find_rule(prod)
-          raise "No rule found for #{prod}" unless rule
-          while (res = rule.parse(input)) != :unmatched
-            eat_whitespace(input)
-            plus << res
-          end
-        when String
-          while res = input.scan(Regexp.new(Regexp.quote(prod)))
-            eat_whitespace(input)
-            plus << res
-          end
-        end
+        plus = rept(input, 1, '*', expr[1])
         # Update furthest failure for strings and terminals
-        parser.update_furthest_failure(input.pos, input.lineno, prod)
-        plus.empty? ? :unmatched : (terminal? ? plus.compact.join("") : plus.compact)
-      when :range
+        parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
+        plus.is_a?(Array) && terminal? ? plus.join("") : plus
+      when :range, :istr
         # Matches the specified character range
         input.scan(to_regexp) || begin
           # Update furthest failure for strings and terminals
@@ -149,7 +145,7 @@ module EBNF::PEG
       when :seq
         # Evaluate each expression into an array of hashes where each hash contains a key from the associated production and the value is the parsed value of that production. Returns :unmatched if the input does not match the production. Value ordering is ensured by native Hash ordering.
         seq = expr[1..-1].each_with_object([]) do |prod, accumulator|
-          eat_whitespace(input) unless accumulator.empty?
+          eat_whitespace(input) unless accumulator.empty? || terminal?
           res = case prod
           when Symbol
             rule = parser.find_rule(prod)
@@ -165,32 +161,23 @@ module EBNF::PEG
           end
           accumulator << {prod.to_sym => res}
         end
-        seq == :unmatched ?
-          :unmatched :
-          (terminal? ?
-            seq.map(&:values).compact.join("") : # Concat values for terminal production
-            seq)
+        if seq == :unmatched
+          :unmatched
+        elsif terminal?
+          seq.map(&:values).compact.join("") # Concat values for terminal production
+        elsif start_options[:as_hash]
+          seq.inject {|memo, h| memo.merge(h)}
+        else
+          seq
+        end
       when :star
         # Result is an array of all expressions while they match,
         # an empty array if none match
-        prod, star = expr[1], []
-        case prod
-        when Symbol
-          rule = parser.find_rule(prod)
-          raise "No rule found for #{prod}" unless rule
-          while (res = rule.parse(input)) != :unmatched
-            eat_whitespace(input)
-            star << res
-          end
-        when String
-          while res = input.scan(Regexp.new(Regexp.quote(prod)))
-            eat_whitespace(input)
-            star << res
-          end
-        end
+        star = rept(input, 0, '*', expr[1])
         # Update furthest failure for strings and terminals
-        parser.update_furthest_failure(input.pos, input.lineno, prod)
-        star.compact
+        parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
+        star.is_a?(Array) && terminal? ? star.join("") : star
       else
         raise "attempt to parse unknown rule type: #{expr.first}"
       end
@@ -208,6 +195,38 @@ module EBNF::PEG
       return parser.packrat[sym][pos][:result]
     end
+    ##
+    # Repetition, 0-1, 0-n, 1-n, ...
+    #
+    # Note, nil results are removed from the result, but count towards min/max calculations
+    #
+    # @param [Scanner] input
+    # @param [Integer] min
+    # @param [Integer] max
+    #   If it is an integer, it stops matching after max entries.
+    # @param [Symbol, String] prod
+    # @return [:unmatched, Array]
+    def rept(input, min, max, prod)
+      result = []
+      case prod
+      when Symbol
+        rule = parser.find_rule(prod)
+        raise "No rule found for #{prod}" unless rule
+        while (max == '*' || result.length < max) && (res = rule.parse(input)) != :unmatched
+          eat_whitespace(input) unless terminal?
+          result << res
+        end
+      when String
+        while (res = input.scan(Regexp.new(Regexp.quote(prod)))) && (max == '*' || result.length < max)
+          eat_whitespace(input) unless terminal?
+          result << res
+        end
+      end
+      result.length < min ? :unmatched : result.compact
+    end
     ##
     # Eat whitespace between non-terminal rules
     def eat_whitespace(input)

data/lib/ebnf/rule.rb CHANGED

@@ -1,17 +1,33 @@
 require 'scanf'
+require 'strscan'
 module EBNF
   # Represent individual parsed rules
   class Rule
     # Operations which are flattened to separate rules in to_bnf.
     BNF_OPS = %w{
-      alt opt plus seq star
+      alt diff not opt plus rept seq star
     }.map(&:to_sym).freeze
     TERM_OPS = %w{
-      diff hex range
+      hex istr range
     }.map(&:to_sym).freeze
+    # The number of arguments expected per operator. `nil` for unspecified
+    OP_ARGN =       {
+      alt: nil,
+      diff: 2,
+      hex: 1,
+      istr: 1,
+      not: 1,
+      opt: 1,
+      plus: 1,
+      range: 1,
+      rept: 3,
+      seq: nil,
+      star: 1
+    }
     # Symbol of rule
     #
     # @return [Symbol]
@@ -28,7 +44,7 @@ module EBNF
     # Kind of rule
     #
-    # @return [:rule, :terminal, or :pass]
+    # @return [:rule, :terminal, :terminals, or :pass]
     attr_accessor :kind
     # Rule expression
@@ -59,19 +75,38 @@ module EBNF
     # Determines preparation and cleanup rules for reconstituting EBNF ? * + from BNF
     attr_accessor :cleanup
-    # @param [Symbol] sym
-    # @param [Integer] id
+    # @param [Symbol, nil] sym
+    #   `nil` is allowed only for @pass or @terminals
+    # @param [String, nil] id
     # @param [Array] expr
-    # @param [Symbol] kind (nil)
+    #   The expression is an internal-representation of an S-Expression with one of the following operators:
+    #
+    #   * `alt` – A list of alternative rules, which are attempted in order. It terminates with the first matching rule, or is terminated as unmatched, if no such rule is found.
+    #   * `diff` – matches any string that matches `A` but does not match `B`.
+    #   * `hex` – A single character represented using the hexadecimal notation `#xnn`.
+    #   * `istr` – A string which matches in a case-insensitive manner, so that `(istr "fOo")` will match either of the strings `"foo"`, `"FOO"` or any other combination.
+    #   * `opt` – An optional rule or terminal. It either results in the matching rule or returns `nil`.
+    #   * `plus` – A sequence of one or more of the matching rule. If there is no such rule, it is terminated as unmatched; otherwise, the result is an array containing all matched input.
+    #   * `range` – A range of characters, possibly repeated, of the form `(range "a-z")`. May also use hexadecimal notation.
+    #   * `rept m n` – A sequence of at least `m` and at most `n` of the matching rule. It will always return an array.
+    #   * `seq` – A sequence of rules or terminals. If any (other than `opt` or `star`) do not parse, the rule is terminated as unmatched.
+    #   * `star` – A sequence of zero or more of the matching rule. It will always return an array.
+    # @param [:rule, :terminal, :terminals, :pass] kind (nil)
     # @param [String] ebnf (nil)
+    #   When parsing, records the EBNF string used to create the rule.
     # @param [Array] first (nil)
+    #   Recorded set of terminals that can precede this rule (LL(1))
     # @param [Array] follow (nil)
+    #   Recorded set of terminals that can follow this rule (LL(1))
     # @param [Boolean] start (nil)
+    #   Is this the starting rule for the grammar?
     # @param [Rule] top_rule (nil)
+    #   The top-most rule. All expressed rules are top-rules, derived rules have the original rule as their top-rule.
     # @param [Boolean] cleanup (nil)
+    #   Records information useful for cleaning up converted :plus, and :star expansions (LL(1)).
     def initialize(sym, id, expr, kind: nil, ebnf: nil, first: nil, follow: nil, start: nil, top_rule: nil, cleanup: nil)
       @sym, @id = sym, id
-      @expr = expr.is_a?(Array) ? expr : [:seq, expr]
+      @expr = expr.is_a?(Array) ? expr : [:seq, expr].compact
       @ebnf, @kind, @first, @follow, @start, @cleanup, @top_rule = ebnf, kind, first, follow, start, cleanup, top_rule
       @top_rule ||= self
       @kind ||= case
@@ -79,21 +114,53 @@ module EBNF
       when !BNF_OPS.include?(@expr.first) then :terminal
       else :rule
       end
+      # Allow @pass and @terminals to not be named
+      @sym ||= :_pass if @kind == :pass
+      @sym ||= :_terminals if @kind == :terminals
+      raise ArgumentError, "Rule sym must be a symbol, was #{@sym.inspect}" unless @sym.is_a?(Symbol)
+      raise ArgumentError, "Rule id must be a string or nil, was #{@id.inspect}" unless (@id || "").is_a?(String)
+      raise ArgumentError, "Rule kind must be one of :rule, :terminal, :terminals, or :pass, was #{@kind.inspect}" unless
+        @kind.is_a?(Symbol) && %w(rule terminal terminals pass).map(&:to_sym).include?(@kind)
+      case @expr.first
+      when :alt
+        raise ArgumentError, "#{@expr.first} operation must have at least one operand, had #{@expr.length - 1}" unless @expr.length > 1
+      when :diff
+        raise ArgumentError, "#{@expr.first} operation must have exactly two operands, had #{@expr.length - 1}" unless @expr.length == 3
+      when :hex, :istr, :not, :opt, :plus, :range, :star
+        raise ArgumentError, "#{@expr.first} operation must have exactly one operand, had #{@expr.length - 1}" unless @expr.length == 2
+      when :rept
+        raise ArgumentError, "#{@expr.first} operation must have exactly three, had #{@expr.length - 1}" unless @expr.length == 4
+        raise ArgumentError, "#{@expr.first} operation must an non-negative integer minimum, was #{@expr[1]}" unless
+          @expr[1].is_a?(Integer) && @expr[1] >= 0
+        raise ArgumentError, "#{@expr.first} operation must an non-negative integer maximum or '*', was #{@expr[2]}" unless
+          @expr[2] == '*' || @expr[2].is_a?(Integer) && @expr[2] >= 0
+      when :seq
+        # It's legal to have a zero-length sequence
+      else
+        raise ArgumentError, "Rule expression must be an array using a known operator, was #{@expr.first}"
+      end
     end
     ##
     # Return a rule from its SXP representation:
     #
     # @example inputs
-    #    (pass (plus (range "#x20\\t\\r\\n")))
+    #    (pass _pass (plus (range "#x20\\t\\r\\n")))
     #    (rule ebnf "1" (star (alt declaration rule)))
-    #    (terminal O_ENUM "17" (seq "[^" (plus CHAR) "]"))
+    #    (terminal R_CHAR "19" (diff CHAR (alt "]" "-")))
     #
     # Also may have `(first ...)`, `(follow ...)`, or `(start #t)`.
     #
-    # @param [Array] sxp
+    # @param [String, Array] sxp
     # @return [Rule]
     def self.from_sxp(sxp)
+      if sxp.is_a?(String)
+        require 'sxp' unless defined?(SXP)
+        sxp = SXP.parse(sxp)
+      end
       expr = sxp.detect {|e| e.is_a?(Array) && ![:first, :follow, :start].include?(e.first.to_sym)}
       first = sxp.detect {|e| e.is_a?(Array) && e.first.to_sym == :first}
       first = first[1..-1] if first
@@ -115,11 +182,11 @@ module EBNF
     # @param [Hash{Symbol => Symbol}] cleanup (nil)
     # @param [Hash{Symbol => Object}] options
     def build(expr, kind: nil, cleanup: nil, **options)
-      new_sym, new_id = (@top_rule ||self).send(:make_sym_id)
+      new_sym, new_id = @top_rule.send(:make_sym_id)
       self.class.new(new_sym, new_id, expr,
                      kind: kind,
                      ebnf: @ebnf,
-                     top_rule: (@top_rule || self),
+                     top_rule: @top_rule,
                      cleanup: cleanup,
                      **options)
     end
@@ -152,15 +219,16 @@ module EBNF
     # @return [String]
     def to_ttl
       @ebnf.debug("to_ttl") {inspect} if @ebnf
-      comment = orig.to_s.strip.
-        gsub(/"""/, '\"\"\"').
-        gsub("\\", "\\\\").
-        sub(/^\"/, '\"').
-        sub(/\"$/m, '\"')
-      statements = [
-        %{:#{id} rdfs:label "#{id}"; rdf:value "#{sym}";},
-        %{  rdfs:comment #{comment.inspect};},
-      ]
+      statements = [%{:#{sym} rdfs:label "#{sym}";}]
+      if orig
+        comment = orig.to_s.strip.
+          gsub(/"""/, '\"\"\"').
+          gsub("\\", "\\\\").
+          sub(/^\"/, '\"').
+          sub(/\"$/m, '\"')
+        statements << %{  rdfs:comment #{comment.inspect};}
+      end
+      statements << %{  dc:identifier "#{id}";} if id
       statements += ttl_expr(expr, terminal? ? "re" : "g", 1, false)
       "\n" + statements.join("\n")
@@ -175,12 +243,13 @@ module EBNF
     ##
     # Transform EBNF rule to BNF rules:
     #
-    #   * Transform (rule a "n" (op1 (op2))) into two rules:
-    #     (rule a "n" (op1 _a_1))
-    #     (rule _a_1 "n.1" (op2))
-    #   * Transform (rule a (opt b)) into (rule a (alt _empty b))
-    #   * Transform (rule a (star b)) into (rule a (alt _empty (seq b a)))
-    #   * Transform (rule a (plus b)) into (rule a (seq b (star b)
+    #   * Transform `(rule a "n" (op1 (op2)))` into two rules:
+    #
+    #         (rule a "n" (op1 _a_1))
+    #         (rule _a_1 "n.1" (op2))
+    #   * Transform `(rule a (opt b))` into `(rule a (alt _empty b))`
+    #   * Transform `(rule a (star b))` into `(rule a (alt _empty (seq b a)))`
+    #   * Transform `(rule a (plus b))` into `(rule a (seq b (star b)))`
     #
     # Transformation includes information used to re-construct non-transformed.
     #
@@ -231,7 +300,7 @@ module EBNF
         # Otherwise, no further transformation necessary
         new_rules << self
       elsif [:diff, :hex, :range].include?(expr.first)
-        # This rules are fine, the just need to be terminals
+        # These rules are fine, they just need to be terminals
         raise "Encountered #{expr.first.inspect}, which is a #{self.kind}, not :terminal" unless self.terminal?
         new_rules << self
       else
@@ -245,9 +314,14 @@ module EBNF
     ##
     # Transform EBNF rule for PEG:
     #
-    #   * Transform (rule a "n" (op1 ... (op2 y) ...z)) into two rules:
-    #     (rule a "n" (op1 ... _a_1 ... z))
-    #     (rule _a_1 "n.1" (op2 y))
+    #   * Transform `(rule a "n" (op1 ... (op2 y) ...z))` into two rules:
+    #
+    #         (rule a "n" (op1 ... _a_1 ... z))
+    #         (rule _a_1 "n.1" (op2 y))
+    #   * Transform `(rule a "n" (diff op1 op2))` into two rules:
+    #
+    #         (rule a "n" (seq _a_1 op1))
+    #         (rule _a_1 "n.1" (not op2))
     #
     # @return [Array<Rule>]
     def to_peg
@@ -268,8 +342,14 @@ module EBNF
         # Return new rules after recursively applying #to_bnf
         new_rules = new_rules.map {|r| r.to_peg}.flatten
-      elsif [:diff, :hex, :range].include?(expr.first)
-        # This rules are fine, the just need to be terminals
+      elsif expr.first == :diff && !terminal?
+        this = dup
+        new_rule = build([:not, expr[2]])
+        this.expr = [:seq, new_rule.sym, expr[1]]
+        new_rules << this
+        new_rules << new_rule
+      elsif [:hex, :istr, :range].include?(expr.first)
+        # These rules are fine, they just need to be terminals
         raise "Encountered #{expr.first.inspect}, which is a #{self.kind}, not :terminal" unless self.terminal?
         new_rules << self
       else
@@ -287,6 +367,8 @@ module EBNF
       case expr.first
       when :hex
         Regexp.new(translate_codepoints(expr[1]))
+      when :istr
+        /#{expr.last}/ui
       when :range
         Regexp.new("[#{translate_codepoints(expr[1])}]")
       else
@@ -294,45 +376,170 @@ module EBNF
       end
     end
-    # Return the non-terminals for this rule. For seq, this is the first
-    # non-terminal in the sequence. For alt, this is every non-terminal in the alt.
+    # Is this a terminal?
+    #
+    # @return [Boolean]
+    def terminal?
+      kind == :terminal
+    end
+    # Is this a pass?
+    # @return [Boolean]
+    def pass?
+      kind == :pass
+    end
+    # Is this a rule?
+    # @return [Boolean]
+    def rule?
+      kind == :rule
+    end
+    # Is this rule of the form (alt ...)?
+    def alt?
+      expr.is_a?(Array) && expr.first == :alt
+    end
+    # Is this rule of the form (seq ...)?
+    def seq?
+      expr.is_a?(Array) && expr.first == :seq
+    end
+    def inspect
+      "#<EBNF::Rule:#{object_id} " +
+      {sym: sym, id: id, kind: kind, expr: expr}.inspect +
+      ">"
+    end
+    # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}.
+    #
+    # @param [Rule] other
+    # @return [Boolean]
+    def ==(other)
+      sym   == other.sym &&
+      kind  == other.kind &&
+      expr  == other.expr
+    end
+    # Two rules are equivalent if they have the same {#expr}.
+    #
+    # @param [Rule] other
+    # @return [Boolean]
+    def eql?(other)
+      expr == other.expr
+    end
+    # Rules compare using their ids
+    def <=>(other)
+      if id && other.id
+        if id == other.id
+          id.to_s <=> other.id.to_s
+        else
+          id.to_f <=> other.id.to_f
+        end
+      else
+        sym.to_s <=> other.sym.to_s
+      end
+    end
+    ##
+    # Utility function to translate code points of the form '#xN' into ruby unicode characters
+    def translate_codepoints(str)
+      str.gsub(/#x\h+/) {|c| c[2..-1].scanf("%x").first.chr(Encoding::UTF_8)}
+    end
+    # Return the non-terminals for this rule.
+    #
+    # * `alt` => this is every non-terminal.
+    # * `diff` => this is every non-terminal.
+    # * `hex` => nil
+    # * `istr` => nil
+    # * `not` => this is the last expression, if any.
+    # * `opt` => this is the last expression, if any.
+    # * `plus` => this is the last expression, if any.
+    # * `range` => nil
+    # * `rept` => this is the last expression, if any.
+    # * `seq` => this is the first expression in the sequence, if any.
+    # * `star` => this is the last expression, if any.
     #
     # @param [Array<Rule>] ast
     #   The set of rules, used to turn symbols into rules
+    # @param [Array<Symbol,String,Array>] expr (@expr)
+    #   The expression to check, defaults to the rule expression.
+    #   Typically, if the expression is recursive, the embedded expression is called recursively.
     # @return [Array<Rule>]
-    def non_terminals(ast)
-      @non_terms ||= (alt? ? expr[1..-1] : expr[1,1]).map do |sym|
+    # @note this is used for LL(1) transformation, so rule types are limited
+    def non_terminals(ast, expr = @expr)
+      ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).map do |sym|
         case sym
         when Symbol
           r = ast.detect {|r| r.sym == sym}
           r if r && r.rule?
+        when Array
+          non_terminals(ast, sym)
         else
           nil
         end
-      end.compact
+      end.flatten.compact.uniq
     end
-    # Return the terminals for this rule. For seq, this is the first
-    # terminals or strings in the seq. For alt, this is every non-terminal ni the alt.
+    # Return the terminals for this rule.
+    #
+    # * `alt` => this is every terminal.
+    # * `diff` => this is every terminal.
+    # * `hex` => nil
+    # * `istr` => nil
+    # * `not` => this is the last expression, if any.
+    # * `opt` => this is the last expression, if any.
+    # * `plus` => this is the last expression, if any.
+    # * `range` => nil
+    # * `rept` => this is the last expression, if any.
+    # * `seq` => this is the first expression in the sequence, if any.
+    # * `star` => this is the last expression, if any.
     #
     # @param [Array<Rule>] ast
     #   The set of rules, used to turn symbols into rules
+    # @param [Array<Symbol,String,Array>] expr (@expr)
+    #   The expression to check, defaults to the rule expression.
+    #   Typically, if the expression is recursive, the embedded expression is called recursively.
     # @return [Array<Rule>]
-    def terminals(ast)
-      @terms ||= (alt? ? expr[1..-1] : expr[1,1]).map do |sym|
+    # @note this is used for LL(1) transformation, so rule types are limited
+    def terminals(ast, expr = @expr)
+      ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).map do |sym|
         case sym
         when Symbol
           r = ast.detect {|r| r.sym == sym}
           r if r && r.terminal?
         when String
           sym
-        else
-          nil
+        when Array
+          terminals(ast, sym)
         end
-      end.compact
+      end.flatten.compact.uniq
     end
-    # Does this rule start with a sym? It does if expr is that sym,
+    # Return the symbols used in the rule.
+    #
+    # @param [Array<Symbol,String,Array>] expr (@expr)
+    #   The expression to check, defaults to the rule expression.
+    #   Typically, if the expression is recursive, the embedded expression is called recursively.
+    # @return [Array<Symbol>]
+    def symbols(expr = @expr)
+      expr[1..-1].map do |sym|
+        case sym
+        when Symbol
+          sym
+        when Array
+          symbols(sym)
+        end
+      end.flatten.compact.uniq
+    end
+    ##
+    # The following are used for LL(1) transformation.
+    ##
+    # Does this rule start with `sym`? It does if expr is that sym,
     # expr starts with alt and contains that sym,
     # or expr starts with seq and the next element is that sym.
     #
@@ -349,6 +556,92 @@ module EBNF
       end
     end
+    ##
+    # Validate the rule, with respect to an AST.
+    #
+    # @param [Array<Rule>] ast
+    #   The set of rules, used to turn symbols into rules
+    # @param [Array<Symbol,String,Array>] expr (@expr)
+    #   The expression to check, defaults to the rule expression.
+    #   Typically, if the expression is recursive, the embedded expression is called recursively.
+    # @raise [SyntaxError] if the expression is not well formed
+    def validate!(ast, expr = @expr)
+      op = expr.first
+      raise SyntaxError, "Unknown operator: #{op}" unless OP_ARGN.key?(op)
+      raise SyntaxError, "Argument count missmatch on operator #{op}, had #{expr.length - 1} expected #{OP_ARGN[op]}" if
+        OP_ARGN[op] && OP_ARGN[op] != expr.length - 1  # a nil entry in OP_ARGN skips the arity check
+      # alt needs at least one operand; rept needs a valid minimum and maximum
+      if op == :alt
+        raise SyntaxError, "alt operation must have at least one operand, had #{expr.length - 1}" unless expr.length > 1
+      elsif op == :rept
+        raise SyntaxError, "rept operation must an non-negative integer minimum, was #{expr[1]}" unless
+          expr[1].is_a?(Integer) && expr[1] >= 0
+        raise SyntaxError, "rept operation must an non-negative integer maximum or '*', was #{expr[2]}" unless
+          expr[2] == '*' || expr[2].is_a?(Integer) && expr[2] >= 0
+      end
+      case op
+      when :hex  # NOTE(review): no local `sym` is defined at this point, so the message below does not interpolate the operand — confirm intent
+        raise SyntaxError, "Hex operand must be of form '#xN+': #{sym}" unless expr.last.match?(/^#x\h+$/)
+      when :range
+        str = expr.last.dup
+        str = str[1..-1] if str.start_with?('^')  # a leading '^' is not scanned as a range component
+        str = str[0..-2] if str.end_with?('-')  # Allowed at end of range
+        scanner = StringScanner.new(str)
+        hex = rchar = in_range = false  # kind of the previous component, and whether a '-' was just consumed
+        while !scanner.eos?
+          begin
+            if scanner.scan(Terminals::HEX)
+              raise SyntaxError if in_range && rchar  # a range may not go from a character to a hex value
+              rchar = in_range = false
+              hex = true
+            elsif scanner.scan(Terminals::R_CHAR)
+              raise SyntaxError if in_range && hex  # a range may not go from a hex value to a character
+              hex = in_range = false
+              rchar = true
+            else
+              raise(SyntaxError, "Range contains illegal components at offset #{scanner.pos}: was #{expr.last}")
+            end
+            if scanner.scan(/\-/)
+              raise SyntaxError if in_range  # NOTE(review): both branches above reset in_range, so this looks unreachable — confirm
+              in_range = true
+            end
+          rescue SyntaxError  # re-raised below with the scanner offset and the original range text
+            raise(SyntaxError, "Range contains illegal components at offset #{scanner.pos}: was #{expr.last}")
+          end
+        end
+      else
+        ([:alt, :diff].include?(expr.first) ? expr[1..-1] : expr[1,1]).each do |sym|  # NOTE(review): for operators other than alt/diff only the first operand is checked — confirm this is intended
+          case sym
+          when Symbol
+            r = ast.detect {|r| r.sym == sym}
+            raise SyntaxError, "No rule found for #{sym}" unless r
+          when Array
+            validate!(ast, sym)  # nested expression: validate recursively
+          when String
+            raise SyntaxError, "String must be of the form CHAR*" unless sym.match?(/^#{Terminals::CHAR}*$/)
+          end
+        end
+      end
+    end
+    ##
+    # Validate the rule, with respect to an AST.
+    #
+    # Uses `#validate!` and catches `SyntaxError`
+    #
+    # @param [Array<Rule>] ast
+    #   The set of rules, used to turn symbols into rules
+    # @return [Boolean]
+    def valid?(ast)
+      validate!(ast)
+      true
+    rescue SyntaxError  # the error validate! raises for an invalid rule
+      false
+    end
     # Do the firsts of this rule include the empty string?
     #
     # @return [Boolean]
@@ -381,79 +674,6 @@ module EBNF
       terminals.length
     end
-    # Is this a terminal?
-    #
-    # @return [Boolean]
-    def terminal?
-      kind == :terminal
-    end
-    # Is this a pass?
-    # @return [Boolean]
-    def pass?
-      kind == :pass
-    end
-    # Is this a rule?
-    # @return [Boolean]
-    def rule?
-      kind == :rule
-    end
-    # Is this rule of the form (alt ...)?
-    def alt?
-      expr.is_a?(Array) && expr.first == :alt
-    end
-    # Is this rule of the form (seq ...)?
-    def seq?
-      expr.is_a?(Array) && expr.first == :seq
-    end
-    # Is this rule of the form (alt ...)?
-    def alt?
-      expr.is_a?(Array) && expr.first == :alt
-    end
-    def inspect
-      "#<EBNF::Rule:#{object_id} " +
-      {sym: sym, id: id, kind: kind, expr: expr}.inspect +
-      ">"
-    end
-    # Two rules are equal if they have the same {#sym}, {#kind} and {#expr}.
-    #
-    # @param [Rule] other
-    # @return [Boolean]
-    def ==(other)
-      sym   == other.sym &&
-      kind  == other.kind &&
-      expr  == other.expr
-    end
-    # Two rules are equivalent if they have the same {#expr}.
-    #
-    # @param [Rule] other
-    # @return [Boolean]
-    def equivalent?(other)
-      expr == other.expr
-    end
-    # Rules compare using their ids
-    def <=>(other)
-      if id.to_i == other.id.to_i
-        id.to_s <=> other.id.to_s
-      else
-        id.to_i <=> other.id.to_i
-      end
-    end
-    ##
-    # Utility function to translate code points of the form '#xN' into ruby unicode characters
-    def translate_codepoints(str)
-      str.gsub(/#x\h+/) {|c| c[2..-1].scanf("%x").first.chr(Encoding::UTF_8)}
-    end
     private
     def ttl_expr(expr, pfx, depth, is_obj = true)
       indent = '  ' * depth
@@ -469,17 +689,28 @@ module EBNF
       case op
       when :seq, :alt, :diff
+        # Multiple operands
         statements << %{#{indent}#{bra}#{pfx}:#{op} (}
         expr.each {|a| statements += ttl_expr(a, pfx, depth + 1)}
         statements << %{#{indent} )#{ket}}
-      when :opt, :plus, :star
+      when :opt, :plus, :star, :not
+        # Single operand
         statements << %{#{indent}#{bra}#{pfx}:#{op} }
         statements += ttl_expr(expr.first, pfx, depth + 1)
         statements << %{#{indent} #{ket}} unless ket.empty?
+      when :rept
+        # Three operands (min, max and expr)
+        statements << %{  #{indent}#{pfx}:min #{expr[0].inspect};}
+        statements << %{  #{indent}#{pfx}:max #{expr[1].inspect};}
+        statements << %{#{indent}#{bra}#{pfx}:#{op} }
+        statements += ttl_expr(expr.last, pfx, depth + 1)
+        statements << %{#{indent} #{ket}} unless ket.empty?
       when :_empty, :_eps
         statements << %{#{indent}"g:#{op.to_s[1..-1]}"}
       when :"'"
         statements << %{#{indent}"#{esc(expr)}"}
+      when :istr
+        statements << %{#{indent}#{bra} re:matches #{expr.first.inspect} #{ket}}
       when :range
         statements << %{#{indent}#{bra} re:matches #{cclass(expr.first).inspect} #{ket}}
       when :hex
@@ -535,7 +766,7 @@ module EBNF
     def make_sym_id(variation = nil)
       @id_seq ||= 0
       @id_seq += 1
-      ["_#{@sym}_#{@id_seq}#{variation}".to_sym, "#{@id}.#{@id_seq}#{variation}"]
+      ["_#{@sym}_#{@id_seq}#{variation}".to_sym, ("#{@id}.#{@id_seq}#{variation}" if @id)]  # second element is nil when @id is not set
     end
   end
 end