ebnf 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,475 @@
1
+ module RDF::LL1
2
+ require 'rdf/ll1/scanner' unless defined?(Scanner)
3
+
4
+ ##
5
+ # A lexical analyzer
6
+ #
7
+ # @example Tokenizing a Turtle string
8
+ # terminals = [
9
+ # [:BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL}))],
10
+ # ...
11
+ # ]
12
+ # ttl = "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ."
13
+ # lexer = RDF::LL1::Lexer.tokenize(ttl, terminals)
14
+ # lexer.each_token do |token|
15
+ # puts token.inspect
16
+ # end
17
+ #
18
+ # @example Tokenizing and returning a token stream
19
+ # lexer = RDF::LL1::Lexer.tokenize(...)
20
+ # while :some-condition
21
+ # token = lexer.first # Get the current token
22
+ # token = lexer.shift # Get the current token and shift to the next
23
+ # end
24
+ #
25
+ # @example Handling error conditions
26
+ # begin
27
+ # RDF::Turtle::Lexer.tokenize(query)
28
+ # rescue RDF::Turtle::Lexer::Error => error
29
+ # warn error.inspect
30
+ # end
31
+ #
32
+ # @see http://en.wikipedia.org/wiki/Lexical_analysis
33
+ class Lexer
34
+ include Enumerable
35
+
36
+ ESCAPE_CHARS = {
37
+ '\\t' => "\t", # \u0009 (tab)
38
+ '\\n' => "\n", # \u000A (line feed)
39
+ '\\r' => "\r", # \u000D (carriage return)
40
+ '\\b' => "\b", # \u0008 (backspace)
41
+ '\\f' => "\f", # \u000C (form feed)
42
+ '\\"' => '"', # \u0022 (quotation mark, double quote mark)
43
+ "\\'" => '\'', # \u0027 (apostrophe-quote, single quote mark)
44
+ '\\\\' => '\\' # \u005C (backslash)
45
+ }
46
+ ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/ # \uXXXX
47
+ ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/ # \UXXXXXXXX
48
+ ECHAR = /\\./ # More liberal unescaping
49
+ UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/
50
+ COMMENT = /#.*/
51
+ WS = / |\t|\r|\n/m
52
+
53
+ ML_START = /\'\'\'|\"\"\"/ # Beginning of terminals that may span lines
54
+
55
+ ##
56
+ # @!attribute whitespace
57
+ # @return [Regexp] defines whitespace, defaults to WS
58
+ attr_reader :whitespace
59
+
60
+ ##
61
+ # @!attribute comment
62
+ # @return [Regexp] defines single-line comment, defaults to COMMENT
63
+ attr_reader :comment
64
+
65
+ ##
66
+ # Returns a copy of the given `input` string with all `\uXXXX` and
67
+ # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
68
+ # unescaped UTF-8 character counterparts.
69
+ #
70
+ # @param [String] string
71
+ # @return [String]
72
+ # @see http://www.w3.org/TR/rdf-sparql-query/#codepointEscape
73
+ def self.unescape_codepoints(string)
74
+ # Decode \uXXXX and \UXXXXXXXX code points:
75
+ string = string.gsub(UCHAR) do |c|
76
+ s = [(c[2..-1]).hex].pack('U*')
77
+ s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s
78
+ end
79
+
80
+ string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding) # Ruby 1.9+
81
+ string
82
+ end
83
+
84
+ ##
85
+ # Returns a copy of the given `input` string with all string escape
86
+ # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
87
+ # character counterparts.
88
+ #
89
+ # @param [String] input
90
+ # @return [String]
91
+ # @see http://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
92
+ def self.unescape_string(input)
93
+ input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
94
+ end
95
+
96
+ ##
97
+ # Tokenizes the given `input` string or stream.
98
+ #
99
+ # @param [String, #to_s] input
100
+ # @param [Array<Array<Symbol, Regexp>>] terminals
101
+ # Array of symbol, regexp pairs used to match terminals.
102
+ # If the symbol is nil, it defines a Regexp to match string terminals.
103
+ # @param [Hash{Symbol => Object}] options
104
+ # @yield [lexer]
105
+ # @yieldparam [Lexer] lexer
106
+ # @return [Lexer]
107
+ # @raise [Lexer::Error] on invalid input
108
+ def self.tokenize(input, terminals, options = {}, &block)
109
+ lexer = self.new(input, terminals, options)
110
+ block_given? ? block.call(lexer) : lexer
111
+ end
112
+
113
+ ##
114
+ # Initializes a new lexer instance.
115
+ #
116
+ # @param [String, #to_s] input
117
+ # @param [Array<Array<Symbol, Regexp>>] terminals
118
+ # Array of symbol, regexp pairs used to match terminals.
119
+ # If the symbol is nil, it defines a Regexp to match string terminals.
120
+ # @param [Hash{Symbol => Object}] options
121
+ # @option options [Regexp] :whitespace (WS)
122
+ # @option options [Regexp] :comment (COMMENT)
123
+ # @option options [Array<Symbol>] :unescape_terms ([])
124
+ # Regular expression matching the beginning of terminals that may cross newlines
125
+ def initialize(input = nil, terminals = nil, options = {})
126
+ @options = options.dup
127
+ @whitespace = @options[:whitespace] || WS
128
+ @comment = @options[:comment] || COMMENT
129
+ @unescape_terms = @options[:unescape_terms] || []
130
+ @terminals = terminals
131
+
132
+ raise Error, "Terminal patterns not defined" unless @terminals && @terminals.length > 0
133
+
134
+ @lineno = 1
135
+ @scanner = Scanner.new(input) do |string|
136
+ string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding) # Ruby 1.9+
137
+ string
138
+ end
139
+ end
140
+
141
+ ##
142
+ # Any additional options for the lexer.
143
+ #
144
+ # @return [Hash]
145
+ attr_reader :options
146
+
147
+ ##
148
+ # The current input string being processed.
149
+ #
150
+ # @return [String]
151
+ attr_accessor :input
152
+
153
+ ##
154
+ # The current line number (zero-based).
155
+ #
156
+ # @return [Integer]
157
+ attr_reader :lineno
158
+
159
+ ##
160
+ # Returns `true` if the input string is lexically valid.
161
+ #
162
+ # To be considered valid, the input string must contain more than zero
163
+ # terminals, and must not contain any invalid terminals.
164
+ #
165
+ # @return [Boolean]
166
+ def valid?
167
+ begin
168
+ !count.zero?
169
+ rescue Error
170
+ false
171
+ end
172
+ end
173
+
174
+ ##
175
+ # Enumerates each token in the input string.
176
+ #
177
+ # @yield [token]
178
+ # @yieldparam [Token] token
179
+ # @return [Enumerator]
180
+ def each_token(&block)
181
+ if block_given?
182
+ while token = shift
183
+ yield token
184
+ end
185
+ end
186
+ enum_for(:each_token)
187
+ end
188
+ alias_method :each, :each_token
189
+
190
+ ##
191
+ # Returns first token in input stream
192
+ #
193
+ # @return [Token]
194
+ def first
195
+ return nil unless scanner
196
+
197
+ @first ||= begin
198
+ {} while !scanner.eos? && skip_whitespace
199
+ return @scanner = nil if scanner.eos?
200
+
201
+ token = match_token
202
+
203
+ if token.nil?
204
+ lexme = (scanner.rest.split(/#{@whitespace}|#{@comment}/).first rescue nil) || scanner.rest
205
+ raise Error.new("Invalid token #{lexme[0..100].inspect}",
206
+ :input => scanner.rest[0..100], :token => lexme, :lineno => lineno)
207
+ end
208
+
209
+ token
210
+ end
211
+ rescue ArgumentError, Encoding::CompatibilityError => e
212
+ raise Error.new("#{e.message} on line #{lineno + 1}",
213
+ :input => (scanner.rest[0..100] rescue '??'), :token => lexme, :lineno => lineno)
214
+ rescue Error
215
+ raise
216
+ rescue
217
+ STDERR.puts "Expected ArgumentError, got #{$!.class}"
218
+ raise
219
+ end
220
+
221
+ ##
222
+ # Returns first token and shifts to next
223
+ #
224
+ # @return [Token]
225
+ def shift
226
+ cur = first
227
+ @first = nil
228
+ cur
229
+ end
230
+
231
+ ##
232
+ # Skip input until a token is matched
233
+ #
234
+ # @return [Token]
235
+ def recover
236
+ until scanner.eos? do
237
+ begin
238
+ shift
239
+ return first
240
+ rescue Error, ArgumentError
241
+ # Ignore errors until something scans, or EOS.
242
+ scanner.pos = scanner.pos + 1
243
+ end
244
+ end
245
+ end
246
+ protected
247
+
248
+ # @return [StringScanner]
249
+ attr_reader :scanner
250
+
251
+ # Perform string and codepoint unescaping
252
+ # @param [String] string
253
+ # @return [String]
254
+ def unescape(string)
255
+ self.class.unescape_string(self.class.unescape_codepoints(string))
256
+ end
257
+
258
+ ##
259
+ # Skip whitespace or comments, as defined through input options or defaults
260
+ def skip_whitespace
261
+ # skip all white space, but keep track of the current line number
262
+ while !scanner.eos?
263
+ if matched = scanner.scan(@whitespace)
264
+ @lineno += matched.count("\n")
265
+ elsif (com = scanner.scan(@comment))
266
+ else
267
+ return
268
+ end
269
+ end
270
+ end
271
+
272
+ ##
273
+ # Return the matched token
274
+ #
275
+ # @return [Token]
276
+ def match_token
277
+ @terminals.each do |(term, regexp)|
278
+ #STDERR.puts "match[#{term}] #{scanner.rest[0..100].inspect} against #{regexp.inspect}" #if term == :STRING_LITERAL_SINGLE_QUOTE
279
+ if matched = scanner.scan(regexp)
280
+ matched = unescape(matched) if @unescape_terms.include?(term)
281
+ #STDERR.puts " unescape? #{@unescape_terms.include?(term).inspect}"
282
+ #STDERR.puts " matched #{term.inspect}: #{matched.inspect}"
283
+ return token(term, matched)
284
+ end
285
+ end
286
+ nil
287
+ end
288
+
289
+ protected
290
+
291
+ ##
292
+ # Constructs a new token object annotated with the current line number.
293
+ #
294
+ # The parser relies on the type being a symbolized URI and the value being
295
+ # a string, if there is no type. If there is a type, then the value takes
296
+ # on the native representation appropriate for that type.
297
+ #
298
+ # @param [Symbol] type
299
+ # @param [String] value
300
+ # Scanner instance with access to matched groups
301
+ # @return [Token]
302
+ def token(type, value)
303
+ Token.new(type, value, :lineno => lineno)
304
+ end
305
+
306
+ ##
307
+ # Represents a lexer token.
308
+ #
309
+ # @example Creating a new token
310
+ # token = RDF::LL1::Lexer::Token.new(:LANGTAG, "en")
311
+ # token.type #=> :LANGTAG
312
+ # token.value #=> "en"
313
+ #
314
+ # @see http://en.wikipedia.org/wiki/Lexical_analysis#Token
315
+ class Token
316
+ ##
317
+ # Initializes a new token instance.
318
+ #
319
+ # @param [Symbol] type
320
+ # @param [String] value
321
+ # @param [Hash{Symbol => Object}] options
322
+ # @option options [Integer] :lineno (nil)
323
+ def initialize(type, value, options = {})
324
+ @type, @value = (type ? type.to_s.to_sym : nil), value
325
+ @options = options.dup
326
+ @lineno = @options.delete(:lineno)
327
+ end
328
+
329
+ ##
330
+ # The token's symbol type.
331
+ #
332
+ # @return [Symbol]
333
+ attr_reader :type
334
+
335
+ ##
336
+ # The token's value.
337
+ #
338
+ # @return [String]
339
+ attr_reader :value
340
+
341
+ ##
342
+ # The line number where the token was encountered.
343
+ #
344
+ # @return [Integer]
345
+ attr_reader :lineno
346
+
347
+ ##
348
+ # Any additional options for the token.
349
+ #
350
+ # @return [Hash]
351
+ attr_reader :options
352
+
353
+ ##
354
+ # Returns the attribute named by `key`.
355
+ #
356
+ # @param [Symbol] key
357
+ # @return [Object]
358
+ def [](key)
359
+ key = key.to_s.to_sym unless key.is_a?(Integer) || key.is_a?(Symbol)
360
+ case key
361
+ when 0, :type then @type
362
+ when 1, :value then @value
363
+ else nil
364
+ end
365
+ end
366
+
367
+ ##
368
+ # Returns `true` if the given `value` matches either the type or value
369
+ # of this token.
370
+ #
371
+ # @example Matching using the symbolic type
372
+ # RDF::LL1::Lexer::Token.new(:NIL) === :NIL #=> true
373
+ #
374
+ # @example Matching using the string value
375
+ # RDF::LL1::Lexer::Token.new(nil, "{") === "{" #=> true
376
+ #
377
+ # @param [Symbol, String] value
378
+ # @return [Boolean]
379
+ def ===(value)
380
+ case value
381
+ when Symbol then value == @type
382
+ when ::String then value.to_s == @value.to_s
383
+ else value == @value
384
+ end
385
+ end
386
+
387
+ ##
388
+ # Returns a hash table representation of this token.
389
+ #
390
+ # @return [Hash]
391
+ def to_hash
392
+ {:type => @type, :value => @value}
393
+ end
394
+
395
+ ##
396
+ # Readable version of token
397
+ def to_s
398
+ @type ? @type.inspect : @value
399
+ end
400
+
401
+ ##
402
+ # Returns type, if not nil, otherwise value
403
+ def representation
404
+ @type ? @type : @value
405
+ end
406
+
407
+ ##
408
+ # Returns an array representation of this token.
409
+ #
410
+ # @return [Array]
411
+ def to_a
412
+ [@type, @value]
413
+ end
414
+
415
+ ##
416
+ # Returns a developer-friendly representation of this token.
417
+ #
418
+ # @return [String]
419
+ def inspect
420
+ to_hash.inspect
421
+ end
422
+ end # class Token
423
+
424
+ ##
425
+ # Raised for errors during lexical analysis.
426
+ #
427
+ # @example Raising a lexer error
428
+ # raise RDF::LL1::Lexer::Error.new(
429
+ # "invalid token '%' on line 10",
430
+ # :input => query, :token => '%', :lineno => 9)
431
+ #
432
+ # @see http://ruby-doc.org/core/classes/StandardError.html
433
+ class Error < StandardError
434
+ ##
435
+ # The input string associated with the error.
436
+ #
437
+ # @return [String]
438
+ attr_reader :input
439
+
440
+ ##
441
+ # The invalid token which triggered the error.
442
+ #
443
+ # @return [String]
444
+ attr_reader :token
445
+
446
+ ##
447
+ # The line number where the error occurred.
448
+ #
449
+ # @return [Integer]
450
+ attr_reader :lineno
451
+
452
+ ##
453
+ # Initializes a new lexer error instance.
454
+ #
455
+ # @param [String, #to_s] message
456
+ # @param [Hash{Symbol => Object}] options
457
+ # @option options [String] :input (nil)
458
+ # @option options [String] :token (nil)
459
+ # @option options [Integer] :lineno (nil)
460
+ def initialize(message, options = {})
461
+ @input = options[:input]
462
+ @token = options[:token]
463
+ @lineno = options[:lineno]
464
+ super(message.to_s)
465
+ end
466
+ end # class Error
467
+
468
+ unless "".respond_to?(:force_encoding)
469
+ # Compatibility with 1.9 Encoding
470
+ module Encoding
471
+ class CompatibilityError < StandardError; end
472
+ end
473
+ end
474
+ end # class Lexer
475
+ end # module RDF::LL1