RubyGems - crass - Versions diffs - 0.0.1 - Mend

crass 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

data/lib/crass/scanner.rb ADDED Viewed

@@ -0,0 +1,125 @@
+module Crass
+  # Similar to a StringScanner, but with extra functionality needed to tokenize
+  # CSS while preserving the original text.
+  class Scanner
+    # Current character, or `nil` if the scanner hasn't yet consumed a
+    # character, or is at the end of the string.
+    attr_reader :current
+    # Current marker position. Use {#marked} to get the substring between
+    # {#marker} and {#pos}.
+    attr_accessor :marker
+    # Position of the next character that will be consumed. This is a character
+    # position, not a byte position, so it accounts for multi-byte characters.
+    attr_accessor :pos
+    # The string being scanned.
+    attr_reader :string
+    # Creates a Scanner instance for the given _input_ string or IO instance.
+    def initialize(input)
+      @string = input.is_a?(IO) ? input.read : input.to_s
+      @chars  = @string.chars.to_a
+      reset
+    end
+    # Consumes the next character and returns it, advancing the pointer, or
+    # an empty string if the end of the string has been reached.
+    def consume
+      @current = @chars[@pos] || ''
+      @pos += 1 if @current
+      @current
+    end
+    # Consumes the rest of the string and returns it, advancing the pointer to
+    # the end of the string. Returns an empty string is the end of the string
+    # has already been reached.
+    def consume_rest
+      rest     = @string[@pos..@len] || ''
+      @current = rest[-1] || ''
+      @pos     = @len
+      rest
+    end
+    # Returns `true` if the end of the string has been reached, `false`
+    # otherwise.
+    def eos?
+      @pos == @len
+    end
+    # Sets the marker to the position of the next character that will be
+    # consumed.
+    def mark
+      @marker = @pos
+    end
+    # Returns the substring between {#marker} and {#pos}, without altering the
+    # pointer.
+    def marked
+      if result = @chars[@marker...@pos]
+        result.join('')
+      else
+        ''
+      end
+    end
+    # Returns up to _length_ characters starting at the current position, but
+    # doesn't consume them. The number of characters returned may be less than
+    # _length_ if the end of the string is reached.
+    def peek(length = 1)
+      if result = @chars[@pos, length]
+        result.join('')
+      else
+        ''
+      end
+    end
+    # Moves the pointer back one character without changing the value of
+    # {#current}. The next call to {#consume} will re-consume the current
+    # character.
+    def reconsume
+      @pos -= 1 if @pos > 0
+    end
+    # Resets the pointer to the beginning of the string.
+    def reset
+      @current = nil
+      @len     = @string.length
+      @marker  = 0
+      @pos     = 0
+    end
+    # Tries to match _pattern_ at the current position. If it matches, the
+    # matched substring will be returned and the pointer will be advanced.
+    # Otherwise, `nil` will be returned.
+    def scan(pattern)
+      match = pattern.match(@string, @pos)
+      return nil if match.nil? || match.begin(0) != @pos
+      @pos     = match.end(0)
+      @current = @chars[@pos - 1]
+      match[0]
+    end
+    # Scans the string until the _pattern_ is matched. Returns the substring up
+    # to and including the end of the match, and advances the pointer. If there
+    # is no match, `nil` is returned and the pointer is not advanced.
+    def scan_until(pattern)
+      start = @pos
+      match = pattern.match(@string, @pos)
+      return nil if match.nil?
+      @pos     = match.end(0)
+      @current = @chars[@pos - 1]
+      @string[start...@pos]
+    end
+  end
+end

data/lib/crass/token-scanner.rb ADDED Viewed

@@ -0,0 +1,41 @@
+module Crass
+  # Like {Scanner}, but for tokens!
+  class TokenScanner
+    attr_reader :current, :pos, :tokens
+    def initialize(tokens)
+      @tokens = tokens.to_a
+      reset
+    end
+    # Executes the given block, collects all tokens that are consumed during its
+    # execution, and returns them.
+    def collect
+      start = @pos
+      yield
+      @tokens[start...@pos] || []
+    end
+    # Consumes the next token and returns it, advancing the pointer.
+    def consume
+      @current = @tokens[@pos]
+      @pos += 1 if @current
+      @current
+    end
+    # Reconsumes the current token, moving the pointer back one position.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#reconsume-the-current-input-token
+    def reconsume
+      @pos -= 1 if @pos > 0
+    end
+    # Resets the pointer to the first token in the list.
+    def reset
+      @current = nil
+      @pos     = 0
+    end
+  end
+end

data/lib/crass/tokenizer.rb ADDED Viewed

@@ -0,0 +1,668 @@
+# encoding: utf-8
+require_relative 'scanner'
+module Crass
+  # Tokenizes a CSS string.
+  #
+  # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#tokenization
+  class Tokenizer
+    RE_COMMENT_CLOSE   = /\*\//
+    RE_DIGIT           = /[0-9]+/
+    RE_ESCAPE          = /\\[^\n]/
+    RE_HEX             = /[0-9A-Fa-f]{1,6}/
+    RE_NAME            = /[0-9A-Za-z_\u0080-\u{10ffff}-]+/
+    RE_NAME_START      = /[A-Za-z_\u0080-\u{10ffff}]+/
+    RE_NON_PRINTABLE   = /[\u0000-\u0008\u000b\u000e-\u001f\u007f]+/
+    RE_NUMBER_DECIMAL  = /\.[0-9]+/
+    RE_NUMBER_EXPONENT = /[Ee][+-]?[0-9]+/
+    RE_NUMBER_SIGN     = /[+-]/
+    RE_NUMBER_STR = /\A
+      (?<sign> [+-]?)
+      (?<integer> [0-9]*)
+      (?:\.
+        (?<fractional> [0-9]*)
+      )?
+      (?:[Ee]
+        (?<exponent_sign> [+-]?)
+        (?<exponent> [0-9]*)
+      )?
+    \z/x
+    RE_UNICODE_RANGE_START = /\+(?:[0-9A-Fa-f]|\?)/
+    RE_UNICODE_RANGE_END   = /-[0-9A-Fa-f]/
+    RE_URL_QUOTE           = /["']/
+    RE_WHITESPACE          = /[\n\u0009\u0020]+/
+    # -- Class Methods ---------------------------------------------------------
+    # Tokenizes the given _input_ as a CSS string and returns an array of
+    # tokens.
+    #
+    # See {#initialize} for _options_.
+    def self.tokenize(input, options = {})
+      Tokenizer.new(input, options).tokenize
+    end
+    # -- Instance Methods ------------------------------------------------------
+    # Initializes a new Tokenizer.
+    #
+    # Options:
+    #
+    #   * **:preserve_comments** - If `true`, comments will be preserved as
+    #     `:comment` tokens.
+    #
+    #   * **:preserve_hacks** - If `true`, certain non-standard browser hacks
+    #     such as the IE "*" hack will be preserved even though they violate
+    #     CSS 3 syntax rules.
+    #
+    def initialize(input, options = {})
+      @s       = Scanner.new(preprocess(input))
+      @options = options
+    end
+    # Consumes a token and returns the token that was consumed.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#consume-a-token0
+    def consume
+      return token(:eof) if @s.eos?
+      @s.mark
+      return token(:whitespace) if @s.scan(RE_WHITESPACE)
+      case char = @s.consume
+      when '"'
+        consume_string('"')
+      when '#'
+        if @s.peek =~ RE_NAME || valid_escape?
+          value = consume_name
+          token(:hash,
+            :type  => start_identifier? ? :id : :unrestricted,
+            :value => value)
+        else
+          token(:delim, :value => char)
+        end
+      when '$'
+        if @s.peek == '='
+          @s.consume
+          token(:suffix_match)
+        else
+          token(:delim, :value => char)
+        end
+      when "'"
+        consume_string("'")
+      when '('
+        token(:'(')
+      when ')'
+        token(:')')
+      when '*'
+        if @s.peek == '='
+          @s.consume
+          token(:substring_match)
+        elsif @options[:preserve_hacks] && @s.peek =~ RE_NAME_START
+          # NON-STANDARD: IE * hack
+          @s.reconsume
+          consume_ident
+        else
+          token(:delim, :value => char)
+        end
+      when '+'
+        if start_number?
+          @s.reconsume
+          consume_numeric
+        else
+          token(:delim, :value => char)
+        end
+      when ','
+        token(:comma)
+      when '-'
+        if start_number?
+          @s.reconsume
+          consume_numeric
+        elsif start_identifier?
+          @s.reconsume
+          consume_ident
+        elsif @s.peek(2) == '->'
+          @s.consume
+          @s.consume
+          token(:cdc)
+        else
+          token(:delim, :value => char)
+        end
+      when '.'
+        if start_number?
+          @s.reconsume
+          consume_numeric
+        else
+          token(:delim, :value => char)
+        end
+      when '/'
+        if @s.peek == '*'
+          @s.consume
+          if text = @s.scan_until(RE_COMMENT_CLOSE)
+            text.slice!(-2, 2)
+          else
+            text = @s.rest
+          end
+          if @options[:preserve_comments]
+            token(:comment, :value => text)
+          else
+            consume
+          end
+        else
+          token(:delim, :value => char)
+        end
+      when ':'
+        token(:colon)
+      when ';'
+        token(:semicolon)
+      when '<'
+        if @s.peek(3) == '!--'
+          @s.consume
+          @s.consume
+          @s.consume
+          token(:cdo)
+        else
+          token(:delim, :value => char)
+        end
+      when '@'
+        if start_identifier?
+          token(:at_keyword, :value => consume_name)
+        else
+          token(:delim, :value => char)
+        end
+      when '['
+        token(:'[')
+      when '\\'
+        if valid_escape?(char + @s.peek)
+          @s.reconsume
+          consume_ident
+        else
+          token(:delim,
+            :error => true,
+            :value => char)
+        end
+      when ']'
+        token(:']')
+      when '^'
+        if @s.peek == '='
+          @s.consume
+          token(:prefix_match)
+        else
+          token(:delim, :value => char)
+        end
+      when '{'
+        token(:'{')
+      when '}'
+        token(:'}')
+      when RE_DIGIT
+        @s.reconsume
+        consume_numeric
+      when 'U', 'u'
+        if @s.peek(2) =~ RE_UNICODE_RANGE_START
+          @s.consume
+          consume_unicode_range
+        else
+          @s.reconsume
+          consume_ident
+        end
+      when RE_NAME_START
+        @s.reconsume
+        consume_ident
+      when '|'
+        case @s.peek
+        when '='
+          @s.consume
+          token(:dash_match)
+        when '|'
+          @s.consume
+          token(:column)
+        else
+          token(:delim, :value => char)
+        end
+      when '~'
+        if @s.peek == '='
+          @s.consume
+          token(:include_match)
+        else
+          token(:delim, :value => char)
+        end
+      else
+        token(:delim, :value => char)
+      end
+    end
+    # Consumes the remnants of a bad URL and returns the consumed text.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#consume-the-remnants-of-a-bad-url0
+    def consume_bad_url
+      text = ''
+      while true
+        return text if @s.eos?
+        if valid_escape?
+          text << consume_escaped
+        else
+          char = @s.consume
+          if char == ')'
+            return text
+          else
+            text << char
+          end
+        end
+      end
+    end
+    # Consumes an escaped code point and returns its unescaped value.
+    #
+    # This method assumes that the `\` has already been consumed, and that the
+    # next character in the input has already been verified not to be a newline
+    # or EOF.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#consume-an-escaped-code-point0
+    def consume_escaped
+      case
+      when @s.eos?
+        "\ufffd"
+      when hex_str = @s.scan(RE_HEX)
+        @s.consume if @s.peek =~ RE_WHITESPACE
+        codepoint = hex_str.hex
+        if codepoint == 0 ||
+            codepoint.between?(0xD800, 0xDFFF) ||
+            codepoint > 0x10FFFF
+          "\ufffd"
+        else
+          codepoint.chr(Encoding::UTF_8)
+        end
+      else
+        @s.consume
+      end
+    end
+    # Consumes an ident-like token and returns it.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#consume-an-ident-like-token0
+    def consume_ident
+      value = consume_name
+      if value.downcase == 'url' && @s.peek == '('
+        @s.consume
+        consume_url
+      elsif @s.peek == '('
+        @s.consume
+        token(:function, :value => value)
+      else
+        token(:ident, :value => value)
+      end
+    end
+    # Consumes a name and returns it.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#consume-a-name0
+    def consume_name
+      result = ''
+      while char = @s.peek
+        if char =~ RE_NAME
+          result << @s.consume
+        elsif char == '\\' && valid_escape?
+          result << @s.consume
+          result << consume_escaped
+        # NON-STANDARD: IE * hack
+        elsif @options[:preserve_hacks] && char == '*'
+          result << @s.consume
+        else
+          return result
+        end
+      end
+    end
+    # Consumes a number and returns a 3-element array containing the number's
+    # original representation, its numeric value, and its type (either
+    # `:integer` or `:number`).
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#consume-a-number0
+    def consume_number
+      repr = ''
+      type = :integer
+      repr << @s.consume if @s.peek =~ RE_NUMBER_SIGN
+      repr << (@s.scan(RE_DIGIT) || '')
+      if match = @s.scan(RE_NUMBER_DECIMAL)
+        repr << match
+        type = :number
+      end
+      if match = @s.scan(RE_NUMBER_EXPONENT)
+        repr << match
+        type = :number
+      end
+      [repr, convert_string_to_number(repr), type]
+    end
+    # Consumes a numeric token and returns it.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#consume-a-numeric-token0
+    def consume_numeric
+      number = consume_number
+      if start_identifier?
+        token(:dimension,
+          :repr  => number[0],
+          :type  => number[2],
+          :unit  => consume_name,
+          :value => number[1])
+      elsif @s.peek == '%'
+        @s.consume
+        token(:percentage,
+          :repr  => number[0],
+          :value => number[1])
+      else
+        token(:number,
+          :repr  => number[0],
+          :type  => number[2],
+          :value => number[1])
+      end
+    end
+    # Consumes a string token that ends at the given character, and returns the
+    # token. Assumes the opening quote has already been consumed.
+    #
+    # An unescaped newline produces a `:bad_string` token holding the text read
+    # so far. Reaching the end of the input ends the string as if it had been
+    # closed.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#consume-a-string-token0
+    def consume_string(ending)
+      value = ''
+      # The scanner returns an empty (truthy) string at the end of the input,
+      # so the loop has to test for the end explicitly.
+      until @s.eos?
+        case char = @s.consume
+        when ending then break
+        when "\n"
+          return token(:bad_string,
+            :error => true,
+            :value => value)
+        when '\\'
+          case @s.peek
+          when ''
+            # End of the input, so do nothing.
+            break
+          when "\n"
+            # An escaped newline is a line continuation and adds nothing.
+            @s.consume
+          else
+            value << consume_escaped
+          end
+        else
+          value << char
+        end
+      end
+      token(:string, :value => value)
+    end
+    # Consumes a Unicode range token and returns it. Assumes the initial "u+" or
+    # "U+" has already been consumed.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#consume-a-unicode-range-token0
+    def consume_unicode_range
+      value = @s.scan(RE_HEX)
+      while value.length < 6
+        break unless @s.peek == '?'
+        value << @s.consume
+      end
+      range = {}
+      if value.include?('?')
+        range[:start] = value.gsub('?', '0').hex
+        range[:end]   = value.gsub('?', 'F').hex
+        return token(:unicode_range, range)
+      end
+      range[:start] = value.hex
+      if @s.peek(2) =~ RE_UNICODE_RANGE_END
+        range[:value] << @s.consume << end_value = @s.scan(RE_HEX)
+        range[:end] = end_value.hex
+      else
+        range[:end] = range[:start]
+      end
+      token(:unicode_range, range)
+    end
+    # Consumes a URL token and returns it. Assumes the original "url(" has
+    # already been consumed.
+    #
+    # Returns a `:url` token for a well-formed quoted or unquoted URL, and a
+    # `:bad_url` token (after consuming the remnants up to the closing paren)
+    # when the contents are malformed.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#consume-a-url-token0
+    def consume_url
+      value = ''
+      @s.scan(RE_WHITESPACE)
+      return token(:url, :value => value) if @s.eos?
+      # Quoted URL.
+      if @s.peek =~ RE_URL_QUOTE
+        string = consume_string(@s.consume)
+        if string[:node] == :bad_string
+          return token(:bad_url, :value => string[:value] + consume_bad_url)
+        end
+        value = string[:value]
+        @s.scan(RE_WHITESPACE)
+        if @s.eos? || @s.peek == ')'
+          # Swallow the closing paren, but never consume past the end.
+          @s.consume unless @s.eos?
+          return token(:url, :value => value)
+        else
+          return token(:bad_url, :value => value + consume_bad_url)
+        end
+      end
+      # Unquoted URL.
+      until @s.eos?
+        case char = @s.consume
+        when ')' then break
+        when RE_WHITESPACE
+          # Whitespace is only allowed directly before the closing paren.
+          @s.scan(RE_WHITESPACE)
+          if @s.eos? || @s.peek == ')'
+            @s.consume unless @s.eos?
+            break
+          else
+            return token(:bad_url, :value => value + consume_bad_url)
+          end
+        when '"', "'", '(', RE_NON_PRINTABLE
+          return token(:bad_url,
+            :error => true,
+            :value => value + consume_bad_url)
+        when '\\'
+          # The backslash is already consumed, so pair it with the next
+          # character to test for a valid escape.
+          if valid_escape?(char + @s.peek)
+            value << consume_escaped
+          else
+            return token(:bad_url,
+              :error => true,
+              :value => value + consume_bad_url
+            )
+          end
+        else
+          value << char
+        end
+      end
+      token(:url, :value => value)
+    end
+    # Converts a valid CSS number string into a number and returns the number.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#convert-a-string-to-a-number0
+    def convert_string_to_number(str)
+      matches = RE_NUMBER_STR.match(str)
+      s = matches[:sign] == '-' ? -1 : 1
+      i = matches[:integer].to_i
+      f = matches[:fractional].to_i
+      d = matches[:fractional] ? matches[:fractional].length : 0
+      t = matches[:exponent_sign] == '-' ? -1 : 1
+      e = matches[:exponent].to_i
+      # I know this looks nutty, but it's exactly what's defined in the spec,
+      # and it works.
+      s * (i + f * 10**-d) * 10**(t * e)
+    end
+    # Preprocesses _input_ to prepare it for the tokenizer.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#input-preprocessing
+    def preprocess(input)
+      input = input.to_s.encode('UTF-8',
+        :invalid => :replace,
+        :undef   => :replace)
+      input.gsub!(/(?:\r\n|[\r\f])/, "\n")
+      input.gsub!("\u0000", "\ufffd")
+      input
+    end
+    # Returns `true` if the given three-character _text_ would start an
+    # identifier. If _text_ is `nil`, the next three characters in the input
+    # stream will be checked, but will not be consumed.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#check-if-three-code-points-would-start-an-identifier
+    def start_identifier?(text = nil)
+      text = @s.peek(3) if text.nil?
+      case text[0]
+      when '-'
+        !!(text[1] =~ RE_NAME_START || valid_escape?(text[1, 2]))
+      when RE_NAME_START
+        true
+      when '\\'
+        valid_escape?(text[0, 2])
+      else
+        false
+      end
+    end
+    # Returns `true` if the given three-character _text_ would start a number.
+    # If _text_ is `nil`, the next three characters in the input stream will be
+    # checked, but will not be consumed.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#check-if-three-code-points-would-start-a-number
+    def start_number?(text = nil)
+      text = @s.peek(3) if text.nil?
+      case text[0]
+      when '+', '-'
+        !!(text[1] =~ RE_DIGIT || (text[1] == '.' && text[2] =~ RE_DIGIT))
+      when '.'
+        !!(text[1] =~ RE_DIGIT)
+      when RE_DIGIT
+        true
+      else
+        false
+      end
+    end
+    # Creates and returns a new token with the given _properties_.
+    def token(type, properties = {})
+      {
+        :node => type,
+        :pos  => @s.marker,
+        :raw  => @s.marked
+      }.merge!(properties)
+    end
+    # Tokenizes the input stream and returns an array of tokens.
+    def tokenize
+      @s.reset
+      tokens = []
+      token  = consume
+      while token && token[:node] != :eof
+        tokens << token
+        token = consume
+      end
+      tokens
+    end
+    # Returns `true` if the given two-character _text_ is the beginning of a
+    # valid escape sequence. If _text_ is `nil`, the next two characters in the
+    # input stream will be checked, but will not be consumed.
+    #
+    # http://www.w3.org/TR/2013/WD-css-syntax-3-20130919/#check-if-two-code-points-are-a-valid-escape
+    def valid_escape?(text = nil)
+      text = @s.peek(2) if text.nil?
+      !!(text[0] == '\\' && text[1] != "\n")
+    end
+  end
+end