RubyGems - rkelly-remix - Versions diffs - 0.0.5 → 0.0.6 - Mend

rkelly-remix 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +4 -4
data/lib/rkelly/constants.rb +1 -1
data/lib/rkelly/generated_parser.rb +214 -214
data/lib/rkelly/tokenizer.rb +48 -2
data/test/test_tokenizer.rb +37 -4
metadata +2 -2

data/lib/rkelly/tokenizer.rb CHANGED

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 require 'rkelly/lexeme'
 require 'rkelly/char_range'
 require 'strscan'
@@ -69,7 +70,7 @@ module RKelly
     }
     # Determine the method to use to measure String length in bytes,
-    # because StringScanner#pos can obly be set in bytes.
+    # because StringScanner#pos can only be set in bytes.
     #
     # - In Ruby 1.8 String#length always returns the string length
     #   in bytes.
@@ -79,12 +80,50 @@ module RKelly
     #
     BYTESIZE_METHOD = "".respond_to?(:bytesize) ? :bytesize : :length
+    # JavaScript whitespace can consist of any Unicode space separator
+    # characters.
+    #
+    # - In Ruby 1.9+ we can just use the [[:space:]] character class
+    #   and match them all.
+    #
+    # - In Ruby 1.8 we need a regex that identifies the specific bytes
+    #   in UTF-8 text.
+    #
+    WHITESPACE_REGEX = "".respond_to?(:encoding) ? /[[:space:]]+/m : %r{
+      (
+        \xC2\xA0     |   # no-break space
+        \xE1\x9A\x80 |   # ogham space mark
+        \xE2\x80\x80 |   # en quad
+        \xE2\x80\x81 |   # em quad
+        \xE2\x80\x82 |   # en space
+        \xE2\x80\x83 |   # em space
+        \xE2\x80\x84 |   # three-per-em space
+        \xE2\x80\x85 |   # four-per-em space
+        \xE2\x80\x86 |   # six-per-em space
+        \xE2\x80\x87 |   # figure space
+        \xE2\x80\x88 |   # punctuation space
+        \xE2\x80\x89 |   # thin space
+        \xE2\x80\x8A |   # hair space
+        \xE2\x80\xA8 |   # line separator
+        \xE2\x80\xA9 |   # paragraph separator
+        \xE2\x80\xAF |   # narrow no-break space
+        \xE2\x81\x9F |   # medium mathematical space
+        \xE3\x80\x80     # ideographic space
+      )+
+    }mx
     def initialize(&block)
       @lexemes = Hash.new {|hash, key| hash[key] = [] }
       token(:COMMENT, /\/(?:\*(?:.)*?\*\/|\/[^\n]*)/m, ['/'])
       token(:STRING, /"(?:[^"\\]*(?:\\.[^"\\]*)*)"|'(?:[^'\\]*(?:\\.[^'\\]*)*)'/m, ["'", '"'])
-      token(:S, /\s*/m, [" ", "\t", "\r", "\n", "\f"])
+      # Matcher for basic ASCII whitespace.
+      # (Unicode whitespace is handled separately in #match_lexeme)
+      #
+      # Can't use just "\s" in regex, because in Ruby 1.8 this
+      # doesn't include the vertical tab "\v" character
+      token(:S, /[ \t\r\n\f\v]*/m, [" ", "\t", "\r", "\n", "\f", "\v"])
       # A regexp to match floating point literals (but not integer literals).
       digits = ('0'..'9').to_a
@@ -180,6 +219,13 @@ module RKelly
         token = lexeme.match(scanner)
         return token if token
       end
+      # When some other character is encountered, try to match it as
+      # whitespace, as in JavaScript whitespace can contain any
+      # Unicode whitespace character.
+      if str = scanner.check(WHITESPACE_REGEX)
+        return Token.new(:S, str)
+      end
     end
     # Registers a lexeme and maps it to all the characters it can

data/test/test_tokenizer.rb CHANGED

@@ -6,6 +6,43 @@ class TokenizerTest < Test::Unit::TestCase
     @tokenizer = RKelly::Tokenizer.new
   end
+  {
+    :space  => " ",
+    :tab => "\t",
+    :form_feed  => "\f",
+    :vertical_tab  => "\v",
+    :no_break_space  => [0x00A0].pack("U"),
+    :ogham_space_mark => [0x1680].pack("U"),
+    :en_quad => [0x2000].pack("U"),
+    :em_quad => [0x2001].pack("U"),
+    :en_space => [0x2002].pack("U"),
+    :em_space => [0x2003].pack("U"),
+    :three_per_em_space => [0x2004].pack("U"),
+    :four_per_em_space => [0x2005].pack("U"),
+    :six_per_em_space => [0x2006].pack("U"),
+    :figure_space => [0x2007].pack("U"),
+    :punctuation_space => [0x2008].pack("U"),
+    :thin_space => [0x2009].pack("U"),
+    :hair_space => [0x200a].pack("U"),
+    :narrow_no_break_space => [0x202f].pack("U"),
+    :medium_mathematical_space => [0x205f].pack("U"),
+    :ideographic_space => [0x3000].pack("U"),
+    # Line terminators
+    :newline  => "\n",
+    :carriage_return  => "\r",
+    :line_separator => [0x2028].pack("U"),
+    :paragraph_separator => [0x2029].pack("U"),
+  }.each do |name, char|
+    define_method(:"test_whitespace_#{name}") do
+      assert_equal([[:S, char]], @tokenizer.tokenize(char))
+    end
+  end
+  def assert_tokens(expected, actual)
+    assert_equal(expected, actual.select { |x| x[0] != :S })
+  end
   def test_comments
     tokens = @tokenizer.tokenize("/** Fooo */")
     assert_tokens([[:COMMENT, '/** Fooo */']], tokens)
@@ -183,10 +220,6 @@ class TokenizerTest < Test::Unit::TestCase
     ], tokens)
   end
-  def assert_tokens(expected, actual)
-    assert_equal(expected, actual.select { |x| x[0] != :S })
-  end
   %w{
     break case catch continue default delete do else finally for function
     if in instanceof new return switch this throw try typeof var void while

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rkelly-remix
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - Aaron Patterson
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-10-22 00:00:00.000000000 Z
+date: 2013-12-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rdoc