rkelly-remix 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  require 'rkelly/lexeme'
2
3
  require 'rkelly/char_range'
3
4
  require 'strscan'
@@ -69,7 +70,7 @@ module RKelly
69
70
  }
70
71
 
71
72
  # Determine the method to use to measure String length in bytes,
72
- # because StringScanner#pos can obly be set in bytes.
73
+ # because StringScanner#pos can only be set in bytes.
73
74
  #
74
75
  # - In Ruby 1.8 String#length always returns the string length
75
76
  # in bytes.
@@ -79,12 +80,50 @@ module RKelly
79
80
  #
80
81
  BYTESIZE_METHOD = "".respond_to?(:bytesize) ? :bytesize : :length
81
82
 
83
+ # JavaScript whitespace can consist of any Unicode space separator
84
+ # characters.
85
+ #
86
+ # - In Ruby 1.9+ we can just use the [[:space:]] character class
87
+ # and match them all.
88
+ #
89
+ # - In Ruby 1.8 we need a regex that identifies the specific bytes
90
+ # in UTF-8 text.
91
+ #
92
+ WHITESPACE_REGEX = "".respond_to?(:encoding) ? /[[:space:]]+/m : %r{
93
+ (
94
+ \xC2\xA0 | # no-break space
95
+ \xE1\x9A\x80 | # ogham space mark
96
+ \xE2\x80\x80 | # en quad
97
+ \xE2\x80\x81 | # em quad
98
+ \xE2\x80\x82 | # en space
99
+ \xE2\x80\x83 | # em space
100
+ \xE2\x80\x84 | # three-per-em space
101
+ \xE2\x80\x85 | # four-per-em space
102
+ \xE2\x80\x86 | # six-per-em space
103
+ \xE2\x80\x87 | # figure space
104
+ \xE2\x80\x88 | # punctuation space
105
+ \xE2\x80\x89 | # thin space
106
+ \xE2\x80\x8A | # hair space
107
+ \xE2\x80\xA8 | # line separator
108
+ \xE2\x80\xA9 | # paragraph separator
109
+ \xE2\x80\xAF | # narrow no-break space
110
+ \xE2\x81\x9F | # medium mathematical space
111
+ \xE3\x80\x80 # ideographic space
112
+ )+
113
+ }mx
114
+
82
115
  def initialize(&block)
83
116
  @lexemes = Hash.new {|hash, key| hash[key] = [] }
84
117
 
85
118
  token(:COMMENT, /\/(?:\*(?:.)*?\*\/|\/[^\n]*)/m, ['/'])
86
119
  token(:STRING, /"(?:[^"\\]*(?:\\.[^"\\]*)*)"|'(?:[^'\\]*(?:\\.[^'\\]*)*)'/m, ["'", '"'])
87
- token(:S, /\s*/m, [" ", "\t", "\r", "\n", "\f"])
120
+
121
+ # Matcher for basic ASCII whitespace.
122
+ # (Unicode whitespace is handled separately in #match_lexeme)
123
+ #
124
+ # Can't use just "\s" in regex, because in Ruby 1.8 this
125
+ # doesn't include the vertical tab "\v" character
126
+ token(:S, /[ \t\r\n\f\v]*/m, [" ", "\t", "\r", "\n", "\f", "\v"])
88
127
 
89
128
  # A regexp to match floating point literals (but not integer literals).
90
129
  digits = ('0'..'9').to_a
@@ -180,6 +219,13 @@ module RKelly
180
219
  token = lexeme.match(scanner)
181
220
  return token if token
182
221
  end
222
+
223
+ # When some other character is encountered, try to match it as
224
+ # whitespace, as in JavaScript whitespace can contain any
225
+ # Unicode whitespace character.
226
+ if str = scanner.check(WHITESPACE_REGEX)
227
+ return Token.new(:S, str)
228
+ end
183
229
  end
184
230
 
185
231
  # Registers a lexeme and maps it to all the characters it can
@@ -6,6 +6,43 @@ class TokenizerTest < Test::Unit::TestCase
6
6
  @tokenizer = RKelly::Tokenizer.new
7
7
  end
8
8
 
9
+ {
10
+ :space => " ",
11
+ :tab => "\t",
12
+ :form_feed => "\f",
13
+ :vertical_tab => "\v",
14
+ :no_break_space => [0x00A0].pack("U"),
15
+ :ogham_space_mark => [0x1680].pack("U"),
16
+ :en_quad => [0x2000].pack("U"),
17
+ :em_quad => [0x2001].pack("U"),
18
+ :en_space => [0x2002].pack("U"),
19
+ :em_space => [0x2003].pack("U"),
20
+ :three_per_em_space => [0x2004].pack("U"),
21
+ :four_per_em_space => [0x2005].pack("U"),
22
+ :six_per_em_space => [0x2006].pack("U"),
23
+ :figure_space => [0x2007].pack("U"),
24
+ :punctuation_space => [0x2008].pack("U"),
25
+ :thin_space => [0x2009].pack("U"),
26
+ :hair_space => [0x200a].pack("U"),
27
+ :narrow_no_break_space => [0x202f].pack("U"),
28
+ :medium_mathematical_space => [0x205f].pack("U"),
29
+ :ideographic_space => [0x3000].pack("U"),
30
+
31
+ # Line terminators
32
+ :newline => "\n",
33
+ :carriage_return => "\r",
34
+ :line_separator => [0x2028].pack("U"),
35
+ :paragraph_separator => [0x2029].pack("U"),
36
+ }.each do |name, char|
37
+ define_method(:"test_whitespace_#{name}") do
38
+ assert_equal([[:S, char]], @tokenizer.tokenize(char))
39
+ end
40
+ end
41
+
42
+ def assert_tokens(expected, actual)
43
+ assert_equal(expected, actual.select { |x| x[0] != :S })
44
+ end
45
+
9
46
  def test_comments
10
47
  tokens = @tokenizer.tokenize("/** Fooo */")
11
48
  assert_tokens([[:COMMENT, '/** Fooo */']], tokens)
@@ -183,10 +220,6 @@ class TokenizerTest < Test::Unit::TestCase
183
220
  ], tokens)
184
221
  end
185
222
 
186
- def assert_tokens(expected, actual)
187
- assert_equal(expected, actual.select { |x| x[0] != :S })
188
- end
189
-
190
223
  %w{
191
224
  break case catch continue default delete do else finally for function
192
225
  if in instanceof new return switch this throw try typeof var void while
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rkelly-remix
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Patterson
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-10-22 00:00:00.000000000 Z
12
+ date: 2013-12-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rdoc