rkelly-remix 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rkelly/constants.rb +1 -1
- data/lib/rkelly/generated_parser.rb +214 -214
- data/lib/rkelly/tokenizer.rb +48 -2
- data/test/test_tokenizer.rb +37 -4
- metadata +2 -2
data/lib/rkelly/tokenizer.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require 'rkelly/lexeme'
|
2
3
|
require 'rkelly/char_range'
|
3
4
|
require 'strscan'
|
@@ -69,7 +70,7 @@ module RKelly
|
|
69
70
|
}
|
70
71
|
|
71
72
|
# Determine the method to use to measure String length in bytes,
|
72
|
-
# because StringScanner#pos can
|
73
|
+
# because StringScanner#pos can only be set in bytes.
|
73
74
|
#
|
74
75
|
# - In Ruby 1.8 String#length always returns the string length
|
75
76
|
# in bytes.
|
@@ -79,12 +80,50 @@ module RKelly
|
|
79
80
|
#
|
80
81
|
BYTESIZE_METHOD = "".respond_to?(:bytesize) ? :bytesize : :length
|
81
82
|
|
83
|
+
# JavaScript whitespace can consist of any Unicode space separator
|
84
|
+
# characters.
|
85
|
+
#
|
86
|
+
# - In Ruby 1.9+ we can just use the [[:space:]] character class
|
87
|
+
# and match them all.
|
88
|
+
#
|
89
|
+
# - In Ruby 1.8 we need a regex that identifies the specific bytes
|
90
|
+
# in UTF-8 text.
|
91
|
+
#
|
92
|
+
WHITESPACE_REGEX = "".respond_to?(:encoding) ? /[[:space:]]+/m : %r{
|
93
|
+
(
|
94
|
+
\xC2\xA0 | # no-break space
|
95
|
+
\xE1\x9A\x80 | # ogham space mark
|
96
|
+
\xE2\x80\x80 | # en quad
|
97
|
+
\xE2\x80\x81 | # em quad
|
98
|
+
\xE2\x80\x82 | # en space
|
99
|
+
\xE2\x80\x83 | # em space
|
100
|
+
\xE2\x80\x84 | # three-per-em space
|
101
|
+
\xE2\x80\x85 | # four-per-em space
|
102
|
+
\xE2\x80\x86 | # six-per-em space
|
103
|
+
\xE2\x80\x87 | # figure space
|
104
|
+
\xE2\x80\x88 | # punctuation space
|
105
|
+
\xE2\x80\x89 | # thin space
|
106
|
+
\xE2\x80\x8A | # hair space
|
107
|
+
\xE2\x80\xA8 | # line separator
|
108
|
+
\xE2\x80\xA9 | # paragraph separator
|
109
|
+
\xE2\x80\xAF | # narrow no-break space
|
110
|
+
\xE2\x81\x9F | # medium mathematical space
|
111
|
+
\xE3\x80\x80 # ideographic space
|
112
|
+
)+
|
113
|
+
}mx
|
114
|
+
|
82
115
|
def initialize(&block)
|
83
116
|
@lexemes = Hash.new {|hash, key| hash[key] = [] }
|
84
117
|
|
85
118
|
token(:COMMENT, /\/(?:\*(?:.)*?\*\/|\/[^\n]*)/m, ['/'])
|
86
119
|
token(:STRING, /"(?:[^"\\]*(?:\\.[^"\\]*)*)"|'(?:[^'\\]*(?:\\.[^'\\]*)*)'/m, ["'", '"'])
|
87
|
-
|
120
|
+
|
121
|
+
# Matcher for basic ASCII whitespace.
|
122
|
+
# (Unicode whitespace is handled separately in #match_lexeme)
|
123
|
+
#
|
124
|
+
# Can't use just "\s" in regex, because in Ruby 1.8 this
|
125
|
+
# doesn't include the vertical tab "\v" character
|
126
|
+
token(:S, /[ \t\r\n\f\v]*/m, [" ", "\t", "\r", "\n", "\f", "\v"])
|
88
127
|
|
89
128
|
# A regexp to match floating point literals (but not integer literals).
|
90
129
|
digits = ('0'..'9').to_a
|
@@ -180,6 +219,13 @@ module RKelly
|
|
180
219
|
token = lexeme.match(scanner)
|
181
220
|
return token if token
|
182
221
|
end
|
222
|
+
|
223
|
+
# When some other character is encountered, try to match it as
|
224
|
+
# whitespace, as in JavaScript whitespace can contain any
|
225
|
+
# Unicode whitespace character.
|
226
|
+
if str = scanner.check(WHITESPACE_REGEX)
|
227
|
+
return Token.new(:S, str)
|
228
|
+
end
|
183
229
|
end
|
184
230
|
|
185
231
|
# Registers a lexeme and maps it to all the characters it can
|
data/test/test_tokenizer.rb
CHANGED
@@ -6,6 +6,43 @@ class TokenizerTest < Test::Unit::TestCase
|
|
6
6
|
@tokenizer = RKelly::Tokenizer.new
|
7
7
|
end
|
8
8
|
|
9
|
+
{
|
10
|
+
:space => " ",
|
11
|
+
:tab => "\t",
|
12
|
+
:form_feed => "\f",
|
13
|
+
:vertical_tab => "\v",
|
14
|
+
:no_break_space => [0x00A0].pack("U"),
|
15
|
+
:ogham_space_mark => [0x1680].pack("U"),
|
16
|
+
:en_quad => [0x2000].pack("U"),
|
17
|
+
:em_quad => [0x2001].pack("U"),
|
18
|
+
:en_space => [0x2002].pack("U"),
|
19
|
+
:em_space => [0x2003].pack("U"),
|
20
|
+
:three_per_em_space => [0x2004].pack("U"),
|
21
|
+
:four_per_em_space => [0x2005].pack("U"),
|
22
|
+
:six_per_em_space => [0x2006].pack("U"),
|
23
|
+
:figure_space => [0x2007].pack("U"),
|
24
|
+
:punctuation_space => [0x2008].pack("U"),
|
25
|
+
:thin_space => [0x2009].pack("U"),
|
26
|
+
:hair_space => [0x200a].pack("U"),
|
27
|
+
:narrow_no_break_space => [0x202f].pack("U"),
|
28
|
+
:medium_mathematical_space => [0x205f].pack("U"),
|
29
|
+
:ideographic_space => [0x3000].pack("U"),
|
30
|
+
|
31
|
+
# Line terminators
|
32
|
+
:newline => "\n",
|
33
|
+
:carriage_return => "\r",
|
34
|
+
:line_separator => [0x2028].pack("U"),
|
35
|
+
:paragraph_separator => [0x2029].pack("U"),
|
36
|
+
}.each do |name, char|
|
37
|
+
define_method(:"test_whitespace_#{name}") do
|
38
|
+
assert_equal([[:S, char]], @tokenizer.tokenize(char))
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def assert_tokens(expected, actual)
|
43
|
+
assert_equal(expected, actual.select { |x| x[0] != :S })
|
44
|
+
end
|
45
|
+
|
9
46
|
def test_comments
|
10
47
|
tokens = @tokenizer.tokenize("/** Fooo */")
|
11
48
|
assert_tokens([[:COMMENT, '/** Fooo */']], tokens)
|
@@ -183,10 +220,6 @@ class TokenizerTest < Test::Unit::TestCase
|
|
183
220
|
], tokens)
|
184
221
|
end
|
185
222
|
|
186
|
-
def assert_tokens(expected, actual)
|
187
|
-
assert_equal(expected, actual.select { |x| x[0] != :S })
|
188
|
-
end
|
189
|
-
|
190
223
|
%w{
|
191
224
|
break case catch continue default delete do else finally for function
|
192
225
|
if in instanceof new return switch this throw try typeof var void while
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rkelly-remix
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aaron Patterson
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-10
|
12
|
+
date: 2013-12-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rdoc
|