regexp_parser 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +57 -0
- data/Gemfile +8 -0
- data/LICENSE +1 -1
- data/README.md +225 -206
- data/Rakefile +9 -3
- data/lib/regexp_parser.rb +7 -11
- data/lib/regexp_parser/expression.rb +72 -14
- data/lib/regexp_parser/expression/classes/alternation.rb +3 -16
- data/lib/regexp_parser/expression/classes/conditional.rb +57 -0
- data/lib/regexp_parser/expression/classes/free_space.rb +17 -0
- data/lib/regexp_parser/expression/classes/keep.rb +7 -0
- data/lib/regexp_parser/expression/classes/set.rb +28 -7
- data/lib/regexp_parser/expression/methods/strfregexp.rb +113 -0
- data/lib/regexp_parser/expression/methods/tests.rb +116 -0
- data/lib/regexp_parser/expression/methods/traverse.rb +63 -0
- data/lib/regexp_parser/expression/quantifier.rb +10 -0
- data/lib/regexp_parser/expression/sequence.rb +45 -0
- data/lib/regexp_parser/expression/subexpression.rb +29 -1
- data/lib/regexp_parser/lexer.rb +31 -8
- data/lib/regexp_parser/parser.rb +118 -45
- data/lib/regexp_parser/scanner.rb +1745 -1404
- data/lib/regexp_parser/scanner/property.rl +57 -3
- data/lib/regexp_parser/scanner/scanner.rl +161 -34
- data/lib/regexp_parser/syntax.rb +12 -2
- data/lib/regexp_parser/syntax/ruby/1.9.1.rb +3 -3
- data/lib/regexp_parser/syntax/ruby/1.9.3.rb +2 -7
- data/lib/regexp_parser/syntax/ruby/2.0.0.rb +4 -1
- data/lib/regexp_parser/syntax/ruby/2.1.4.rb +13 -0
- data/lib/regexp_parser/syntax/ruby/2.1.5.rb +13 -0
- data/lib/regexp_parser/syntax/ruby/2.1.rb +2 -2
- data/lib/regexp_parser/syntax/ruby/2.2.0.rb +16 -0
- data/lib/regexp_parser/syntax/ruby/2.2.rb +8 -0
- data/lib/regexp_parser/syntax/tokens.rb +19 -2
- data/lib/regexp_parser/syntax/tokens/conditional.rb +22 -0
- data/lib/regexp_parser/syntax/tokens/keep.rb +14 -0
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +45 -4
- data/lib/regexp_parser/token.rb +23 -8
- data/lib/regexp_parser/version.rb +5 -0
- data/regexp_parser.gemspec +35 -0
- data/test/expression/test_all.rb +6 -1
- data/test/expression/test_base.rb +19 -0
- data/test/expression/test_conditionals.rb +114 -0
- data/test/expression/test_free_space.rb +33 -0
- data/test/expression/test_set.rb +61 -0
- data/test/expression/test_strfregexp.rb +214 -0
- data/test/expression/test_subexpression.rb +24 -0
- data/test/expression/test_tests.rb +99 -0
- data/test/expression/test_to_h.rb +48 -0
- data/test/expression/test_to_s.rb +46 -0
- data/test/expression/test_traverse.rb +164 -0
- data/test/lexer/test_all.rb +16 -3
- data/test/lexer/test_conditionals.rb +101 -0
- data/test/lexer/test_keep.rb +24 -0
- data/test/lexer/test_literals.rb +51 -51
- data/test/lexer/test_nesting.rb +62 -62
- data/test/lexer/test_refcalls.rb +18 -20
- data/test/parser/test_all.rb +18 -3
- data/test/parser/test_alternation.rb +11 -14
- data/test/parser/test_conditionals.rb +148 -0
- data/test/parser/test_escapes.rb +29 -5
- data/test/parser/test_free_space.rb +139 -0
- data/test/parser/test_groups.rb +40 -0
- data/test/parser/test_keep.rb +21 -0
- data/test/scanner/test_all.rb +8 -2
- data/test/scanner/test_conditionals.rb +166 -0
- data/test/scanner/test_escapes.rb +8 -5
- data/test/scanner/test_free_space.rb +133 -0
- data/test/scanner/test_groups.rb +28 -0
- data/test/scanner/test_keep.rb +33 -0
- data/test/scanner/test_properties.rb +4 -0
- data/test/scanner/test_scripts.rb +71 -1
- data/test/syntax/ruby/test_1.9.3.rb +2 -2
- data/test/syntax/ruby/test_2.0.0.rb +38 -0
- data/test/syntax/ruby/test_2.2.0.rb +38 -0
- data/test/syntax/ruby/test_all.rb +1 -8
- data/test/syntax/ruby/test_files.rb +104 -0
- data/test/test_all.rb +2 -1
- data/test/token/test_all.rb +2 -0
- data/test/token/test_token.rb +109 -0
- metadata +75 -21
- data/VERSION.yml +0 -5
- data/lib/regexp_parser/ctype.rb +0 -48
- data/test/syntax/ruby/test_2.x.rb +0 -46
@@ -48,4 +48,50 @@ class ExpressionToS < Test::Unit::TestCase
|
|
48
48
|
assert_equal( pattern, RP.parse(pattern).to_s )
|
49
49
|
end
|
50
50
|
|
51
|
+
def test_expression_to_s_multiline_source
|
52
|
+
multiline = %r{
|
53
|
+
\A
|
54
|
+
a? # One letter
|
55
|
+
b{2,5} # Another one
|
56
|
+
[c-g]+ # A set
|
57
|
+
\z
|
58
|
+
}x
|
59
|
+
|
60
|
+
assert_equal( multiline.source, RP.parse(multiline).to_s )
|
61
|
+
end
|
62
|
+
|
63
|
+
def test_expression_to_s_multiline_to_s
|
64
|
+
multiline = %r{
|
65
|
+
\A
|
66
|
+
a? # One letter
|
67
|
+
b{2,5} # Another one
|
68
|
+
[c-g]+ # A set
|
69
|
+
\z
|
70
|
+
}x
|
71
|
+
|
72
|
+
assert_equal( multiline.to_s, RP.parse(multiline.to_s).to_s )
|
73
|
+
end
|
74
|
+
|
75
|
+
# Free spacing expressions that use spaces between quantifiers and their
|
76
|
+
# targets do not produce identical results due to the way quantifiers are
|
77
|
+
# applied to expressions (members, not nodes) and the merging of consecutive
|
78
|
+
# space nodes. This tests that they produce equivalent results.
|
79
|
+
def test_expression_to_s_multiline_equivalence
|
80
|
+
multiline = %r{
|
81
|
+
\A
|
82
|
+
a ? # One letter
|
83
|
+
b {2,5} # Another one
|
84
|
+
[c-g] + # A set
|
85
|
+
\z
|
86
|
+
}x
|
87
|
+
|
88
|
+
str = 'bbbcged'
|
89
|
+
root = RP.parse(multiline)
|
90
|
+
|
91
|
+
assert_equal(
|
92
|
+
multiline.match(str)[0],
|
93
|
+
Regexp.new(root.to_s, Regexp::EXTENDED).match(str)[0]
|
94
|
+
)
|
95
|
+
end
|
96
|
+
|
51
97
|
end
|
@@ -0,0 +1,164 @@
|
|
1
|
+
require File.expand_path("../../helpers", __FILE__)
|
2
|
+
|
3
|
+
class SubexpressionTraverse < Test::Unit::TestCase
|
4
|
+
|
5
|
+
def test_subexpression_traverse
|
6
|
+
root = RP.parse(/a(b(c(d)))|g[hi]j|klmn/)
|
7
|
+
|
8
|
+
enters = 0
|
9
|
+
visits = 0
|
10
|
+
exits = 0
|
11
|
+
|
12
|
+
root.traverse {|event, exp, index|
|
13
|
+
enters += 1 if event == :enter
|
14
|
+
visits += 1 if event == :visit
|
15
|
+
exits += 1 if event == :exit
|
16
|
+
}
|
17
|
+
|
18
|
+
assert_equal( 7, enters )
|
19
|
+
assert_equal( exits, enters )
|
20
|
+
|
21
|
+
assert_equal( 8, visits )
|
22
|
+
end
|
23
|
+
|
24
|
+
def test_subexpression_traverse_include_self
|
25
|
+
root = RP.parse(/a(b(c(d)))|g[hi]j|klmn/)
|
26
|
+
|
27
|
+
enters = 0
|
28
|
+
visits = 0
|
29
|
+
exits = 0
|
30
|
+
|
31
|
+
root.traverse(true) {|event, exp, index|
|
32
|
+
enters += 1 if event == :enter
|
33
|
+
visits += 1 if event == :visit
|
34
|
+
exits += 1 if event == :exit
|
35
|
+
}
|
36
|
+
|
37
|
+
assert_equal( 8, enters )
|
38
|
+
assert_equal( exits, enters )
|
39
|
+
|
40
|
+
assert_equal( 8, visits )
|
41
|
+
end
|
42
|
+
|
43
|
+
def test_subexpression_walk_alias
|
44
|
+
root = RP.parse(/abc/)
|
45
|
+
|
46
|
+
assert_equal( true, root.respond_to?(:walk) )
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_subexpression_each_expression
|
50
|
+
root = RP.parse(/a(?x:b(c))|g[h-k]/)
|
51
|
+
|
52
|
+
count = 0
|
53
|
+
root.each_expression {|exp, index|
|
54
|
+
count += 1
|
55
|
+
}
|
56
|
+
|
57
|
+
assert_equal( 10, count )
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_subexpression_each_expression_include_self
|
61
|
+
root = RP.parse(/a(?x:b(c))|g[hi]/)
|
62
|
+
|
63
|
+
count = 0
|
64
|
+
root.each_expression(true) {|exp, index|
|
65
|
+
count += 1
|
66
|
+
}
|
67
|
+
|
68
|
+
assert_equal( 11, count )
|
69
|
+
end
|
70
|
+
|
71
|
+
def test_subexpression_each_expression_indices
|
72
|
+
root = RP.parse(/a(b)c/)
|
73
|
+
|
74
|
+
indices = []
|
75
|
+
root.each_expression {|exp, index| indices << index}
|
76
|
+
|
77
|
+
assert_equal( [0, 1, 0, 2], indices )
|
78
|
+
end
|
79
|
+
|
80
|
+
def test_subexpression_each_expression_indices_include_self
|
81
|
+
root = RP.parse(/a(b)c/)
|
82
|
+
|
83
|
+
indices = []
|
84
|
+
root.each_expression(true) {|exp, index| indices << index}
|
85
|
+
|
86
|
+
assert_equal( [0, 0, 1, 0, 2], indices )
|
87
|
+
end
|
88
|
+
|
89
|
+
def test_subexpression_map_without_block
|
90
|
+
root = RP.parse(/a(b([c-e]+))?/)
|
91
|
+
|
92
|
+
array = root.map
|
93
|
+
|
94
|
+
assert_equal( Array, array.class )
|
95
|
+
assert_equal( 5, array.length )
|
96
|
+
|
97
|
+
array.each do |item|
|
98
|
+
assert_equal( Array, item.class )
|
99
|
+
assert_equal( 2, item.length )
|
100
|
+
assert_equal( true, item.first.is_a?(Regexp::Expression::Base) )
|
101
|
+
assert_equal( true, item.last.is_a?(Fixnum) )
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
def test_subexpression_map_without_block_include_self
|
106
|
+
root = RP.parse(/a(b([c-e]+))?/)
|
107
|
+
|
108
|
+
array = root.map(true)
|
109
|
+
|
110
|
+
assert_equal( Array, array.class )
|
111
|
+
assert_equal( 6, array.length )
|
112
|
+
end
|
113
|
+
|
114
|
+
def test_subexpression_map_indices
|
115
|
+
root = RP.parse(/a(b([c-e]+))?f*g/)
|
116
|
+
|
117
|
+
indices = root.map {|exp, index| index}
|
118
|
+
|
119
|
+
assert_equal( [0, 1, 0, 1, 0, 2, 3], indices )
|
120
|
+
end
|
121
|
+
|
122
|
+
def test_subexpression_map_indices_include_self
|
123
|
+
root = RP.parse(/a(b([c-e]+))?f*g/)
|
124
|
+
|
125
|
+
indices = root.map(true) {|exp, index| index}
|
126
|
+
|
127
|
+
assert_equal( [0, 0, 1, 0, 1, 0, 2, 3], indices )
|
128
|
+
end
|
129
|
+
|
130
|
+
def test_subexpression_map_expressions
|
131
|
+
root = RP.parse(/a(b(c(d)))/)
|
132
|
+
|
133
|
+
levels = root.map {|exp, index|
|
134
|
+
[exp.level, exp.text] if exp.terminal?
|
135
|
+
}.compact
|
136
|
+
|
137
|
+
assert_equal(
|
138
|
+
[[0, 'a'], [1, 'b'], [2, 'c'], [3, 'd']],
|
139
|
+
levels
|
140
|
+
)
|
141
|
+
end
|
142
|
+
|
143
|
+
def test_subexpression_map_expressions_include_self
|
144
|
+
root = RP.parse(/a(b(c(d)))/)
|
145
|
+
|
146
|
+
levels = root.map(true) {|exp, index|
|
147
|
+
[exp.level, exp.to_s]
|
148
|
+
}.compact
|
149
|
+
|
150
|
+
assert_equal( [
|
151
|
+
[nil, 'a(b(c(d)))'],
|
152
|
+
[0, 'a'],
|
153
|
+
[0, '(b(c(d)))'],
|
154
|
+
[1, 'b'],
|
155
|
+
[1, '(c(d))'],
|
156
|
+
[2, 'c'],
|
157
|
+
[2, '(d)'],
|
158
|
+
[3, 'd']
|
159
|
+
],
|
160
|
+
levels
|
161
|
+
)
|
162
|
+
end
|
163
|
+
|
164
|
+
end
|
data/test/lexer/test_all.rb
CHANGED
@@ -6,21 +6,34 @@ require File.expand_path("../../helpers", __FILE__)
|
|
6
6
|
require File.expand_path("../test_#{tc}", __FILE__)
|
7
7
|
end
|
8
8
|
|
9
|
+
if RUBY_VERSION >= '2.0.0'
|
10
|
+
%w{conditionals keep}.each do|tc|
|
11
|
+
require File.expand_path("../test_#{tc}", __FILE__)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
9
15
|
class TestRegexpLexer < Test::Unit::TestCase
|
10
16
|
|
11
17
|
def test_lexer_returns_an_array
|
12
|
-
assert_instance_of( Array, RL.
|
18
|
+
assert_instance_of( Array, RL.lex('abc'))
|
13
19
|
end
|
14
20
|
|
15
21
|
def test_lexer_returns_tokens
|
16
|
-
tokens = RL.
|
22
|
+
tokens = RL.lex('^abc+[^one]{2,3}\b\d\\\C-C$')
|
17
23
|
assert( tokens.all?{|token| token.kind_of?(Regexp::Token)},
|
18
24
|
"Not all array members are tokens")
|
25
|
+
|
26
|
+
assert( tokens.all?{|token| token.to_a.length == 8},
|
27
|
+
"Not all tokens have a length of 8")
|
19
28
|
end
|
20
29
|
|
21
30
|
def test_lexer_token_count
|
22
|
-
tokens = RL.
|
31
|
+
tokens = RL.lex(/^(one|two){2,3}([^d\]efm-qz\,\-]*)(ghi)+$/i)
|
23
32
|
assert_equal( 26, tokens.length )
|
24
33
|
end
|
25
34
|
|
35
|
+
def test_lexer_scan_alias
|
36
|
+
assert_equal( RL.lex(/a|b|c/), RL.scan(/a|b|c/) )
|
37
|
+
end
|
38
|
+
|
26
39
|
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
require File.expand_path("../../helpers", __FILE__)
|
2
|
+
|
3
|
+
class LexerConditionals < Test::Unit::TestCase
|
4
|
+
|
5
|
+
if RUBY_VERSION >= '2.0'
|
6
|
+
|
7
|
+
# Basic lexer output and nesting tests
|
8
|
+
tests = {
|
9
|
+
'(?<A>a)(?(<A>)b|c)' => [3, :conditional, :open, '(?', 7, 9, 0, 0, 0],
|
10
|
+
'(?<B>a)(?(<B>)b|c)' => [4, :conditional, :condition, '(<B>)', 9, 14, 0, 0, 1],
|
11
|
+
'(?<C>a)(?(<C>)b|c)' => [6, :conditional, :separator, '|', 15, 16, 0, 0, 1],
|
12
|
+
'(?<D>a)(?(<D>)b|c)' => [8, :conditional, :close, ')', 17, 18, 0, 0, 0],
|
13
|
+
}
|
14
|
+
|
15
|
+
count = 0
|
16
|
+
tests.each do |pattern, test|
|
17
|
+
define_method "test_lexer_#{test[1]}_#{test[2]}_#{count+=1}" do
|
18
|
+
tokens = RL.lex(pattern)
|
19
|
+
assert_equal( test[1,8], tokens[test[0]].to_a)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_lexer_conditional_mixed_nesting
|
24
|
+
regexp = /((?<A>a)(?<B>(?(<A>)b|((?(<B>)[e-g]|[h-j])))))/
|
25
|
+
tokens = RL.lex(regexp)
|
26
|
+
|
27
|
+
expected = [
|
28
|
+
[ 0, :group, :capture, '(', 0, 1, 0, 0, 0],
|
29
|
+
[ 1, :group, :named, '(?<A>', 1, 6, 1, 0, 0],
|
30
|
+
|
31
|
+
[ 5, :conditional, :open, '(?', 13, 15, 2, 0, 0],
|
32
|
+
[ 6, :conditional, :condition, '(<A>)', 15, 20, 2, 0, 1],
|
33
|
+
[ 8, :conditional, :separator, '|', 21, 22, 2, 0, 1],
|
34
|
+
|
35
|
+
[10, :conditional, :open, '(?', 23, 25, 3, 0, 1],
|
36
|
+
[11, :conditional, :condition, '(<B>)', 25, 30, 3, 0, 2],
|
37
|
+
|
38
|
+
[12, :set, :open, '[', 30, 31, 3, 0, 2],
|
39
|
+
[13, :set, :range, 'e-g', 31, 34, 3, 1, 2],
|
40
|
+
[14, :set, :close, ']', 34, 35, 3, 0, 2],
|
41
|
+
|
42
|
+
[15, :conditional, :separator, '|', 35, 36, 3, 0, 2],
|
43
|
+
[19, :conditional, :close, ')', 41, 42, 3, 0, 1],
|
44
|
+
[21, :conditional, :close, ')', 43, 44, 2, 0, 0],
|
45
|
+
|
46
|
+
[22, :group, :close, ')', 44, 45, 1, 0, 0],
|
47
|
+
[23, :group, :close, ')', 45, 46, 0, 0, 0]
|
48
|
+
].each do |test|
|
49
|
+
assert_equal( test[1,8], tokens[test[0]].to_a)
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def test_lexer_conditional_deep_nesting
|
54
|
+
regexp = /(a(b(c)))(?(1)(?(2)(?(3)d|e))|(?(3)(?(2)f|g)|(?(1)f|g)))/
|
55
|
+
tokens = RL.lex(regexp)
|
56
|
+
|
57
|
+
expected = [
|
58
|
+
[ 9, :conditional, :open, '(?', 9, 11, 0, 0, 0],
|
59
|
+
[10, :conditional, :condition, '(1)', 11, 14, 0, 0, 1],
|
60
|
+
|
61
|
+
[11, :conditional, :open, '(?', 14, 16, 0, 0, 1],
|
62
|
+
[12, :conditional, :condition, '(2)', 16, 19, 0, 0, 2],
|
63
|
+
|
64
|
+
[13, :conditional, :open, '(?', 19, 21, 0, 0, 2],
|
65
|
+
[14, :conditional, :condition, '(3)', 21, 24, 0, 0, 3],
|
66
|
+
|
67
|
+
[16, :conditional, :separator, '|', 25, 26, 0, 0, 3],
|
68
|
+
|
69
|
+
[18, :conditional, :close, ')', 27, 28, 0, 0, 2],
|
70
|
+
[19, :conditional, :close, ')', 28, 29, 0, 0, 1],
|
71
|
+
|
72
|
+
[20, :conditional, :separator, '|', 29, 30, 0, 0, 1],
|
73
|
+
|
74
|
+
[21, :conditional, :open, '(?', 30, 32, 0, 0, 1],
|
75
|
+
[22, :conditional, :condition, '(3)', 32, 35, 0, 0, 2],
|
76
|
+
|
77
|
+
[23, :conditional, :open, '(?', 35, 37, 0, 0, 2],
|
78
|
+
[24, :conditional, :condition, '(2)', 37, 40, 0, 0, 3],
|
79
|
+
|
80
|
+
[26, :conditional, :separator, '|', 41, 42, 0, 0, 3],
|
81
|
+
|
82
|
+
[28, :conditional, :close, ')', 43, 44, 0, 0, 2],
|
83
|
+
|
84
|
+
[29, :conditional, :separator, '|', 44, 45, 0, 0, 2],
|
85
|
+
|
86
|
+
[30, :conditional, :open, '(?', 45, 47, 0, 0, 2],
|
87
|
+
[31, :conditional, :condition, '(1)', 47, 50, 0, 0, 3],
|
88
|
+
|
89
|
+
[33, :conditional, :separator, '|', 51, 52, 0, 0, 3],
|
90
|
+
|
91
|
+
[35, :conditional, :close, ')', 53, 54, 0, 0, 2],
|
92
|
+
[36, :conditional, :close, ')', 54, 55, 0, 0, 1],
|
93
|
+
[37, :conditional, :close, ')', 55, 56, 0, 0, 0]
|
94
|
+
].each do |test|
|
95
|
+
assert_equal( test[1,8], tokens[test[0]].to_a)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
end
|
100
|
+
|
101
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.expand_path("../../helpers", __FILE__)
|
2
|
+
|
3
|
+
class LexerKeep < Test::Unit::TestCase
|
4
|
+
|
5
|
+
def test_lex_keep_token
|
6
|
+
regexp = /ab\Kcd/
|
7
|
+
tokens = RL.lex(regexp)
|
8
|
+
|
9
|
+
assert_equal( :keep, tokens[1].type )
|
10
|
+
assert_equal( :mark, tokens[1].token )
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_lex_keep_nested
|
14
|
+
regexp = /(a\Kb)|(c\\\Kd)ef/
|
15
|
+
tokens = RL.lex(regexp)
|
16
|
+
|
17
|
+
assert_equal( :keep, tokens[2].type )
|
18
|
+
assert_equal( :mark, tokens[2].token )
|
19
|
+
|
20
|
+
assert_equal( :keep, tokens[9].type )
|
21
|
+
assert_equal( :mark, tokens[9].token )
|
22
|
+
end
|
23
|
+
|
24
|
+
end
|
data/test/lexer/test_literals.rb
CHANGED
@@ -7,86 +7,86 @@ class LexerLiterals < Test::Unit::TestCase
|
|
7
7
|
tests = {
|
8
8
|
# ascii, single byte characters
|
9
9
|
'a' => {
|
10
|
-
0 => [:literal, :literal, 'a', 0, 1, 0, 0],
|
10
|
+
0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0],
|
11
11
|
},
|
12
12
|
|
13
13
|
'ab+' => {
|
14
|
-
0 => [:literal, :literal, 'a', 0, 1, 0, 0],
|
15
|
-
1 => [:literal, :literal, 'b', 1, 2, 0, 0],
|
16
|
-
2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0],
|
14
|
+
0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0],
|
15
|
+
1 => [:literal, :literal, 'b', 1, 2, 0, 0, 0],
|
16
|
+
2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0, 0],
|
17
17
|
},
|
18
18
|
|
19
19
|
|
20
20
|
# 2 byte wide characters, Arabic
|
21
21
|
'ا' => {
|
22
|
-
0 => [:literal, :literal, 'ا', 0, 2, 0, 0],
|
22
|
+
0 => [:literal, :literal, 'ا', 0, 2, 0, 0, 0],
|
23
23
|
},
|
24
24
|
|
25
25
|
'aاbبcت' => {
|
26
|
-
0 => [:literal, :literal, 'aاbبcت', 0, 9, 0, 0],
|
26
|
+
0 => [:literal, :literal, 'aاbبcت', 0, 9, 0, 0, 0],
|
27
27
|
},
|
28
28
|
|
29
29
|
'aاbبت?' => {
|
30
|
-
0 => [:literal, :literal, 'aاbب', 0, 6, 0, 0],
|
31
|
-
1 => [:literal, :literal, 'ت', 6, 8, 0, 0],
|
32
|
-
2 => [:quantifier, :zero_or_one, '?', 8, 9, 0, 0],
|
30
|
+
0 => [:literal, :literal, 'aاbب', 0, 6, 0, 0, 0],
|
31
|
+
1 => [:literal, :literal, 'ت', 6, 8, 0, 0, 0],
|
32
|
+
2 => [:quantifier, :zero_or_one, '?', 8, 9, 0, 0, 0],
|
33
33
|
},
|
34
34
|
|
35
35
|
'aا?bبcت+' => {
|
36
|
-
0 => [:literal, :literal, 'a', 0, 1, 0, 0],
|
37
|
-
1 => [:literal, :literal, 'ا', 1, 3, 0, 0],
|
38
|
-
2 => [:quantifier, :zero_or_one, '?', 3, 4, 0, 0],
|
39
|
-
3 => [:literal, :literal, 'bبc', 4, 8, 0, 0],
|
40
|
-
4 => [:literal, :literal, 'ت', 8, 10, 0, 0],
|
41
|
-
5 => [:quantifier, :one_or_more, '+', 10, 11, 0, 0],
|
36
|
+
0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0],
|
37
|
+
1 => [:literal, :literal, 'ا', 1, 3, 0, 0, 0],
|
38
|
+
2 => [:quantifier, :zero_or_one, '?', 3, 4, 0, 0, 0],
|
39
|
+
3 => [:literal, :literal, 'bبc', 4, 8, 0, 0, 0],
|
40
|
+
4 => [:literal, :literal, 'ت', 8, 10, 0, 0, 0],
|
41
|
+
5 => [:quantifier, :one_or_more, '+', 10, 11, 0, 0, 0],
|
42
42
|
},
|
43
43
|
|
44
44
|
'a(اbب+)cت?' => {
|
45
|
-
0 => [:literal, :literal, 'a', 0, 1, 0, 0],
|
46
|
-
1 => [:group, :capture, '(', 1, 2, 0, 0],
|
47
|
-
2 => [:literal, :literal, 'اb', 2, 5, 1, 0],
|
48
|
-
3 => [:literal, :literal, 'ب', 5, 7, 1, 0],
|
49
|
-
4 => [:quantifier, :one_or_more, '+', 7, 8, 1, 0],
|
50
|
-
5 => [:group, :close, ')', 8, 9, 0, 0],
|
51
|
-
6 => [:literal, :literal, 'c', 9, 10, 0, 0],
|
52
|
-
7 => [:literal, :literal, 'ت', 10, 12, 0, 0],
|
53
|
-
8 => [:quantifier, :zero_or_one, '?', 12, 13, 0, 0],
|
45
|
+
0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0],
|
46
|
+
1 => [:group, :capture, '(', 1, 2, 0, 0, 0],
|
47
|
+
2 => [:literal, :literal, 'اb', 2, 5, 1, 0, 0],
|
48
|
+
3 => [:literal, :literal, 'ب', 5, 7, 1, 0, 0],
|
49
|
+
4 => [:quantifier, :one_or_more, '+', 7, 8, 1, 0, 0],
|
50
|
+
5 => [:group, :close, ')', 8, 9, 0, 0, 0],
|
51
|
+
6 => [:literal, :literal, 'c', 9, 10, 0, 0, 0],
|
52
|
+
7 => [:literal, :literal, 'ت', 10, 12, 0, 0, 0],
|
53
|
+
8 => [:quantifier, :zero_or_one, '?', 12, 13, 0, 0, 0],
|
54
54
|
},
|
55
55
|
|
56
56
|
|
57
57
|
# 3 byte wide characters, Japanese
|
58
58
|
'ab?れます+cd' => {
|
59
|
-
0 => [:literal, :literal, 'a', 0, 1, 0, 0],
|
60
|
-
1 => [:literal, :literal, 'b', 1, 2, 0, 0],
|
61
|
-
2 => [:quantifier, :zero_or_one, '?', 2, 3, 0, 0],
|
62
|
-
3 => [:literal, :literal, 'れま', 3, 9, 0, 0],
|
63
|
-
4 => [:literal, :literal, 'す', 9, 12, 0, 0],
|
64
|
-
5 => [:quantifier, :one_or_more, '+', 12, 13, 0, 0],
|
65
|
-
6 => [:literal, :literal, 'cd', 13, 15, 0, 0],
|
59
|
+
0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0],
|
60
|
+
1 => [:literal, :literal, 'b', 1, 2, 0, 0, 0],
|
61
|
+
2 => [:quantifier, :zero_or_one, '?', 2, 3, 0, 0, 0],
|
62
|
+
3 => [:literal, :literal, 'れま', 3, 9, 0, 0, 0],
|
63
|
+
4 => [:literal, :literal, 'す', 9, 12, 0, 0, 0],
|
64
|
+
5 => [:quantifier, :one_or_more, '+', 12, 13, 0, 0, 0],
|
65
|
+
6 => [:literal, :literal, 'cd', 13, 15, 0, 0, 0],
|
66
66
|
},
|
67
67
|
|
68
68
|
|
69
69
|
# 4 byte wide characters, Osmanya
|
70
70
|
'𐒀𐒁?𐒂ab+𐒃' => {
|
71
|
-
0 => [:literal, :literal, '𐒀', 0, 4, 0, 0],
|
72
|
-
1 => [:literal, :literal, '𐒁', 4, 8, 0, 0],
|
73
|
-
2 => [:quantifier, :zero_or_one, '?', 8, 9, 0, 0],
|
74
|
-
3 => [:literal, :literal, '𐒂a', 9, 14, 0, 0],
|
75
|
-
4 => [:literal, :literal, 'b', 14, 15, 0, 0],
|
76
|
-
5 => [:quantifier, :one_or_more, '+', 15, 16, 0, 0],
|
77
|
-
6 => [:literal, :literal, '𐒃', 16, 20, 0, 0],
|
71
|
+
0 => [:literal, :literal, '𐒀', 0, 4, 0, 0, 0],
|
72
|
+
1 => [:literal, :literal, '𐒁', 4, 8, 0, 0, 0],
|
73
|
+
2 => [:quantifier, :zero_or_one, '?', 8, 9, 0, 0, 0],
|
74
|
+
3 => [:literal, :literal, '𐒂a', 9, 14, 0, 0, 0],
|
75
|
+
4 => [:literal, :literal, 'b', 14, 15, 0, 0, 0],
|
76
|
+
5 => [:quantifier, :one_or_more, '+', 15, 16, 0, 0, 0],
|
77
|
+
6 => [:literal, :literal, '𐒃', 16, 20, 0, 0, 0],
|
78
78
|
},
|
79
79
|
|
80
80
|
'mu𝄞?si*𝄫c+' => {
|
81
|
-
0 => [:literal, :literal, 'mu', 0, 2, 0, 0],
|
82
|
-
1 => [:literal, :literal, '𝄞', 2, 6, 0, 0],
|
83
|
-
2 => [:quantifier, :zero_or_one, '?', 6, 7, 0, 0],
|
84
|
-
3 => [:literal, :literal, 's', 7, 8, 0, 0],
|
85
|
-
4 => [:literal, :literal, 'i', 8, 9, 0, 0],
|
86
|
-
5 => [:quantifier, :zero_or_more, '*', 9, 10, 0, 0],
|
87
|
-
6 => [:literal, :literal, '𝄫', 10, 14, 0, 0],
|
88
|
-
7 => [:literal, :literal, 'c', 14, 15, 0, 0],
|
89
|
-
8 => [:quantifier, :one_or_more, '+', 15, 16, 0, 0],
|
81
|
+
0 => [:literal, :literal, 'mu', 0, 2, 0, 0, 0],
|
82
|
+
1 => [:literal, :literal, '𝄞', 2, 6, 0, 0, 0],
|
83
|
+
2 => [:quantifier, :zero_or_one, '?', 6, 7, 0, 0, 0],
|
84
|
+
3 => [:literal, :literal, 's', 7, 8, 0, 0, 0],
|
85
|
+
4 => [:literal, :literal, 'i', 8, 9, 0, 0, 0],
|
86
|
+
5 => [:quantifier, :zero_or_more, '*', 9, 10, 0, 0, 0],
|
87
|
+
6 => [:literal, :literal, '𝄫', 10, 14, 0, 0, 0],
|
88
|
+
7 => [:literal, :literal, 'c', 14, 15, 0, 0, 0],
|
89
|
+
8 => [:quantifier, :one_or_more, '+', 15, 16, 0, 0, 0],
|
90
90
|
},
|
91
91
|
}
|
92
92
|
|
@@ -94,7 +94,7 @@ class LexerLiterals < Test::Unit::TestCase
|
|
94
94
|
tests.each do |pattern, checks|
|
95
95
|
define_method "test_lex_literal_runs_#{count+=1}" do
|
96
96
|
|
97
|
-
tokens = RL.
|
97
|
+
tokens = RL.lex(pattern)
|
98
98
|
checks.each do |offset, token|
|
99
99
|
assert_equal( token, tokens[offset].to_a )
|
100
100
|
end
|
@@ -103,17 +103,17 @@ class LexerLiterals < Test::Unit::TestCase
|
|
103
103
|
end
|
104
104
|
|
105
105
|
def test_lex_single_2_byte_char
|
106
|
-
tokens = RL.
|
106
|
+
tokens = RL.lex('ا+')
|
107
107
|
assert_equal( 2, tokens.length )
|
108
108
|
end
|
109
109
|
|
110
110
|
def test_lex_single_3_byte_char
|
111
|
-
tokens = RL.
|
111
|
+
tokens = RL.lex('れ+')
|
112
112
|
assert_equal( 2, tokens.length )
|
113
113
|
end
|
114
114
|
|
115
115
|
def test_lex_single_4_byte_char
|
116
|
-
tokens = RL.
|
116
|
+
tokens = RL.lex('𝄞+')
|
117
117
|
assert_equal( 2, tokens.length )
|
118
118
|
end
|
119
119
|
|