regexp_parser 0.5.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +242 -0
  3. data/Gemfile +1 -0
  4. data/README.md +21 -17
  5. data/Rakefile +31 -0
  6. data/lib/regexp_parser/expression.rb +11 -9
  7. data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
  8. data/lib/regexp_parser/expression/classes/backref.rb +21 -16
  9. data/lib/regexp_parser/expression/classes/escape.rb +81 -10
  10. data/lib/regexp_parser/expression/classes/group.rb +20 -20
  11. data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
  12. data/lib/regexp_parser/expression/classes/property.rb +6 -0
  13. data/lib/regexp_parser/expression/classes/set.rb +10 -93
  14. data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
  15. data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
  16. data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
  17. data/lib/regexp_parser/expression/methods/tests.rb +4 -14
  18. data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
  19. data/lib/regexp_parser/expression/quantifier.rb +3 -4
  20. data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
  21. data/lib/regexp_parser/expression/subexpression.rb +6 -10
  22. data/lib/regexp_parser/lexer.rb +13 -17
  23. data/lib/regexp_parser/parser.rb +170 -116
  24. data/lib/regexp_parser/scanner.rb +952 -2431
  25. data/lib/regexp_parser/scanner/char_type.rl +31 -0
  26. data/lib/regexp_parser/scanner/properties/long.yml +561 -0
  27. data/lib/regexp_parser/scanner/properties/short.yml +225 -0
  28. data/lib/regexp_parser/scanner/property.rl +7 -806
  29. data/lib/regexp_parser/scanner/scanner.rl +112 -154
  30. data/lib/regexp_parser/syntax/base.rb +4 -4
  31. data/lib/regexp_parser/syntax/tokens.rb +1 -0
  32. data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
  33. data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
  34. data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
  35. data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
  36. data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
  37. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
  38. data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
  39. data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
  40. data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
  41. data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
  42. data/lib/regexp_parser/version.rb +1 -1
  43. data/regexp_parser.gemspec +2 -1
  44. data/test/expression/test_base.rb +2 -1
  45. data/test/expression/test_clone.rb +0 -57
  46. data/test/expression/test_set.rb +31 -8
  47. data/test/expression/test_strfregexp.rb +13 -4
  48. data/test/expression/test_subexpression.rb +25 -0
  49. data/test/expression/test_traverse.rb +25 -25
  50. data/test/helpers.rb +1 -0
  51. data/test/lexer/test_all.rb +1 -1
  52. data/test/lexer/test_conditionals.rb +9 -7
  53. data/test/lexer/test_nesting.rb +39 -21
  54. data/test/lexer/test_refcalls.rb +4 -4
  55. data/test/parser/set/test_intersections.rb +127 -0
  56. data/test/parser/set/test_ranges.rb +111 -0
  57. data/test/parser/test_all.rb +4 -1
  58. data/test/parser/test_escapes.rb +41 -9
  59. data/test/parser/test_groups.rb +22 -3
  60. data/test/parser/test_posix_classes.rb +27 -0
  61. data/test/parser/test_properties.rb +17 -290
  62. data/test/parser/test_refcalls.rb +66 -26
  63. data/test/parser/test_sets.rb +132 -129
  64. data/test/scanner/test_all.rb +1 -7
  65. data/test/scanner/test_conditionals.rb +16 -16
  66. data/test/scanner/test_errors.rb +0 -30
  67. data/test/scanner/test_escapes.rb +1 -2
  68. data/test/scanner/test_free_space.rb +28 -28
  69. data/test/scanner/test_groups.rb +35 -35
  70. data/test/scanner/test_meta.rb +1 -1
  71. data/test/scanner/test_properties.rb +87 -114
  72. data/test/scanner/test_refcalls.rb +18 -18
  73. data/test/scanner/test_scripts.rb +19 -351
  74. data/test/scanner/test_sets.rb +87 -60
  75. data/test/scanner/test_unicode_blocks.rb +4 -105
  76. data/test/support/warning_extractor.rb +1 -1
  77. data/test/syntax/test_syntax.rb +7 -0
  78. data/test/syntax/versions/test_1.8.rb +2 -4
  79. metadata +17 -7
  80. data/ChangeLog +0 -325
  81. data/test/scanner/test_emojis.rb +0 -31
@@ -3,365 +3,33 @@ require File.expand_path("../../helpers", __FILE__)
3
3
  class ScannerUnicodeScripts < Test::Unit::TestCase
4
4
 
5
5
  tests = {
6
- 'Aghb' => :script_caucasian_albanian,
7
- 'Caucasian Albanian' => :script_caucasian_albanian,
6
+ 'Aghb' => :caucasian_albanian,
7
+ 'Caucasian Albanian' => :caucasian_albanian,
8
8
 
9
- 'Arab' => :script_arabic,
10
- 'Arabic' => :script_arabic,
9
+ 'Arab' => :arabic,
10
+ 'Arabic' => :arabic,
11
11
 
12
- 'Armi' => :script_imperial_aramaic,
13
- 'Imperial Aramaic' => :script_imperial_aramaic,
12
+ 'Armi' => :imperial_aramaic,
13
+ 'Imperial Aramaic' => :imperial_aramaic,
14
14
 
15
- 'Armn' => :script_armenian,
16
- 'Armenian' => :script_armenian,
15
+ 'Egyp' => :egyptian_hieroglyphs,
16
+ 'Egyptian Hieroglyphs' => :egyptian_hieroglyphs, # test whitespace
17
17
 
18
- 'Avst' => :script_avestan,
19
- 'Avestan' => :script_avestan,
18
+ 'Linb' => :linear_b,
19
+ 'Linear-B' => :linear_b, # test dash
20
20
 
21
- 'Bali' => :script_balinese,
22
- 'Balinese' => :script_balinese,
21
+ 'Yiii' => :yi,
22
+ 'Yi' => :yi,
23
23
 
24
- 'Bamu' => :script_bamum,
25
- 'Bamum' => :script_bamum,
24
+ 'Zinh' => :inherited,
25
+ 'Inherited' => :inherited,
26
+ 'Qaai' => :inherited,
26
27
 
27
- 'Bass' => :script_bassa_vah,
28
- 'Bassa Vah' => :script_bassa_vah,
28
+ 'Zyyy' => :common,
29
+ 'Common' => :common,
29
30
 
30
- 'Batk' => :script_batak,
31
- 'Batak' => :script_batak,
32
-
33
- 'Beng' => :script_bengali,
34
- 'Bengali' => :script_bengali,
35
-
36
- 'Bopo' => :script_bopomofo,
37
- 'Bopomofo' => :script_bopomofo,
38
-
39
- 'Brah' => :script_brahmi,
40
- 'Brahmi' => :script_brahmi,
41
-
42
- 'Brai' => :script_braille,
43
- 'Braille' => :script_braille,
44
-
45
- 'Bugi' => :script_buginese,
46
- 'Buginese' => :script_buginese,
47
-
48
- 'Buhd' => :script_buhid,
49
- 'Buhid' => :script_buhid,
50
-
51
- 'Cans' => :script_canadian_aboriginal,
52
- 'Canadian Aboriginal' => :script_canadian_aboriginal,
53
-
54
- 'Cari' => :script_carian,
55
- 'Carian' => :script_carian,
56
-
57
- 'Cham' => :script_cham,
58
-
59
- 'Cher' => :script_cherokee,
60
- 'Cherokee' => :script_cherokee,
61
-
62
- 'Copt' => :script_coptic,
63
- 'Coptic' => :script_coptic,
64
- 'Qaac' => :script_coptic,
65
-
66
- 'Cprt' => :script_cypriot,
67
- 'Cypriot' => :script_cypriot,
68
-
69
- 'Cyrl' => :script_cyrillic,
70
- 'Cyrillic' => :script_cyrillic,
71
-
72
- 'Deva' => :script_devanagari,
73
- 'Devanagari' => :script_devanagari,
74
-
75
- 'Dsrt' => :script_deseret,
76
- 'Deseret' => :script_deseret,
77
-
78
-
79
- 'Dupl' => :script_duployan,
80
- 'Duployan' => :script_duployan,
81
-
82
-
83
- 'Egyp' => :script_egyptian_hieroglyphs,
84
- 'Egyptian Hieroglyphs' => :script_egyptian_hieroglyphs,
85
-
86
- 'Elba' => :script_elbasan,
87
- 'Elbasan' => :script_elbasan,
88
-
89
- 'Ethi' => :script_ethiopic,
90
- 'Ethiopic' => :script_ethiopic,
91
-
92
- 'Geor' => :script_georgian,
93
- 'Georgian' => :script_georgian,
94
-
95
- 'Glag' => :script_glagolitic,
96
- 'Glagolitic' => :script_glagolitic,
97
-
98
- 'Goth' => :script_gothic,
99
- 'Gothic' => :script_gothic,
100
-
101
- 'Gran' => :script_grantha,
102
- 'Grantha' => :script_grantha,
103
-
104
- 'Grek' => :script_greek,
105
- 'Greek' => :script_greek,
106
-
107
- 'Gujr' => :script_gujarati,
108
- 'Gujarati' => :script_gujarati,
109
-
110
- 'Guru' => :script_gurmukhi,
111
- 'Gurmukhi' => :script_gurmukhi,
112
-
113
- 'Hang' => :script_hangul,
114
- 'Hangul' => :script_hangul,
115
-
116
- 'Hani' => :script_han,
117
- 'Han' => :script_han,
118
-
119
- 'Hano' => :script_hanunoo,
120
- 'Hanunoo' => :script_hanunoo,
121
-
122
- 'Hebr' => :script_hebrew,
123
- 'Hebrew' => :script_hebrew,
124
-
125
- 'Hira' => :script_hiragana,
126
- 'Hiragana' => :script_hiragana,
127
-
128
- 'Hmng' => :script_pahawh_hmong,
129
- 'Pahawh Hmong' => :script_pahawh_hmong,
130
-
131
- 'Hrkt' => :script_katakana_or_hiragana,
132
- 'Katakana or Hiragana' => :script_katakana_or_hiragana,
133
-
134
- 'Ital' => :script_old_italic,
135
- 'Old Italic' => :script_old_italic,
136
-
137
- 'Java' => :script_javanese,
138
- 'Javanese' => :script_javanese,
139
-
140
- 'Kali' => :script_kayah_li,
141
- 'Kayah Li' => :script_kayah_li,
142
-
143
- 'Kana' => :script_katakana,
144
- 'Katakana' => :script_katakana,
145
-
146
- 'Khar' => :script_kharoshthi,
147
- 'Kharoshthi' => :script_kharoshthi,
148
-
149
- 'Khmr' => :script_khmer,
150
- 'Khmer' => :script_khmer,
151
-
152
- 'Khoj' => :script_khojki,
153
- 'Khojki' => :script_khojki,
154
-
155
- 'Knda' => :script_kannada,
156
- 'Kannada' => :script_kannada,
157
-
158
- 'Kthi' => :script_kaithi,
159
- 'Kaithi' => :script_kaithi,
160
-
161
- 'Lana' => :script_tai_tham,
162
- 'Tai Tham' => :script_tai_tham,
163
-
164
- 'Laoo' => :script_lao,
165
- 'Lao' => :script_lao,
166
-
167
- 'Latn' => :script_latin,
168
- 'Latin' => :script_latin,
169
-
170
- 'Lepc' => :script_lepcha,
171
- 'Lepcha' => :script_lepcha,
172
-
173
- 'Limb' => :script_limbu,
174
- 'Limbu' => :script_limbu,
175
-
176
- 'Lina' => :script_linear_a,
177
- 'Linear A' => :script_linear_a,
178
-
179
- 'Linb' => :script_linear_b,
180
- 'Linear B' => :script_linear_b,
181
-
182
- 'Lisu' => :script_lisu,
183
-
184
- 'Lyci' => :script_lycian,
185
- 'Lycian' => :script_lycian,
186
-
187
- 'Lydi' => :script_lydian,
188
- 'Lydian' => :script_lydian,
189
-
190
- 'Mand' => :script_mandaic,
191
- 'Mandaic' => :script_mandaic,
192
-
193
- 'Mlym' => :script_malayalam,
194
- 'Malayalam' => :script_malayalam,
195
-
196
- 'Mahj' => :script_mahajani,
197
- 'Mahajani' => :script_mahajani,
198
-
199
- 'Mani' => :script_manichaean,
200
- 'Manichaean' => :script_manichaean,
201
-
202
- 'Mend' => :script_mende_kikakui,
203
- 'Mende Kikakui' => :script_mende_kikakui,
204
-
205
- 'Modi' => :script_modi,
206
-
207
- 'Mong' => :script_mongolian,
208
- 'Mongolian' => :script_mongolian,
209
-
210
- 'Mroo' => :script_mro,
211
- 'Mro' => :script_mro,
212
-
213
- 'Mtei' => :script_meetei_mayek,
214
- 'Meetei Mayek' => :script_meetei_mayek,
215
-
216
- 'Mymr' => :script_myanmar,
217
- 'Myanmar' => :script_myanmar,
218
-
219
- 'Narb' => :script_old_north_arabian,
220
- 'Old North Arabian' => :script_old_north_arabian,
221
-
222
- 'Nbat' => :script_nabataean,
223
- 'Nabataean' => :script_nabataean,
224
-
225
- 'Nkoo' => :script_nko,
226
- 'Nko' => :script_nko,
227
-
228
- 'Ogam' => :script_ogham,
229
- 'Ogham' => :script_ogham,
230
-
231
- 'Olck' => :script_ol_chiki,
232
- 'Ol Chiki' => :script_ol_chiki,
233
-
234
- 'Orkh' => :script_old_turkic,
235
- 'Old Turkic' => :script_old_turkic,
236
-
237
- 'Orya' => :script_oriya,
238
- 'Oriya' => :script_oriya,
239
-
240
- 'Osma' => :script_osmanya,
241
- 'Osmanya' => :script_osmanya,
242
-
243
- 'Palm' => :script_palmyrene,
244
- 'Palmyrene' => :script_palmyrene,
245
-
246
- 'Pauc' => :script_pau_cin_hau,
247
- 'Pau Cin Hau' => :script_pau_cin_hau,
248
-
249
- 'Perm' => :script_old_permic,
250
- 'Old Permic' => :script_old_permic,
251
-
252
- 'Phag' => :script_phags_pa,
253
- 'Phags Pa' => :script_phags_pa,
254
-
255
- 'Phli' => :script_inscriptional_pahlavi,
256
- 'Inscriptional Pahlavi' => :script_inscriptional_pahlavi,
257
-
258
- 'Phlp' => :script_psalter_pahlavi,
259
- 'Psalter Pahlavi' => :script_psalter_pahlavi,
260
-
261
- 'Phnx' => :script_phoenician,
262
- 'Phoenician' => :script_phoenician,
263
-
264
- 'Prti' => :script_inscriptional_parthian,
265
- 'Inscriptional Parthian' => :script_inscriptional_parthian,
266
-
267
- 'Rjng' => :script_rejang,
268
- 'Rejang' => :script_rejang,
269
-
270
- 'Runr' => :script_runic,
271
- 'Runic' => :script_runic,
272
-
273
- 'Samr' => :script_samaritan,
274
- 'Samaritan' => :script_samaritan,
275
-
276
- 'Sarb' => :script_old_south_arabian,
277
- 'Old South Arabian' => :script_old_south_arabian,
278
-
279
- 'Saur' => :script_saurashtra,
280
- 'Saurashtra' => :script_saurashtra,
281
-
282
- 'Shaw' => :script_shavian,
283
- 'Shavian' => :script_shavian,
284
-
285
- 'Sidd' => :script_siddham,
286
- 'Siddham' => :script_siddham,
287
-
288
- 'Sind' => :script_khudawadi,
289
- 'Khudawadi' => :script_khudawadi,
290
-
291
- 'Sinh' => :script_sinhala,
292
- 'Sinhala' => :script_sinhala,
293
-
294
- 'Sund' => :script_sundanese,
295
- 'Sundanese' => :script_sundanese,
296
-
297
- 'Sylo' => :script_syloti_nagri,
298
- 'Syloti Nagri' => :script_syloti_nagri,
299
-
300
- 'Syrc' => :script_syriac,
301
- 'Syriac' => :script_syriac,
302
-
303
- 'Tagb' => :script_tagbanwa,
304
- 'Tagbanwa' => :script_tagbanwa,
305
-
306
- 'Tale' => :script_tai_le,
307
- 'Tai Le' => :script_tai_le,
308
-
309
- 'Talu' => :script_new_tai_lue,
310
- 'New Tai Lue' => :script_new_tai_lue,
311
-
312
- 'Taml' => :script_tamil,
313
- 'Tamil' => :script_tamil,
314
-
315
- 'Tavt' => :script_tai_viet,
316
- 'Tai Viet' => :script_tai_viet,
317
-
318
- 'Telu' => :script_telugu,
319
- 'Telugu' => :script_telugu,
320
-
321
- 'Tfng' => :script_tifinagh,
322
- 'Tifinagh' => :script_tifinagh,
323
-
324
- 'Tglg' => :script_tagalog,
325
- 'Tagalog' => :script_tagalog,
326
-
327
- 'Thaa' => :script_thaana,
328
- 'Thaana' => :script_thaana,
329
-
330
- 'Thai' => :script_thai,
331
-
332
- 'Tibt' => :script_tibetan,
333
- 'Tibetan' => :script_tibetan,
334
-
335
- 'Tirh' => :script_tirhuta,
336
- 'Tirhuta' => :script_tirhuta,
337
-
338
- 'Ugar' => :script_ugaritic,
339
- 'Ugaritic' => :script_ugaritic,
340
-
341
- 'Vaii' => :script_vai,
342
- 'Vai' => :script_vai,
343
-
344
- 'Wara' => :script_warang_citi,
345
- 'Warang Citi' => :script_warang_citi,
346
-
347
- 'Xpeo' => :script_old_persian,
348
- 'Old Persian' => :script_old_persian,
349
-
350
- 'Xsux' => :script_cuneiform,
351
- 'Cuneiform' => :script_cuneiform,
352
-
353
- 'Yiii' => :script_yi,
354
- 'Yi' => :script_yi,
355
-
356
- 'Zinh' => :script_inherited,
357
- 'Inherited' => :script_inherited,
358
- 'Qaai' => :script_inherited,
359
-
360
- 'Zyyy' => :script_common,
361
- 'Common' => :script_common,
362
-
363
- 'Zzzz' => :script_unknown,
364
- 'Unknown' => :script_unknown,
31
+ 'Zzzz' => :unknown,
32
+ 'Unknown' => :unknown,
365
33
  }
366
34
 
367
35
  tests.each_with_index do |(property, token), count|
@@ -1,3 +1,5 @@
1
+ # encoding: utf-8
2
+
1
3
  require File.expand_path("../../helpers", __FILE__)
2
4
 
3
5
  class ScannerSets < Test::Unit::TestCase
@@ -7,77 +9,93 @@ class ScannerSets < Test::Unit::TestCase
7
9
  '[b]' => [2, :set, :close, ']', 2, 3],
8
10
  '[^n]' => [1, :set, :negate, '^', 1, 2],
9
11
 
10
- '[c]' => [1, :set, :member, 'c', 1, 2],
11
- '[\b]' => [1, :set, :backspace, '\b', 1, 3],
12
- '[A\bX]' => [2, :set, :backspace, '\b', 2, 4],
13
-
14
- '[.]' => [1, :set, :member, '.', 1, 2],
15
- '[?]' => [1, :set, :member, '?', 1, 2],
16
- '[*]' => [1, :set, :member, '*', 1, 2],
17
- '[+]' => [1, :set, :member, '+', 1, 2],
18
- '[{]' => [1, :set, :member, '{', 1, 2],
19
- '[}]' => [1, :set, :member, '}', 1, 2],
20
- '[<]' => [1, :set, :member, '<', 1, 2],
21
- '[>]' => [1, :set, :member, '>', 1, 2],
22
-
23
- '[\x20]' => [1, :set, :member_hex, '\x20', 1, 5],
24
-
25
- '[\.]' => [1, :set, :escape, '\.', 1, 3],
26
- '[\!]' => [1, :set, :escape, '\!', 1, 3],
27
- '[\#]' => [1, :set, :escape, '\#', 1, 3],
28
- '[\]]' => [1, :set, :escape, '\]', 1, 3],
29
- '[\\\]' => [1, :set, :escape, '\\\\', 1, 3],
30
- '[a\-c]' => [2, :set, :escape, '\-', 2, 4],
31
-
32
- '[\d]' => [1, :set, :type_digit, '\d', 1, 3],
33
- '[\da-z]' => [1, :set, :type_digit, '\d', 1, 3],
34
- '[\D]' => [1, :set, :type_nondigit, '\D', 1, 3],
35
-
36
- '[\h]' => [1, :set, :type_hex, '\h', 1, 3],
37
- '[\H]' => [1, :set, :type_nonhex, '\H', 1, 3],
38
-
39
- '[\s]' => [1, :set, :type_space, '\s', 1, 3],
40
- '[\S]' => [1, :set, :type_nonspace, '\S', 1, 3],
41
-
42
- '[\w]' => [1, :set, :type_word, '\w', 1, 3],
43
- '[\W]' => [1, :set, :type_nonword, '\W', 1, 3],
44
-
45
- '[\R]' => [1, :set, :type_linebreak, '\R', 1, 3],
46
- '[\X]' => [1, :set, :type_xgrapheme, '\X', 1, 3],
47
-
48
- '[a-c]' => [1, :set, :range, 'a-c', 1, 4],
49
- '[a-c-]' => [2, :set, :member, '-', 4, 6],
50
- '[a-c^]' => [2, :set, :member, '^', 4, 5],
51
- '[a-cd-f]' => [2, :set, :range, 'd-f', 4, 7],
52
-
53
- '[a[:digit:]c]' => [2, :set, :class_digit, '[:digit:]', 2, 11],
54
- '[[:digit:][:space:]]' => [2, :set, :class_space, '[:space:]', 10, 19],
55
- '[[:^digit:]]' => [1, :set, :class_nondigit, '[:^digit:]', 1, 11],
12
+ '[c]' => [1, :literal, :literal, 'c', 1, 2],
13
+ '[\b]' => [1, :escape, :backspace, '\b', 1, 3],
14
+ '[A\bX]' => [2, :escape, :backspace, '\b', 2, 4],
15
+
16
+ '[.]' => [1, :literal, :literal, '.', 1, 2],
17
+ '[?]' => [1, :literal, :literal, '?', 1, 2],
18
+ '[*]' => [1, :literal, :literal, '*', 1, 2],
19
+ '[+]' => [1, :literal, :literal, '+', 1, 2],
20
+ '[{]' => [1, :literal, :literal, '{', 1, 2],
21
+ '[}]' => [1, :literal, :literal, '}', 1, 2],
22
+ '[<]' => [1, :literal, :literal, '<', 1, 2],
23
+ '[>]' => [1, :literal, :literal, '>', 1, 2],
24
+
25
+ '[äöü]' => [2, :literal, :literal, 'ö', 3, 5],
26
+
27
+ '[\x20]' => [1, :escape, :hex, '\x20', 1, 5],
28
+
29
+ '[\.]' => [1, :escape, :dot, '\.', 1, 3],
30
+ '[\!]' => [1, :escape, :literal, '\!', 1, 3],
31
+ '[\#]' => [1, :escape, :literal, '\#', 1, 3],
32
+ '[\]]' => [1, :escape, :set_close, '\]', 1, 3],
33
+ '[\\\]' => [1, :escape, :backslash, '\\\\', 1, 3],
34
+ '[\A]' => [1, :escape, :literal, '\A', 1, 3],
35
+ '[\z]' => [1, :escape, :literal, '\z', 1, 3],
36
+ '[\g]' => [1, :escape, :literal, '\g', 1, 3],
37
+ '[\K]' => [1, :escape, :literal, '\K', 1, 3],
38
+ '[\c2]' => [1, :escape, :literal, '\c', 1, 3],
39
+ '[\B]' => [1, :escape, :literal, '\B', 1, 3],
40
+ '[a\-c]' => [2, :escape, :literal, '\-', 2, 4],
41
+
42
+ '[\d]' => [1, :type, :digit, '\d', 1, 3],
43
+ '[\da-z]' => [1, :type, :digit, '\d', 1, 3],
44
+ '[\D]' => [1, :type, :nondigit, '\D', 1, 3],
45
+
46
+ '[\h]' => [1, :type, :hex, '\h', 1, 3],
47
+ '[\H]' => [1, :type, :nonhex, '\H', 1, 3],
48
+
49
+ '[\s]' => [1, :type, :space, '\s', 1, 3],
50
+ '[\S]' => [1, :type, :nonspace, '\S', 1, 3],
51
+
52
+ '[\w]' => [1, :type, :word, '\w', 1, 3],
53
+ '[\W]' => [1, :type, :nonword, '\W', 1, 3],
54
+
55
+ '[\R]' => [1, :escape, :literal, '\R', 1, 3],
56
+ '[\X]' => [1, :escape, :literal, '\X', 1, 3],
57
+
58
+ '[a-b]' => [1, :literal, :literal, 'a', 1, 2],
59
+ '[a-c]' => [2, :set, :range, '-', 2, 3],
60
+ '[a-d]' => [3, :literal, :literal, 'd', 3, 4],
61
+ '[a-b-]' => [4, :literal, :literal, '-', 4, 6],
62
+ '[-a]' => [1, :literal, :literal, '-', 1, 2],
63
+ '[a-c^]' => [4, :literal, :literal, '^', 4, 5],
64
+ '[a-bd-f]' => [2, :set, :range, '-', 2, 3],
65
+ '[a-cd-f]' => [5, :set, :range, '-', 5, 6],
66
+
67
+ '[a[:digit:]c]' => [2, :posixclass, :digit, '[:digit:]', 2, 11],
68
+ '[[:digit:][:space:]]' => [2, :posixclass, :space, '[:space:]', 10, 19],
69
+ '[[:^digit:]]' => [1, :nonposixclass, :digit, '[:^digit:]', 1, 11],
56
70
 
57
71
  '[a[.a-b.]c]' => [2, :set, :collation, '[.a-b.]', 2, 9],
58
72
  '[a[=e=]c]' => [2, :set, :equivalent, '[=e=]', 2, 7],
59
73
 
60
- '[a-d&&g-h]' => [2, :set, :intersection, '&&', 4, 6],
74
+ '[a-d&&g-h]' => [4, :set, :intersection, '&&', 4, 6],
75
+ '[a&&]' => [2, :set, :intersection, '&&', 2, 4],
76
+ '[&&z]' => [1, :set, :intersection, '&&', 1, 3],
61
77
 
62
- '[\\x20-\\x28]' => [1, :set, :range_hex, '\x20-\x28', 1, 10],
78
+ '[\\x20-\\x27]' => [1, :escape, :hex, '\x20', 1, 5],
79
+ '[\\x20-\\x28]' => [2, :set, :range, '-', 5, 6],
80
+ '[\\x20-\\x29]' => [3, :escape, :hex, '\x29', 6, 10],
63
81
 
64
- '[a\p{digit}c]' => [2, :set, :digit, '\p{digit}', 2, 11],
65
- '[a\P{digit}c]' => [2, :set, :digit, '\P{digit}', 2, 11],
66
- '[a\p{^digit}c]' => [2, :set, :digit, '\p{^digit}', 2, 12],
67
- '[a\P{^digit}c]' => [2, :set, :digit, '\P{^digit}', 2, 12],
82
+ '[a\p{digit}c]' => [2, :property, :digit, '\p{digit}', 2, 11],
83
+ '[a\P{digit}c]' => [2, :nonproperty, :digit, '\P{digit}', 2, 11],
84
+ '[a\p{^digit}c]' => [2, :nonproperty, :digit, '\p{^digit}', 2, 12],
85
+ '[a\P{^digit}c]' => [2, :property, :digit, '\P{^digit}', 2, 12],
68
86
 
69
- '[a\p{ALPHA}c]' => [2, :set, :alpha, '\p{ALPHA}', 2, 11],
70
- '[a\p{P}c]' => [2, :set, :punct_any, '\p{P}', 2, 7],
71
- '[a\p{P}\P{Z}c]' => [3, :set, :separator_any, '\P{Z}', 7, 12],
87
+ '[a\p{ALPHA}c]' => [2, :property, :alpha, '\p{ALPHA}', 2, 11],
88
+ '[a\p{P}c]' => [2, :property, :punctuation,'\p{P}', 2, 7],
89
+ '[a\p{P}\P{P}c]' => [3, :nonproperty, :punctuation,'\P{P}', 7, 12],
72
90
 
73
- '[a-w&&[^c-g]z]' => [3, :subset, :open, '[', 6, 7],
74
- '[a-w&&[^c-h]z]' => [4, :subset, :negate, '^', 7, 8],
75
- '[a-w&&[^c-i]z]' => [5, :subset, :range, 'c-i', 8, 11],
76
- '[a-w&&[^c-j]z]' => [6, :subset, :close, ']', 11, 12],
91
+ '[a-w&&[^c-g]z]' => [5, :set, :open, '[', 6, 7],
92
+ '[a-w&&[^c-h]z]' => [6, :set, :negate, '^', 7, 8],
93
+ '[a-w&&[^c-i]z]' => [8, :set, :range, '-', 9, 10],
94
+ '[a-w&&[^c-j]z]' => [10,:set, :close, ']', 11, 12],
77
95
  }
78
96
 
79
97
  tests.each_with_index do |(pattern, (index, type, token, text, ts, te)), count|
80
- define_method "test_scanner_#{type}_#{token}_#{count}" do
98
+ define_method "test_scanner_#{type}_#{token}_in_'#{pattern}'_#{count}" do
81
99
  tokens = RS.scan(pattern)
82
100
  result = tokens.at(index)
83
101
 
@@ -89,4 +107,13 @@ class ScannerSets < Test::Unit::TestCase
89
107
  end
90
108
  end
91
109
 
110
+ def test_set_literal_encoding
111
+ text = RS.scan('[a]')[1][2].to_s
112
+ assert_equal 'a', text
113
+ assert_equal 'UTF-8', text.encoding.to_s
114
+
115
+ text = RS.scan('[😲]')[1][2].to_s
116
+ assert_equal '😲', text
117
+ assert_equal 'UTF-8', text.encoding.to_s
118
+ end
92
119
  end