regexp_parser 0.5.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +242 -0
- data/Gemfile +1 -0
- data/README.md +21 -17
- data/Rakefile +31 -0
- data/lib/regexp_parser/expression.rb +11 -9
- data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
- data/lib/regexp_parser/expression/classes/backref.rb +21 -16
- data/lib/regexp_parser/expression/classes/escape.rb +81 -10
- data/lib/regexp_parser/expression/classes/group.rb +20 -20
- data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
- data/lib/regexp_parser/expression/classes/property.rb +6 -0
- data/lib/regexp_parser/expression/classes/set.rb +10 -93
- data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
- data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
- data/lib/regexp_parser/expression/methods/tests.rb +4 -14
- data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
- data/lib/regexp_parser/expression/quantifier.rb +3 -4
- data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
- data/lib/regexp_parser/expression/subexpression.rb +6 -10
- data/lib/regexp_parser/lexer.rb +13 -17
- data/lib/regexp_parser/parser.rb +170 -116
- data/lib/regexp_parser/scanner.rb +952 -2431
- data/lib/regexp_parser/scanner/char_type.rl +31 -0
- data/lib/regexp_parser/scanner/properties/long.yml +561 -0
- data/lib/regexp_parser/scanner/properties/short.yml +225 -0
- data/lib/regexp_parser/scanner/property.rl +7 -806
- data/lib/regexp_parser/scanner/scanner.rl +112 -154
- data/lib/regexp_parser/syntax/base.rb +4 -4
- data/lib/regexp_parser/syntax/tokens.rb +1 -0
- data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
- data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
- data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
- data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
- data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +2 -1
- data/test/expression/test_base.rb +2 -1
- data/test/expression/test_clone.rb +0 -57
- data/test/expression/test_set.rb +31 -8
- data/test/expression/test_strfregexp.rb +13 -4
- data/test/expression/test_subexpression.rb +25 -0
- data/test/expression/test_traverse.rb +25 -25
- data/test/helpers.rb +1 -0
- data/test/lexer/test_all.rb +1 -1
- data/test/lexer/test_conditionals.rb +9 -7
- data/test/lexer/test_nesting.rb +39 -21
- data/test/lexer/test_refcalls.rb +4 -4
- data/test/parser/set/test_intersections.rb +127 -0
- data/test/parser/set/test_ranges.rb +111 -0
- data/test/parser/test_all.rb +4 -1
- data/test/parser/test_escapes.rb +41 -9
- data/test/parser/test_groups.rb +22 -3
- data/test/parser/test_posix_classes.rb +27 -0
- data/test/parser/test_properties.rb +17 -290
- data/test/parser/test_refcalls.rb +66 -26
- data/test/parser/test_sets.rb +132 -129
- data/test/scanner/test_all.rb +1 -7
- data/test/scanner/test_conditionals.rb +16 -16
- data/test/scanner/test_errors.rb +0 -30
- data/test/scanner/test_escapes.rb +1 -2
- data/test/scanner/test_free_space.rb +28 -28
- data/test/scanner/test_groups.rb +35 -35
- data/test/scanner/test_meta.rb +1 -1
- data/test/scanner/test_properties.rb +87 -114
- data/test/scanner/test_refcalls.rb +18 -18
- data/test/scanner/test_scripts.rb +19 -351
- data/test/scanner/test_sets.rb +87 -60
- data/test/scanner/test_unicode_blocks.rb +4 -105
- data/test/support/warning_extractor.rb +1 -1
- data/test/syntax/test_syntax.rb +7 -0
- data/test/syntax/versions/test_1.8.rb +2 -4
- metadata +17 -7
- data/ChangeLog +0 -325
- data/test/scanner/test_emojis.rb +0 -31
@@ -3,365 +3,33 @@ require File.expand_path("../../helpers", __FILE__)
|
|
3
3
|
class ScannerUnicodeScripts < Test::Unit::TestCase
|
4
4
|
|
5
5
|
tests = {
|
6
|
-
'Aghb'
|
7
|
-
'Caucasian Albanian'
|
6
|
+
'Aghb' => :caucasian_albanian,
|
7
|
+
'Caucasian Albanian' => :caucasian_albanian,
|
8
8
|
|
9
|
-
'Arab'
|
10
|
-
'Arabic'
|
9
|
+
'Arab' => :arabic,
|
10
|
+
'Arabic' => :arabic,
|
11
11
|
|
12
|
-
'Armi'
|
13
|
-
'Imperial Aramaic' => :
|
12
|
+
'Armi' => :imperial_aramaic,
|
13
|
+
'Imperial Aramaic' => :imperial_aramaic,
|
14
14
|
|
15
|
-
'
|
16
|
-
'
|
15
|
+
'Egyp' => :egyptian_hieroglyphs,
|
16
|
+
'Egyptian Hieroglyphs' => :egyptian_hieroglyphs, # test whitespace
|
17
17
|
|
18
|
-
'
|
19
|
-
'
|
18
|
+
'Linb' => :linear_b,
|
19
|
+
'Linear-B' => :linear_b, # test dash
|
20
20
|
|
21
|
-
'
|
22
|
-
'
|
21
|
+
'Yiii' => :yi,
|
22
|
+
'Yi' => :yi,
|
23
23
|
|
24
|
-
'
|
25
|
-
|
24
|
+
'Zinh' => :inherited,
|
25
|
+
'Inherited' => :inherited,
|
26
|
+
'Qaai' => :inherited,
|
26
27
|
|
27
|
-
'
|
28
|
-
|
28
|
+
'Zyyy' => :common,
|
29
|
+
'Common' => :common,
|
29
30
|
|
30
|
-
'
|
31
|
-
|
32
|
-
|
33
|
-
'Beng' => :script_bengali,
|
34
|
-
'Bengali' => :script_bengali,
|
35
|
-
|
36
|
-
'Bopo' => :script_bopomofo,
|
37
|
-
'Bopomofo' => :script_bopomofo,
|
38
|
-
|
39
|
-
'Brah' => :script_brahmi,
|
40
|
-
'Brahmi' => :script_brahmi,
|
41
|
-
|
42
|
-
'Brai' => :script_braille,
|
43
|
-
'Braille' => :script_braille,
|
44
|
-
|
45
|
-
'Bugi' => :script_buginese,
|
46
|
-
'Buginese' => :script_buginese,
|
47
|
-
|
48
|
-
'Buhd' => :script_buhid,
|
49
|
-
'Buhid' => :script_buhid,
|
50
|
-
|
51
|
-
'Cans' => :script_canadian_aboriginal,
|
52
|
-
'Canadian Aboriginal' => :script_canadian_aboriginal,
|
53
|
-
|
54
|
-
'Cari' => :script_carian,
|
55
|
-
'Carian' => :script_carian,
|
56
|
-
|
57
|
-
'Cham' => :script_cham,
|
58
|
-
|
59
|
-
'Cher' => :script_cherokee,
|
60
|
-
'Cherokee' => :script_cherokee,
|
61
|
-
|
62
|
-
'Copt' => :script_coptic,
|
63
|
-
'Coptic' => :script_coptic,
|
64
|
-
'Qaac' => :script_coptic,
|
65
|
-
|
66
|
-
'Cprt' => :script_cypriot,
|
67
|
-
'Cypriot' => :script_cypriot,
|
68
|
-
|
69
|
-
'Cyrl' => :script_cyrillic,
|
70
|
-
'Cyrillic' => :script_cyrillic,
|
71
|
-
|
72
|
-
'Deva' => :script_devanagari,
|
73
|
-
'Devanagari' => :script_devanagari,
|
74
|
-
|
75
|
-
'Dsrt' => :script_deseret,
|
76
|
-
'Deseret' => :script_deseret,
|
77
|
-
|
78
|
-
|
79
|
-
'Dupl' => :script_duployan,
|
80
|
-
'Duployan' => :script_duployan,
|
81
|
-
|
82
|
-
|
83
|
-
'Egyp' => :script_egyptian_hieroglyphs,
|
84
|
-
'Egyptian Hieroglyphs' => :script_egyptian_hieroglyphs,
|
85
|
-
|
86
|
-
'Elba' => :script_elbasan,
|
87
|
-
'Elbasan' => :script_elbasan,
|
88
|
-
|
89
|
-
'Ethi' => :script_ethiopic,
|
90
|
-
'Ethiopic' => :script_ethiopic,
|
91
|
-
|
92
|
-
'Geor' => :script_georgian,
|
93
|
-
'Georgian' => :script_georgian,
|
94
|
-
|
95
|
-
'Glag' => :script_glagolitic,
|
96
|
-
'Glagolitic' => :script_glagolitic,
|
97
|
-
|
98
|
-
'Goth' => :script_gothic,
|
99
|
-
'Gothic' => :script_gothic,
|
100
|
-
|
101
|
-
'Gran' => :script_grantha,
|
102
|
-
'Grantha' => :script_grantha,
|
103
|
-
|
104
|
-
'Grek' => :script_greek,
|
105
|
-
'Greek' => :script_greek,
|
106
|
-
|
107
|
-
'Gujr' => :script_gujarati,
|
108
|
-
'Gujarati' => :script_gujarati,
|
109
|
-
|
110
|
-
'Guru' => :script_gurmukhi,
|
111
|
-
'Gurmukhi' => :script_gurmukhi,
|
112
|
-
|
113
|
-
'Hang' => :script_hangul,
|
114
|
-
'Hangul' => :script_hangul,
|
115
|
-
|
116
|
-
'Hani' => :script_han,
|
117
|
-
'Han' => :script_han,
|
118
|
-
|
119
|
-
'Hano' => :script_hanunoo,
|
120
|
-
'Hanunoo' => :script_hanunoo,
|
121
|
-
|
122
|
-
'Hebr' => :script_hebrew,
|
123
|
-
'Hebrew' => :script_hebrew,
|
124
|
-
|
125
|
-
'Hira' => :script_hiragana,
|
126
|
-
'Hiragana' => :script_hiragana,
|
127
|
-
|
128
|
-
'Hmng' => :script_pahawh_hmong,
|
129
|
-
'Pahawh Hmong' => :script_pahawh_hmong,
|
130
|
-
|
131
|
-
'Hrkt' => :script_katakana_or_hiragana,
|
132
|
-
'Katakana or Hiragana' => :script_katakana_or_hiragana,
|
133
|
-
|
134
|
-
'Ital' => :script_old_italic,
|
135
|
-
'Old Italic' => :script_old_italic,
|
136
|
-
|
137
|
-
'Java' => :script_javanese,
|
138
|
-
'Javanese' => :script_javanese,
|
139
|
-
|
140
|
-
'Kali' => :script_kayah_li,
|
141
|
-
'Kayah Li' => :script_kayah_li,
|
142
|
-
|
143
|
-
'Kana' => :script_katakana,
|
144
|
-
'Katakana' => :script_katakana,
|
145
|
-
|
146
|
-
'Khar' => :script_kharoshthi,
|
147
|
-
'Kharoshthi' => :script_kharoshthi,
|
148
|
-
|
149
|
-
'Khmr' => :script_khmer,
|
150
|
-
'Khmer' => :script_khmer,
|
151
|
-
|
152
|
-
'Khoj' => :script_khojki,
|
153
|
-
'Khojki' => :script_khojki,
|
154
|
-
|
155
|
-
'Knda' => :script_kannada,
|
156
|
-
'Kannada' => :script_kannada,
|
157
|
-
|
158
|
-
'Kthi' => :script_kaithi,
|
159
|
-
'Kaithi' => :script_kaithi,
|
160
|
-
|
161
|
-
'Lana' => :script_tai_tham,
|
162
|
-
'Tai Tham' => :script_tai_tham,
|
163
|
-
|
164
|
-
'Laoo' => :script_lao,
|
165
|
-
'Lao' => :script_lao,
|
166
|
-
|
167
|
-
'Latn' => :script_latin,
|
168
|
-
'Latin' => :script_latin,
|
169
|
-
|
170
|
-
'Lepc' => :script_lepcha,
|
171
|
-
'Lepcha' => :script_lepcha,
|
172
|
-
|
173
|
-
'Limb' => :script_limbu,
|
174
|
-
'Limbu' => :script_limbu,
|
175
|
-
|
176
|
-
'Lina' => :script_linear_a,
|
177
|
-
'Linear A' => :script_linear_a,
|
178
|
-
|
179
|
-
'Linb' => :script_linear_b,
|
180
|
-
'Linear B' => :script_linear_b,
|
181
|
-
|
182
|
-
'Lisu' => :script_lisu,
|
183
|
-
|
184
|
-
'Lyci' => :script_lycian,
|
185
|
-
'Lycian' => :script_lycian,
|
186
|
-
|
187
|
-
'Lydi' => :script_lydian,
|
188
|
-
'Lydian' => :script_lydian,
|
189
|
-
|
190
|
-
'Mand' => :script_mandaic,
|
191
|
-
'Mandaic' => :script_mandaic,
|
192
|
-
|
193
|
-
'Mlym' => :script_malayalam,
|
194
|
-
'Malayalam' => :script_malayalam,
|
195
|
-
|
196
|
-
'Mahj' => :script_mahajani,
|
197
|
-
'Mahajani' => :script_mahajani,
|
198
|
-
|
199
|
-
'Mani' => :script_manichaean,
|
200
|
-
'Manichaean' => :script_manichaean,
|
201
|
-
|
202
|
-
'Mend' => :script_mende_kikakui,
|
203
|
-
'Mende Kikakui' => :script_mende_kikakui,
|
204
|
-
|
205
|
-
'Modi' => :script_modi,
|
206
|
-
|
207
|
-
'Mong' => :script_mongolian,
|
208
|
-
'Mongolian' => :script_mongolian,
|
209
|
-
|
210
|
-
'Mroo' => :script_mro,
|
211
|
-
'Mro' => :script_mro,
|
212
|
-
|
213
|
-
'Mtei' => :script_meetei_mayek,
|
214
|
-
'Meetei Mayek' => :script_meetei_mayek,
|
215
|
-
|
216
|
-
'Mymr' => :script_myanmar,
|
217
|
-
'Myanmar' => :script_myanmar,
|
218
|
-
|
219
|
-
'Narb' => :script_old_north_arabian,
|
220
|
-
'Old North Arabian' => :script_old_north_arabian,
|
221
|
-
|
222
|
-
'Nbat' => :script_nabataean,
|
223
|
-
'Nabataean' => :script_nabataean,
|
224
|
-
|
225
|
-
'Nkoo' => :script_nko,
|
226
|
-
'Nko' => :script_nko,
|
227
|
-
|
228
|
-
'Ogam' => :script_ogham,
|
229
|
-
'Ogham' => :script_ogham,
|
230
|
-
|
231
|
-
'Olck' => :script_ol_chiki,
|
232
|
-
'Ol Chiki' => :script_ol_chiki,
|
233
|
-
|
234
|
-
'Orkh' => :script_old_turkic,
|
235
|
-
'Old Turkic' => :script_old_turkic,
|
236
|
-
|
237
|
-
'Orya' => :script_oriya,
|
238
|
-
'Oriya' => :script_oriya,
|
239
|
-
|
240
|
-
'Osma' => :script_osmanya,
|
241
|
-
'Osmanya' => :script_osmanya,
|
242
|
-
|
243
|
-
'Palm' => :script_palmyrene,
|
244
|
-
'Palmyrene' => :script_palmyrene,
|
245
|
-
|
246
|
-
'Pauc' => :script_pau_cin_hau,
|
247
|
-
'Pau Cin Hau' => :script_pau_cin_hau,
|
248
|
-
|
249
|
-
'Perm' => :script_old_permic,
|
250
|
-
'Old Permic' => :script_old_permic,
|
251
|
-
|
252
|
-
'Phag' => :script_phags_pa,
|
253
|
-
'Phags Pa' => :script_phags_pa,
|
254
|
-
|
255
|
-
'Phli' => :script_inscriptional_pahlavi,
|
256
|
-
'Inscriptional Pahlavi' => :script_inscriptional_pahlavi,
|
257
|
-
|
258
|
-
'Phlp' => :script_psalter_pahlavi,
|
259
|
-
'Psalter Pahlavi' => :script_psalter_pahlavi,
|
260
|
-
|
261
|
-
'Phnx' => :script_phoenician,
|
262
|
-
'Phoenician' => :script_phoenician,
|
263
|
-
|
264
|
-
'Prti' => :script_inscriptional_parthian,
|
265
|
-
'Inscriptional Parthian' => :script_inscriptional_parthian,
|
266
|
-
|
267
|
-
'Rjng' => :script_rejang,
|
268
|
-
'Rejang' => :script_rejang,
|
269
|
-
|
270
|
-
'Runr' => :script_runic,
|
271
|
-
'Runic' => :script_runic,
|
272
|
-
|
273
|
-
'Samr' => :script_samaritan,
|
274
|
-
'Samaritan' => :script_samaritan,
|
275
|
-
|
276
|
-
'Sarb' => :script_old_south_arabian,
|
277
|
-
'Old South Arabian' => :script_old_south_arabian,
|
278
|
-
|
279
|
-
'Saur' => :script_saurashtra,
|
280
|
-
'Saurashtra' => :script_saurashtra,
|
281
|
-
|
282
|
-
'Shaw' => :script_shavian,
|
283
|
-
'Shavian' => :script_shavian,
|
284
|
-
|
285
|
-
'Sidd' => :script_siddham,
|
286
|
-
'Siddham' => :script_siddham,
|
287
|
-
|
288
|
-
'Sind' => :script_khudawadi,
|
289
|
-
'Khudawadi' => :script_khudawadi,
|
290
|
-
|
291
|
-
'Sinh' => :script_sinhala,
|
292
|
-
'Sinhala' => :script_sinhala,
|
293
|
-
|
294
|
-
'Sund' => :script_sundanese,
|
295
|
-
'Sundanese' => :script_sundanese,
|
296
|
-
|
297
|
-
'Sylo' => :script_syloti_nagri,
|
298
|
-
'Syloti Nagri' => :script_syloti_nagri,
|
299
|
-
|
300
|
-
'Syrc' => :script_syriac,
|
301
|
-
'Syriac' => :script_syriac,
|
302
|
-
|
303
|
-
'Tagb' => :script_tagbanwa,
|
304
|
-
'Tagbanwa' => :script_tagbanwa,
|
305
|
-
|
306
|
-
'Tale' => :script_tai_le,
|
307
|
-
'Tai Le' => :script_tai_le,
|
308
|
-
|
309
|
-
'Talu' => :script_new_tai_lue,
|
310
|
-
'New Tai Lue' => :script_new_tai_lue,
|
311
|
-
|
312
|
-
'Taml' => :script_tamil,
|
313
|
-
'Tamil' => :script_tamil,
|
314
|
-
|
315
|
-
'Tavt' => :script_tai_viet,
|
316
|
-
'Tai Viet' => :script_tai_viet,
|
317
|
-
|
318
|
-
'Telu' => :script_telugu,
|
319
|
-
'Telugu' => :script_telugu,
|
320
|
-
|
321
|
-
'Tfng' => :script_tifinagh,
|
322
|
-
'Tifinagh' => :script_tifinagh,
|
323
|
-
|
324
|
-
'Tglg' => :script_tagalog,
|
325
|
-
'Tagalog' => :script_tagalog,
|
326
|
-
|
327
|
-
'Thaa' => :script_thaana,
|
328
|
-
'Thaana' => :script_thaana,
|
329
|
-
|
330
|
-
'Thai' => :script_thai,
|
331
|
-
|
332
|
-
'Tibt' => :script_tibetan,
|
333
|
-
'Tibetan' => :script_tibetan,
|
334
|
-
|
335
|
-
'Tirh' => :script_tirhuta,
|
336
|
-
'Tirhuta' => :script_tirhuta,
|
337
|
-
|
338
|
-
'Ugar' => :script_ugaritic,
|
339
|
-
'Ugaritic' => :script_ugaritic,
|
340
|
-
|
341
|
-
'Vaii' => :script_vai,
|
342
|
-
'Vai' => :script_vai,
|
343
|
-
|
344
|
-
'Wara' => :script_warang_citi,
|
345
|
-
'Warang Citi' => :script_warang_citi,
|
346
|
-
|
347
|
-
'Xpeo' => :script_old_persian,
|
348
|
-
'Old Persian' => :script_old_persian,
|
349
|
-
|
350
|
-
'Xsux' => :script_cuneiform,
|
351
|
-
'Cuneiform' => :script_cuneiform,
|
352
|
-
|
353
|
-
'Yiii' => :script_yi,
|
354
|
-
'Yi' => :script_yi,
|
355
|
-
|
356
|
-
'Zinh' => :script_inherited,
|
357
|
-
'Inherited' => :script_inherited,
|
358
|
-
'Qaai' => :script_inherited,
|
359
|
-
|
360
|
-
'Zyyy' => :script_common,
|
361
|
-
'Common' => :script_common,
|
362
|
-
|
363
|
-
'Zzzz' => :script_unknown,
|
364
|
-
'Unknown' => :script_unknown,
|
31
|
+
'Zzzz' => :unknown,
|
32
|
+
'Unknown' => :unknown,
|
365
33
|
}
|
366
34
|
|
367
35
|
tests.each_with_index do |(property, token), count|
|
data/test/scanner/test_sets.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
require File.expand_path("../../helpers", __FILE__)
|
2
4
|
|
3
5
|
class ScannerSets < Test::Unit::TestCase
|
@@ -7,77 +9,93 @@ class ScannerSets < Test::Unit::TestCase
|
|
7
9
|
'[b]' => [2, :set, :close, ']', 2, 3],
|
8
10
|
'[^n]' => [1, :set, :negate, '^', 1, 2],
|
9
11
|
|
10
|
-
'[c]' => [1, :
|
11
|
-
'[\b]' => [1, :
|
12
|
-
'[A\bX]' => [2, :
|
13
|
-
|
14
|
-
'[.]' => [1, :
|
15
|
-
'[?]' => [1, :
|
16
|
-
'[*]' => [1, :
|
17
|
-
'[+]' => [1, :
|
18
|
-
'[{]' => [1, :
|
19
|
-
'[}]' => [1, :
|
20
|
-
'[<]' => [1, :
|
21
|
-
'[>]' => [1, :
|
22
|
-
|
23
|
-
'[
|
24
|
-
|
25
|
-
'[
|
26
|
-
|
27
|
-
'[
|
28
|
-
'[
|
29
|
-
'[
|
30
|
-
'[
|
31
|
-
|
32
|
-
'[\
|
33
|
-
'[\
|
34
|
-
'[\
|
35
|
-
|
36
|
-
'[\
|
37
|
-
'[\
|
38
|
-
|
39
|
-
|
40
|
-
'[\
|
41
|
-
|
42
|
-
'[\
|
43
|
-
|
44
|
-
|
45
|
-
'[\
|
46
|
-
|
47
|
-
|
48
|
-
'[
|
49
|
-
|
50
|
-
'[
|
51
|
-
'[
|
52
|
-
|
53
|
-
'[
|
54
|
-
'[
|
55
|
-
|
12
|
+
'[c]' => [1, :literal, :literal, 'c', 1, 2],
|
13
|
+
'[\b]' => [1, :escape, :backspace, '\b', 1, 3],
|
14
|
+
'[A\bX]' => [2, :escape, :backspace, '\b', 2, 4],
|
15
|
+
|
16
|
+
'[.]' => [1, :literal, :literal, '.', 1, 2],
|
17
|
+
'[?]' => [1, :literal, :literal, '?', 1, 2],
|
18
|
+
'[*]' => [1, :literal, :literal, '*', 1, 2],
|
19
|
+
'[+]' => [1, :literal, :literal, '+', 1, 2],
|
20
|
+
'[{]' => [1, :literal, :literal, '{', 1, 2],
|
21
|
+
'[}]' => [1, :literal, :literal, '}', 1, 2],
|
22
|
+
'[<]' => [1, :literal, :literal, '<', 1, 2],
|
23
|
+
'[>]' => [1, :literal, :literal, '>', 1, 2],
|
24
|
+
|
25
|
+
'[äöü]' => [2, :literal, :literal, 'ö', 3, 5],
|
26
|
+
|
27
|
+
'[\x20]' => [1, :escape, :hex, '\x20', 1, 5],
|
28
|
+
|
29
|
+
'[\.]' => [1, :escape, :dot, '\.', 1, 3],
|
30
|
+
'[\!]' => [1, :escape, :literal, '\!', 1, 3],
|
31
|
+
'[\#]' => [1, :escape, :literal, '\#', 1, 3],
|
32
|
+
'[\]]' => [1, :escape, :set_close, '\]', 1, 3],
|
33
|
+
'[\\\]' => [1, :escape, :backslash, '\\\\', 1, 3],
|
34
|
+
'[\A]' => [1, :escape, :literal, '\A', 1, 3],
|
35
|
+
'[\z]' => [1, :escape, :literal, '\z', 1, 3],
|
36
|
+
'[\g]' => [1, :escape, :literal, '\g', 1, 3],
|
37
|
+
'[\K]' => [1, :escape, :literal, '\K', 1, 3],
|
38
|
+
'[\c2]' => [1, :escape, :literal, '\c', 1, 3],
|
39
|
+
'[\B]' => [1, :escape, :literal, '\B', 1, 3],
|
40
|
+
'[a\-c]' => [2, :escape, :literal, '\-', 2, 4],
|
41
|
+
|
42
|
+
'[\d]' => [1, :type, :digit, '\d', 1, 3],
|
43
|
+
'[\da-z]' => [1, :type, :digit, '\d', 1, 3],
|
44
|
+
'[\D]' => [1, :type, :nondigit, '\D', 1, 3],
|
45
|
+
|
46
|
+
'[\h]' => [1, :type, :hex, '\h', 1, 3],
|
47
|
+
'[\H]' => [1, :type, :nonhex, '\H', 1, 3],
|
48
|
+
|
49
|
+
'[\s]' => [1, :type, :space, '\s', 1, 3],
|
50
|
+
'[\S]' => [1, :type, :nonspace, '\S', 1, 3],
|
51
|
+
|
52
|
+
'[\w]' => [1, :type, :word, '\w', 1, 3],
|
53
|
+
'[\W]' => [1, :type, :nonword, '\W', 1, 3],
|
54
|
+
|
55
|
+
'[\R]' => [1, :escape, :literal, '\R', 1, 3],
|
56
|
+
'[\X]' => [1, :escape, :literal, '\X', 1, 3],
|
57
|
+
|
58
|
+
'[a-b]' => [1, :literal, :literal, 'a', 1, 2],
|
59
|
+
'[a-c]' => [2, :set, :range, '-', 2, 3],
|
60
|
+
'[a-d]' => [3, :literal, :literal, 'd', 3, 4],
|
61
|
+
'[a-b-]' => [4, :literal, :literal, '-', 4, 6],
|
62
|
+
'[-a]' => [1, :literal, :literal, '-', 1, 2],
|
63
|
+
'[a-c^]' => [4, :literal, :literal, '^', 4, 5],
|
64
|
+
'[a-bd-f]' => [2, :set, :range, '-', 2, 3],
|
65
|
+
'[a-cd-f]' => [5, :set, :range, '-', 5, 6],
|
66
|
+
|
67
|
+
'[a[:digit:]c]' => [2, :posixclass, :digit, '[:digit:]', 2, 11],
|
68
|
+
'[[:digit:][:space:]]' => [2, :posixclass, :space, '[:space:]', 10, 19],
|
69
|
+
'[[:^digit:]]' => [1, :nonposixclass, :digit, '[:^digit:]', 1, 11],
|
56
70
|
|
57
71
|
'[a[.a-b.]c]' => [2, :set, :collation, '[.a-b.]', 2, 9],
|
58
72
|
'[a[=e=]c]' => [2, :set, :equivalent, '[=e=]', 2, 7],
|
59
73
|
|
60
|
-
'[a-d&&g-h]' => [
|
74
|
+
'[a-d&&g-h]' => [4, :set, :intersection, '&&', 4, 6],
|
75
|
+
'[a&&]' => [2, :set, :intersection, '&&', 2, 4],
|
76
|
+
'[&&z]' => [1, :set, :intersection, '&&', 1, 3],
|
61
77
|
|
62
|
-
'[\\x20-\\
|
78
|
+
'[\\x20-\\x27]' => [1, :escape, :hex, '\x20', 1, 5],
|
79
|
+
'[\\x20-\\x28]' => [2, :set, :range, '-', 5, 6],
|
80
|
+
'[\\x20-\\x29]' => [3, :escape, :hex, '\x29', 6, 10],
|
63
81
|
|
64
|
-
'[a\p{digit}c]' => [2, :
|
65
|
-
'[a\P{digit}c]' => [2, :
|
66
|
-
'[a\p{^digit}c]' => [2, :
|
67
|
-
'[a\P{^digit}c]' => [2, :
|
82
|
+
'[a\p{digit}c]' => [2, :property, :digit, '\p{digit}', 2, 11],
|
83
|
+
'[a\P{digit}c]' => [2, :nonproperty, :digit, '\P{digit}', 2, 11],
|
84
|
+
'[a\p{^digit}c]' => [2, :nonproperty, :digit, '\p{^digit}', 2, 12],
|
85
|
+
'[a\P{^digit}c]' => [2, :property, :digit, '\P{^digit}', 2, 12],
|
68
86
|
|
69
|
-
'[a\p{ALPHA}c]' => [2, :
|
70
|
-
'[a\p{P}c]' => [2, :
|
71
|
-
'[a\p{P}\P{
|
87
|
+
'[a\p{ALPHA}c]' => [2, :property, :alpha, '\p{ALPHA}', 2, 11],
|
88
|
+
'[a\p{P}c]' => [2, :property, :punctuation,'\p{P}', 2, 7],
|
89
|
+
'[a\p{P}\P{P}c]' => [3, :nonproperty, :punctuation,'\P{P}', 7, 12],
|
72
90
|
|
73
|
-
'[a-w&&[^c-g]z]' => [
|
74
|
-
'[a-w&&[^c-h]z]' => [
|
75
|
-
'[a-w&&[^c-i]z]' => [
|
76
|
-
'[a-w&&[^c-j]z]' => [
|
91
|
+
'[a-w&&[^c-g]z]' => [5, :set, :open, '[', 6, 7],
|
92
|
+
'[a-w&&[^c-h]z]' => [6, :set, :negate, '^', 7, 8],
|
93
|
+
'[a-w&&[^c-i]z]' => [8, :set, :range, '-', 9, 10],
|
94
|
+
'[a-w&&[^c-j]z]' => [10,:set, :close, ']', 11, 12],
|
77
95
|
}
|
78
96
|
|
79
97
|
tests.each_with_index do |(pattern, (index, type, token, text, ts, te)), count|
|
80
|
-
define_method "test_scanner_#{type}_#{token}_#{count}" do
|
98
|
+
define_method "test_scanner_#{type}_#{token}_in_'#{pattern}'_#{count}" do
|
81
99
|
tokens = RS.scan(pattern)
|
82
100
|
result = tokens.at(index)
|
83
101
|
|
@@ -89,4 +107,13 @@ class ScannerSets < Test::Unit::TestCase
|
|
89
107
|
end
|
90
108
|
end
|
91
109
|
|
110
|
+
def test_set_literal_encoding
|
111
|
+
text = RS.scan('[a]')[1][2].to_s
|
112
|
+
assert_equal 'a', text
|
113
|
+
assert_equal 'UTF-8', text.encoding.to_s
|
114
|
+
|
115
|
+
text = RS.scan('[😲]')[1][2].to_s
|
116
|
+
assert_equal '😲', text
|
117
|
+
assert_equal 'UTF-8', text.encoding.to_s
|
118
|
+
end
|
92
119
|
end
|