regexp_parser 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ChangeLog +12 -0
- data/README.md +1 -6
- data/lib/regexp_parser/expression/classes/property.rb +1 -0
- data/lib/regexp_parser/parser.rb +3 -0
- data/lib/regexp_parser/scanner.rb +234 -20
- data/lib/regexp_parser/scanner/property.rl +217 -3
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +109 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/test/parser/test_properties.rb +7 -0
- data/test/scanner/test_all.rb +1 -1
- data/test/scanner/test_unicode_blocks.rb +130 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 682c13ce2716430eea274098ce40b3e842b47eba
|
4
|
+
data.tar.gz: 15faf04abe034eb801292781bb2d1503f0b1df0d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5991a90cc872e72b4361692deb7d5370c70f5baafeed5afe3e90db4a95947bdaf7e072f77c6e2ca07eb839c79bcebd9022312bcbe3f8ccfc59b84b445d889c1c
|
7
|
+
data.tar.gz: 46d4cebfd6c904002da41db9cfc7747d42775c6e55d9e2853a4fca1a18652ef368e381057dd8a83fa59908c68f2f3c45e6bee19b0a3b8fa282036cfcdef90443
|
data/ChangeLog
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
Sun Aug 6 2015 Ammar Ali <ammarabuali@gmail.com>
|
2
|
+
|
3
|
+
* Added UnicodeBlocks support to the parser.
|
4
|
+
|
5
|
+
Mon Aug 3 2015 Garen Torikian <gjtorikian@gmail.com>
|
6
|
+
|
7
|
+
* Added UnicodeBlocks support to the scanner.
|
8
|
+
|
9
|
+
Sat Apr 18 14:38:12 2015 Ammar Ali <ammarabuali@gmail.com>
|
10
|
+
|
11
|
+
* Updated ruby versions for latest releases.
|
12
|
+
|
1
13
|
Wed Dec 3 05:21:27 2014 Ammar Ali <ammarabuali@gmail.com>
|
2
14
|
|
3
15
|
* Added expand_members method to CharacterSet, returns traditional
|
data/README.md
CHANGED
@@ -355,12 +355,7 @@ _Note that not all of these are available in all versions of Ruby_
|
|
355
355
|
|   _**General Categories**_ | `\p{Lu}`, `\P{Cs}` | ✓ |
|
356
356
|
|   _**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}` | ✓ |
|
357
357
|
|   _**Simple**_ | `\p{Dash}`, `\p{Extender}` | ✓ |
|
358
|
-
|
359
|
-
|
360
|
-
<br/>
|
361
|
-
##### Missing Features
|
362
|
-
|
363
|
-
- Unicode blocks, e.g. \p{InArrows}, \p{InArmenian}. _(h/t @gjtorikian for pointing it out)_
|
358
|
+
|   _**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}` | ✓ |
|
364
359
|
|
365
360
|
##### Inapplicable Features
|
366
361
|
|
@@ -103,6 +103,7 @@ module Regexp::Expression
|
|
103
103
|
class Age < UnicodeProperty::Base; end
|
104
104
|
class Derived < UnicodeProperty::Base; end
|
105
105
|
class Script < UnicodeProperty::Base; end
|
106
|
+
class Block < UnicodeProperty::Base; end
|
106
107
|
end
|
107
108
|
|
108
109
|
end # module Regexp::Expression
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -270,6 +270,9 @@ module Regexp::Parser
|
|
270
270
|
when *Regexp::Syntax::Token::UnicodeProperty::Script
|
271
271
|
@node << Script.new(token)
|
272
272
|
|
273
|
+
when *Regexp::Syntax::Token::UnicodeProperty::UnicodeBlock
|
274
|
+
@node << Block.new(token)
|
275
|
+
|
273
276
|
else
|
274
277
|
raise UnknownTokenError.new('UnicodeProperty', token)
|
275
278
|
end
|
@@ -346,9 +346,9 @@ self._re_scanner_indicies = [
|
|
346
346
|
40, 40, 40, 40, 40, 40, 40, 40,
|
347
347
|
40, 40, 40, 40, 40, 40, 40, 40,
|
348
348
|
41, 40, 40, 40, 40, 40, 40, 40,
|
349
|
-
40, 40, 40, 40, 40,
|
350
|
-
|
351
|
-
|
349
|
+
40, 40, 40, 40, 40, 41, 40, 40,
|
350
|
+
41, 41, 41, 41, 41, 41, 41, 41,
|
351
|
+
41, 41, 40, 40, 40, 40, 40, 40,
|
352
352
|
40, 42, 41, 41, 41, 41, 41, 41,
|
353
353
|
41, 41, 41, 41, 41, 41, 41, 41,
|
354
354
|
41, 41, 41, 41, 41, 41, 41, 41,
|
@@ -361,8 +361,8 @@ self._re_scanner_indicies = [
|
|
361
361
|
40, 40, 40, 40, 40, 40, 40, 40,
|
362
362
|
40, 40, 40, 41, 40, 40, 40, 40,
|
363
363
|
40, 40, 40, 40, 40, 40, 40, 40,
|
364
|
-
|
365
|
-
|
364
|
+
41, 40, 40, 41, 41, 41, 41, 41,
|
365
|
+
41, 41, 41, 41, 41, 40, 40, 40,
|
366
366
|
40, 40, 40, 40, 41, 41, 41, 41,
|
367
367
|
41, 41, 41, 41, 41, 41, 41, 41,
|
368
368
|
41, 41, 41, 41, 41, 41, 41, 41,
|
@@ -375,9 +375,9 @@ self._re_scanner_indicies = [
|
|
375
375
|
40, 40, 40, 40, 40, 40, 40, 40,
|
376
376
|
40, 40, 40, 40, 40, 40, 40, 40,
|
377
377
|
40, 41, 40, 40, 40, 40, 40, 40,
|
378
|
-
40, 40, 40, 40, 40, 40,
|
379
|
-
40,
|
380
|
-
|
378
|
+
40, 40, 40, 40, 40, 40, 41, 40,
|
379
|
+
40, 41, 41, 41, 41, 41, 41, 41,
|
380
|
+
41, 41, 41, 40, 40, 40, 40, 40,
|
381
381
|
40, 40, 41, 41, 41, 41, 41, 41,
|
382
382
|
45, 41, 41, 41, 41, 41, 41, 41,
|
383
383
|
41, 41, 41, 41, 41, 41, 41, 41,
|
@@ -390,9 +390,9 @@ self._re_scanner_indicies = [
|
|
390
390
|
40, 40, 40, 40, 40, 40, 40, 40,
|
391
391
|
40, 40, 40, 40, 40, 40, 40, 41,
|
392
392
|
40, 40, 40, 40, 40, 40, 40, 40,
|
393
|
-
40, 40, 40, 40,
|
394
|
-
|
395
|
-
|
393
|
+
40, 40, 40, 40, 41, 40, 40, 41,
|
394
|
+
41, 41, 41, 41, 41, 41, 41, 41,
|
395
|
+
41, 40, 40, 40, 40, 40, 40, 40,
|
396
396
|
41, 41, 41, 41, 46, 41, 41, 41,
|
397
397
|
41, 41, 41, 41, 41, 41, 41, 41,
|
398
398
|
41, 41, 41, 41, 41, 41, 41, 41,
|
@@ -405,8 +405,8 @@ self._re_scanner_indicies = [
|
|
405
405
|
40, 40, 40, 40, 40, 40, 40, 40,
|
406
406
|
40, 40, 40, 40, 40, 41, 40, 40,
|
407
407
|
40, 40, 40, 40, 40, 40, 40, 40,
|
408
|
-
40, 40,
|
409
|
-
|
408
|
+
40, 40, 41, 40, 40, 41, 41, 41,
|
409
|
+
41, 41, 41, 41, 41, 41, 41, 40,
|
410
410
|
40, 40, 47, 40, 40, 40, 41, 41,
|
411
411
|
41, 41, 41, 41, 41, 41, 41, 41,
|
412
412
|
41, 41, 41, 41, 41, 41, 41, 41,
|
@@ -424,9 +424,9 @@ self._re_scanner_indicies = [
|
|
424
424
|
40, 40, 40, 40, 40, 40, 40, 40,
|
425
425
|
40, 40, 40, 40, 40, 40, 40, 41,
|
426
426
|
40, 40, 40, 40, 40, 40, 40, 40,
|
427
|
-
40, 40, 40, 40,
|
428
|
-
|
429
|
-
|
427
|
+
40, 40, 40, 40, 41, 40, 40, 41,
|
428
|
+
41, 41, 41, 41, 41, 41, 41, 41,
|
429
|
+
41, 40, 40, 40, 40, 40, 40, 40,
|
430
430
|
42, 41, 41, 41, 41, 41, 41, 41,
|
431
431
|
41, 41, 41, 41, 41, 41, 41, 41,
|
432
432
|
41, 41, 41, 41, 41, 41, 41, 41,
|
@@ -2205,7 +2205,7 @@ te = p+1
|
|
2205
2205
|
self.emit(type, :script_tagalog, text, ts-1, te)
|
2206
2206
|
when 'thaa', 'thaana'
|
2207
2207
|
self.emit(type, :script_thaana, text, ts-1, te)
|
2208
|
-
when 'thai'
|
2208
|
+
when 'thai'
|
2209
2209
|
self.emit(type, :script_thai, text, ts-1, te)
|
2210
2210
|
when 'tibt', 'tibetan'
|
2211
2211
|
self.emit(type, :script_tibetan, text, ts-1, te)
|
@@ -2230,6 +2230,220 @@ te = p+1
|
|
2230
2230
|
when 'zzzz', 'unknown'
|
2231
2231
|
self.emit(type, :script_unknown, text, ts-1, te)
|
2232
2232
|
|
2233
|
+
# Unicode blocks
|
2234
|
+
when 'inalphabeticpresentationforms'
|
2235
|
+
self.emit(type, :block_inalphabetic_presentation_forms, text, ts-1, te)
|
2236
|
+
when 'inalphabeticpresentationforms'
|
2237
|
+
self.emit(type, :block_inalphabetic_presentation_forms, text, ts-1, te)
|
2238
|
+
when 'inarabicpresentationforms-a'
|
2239
|
+
self.emit(type, :block_inarabic_presentation_forms_a, text, ts-1, te)
|
2240
|
+
when 'inarabicpresentationforms-b'
|
2241
|
+
self.emit(type, :block_inarabic_presentation_forms_b, text, ts-1, te)
|
2242
|
+
when 'inarabic'
|
2243
|
+
self.emit(type, :block_inarabic, text, ts-1, te)
|
2244
|
+
when 'inarmenian'
|
2245
|
+
self.emit(type, :block_inarmenian, text, ts-1, te)
|
2246
|
+
when 'inarrows'
|
2247
|
+
self.emit(type, :block_inarrows, text, ts-1, te)
|
2248
|
+
when 'inbasiclatin'
|
2249
|
+
self.emit(type, :block_inbasic_latin, text, ts-1, te)
|
2250
|
+
when 'inbengali'
|
2251
|
+
self.emit(type, :block_inbengali, text, ts-1, te)
|
2252
|
+
when 'inblockelements'
|
2253
|
+
self.emit(type, :block_inblock_elements, text, ts-1, te)
|
2254
|
+
when 'inbopomofoextended'
|
2255
|
+
self.emit(type, :block_inbopomofo_extended, text, ts-1, te)
|
2256
|
+
when 'inbopomofo'
|
2257
|
+
self.emit(type, :block_inbopomofo, text, ts-1, te)
|
2258
|
+
when 'inboxdrawing'
|
2259
|
+
self.emit(type, :block_inbox_drawing, text, ts-1, te)
|
2260
|
+
when 'inbraillepatterns'
|
2261
|
+
self.emit(type, :block_inbraille_patterns, text, ts-1, te)
|
2262
|
+
when 'inbuhid'
|
2263
|
+
self.emit(type, :block_inbuhid, text, ts-1, te)
|
2264
|
+
when 'incjkcompatibilityforms'
|
2265
|
+
self.emit(type, :block_incjk_compatibility_forms, text, ts-1, te)
|
2266
|
+
when 'incjkcompatibilityideographs'
|
2267
|
+
self.emit(type, :block_incjk_compatibility_ideographs, text, ts-1, te)
|
2268
|
+
when 'incjkcompatibility'
|
2269
|
+
self.emit(type, :block_incjk_compatibility, text, ts-1, te)
|
2270
|
+
when 'incjkradicalssupplement'
|
2271
|
+
self.emit(type, :block_incjk_radicals_supplement, text, ts-1, te)
|
2272
|
+
when 'incjksymbolsandpunctuation'
|
2273
|
+
self.emit(type, :block_incjk_symbols_and_punctuation, text, ts-1, te)
|
2274
|
+
when 'incjkunifiedideographsextensiona'
|
2275
|
+
self.emit(type, :block_incjk_unified_ideographs_extension_a, text, ts-1, te)
|
2276
|
+
when 'incjkunifiedideographs'
|
2277
|
+
self.emit(type, :block_incjk_unified_ideographs, text, ts-1, te)
|
2278
|
+
when 'incherokee'
|
2279
|
+
self.emit(type, :block_incherokee, text, ts-1, te)
|
2280
|
+
when 'incombiningdiacriticalmarksforsymbols'
|
2281
|
+
self.emit(type, :block_incombining_diacritical_marks_for_symbols, text, ts-1, te)
|
2282
|
+
when 'incombiningdiacriticalmarks'
|
2283
|
+
self.emit(type, :block_incombining_diacritical_marks, text, ts-1, te)
|
2284
|
+
when 'incombininghalfmarks'
|
2285
|
+
self.emit(type, :block_incombining_half_marks, text, ts-1, te)
|
2286
|
+
when 'incontrolpictures'
|
2287
|
+
self.emit(type, :block_incontrol_pictures, text, ts-1, te)
|
2288
|
+
when 'incurrencysymbols'
|
2289
|
+
self.emit(type, :block_incurrency_symbols, text, ts-1, te)
|
2290
|
+
when 'incyrillicsupplementary'
|
2291
|
+
self.emit(type, :block_incyrillic_supplementary, text, ts-1, te)
|
2292
|
+
when 'incyrillic'
|
2293
|
+
self.emit(type, :block_incyrillic, text, ts-1, te)
|
2294
|
+
when 'indevanagari'
|
2295
|
+
self.emit(type, :block_indevanagari, text, ts-1, te)
|
2296
|
+
when 'indingbats'
|
2297
|
+
self.emit(type, :block_indingbats, text, ts-1, te)
|
2298
|
+
when 'inenclosedalphanumerics'
|
2299
|
+
self.emit(type, :block_inenclosed_alphanumerics, text, ts-1, te)
|
2300
|
+
when 'inenclosedcjklettersandmonths'
|
2301
|
+
self.emit(type, :block_inenclosed_cjk_letters_and_months, text, ts-1, te)
|
2302
|
+
when 'inethiopic'
|
2303
|
+
self.emit(type, :block_inethiopic, text, ts-1, te)
|
2304
|
+
when 'ingeneralpunctuation'
|
2305
|
+
self.emit(type, :block_ingeneral_punctuation, text, ts-1, te)
|
2306
|
+
when 'ingeometricshapes'
|
2307
|
+
self.emit(type, :block_ingeometric_shapes, text, ts-1, te)
|
2308
|
+
when 'ingeorgian'
|
2309
|
+
self.emit(type, :block_ingeorgian, text, ts-1, te)
|
2310
|
+
when 'ingreekextended'
|
2311
|
+
self.emit(type, :block_ingreek_extended, text, ts-1, te)
|
2312
|
+
when 'ingreekandcoptic'
|
2313
|
+
self.emit(type, :block_ingreek_and_coptic, text, ts-1, te)
|
2314
|
+
when 'ingujarati'
|
2315
|
+
self.emit(type, :block_ingujarati, text, ts-1, te)
|
2316
|
+
when 'ingurmukhi'
|
2317
|
+
self.emit(type, :block_ingurmukhi, text, ts-1, te)
|
2318
|
+
when 'inhalfwidthandfullwidthforms'
|
2319
|
+
self.emit(type, :block_inhalfwidth_and_fullwidth_forms, text, ts-1, te)
|
2320
|
+
when 'inhangulcompatibilityjamo'
|
2321
|
+
self.emit(type, :block_inhangul_compatibility_jamo, text, ts-1, te)
|
2322
|
+
when 'inhanguljamo'
|
2323
|
+
self.emit(type, :block_inhangul_jamo, text, ts-1, te)
|
2324
|
+
when 'inhangulsyllables'
|
2325
|
+
self.emit(type, :block_inhangul_syllables, text, ts-1, te)
|
2326
|
+
when 'inhanunoo'
|
2327
|
+
self.emit(type, :block_inhanunoo, text, ts-1, te)
|
2328
|
+
when 'inhebrew'
|
2329
|
+
self.emit(type, :block_inhebrew, text, ts-1, te)
|
2330
|
+
when 'inhighprivateusesurrogates'
|
2331
|
+
self.emit(type, :block_inhigh_private_use_surrogates, text, ts-1, te)
|
2332
|
+
when 'inhighsurrogates'
|
2333
|
+
self.emit(type, :block_inhigh_surrogates, text, ts-1, te)
|
2334
|
+
when 'inhiragana'
|
2335
|
+
self.emit(type, :block_inhiragana, text, ts-1, te)
|
2336
|
+
when 'inipaextensions'
|
2337
|
+
self.emit(type, :block_inipa_extensions, text, ts-1, te)
|
2338
|
+
when 'inideographicdescriptioncharacters'
|
2339
|
+
self.emit(type, :block_inideographic_description_characters, text, ts-1, te)
|
2340
|
+
when 'inkanbun'
|
2341
|
+
self.emit(type, :block_inkanbun, text, ts-1, te)
|
2342
|
+
when 'inkangxiradicals'
|
2343
|
+
self.emit(type, :block_inkangxi_radicals, text, ts-1, te)
|
2344
|
+
when 'inkannada'
|
2345
|
+
self.emit(type, :block_inkannada, text, ts-1, te)
|
2346
|
+
when 'inkatakanaphoneticextensions'
|
2347
|
+
self.emit(type, :block_inkatakana_phonetic_extensions, text, ts-1, te)
|
2348
|
+
when 'inkatakana'
|
2349
|
+
self.emit(type, :block_inkatakana, text, ts-1, te)
|
2350
|
+
when 'inkhmersymbols'
|
2351
|
+
self.emit(type, :block_inkhmer_symbols, text, ts-1, te)
|
2352
|
+
when 'inkhmer'
|
2353
|
+
self.emit(type, :block_inkhmer, text, ts-1, te)
|
2354
|
+
when 'inlao'
|
2355
|
+
self.emit(type, :block_inlao, text, ts-1, te)
|
2356
|
+
when 'inlatin-1supplement'
|
2357
|
+
self.emit(type, :block_inlatin_1_supplement, text, ts-1, te)
|
2358
|
+
when 'inlatinextended-a'
|
2359
|
+
self.emit(type, :block_inlatin_extended_a, text, ts-1, te)
|
2360
|
+
when 'inlatinextended-b'
|
2361
|
+
self.emit(type, :block_inlatin_extended_b, text, ts-1, te)
|
2362
|
+
when 'inlatinextendedadditional'
|
2363
|
+
self.emit(type, :block_inlatin_extended_additional, text, ts-1, te)
|
2364
|
+
when 'inletterlikesymbols'
|
2365
|
+
self.emit(type, :block_inletterlike_symbols, text, ts-1, te)
|
2366
|
+
when 'inlimbu'
|
2367
|
+
self.emit(type, :block_inlimbu, text, ts-1, te)
|
2368
|
+
when 'inlowsurrogates'
|
2369
|
+
self.emit(type, :block_inlow_surrogates, text, ts-1, te)
|
2370
|
+
when 'inmalayalam'
|
2371
|
+
self.emit(type, :block_inmalayalam, text, ts-1, te)
|
2372
|
+
when 'inmathematicaloperators'
|
2373
|
+
self.emit(type, :block_inmathematical_operators, text, ts-1, te)
|
2374
|
+
when 'inmiscellaneousmathematicalsymbols-a'
|
2375
|
+
self.emit(type, :block_inmiscellaneous_mathematical_symbols_a, text, ts-1, te)
|
2376
|
+
when 'inmiscellaneousmathematicalsymbols-b'
|
2377
|
+
self.emit(type, :block_inmiscellaneous_mathematical_symbols_b, text, ts-1, te)
|
2378
|
+
when 'inmiscellaneoussymbolsandarrows'
|
2379
|
+
self.emit(type, :block_inmiscellaneous_symbols_and_arrows, text, ts-1, te)
|
2380
|
+
when 'inmiscellaneoussymbols'
|
2381
|
+
self.emit(type, :block_inmiscellaneous_symbols, text, ts-1, te)
|
2382
|
+
when 'inmiscellaneoustechnical'
|
2383
|
+
self.emit(type, :block_inmiscellaneous_technical, text, ts-1, te)
|
2384
|
+
when 'inmongolian'
|
2385
|
+
self.emit(type, :block_inmongolian, text, ts-1, te)
|
2386
|
+
when 'inmyanmar'
|
2387
|
+
self.emit(type, :block_inmyanmar, text, ts-1, te)
|
2388
|
+
when 'innumberforms'
|
2389
|
+
self.emit(type, :block_innumber_forms, text, ts-1, te)
|
2390
|
+
when 'inogham'
|
2391
|
+
self.emit(type, :block_inogham, text, ts-1, te)
|
2392
|
+
when 'inopticalcharacterrecognition'
|
2393
|
+
self.emit(type, :block_inoptical_character_recognition, text, ts-1, te)
|
2394
|
+
when 'inoriya'
|
2395
|
+
self.emit(type, :block_inoriya, text, ts-1, te)
|
2396
|
+
when 'inphoneticextensions'
|
2397
|
+
self.emit(type, :block_inphonetic_extensions, text, ts-1, te)
|
2398
|
+
when 'inprivateusearea'
|
2399
|
+
self.emit(type, :block_inprivate_use_area, text, ts-1, te)
|
2400
|
+
when 'inrunic'
|
2401
|
+
self.emit(type, :block_inrunic, text, ts-1, te)
|
2402
|
+
when 'insinhala'
|
2403
|
+
self.emit(type, :block_insinhala, text, ts-1, te)
|
2404
|
+
when 'insmallformvariants'
|
2405
|
+
self.emit(type, :block_insmall_form_variants, text, ts-1, te)
|
2406
|
+
when 'inspacingmodifierletters'
|
2407
|
+
self.emit(type, :block_inspacing_modifier_letters, text, ts-1, te)
|
2408
|
+
when 'inspecials'
|
2409
|
+
self.emit(type, :block_inspecials, text, ts-1, te)
|
2410
|
+
when 'insuperscriptsandsubscripts'
|
2411
|
+
self.emit(type, :block_insuperscripts_and_subscripts, text, ts-1, te)
|
2412
|
+
when 'insupplementalarrows-a'
|
2413
|
+
self.emit(type, :block_insupplemental_arrows_a, text, ts-1, te)
|
2414
|
+
when 'insupplementalarrows-b'
|
2415
|
+
self.emit(type, :block_insupplemental_arrows_b, text, ts-1, te)
|
2416
|
+
when 'insupplementalmathematicaloperators'
|
2417
|
+
self.emit(type, :block_insupplemental_mathematical_operators, text, ts-1, te)
|
2418
|
+
when 'insyriac'
|
2419
|
+
self.emit(type, :block_insyriac, text, ts-1, te)
|
2420
|
+
when 'intagalog'
|
2421
|
+
self.emit(type, :block_intagalog, text, ts-1, te)
|
2422
|
+
when 'intagbanwa'
|
2423
|
+
self.emit(type, :block_intagbanwa, text, ts-1, te)
|
2424
|
+
when 'intaile'
|
2425
|
+
self.emit(type, :block_intai_le, text, ts-1, te)
|
2426
|
+
when 'intamil'
|
2427
|
+
self.emit(type, :block_intamil, text, ts-1, te)
|
2428
|
+
when 'intelugu'
|
2429
|
+
self.emit(type, :block_intelugu, text, ts-1, te)
|
2430
|
+
when 'inthaana'
|
2431
|
+
self.emit(type, :block_inthaana, text, ts-1, te)
|
2432
|
+
when 'inthai'
|
2433
|
+
self.emit(type, :block_inthai, text, ts-1, te)
|
2434
|
+
when 'intibetan'
|
2435
|
+
self.emit(type, :block_intibetan, text, ts-1, te)
|
2436
|
+
when 'inunifiedcanadianaboriginalsyllabics'
|
2437
|
+
self.emit(type, :block_inunified_canadian_aboriginal_syllabics, text, ts-1, te)
|
2438
|
+
when 'invariationselectors'
|
2439
|
+
self.emit(type, :block_invariation_selectors, text, ts-1, te)
|
2440
|
+
when 'inyiradicals'
|
2441
|
+
self.emit(type, :block_inyi_radicals, text, ts-1, te)
|
2442
|
+
when 'inyisyllables'
|
2443
|
+
self.emit(type, :block_inyi_syllables, text, ts-1, te)
|
2444
|
+
when 'inyijinghexagramsymbols'
|
2445
|
+
self.emit(type, :block_inyijing_hexagram_symbols, text, ts-1, te)
|
2446
|
+
|
2233
2447
|
else
|
2234
2448
|
# Should this really be an error? Or would emitting
|
2235
2449
|
# an :unknown for the property be better?
|
@@ -3791,7 +4005,7 @@ te = p+1
|
|
3791
4005
|
# line 764 "/Users/ammar/src/ruby/projects/regexp_parser/lib/regexp_parser/scanner/scanner.rl"
|
3792
4006
|
begin
|
3793
4007
|
act = 59; end
|
3794
|
-
# line
|
4008
|
+
# line 4009 "/Users/ammar/src/ruby/projects/regexp_parser/lib/regexp_parser/scanner.rb"
|
3795
4009
|
end
|
3796
4010
|
end
|
3797
4011
|
end
|
@@ -3809,7 +4023,7 @@ ts = nil; end
|
|
3809
4023
|
begin
|
3810
4024
|
act = 0
|
3811
4025
|
end
|
3812
|
-
# line
|
4026
|
+
# line 4027 "/Users/ammar/src/ruby/projects/regexp_parser/lib/regexp_parser/scanner.rb"
|
3813
4027
|
end
|
3814
4028
|
|
3815
4029
|
if cs == 0
|
@@ -3843,7 +4057,7 @@ act = 0
|
|
3843
4057
|
text = ts ? copy(data, ts-1..-1) : data.pack('c*')
|
3844
4058
|
raise PrematureEndError.new( text )
|
3845
4059
|
end
|
3846
|
-
# line
|
4060
|
+
# line 4061 "/Users/ammar/src/ruby/projects/regexp_parser/lib/regexp_parser/scanner.rb"
|
3847
4061
|
end
|
3848
4062
|
end
|
3849
4063
|
|
@@ -33,7 +33,7 @@
|
|
33
33
|
'id_start'i | 'id_continue'i |
|
34
34
|
'xid_start'i | 'xid_continue'i |
|
35
35
|
'grapheme_base'i | 'grapheme_extend'i |
|
36
|
-
'default_ignorable_code_point'i;
|
36
|
+
'default_ignorable_code_point'i;
|
37
37
|
|
38
38
|
property_age = 'age=1.1'i | 'age=2.0'i | 'age=2.1'i |
|
39
39
|
'age=3.0'i | 'age=3.1'i | 'age=3.2'i |
|
@@ -42,7 +42,7 @@
|
|
42
42
|
'age=6.1'i | 'age=6.2'i | 'age=6.3'i |
|
43
43
|
'age=7.0'i;
|
44
44
|
|
45
|
-
property_script = (
|
45
|
+
property_script = (alnum | space | '_' | '-')+; # everything else
|
46
46
|
|
47
47
|
property_sequence = property_char . '{' . '^'? (
|
48
48
|
property_name | general_category |
|
@@ -553,7 +553,7 @@
|
|
553
553
|
self.emit(type, :script_tagalog, text, ts-1, te)
|
554
554
|
when 'thaa', 'thaana'
|
555
555
|
self.emit(type, :script_thaana, text, ts-1, te)
|
556
|
-
when 'thai'
|
556
|
+
when 'thai'
|
557
557
|
self.emit(type, :script_thai, text, ts-1, te)
|
558
558
|
when 'tibt', 'tibetan'
|
559
559
|
self.emit(type, :script_tibetan, text, ts-1, te)
|
@@ -578,6 +578,220 @@
|
|
578
578
|
when 'zzzz', 'unknown'
|
579
579
|
self.emit(type, :script_unknown, text, ts-1, te)
|
580
580
|
|
581
|
+
# Unicode blocks
|
582
|
+
when 'inalphabeticpresentationforms'
|
583
|
+
self.emit(type, :block_inalphabetic_presentation_forms, text, ts-1, te)
|
584
|
+
when 'inalphabeticpresentationforms'
|
585
|
+
self.emit(type, :block_inalphabetic_presentation_forms, text, ts-1, te)
|
586
|
+
when 'inarabicpresentationforms-a'
|
587
|
+
self.emit(type, :block_inarabic_presentation_forms_a, text, ts-1, te)
|
588
|
+
when 'inarabicpresentationforms-b'
|
589
|
+
self.emit(type, :block_inarabic_presentation_forms_b, text, ts-1, te)
|
590
|
+
when 'inarabic'
|
591
|
+
self.emit(type, :block_inarabic, text, ts-1, te)
|
592
|
+
when 'inarmenian'
|
593
|
+
self.emit(type, :block_inarmenian, text, ts-1, te)
|
594
|
+
when 'inarrows'
|
595
|
+
self.emit(type, :block_inarrows, text, ts-1, te)
|
596
|
+
when 'inbasiclatin'
|
597
|
+
self.emit(type, :block_inbasic_latin, text, ts-1, te)
|
598
|
+
when 'inbengali'
|
599
|
+
self.emit(type, :block_inbengali, text, ts-1, te)
|
600
|
+
when 'inblockelements'
|
601
|
+
self.emit(type, :block_inblock_elements, text, ts-1, te)
|
602
|
+
when 'inbopomofoextended'
|
603
|
+
self.emit(type, :block_inbopomofo_extended, text, ts-1, te)
|
604
|
+
when 'inbopomofo'
|
605
|
+
self.emit(type, :block_inbopomofo, text, ts-1, te)
|
606
|
+
when 'inboxdrawing'
|
607
|
+
self.emit(type, :block_inbox_drawing, text, ts-1, te)
|
608
|
+
when 'inbraillepatterns'
|
609
|
+
self.emit(type, :block_inbraille_patterns, text, ts-1, te)
|
610
|
+
when 'inbuhid'
|
611
|
+
self.emit(type, :block_inbuhid, text, ts-1, te)
|
612
|
+
when 'incjkcompatibilityforms'
|
613
|
+
self.emit(type, :block_incjk_compatibility_forms, text, ts-1, te)
|
614
|
+
when 'incjkcompatibilityideographs'
|
615
|
+
self.emit(type, :block_incjk_compatibility_ideographs, text, ts-1, te)
|
616
|
+
when 'incjkcompatibility'
|
617
|
+
self.emit(type, :block_incjk_compatibility, text, ts-1, te)
|
618
|
+
when 'incjkradicalssupplement'
|
619
|
+
self.emit(type, :block_incjk_radicals_supplement, text, ts-1, te)
|
620
|
+
when 'incjksymbolsandpunctuation'
|
621
|
+
self.emit(type, :block_incjk_symbols_and_punctuation, text, ts-1, te)
|
622
|
+
when 'incjkunifiedideographsextensiona'
|
623
|
+
self.emit(type, :block_incjk_unified_ideographs_extension_a, text, ts-1, te)
|
624
|
+
when 'incjkunifiedideographs'
|
625
|
+
self.emit(type, :block_incjk_unified_ideographs, text, ts-1, te)
|
626
|
+
when 'incherokee'
|
627
|
+
self.emit(type, :block_incherokee, text, ts-1, te)
|
628
|
+
when 'incombiningdiacriticalmarksforsymbols'
|
629
|
+
self.emit(type, :block_incombining_diacritical_marks_for_symbols, text, ts-1, te)
|
630
|
+
when 'incombiningdiacriticalmarks'
|
631
|
+
self.emit(type, :block_incombining_diacritical_marks, text, ts-1, te)
|
632
|
+
when 'incombininghalfmarks'
|
633
|
+
self.emit(type, :block_incombining_half_marks, text, ts-1, te)
|
634
|
+
when 'incontrolpictures'
|
635
|
+
self.emit(type, :block_incontrol_pictures, text, ts-1, te)
|
636
|
+
when 'incurrencysymbols'
|
637
|
+
self.emit(type, :block_incurrency_symbols, text, ts-1, te)
|
638
|
+
when 'incyrillicsupplementary'
|
639
|
+
self.emit(type, :block_incyrillic_supplementary, text, ts-1, te)
|
640
|
+
when 'incyrillic'
|
641
|
+
self.emit(type, :block_incyrillic, text, ts-1, te)
|
642
|
+
when 'indevanagari'
|
643
|
+
self.emit(type, :block_indevanagari, text, ts-1, te)
|
644
|
+
when 'indingbats'
|
645
|
+
self.emit(type, :block_indingbats, text, ts-1, te)
|
646
|
+
when 'inenclosedalphanumerics'
|
647
|
+
self.emit(type, :block_inenclosed_alphanumerics, text, ts-1, te)
|
648
|
+
when 'inenclosedcjklettersandmonths'
|
649
|
+
self.emit(type, :block_inenclosed_cjk_letters_and_months, text, ts-1, te)
|
650
|
+
when 'inethiopic'
|
651
|
+
self.emit(type, :block_inethiopic, text, ts-1, te)
|
652
|
+
when 'ingeneralpunctuation'
|
653
|
+
self.emit(type, :block_ingeneral_punctuation, text, ts-1, te)
|
654
|
+
when 'ingeometricshapes'
|
655
|
+
self.emit(type, :block_ingeometric_shapes, text, ts-1, te)
|
656
|
+
when 'ingeorgian'
|
657
|
+
self.emit(type, :block_ingeorgian, text, ts-1, te)
|
658
|
+
when 'ingreekextended'
|
659
|
+
self.emit(type, :block_ingreek_extended, text, ts-1, te)
|
660
|
+
when 'ingreekandcoptic'
|
661
|
+
self.emit(type, :block_ingreek_and_coptic, text, ts-1, te)
|
662
|
+
when 'ingujarati'
|
663
|
+
self.emit(type, :block_ingujarati, text, ts-1, te)
|
664
|
+
when 'ingurmukhi'
|
665
|
+
self.emit(type, :block_ingurmukhi, text, ts-1, te)
|
666
|
+
when 'inhalfwidthandfullwidthforms'
|
667
|
+
self.emit(type, :block_inhalfwidth_and_fullwidth_forms, text, ts-1, te)
|
668
|
+
when 'inhangulcompatibilityjamo'
|
669
|
+
self.emit(type, :block_inhangul_compatibility_jamo, text, ts-1, te)
|
670
|
+
when 'inhanguljamo'
|
671
|
+
self.emit(type, :block_inhangul_jamo, text, ts-1, te)
|
672
|
+
when 'inhangulsyllables'
|
673
|
+
self.emit(type, :block_inhangul_syllables, text, ts-1, te)
|
674
|
+
when 'inhanunoo'
|
675
|
+
self.emit(type, :block_inhanunoo, text, ts-1, te)
|
676
|
+
when 'inhebrew'
|
677
|
+
self.emit(type, :block_inhebrew, text, ts-1, te)
|
678
|
+
when 'inhighprivateusesurrogates'
|
679
|
+
self.emit(type, :block_inhigh_private_use_surrogates, text, ts-1, te)
|
680
|
+
when 'inhighsurrogates'
|
681
|
+
self.emit(type, :block_inhigh_surrogates, text, ts-1, te)
|
682
|
+
when 'inhiragana'
|
683
|
+
self.emit(type, :block_inhiragana, text, ts-1, te)
|
684
|
+
when 'inipaextensions'
|
685
|
+
self.emit(type, :block_inipa_extensions, text, ts-1, te)
|
686
|
+
when 'inideographicdescriptioncharacters'
|
687
|
+
self.emit(type, :block_inideographic_description_characters, text, ts-1, te)
|
688
|
+
when 'inkanbun'
|
689
|
+
self.emit(type, :block_inkanbun, text, ts-1, te)
|
690
|
+
when 'inkangxiradicals'
|
691
|
+
self.emit(type, :block_inkangxi_radicals, text, ts-1, te)
|
692
|
+
when 'inkannada'
|
693
|
+
self.emit(type, :block_inkannada, text, ts-1, te)
|
694
|
+
when 'inkatakanaphoneticextensions'
|
695
|
+
self.emit(type, :block_inkatakana_phonetic_extensions, text, ts-1, te)
|
696
|
+
when 'inkatakana'
|
697
|
+
self.emit(type, :block_inkatakana, text, ts-1, te)
|
698
|
+
when 'inkhmersymbols'
|
699
|
+
self.emit(type, :block_inkhmer_symbols, text, ts-1, te)
|
700
|
+
when 'inkhmer'
|
701
|
+
self.emit(type, :block_inkhmer, text, ts-1, te)
|
702
|
+
when 'inlao'
|
703
|
+
self.emit(type, :block_inlao, text, ts-1, te)
|
704
|
+
when 'inlatin-1supplement'
|
705
|
+
self.emit(type, :block_inlatin_1_supplement, text, ts-1, te)
|
706
|
+
when 'inlatinextended-a'
|
707
|
+
self.emit(type, :block_inlatin_extended_a, text, ts-1, te)
|
708
|
+
when 'inlatinextended-b'
|
709
|
+
self.emit(type, :block_inlatin_extended_b, text, ts-1, te)
|
710
|
+
when 'inlatinextendedadditional'
|
711
|
+
self.emit(type, :block_inlatin_extended_additional, text, ts-1, te)
|
712
|
+
when 'inletterlikesymbols'
|
713
|
+
self.emit(type, :block_inletterlike_symbols, text, ts-1, te)
|
714
|
+
when 'inlimbu'
|
715
|
+
self.emit(type, :block_inlimbu, text, ts-1, te)
|
716
|
+
when 'inlowsurrogates'
|
717
|
+
self.emit(type, :block_inlow_surrogates, text, ts-1, te)
|
718
|
+
when 'inmalayalam'
|
719
|
+
self.emit(type, :block_inmalayalam, text, ts-1, te)
|
720
|
+
when 'inmathematicaloperators'
|
721
|
+
self.emit(type, :block_inmathematical_operators, text, ts-1, te)
|
722
|
+
when 'inmiscellaneousmathematicalsymbols-a'
|
723
|
+
self.emit(type, :block_inmiscellaneous_mathematical_symbols_a, text, ts-1, te)
|
724
|
+
when 'inmiscellaneousmathematicalsymbols-b'
|
725
|
+
self.emit(type, :block_inmiscellaneous_mathematical_symbols_b, text, ts-1, te)
|
726
|
+
when 'inmiscellaneoussymbolsandarrows'
|
727
|
+
self.emit(type, :block_inmiscellaneous_symbols_and_arrows, text, ts-1, te)
|
728
|
+
when 'inmiscellaneoussymbols'
|
729
|
+
self.emit(type, :block_inmiscellaneous_symbols, text, ts-1, te)
|
730
|
+
when 'inmiscellaneoustechnical'
|
731
|
+
self.emit(type, :block_inmiscellaneous_technical, text, ts-1, te)
|
732
|
+
when 'inmongolian'
|
733
|
+
self.emit(type, :block_inmongolian, text, ts-1, te)
|
734
|
+
when 'inmyanmar'
|
735
|
+
self.emit(type, :block_inmyanmar, text, ts-1, te)
|
736
|
+
when 'innumberforms'
|
737
|
+
self.emit(type, :block_innumber_forms, text, ts-1, te)
|
738
|
+
when 'inogham'
|
739
|
+
self.emit(type, :block_inogham, text, ts-1, te)
|
740
|
+
when 'inopticalcharacterrecognition'
|
741
|
+
self.emit(type, :block_inoptical_character_recognition, text, ts-1, te)
|
742
|
+
when 'inoriya'
|
743
|
+
self.emit(type, :block_inoriya, text, ts-1, te)
|
744
|
+
when 'inphoneticextensions'
|
745
|
+
self.emit(type, :block_inphonetic_extensions, text, ts-1, te)
|
746
|
+
when 'inprivateusearea'
|
747
|
+
self.emit(type, :block_inprivate_use_area, text, ts-1, te)
|
748
|
+
when 'inrunic'
|
749
|
+
self.emit(type, :block_inrunic, text, ts-1, te)
|
750
|
+
when 'insinhala'
|
751
|
+
self.emit(type, :block_insinhala, text, ts-1, te)
|
752
|
+
when 'insmallformvariants'
|
753
|
+
self.emit(type, :block_insmall_form_variants, text, ts-1, te)
|
754
|
+
when 'inspacingmodifierletters'
|
755
|
+
self.emit(type, :block_inspacing_modifier_letters, text, ts-1, te)
|
756
|
+
when 'inspecials'
|
757
|
+
self.emit(type, :block_inspecials, text, ts-1, te)
|
758
|
+
when 'insuperscriptsandsubscripts'
|
759
|
+
self.emit(type, :block_insuperscripts_and_subscripts, text, ts-1, te)
|
760
|
+
when 'insupplementalarrows-a'
|
761
|
+
self.emit(type, :block_insupplemental_arrows_a, text, ts-1, te)
|
762
|
+
when 'insupplementalarrows-b'
|
763
|
+
self.emit(type, :block_insupplemental_arrows_b, text, ts-1, te)
|
764
|
+
when 'insupplementalmathematicaloperators'
|
765
|
+
self.emit(type, :block_insupplemental_mathematical_operators, text, ts-1, te)
|
766
|
+
when 'insyriac'
|
767
|
+
self.emit(type, :block_insyriac, text, ts-1, te)
|
768
|
+
when 'intagalog'
|
769
|
+
self.emit(type, :block_intagalog, text, ts-1, te)
|
770
|
+
when 'intagbanwa'
|
771
|
+
self.emit(type, :block_intagbanwa, text, ts-1, te)
|
772
|
+
when 'intaile'
|
773
|
+
self.emit(type, :block_intai_le, text, ts-1, te)
|
774
|
+
when 'intamil'
|
775
|
+
self.emit(type, :block_intamil, text, ts-1, te)
|
776
|
+
when 'intelugu'
|
777
|
+
self.emit(type, :block_intelugu, text, ts-1, te)
|
778
|
+
when 'inthaana'
|
779
|
+
self.emit(type, :block_inthaana, text, ts-1, te)
|
780
|
+
when 'inthai'
|
781
|
+
self.emit(type, :block_inthai, text, ts-1, te)
|
782
|
+
when 'intibetan'
|
783
|
+
self.emit(type, :block_intibetan, text, ts-1, te)
|
784
|
+
when 'inunifiedcanadianaboriginalsyllabics'
|
785
|
+
self.emit(type, :block_inunified_canadian_aboriginal_syllabics, text, ts-1, te)
|
786
|
+
when 'invariationselectors'
|
787
|
+
self.emit(type, :block_invariation_selectors, text, ts-1, te)
|
788
|
+
when 'inyiradicals'
|
789
|
+
self.emit(type, :block_inyi_radicals, text, ts-1, te)
|
790
|
+
when 'inyisyllables'
|
791
|
+
self.emit(type, :block_inyi_syllables, text, ts-1, te)
|
792
|
+
when 'inyijinghexagramsymbols'
|
793
|
+
self.emit(type, :block_inyijing_hexagram_symbols, text, ts-1, te)
|
794
|
+
|
581
795
|
else
|
582
796
|
# Should this really be an error? Or would emitting
|
583
797
|
# an :unknown for the property be better?
|
@@ -225,7 +225,115 @@ module Regexp::Syntax
|
|
225
225
|
:script_warang_citi
|
226
226
|
]
|
227
227
|
|
228
|
-
|
228
|
+
UnicodeBlock = [
|
229
|
+
:block_inalphabetic_presentation_forms,
|
230
|
+
:block_inarabic_presentation_forms_a,
|
231
|
+
:block_inarabic_presentation_forms_b,
|
232
|
+
:block_inarabic,
|
233
|
+
:block_inarmenian,
|
234
|
+
:block_inarrows,
|
235
|
+
:block_inbasic_latin,
|
236
|
+
:block_inbengali,
|
237
|
+
:block_inblock_elements,
|
238
|
+
:block_inbopomofo_extended,
|
239
|
+
:block_inbopomofo,
|
240
|
+
:block_inbox_drawing,
|
241
|
+
:block_inbraille_patterns,
|
242
|
+
:block_inbuhid,
|
243
|
+
:block_incjk_compatibility_forms,
|
244
|
+
:block_incjk_compatibility_ideographs,
|
245
|
+
:block_incjk_compatibility,
|
246
|
+
:block_incjk_radicals_supplement,
|
247
|
+
:block_incjk_symbols_and_punctuation,
|
248
|
+
:block_incjk_unified_ideographs_extension_a,
|
249
|
+
:block_incjk_unified_ideographs,
|
250
|
+
:block_incherokee,
|
251
|
+
:block_incombining_diacritical_marks_for_symbols,
|
252
|
+
:block_incombining_diacritical_marks,
|
253
|
+
:block_incombining_half_marks,
|
254
|
+
:block_incontrol_pictures,
|
255
|
+
:block_incurrency_symbols,
|
256
|
+
:block_incyrillic_supplementary,
|
257
|
+
:block_incyrillic,
|
258
|
+
:block_indevanagari,
|
259
|
+
:block_indingbats,
|
260
|
+
:block_inenclosed_alphanumerics,
|
261
|
+
:block_inenclosed_cjk_letters_and_months,
|
262
|
+
:block_inethiopic,
|
263
|
+
:block_ingeneral_punctuation,
|
264
|
+
:block_ingeometric_shapes,
|
265
|
+
:block_ingeorgian,
|
266
|
+
:block_ingreek_extended,
|
267
|
+
:block_ingreek_and_coptic,
|
268
|
+
:block_ingujarati,
|
269
|
+
:block_ingurmukhi,
|
270
|
+
:block_inhalfwidth_and_fullwidth_forms,
|
271
|
+
:block_inhangul_compatibility_jamo,
|
272
|
+
:block_inhangul_jamo,
|
273
|
+
:block_inhangul_syllables,
|
274
|
+
:block_inhanunoo,
|
275
|
+
:block_inhebrew,
|
276
|
+
:block_inhigh_private_use_surrogates,
|
277
|
+
:block_inhigh_surrogates,
|
278
|
+
:block_inhiragana,
|
279
|
+
:block_inipa_extensions,
|
280
|
+
:block_inideographic_description_characters,
|
281
|
+
:block_inkanbun,
|
282
|
+
:block_inkangxi_radicals,
|
283
|
+
:block_inkannada,
|
284
|
+
:block_inkatakana_phonetic_extensions,
|
285
|
+
:block_inkatakana,
|
286
|
+
:block_inkhmer_symbols,
|
287
|
+
:block_inkhmer,
|
288
|
+
:block_inlao,
|
289
|
+
:block_inlatin_1_supplement,
|
290
|
+
:block_inlatin_extended_a,
|
291
|
+
:block_inlatin_extended_b,
|
292
|
+
:block_inlatin_extended_additional,
|
293
|
+
:block_inletterlike_symbols,
|
294
|
+
:block_inlimbu,
|
295
|
+
:block_inlow_surrogates,
|
296
|
+
:block_inmalayalam,
|
297
|
+
:block_inmathematical_operators,
|
298
|
+
:block_inmiscellaneous_mathematical_symbols_a,
|
299
|
+
:block_inmiscellaneous_mathematical_symbols_b,
|
300
|
+
:block_inmiscellaneous_symbols_and_arrows,
|
301
|
+
:block_inmiscellaneous_symbols,
|
302
|
+
:block_inmiscellaneous_technical,
|
303
|
+
:block_inmongolian,
|
304
|
+
:block_inmyanmar,
|
305
|
+
:block_innumber_forms,
|
306
|
+
:block_inogham,
|
307
|
+
:block_inoptical_character_recognition,
|
308
|
+
:block_inoriya,
|
309
|
+
:block_inphonetic_extensions,
|
310
|
+
:block_inprivate_use_area,
|
311
|
+
:block_inrunic,
|
312
|
+
:block_insinhala,
|
313
|
+
:block_insmall_form_variants,
|
314
|
+
:block_inspacing_modifier_letters,
|
315
|
+
:block_inspecials,
|
316
|
+
:block_insuperscripts_and_subscripts,
|
317
|
+
:block_insupplemental_arrows_a,
|
318
|
+
:block_insupplemental_arrows_b,
|
319
|
+
:block_insupplemental_mathematical_operators,
|
320
|
+
:block_insyriac,
|
321
|
+
:block_intagalog,
|
322
|
+
:block_intagbanwa,
|
323
|
+
:block_intai_le,
|
324
|
+
:block_intamil,
|
325
|
+
:block_intelugu,
|
326
|
+
:block_inthaana,
|
327
|
+
:block_inthai,
|
328
|
+
:block_intibetan,
|
329
|
+
:block_inunified_canadian_aboriginal_syllabics,
|
330
|
+
:block_invariation_selectors,
|
331
|
+
:block_inyi_radicals,
|
332
|
+
:block_inyi_syllables,
|
333
|
+
:block_inyijing_hexagram_symbols,
|
334
|
+
]
|
335
|
+
|
336
|
+
V190 = CharType + POSIX + Category::All + Derived + Script + UnicodeBlock
|
229
337
|
V193 = Age_V193 + Script_6_0
|
230
338
|
|
231
339
|
V200 = Age_V200
|
@@ -337,6 +337,13 @@ class ParserProperties < Test::Unit::TestCase
|
|
337
337
|
"Expected Script property, but got #{t.expressions[1].class.name}")
|
338
338
|
end
|
339
339
|
|
340
|
+
def test_parse_property_block
|
341
|
+
t = RP.parse 'ab\p{InArmenian}cd', 'ruby/1.9'
|
342
|
+
|
343
|
+
assert( t.expressions[1].is_a?(UnicodeProperty::Block),
|
344
|
+
"Expected Block property, but got #{t.expressions[1].class.name}")
|
345
|
+
end
|
346
|
+
|
340
347
|
def test_parse_property_following_literal
|
341
348
|
t = RP.parse 'ab\p{Lu}cd', 'ruby/1.9'
|
342
349
|
|
data/test/scanner/test_all.rb
CHANGED
@@ -2,7 +2,7 @@ require File.expand_path("../../helpers", __FILE__)
|
|
2
2
|
|
3
3
|
%w{
|
4
4
|
anchors errors escapes free_space groups literals
|
5
|
-
meta properties quantifiers scripts sets types
|
5
|
+
meta properties quantifiers scripts sets types unicode_blocks
|
6
6
|
}.each do|tc|
|
7
7
|
require File.expand_path("../test_#{tc}", __FILE__)
|
8
8
|
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
require File.expand_path("../../helpers", __FILE__)
|
2
|
+
|
3
|
+
class ScannerUnicodeBlocks < Test::Unit::TestCase
|
4
|
+
|
5
|
+
tests = {
|
6
|
+
'InAlphabetic_Presentation_Forms' => :block_inalphabetic_presentation_forms,
|
7
|
+
'InArabic_Presentation_Forms-A' => :block_inarabic_presentation_forms_a,
|
8
|
+
'InArabic_Presentation_Forms-B' => :block_inarabic_presentation_forms_b,
|
9
|
+
'InArabic' => :block_inarabic,
|
10
|
+
'InArmenian' => :block_inarmenian,
|
11
|
+
'InArrows' => :block_inarrows,
|
12
|
+
'InBasic_Latin' => :block_inbasic_latin,
|
13
|
+
'InBengali' => :block_inbengali,
|
14
|
+
'InBlock_Elements' => :block_inblock_elements,
|
15
|
+
'InBopomofo_Extended' => :block_inbopomofo_extended,
|
16
|
+
'InBopomofo' => :block_inbopomofo,
|
17
|
+
'InBox_Drawing' => :block_inbox_drawing,
|
18
|
+
'InBraille_Patterns' => :block_inbraille_patterns,
|
19
|
+
'InBuhid' => :block_inbuhid,
|
20
|
+
'InCJK_Compatibility_Forms' => :block_incjk_compatibility_forms,
|
21
|
+
'InCJK_Compatibility_Ideographs' => :block_incjk_compatibility_ideographs,
|
22
|
+
'InCJK_Compatibility' => :block_incjk_compatibility,
|
23
|
+
'InCJK_Radicals_Supplement' => :block_incjk_radicals_supplement,
|
24
|
+
'InCJK_Symbols_and_Punctuation' => :block_incjk_symbols_and_punctuation,
|
25
|
+
'InCJK_Unified_Ideographs_Extension_A' => :block_incjk_unified_ideographs_extension_a,
|
26
|
+
'InCJK_Unified_Ideographs' => :block_incjk_unified_ideographs,
|
27
|
+
'InCherokee' => :block_incherokee,
|
28
|
+
'InCombining_Diacritical_Marks_for_Symbols' => :block_incombining_diacritical_marks_for_symbols,
|
29
|
+
'InCombining_Diacritical_Marks' => :block_incombining_diacritical_marks,
|
30
|
+
'InCombining_Half_Marks' => :block_incombining_half_marks,
|
31
|
+
'InControl_Pictures' => :block_incontrol_pictures,
|
32
|
+
'InCurrency_Symbols' => :block_incurrency_symbols,
|
33
|
+
'InCyrillic_Supplementary' => :block_incyrillic_supplementary,
|
34
|
+
'InCyrillic' => :block_incyrillic,
|
35
|
+
'InDevanagari' => :block_indevanagari,
|
36
|
+
'InDingbats' => :block_indingbats,
|
37
|
+
'InEnclosed_Alphanumerics' => :block_inenclosed_alphanumerics,
|
38
|
+
'InEnclosed_CJK_Letters_and_Months' => :block_inenclosed_cjk_letters_and_months,
|
39
|
+
'InEthiopic' => :block_inethiopic,
|
40
|
+
'InGeneral_Punctuation' => :block_ingeneral_punctuation,
|
41
|
+
'InGeometric_Shapes' => :block_ingeometric_shapes,
|
42
|
+
'InGeorgian' => :block_ingeorgian,
|
43
|
+
'InGreek_Extended' => :block_ingreek_extended,
|
44
|
+
'InGreek_and_Coptic' => :block_ingreek_and_coptic,
|
45
|
+
'InGujarati' => :block_ingujarati,
|
46
|
+
'InGurmukhi' => :block_ingurmukhi,
|
47
|
+
'InHalfwidth_and_Fullwidth_Forms' => :block_inhalfwidth_and_fullwidth_forms,
|
48
|
+
'InHangul_Compatibility_Jamo' => :block_inhangul_compatibility_jamo,
|
49
|
+
'InHangul_Jamo' => :block_inhangul_jamo,
|
50
|
+
'InHangul_Syllables' => :block_inhangul_syllables,
|
51
|
+
'InHanunoo' => :block_inhanunoo,
|
52
|
+
'InHebrew' => :block_inhebrew,
|
53
|
+
'InHigh_Private_Use_Surrogates' => :block_inhigh_private_use_surrogates,
|
54
|
+
'InHigh_Surrogates' => :block_inhigh_surrogates,
|
55
|
+
'InHiragana' => :block_inhiragana,
|
56
|
+
'InIPA_Extensions' => :block_inipa_extensions,
|
57
|
+
'InIdeographic_Description_Characters' => :block_inideographic_description_characters,
|
58
|
+
'InKanbun' => :block_inkanbun,
|
59
|
+
'InKangxi_Radicals' => :block_inkangxi_radicals,
|
60
|
+
'InKannada' => :block_inkannada,
|
61
|
+
'InKatakana_Phonetic_Extensions' => :block_inkatakana_phonetic_extensions,
|
62
|
+
'InKatakana' => :block_inkatakana,
|
63
|
+
'InKhmer_Symbols' => :block_inkhmer_symbols,
|
64
|
+
'InKhmer' => :block_inkhmer,
|
65
|
+
'InLao' => :block_inlao,
|
66
|
+
'InLatin-1_Supplement' => :block_inlatin_1_supplement,
|
67
|
+
'InLatin_Extended-A' => :block_inlatin_extended_a,
|
68
|
+
'InLatin_Extended-B' => :block_inlatin_extended_b,
|
69
|
+
'InLatin_Extended_Additional' => :block_inlatin_extended_additional,
|
70
|
+
'InLetterlike_Symbols' => :block_inletterlike_symbols,
|
71
|
+
'InLimbu' => :block_inlimbu,
|
72
|
+
'InLow_Surrogates' => :block_inlow_surrogates,
|
73
|
+
'InMalayalam' => :block_inmalayalam,
|
74
|
+
'InMathematical_Operators' => :block_inmathematical_operators,
|
75
|
+
'InMiscellaneous_Mathematical_Symbols-A' => :block_inmiscellaneous_mathematical_symbols_a,
|
76
|
+
'InMiscellaneous_Mathematical_Symbols-B' => :block_inmiscellaneous_mathematical_symbols_b,
|
77
|
+
'InMiscellaneous_Symbols_and_Arrows' => :block_inmiscellaneous_symbols_and_arrows,
|
78
|
+
'InMiscellaneous_Symbols' => :block_inmiscellaneous_symbols,
|
79
|
+
'InMiscellaneous_Technical' => :block_inmiscellaneous_technical,
|
80
|
+
'InMongolian' => :block_inmongolian,
|
81
|
+
'InMyanmar' => :block_inmyanmar,
|
82
|
+
'InNumber_Forms' => :block_innumber_forms,
|
83
|
+
'InOgham' => :block_inogham,
|
84
|
+
'InOptical_Character_Recognition' => :block_inoptical_character_recognition,
|
85
|
+
'InOriya' => :block_inoriya,
|
86
|
+
'InPhonetic_Extensions' => :block_inphonetic_extensions,
|
87
|
+
'InPrivate_Use_Area' => :block_inprivate_use_area,
|
88
|
+
'InRunic' => :block_inrunic,
|
89
|
+
'InSinhala' => :block_insinhala,
|
90
|
+
'InSmall_Form_Variants' => :block_insmall_form_variants,
|
91
|
+
'InSpacing_Modifier_Letters' => :block_inspacing_modifier_letters,
|
92
|
+
'InSpecials' => :block_inspecials,
|
93
|
+
'InSuperscripts_and_Subscripts' => :block_insuperscripts_and_subscripts,
|
94
|
+
'InSupplemental_Arrows-A' => :block_insupplemental_arrows_a,
|
95
|
+
'InSupplemental_Arrows-B' => :block_insupplemental_arrows_b,
|
96
|
+
'InSupplemental_Mathematical_Operators' => :block_insupplemental_mathematical_operators,
|
97
|
+
'InSyriac' => :block_insyriac,
|
98
|
+
'InTagalog' => :block_intagalog,
|
99
|
+
'InTagbanwa' => :block_intagbanwa,
|
100
|
+
'InTai_Le' => :block_intai_le,
|
101
|
+
'InTamil' => :block_intamil,
|
102
|
+
'InTelugu' => :block_intelugu,
|
103
|
+
'InThaana' => :block_inthaana,
|
104
|
+
'InThai' => :block_inthai,
|
105
|
+
'InTibetan' => :block_intibetan,
|
106
|
+
'InUnified_Canadian_Aboriginal_Syllabics' => :block_inunified_canadian_aboriginal_syllabics,
|
107
|
+
'InVariation_Selectors' => :block_invariation_selectors,
|
108
|
+
'InYi_Radicals' => :block_inyi_radicals,
|
109
|
+
'InYi_Syllables' => :block_inyi_syllables,
|
110
|
+
'InYijing_Hexagram_Symbols' => :block_inyijing_hexagram_symbols
|
111
|
+
}
|
112
|
+
|
113
|
+
count = 0
|
114
|
+
tests.each do |property, test|
|
115
|
+
define_method "test_scan_property_#{test}_#{count+=1}" do
|
116
|
+
token = RS.scan("a\\p{#{property}}c")[1]
|
117
|
+
|
118
|
+
assert_equal( :property, token[0] )
|
119
|
+
assert_equal( test, token[1] )
|
120
|
+
end
|
121
|
+
|
122
|
+
define_method "test_scan_nonproperty_#{test}_#{count+=1}" do
|
123
|
+
token = RS.scan("a\\P{#{property}}c")[1]
|
124
|
+
|
125
|
+
assert_equal( :nonproperty, token[0] )
|
126
|
+
assert_equal( test, token[1] )
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: regexp_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ammar Ali
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-08-09 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: A library for tokenizing, lexing, and parsing Ruby regular expressions.
|
14
14
|
email:
|
@@ -133,6 +133,7 @@ files:
|
|
133
133
|
- test/scanner/test_scripts.rb
|
134
134
|
- test/scanner/test_sets.rb
|
135
135
|
- test/scanner/test_types.rb
|
136
|
+
- test/scanner/test_unicode_blocks.rb
|
136
137
|
- test/syntax/ruby/test_1.8.rb
|
137
138
|
- test/syntax/ruby/test_1.9.1.rb
|
138
139
|
- test/syntax/ruby/test_1.9.3.rb
|
@@ -220,6 +221,7 @@ test_files:
|
|
220
221
|
- test/scanner/test_scripts.rb
|
221
222
|
- test/scanner/test_sets.rb
|
222
223
|
- test/scanner/test_types.rb
|
224
|
+
- test/scanner/test_unicode_blocks.rb
|
223
225
|
- test/syntax/ruby/test_1.8.rb
|
224
226
|
- test/syntax/ruby/test_1.9.1.rb
|
225
227
|
- test/syntax/ruby/test_1.9.3.rb
|