regexp-examples 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 062a1310c8b7c861a7724fd75745c1e9bff9257f
4
- data.tar.gz: b05ce36dbb3c0afee079091d5c1016a429f1d099
3
+ metadata.gz: f7dacce756110dd70823630de898a8c9f55d12b1
4
+ data.tar.gz: d3ee78e2ed48d91aacc9cb916d8ab71dd25e326d
5
5
  SHA512:
6
- metadata.gz: 5519ec6e710a257c165b35f6b4138bc03d37973c31fcb4a7c6091713ab347438dd8e8bbde722c5ab21f63d6ede0565be2b77de0747d42f9f1e83413312b71d14
7
- data.tar.gz: 58d67c5e25de2dbd238cb53a192c784047b8ab7e8a836d87c47c9cf5316133d3383a8fdec0a7c9fe07eec0518266b3c66a66c8953e830277b41e1bf9ca53e525
6
+ metadata.gz: 2655f9c1b1bbb8452a06d7debdba232ba53354131776bbf23a69fc1dc3b62d950600093b4581d2f4c5304f161db421ed95204e8c40cee2adc75c95670dcf42a1
7
+ data.tar.gz: da2dd9829aa3f5f2415f4a5ca4182133c19b1a481a40172140858ba72f65e05824eebdbff8899c6f0d84a90c93b0539c86c68dd7a23371fe6f577da738746824
data/README.md CHANGED
@@ -44,6 +44,7 @@ For more detail on this, see [configuration options](#configuration-options).
44
44
  * Unicode characters, e.g. `/\u0123/`, `/\uabcd/`, `/\u{789}/`
45
45
  * Octal characters, e.g. `/\10/`, `/\177/`
46
46
  * POSIX bracket expressions (including negation), e.g. `/[[:alnum:]]/`, `/[[:^space:]]/`
47
+ * Named properties, e.g. `/\p{L}/` ("Letter"), `/\p{Arabic}/` ("Arabic character"), `/\p{^Ll}/` ("Not a lowercase letter")
47
48
  * **Arbitrarily complex combinations of all the above!**
48
49
 
49
50
  * Regexp options can also be used:
@@ -60,11 +61,6 @@ For more detail on this, see [configuration options](#configuration-options).
60
61
 
61
62
  * Conditional capture groups, such as `/(group1) (?(1)yes|no)/`
62
63
 
63
- Using any of the following will raise a RegexpExamples::UnsupportedSyntax exception (until such time as they are implemented!):
64
-
65
- * Named properties, e.g. `/\p{L}/` ("Letter"), `/\p{Arabic}/` ("Arabic character"), `/\p{^Ll}/` ("Not a lowercase letter")
66
- * Subexpression calls, e.g. `/(?<name> ... \g<name>* )/` (Note: These could get _really_ ugly to implement, and may even be impossible, so I highly doubt it's worth the effort!)
67
-
68
64
  There are loads more (increasingly obscure) unsupported bits of syntax, which I cannot be bothered to write out here. Full documentation on all the various other obscurities in the ruby (version 2.x) regexp parser can be found [here](https://raw.githubusercontent.com/k-takata/Onigmo/master/doc/RE).
69
65
 
70
66
  ## Impossible features ("illegal syntax")
@@ -77,6 +73,7 @@ Using any of the following will raise a RegexpExamples::IllegalSyntax exception:
77
73
  * Lookarounds, e.g. `/foo(?=bar)/`, `/foo(?!bar)/`, `/(?<=foo)bar/`, `/(?<!foo)bar/`
78
74
  * [Anchors](http://ruby-doc.org/core-2.2.0/Regexp.html#class-Regexp-label-Anchors) (`\b`, `\B`, `\G`, `^`, `\A`, `$`, `\z`, `\Z`), e.g. `/\bword\b/`, `/line1\n^line2/`
79
75
  * However, a special case has been made to allow `^`, `\A` and `\G` at the start of a pattern; and to allow `$`, `\z` and `\Z` at the end of pattern. In such cases, the characters are effectively just ignored.
76
+ * Subexpression calls, e.g. `/(?<name> ... \g<name>* )/`
80
77
 
81
78
  (Note: Backreferences are not really "regular" either, but I got these to work with a bit of hackery!)
82
79
 
@@ -1,5 +1,7 @@
1
1
  module RegexpExamples
2
2
  class BackReferenceReplacer
3
+ BackrefNotFound = Class.new(StandardError)
4
+
3
5
  def substitute_backreferences(full_examples)
4
6
  full_examples.map do |full_example|
5
7
  begin
@@ -7,7 +9,7 @@ module RegexpExamples
7
9
  full_example.sub!(/__(\w+?)__/, find_backref_for(full_example, $1))
8
10
  end
9
11
  full_example
10
- rescue RegexpExamples::BackrefNotFound
12
+ rescue BackrefNotFound
11
13
  # For instance, one "full example" from /(a|(b)) \2/: "a __2__"
12
14
  # should be rejected because the backref (\2) does not exist
13
15
  nil
@@ -27,7 +29,7 @@ module RegexpExamples
27
29
  if octal_chars =~ /\A[01]?[0-7]{1,2}\z/ && octal_chars.to_i >= 10
28
30
  Integer(octal_chars, 8).chr
29
31
  else
30
- raise(RegexpExamples::BackrefNotFound)
32
+ raise(BackrefNotFound)
31
33
  end
32
34
  end
33
35
 
@@ -35,6 +35,9 @@ module RegexpExamples
35
35
  Lower = Array('a'..'z')
36
36
  Upper = Array('A'..'Z')
37
37
  Digit = Array('0'..'9')
38
+ # Note: Punct should also include the following chars: $ + < = > ^ ` | ~
39
+ # I.e. Punct = %w(! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \\ ] ^ _ ` { | } ~)
40
+ # However, due to a ruby bug (!!) these do not work properly at the moment!
38
41
  Punct = %w(! " # % & ' ( ) * , - . / : ; ? @ [ \\ ] _ { })
39
42
  Hex = Array('a'..'f') | Array('A'..'F') | Digit
40
43
  Word = Lower | Upper | Digit | ['_']
@@ -81,5 +84,157 @@ module RegexpExamples
81
84
  'word' => CharSets::Word,
82
85
  'ascii' => CharSets::Any
83
86
  }.freeze
87
+
88
# Expands a mixed list of code points and code-point ranges into the
# corresponding single-character strings.
#
# Each argument may be either a single Integer code point (e.g. 170) or a
# Range of code points (e.g. 65..90); mixing both keeps the generated table
# below readable. `Array()` turns an Integer into a one-element list and a
# Range into its members, so no explicit type check (and no reference to the
# removed `Fixnum` constant) is needed.
#
# @param ranges [Array<Integer, Range>] code points and/or ranges of code points
# @return [Array<String>] one character per code point, in argument order
def self.ranges_to_unicode(*ranges)
  ranges
    .flat_map { |range| Array(range) }
    .map { |code_point| hex_to_unicode(code_point.to_s(16)) }
end
99
+
100
# Converts a hexadecimal code point string into the UTF-8 character it names.
#
# The conversion is done numerically rather than by `eval`-ing a character
# literal, so a malformed argument can never be executed as Ruby code.
#
# @param hex [String] code point in hexadecimal, e.g. "41" or "4e2d"
# @return [String] the single UTF-8 encoded character for that code point
# @raise [ArgumentError] if +hex+ is not a valid hexadecimal number
# @raise [RangeError] if the code point is not valid in UTF-8 (e.g. a surrogate)
def self.hex_to_unicode(hex)
  Integer(hex, 16).chr(Encoding::UTF_8)
end
103
+
104
+ # These values were generated by: scripts/unicode_lister.rb
105
+ # Note: Only the first 128 results are listed, for performance.
106
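+ # Also, some groups have no matches: scripts/unicode_lister.rb only scans code points 0..55295, and none of those groups' characters fall within that range.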
107
+ NamedPropertyCharMap = {
108
+ 'Alnum' => ranges_to_unicode(48..57, 65..90, 97..122, 170, 181, 186, 192..214, 216..246, 248..256),
109
+ 'Alpha' => ranges_to_unicode(65..90, 97..122, 170, 181, 186, 192..214, 216..246, 248..266),
110
+ 'Blank' => ranges_to_unicode(9, 32, 160, 5760, 8192..8202, 8239, 8287, 12288),
111
+ 'Cntrl' => ranges_to_unicode(0..31, 127..159),
112
+ 'Digit' => ranges_to_unicode(48..57, 1632..1641, 1776..1785, 1984..1993, 2406..2415, 2534..2543, 2662..2671, 2790..2799, 2918..2927, 3046..3055, 3174..3183, 3302..3311, 3430..3437),
113
+ 'Graph' => ranges_to_unicode(33..126, 161..194),
114
+ 'Lower' => ranges_to_unicode(97..122, 170, 181, 186, 223..246, 248..255, 257, 259, 261, 263, 265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293, 295, 297, 299, 301, 303, 305, 307, 309, 311..312, 314, 316, 318, 320, 322, 324, 326, 328..329, 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351, 353, 355, 357, 359, 361, 363, 365, 367, 369, 371, 373, 375, 378, 380, 382..384, 387),
115
+ 'Print' => ranges_to_unicode(32..126, 160..192),
116
+ 'Punct' => ranges_to_unicode(33..35, 37..42, 44..47, 58..59, 63..64, 91..93, 95, 123, 125, 161, 167, 171, 182..183, 187, 191, 894, 903, 1370..1375, 1417..1418, 1470, 1472, 1475, 1478, 1523..1524, 1545..1546, 1548..1549, 1563, 1566..1567, 1642..1645, 1748, 1792..1805, 2039..2041, 2096..2110, 2142, 2404..2405, 2416, 2800, 3572, 3663, 3674..3675, 3844..3858, 3860, 3898..3901, 3973, 4048..4052, 4057..4058, 4170),
117
+ 'Space' => ranges_to_unicode(9..13, 32, 133, 160, 5760, 8192..8202, 8232..8233, 8239, 8287, 12288),
118
+ 'Upper' => ranges_to_unicode(65..90, 192..214, 216..222, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276, 278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302, 304, 306, 308, 310, 313, 315, 317, 319, 321, 323, 325, 327, 330, 332, 334, 336, 338, 340, 342, 344, 346, 348, 350, 352, 354, 356, 358, 360, 362, 364, 366, 368, 370, 372, 374, 376..377, 379, 381, 385..386, 388, 390..391, 393..395, 398),
119
+ 'XDigit' => ranges_to_unicode(48..57, 65..70, 97..102),
120
+ 'Word' => ranges_to_unicode(48..57, 65..90, 95, 97..122, 170, 181, 186, 192..214, 216..246, 248..255),
121
+ 'ASCII' => ranges_to_unicode(0..127),
122
+ 'Any' => ranges_to_unicode(0..127),
123
+ 'Assigned' => ranges_to_unicode(0..127),
124
+ 'L' => ranges_to_unicode(65..90, 97..122, 170, 181, 186, 192..214, 216..246, 248..266),
125
+ 'Ll' => ranges_to_unicode(97..122, 181, 223..246, 248..255, 257, 259, 261, 263, 265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293, 295, 297, 299, 301, 303, 305, 307, 309, 311..312, 314, 316, 318, 320, 322, 324, 326, 328..329, 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351, 353, 355, 357, 359, 361, 363, 365, 367, 369, 371, 373, 375, 378, 380, 382..384, 387, 389, 392),
126
+ 'Lm' => ranges_to_unicode(688..705, 710..721, 736..740, 748, 750, 884, 890, 1369, 1600, 1765..1766, 2036..2037, 2042, 2074, 2084, 2088, 2417, 3654, 3782, 4348, 6103, 6211, 6823, 7288..7293, 7468..7530, 7544, 7579..7580),
127
+ 'Lo' => ranges_to_unicode(170, 186, 443, 448..451, 660, 1488..1514, 1520..1522, 1568..1599, 1601..1610, 1646..1647, 1649..1694),
128
+ 'Lt' => ranges_to_unicode(453, 456, 459, 498, 8072..8079, 8088..8095, 8104..8111, 8124, 8140, 8188),
129
+ 'Lu' => ranges_to_unicode(65..90, 192..214, 216..222, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276, 278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302, 304, 306, 308, 310, 313, 315, 317, 319, 321, 323, 325, 327, 330, 332, 334, 336, 338, 340, 342, 344, 346, 348, 350, 352, 354, 356, 358, 360, 362, 364, 366, 368, 370, 372, 374, 376..377, 379, 381, 385..386, 388, 390..391, 393..395, 398),
130
+ 'M' => ranges_to_unicode(768..879, 1155..1161, 1425..1433),
131
+ 'Mn' => ranges_to_unicode(768..879, 1155..1159, 1425..1435),
132
+ 'Mc' => ranges_to_unicode(2307, 2363, 2366..2368, 2377..2380, 2382..2383, 2434..2435, 2494..2496, 2503..2504, 2507..2508, 2519, 2563, 2622..2624, 2691, 2750..2752, 2761, 2763..2764, 2818..2819, 2878, 2880, 2887..2888, 2891..2892, 2903, 3006..3007, 3009..3010, 3014..3016, 3018..3020, 3031, 3073..3075, 3137..3140, 3202..3203, 3262, 3264..3268, 3271..3272, 3274..3275, 3285..3286, 3330..3331, 3390..3392, 3398..3400, 3402..3404, 3415, 3458..3459, 3535..3537, 3544..3551, 3570..3571, 3902..3903, 3967, 4139..4140, 4145, 4152, 4155..4156, 4182..4183, 4194..4196, 4199..4205, 4227..4228, 4231..4235),
133
+ 'Me' => ranges_to_unicode(1160..1161, 6846, 8413..8416, 8418..8420, 42608..42610),
134
+ 'N' => ranges_to_unicode(48..57, 178..179, 185, 188..190, 1632..1641, 1776..1785, 1984..1993, 2406..2415, 2534..2543, 2548..2553, 2662..2671, 2790..2799, 2918..2927, 2930..2935, 3046..3058, 3174..3180),
135
+ 'Nd' => ranges_to_unicode(48..57, 1632..1641, 1776..1785, 1984..1993, 2406..2415, 2534..2543, 2662..2671, 2790..2799, 2918..2927, 3046..3055, 3174..3183, 3302..3311, 3430..3437),
136
+ 'Nl' => ranges_to_unicode(5870..5872, 8544..8578, 8581..8584, 12295, 12321..12329, 12344..12346, 42726..42735),
137
+ 'No' => ranges_to_unicode(178..179, 185, 188..190, 2548..2553, 2930..2935, 3056..3058, 3192..3198, 3440..3445, 3882..3891, 4969..4988, 6128..6137, 6618, 8304, 8308..8313, 8320..8329, 8528..8543, 8585, 9312..9330),
138
+ 'P' => ranges_to_unicode(33..35, 37..42, 44..47, 58..59, 63..64, 91..93, 95, 123, 125, 161, 167, 171, 182..183, 187, 191, 894, 903, 1370..1375, 1417..1418, 1470, 1472, 1475, 1478, 1523..1524, 1545..1546, 1548..1549, 1563, 1566..1567, 1642..1645, 1748, 1792..1805, 2039..2041, 2096..2110, 2142, 2404..2405, 2416, 2800, 3572, 3663, 3674..3675, 3844..3858, 3860, 3898..3901, 3973, 4048..4052, 4057..4058, 4170),
139
+ 'Pc' => ranges_to_unicode(95, 8255..8256, 8276),
140
+ 'Pd' => ranges_to_unicode(45, 1418, 1470, 5120, 6150, 8208..8213, 11799, 11802, 11834..11835, 11840, 12316, 12336, 12448),
141
+ 'Ps' => ranges_to_unicode(40, 91, 123, 3898, 3900, 5787, 8218, 8222, 8261, 8317, 8333, 8968, 8970, 9001, 10088, 10090, 10092, 10094, 10096, 10098, 10100, 10181, 10214, 10216, 10218, 10220, 10222, 10627, 10629, 10631, 10633, 10635, 10637, 10639, 10641, 10643, 10645, 10647, 10712, 10714, 10748, 11810, 11812, 11814, 11816, 11842, 12296, 12298, 12300, 12302, 12304, 12308, 12310, 12312, 12314, 12317),
142
+ 'Pe' => ranges_to_unicode(41, 93, 125, 3899, 3901, 5788, 8262, 8318, 8334, 8969, 8971, 9002, 10089, 10091, 10093, 10095, 10097, 10099, 10101, 10182, 10215, 10217, 10219, 10221, 10223, 10628, 10630, 10632, 10634, 10636, 10638, 10640, 10642, 10644, 10646, 10648, 10713, 10715, 10749, 11811, 11813, 11815, 11817, 12297, 12299, 12301, 12303, 12305, 12309, 12311, 12313, 12315, 12318..12319),
143
+ 'Pi' => ranges_to_unicode(171, 8216, 8219..8220, 8223, 8249, 11778, 11780, 11785, 11788, 11804, 11808),
144
+ 'Pf' => ranges_to_unicode(187, 8217, 8221, 8250, 11779, 11781, 11786, 11789, 11805, 11809),
145
+ 'Po' => ranges_to_unicode(33..35, 37..39, 42, 44, 46..47, 58..59, 63..64, 92, 161, 167, 182..183, 191, 894, 903, 1370..1375, 1417, 1472, 1475, 1478, 1523..1524, 1545..1546, 1548..1549, 1563, 1566..1567, 1642..1645, 1748, 1792..1805, 2039..2041, 2096..2110, 2142, 2404..2405, 2416, 2800, 3572, 3663, 3674..3675, 3844..3858, 3860, 3973, 4048..4052, 4057..4058, 4170..4175, 4347, 4960..4968, 5741),
146
+ 'S' => ranges_to_unicode(36, 43, 60..62, 94, 96, 124, 126, 162..166, 168..169, 172, 174..177, 180, 184, 215, 247, 706..709, 722..735, 741..747, 749, 751..767, 885, 900..901, 1014, 1154, 1421..1423, 1542..1544, 1547, 1550..1551, 1758, 1769, 1789..1790, 2038, 2546..2547, 2554..2555, 2801, 2928, 3059..3066, 3199, 3449, 3647, 3841..3843, 3859, 3861..3863, 3866..3871, 3892, 3894, 3896, 4030..4037),
147
+ 'Sm' => ranges_to_unicode(43, 60..62, 124, 126, 172, 177, 215, 247, 1014, 1542..1544, 8260, 8274, 8314..8316, 8330..8332, 8472, 8512..8516, 8523, 8592..8596, 8602..8603, 8608, 8611, 8614, 8622, 8654..8655, 8658, 8660, 8692..8775),
148
+ 'Sc' => ranges_to_unicode(36, 162..165, 1423, 1547, 2546..2547, 2555, 2801, 3065, 3647, 6107, 8352..8381, 43064),
149
+ 'Sk' => ranges_to_unicode(94, 96, 168, 175, 180, 184, 706..709, 722..735, 741..747, 749, 751..767, 885, 900..901, 8125, 8127..8129, 8141..8143, 8157..8159, 8173..8175, 8189..8190, 12443..12444, 42752..42774, 42784..42785, 42889..42890, 43867),
150
+ 'So' => ranges_to_unicode(166, 169, 174, 176, 1154, 1421..1422, 1550..1551, 1758, 1769, 1789..1790, 2038, 2554, 2928, 3059..3064, 3066, 3199, 3449, 3841..3843, 3859, 3861..3863, 3866..3871, 3892, 3894, 3896, 4030..4037, 4039..4044, 4046..4047, 4053..4056, 4254..4255, 5008..5017, 6464, 6622..6655, 7009..7018, 7028..7036, 8448),
151
+ 'Z' => ranges_to_unicode(32, 160, 5760, 8192..8202, 8232..8233, 8239, 8287, 12288),
152
+ 'Zs' => ranges_to_unicode(32, 160, 5760, 8192..8202, 8239, 8287, 12288),
153
+ 'Zl' => ranges_to_unicode(8232),
154
+ 'Zp' => ranges_to_unicode(8233),
155
+ 'C' => ranges_to_unicode(0..31, 127..159, 173, 888..889, 896..899, 907, 909, 930, 1328, 1367..1368, 1376, 1416, 1419..1420, 1424, 1480..1487, 1515..1519, 1525..1541, 1564..1565, 1757, 1806..1807, 1867..1868, 1970..1977),
156
+ 'Cc' => ranges_to_unicode(0..31, 127..159),
157
+ 'Cf' => ranges_to_unicode(173, 1536..1541, 1564, 1757, 1807, 6158, 8203..8207, 8234..8238, 8288..8292, 8294..8303),
158
+ 'Cn' => ranges_to_unicode(888..889, 896..899, 907, 909, 930, 1328, 1367..1368, 1376, 1416, 1419..1420, 1424, 1480..1487, 1515..1519, 1525..1535, 1565, 1806, 1867..1868, 1970..1983, 2043..2047, 2094..2095, 2111, 2140..2141, 2143..2201),
159
+ 'Co' => ranges_to_unicode(),
160
+ 'Cs' => ranges_to_unicode(),
161
+ 'Arabic' => ranges_to_unicode(1536..1540, 1542..1547, 1549..1562, 1566, 1568..1599, 1601..1610, 1622..1631, 1642..1647, 1649..1692),
162
+ 'Armenian' => ranges_to_unicode(1329..1366, 1369..1375, 1377..1415, 1418, 1421..1423),
163
+ 'Balinese' => ranges_to_unicode(6912..6987, 6992..7036),
164
+ 'Bengali' => ranges_to_unicode(2432..2435, 2437..2444, 2447..2448, 2451..2472, 2474..2480, 2482, 2486..2489, 2492..2500, 2503..2504, 2507..2510, 2519, 2524..2525, 2527..2531, 2534..2555),
165
+ 'Bopomofo' => ranges_to_unicode(746..747, 12549..12589, 12704..12730),
166
+ 'Braille' => ranges_to_unicode(10240..10367),
167
+ 'Buginese' => ranges_to_unicode(6656..6683, 6686..6687),
168
+ 'Buhid' => ranges_to_unicode(5952..5971),
169
+ 'Canadian_Aboriginal' => ranges_to_unicode(5120..5247),
170
+ 'Carian' => ranges_to_unicode(),
171
+ 'Cham' => ranges_to_unicode(43520..43574, 43584..43597, 43600..43609, 43612..43615),
172
+ 'Cherokee' => ranges_to_unicode(5024..5108),
173
+ 'Common' => ranges_to_unicode(0..64, 91..96, 123..169, 171..180),
174
+ 'Coptic' => ranges_to_unicode(994..1007, 11392..11505),
175
+ 'Cuneiform' => ranges_to_unicode(),
176
+ 'Cypriot' => ranges_to_unicode(),
177
+ 'Cyrillic' => ranges_to_unicode(1024..1151),
178
+ 'Deseret' => ranges_to_unicode(),
179
+ 'Devanagari' => ranges_to_unicode(2304..2384, 2387..2403, 2406..2431, 43232..43235),
180
+ 'Ethiopic' => ranges_to_unicode(4608..4680, 4682..4685, 4688..4694, 4696, 4698..4701, 4704..4742),
181
+ 'Georgian' => ranges_to_unicode(4256..4293, 4295, 4301, 4304..4346, 4348..4351, 11520..11557, 11559, 11565),
182
+ 'Glagolitic' => ranges_to_unicode(11264..11310, 11312..11358),
183
+ 'Gothic' => ranges_to_unicode(),
184
+ 'Greek' => ranges_to_unicode(880..883, 885..887, 890..893, 895, 900, 902, 904..906, 908, 910..929, 931..993, 1008..1023, 7462..7466, 7517..7521, 7526),
185
+ 'Gujarati' => ranges_to_unicode(2689..2691, 2693..2701, 2703..2705, 2707..2728, 2730..2736, 2738..2739, 2741..2745, 2748..2757, 2759..2761, 2763..2765, 2768, 2784..2787, 2790..2801),
186
+ 'Gurmukhi' => ranges_to_unicode(2561..2563, 2565..2570, 2575..2576, 2579..2600, 2602..2608, 2610..2611, 2613..2614, 2616..2617, 2620, 2622..2626, 2631..2632, 2635..2637, 2641, 2649..2652, 2654, 2662..2677),
187
+ 'Han' => ranges_to_unicode(11904..11929, 11931..12019, 12032..12044),
188
+ 'Hangul' => ranges_to_unicode(4352..4479),
189
+ 'Hanunoo' => ranges_to_unicode(5920..5940),
190
+ 'Hebrew' => ranges_to_unicode(1425..1479, 1488..1514, 1520..1524),
191
+ 'Hiragana' => ranges_to_unicode(12353..12438, 12445..12447),
192
+ 'Inherited' => ranges_to_unicode(768..879, 1157..1158, 1611..1621, 1648, 2385..2386),
193
+ 'Kannada' => ranges_to_unicode(3201..3203, 3205..3212, 3214..3216, 3218..3240, 3242..3251, 3253..3257, 3260..3268, 3270..3272, 3274..3277, 3285..3286, 3294, 3296..3299, 3302..3311, 3313..3314),
194
+ 'Katakana' => ranges_to_unicode(12449..12538, 12541..12543, 12784..12799, 13008..13026),
195
+ 'Kayah_Li' => ranges_to_unicode(43264..43309, 43311),
196
+ 'Kharoshthi' => ranges_to_unicode(),
197
+ 'Khmer' => ranges_to_unicode(6016..6109, 6112..6121, 6128..6137, 6624..6637),
198
+ 'Lao' => ranges_to_unicode(3713..3714, 3716, 3719..3720, 3722, 3725, 3732..3735, 3737..3743, 3745..3747, 3749, 3751, 3754..3755, 3757..3769, 3771..3773, 3776..3780, 3782, 3784..3789, 3792..3801, 3804..3807),
199
+ 'Latin' => ranges_to_unicode(65..90, 97..122, 170, 186, 192..214, 216..246, 248..267),
200
+ 'Lepcha' => ranges_to_unicode(7168..7223, 7227..7241, 7245..7247),
201
+ 'Limbu' => ranges_to_unicode(6400..6430, 6432..6443, 6448..6459, 6464, 6468..6479),
202
+ 'Linear_B' => ranges_to_unicode(),
203
+ 'Lycian' => ranges_to_unicode(),
204
+ 'Lydian' => ranges_to_unicode(),
205
+ 'Malayalam' => ranges_to_unicode(3329..3331, 3333..3340, 3342..3344, 3346..3386, 3389..3396, 3398..3400, 3402..3406, 3415, 3424..3427, 3430..3445, 3449..3455),
206
+ 'Mongolian' => ranges_to_unicode(6144..6145, 6148, 6150..6158, 6160..6169, 6176..6263, 6272..6289),
207
+ 'Myanmar' => ranges_to_unicode(4096..4223),
208
+ 'New_Tai_Lue' => ranges_to_unicode(6528..6571, 6576..6601, 6608..6618, 6622..6623),
209
+ 'Nko' => ranges_to_unicode(1984..2042),
210
+ 'Ogham' => ranges_to_unicode(5760..5788),
211
+ 'Ol_Chiki' => ranges_to_unicode(7248..7295),
212
+ 'Old_Italic' => ranges_to_unicode(),
213
+ 'Old_Persian' => ranges_to_unicode(),
214
+ 'Oriya' => ranges_to_unicode(2817..2819, 2821..2828, 2831..2832, 2835..2856, 2858..2864, 2866..2867, 2869..2873, 2876..2884, 2887..2888, 2891..2893, 2902..2903, 2908..2909, 2911..2915, 2918..2935),
215
+ 'Osmanya' => ranges_to_unicode(),
216
+ 'Phags_Pa' => ranges_to_unicode(43072..43127),
217
+ 'Phoenician' => ranges_to_unicode(),
218
+ 'Rejang' => ranges_to_unicode(43312..43347, 43359),
219
+ 'Runic' => ranges_to_unicode(5792..5866, 5870..5880),
220
+ 'Saurashtra' => ranges_to_unicode(43136..43204, 43214..43225),
221
+ 'Shavian' => ranges_to_unicode(),
222
+ 'Sinhala' => ranges_to_unicode(3458..3459, 3461..3478, 3482..3505, 3507..3515, 3517, 3520..3526, 3530, 3535..3540, 3542, 3544..3551, 3558..3567, 3570..3572),
223
+ 'Sundanese' => ranges_to_unicode(7040..7103, 7360..7367),
224
+ 'Syloti_Nagri' => ranges_to_unicode(43008..43051),
225
+ 'Syriac' => ranges_to_unicode(1792..1805, 1807..1866, 1869..1871),
226
+ 'Tagalog' => ranges_to_unicode(5888..5900, 5902..5908),
227
+ 'Tagbanwa' => ranges_to_unicode(5984..5996, 5998..6000, 6002..6003),
228
+ 'Tai_Le' => ranges_to_unicode(6480..6509, 6512..6516),
229
+ 'Tamil' => ranges_to_unicode(2946..2947, 2949..2954, 2958..2960, 2962..2965, 2969..2970, 2972, 2974..2975, 2979..2980, 2984..2986, 2990..3001, 3006..3010, 3014..3016, 3018..3021, 3024, 3031, 3046..3066),
230
+ 'Telugu' => ranges_to_unicode(3072..3075, 3077..3084, 3086..3088, 3090..3112, 3114..3129, 3133..3140, 3142..3144, 3146..3149, 3157..3158, 3160..3161, 3168..3171, 3174..3183, 3192..3199),
231
+ 'Thaana' => ranges_to_unicode(1920..1969),
232
+ 'Thai' => ranges_to_unicode(3585..3642, 3648..3675),
233
+ 'Tibetan' => ranges_to_unicode(3840..3911, 3913..3948, 3953..3972),
234
+ 'Tifinagh' => ranges_to_unicode(11568..11623, 11631..11632, 11647),
235
+ 'Ugaritic' => ranges_to_unicode(),
236
+ 'Vai' => ranges_to_unicode(42240..42367),
237
+ 'Yi' => ranges_to_unicode(40960..41087),
238
+ }.freeze
84
239
  end
85
240
 
@@ -1,4 +1,5 @@
1
1
  module RegexpExamples
2
+ IllegalSyntaxError = Class.new(StandardError)
2
3
  class Parser
3
4
  attr_reader :regexp_string
4
5
  def initialize(regexp_string, regexp_options, config_options={})
@@ -85,8 +86,6 @@ module RegexpExamples
85
86
  group = parse_backreference_group($1)
86
87
  when BackslashCharMap.keys.include?(next_char)
87
88
  group = CharGroup.new(
88
- # Note: The `.dup` is important, as it prevents modifying the constant, in
89
- # CharGroup#init_ranges (where the '-' is moved to the front)
90
89
  BackslashCharMap[next_char].dup,
91
90
  @ignorecase
92
91
  )
@@ -100,16 +99,22 @@ module RegexpExamples
100
99
  @current_position += $1.length
101
100
  sequence = $1.match(/\h{1,4}/)[0] # Strip off "{" and "}"
102
101
  group = parse_single_char_group( parse_unicode_sequence(sequence) )
103
- when rest_of_string =~ /\Ap\{([^}]+)\}/ # Named properties
104
- @current_position += ($1.length + 2)
105
- raise UnsupportedSyntaxError, "Named properties ({\\p#{$1}}) are not yet supported"
102
+ when rest_of_string =~ /\Ap\{(\^?)([^}]+)\}/ # Named properties
103
+ @current_position += ($1.length + $2.length + 2)
104
+ group = CharGroup.new(
105
+ if($1 == "^")
106
+ CharSets::Any.dup - NamedPropertyCharMap[$2]
107
+ else
108
+ NamedPropertyCharMap[$2]
109
+ end,
110
+ @ignorecase
111
+ )
106
112
  when next_char == 'K' # Keep (special lookbehind that CAN be supported safely!)
107
113
  group = PlaceHolderGroup.new
108
114
  when next_char == 'R' # Linebreak
109
115
  group = CharGroup.new(["\r\n", "\n", "\v", "\f", "\r"], @ignorecase) # A bit hacky...
110
116
  when next_char == 'g' # Subexpression call
111
- # TODO: Should this be IllegalSyntaxError ?
112
- raise UnsupportedSyntaxError, "Subexpression calls (\g) are not yet supported"
117
+ raise IllegalSyntaxError, "Subexpression calls (\g) are not yet supported"
113
118
  when next_char =~ /[bB]/ # Anchors
114
119
  raise IllegalSyntaxError, "Anchors ('\\#{next_char}') cannot be supported, as they are not regular"
115
120
  when next_char =~ /[AG]/ # Start of string
@@ -1,3 +1,3 @@
1
1
  module RegexpExamples
2
- VERSION = '0.6.0'
2
+ VERSION = '0.7.0'
3
3
  end
@@ -0,0 +1,180 @@
1
+ # A script to generate lists of all unicode characters
2
+ # that match all named group/character properties regexps.
3
+ # For use in e.g. /\p{Arabic}/.examples
4
+
5
+ # To (re-)generate this list, simply run this file!
6
+ # > ruby scripts/unicode_lister.rb
7
+ OutputFilename = 'unicode_result'
8
+
9
+ # Taken from ruby documentation:
10
+ # http://ruby-doc.org//core-2.2.0/Regexp.html#class-Regexp-label-Character+Properties
11
+ NamedGroups = %w(
12
+ Alnum
13
+ Alpha
14
+ Blank
15
+ Cntrl
16
+ Digit
17
+ Graph
18
+ Lower
19
+ Print
20
+ Punct
21
+ Space
22
+ Upper
23
+ XDigit
24
+ Word
25
+ ASCII
26
+ Any
27
+ Assigned
28
+
29
+ L
30
+ Ll
31
+ Lm
32
+ Lo
33
+ Lt
34
+ Lu
35
+ M
36
+ Mn
37
+ Mc
38
+ Me
39
+ N
40
+ Nd
41
+ Nl
42
+ No
43
+ P
44
+ Pc
45
+ Pd
46
+ Ps
47
+ Pe
48
+ Pi
49
+ Pf
50
+ Po
51
+ S
52
+ Sm
53
+ Sc
54
+ Sk
55
+ So
56
+ Z
57
+ Zs
58
+ Zl
59
+ Zp
60
+ C
61
+ Cc
62
+ Cf
63
+ Cn
64
+ Co
65
+ Cs
66
+
67
+ Arabic
68
+ Armenian
69
+ Balinese
70
+ Bengali
71
+ Bopomofo
72
+ Braille
73
+ Buginese
74
+ Buhid
75
+ Canadian_Aboriginal
76
+ Carian
77
+ Cham
78
+ Cherokee
79
+ Common
80
+ Coptic
81
+ Cuneiform
82
+ Cypriot
83
+ Cyrillic
84
+ Deseret
85
+ Devanagari
86
+ Ethiopic
87
+ Georgian
88
+ Glagolitic
89
+ Gothic
90
+ Greek
91
+ Gujarati
92
+ Gurmukhi
93
+ Han
94
+ Hangul
95
+ Hanunoo
96
+ Hebrew
97
+ Hiragana
98
+ Inherited
99
+ Kannada
100
+ Katakana
101
+ Kayah_Li
102
+ Kharoshthi
103
+ Khmer
104
+ Lao
105
+ Latin
106
+ Lepcha
107
+ Limbu
108
+ Linear_B
109
+ Lycian
110
+ Lydian
111
+ Malayalam
112
+ Mongolian
113
+ Myanmar
114
+ New_Tai_Lue
115
+ Nko
116
+ Ogham
117
+ Ol_Chiki
118
+ Old_Italic
119
+ Old_Persian
120
+ Oriya
121
+ Osmanya
122
+ Phags_Pa
123
+ Phoenician
124
+ Rejang
125
+ Runic
126
+ Saurashtra
127
+ Shavian
128
+ Sinhala
129
+ Sundanese
130
+ Syloti_Nagri
131
+ Syriac
132
+ Tagalog
133
+ Tagbanwa
134
+ Tai_Le
135
+ Tamil
136
+ Telugu
137
+ Thaana
138
+ Thai
139
+ Tibetan
140
+ Tifinagh
141
+ Ugaritic
142
+ Vai
143
+ Yi
144
+ )
145
+
146
+ # Note: For some reason, a character encoding-related exception gets raised
147
+ # when I do `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")` in the range: 55296..57343
148
+ # This means my calculation is MISSING results in the range: 55296..65535
149
+ # However, for the sake of performance, I'm also being "lazy" and only calculating/saving
150
+ # the first 128 matches anyway!
151
+ # If anyone ever cares about this (I doubt it), I'll look into fixing/improving it.
152
+
153
+ # Example input: [1, 2, 3, 4, 6, 7, 12, 14] (Array)
154
+ # Example output: "1..4, 6..7, 12, 14" (String)
155
# Collapses an ascending list of code points into a compact, comma-separated
# description in which each run of consecutive values is written as a range.
#
# Example input:  [1, 2, 3, 4, 6, 7, 12, 14] (Array)
# Example output: "1..4, 6..7, 12, 14" (String)
#
# The argument is only read, never modified, so callers can keep using it.
#
# @param matching_codes [Array<Integer>] code points in ascending order
# @return [String] ranges and single values joined by ", " ("" for empty input)
def calculate_ranges(matching_codes)
  return "" if matching_codes.empty?

  matching_codes
    .slice_when { |previous, current| previous.succ != current } # Split wherever the run breaks
    .map { |run| run.size == 1 ? run.first : (run.first..run.last) }
    .join(", ")
end
168
+
169
# Writes one `'Name' => ranges_to_unicode(...)` line per named property to
# OutputFilename, reporting progress on stdout as each property is finished.
File.open(OutputFilename, 'w') do |f|
  NamedGroups.each_with_index do |name, index|
    # Compile the property regexp once per property rather than once per code point.
    property = /\p{#{name}}/
    # Only code points 0..55295 are scanned (everything below the surrogate
    # block), and only the first 128 matches are kept; `lazy` stops the scan
    # as soon as those 128 have been found.
    matching_codes = (0..55295).lazy
                               .select { |code| property =~ code.chr(Encoding::UTF_8) }
                               .first(128)
    f.puts "'#{name}' => ranges_to_unicode(#{calculate_ranges(matching_codes)}),"
    puts "(#{index + 1}/#{NamedGroups.length}) Finished property: #{name}"
  end
  puts "*"*50
  puts "Finished! Result stored in: #{OutputFilename}"
end
180
+
@@ -1,14 +1,9 @@
1
1
  RSpec.describe Regexp, "#examples" do
2
2
  def self.examples_exist_and_match(*regexps)
3
3
  regexps.each do |regexp|
4
- it do
5
- begin
6
- regexp_examples = regexp.examples(max_group_results: 999)
7
- rescue
8
- # TODO: Find a nicer way to display this?
9
- puts "Error generating examples for /#{regexp.source}/"
10
- raise $!
11
- end
4
+ it "examples for /#{regexp.source}/" do
5
+ regexp_examples = regexp.examples(max_group_results: 999)
6
+
12
7
  expect(regexp_examples).not_to be_empty, "No examples were generated for regexp: /#{regexp.source}/"
13
8
  regexp_examples.each { |example| expect(example).to match(/\A(?:#{regexp.source})\z/) }
14
9
  # Note: /\A...\z/ is used to prevent misleading examples from passing the test.
@@ -21,24 +16,16 @@ RSpec.describe Regexp, "#examples" do
21
16
 
22
17
  def self.examples_raise_illegal_syntax_error(*regexps)
23
18
  regexps.each do |regexp|
24
- it do
19
+ it "examples for /#{regexp.source}/" do
25
20
  expect{regexp.examples}.to raise_error RegexpExamples::IllegalSyntaxError
26
21
  end
27
22
  end
28
23
  end
29
24
 
30
- def self.examples_raise_unsupported_syntax_error(*regexps)
31
- regexps.each do |regexp|
32
- it do
33
- expect{regexp.examples}.to raise_error RegexpExamples::UnsupportedSyntaxError
34
- end
35
- end
36
- end
37
-
38
25
  def self.examples_are_empty(*regexps)
39
26
  regexps.each do |regexp|
40
- it do
41
- expect(regexp.examples).to be_empty, "Unexpected examples for regexp: /#{regexp.source}/"
27
+ it "examples for /#{regexp.source}/" do
28
+ expect(regexp.examples).to be_empty
42
29
  end
43
30
  end
44
31
  end
@@ -167,7 +154,8 @@ RSpec.describe Regexp, "#examples" do
167
154
  /start-of^-line/,
168
155
  /end-of\Z-string/,
169
156
  /end-of\z-string/,
170
- /end-of$-line/
157
+ /end-of$-line/,
158
+ /(?<name> ... \g<name>*)/
171
159
  )
172
160
  end
173
161
 
@@ -182,13 +170,13 @@ RSpec.describe Regexp, "#examples" do
182
170
  )
183
171
  end
184
172
 
185
- context "for unsupported syntax" do
186
- examples_raise_unsupported_syntax_error(
173
+ context "for named properties" do
174
+ examples_exist_and_match(
187
175
  /\p{L}/,
188
176
  /\p{Arabic}/,
189
- /\p{^Ll}/,
190
- /(?<name> ... \g<name>*)/
177
+ /\p{^Ll}/
191
178
  )
179
+
192
180
  end
193
181
 
194
182
  context "for control characters" do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: regexp-examples
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tom Lord
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-22 00:00:00.000000000 Z
11
+ date: 2015-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -58,13 +58,13 @@ files:
58
58
  - lib/regexp-examples/chargroup_parser.rb
59
59
  - lib/regexp-examples/constants.rb
60
60
  - lib/regexp-examples/core_extensions/regexp/examples.rb
61
- - lib/regexp-examples/exceptions.rb
62
61
  - lib/regexp-examples/groups.rb
63
62
  - lib/regexp-examples/helpers.rb
64
63
  - lib/regexp-examples/parser.rb
65
64
  - lib/regexp-examples/repeaters.rb
66
65
  - lib/regexp-examples/version.rb
67
66
  - regexp-examples.gemspec
67
+ - scripts/unicode_lister.rb
68
68
  - spec/regexp-examples_spec.rb
69
69
  - spec/spec_helper.rb
70
70
  homepage: http://rubygems.org/gems/regexp-examples
@@ -1,6 +0,0 @@
1
- module RegexpExamples
2
- class Error < StandardError; end
3
- class UnsupportedSyntaxError < Error; end
4
- class IllegalSyntaxError < Error; end
5
- class BackrefNotFound < Error; end
6
- end