regexp-examples 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -5
- data/lib/regexp-examples/backreferences.rb +4 -2
- data/lib/regexp-examples/constants.rb +155 -0
- data/lib/regexp-examples/parser.rb +12 -7
- data/lib/regexp-examples/version.rb +1 -1
- data/scripts/unicode_lister.rb +180 -0
- data/spec/regexp-examples_spec.rb +12 -24
- metadata +3 -3
- data/lib/regexp-examples/exceptions.rb +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f7dacce756110dd70823630de898a8c9f55d12b1
|
4
|
+
data.tar.gz: d3ee78e2ed48d91aacc9cb916d8ab71dd25e326d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2655f9c1b1bbb8452a06d7debdba232ba53354131776bbf23a69fc1dc3b62d950600093b4581d2f4c5304f161db421ed95204e8c40cee2adc75c95670dcf42a1
|
7
|
+
data.tar.gz: da2dd9829aa3f5f2415f4a5ca4182133c19b1a481a40172140858ba72f65e05824eebdbff8899c6f0d84a90c93b0539c86c68dd7a23371fe6f577da738746824
|
data/README.md
CHANGED
@@ -44,6 +44,7 @@ For more detail on this, see [configuration options](#configuration-options).
|
|
44
44
|
* Unicode characters, e.g. `/\u0123/`, `/\uabcd/`, `/\u{789}/`
|
45
45
|
* Octal characters, e.g. `/\10/`, `/\177/`
|
46
46
|
* POSIX bracket expressions (including negation), e.g. `/[[:alnum:]]/`, `/[[:^space:]]/`
|
47
|
+
* Named properties, e.g. `/\p{L}/` ("Letter"), `/\p{Arabic}/` ("Arabic character"), `/\p{^Ll}/` ("Not a lowercase letter")
|
47
48
|
* **Arbitrarily complex combinations of all the above!**
|
48
49
|
|
49
50
|
* Regexp options can also be used:
|
@@ -60,11 +61,6 @@ For more detail on this, see [configuration options](#configuration-options).
|
|
60
61
|
|
61
62
|
* Conditional capture groups, such as `/(group1) (?(1)yes|no)/`
|
62
63
|
|
63
|
-
Using any of the following will raise a RegexpExamples::UnsupportedSyntax exception (until such time as they are implemented!):
|
64
|
-
|
65
|
-
* Named properties, e.g. `/\p{L}/` ("Letter"), `/\p{Arabic}/` ("Arabic character"), `/\p{^Ll}/` ("Not a lowercase letter")
|
66
|
-
* Subexpression calls, e.g. `/(?<name> ... \g<name>* )/` (Note: These could get _really_ ugly to implement, and may even be impossible, so I highly doubt it's worth the effort!)
|
67
|
-
|
68
64
|
There are loads more (increasingly obscure) unsupported bits of syntax, which I cannot be bothered to write out here. Full documentation on all the various other obscurities in the ruby (version 2.x) regexp parser can be found [here](https://raw.githubusercontent.com/k-takata/Onigmo/master/doc/RE).
|
69
65
|
|
70
66
|
## Impossible features ("illegal syntax")
|
@@ -77,6 +73,7 @@ Using any of the following will raise a RegexpExamples::IllegalSyntax exception:
|
|
77
73
|
* Lookarounds, e.g. `/foo(?=bar)/`, `/foo(?!bar)/`, `/(?<=foo)bar/`, `/(?<!foo)bar/`
|
78
74
|
* [Anchors](http://ruby-doc.org/core-2.2.0/Regexp.html#class-Regexp-label-Anchors) (`\b`, `\B`, `\G`, `^`, `\A`, `$`, `\z`, `\Z`), e.g. `/\bword\b/`, `/line1\n^line2/`
|
79
75
|
* However, a special case has been made to allow `^`, `\A` and `\G` at the start of a pattern; and to allow `$`, `\z` and `\Z` at the end of pattern. In such cases, the characters are effectively just ignored.
|
76
|
+
* Subexpression calls, e.g. `/(?<name> ... \g<name>* )/`
|
80
77
|
|
81
78
|
(Note: Backreferences are not really "regular" either, but I got these to work with a bit of hackery!)
|
82
79
|
|
@@ -1,5 +1,7 @@
|
|
1
1
|
module RegexpExamples
|
2
2
|
class BackReferenceReplacer
|
3
|
+
BackrefNotFound = Class.new(StandardError)
|
4
|
+
|
3
5
|
def substitute_backreferences(full_examples)
|
4
6
|
full_examples.map do |full_example|
|
5
7
|
begin
|
@@ -7,7 +9,7 @@ module RegexpExamples
|
|
7
9
|
full_example.sub!(/__(\w+?)__/, find_backref_for(full_example, $1))
|
8
10
|
end
|
9
11
|
full_example
|
10
|
-
rescue
|
12
|
+
rescue BackrefNotFound
|
11
13
|
# For instance, one "full example" from /(a|(b)) \2/: "a __2__"
|
12
14
|
# should be rejected because the backref (\2) does not exist
|
13
15
|
nil
|
@@ -27,7 +29,7 @@ module RegexpExamples
|
|
27
29
|
if octal_chars =~ /\A[01]?[0-7]{1,2}\z/ && octal_chars.to_i >= 10
|
28
30
|
Integer(octal_chars, 8).chr
|
29
31
|
else
|
30
|
-
raise(
|
32
|
+
raise(BackrefNotFound)
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
@@ -35,6 +35,9 @@ module RegexpExamples
|
|
35
35
|
Lower = Array('a'..'z')
|
36
36
|
Upper = Array('A'..'Z')
|
37
37
|
Digit = Array('0'..'9')
|
38
|
+
# Note: Punct should also include the following chars: $ + < = > ^ ` | ~
|
39
|
+
# I.e. Punct = %w(! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \\ ] ^ _ ` { | } ~)
|
40
|
+
# However, due to a ruby bug (!!) these do not work properly at the moment!
|
38
41
|
Punct = %w(! " # % & ' ( ) * , - . / : ; ? @ [ \\ ] _ { })
|
39
42
|
Hex = Array('a'..'f') | Array('A'..'F') | Digit
|
40
43
|
Word = Lower | Upper | Digit | ['_']
|
@@ -81,5 +84,157 @@ module RegexpExamples
|
|
81
84
|
'word' => CharSets::Word,
|
82
85
|
'ascii' => CharSets::Any
|
83
86
|
}.freeze
|
87
|
+
|
88
|
+
# Expands a mixed argument list of code points into an Array of
# single-character UTF-8 Strings, preserving argument order.
#
# Each argument is either an Integer (one code point) or an enumerable of
# Integers such as a Range (every code point it yields).
# Accepting bare Integers is a small hack to keep the table below readable.
#
# `Integer` is checked rather than `Fixnum`: `Fixnum` was removed in Ruby 3.2,
# and on older Rubies every Fixnum is already an Integer.
def self.ranges_to_unicode(*ranges)
  ranges.flat_map do |range|
    codes = range.is_a?(Integer) ? [range] : range.to_a
    codes.map { |num| hex_to_unicode(num.to_s(16)) }
  end
end

# Converts a hexadecimal code point String (e.g. "41") into the matching
# single-character UTF-8 String (e.g. "A").
#
# Uses Array#pack('U') instead of eval, so no code is ever evaluated.
# Raises ArgumentError if +hex+ is not a valid hexadecimal number.
def self.hex_to_unicode(hex)
  [Integer(hex, 16)].pack('U')
end
|
103
|
+
|
104
|
+
# These values were generated by: scripts/unicode_lister.rb
|
105
|
+
# Note: Only the first 128 results are listed, for performance.
|
106
|
+
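# Also, some groups have no matches here, because scripts/unicode_lister.rb only scans code points 0..55295 and those groups lie entirely above that range.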
|
107
|
+
NamedPropertyCharMap = {
|
108
|
+
'Alnum' => ranges_to_unicode(48..57, 65..90, 97..122, 170, 181, 186, 192..214, 216..246, 248..256),
|
109
|
+
'Alpha' => ranges_to_unicode(65..90, 97..122, 170, 181, 186, 192..214, 216..246, 248..266),
|
110
|
+
'Blank' => ranges_to_unicode(9, 32, 160, 5760, 8192..8202, 8239, 8287, 12288),
|
111
|
+
'Cntrl' => ranges_to_unicode(0..31, 127..159),
|
112
|
+
'Digit' => ranges_to_unicode(48..57, 1632..1641, 1776..1785, 1984..1993, 2406..2415, 2534..2543, 2662..2671, 2790..2799, 2918..2927, 3046..3055, 3174..3183, 3302..3311, 3430..3437),
|
113
|
+
'Graph' => ranges_to_unicode(33..126, 161..194),
|
114
|
+
'Lower' => ranges_to_unicode(97..122, 170, 181, 186, 223..246, 248..255, 257, 259, 261, 263, 265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293, 295, 297, 299, 301, 303, 305, 307, 309, 311..312, 314, 316, 318, 320, 322, 324, 326, 328..329, 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351, 353, 355, 357, 359, 361, 363, 365, 367, 369, 371, 373, 375, 378, 380, 382..384, 387),
|
115
|
+
'Print' => ranges_to_unicode(32..126, 160..192),
|
116
|
+
'Punct' => ranges_to_unicode(33..35, 37..42, 44..47, 58..59, 63..64, 91..93, 95, 123, 125, 161, 167, 171, 182..183, 187, 191, 894, 903, 1370..1375, 1417..1418, 1470, 1472, 1475, 1478, 1523..1524, 1545..1546, 1548..1549, 1563, 1566..1567, 1642..1645, 1748, 1792..1805, 2039..2041, 2096..2110, 2142, 2404..2405, 2416, 2800, 3572, 3663, 3674..3675, 3844..3858, 3860, 3898..3901, 3973, 4048..4052, 4057..4058, 4170),
|
117
|
+
'Space' => ranges_to_unicode(9..13, 32, 133, 160, 5760, 8192..8202, 8232..8233, 8239, 8287, 12288),
|
118
|
+
'Upper' => ranges_to_unicode(65..90, 192..214, 216..222, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276, 278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302, 304, 306, 308, 310, 313, 315, 317, 319, 321, 323, 325, 327, 330, 332, 334, 336, 338, 340, 342, 344, 346, 348, 350, 352, 354, 356, 358, 360, 362, 364, 366, 368, 370, 372, 374, 376..377, 379, 381, 385..386, 388, 390..391, 393..395, 398),
|
119
|
+
'XDigit' => ranges_to_unicode(48..57, 65..70, 97..102),
|
120
|
+
'Word' => ranges_to_unicode(48..57, 65..90, 95, 97..122, 170, 181, 186, 192..214, 216..246, 248..255),
|
121
|
+
'ASCII' => ranges_to_unicode(0..127),
|
122
|
+
'Any' => ranges_to_unicode(0..127),
|
123
|
+
'Assigned' => ranges_to_unicode(0..127),
|
124
|
+
'L' => ranges_to_unicode(65..90, 97..122, 170, 181, 186, 192..214, 216..246, 248..266),
|
125
|
+
'Ll' => ranges_to_unicode(97..122, 181, 223..246, 248..255, 257, 259, 261, 263, 265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293, 295, 297, 299, 301, 303, 305, 307, 309, 311..312, 314, 316, 318, 320, 322, 324, 326, 328..329, 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351, 353, 355, 357, 359, 361, 363, 365, 367, 369, 371, 373, 375, 378, 380, 382..384, 387, 389, 392),
|
126
|
+
'Lm' => ranges_to_unicode(688..705, 710..721, 736..740, 748, 750, 884, 890, 1369, 1600, 1765..1766, 2036..2037, 2042, 2074, 2084, 2088, 2417, 3654, 3782, 4348, 6103, 6211, 6823, 7288..7293, 7468..7530, 7544, 7579..7580),
|
127
|
+
'Lo' => ranges_to_unicode(170, 186, 443, 448..451, 660, 1488..1514, 1520..1522, 1568..1599, 1601..1610, 1646..1647, 1649..1694),
|
128
|
+
'Lt' => ranges_to_unicode(453, 456, 459, 498, 8072..8079, 8088..8095, 8104..8111, 8124, 8140, 8188),
|
129
|
+
'Lu' => ranges_to_unicode(65..90, 192..214, 216..222, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276, 278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302, 304, 306, 308, 310, 313, 315, 317, 319, 321, 323, 325, 327, 330, 332, 334, 336, 338, 340, 342, 344, 346, 348, 350, 352, 354, 356, 358, 360, 362, 364, 366, 368, 370, 372, 374, 376..377, 379, 381, 385..386, 388, 390..391, 393..395, 398),
|
130
|
+
'M' => ranges_to_unicode(768..879, 1155..1161, 1425..1433),
|
131
|
+
'Mn' => ranges_to_unicode(768..879, 1155..1159, 1425..1435),
|
132
|
+
'Mc' => ranges_to_unicode(2307, 2363, 2366..2368, 2377..2380, 2382..2383, 2434..2435, 2494..2496, 2503..2504, 2507..2508, 2519, 2563, 2622..2624, 2691, 2750..2752, 2761, 2763..2764, 2818..2819, 2878, 2880, 2887..2888, 2891..2892, 2903, 3006..3007, 3009..3010, 3014..3016, 3018..3020, 3031, 3073..3075, 3137..3140, 3202..3203, 3262, 3264..3268, 3271..3272, 3274..3275, 3285..3286, 3330..3331, 3390..3392, 3398..3400, 3402..3404, 3415, 3458..3459, 3535..3537, 3544..3551, 3570..3571, 3902..3903, 3967, 4139..4140, 4145, 4152, 4155..4156, 4182..4183, 4194..4196, 4199..4205, 4227..4228, 4231..4235),
|
133
|
+
'Me' => ranges_to_unicode(1160..1161, 6846, 8413..8416, 8418..8420, 42608..42610),
|
134
|
+
'N' => ranges_to_unicode(48..57, 178..179, 185, 188..190, 1632..1641, 1776..1785, 1984..1993, 2406..2415, 2534..2543, 2548..2553, 2662..2671, 2790..2799, 2918..2927, 2930..2935, 3046..3058, 3174..3180),
|
135
|
+
'Nd' => ranges_to_unicode(48..57, 1632..1641, 1776..1785, 1984..1993, 2406..2415, 2534..2543, 2662..2671, 2790..2799, 2918..2927, 3046..3055, 3174..3183, 3302..3311, 3430..3437),
|
136
|
+
'Nl' => ranges_to_unicode(5870..5872, 8544..8578, 8581..8584, 12295, 12321..12329, 12344..12346, 42726..42735),
|
137
|
+
'No' => ranges_to_unicode(178..179, 185, 188..190, 2548..2553, 2930..2935, 3056..3058, 3192..3198, 3440..3445, 3882..3891, 4969..4988, 6128..6137, 6618, 8304, 8308..8313, 8320..8329, 8528..8543, 8585, 9312..9330),
|
138
|
+
'P' => ranges_to_unicode(33..35, 37..42, 44..47, 58..59, 63..64, 91..93, 95, 123, 125, 161, 167, 171, 182..183, 187, 191, 894, 903, 1370..1375, 1417..1418, 1470, 1472, 1475, 1478, 1523..1524, 1545..1546, 1548..1549, 1563, 1566..1567, 1642..1645, 1748, 1792..1805, 2039..2041, 2096..2110, 2142, 2404..2405, 2416, 2800, 3572, 3663, 3674..3675, 3844..3858, 3860, 3898..3901, 3973, 4048..4052, 4057..4058, 4170),
|
139
|
+
'Pc' => ranges_to_unicode(95, 8255..8256, 8276),
|
140
|
+
'Pd' => ranges_to_unicode(45, 1418, 1470, 5120, 6150, 8208..8213, 11799, 11802, 11834..11835, 11840, 12316, 12336, 12448),
|
141
|
+
'Ps' => ranges_to_unicode(40, 91, 123, 3898, 3900, 5787, 8218, 8222, 8261, 8317, 8333, 8968, 8970, 9001, 10088, 10090, 10092, 10094, 10096, 10098, 10100, 10181, 10214, 10216, 10218, 10220, 10222, 10627, 10629, 10631, 10633, 10635, 10637, 10639, 10641, 10643, 10645, 10647, 10712, 10714, 10748, 11810, 11812, 11814, 11816, 11842, 12296, 12298, 12300, 12302, 12304, 12308, 12310, 12312, 12314, 12317),
|
142
|
+
'Pe' => ranges_to_unicode(41, 93, 125, 3899, 3901, 5788, 8262, 8318, 8334, 8969, 8971, 9002, 10089, 10091, 10093, 10095, 10097, 10099, 10101, 10182, 10215, 10217, 10219, 10221, 10223, 10628, 10630, 10632, 10634, 10636, 10638, 10640, 10642, 10644, 10646, 10648, 10713, 10715, 10749, 11811, 11813, 11815, 11817, 12297, 12299, 12301, 12303, 12305, 12309, 12311, 12313, 12315, 12318..12319),
|
143
|
+
'Pi' => ranges_to_unicode(171, 8216, 8219..8220, 8223, 8249, 11778, 11780, 11785, 11788, 11804, 11808),
|
144
|
+
'Pf' => ranges_to_unicode(187, 8217, 8221, 8250, 11779, 11781, 11786, 11789, 11805, 11809),
|
145
|
+
'Po' => ranges_to_unicode(33..35, 37..39, 42, 44, 46..47, 58..59, 63..64, 92, 161, 167, 182..183, 191, 894, 903, 1370..1375, 1417, 1472, 1475, 1478, 1523..1524, 1545..1546, 1548..1549, 1563, 1566..1567, 1642..1645, 1748, 1792..1805, 2039..2041, 2096..2110, 2142, 2404..2405, 2416, 2800, 3572, 3663, 3674..3675, 3844..3858, 3860, 3973, 4048..4052, 4057..4058, 4170..4175, 4347, 4960..4968, 5741),
|
146
|
+
'S' => ranges_to_unicode(36, 43, 60..62, 94, 96, 124, 126, 162..166, 168..169, 172, 174..177, 180, 184, 215, 247, 706..709, 722..735, 741..747, 749, 751..767, 885, 900..901, 1014, 1154, 1421..1423, 1542..1544, 1547, 1550..1551, 1758, 1769, 1789..1790, 2038, 2546..2547, 2554..2555, 2801, 2928, 3059..3066, 3199, 3449, 3647, 3841..3843, 3859, 3861..3863, 3866..3871, 3892, 3894, 3896, 4030..4037),
|
147
|
+
'Sm' => ranges_to_unicode(43, 60..62, 124, 126, 172, 177, 215, 247, 1014, 1542..1544, 8260, 8274, 8314..8316, 8330..8332, 8472, 8512..8516, 8523, 8592..8596, 8602..8603, 8608, 8611, 8614, 8622, 8654..8655, 8658, 8660, 8692..8775),
|
148
|
+
'Sc' => ranges_to_unicode(36, 162..165, 1423, 1547, 2546..2547, 2555, 2801, 3065, 3647, 6107, 8352..8381, 43064),
|
149
|
+
'Sk' => ranges_to_unicode(94, 96, 168, 175, 180, 184, 706..709, 722..735, 741..747, 749, 751..767, 885, 900..901, 8125, 8127..8129, 8141..8143, 8157..8159, 8173..8175, 8189..8190, 12443..12444, 42752..42774, 42784..42785, 42889..42890, 43867),
|
150
|
+
'So' => ranges_to_unicode(166, 169, 174, 176, 1154, 1421..1422, 1550..1551, 1758, 1769, 1789..1790, 2038, 2554, 2928, 3059..3064, 3066, 3199, 3449, 3841..3843, 3859, 3861..3863, 3866..3871, 3892, 3894, 3896, 4030..4037, 4039..4044, 4046..4047, 4053..4056, 4254..4255, 5008..5017, 6464, 6622..6655, 7009..7018, 7028..7036, 8448),
|
151
|
+
'Z' => ranges_to_unicode(32, 160, 5760, 8192..8202, 8232..8233, 8239, 8287, 12288),
|
152
|
+
'Zs' => ranges_to_unicode(32, 160, 5760, 8192..8202, 8239, 8287, 12288),
|
153
|
+
'Zl' => ranges_to_unicode(8232),
|
154
|
+
'Zp' => ranges_to_unicode(8233),
|
155
|
+
'C' => ranges_to_unicode(0..31, 127..159, 173, 888..889, 896..899, 907, 909, 930, 1328, 1367..1368, 1376, 1416, 1419..1420, 1424, 1480..1487, 1515..1519, 1525..1541, 1564..1565, 1757, 1806..1807, 1867..1868, 1970..1977),
|
156
|
+
'Cc' => ranges_to_unicode(0..31, 127..159),
|
157
|
+
'Cf' => ranges_to_unicode(173, 1536..1541, 1564, 1757, 1807, 6158, 8203..8207, 8234..8238, 8288..8292, 8294..8303),
|
158
|
+
'Cn' => ranges_to_unicode(888..889, 896..899, 907, 909, 930, 1328, 1367..1368, 1376, 1416, 1419..1420, 1424, 1480..1487, 1515..1519, 1525..1535, 1565, 1806, 1867..1868, 1970..1983, 2043..2047, 2094..2095, 2111, 2140..2141, 2143..2201),
|
159
|
+
'Co' => ranges_to_unicode(),
|
160
|
+
'Cs' => ranges_to_unicode(),
|
161
|
+
'Arabic' => ranges_to_unicode(1536..1540, 1542..1547, 1549..1562, 1566, 1568..1599, 1601..1610, 1622..1631, 1642..1647, 1649..1692),
|
162
|
+
'Armenian' => ranges_to_unicode(1329..1366, 1369..1375, 1377..1415, 1418, 1421..1423),
|
163
|
+
'Balinese' => ranges_to_unicode(6912..6987, 6992..7036),
|
164
|
+
'Bengali' => ranges_to_unicode(2432..2435, 2437..2444, 2447..2448, 2451..2472, 2474..2480, 2482, 2486..2489, 2492..2500, 2503..2504, 2507..2510, 2519, 2524..2525, 2527..2531, 2534..2555),
|
165
|
+
'Bopomofo' => ranges_to_unicode(746..747, 12549..12589, 12704..12730),
|
166
|
+
'Braille' => ranges_to_unicode(10240..10367),
|
167
|
+
'Buginese' => ranges_to_unicode(6656..6683, 6686..6687),
|
168
|
+
'Buhid' => ranges_to_unicode(5952..5971),
|
169
|
+
'Canadian_Aboriginal' => ranges_to_unicode(5120..5247),
|
170
|
+
'Carian' => ranges_to_unicode(),
|
171
|
+
'Cham' => ranges_to_unicode(43520..43574, 43584..43597, 43600..43609, 43612..43615),
|
172
|
+
'Cherokee' => ranges_to_unicode(5024..5108),
|
173
|
+
'Common' => ranges_to_unicode(0..64, 91..96, 123..169, 171..180),
|
174
|
+
'Coptic' => ranges_to_unicode(994..1007, 11392..11505),
|
175
|
+
'Cuneiform' => ranges_to_unicode(),
|
176
|
+
'Cypriot' => ranges_to_unicode(),
|
177
|
+
'Cyrillic' => ranges_to_unicode(1024..1151),
|
178
|
+
'Deseret' => ranges_to_unicode(),
|
179
|
+
'Devanagari' => ranges_to_unicode(2304..2384, 2387..2403, 2406..2431, 43232..43235),
|
180
|
+
'Ethiopic' => ranges_to_unicode(4608..4680, 4682..4685, 4688..4694, 4696, 4698..4701, 4704..4742),
|
181
|
+
'Georgian' => ranges_to_unicode(4256..4293, 4295, 4301, 4304..4346, 4348..4351, 11520..11557, 11559, 11565),
|
182
|
+
'Glagolitic' => ranges_to_unicode(11264..11310, 11312..11358),
|
183
|
+
'Gothic' => ranges_to_unicode(),
|
184
|
+
'Greek' => ranges_to_unicode(880..883, 885..887, 890..893, 895, 900, 902, 904..906, 908, 910..929, 931..993, 1008..1023, 7462..7466, 7517..7521, 7526),
|
185
|
+
'Gujarati' => ranges_to_unicode(2689..2691, 2693..2701, 2703..2705, 2707..2728, 2730..2736, 2738..2739, 2741..2745, 2748..2757, 2759..2761, 2763..2765, 2768, 2784..2787, 2790..2801),
|
186
|
+
'Gurmukhi' => ranges_to_unicode(2561..2563, 2565..2570, 2575..2576, 2579..2600, 2602..2608, 2610..2611, 2613..2614, 2616..2617, 2620, 2622..2626, 2631..2632, 2635..2637, 2641, 2649..2652, 2654, 2662..2677),
|
187
|
+
'Han' => ranges_to_unicode(11904..11929, 11931..12019, 12032..12044),
|
188
|
+
'Hangul' => ranges_to_unicode(4352..4479),
|
189
|
+
'Hanunoo' => ranges_to_unicode(5920..5940),
|
190
|
+
'Hebrew' => ranges_to_unicode(1425..1479, 1488..1514, 1520..1524),
|
191
|
+
'Hiragana' => ranges_to_unicode(12353..12438, 12445..12447),
|
192
|
+
'Inherited' => ranges_to_unicode(768..879, 1157..1158, 1611..1621, 1648, 2385..2386),
|
193
|
+
'Kannada' => ranges_to_unicode(3201..3203, 3205..3212, 3214..3216, 3218..3240, 3242..3251, 3253..3257, 3260..3268, 3270..3272, 3274..3277, 3285..3286, 3294, 3296..3299, 3302..3311, 3313..3314),
|
194
|
+
'Katakana' => ranges_to_unicode(12449..12538, 12541..12543, 12784..12799, 13008..13026),
|
195
|
+
'Kayah_Li' => ranges_to_unicode(43264..43309, 43311),
|
196
|
+
'Kharoshthi' => ranges_to_unicode(),
|
197
|
+
'Khmer' => ranges_to_unicode(6016..6109, 6112..6121, 6128..6137, 6624..6637),
|
198
|
+
'Lao' => ranges_to_unicode(3713..3714, 3716, 3719..3720, 3722, 3725, 3732..3735, 3737..3743, 3745..3747, 3749, 3751, 3754..3755, 3757..3769, 3771..3773, 3776..3780, 3782, 3784..3789, 3792..3801, 3804..3807),
|
199
|
+
'Latin' => ranges_to_unicode(65..90, 97..122, 170, 186, 192..214, 216..246, 248..267),
|
200
|
+
'Lepcha' => ranges_to_unicode(7168..7223, 7227..7241, 7245..7247),
|
201
|
+
'Limbu' => ranges_to_unicode(6400..6430, 6432..6443, 6448..6459, 6464, 6468..6479),
|
202
|
+
'Linear_B' => ranges_to_unicode(),
|
203
|
+
'Lycian' => ranges_to_unicode(),
|
204
|
+
'Lydian' => ranges_to_unicode(),
|
205
|
+
'Malayalam' => ranges_to_unicode(3329..3331, 3333..3340, 3342..3344, 3346..3386, 3389..3396, 3398..3400, 3402..3406, 3415, 3424..3427, 3430..3445, 3449..3455),
|
206
|
+
'Mongolian' => ranges_to_unicode(6144..6145, 6148, 6150..6158, 6160..6169, 6176..6263, 6272..6289),
|
207
|
+
'Myanmar' => ranges_to_unicode(4096..4223),
|
208
|
+
'New_Tai_Lue' => ranges_to_unicode(6528..6571, 6576..6601, 6608..6618, 6622..6623),
|
209
|
+
'Nko' => ranges_to_unicode(1984..2042),
|
210
|
+
'Ogham' => ranges_to_unicode(5760..5788),
|
211
|
+
'Ol_Chiki' => ranges_to_unicode(7248..7295),
|
212
|
+
'Old_Italic' => ranges_to_unicode(),
|
213
|
+
'Old_Persian' => ranges_to_unicode(),
|
214
|
+
'Oriya' => ranges_to_unicode(2817..2819, 2821..2828, 2831..2832, 2835..2856, 2858..2864, 2866..2867, 2869..2873, 2876..2884, 2887..2888, 2891..2893, 2902..2903, 2908..2909, 2911..2915, 2918..2935),
|
215
|
+
'Osmanya' => ranges_to_unicode(),
|
216
|
+
'Phags_Pa' => ranges_to_unicode(43072..43127),
|
217
|
+
'Phoenician' => ranges_to_unicode(),
|
218
|
+
'Rejang' => ranges_to_unicode(43312..43347, 43359),
|
219
|
+
'Runic' => ranges_to_unicode(5792..5866, 5870..5880),
|
220
|
+
'Saurashtra' => ranges_to_unicode(43136..43204, 43214..43225),
|
221
|
+
'Shavian' => ranges_to_unicode(),
|
222
|
+
'Sinhala' => ranges_to_unicode(3458..3459, 3461..3478, 3482..3505, 3507..3515, 3517, 3520..3526, 3530, 3535..3540, 3542, 3544..3551, 3558..3567, 3570..3572),
|
223
|
+
'Sundanese' => ranges_to_unicode(7040..7103, 7360..7367),
|
224
|
+
'Syloti_Nagri' => ranges_to_unicode(43008..43051),
|
225
|
+
'Syriac' => ranges_to_unicode(1792..1805, 1807..1866, 1869..1871),
|
226
|
+
'Tagalog' => ranges_to_unicode(5888..5900, 5902..5908),
|
227
|
+
'Tagbanwa' => ranges_to_unicode(5984..5996, 5998..6000, 6002..6003),
|
228
|
+
'Tai_Le' => ranges_to_unicode(6480..6509, 6512..6516),
|
229
|
+
'Tamil' => ranges_to_unicode(2946..2947, 2949..2954, 2958..2960, 2962..2965, 2969..2970, 2972, 2974..2975, 2979..2980, 2984..2986, 2990..3001, 3006..3010, 3014..3016, 3018..3021, 3024, 3031, 3046..3066),
|
230
|
+
'Telugu' => ranges_to_unicode(3072..3075, 3077..3084, 3086..3088, 3090..3112, 3114..3129, 3133..3140, 3142..3144, 3146..3149, 3157..3158, 3160..3161, 3168..3171, 3174..3183, 3192..3199),
|
231
|
+
'Thaana' => ranges_to_unicode(1920..1969),
|
232
|
+
'Thai' => ranges_to_unicode(3585..3642, 3648..3675),
|
233
|
+
'Tibetan' => ranges_to_unicode(3840..3911, 3913..3948, 3953..3972),
|
234
|
+
'Tifinagh' => ranges_to_unicode(11568..11623, 11631..11632, 11647),
|
235
|
+
'Ugaritic' => ranges_to_unicode(),
|
236
|
+
'Vai' => ranges_to_unicode(42240..42367),
|
237
|
+
'Yi' => ranges_to_unicode(40960..41087),
|
238
|
+
}.freeze
|
84
239
|
end
|
85
240
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
module RegexpExamples
|
2
|
+
IllegalSyntaxError = Class.new(StandardError)
|
2
3
|
class Parser
|
3
4
|
attr_reader :regexp_string
|
4
5
|
def initialize(regexp_string, regexp_options, config_options={})
|
@@ -85,8 +86,6 @@ module RegexpExamples
|
|
85
86
|
group = parse_backreference_group($1)
|
86
87
|
when BackslashCharMap.keys.include?(next_char)
|
87
88
|
group = CharGroup.new(
|
88
|
-
# Note: The `.dup` is important, as it prevents modifying the constant, in
|
89
|
-
# CharGroup#init_ranges (where the '-' is moved to the front)
|
90
89
|
BackslashCharMap[next_char].dup,
|
91
90
|
@ignorecase
|
92
91
|
)
|
@@ -100,16 +99,22 @@ module RegexpExamples
|
|
100
99
|
@current_position += $1.length
|
101
100
|
sequence = $1.match(/\h{1,4}/)[0] # Strip off "{" and "}"
|
102
101
|
group = parse_single_char_group( parse_unicode_sequence(sequence) )
|
103
|
-
when rest_of_string =~ /\Ap\{([^}]+)\}/ # Named properties
|
104
|
-
@current_position += ($1.length + 2)
|
105
|
-
|
102
|
+
when rest_of_string =~ /\Ap\{(\^?)([^}]+)\}/ # Named properties
|
103
|
+
@current_position += ($1.length + $2.length + 2)
|
104
|
+
group = CharGroup.new(
|
105
|
+
if($1 == "^")
|
106
|
+
CharSets::Any.dup - NamedPropertyCharMap[$2]
|
107
|
+
else
|
108
|
+
NamedPropertyCharMap[$2]
|
109
|
+
end,
|
110
|
+
@ignorecase
|
111
|
+
)
|
106
112
|
when next_char == 'K' # Keep (special lookbehind that CAN be supported safely!)
|
107
113
|
group = PlaceHolderGroup.new
|
108
114
|
when next_char == 'R' # Linebreak
|
109
115
|
group = CharGroup.new(["\r\n", "\n", "\v", "\f", "\r"], @ignorecase) # A bit hacky...
|
110
116
|
when next_char == 'g' # Subexpression call
|
111
|
-
|
112
|
-
raise UnsupportedSyntaxError, "Subexpression calls (\g) are not yet supported"
|
117
|
+
raise IllegalSyntaxError, "Subexpression calls (\g) are not yet supported"
|
113
118
|
when next_char =~ /[bB]/ # Anchors
|
114
119
|
raise IllegalSyntaxError, "Anchors ('\\#{next_char}') cannot be supported, as they are not regular"
|
115
120
|
when next_char =~ /[AG]/ # Start of string
|
@@ -0,0 +1,180 @@
|
|
1
|
+
# A script to generate lists of all unicode characters
|
2
|
+
# that match all named group/character properties regexps.
|
3
|
+
# For use in e.g. /\p{Arabic}/.examples
|
4
|
+
|
5
|
+
# To (re-)generate this list, simply run this file!
|
6
|
+
# > ruby scripts/unicode_lister.rb
|
7
|
+
OutputFilename = 'unicode_result'
|
8
|
+
|
9
|
+
# Taken from ruby documentation:
|
10
|
+
# http://ruby-doc.org//core-2.2.0/Regexp.html#class-Regexp-label-Character+Properties
|
11
|
+
NamedGroups = %w(
|
12
|
+
Alnum
|
13
|
+
Alpha
|
14
|
+
Blank
|
15
|
+
Cntrl
|
16
|
+
Digit
|
17
|
+
Graph
|
18
|
+
Lower
|
19
|
+
Print
|
20
|
+
Punct
|
21
|
+
Space
|
22
|
+
Upper
|
23
|
+
XDigit
|
24
|
+
Word
|
25
|
+
ASCII
|
26
|
+
Any
|
27
|
+
Assigned
|
28
|
+
|
29
|
+
L
|
30
|
+
Ll
|
31
|
+
Lm
|
32
|
+
Lo
|
33
|
+
Lt
|
34
|
+
Lu
|
35
|
+
M
|
36
|
+
Mn
|
37
|
+
Mc
|
38
|
+
Me
|
39
|
+
N
|
40
|
+
Nd
|
41
|
+
Nl
|
42
|
+
No
|
43
|
+
P
|
44
|
+
Pc
|
45
|
+
Pd
|
46
|
+
Ps
|
47
|
+
Pe
|
48
|
+
Pi
|
49
|
+
Pf
|
50
|
+
Po
|
51
|
+
S
|
52
|
+
Sm
|
53
|
+
Sc
|
54
|
+
Sk
|
55
|
+
So
|
56
|
+
Z
|
57
|
+
Zs
|
58
|
+
Zl
|
59
|
+
Zp
|
60
|
+
C
|
61
|
+
Cc
|
62
|
+
Cf
|
63
|
+
Cn
|
64
|
+
Co
|
65
|
+
Cs
|
66
|
+
|
67
|
+
Arabic
|
68
|
+
Armenian
|
69
|
+
Balinese
|
70
|
+
Bengali
|
71
|
+
Bopomofo
|
72
|
+
Braille
|
73
|
+
Buginese
|
74
|
+
Buhid
|
75
|
+
Canadian_Aboriginal
|
76
|
+
Carian
|
77
|
+
Cham
|
78
|
+
Cherokee
|
79
|
+
Common
|
80
|
+
Coptic
|
81
|
+
Cuneiform
|
82
|
+
Cypriot
|
83
|
+
Cyrillic
|
84
|
+
Deseret
|
85
|
+
Devanagari
|
86
|
+
Ethiopic
|
87
|
+
Georgian
|
88
|
+
Glagolitic
|
89
|
+
Gothic
|
90
|
+
Greek
|
91
|
+
Gujarati
|
92
|
+
Gurmukhi
|
93
|
+
Han
|
94
|
+
Hangul
|
95
|
+
Hanunoo
|
96
|
+
Hebrew
|
97
|
+
Hiragana
|
98
|
+
Inherited
|
99
|
+
Kannada
|
100
|
+
Katakana
|
101
|
+
Kayah_Li
|
102
|
+
Kharoshthi
|
103
|
+
Khmer
|
104
|
+
Lao
|
105
|
+
Latin
|
106
|
+
Lepcha
|
107
|
+
Limbu
|
108
|
+
Linear_B
|
109
|
+
Lycian
|
110
|
+
Lydian
|
111
|
+
Malayalam
|
112
|
+
Mongolian
|
113
|
+
Myanmar
|
114
|
+
New_Tai_Lue
|
115
|
+
Nko
|
116
|
+
Ogham
|
117
|
+
Ol_Chiki
|
118
|
+
Old_Italic
|
119
|
+
Old_Persian
|
120
|
+
Oriya
|
121
|
+
Osmanya
|
122
|
+
Phags_Pa
|
123
|
+
Phoenician
|
124
|
+
Rejang
|
125
|
+
Runic
|
126
|
+
Saurashtra
|
127
|
+
Shavian
|
128
|
+
Sinhala
|
129
|
+
Sundanese
|
130
|
+
Syloti_Nagri
|
131
|
+
Syriac
|
132
|
+
Tagalog
|
133
|
+
Tagbanwa
|
134
|
+
Tai_Le
|
135
|
+
Tamil
|
136
|
+
Telugu
|
137
|
+
Thaana
|
138
|
+
Thai
|
139
|
+
Tibetan
|
140
|
+
Tifinagh
|
141
|
+
Ugaritic
|
142
|
+
Vai
|
143
|
+
Yi
|
144
|
+
)
|
145
|
+
|
146
|
+
# Note: For some reason, a character encoding-related exception gets raised
|
147
|
+
# when I do `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")` in the range: 55296..57343
|
148
|
+
# This means my calculation is MISSING results in the range: 55296..65535
|
149
|
+
# However, for the sake of performance, I'm also being "lazy" and only calculating/saving
|
150
|
+
# the first 128 matches anyway!
|
151
|
+
# If anyone ever cares about this (I doubt it), I'll look into fixing/improving it.
|
152
|
+
|
153
|
+
# Example input: [1, 2, 3, 4, 6, 7, 12, 14] (Array)
|
154
|
+
# Example output: "1..4, 6..7, 12, 14" (String)
|
155
|
+
# Collapses a sorted list of integer codes into a compact, comma-separated
# description in which each run of consecutive values is written as "a..b"
# and each isolated value is written on its own.
#
# matching_codes - Array of Integers in ascending order. It is NOT modified.
#
# Returns a String; an empty input gives "".
def calculate_ranges(matching_codes)
  runs = matching_codes.each_with_object([]) do |code, acc|
    if acc.any? && acc.last.last.succ == code
      acc[-1] = (acc.last.first..code) # Extend the current run in place
    else
      acc << (code..code) # Start a new run
    end
  end

  # A run of length one is printed as a bare number, not as "x..x"
  runs.map { |range| range.first == range.last ? range.first : range }.join(", ")
end
|
168
|
+
|
169
|
+
# Script entry point: for every name in NamedGroups, finds the first 128 code
# points in 0..55295 that match /\p{Name}/ and writes one line of the form
#   'Name' => ranges_to_unicode(...),
# to OutputFilename, printing progress to stdout as it goes.
File.open(OutputFilename, 'w') do |f|
  NamedGroups.each_with_index do |name, index|
    # Build the property regexp once per property; an interpolated literal
    # inside the select block would be recompiled for every code point.
    property = /\p{#{name}}/
    # `lazy` stops scanning as soon as 128 matches have been collected.
    matching_codes = (0..55295).lazy.select { |x| property =~ [x].pack('U') }.first(128)
    f.puts "'#{name}' => ranges_to_unicode(#{calculate_ranges(matching_codes)}),"
    puts "(#{index + 1}/#{NamedGroups.length}) Finished property: #{name}"
  end
  puts "*"*50
  puts "Finished! Result stored in: #{OutputFilename}"
end
|
180
|
+
|
@@ -1,14 +1,9 @@
|
|
1
1
|
RSpec.describe Regexp, "#examples" do
|
2
2
|
def self.examples_exist_and_match(*regexps)
|
3
3
|
regexps.each do |regexp|
|
4
|
-
it do
|
5
|
-
|
6
|
-
|
7
|
-
rescue
|
8
|
-
# TODO: Find a nicer way to display this?
|
9
|
-
puts "Error generating examples for /#{regexp.source}/"
|
10
|
-
raise $!
|
11
|
-
end
|
4
|
+
it "examples for /#{regexp.source}/" do
|
5
|
+
regexp_examples = regexp.examples(max_group_results: 999)
|
6
|
+
|
12
7
|
expect(regexp_examples).not_to be_empty, "No examples were generated for regexp: /#{regexp.source}/"
|
13
8
|
regexp_examples.each { |example| expect(example).to match(/\A(?:#{regexp.source})\z/) }
|
14
9
|
# Note: /\A...\z/ is used to prevent misleading examples from passing the test.
|
@@ -21,24 +16,16 @@ RSpec.describe Regexp, "#examples" do
|
|
21
16
|
|
22
17
|
def self.examples_raise_illegal_syntax_error(*regexps)
|
23
18
|
regexps.each do |regexp|
|
24
|
-
it do
|
19
|
+
it "examples for /#{regexp.source}/" do
|
25
20
|
expect{regexp.examples}.to raise_error RegexpExamples::IllegalSyntaxError
|
26
21
|
end
|
27
22
|
end
|
28
23
|
end
|
29
24
|
|
30
|
-
def self.examples_raise_unsupported_syntax_error(*regexps)
|
31
|
-
regexps.each do |regexp|
|
32
|
-
it do
|
33
|
-
expect{regexp.examples}.to raise_error RegexpExamples::UnsupportedSyntaxError
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
25
|
def self.examples_are_empty(*regexps)
|
39
26
|
regexps.each do |regexp|
|
40
|
-
it do
|
41
|
-
expect(regexp.examples).to be_empty
|
27
|
+
it "examples for /#{regexp.source}/" do
|
28
|
+
expect(regexp.examples).to be_empty
|
42
29
|
end
|
43
30
|
end
|
44
31
|
end
|
@@ -167,7 +154,8 @@ RSpec.describe Regexp, "#examples" do
|
|
167
154
|
/start-of^-line/,
|
168
155
|
/end-of\Z-string/,
|
169
156
|
/end-of\z-string/,
|
170
|
-
/end-of$-line
|
157
|
+
/end-of$-line/,
|
158
|
+
/(?<name> ... \g<name>*)/
|
171
159
|
)
|
172
160
|
end
|
173
161
|
|
@@ -182,13 +170,13 @@ RSpec.describe Regexp, "#examples" do
|
|
182
170
|
)
|
183
171
|
end
|
184
172
|
|
185
|
-
context "for
|
186
|
-
|
173
|
+
context "for named properties" do
|
174
|
+
examples_exist_and_match(
|
187
175
|
/\p{L}/,
|
188
176
|
/\p{Arabic}/,
|
189
|
-
/\p{^Ll}
|
190
|
-
/(?<name> ... \g<name>*)/
|
177
|
+
/\p{^Ll}/
|
191
178
|
)
|
179
|
+
|
192
180
|
end
|
193
181
|
|
194
182
|
context "for control characters" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: regexp-examples
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tom Lord
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-02-
|
11
|
+
date: 2015-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -58,13 +58,13 @@ files:
|
|
58
58
|
- lib/regexp-examples/chargroup_parser.rb
|
59
59
|
- lib/regexp-examples/constants.rb
|
60
60
|
- lib/regexp-examples/core_extensions/regexp/examples.rb
|
61
|
-
- lib/regexp-examples/exceptions.rb
|
62
61
|
- lib/regexp-examples/groups.rb
|
63
62
|
- lib/regexp-examples/helpers.rb
|
64
63
|
- lib/regexp-examples/parser.rb
|
65
64
|
- lib/regexp-examples/repeaters.rb
|
66
65
|
- lib/regexp-examples/version.rb
|
67
66
|
- regexp-examples.gemspec
|
67
|
+
- scripts/unicode_lister.rb
|
68
68
|
- spec/regexp-examples_spec.rb
|
69
69
|
- spec/spec_helper.rb
|
70
70
|
homepage: http://rubygems.org/gems/regexp-examples
|