regexp-examples 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -5
- data/lib/regexp-examples/backreferences.rb +4 -2
- data/lib/regexp-examples/constants.rb +155 -0
- data/lib/regexp-examples/parser.rb +12 -7
- data/lib/regexp-examples/version.rb +1 -1
- data/scripts/unicode_lister.rb +180 -0
- data/spec/regexp-examples_spec.rb +12 -24
- metadata +3 -3
- data/lib/regexp-examples/exceptions.rb +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f7dacce756110dd70823630de898a8c9f55d12b1
|
4
|
+
data.tar.gz: d3ee78e2ed48d91aacc9cb916d8ab71dd25e326d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2655f9c1b1bbb8452a06d7debdba232ba53354131776bbf23a69fc1dc3b62d950600093b4581d2f4c5304f161db421ed95204e8c40cee2adc75c95670dcf42a1
|
7
|
+
data.tar.gz: da2dd9829aa3f5f2415f4a5ca4182133c19b1a481a40172140858ba72f65e05824eebdbff8899c6f0d84a90c93b0539c86c68dd7a23371fe6f577da738746824
|
data/README.md
CHANGED
@@ -44,6 +44,7 @@ For more detail on this, see [configuration options](#configuration-options).
|
|
44
44
|
* Unicode characters, e.g. `/\u0123/`, `/\uabcd/`, `/\u{789}/`
|
45
45
|
* Octal characters, e.g. `/\10/`, `/\177/`
|
46
46
|
* POSIX bracket expressions (including negation), e.g. `/[[:alnum:]]/`, `/[[:^space:]]/`
|
47
|
+
* Named properties, e.g. `/\p{L}/` ("Letter"), `/\p{Arabic}/` ("Arabic character"), `/\p{^Ll}/` ("Not a lowercase letter")
|
47
48
|
* **Arbitrarily complex combinations of all the above!**
|
48
49
|
|
49
50
|
* Regexp options can also be used:
|
@@ -60,11 +61,6 @@ For more detail on this, see [configuration options](#configuration-options).
|
|
60
61
|
|
61
62
|
* Conditional capture groups, such as `/(group1) (?(1)yes|no)/`
|
62
63
|
|
63
|
-
Using any of the following will raise a RegexpExamples::UnsupportedSyntax exception (until such time as they are implemented!):
|
64
|
-
|
65
|
-
* Named properties, e.g. `/\p{L}/` ("Letter"), `/\p{Arabic}/` ("Arabic character"), `/\p{^Ll}/` ("Not a lowercase letter")
|
66
|
-
* Subexpression calls, e.g. `/(?<name> ... \g<name>* )/` (Note: These could get _really_ ugly to implement, and may even be impossible, so I highly doubt it's worth the effort!)
|
67
|
-
|
68
64
|
There are loads more (increasingly obscure) unsupported bits of syntax, which I cannot be bothered to write out here. Full documentation on all the various other obscurities in the ruby (version 2.x) regexp parser can be found [here](https://raw.githubusercontent.com/k-takata/Onigmo/master/doc/RE).
|
69
65
|
|
70
66
|
## Impossible features ("illegal syntax")
|
@@ -77,6 +73,7 @@ Using any of the following will raise a RegexpExamples::IllegalSyntax exception:
|
|
77
73
|
* Lookarounds, e.g. `/foo(?=bar)/`, `/foo(?!bar)/`, `/(?<=foo)bar/`, `/(?<!foo)bar/`
|
78
74
|
* [Anchors](http://ruby-doc.org/core-2.2.0/Regexp.html#class-Regexp-label-Anchors) (`\b`, `\B`, `\G`, `^`, `\A`, `$`, `\z`, `\Z`), e.g. `/\bword\b/`, `/line1\n^line2/`
|
79
75
|
* However, a special case has been made to allow `^`, `\A` and `\G` at the start of a pattern; and to allow `$`, `\z` and `\Z` at the end of a pattern. In such cases, the characters are effectively just ignored.
|
76
|
+
* Subexpression calls, e.g. `/(?<name> ... \g<name>* )/`
|
80
77
|
|
81
78
|
(Note: Backreferences are not really "regular" either, but I got these to work with a bit of hackery!)
|
82
79
|
|
@@ -1,5 +1,7 @@
|
|
1
1
|
module RegexpExamples
|
2
2
|
class BackReferenceReplacer
|
3
|
+
BackrefNotFound = Class.new(StandardError)
|
4
|
+
|
3
5
|
def substitute_backreferences(full_examples)
|
4
6
|
full_examples.map do |full_example|
|
5
7
|
begin
|
@@ -7,7 +9,7 @@ module RegexpExamples
|
|
7
9
|
full_example.sub!(/__(\w+?)__/, find_backref_for(full_example, $1))
|
8
10
|
end
|
9
11
|
full_example
|
10
|
-
rescue
|
12
|
+
rescue BackrefNotFound
|
11
13
|
# For instance, one "full example" from /(a|(b)) \2/: "a __2__"
|
12
14
|
# should be rejected because the backref (\2) does not exist
|
13
15
|
nil
|
@@ -27,7 +29,7 @@ module RegexpExamples
|
|
27
29
|
if octal_chars =~ /\A[01]?[0-7]{1,2}\z/ && octal_chars.to_i >= 10
|
28
30
|
Integer(octal_chars, 8).chr
|
29
31
|
else
|
30
|
-
raise(
|
32
|
+
raise(BackrefNotFound)
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
@@ -35,6 +35,9 @@ module RegexpExamples
|
|
35
35
|
Lower = Array('a'..'z')
|
36
36
|
Upper = Array('A'..'Z')
|
37
37
|
Digit = Array('0'..'9')
|
38
|
+
# Note: Punct should also include the following chars: $ + < = > ^ ` | ~
|
39
|
+
# I.e. Punct = %w(! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \\ ] ^ _ ` { | } ~)
|
40
|
+
# However, due to a ruby bug (!!) these do not work properly at the moment!
|
38
41
|
Punct = %w(! " # % & ' ( ) * , - . / : ; ? @ [ \\ ] _ { })
|
39
42
|
Hex = Array('a'..'f') | Array('A'..'F') | Digit
|
40
43
|
Word = Lower | Upper | Digit | ['_']
|
@@ -81,5 +84,157 @@ module RegexpExamples
|
|
81
84
|
'word' => CharSets::Word,
|
82
85
|
'ascii' => CharSets::Any
|
83
86
|
}.freeze
|
87
|
+
|
88
|
+
def self.ranges_to_unicode(*ranges)
|
89
|
+
result = []
|
90
|
+
ranges.each do |range|
|
91
|
+
if range.is_a? Fixnum # Small hack to improve readability below
|
92
|
+
result << hex_to_unicode(range.to_s(16))
|
93
|
+
else
|
94
|
+
range.each { |num| result << hex_to_unicode(num.to_s(16)) }
|
95
|
+
end
|
96
|
+
end
|
97
|
+
result
|
98
|
+
end
|
99
|
+
|
100
|
+
def self.hex_to_unicode(hex)
|
101
|
+
eval("?\\u{#{hex}}")
|
102
|
+
end
|
103
|
+
|
104
|
+
# These values were generated by: scripts/unicode_lister.rb
|
105
|
+
# Note: Only the first 128 results are listed, for performance.
|
106
|
+
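# Also, some groups have no matches listed: the generator script only scans code points 0..55295, and those groups' characters presumably all lie above that range.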
|
107
|
+
NamedPropertyCharMap = {
|
108
|
+
'Alnum' => ranges_to_unicode(48..57, 65..90, 97..122, 170, 181, 186, 192..214, 216..246, 248..256),
|
109
|
+
'Alpha' => ranges_to_unicode(65..90, 97..122, 170, 181, 186, 192..214, 216..246, 248..266),
|
110
|
+
'Blank' => ranges_to_unicode(9, 32, 160, 5760, 8192..8202, 8239, 8287, 12288),
|
111
|
+
'Cntrl' => ranges_to_unicode(0..31, 127..159),
|
112
|
+
'Digit' => ranges_to_unicode(48..57, 1632..1641, 1776..1785, 1984..1993, 2406..2415, 2534..2543, 2662..2671, 2790..2799, 2918..2927, 3046..3055, 3174..3183, 3302..3311, 3430..3437),
|
113
|
+
'Graph' => ranges_to_unicode(33..126, 161..194),
|
114
|
+
'Lower' => ranges_to_unicode(97..122, 170, 181, 186, 223..246, 248..255, 257, 259, 261, 263, 265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293, 295, 297, 299, 301, 303, 305, 307, 309, 311..312, 314, 316, 318, 320, 322, 324, 326, 328..329, 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351, 353, 355, 357, 359, 361, 363, 365, 367, 369, 371, 373, 375, 378, 380, 382..384, 387),
|
115
|
+
'Print' => ranges_to_unicode(32..126, 160..192),
|
116
|
+
'Punct' => ranges_to_unicode(33..35, 37..42, 44..47, 58..59, 63..64, 91..93, 95, 123, 125, 161, 167, 171, 182..183, 187, 191, 894, 903, 1370..1375, 1417..1418, 1470, 1472, 1475, 1478, 1523..1524, 1545..1546, 1548..1549, 1563, 1566..1567, 1642..1645, 1748, 1792..1805, 2039..2041, 2096..2110, 2142, 2404..2405, 2416, 2800, 3572, 3663, 3674..3675, 3844..3858, 3860, 3898..3901, 3973, 4048..4052, 4057..4058, 4170),
|
117
|
+
'Space' => ranges_to_unicode(9..13, 32, 133, 160, 5760, 8192..8202, 8232..8233, 8239, 8287, 12288),
|
118
|
+
'Upper' => ranges_to_unicode(65..90, 192..214, 216..222, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276, 278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302, 304, 306, 308, 310, 313, 315, 317, 319, 321, 323, 325, 327, 330, 332, 334, 336, 338, 340, 342, 344, 346, 348, 350, 352, 354, 356, 358, 360, 362, 364, 366, 368, 370, 372, 374, 376..377, 379, 381, 385..386, 388, 390..391, 393..395, 398),
|
119
|
+
'XDigit' => ranges_to_unicode(48..57, 65..70, 97..102),
|
120
|
+
'Word' => ranges_to_unicode(48..57, 65..90, 95, 97..122, 170, 181, 186, 192..214, 216..246, 248..255),
|
121
|
+
'ASCII' => ranges_to_unicode(0..127),
|
122
|
+
'Any' => ranges_to_unicode(0..127),
|
123
|
+
'Assigned' => ranges_to_unicode(0..127),
|
124
|
+
'L' => ranges_to_unicode(65..90, 97..122, 170, 181, 186, 192..214, 216..246, 248..266),
|
125
|
+
'Ll' => ranges_to_unicode(97..122, 181, 223..246, 248..255, 257, 259, 261, 263, 265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293, 295, 297, 299, 301, 303, 305, 307, 309, 311..312, 314, 316, 318, 320, 322, 324, 326, 328..329, 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351, 353, 355, 357, 359, 361, 363, 365, 367, 369, 371, 373, 375, 378, 380, 382..384, 387, 389, 392),
|
126
|
+
'Lm' => ranges_to_unicode(688..705, 710..721, 736..740, 748, 750, 884, 890, 1369, 1600, 1765..1766, 2036..2037, 2042, 2074, 2084, 2088, 2417, 3654, 3782, 4348, 6103, 6211, 6823, 7288..7293, 7468..7530, 7544, 7579..7580),
|
127
|
+
'Lo' => ranges_to_unicode(170, 186, 443, 448..451, 660, 1488..1514, 1520..1522, 1568..1599, 1601..1610, 1646..1647, 1649..1694),
|
128
|
+
'Lt' => ranges_to_unicode(453, 456, 459, 498, 8072..8079, 8088..8095, 8104..8111, 8124, 8140, 8188),
|
129
|
+
'Lu' => ranges_to_unicode(65..90, 192..214, 216..222, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276, 278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302, 304, 306, 308, 310, 313, 315, 317, 319, 321, 323, 325, 327, 330, 332, 334, 336, 338, 340, 342, 344, 346, 348, 350, 352, 354, 356, 358, 360, 362, 364, 366, 368, 370, 372, 374, 376..377, 379, 381, 385..386, 388, 390..391, 393..395, 398),
|
130
|
+
'M' => ranges_to_unicode(768..879, 1155..1161, 1425..1433),
|
131
|
+
'Mn' => ranges_to_unicode(768..879, 1155..1159, 1425..1435),
|
132
|
+
'Mc' => ranges_to_unicode(2307, 2363, 2366..2368, 2377..2380, 2382..2383, 2434..2435, 2494..2496, 2503..2504, 2507..2508, 2519, 2563, 2622..2624, 2691, 2750..2752, 2761, 2763..2764, 2818..2819, 2878, 2880, 2887..2888, 2891..2892, 2903, 3006..3007, 3009..3010, 3014..3016, 3018..3020, 3031, 3073..3075, 3137..3140, 3202..3203, 3262, 3264..3268, 3271..3272, 3274..3275, 3285..3286, 3330..3331, 3390..3392, 3398..3400, 3402..3404, 3415, 3458..3459, 3535..3537, 3544..3551, 3570..3571, 3902..3903, 3967, 4139..4140, 4145, 4152, 4155..4156, 4182..4183, 4194..4196, 4199..4205, 4227..4228, 4231..4235),
|
133
|
+
'Me' => ranges_to_unicode(1160..1161, 6846, 8413..8416, 8418..8420, 42608..42610),
|
134
|
+
'N' => ranges_to_unicode(48..57, 178..179, 185, 188..190, 1632..1641, 1776..1785, 1984..1993, 2406..2415, 2534..2543, 2548..2553, 2662..2671, 2790..2799, 2918..2927, 2930..2935, 3046..3058, 3174..3180),
|
135
|
+
'Nd' => ranges_to_unicode(48..57, 1632..1641, 1776..1785, 1984..1993, 2406..2415, 2534..2543, 2662..2671, 2790..2799, 2918..2927, 3046..3055, 3174..3183, 3302..3311, 3430..3437),
|
136
|
+
'Nl' => ranges_to_unicode(5870..5872, 8544..8578, 8581..8584, 12295, 12321..12329, 12344..12346, 42726..42735),
|
137
|
+
'No' => ranges_to_unicode(178..179, 185, 188..190, 2548..2553, 2930..2935, 3056..3058, 3192..3198, 3440..3445, 3882..3891, 4969..4988, 6128..6137, 6618, 8304, 8308..8313, 8320..8329, 8528..8543, 8585, 9312..9330),
|
138
|
+
'P' => ranges_to_unicode(33..35, 37..42, 44..47, 58..59, 63..64, 91..93, 95, 123, 125, 161, 167, 171, 182..183, 187, 191, 894, 903, 1370..1375, 1417..1418, 1470, 1472, 1475, 1478, 1523..1524, 1545..1546, 1548..1549, 1563, 1566..1567, 1642..1645, 1748, 1792..1805, 2039..2041, 2096..2110, 2142, 2404..2405, 2416, 2800, 3572, 3663, 3674..3675, 3844..3858, 3860, 3898..3901, 3973, 4048..4052, 4057..4058, 4170),
|
139
|
+
'Pc' => ranges_to_unicode(95, 8255..8256, 8276),
|
140
|
+
'Pd' => ranges_to_unicode(45, 1418, 1470, 5120, 6150, 8208..8213, 11799, 11802, 11834..11835, 11840, 12316, 12336, 12448),
|
141
|
+
'Ps' => ranges_to_unicode(40, 91, 123, 3898, 3900, 5787, 8218, 8222, 8261, 8317, 8333, 8968, 8970, 9001, 10088, 10090, 10092, 10094, 10096, 10098, 10100, 10181, 10214, 10216, 10218, 10220, 10222, 10627, 10629, 10631, 10633, 10635, 10637, 10639, 10641, 10643, 10645, 10647, 10712, 10714, 10748, 11810, 11812, 11814, 11816, 11842, 12296, 12298, 12300, 12302, 12304, 12308, 12310, 12312, 12314, 12317),
|
142
|
+
'Pe' => ranges_to_unicode(41, 93, 125, 3899, 3901, 5788, 8262, 8318, 8334, 8969, 8971, 9002, 10089, 10091, 10093, 10095, 10097, 10099, 10101, 10182, 10215, 10217, 10219, 10221, 10223, 10628, 10630, 10632, 10634, 10636, 10638, 10640, 10642, 10644, 10646, 10648, 10713, 10715, 10749, 11811, 11813, 11815, 11817, 12297, 12299, 12301, 12303, 12305, 12309, 12311, 12313, 12315, 12318..12319),
|
143
|
+
'Pi' => ranges_to_unicode(171, 8216, 8219..8220, 8223, 8249, 11778, 11780, 11785, 11788, 11804, 11808),
|
144
|
+
'Pf' => ranges_to_unicode(187, 8217, 8221, 8250, 11779, 11781, 11786, 11789, 11805, 11809),
|
145
|
+
'Po' => ranges_to_unicode(33..35, 37..39, 42, 44, 46..47, 58..59, 63..64, 92, 161, 167, 182..183, 191, 894, 903, 1370..1375, 1417, 1472, 1475, 1478, 1523..1524, 1545..1546, 1548..1549, 1563, 1566..1567, 1642..1645, 1748, 1792..1805, 2039..2041, 2096..2110, 2142, 2404..2405, 2416, 2800, 3572, 3663, 3674..3675, 3844..3858, 3860, 3973, 4048..4052, 4057..4058, 4170..4175, 4347, 4960..4968, 5741),
|
146
|
+
'S' => ranges_to_unicode(36, 43, 60..62, 94, 96, 124, 126, 162..166, 168..169, 172, 174..177, 180, 184, 215, 247, 706..709, 722..735, 741..747, 749, 751..767, 885, 900..901, 1014, 1154, 1421..1423, 1542..1544, 1547, 1550..1551, 1758, 1769, 1789..1790, 2038, 2546..2547, 2554..2555, 2801, 2928, 3059..3066, 3199, 3449, 3647, 3841..3843, 3859, 3861..3863, 3866..3871, 3892, 3894, 3896, 4030..4037),
|
147
|
+
'Sm' => ranges_to_unicode(43, 60..62, 124, 126, 172, 177, 215, 247, 1014, 1542..1544, 8260, 8274, 8314..8316, 8330..8332, 8472, 8512..8516, 8523, 8592..8596, 8602..8603, 8608, 8611, 8614, 8622, 8654..8655, 8658, 8660, 8692..8775),
|
148
|
+
'Sc' => ranges_to_unicode(36, 162..165, 1423, 1547, 2546..2547, 2555, 2801, 3065, 3647, 6107, 8352..8381, 43064),
|
149
|
+
'Sk' => ranges_to_unicode(94, 96, 168, 175, 180, 184, 706..709, 722..735, 741..747, 749, 751..767, 885, 900..901, 8125, 8127..8129, 8141..8143, 8157..8159, 8173..8175, 8189..8190, 12443..12444, 42752..42774, 42784..42785, 42889..42890, 43867),
|
150
|
+
'So' => ranges_to_unicode(166, 169, 174, 176, 1154, 1421..1422, 1550..1551, 1758, 1769, 1789..1790, 2038, 2554, 2928, 3059..3064, 3066, 3199, 3449, 3841..3843, 3859, 3861..3863, 3866..3871, 3892, 3894, 3896, 4030..4037, 4039..4044, 4046..4047, 4053..4056, 4254..4255, 5008..5017, 6464, 6622..6655, 7009..7018, 7028..7036, 8448),
|
151
|
+
'Z' => ranges_to_unicode(32, 160, 5760, 8192..8202, 8232..8233, 8239, 8287, 12288),
|
152
|
+
'Zs' => ranges_to_unicode(32, 160, 5760, 8192..8202, 8239, 8287, 12288),
|
153
|
+
'Zl' => ranges_to_unicode(8232),
|
154
|
+
'Zp' => ranges_to_unicode(8233),
|
155
|
+
'C' => ranges_to_unicode(0..31, 127..159, 173, 888..889, 896..899, 907, 909, 930, 1328, 1367..1368, 1376, 1416, 1419..1420, 1424, 1480..1487, 1515..1519, 1525..1541, 1564..1565, 1757, 1806..1807, 1867..1868, 1970..1977),
|
156
|
+
'Cc' => ranges_to_unicode(0..31, 127..159),
|
157
|
+
'Cf' => ranges_to_unicode(173, 1536..1541, 1564, 1757, 1807, 6158, 8203..8207, 8234..8238, 8288..8292, 8294..8303),
|
158
|
+
'Cn' => ranges_to_unicode(888..889, 896..899, 907, 909, 930, 1328, 1367..1368, 1376, 1416, 1419..1420, 1424, 1480..1487, 1515..1519, 1525..1535, 1565, 1806, 1867..1868, 1970..1983, 2043..2047, 2094..2095, 2111, 2140..2141, 2143..2201),
|
159
|
+
'Co' => ranges_to_unicode(),
|
160
|
+
'Cs' => ranges_to_unicode(),
|
161
|
+
'Arabic' => ranges_to_unicode(1536..1540, 1542..1547, 1549..1562, 1566, 1568..1599, 1601..1610, 1622..1631, 1642..1647, 1649..1692),
|
162
|
+
'Armenian' => ranges_to_unicode(1329..1366, 1369..1375, 1377..1415, 1418, 1421..1423),
|
163
|
+
'Balinese' => ranges_to_unicode(6912..6987, 6992..7036),
|
164
|
+
'Bengali' => ranges_to_unicode(2432..2435, 2437..2444, 2447..2448, 2451..2472, 2474..2480, 2482, 2486..2489, 2492..2500, 2503..2504, 2507..2510, 2519, 2524..2525, 2527..2531, 2534..2555),
|
165
|
+
'Bopomofo' => ranges_to_unicode(746..747, 12549..12589, 12704..12730),
|
166
|
+
'Braille' => ranges_to_unicode(10240..10367),
|
167
|
+
'Buginese' => ranges_to_unicode(6656..6683, 6686..6687),
|
168
|
+
'Buhid' => ranges_to_unicode(5952..5971),
|
169
|
+
'Canadian_Aboriginal' => ranges_to_unicode(5120..5247),
|
170
|
+
'Carian' => ranges_to_unicode(),
|
171
|
+
'Cham' => ranges_to_unicode(43520..43574, 43584..43597, 43600..43609, 43612..43615),
|
172
|
+
'Cherokee' => ranges_to_unicode(5024..5108),
|
173
|
+
'Common' => ranges_to_unicode(0..64, 91..96, 123..169, 171..180),
|
174
|
+
'Coptic' => ranges_to_unicode(994..1007, 11392..11505),
|
175
|
+
'Cuneiform' => ranges_to_unicode(),
|
176
|
+
'Cypriot' => ranges_to_unicode(),
|
177
|
+
'Cyrillic' => ranges_to_unicode(1024..1151),
|
178
|
+
'Deseret' => ranges_to_unicode(),
|
179
|
+
'Devanagari' => ranges_to_unicode(2304..2384, 2387..2403, 2406..2431, 43232..43235),
|
180
|
+
'Ethiopic' => ranges_to_unicode(4608..4680, 4682..4685, 4688..4694, 4696, 4698..4701, 4704..4742),
|
181
|
+
'Georgian' => ranges_to_unicode(4256..4293, 4295, 4301, 4304..4346, 4348..4351, 11520..11557, 11559, 11565),
|
182
|
+
'Glagolitic' => ranges_to_unicode(11264..11310, 11312..11358),
|
183
|
+
'Gothic' => ranges_to_unicode(),
|
184
|
+
'Greek' => ranges_to_unicode(880..883, 885..887, 890..893, 895, 900, 902, 904..906, 908, 910..929, 931..993, 1008..1023, 7462..7466, 7517..7521, 7526),
|
185
|
+
'Gujarati' => ranges_to_unicode(2689..2691, 2693..2701, 2703..2705, 2707..2728, 2730..2736, 2738..2739, 2741..2745, 2748..2757, 2759..2761, 2763..2765, 2768, 2784..2787, 2790..2801),
|
186
|
+
'Gurmukhi' => ranges_to_unicode(2561..2563, 2565..2570, 2575..2576, 2579..2600, 2602..2608, 2610..2611, 2613..2614, 2616..2617, 2620, 2622..2626, 2631..2632, 2635..2637, 2641, 2649..2652, 2654, 2662..2677),
|
187
|
+
'Han' => ranges_to_unicode(11904..11929, 11931..12019, 12032..12044),
|
188
|
+
'Hangul' => ranges_to_unicode(4352..4479),
|
189
|
+
'Hanunoo' => ranges_to_unicode(5920..5940),
|
190
|
+
'Hebrew' => ranges_to_unicode(1425..1479, 1488..1514, 1520..1524),
|
191
|
+
'Hiragana' => ranges_to_unicode(12353..12438, 12445..12447),
|
192
|
+
'Inherited' => ranges_to_unicode(768..879, 1157..1158, 1611..1621, 1648, 2385..2386),
|
193
|
+
'Kannada' => ranges_to_unicode(3201..3203, 3205..3212, 3214..3216, 3218..3240, 3242..3251, 3253..3257, 3260..3268, 3270..3272, 3274..3277, 3285..3286, 3294, 3296..3299, 3302..3311, 3313..3314),
|
194
|
+
'Katakana' => ranges_to_unicode(12449..12538, 12541..12543, 12784..12799, 13008..13026),
|
195
|
+
'Kayah_Li' => ranges_to_unicode(43264..43309, 43311),
|
196
|
+
'Kharoshthi' => ranges_to_unicode(),
|
197
|
+
'Khmer' => ranges_to_unicode(6016..6109, 6112..6121, 6128..6137, 6624..6637),
|
198
|
+
'Lao' => ranges_to_unicode(3713..3714, 3716, 3719..3720, 3722, 3725, 3732..3735, 3737..3743, 3745..3747, 3749, 3751, 3754..3755, 3757..3769, 3771..3773, 3776..3780, 3782, 3784..3789, 3792..3801, 3804..3807),
|
199
|
+
'Latin' => ranges_to_unicode(65..90, 97..122, 170, 186, 192..214, 216..246, 248..267),
|
200
|
+
'Lepcha' => ranges_to_unicode(7168..7223, 7227..7241, 7245..7247),
|
201
|
+
'Limbu' => ranges_to_unicode(6400..6430, 6432..6443, 6448..6459, 6464, 6468..6479),
|
202
|
+
'Linear_B' => ranges_to_unicode(),
|
203
|
+
'Lycian' => ranges_to_unicode(),
|
204
|
+
'Lydian' => ranges_to_unicode(),
|
205
|
+
'Malayalam' => ranges_to_unicode(3329..3331, 3333..3340, 3342..3344, 3346..3386, 3389..3396, 3398..3400, 3402..3406, 3415, 3424..3427, 3430..3445, 3449..3455),
|
206
|
+
'Mongolian' => ranges_to_unicode(6144..6145, 6148, 6150..6158, 6160..6169, 6176..6263, 6272..6289),
|
207
|
+
'Myanmar' => ranges_to_unicode(4096..4223),
|
208
|
+
'New_Tai_Lue' => ranges_to_unicode(6528..6571, 6576..6601, 6608..6618, 6622..6623),
|
209
|
+
'Nko' => ranges_to_unicode(1984..2042),
|
210
|
+
'Ogham' => ranges_to_unicode(5760..5788),
|
211
|
+
'Ol_Chiki' => ranges_to_unicode(7248..7295),
|
212
|
+
'Old_Italic' => ranges_to_unicode(),
|
213
|
+
'Old_Persian' => ranges_to_unicode(),
|
214
|
+
'Oriya' => ranges_to_unicode(2817..2819, 2821..2828, 2831..2832, 2835..2856, 2858..2864, 2866..2867, 2869..2873, 2876..2884, 2887..2888, 2891..2893, 2902..2903, 2908..2909, 2911..2915, 2918..2935),
|
215
|
+
'Osmanya' => ranges_to_unicode(),
|
216
|
+
'Phags_Pa' => ranges_to_unicode(43072..43127),
|
217
|
+
'Phoenician' => ranges_to_unicode(),
|
218
|
+
'Rejang' => ranges_to_unicode(43312..43347, 43359),
|
219
|
+
'Runic' => ranges_to_unicode(5792..5866, 5870..5880),
|
220
|
+
'Saurashtra' => ranges_to_unicode(43136..43204, 43214..43225),
|
221
|
+
'Shavian' => ranges_to_unicode(),
|
222
|
+
'Sinhala' => ranges_to_unicode(3458..3459, 3461..3478, 3482..3505, 3507..3515, 3517, 3520..3526, 3530, 3535..3540, 3542, 3544..3551, 3558..3567, 3570..3572),
|
223
|
+
'Sundanese' => ranges_to_unicode(7040..7103, 7360..7367),
|
224
|
+
'Syloti_Nagri' => ranges_to_unicode(43008..43051),
|
225
|
+
'Syriac' => ranges_to_unicode(1792..1805, 1807..1866, 1869..1871),
|
226
|
+
'Tagalog' => ranges_to_unicode(5888..5900, 5902..5908),
|
227
|
+
'Tagbanwa' => ranges_to_unicode(5984..5996, 5998..6000, 6002..6003),
|
228
|
+
'Tai_Le' => ranges_to_unicode(6480..6509, 6512..6516),
|
229
|
+
'Tamil' => ranges_to_unicode(2946..2947, 2949..2954, 2958..2960, 2962..2965, 2969..2970, 2972, 2974..2975, 2979..2980, 2984..2986, 2990..3001, 3006..3010, 3014..3016, 3018..3021, 3024, 3031, 3046..3066),
|
230
|
+
'Telugu' => ranges_to_unicode(3072..3075, 3077..3084, 3086..3088, 3090..3112, 3114..3129, 3133..3140, 3142..3144, 3146..3149, 3157..3158, 3160..3161, 3168..3171, 3174..3183, 3192..3199),
|
231
|
+
'Thaana' => ranges_to_unicode(1920..1969),
|
232
|
+
'Thai' => ranges_to_unicode(3585..3642, 3648..3675),
|
233
|
+
'Tibetan' => ranges_to_unicode(3840..3911, 3913..3948, 3953..3972),
|
234
|
+
'Tifinagh' => ranges_to_unicode(11568..11623, 11631..11632, 11647),
|
235
|
+
'Ugaritic' => ranges_to_unicode(),
|
236
|
+
'Vai' => ranges_to_unicode(42240..42367),
|
237
|
+
'Yi' => ranges_to_unicode(40960..41087),
|
238
|
+
}.freeze
|
84
239
|
end
|
85
240
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
module RegexpExamples
|
2
|
+
IllegalSyntaxError = Class.new(StandardError)
|
2
3
|
class Parser
|
3
4
|
attr_reader :regexp_string
|
4
5
|
def initialize(regexp_string, regexp_options, config_options={})
|
@@ -85,8 +86,6 @@ module RegexpExamples
|
|
85
86
|
group = parse_backreference_group($1)
|
86
87
|
when BackslashCharMap.keys.include?(next_char)
|
87
88
|
group = CharGroup.new(
|
88
|
-
# Note: The `.dup` is important, as it prevents modifying the constant, in
|
89
|
-
# CharGroup#init_ranges (where the '-' is moved to the front)
|
90
89
|
BackslashCharMap[next_char].dup,
|
91
90
|
@ignorecase
|
92
91
|
)
|
@@ -100,16 +99,22 @@ module RegexpExamples
|
|
100
99
|
@current_position += $1.length
|
101
100
|
sequence = $1.match(/\h{1,4}/)[0] # Strip off "{" and "}"
|
102
101
|
group = parse_single_char_group( parse_unicode_sequence(sequence) )
|
103
|
-
when rest_of_string =~ /\Ap\{([^}]+)\}/ # Named properties
|
104
|
-
@current_position += ($1.length + 2)
|
105
|
-
|
102
|
+
when rest_of_string =~ /\Ap\{(\^?)([^}]+)\}/ # Named properties
|
103
|
+
@current_position += ($1.length + $2.length + 2)
|
104
|
+
group = CharGroup.new(
|
105
|
+
if($1 == "^")
|
106
|
+
CharSets::Any.dup - NamedPropertyCharMap[$2]
|
107
|
+
else
|
108
|
+
NamedPropertyCharMap[$2]
|
109
|
+
end,
|
110
|
+
@ignorecase
|
111
|
+
)
|
106
112
|
when next_char == 'K' # Keep (special lookbehind that CAN be supported safely!)
|
107
113
|
group = PlaceHolderGroup.new
|
108
114
|
when next_char == 'R' # Linebreak
|
109
115
|
group = CharGroup.new(["\r\n", "\n", "\v", "\f", "\r"], @ignorecase) # A bit hacky...
|
110
116
|
when next_char == 'g' # Subexpression call
|
111
|
-
|
112
|
-
raise UnsupportedSyntaxError, "Subexpression calls (\g) are not yet supported"
|
117
|
+
raise IllegalSyntaxError, "Subexpression calls (\g) are not yet supported"
|
113
118
|
when next_char =~ /[bB]/ # Anchors
|
114
119
|
raise IllegalSyntaxError, "Anchors ('\\#{next_char}') cannot be supported, as they are not regular"
|
115
120
|
when next_char =~ /[AG]/ # Start of string
|
@@ -0,0 +1,180 @@
|
|
1
|
+
# A script to generate lists of all unicode characters
|
2
|
+
# that match all named group/character properties regexps.
|
3
|
+
# For use in e.g. /\p{Arabic}/.examples
|
4
|
+
|
5
|
+
# To (re-)generate this list, simply run this file!
|
6
|
+
# > ruby scripts/unicode_lister.rb
|
7
|
+
OutputFilename = 'unicode_result'
|
8
|
+
|
9
|
+
# Taken from ruby documentation:
|
10
|
+
# http://ruby-doc.org/core-2.2.0/Regexp.html#class-Regexp-label-Character+Properties
|
11
|
+
NamedGroups = %w(
|
12
|
+
Alnum
|
13
|
+
Alpha
|
14
|
+
Blank
|
15
|
+
Cntrl
|
16
|
+
Digit
|
17
|
+
Graph
|
18
|
+
Lower
|
19
|
+
Print
|
20
|
+
Punct
|
21
|
+
Space
|
22
|
+
Upper
|
23
|
+
XDigit
|
24
|
+
Word
|
25
|
+
ASCII
|
26
|
+
Any
|
27
|
+
Assigned
|
28
|
+
|
29
|
+
L
|
30
|
+
Ll
|
31
|
+
Lm
|
32
|
+
Lo
|
33
|
+
Lt
|
34
|
+
Lu
|
35
|
+
M
|
36
|
+
Mn
|
37
|
+
Mc
|
38
|
+
Me
|
39
|
+
N
|
40
|
+
Nd
|
41
|
+
Nl
|
42
|
+
No
|
43
|
+
P
|
44
|
+
Pc
|
45
|
+
Pd
|
46
|
+
Ps
|
47
|
+
Pe
|
48
|
+
Pi
|
49
|
+
Pf
|
50
|
+
Po
|
51
|
+
S
|
52
|
+
Sm
|
53
|
+
Sc
|
54
|
+
Sk
|
55
|
+
So
|
56
|
+
Z
|
57
|
+
Zs
|
58
|
+
Zl
|
59
|
+
Zp
|
60
|
+
C
|
61
|
+
Cc
|
62
|
+
Cf
|
63
|
+
Cn
|
64
|
+
Co
|
65
|
+
Cs
|
66
|
+
|
67
|
+
Arabic
|
68
|
+
Armenian
|
69
|
+
Balinese
|
70
|
+
Bengali
|
71
|
+
Bopomofo
|
72
|
+
Braille
|
73
|
+
Buginese
|
74
|
+
Buhid
|
75
|
+
Canadian_Aboriginal
|
76
|
+
Carian
|
77
|
+
Cham
|
78
|
+
Cherokee
|
79
|
+
Common
|
80
|
+
Coptic
|
81
|
+
Cuneiform
|
82
|
+
Cypriot
|
83
|
+
Cyrillic
|
84
|
+
Deseret
|
85
|
+
Devanagari
|
86
|
+
Ethiopic
|
87
|
+
Georgian
|
88
|
+
Glagolitic
|
89
|
+
Gothic
|
90
|
+
Greek
|
91
|
+
Gujarati
|
92
|
+
Gurmukhi
|
93
|
+
Han
|
94
|
+
Hangul
|
95
|
+
Hanunoo
|
96
|
+
Hebrew
|
97
|
+
Hiragana
|
98
|
+
Inherited
|
99
|
+
Kannada
|
100
|
+
Katakana
|
101
|
+
Kayah_Li
|
102
|
+
Kharoshthi
|
103
|
+
Khmer
|
104
|
+
Lao
|
105
|
+
Latin
|
106
|
+
Lepcha
|
107
|
+
Limbu
|
108
|
+
Linear_B
|
109
|
+
Lycian
|
110
|
+
Lydian
|
111
|
+
Malayalam
|
112
|
+
Mongolian
|
113
|
+
Myanmar
|
114
|
+
New_Tai_Lue
|
115
|
+
Nko
|
116
|
+
Ogham
|
117
|
+
Ol_Chiki
|
118
|
+
Old_Italic
|
119
|
+
Old_Persian
|
120
|
+
Oriya
|
121
|
+
Osmanya
|
122
|
+
Phags_Pa
|
123
|
+
Phoenician
|
124
|
+
Rejang
|
125
|
+
Runic
|
126
|
+
Saurashtra
|
127
|
+
Shavian
|
128
|
+
Sinhala
|
129
|
+
Sundanese
|
130
|
+
Syloti_Nagri
|
131
|
+
Syriac
|
132
|
+
Tagalog
|
133
|
+
Tagbanwa
|
134
|
+
Tai_Le
|
135
|
+
Tamil
|
136
|
+
Telugu
|
137
|
+
Thaana
|
138
|
+
Thai
|
139
|
+
Tibetan
|
140
|
+
Tifinagh
|
141
|
+
Ugaritic
|
142
|
+
Vai
|
143
|
+
Yi
|
144
|
+
)
|
145
|
+
|
146
|
+
# Note: For some reason, a character encoding-related exception gets raised
|
147
|
+
# when I do `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")` in the range: 55296..57343
|
148
|
+
# This means my calculation is MISSING results in the range: 55296..65535
|
149
|
+
# However, for the sake of performance, I'm also being "lazy" and only calculating/saving
|
150
|
+
# the first 128 matches anyway!
|
151
|
+
# If anyone ever cares about this (I doubt it), I'll look into fixing/improving it.
|
152
|
+
|
153
|
+
# Example input: [1, 2, 3, 4, 6, 7, 12, 14] (Array)
|
154
|
+
# Example output: "1..4, 6..7, 12, 14" (String)
|
155
|
+
def calculate_ranges(matching_codes)
|
156
|
+
return "" if matching_codes.empty?
|
157
|
+
first = matching_codes.shift
|
158
|
+
matching_codes.inject([first..first]) do |r,x|
|
159
|
+
if r.last.last.succ != x
|
160
|
+
r << (x..x) # Start new range
|
161
|
+
else
|
162
|
+
r[0..-2] << (r.last.first..x) # Update last range
|
163
|
+
end
|
164
|
+
end
|
165
|
+
.map { |range| range.size == 1 ? range.first : range}
|
166
|
+
.join(", ")
|
167
|
+
end
|
168
|
+
|
169
|
+
count = 0
|
170
|
+
File.open(OutputFilename, 'w') do |f|
|
171
|
+
NamedGroups.each do |name|
|
172
|
+
count += 1
|
173
|
+
matching_codes = (0..55295).lazy.select { |x| /\p{#{name}}/ =~ eval("?\\u{#{x.to_s(16)}}") }.first(128)
|
174
|
+
f.puts "'#{name}' => ranges_to_unicode(#{calculate_ranges(matching_codes)}),"
|
175
|
+
puts "(#{count}/#{NamedGroups.length}) Finished property: #{name}"
|
176
|
+
end
|
177
|
+
puts "*"*50
|
178
|
+
puts "Finished! Result stored in: #{OutputFilename}"
|
179
|
+
end
|
180
|
+
|
@@ -1,14 +1,9 @@
|
|
1
1
|
RSpec.describe Regexp, "#examples" do
|
2
2
|
def self.examples_exist_and_match(*regexps)
|
3
3
|
regexps.each do |regexp|
|
4
|
-
it do
|
5
|
-
|
6
|
-
|
7
|
-
rescue
|
8
|
-
# TODO: Find a nicer way to display this?
|
9
|
-
puts "Error generating examples for /#{regexp.source}/"
|
10
|
-
raise $!
|
11
|
-
end
|
4
|
+
it "examples for /#{regexp.source}/" do
|
5
|
+
regexp_examples = regexp.examples(max_group_results: 999)
|
6
|
+
|
12
7
|
expect(regexp_examples).not_to be_empty, "No examples were generated for regexp: /#{regexp.source}/"
|
13
8
|
regexp_examples.each { |example| expect(example).to match(/\A(?:#{regexp.source})\z/) }
|
14
9
|
# Note: /\A...\z/ is used to prevent misleading examples from passing the test.
|
@@ -21,24 +16,16 @@ RSpec.describe Regexp, "#examples" do
|
|
21
16
|
|
22
17
|
def self.examples_raise_illegal_syntax_error(*regexps)
|
23
18
|
regexps.each do |regexp|
|
24
|
-
it do
|
19
|
+
it "examples for /#{regexp.source}/" do
|
25
20
|
expect{regexp.examples}.to raise_error RegexpExamples::IllegalSyntaxError
|
26
21
|
end
|
27
22
|
end
|
28
23
|
end
|
29
24
|
|
30
|
-
def self.examples_raise_unsupported_syntax_error(*regexps)
|
31
|
-
regexps.each do |regexp|
|
32
|
-
it do
|
33
|
-
expect{regexp.examples}.to raise_error RegexpExamples::UnsupportedSyntaxError
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
25
|
def self.examples_are_empty(*regexps)
|
39
26
|
regexps.each do |regexp|
|
40
|
-
it do
|
41
|
-
expect(regexp.examples).to be_empty
|
27
|
+
it "examples for /#{regexp.source}/" do
|
28
|
+
expect(regexp.examples).to be_empty
|
42
29
|
end
|
43
30
|
end
|
44
31
|
end
|
@@ -167,7 +154,8 @@ RSpec.describe Regexp, "#examples" do
|
|
167
154
|
/start-of^-line/,
|
168
155
|
/end-of\Z-string/,
|
169
156
|
/end-of\z-string/,
|
170
|
-
/end-of$-line
|
157
|
+
/end-of$-line/,
|
158
|
+
/(?<name> ... \g<name>*)/
|
171
159
|
)
|
172
160
|
end
|
173
161
|
|
@@ -182,13 +170,13 @@ RSpec.describe Regexp, "#examples" do
|
|
182
170
|
)
|
183
171
|
end
|
184
172
|
|
185
|
-
context "for
|
186
|
-
|
173
|
+
context "for named properties" do
|
174
|
+
examples_exist_and_match(
|
187
175
|
/\p{L}/,
|
188
176
|
/\p{Arabic}/,
|
189
|
-
/\p{^Ll}
|
190
|
-
/(?<name> ... \g<name>*)/
|
177
|
+
/\p{^Ll}/
|
191
178
|
)
|
179
|
+
|
192
180
|
end
|
193
181
|
|
194
182
|
context "for control characters" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: regexp-examples
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tom Lord
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-02-
|
11
|
+
date: 2015-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -58,13 +58,13 @@ files:
|
|
58
58
|
- lib/regexp-examples/chargroup_parser.rb
|
59
59
|
- lib/regexp-examples/constants.rb
|
60
60
|
- lib/regexp-examples/core_extensions/regexp/examples.rb
|
61
|
-
- lib/regexp-examples/exceptions.rb
|
62
61
|
- lib/regexp-examples/groups.rb
|
63
62
|
- lib/regexp-examples/helpers.rb
|
64
63
|
- lib/regexp-examples/parser.rb
|
65
64
|
- lib/regexp-examples/repeaters.rb
|
66
65
|
- lib/regexp-examples/version.rb
|
67
66
|
- regexp-examples.gemspec
|
67
|
+
- scripts/unicode_lister.rb
|
68
68
|
- spec/regexp-examples_spec.rb
|
69
69
|
- spec/spec_helper.rb
|
70
70
|
homepage: http://rubygems.org/gems/regexp-examples
|