regexp_parser 2.1.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +94 -6
- data/Gemfile +2 -1
- data/LICENSE +1 -1
- data/README.md +40 -30
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +1 -1
- data/lib/regexp_parser/expression/base.rb +75 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +1 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -2
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +2 -2
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +1 -3
- data/lib/regexp_parser/expression/classes/group.rb +6 -6
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/root.rb +3 -6
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -2
- data/lib/regexp_parser/expression/methods/construct.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +1 -1
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +10 -1
- data/lib/regexp_parser/expression/quantifier.rb +41 -23
- data/lib/regexp_parser/expression/sequence.rb +9 -24
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
- data/lib/regexp_parser/expression/shared.rb +85 -0
- data/lib/regexp_parser/expression/subexpression.rb +11 -8
- data/lib/regexp_parser/expression.rb +10 -132
- data/lib/regexp_parser/lexer.rb +8 -6
- data/lib/regexp_parser/parser.rb +21 -72
- data/lib/regexp_parser/scanner/properties/long.csv +622 -0
- data/lib/regexp_parser/scanner/properties/short.csv +246 -0
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +48 -35
- data/lib/regexp_parser/scanner.rb +735 -801
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +91 -66
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +31 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +717 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +20 -29
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +1 -1
- data/lib/regexp_parser/syntax.rb +1 -1
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +20 -22
- metadata +37 -166
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -104
- data/spec/expression/clone_spec.rb +0 -152
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -108
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/delimiters_spec.rb +0 -68
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -64
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -60
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/options_spec.rb +0 -28
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -68
- data/spec/parser/refcalls_spec.rb +0 -117
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/delimiters_spec.rb +0 -52
- data/spec/scanner/errors_spec.rb +0 -67
- data/spec/scanner/escapes_spec.rb +0 -64
- data/spec/scanner/free_space_spec.rb +0 -165
- data/spec/scanner/groups_spec.rb +0 -61
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -39
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/options_spec.rb +0 -36
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -25
- data/spec/scanner/refcalls_spec.rb +0 -55
- data/spec/scanner/sets_spec.rb +0 -151
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -16
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT
|
|
2
|
+
adlm,adlam
|
|
3
|
+
aghb,caucasian_albanian
|
|
4
|
+
ahex,ascii_hex_digit
|
|
5
|
+
arab,arabic
|
|
6
|
+
armi,imperial_aramaic
|
|
7
|
+
armn,armenian
|
|
8
|
+
avst,avestan
|
|
9
|
+
bali,balinese
|
|
10
|
+
bamu,bamum
|
|
11
|
+
bass,bassa_vah
|
|
12
|
+
batk,batak
|
|
13
|
+
beng,bengali
|
|
14
|
+
bhks,bhaiksuki
|
|
15
|
+
bidic,bidi_control
|
|
16
|
+
bopo,bopomofo
|
|
17
|
+
brah,brahmi
|
|
18
|
+
brai,braille
|
|
19
|
+
bugi,buginese
|
|
20
|
+
buhd,buhid
|
|
21
|
+
c,other
|
|
22
|
+
cakm,chakma
|
|
23
|
+
cans,canadian_aboriginal
|
|
24
|
+
cari,carian
|
|
25
|
+
cc,control
|
|
26
|
+
cf,format
|
|
27
|
+
cher,cherokee
|
|
28
|
+
chrs,chorasmian
|
|
29
|
+
ci,case_ignorable
|
|
30
|
+
cn,unassigned
|
|
31
|
+
co,private_use
|
|
32
|
+
combiningmark,mark
|
|
33
|
+
copt,coptic
|
|
34
|
+
cpmn,cypro_minoan
|
|
35
|
+
cprt,cypriot
|
|
36
|
+
cs,surrogate
|
|
37
|
+
cwcf,changes_when_casefolded
|
|
38
|
+
cwcm,changes_when_casemapped
|
|
39
|
+
cwl,changes_when_lowercased
|
|
40
|
+
cwt,changes_when_titlecased
|
|
41
|
+
cwu,changes_when_uppercased
|
|
42
|
+
cyrl,cyrillic
|
|
43
|
+
dep,deprecated
|
|
44
|
+
deva,devanagari
|
|
45
|
+
di,default_ignorable_code_point
|
|
46
|
+
dia,diacritic
|
|
47
|
+
diak,dives_akuru
|
|
48
|
+
dogr,dogra
|
|
49
|
+
dsrt,deseret
|
|
50
|
+
dupl,duployan
|
|
51
|
+
ebase,emoji_modifier_base
|
|
52
|
+
ecomp,emoji_component
|
|
53
|
+
egyp,egyptian_hieroglyphs
|
|
54
|
+
elba,elbasan
|
|
55
|
+
elym,elymaic
|
|
56
|
+
emod,emoji_modifier
|
|
57
|
+
epres,emoji_presentation
|
|
58
|
+
ethi,ethiopic
|
|
59
|
+
ext,extender
|
|
60
|
+
geor,georgian
|
|
61
|
+
glag,glagolitic
|
|
62
|
+
gong,gunjala_gondi
|
|
63
|
+
gonm,masaram_gondi
|
|
64
|
+
goth,gothic
|
|
65
|
+
gran,grantha
|
|
66
|
+
grbase,grapheme_base
|
|
67
|
+
grek,greek
|
|
68
|
+
grext,grapheme_extend
|
|
69
|
+
grlink,grapheme_link
|
|
70
|
+
gujr,gujarati
|
|
71
|
+
guru,gurmukhi
|
|
72
|
+
hang,hangul
|
|
73
|
+
hani,han
|
|
74
|
+
hano,hanunoo
|
|
75
|
+
hatr,hatran
|
|
76
|
+
hebr,hebrew
|
|
77
|
+
hex,hex_digit
|
|
78
|
+
hira,hiragana
|
|
79
|
+
hluw,anatolian_hieroglyphs
|
|
80
|
+
hmng,pahawh_hmong
|
|
81
|
+
hmnp,nyiakeng_puachue_hmong
|
|
82
|
+
hung,old_hungarian
|
|
83
|
+
idc,id_continue
|
|
84
|
+
ideo,ideographic
|
|
85
|
+
ids,id_start
|
|
86
|
+
idsb,ids_binary_operator
|
|
87
|
+
idst,ids_trinary_operator
|
|
88
|
+
ital,old_italic
|
|
89
|
+
java,javanese
|
|
90
|
+
joinc,join_control
|
|
91
|
+
kali,kayah_li
|
|
92
|
+
kana,katakana
|
|
93
|
+
khar,kharoshthi
|
|
94
|
+
khmr,khmer
|
|
95
|
+
khoj,khojki
|
|
96
|
+
kits,khitan_small_script
|
|
97
|
+
knda,kannada
|
|
98
|
+
kthi,kaithi
|
|
99
|
+
l,letter
|
|
100
|
+
lana,tai_tham
|
|
101
|
+
laoo,lao
|
|
102
|
+
latn,latin
|
|
103
|
+
lc,cased_letter
|
|
104
|
+
lepc,lepcha
|
|
105
|
+
limb,limbu
|
|
106
|
+
lina,linear_a
|
|
107
|
+
linb,linear_b
|
|
108
|
+
ll,lowercase_letter
|
|
109
|
+
lm,modifier_letter
|
|
110
|
+
lo,other_letter
|
|
111
|
+
loe,logical_order_exception
|
|
112
|
+
lt,titlecase_letter
|
|
113
|
+
lu,uppercase_letter
|
|
114
|
+
lyci,lycian
|
|
115
|
+
lydi,lydian
|
|
116
|
+
m,mark
|
|
117
|
+
mahj,mahajani
|
|
118
|
+
maka,makasar
|
|
119
|
+
mand,mandaic
|
|
120
|
+
mani,manichaean
|
|
121
|
+
marc,marchen
|
|
122
|
+
mc,spacing_mark
|
|
123
|
+
me,enclosing_mark
|
|
124
|
+
medf,medefaidrin
|
|
125
|
+
mend,mende_kikakui
|
|
126
|
+
merc,meroitic_cursive
|
|
127
|
+
mero,meroitic_hieroglyphs
|
|
128
|
+
mlym,malayalam
|
|
129
|
+
mn,nonspacing_mark
|
|
130
|
+
mong,mongolian
|
|
131
|
+
mroo,mro
|
|
132
|
+
mtei,meetei_mayek
|
|
133
|
+
mult,multani
|
|
134
|
+
mymr,myanmar
|
|
135
|
+
n,number
|
|
136
|
+
nand,nandinagari
|
|
137
|
+
narb,old_north_arabian
|
|
138
|
+
nbat,nabataean
|
|
139
|
+
nchar,noncharacter_code_point
|
|
140
|
+
nd,decimal_number
|
|
141
|
+
nkoo,nko
|
|
142
|
+
nl,letter_number
|
|
143
|
+
no,other_number
|
|
144
|
+
nshu,nushu
|
|
145
|
+
oalpha,other_alphabetic
|
|
146
|
+
odi,other_default_ignorable_code_point
|
|
147
|
+
ogam,ogham
|
|
148
|
+
ogrext,other_grapheme_extend
|
|
149
|
+
oidc,other_id_continue
|
|
150
|
+
oids,other_id_start
|
|
151
|
+
olck,ol_chiki
|
|
152
|
+
olower,other_lowercase
|
|
153
|
+
omath,other_math
|
|
154
|
+
orkh,old_turkic
|
|
155
|
+
orya,oriya
|
|
156
|
+
osge,osage
|
|
157
|
+
osma,osmanya
|
|
158
|
+
ougr,old_uyghur
|
|
159
|
+
oupper,other_uppercase
|
|
160
|
+
p,punctuation
|
|
161
|
+
palm,palmyrene
|
|
162
|
+
patsyn,pattern_syntax
|
|
163
|
+
patws,pattern_white_space
|
|
164
|
+
pauc,pau_cin_hau
|
|
165
|
+
pc,connector_punctuation
|
|
166
|
+
pcm,prepended_concatenation_mark
|
|
167
|
+
pd,dash_punctuation
|
|
168
|
+
pe,close_punctuation
|
|
169
|
+
perm,old_permic
|
|
170
|
+
pf,final_punctuation
|
|
171
|
+
phag,phags_pa
|
|
172
|
+
phli,inscriptional_pahlavi
|
|
173
|
+
phlp,psalter_pahlavi
|
|
174
|
+
phnx,phoenician
|
|
175
|
+
pi,initial_punctuation
|
|
176
|
+
plrd,miao
|
|
177
|
+
po,other_punctuation
|
|
178
|
+
prti,inscriptional_parthian
|
|
179
|
+
ps,open_punctuation
|
|
180
|
+
qaac,coptic
|
|
181
|
+
qaai,inherited
|
|
182
|
+
qmark,quotation_mark
|
|
183
|
+
ri,regional_indicator
|
|
184
|
+
rjng,rejang
|
|
185
|
+
rohg,hanifi_rohingya
|
|
186
|
+
runr,runic
|
|
187
|
+
s,symbol
|
|
188
|
+
samr,samaritan
|
|
189
|
+
sarb,old_south_arabian
|
|
190
|
+
saur,saurashtra
|
|
191
|
+
sc,currency_symbol
|
|
192
|
+
sd,soft_dotted
|
|
193
|
+
sgnw,signwriting
|
|
194
|
+
shaw,shavian
|
|
195
|
+
shrd,sharada
|
|
196
|
+
sidd,siddham
|
|
197
|
+
sind,khudawadi
|
|
198
|
+
sinh,sinhala
|
|
199
|
+
sk,modifier_symbol
|
|
200
|
+
sm,math_symbol
|
|
201
|
+
so,other_symbol
|
|
202
|
+
sogd,sogdian
|
|
203
|
+
sogo,old_sogdian
|
|
204
|
+
sora,sora_sompeng
|
|
205
|
+
soyo,soyombo
|
|
206
|
+
sterm,sentence_terminal
|
|
207
|
+
sund,sundanese
|
|
208
|
+
sylo,syloti_nagri
|
|
209
|
+
syrc,syriac
|
|
210
|
+
tagb,tagbanwa
|
|
211
|
+
takr,takri
|
|
212
|
+
tale,tai_le
|
|
213
|
+
talu,new_tai_lue
|
|
214
|
+
taml,tamil
|
|
215
|
+
tang,tangut
|
|
216
|
+
tavt,tai_viet
|
|
217
|
+
telu,telugu
|
|
218
|
+
term,terminal_punctuation
|
|
219
|
+
tfng,tifinagh
|
|
220
|
+
tglg,tagalog
|
|
221
|
+
thaa,thaana
|
|
222
|
+
tibt,tibetan
|
|
223
|
+
tirh,tirhuta
|
|
224
|
+
tnsa,tangsa
|
|
225
|
+
ugar,ugaritic
|
|
226
|
+
uideo,unified_ideograph
|
|
227
|
+
vaii,vai
|
|
228
|
+
vith,vithkuqi
|
|
229
|
+
vs,variation_selector
|
|
230
|
+
wara,warang_citi
|
|
231
|
+
wcho,wancho
|
|
232
|
+
wspace,white_space
|
|
233
|
+
xidc,xid_continue
|
|
234
|
+
xids,xid_start
|
|
235
|
+
xpeo,old_persian
|
|
236
|
+
xsux,cuneiform
|
|
237
|
+
yezi,yezidi
|
|
238
|
+
yiii,yi
|
|
239
|
+
z,separator
|
|
240
|
+
zanb,zanabazar_square
|
|
241
|
+
zinh,inherited
|
|
242
|
+
zl,line_separator
|
|
243
|
+
zp,paragraph_separator
|
|
244
|
+
zs,space_separator
|
|
245
|
+
zyyy,common
|
|
246
|
+
zzzz,unknown
|
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
|
21
21
|
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
|
23
|
-
|
|
23
|
+
validation_error(:property, name) unless token
|
|
24
24
|
|
|
25
25
|
self.emit(type, token.to_sym, text)
|
|
26
26
|
|
|
@@ -28,13 +28,7 @@
|
|
|
28
28
|
|
|
29
29
|
comment = ('#' . [^\n]* . '\n'?);
|
|
30
30
|
|
|
31
|
-
|
|
32
|
-
'cntrl' | 'digit' | 'graph' |
|
|
33
|
-
'lower' | 'print' | 'punct' |
|
|
34
|
-
'space' | 'upper' | 'xdigit' |
|
|
35
|
-
'word' | 'ascii';
|
|
36
|
-
|
|
37
|
-
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
|
31
|
+
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
|
38
32
|
|
|
39
33
|
|
|
40
34
|
# these are not supported in ruby at the moment
|
|
@@ -74,8 +68,7 @@
|
|
|
74
68
|
quantity_maximum = ',' . (digit+);
|
|
75
69
|
quantity_range = (digit+) . ',' . (digit+);
|
|
76
70
|
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
|
77
|
-
quantity_maximum | quantity_range ) . range_close
|
|
78
|
-
quantifier_mode?;
|
|
71
|
+
quantity_maximum | quantity_range ) . range_close;
|
|
79
72
|
|
|
80
73
|
quantifiers = quantifier_greedy | quantifier_reluctant |
|
|
81
74
|
quantifier_possessive | quantifier_interval;
|
|
@@ -223,24 +216,28 @@
|
|
|
223
216
|
fcall character_set;
|
|
224
217
|
};
|
|
225
218
|
|
|
226
|
-
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
|
219
|
+
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
|
227
220
|
text = copy(data, ts, te)
|
|
228
221
|
|
|
229
222
|
type = :posixclass
|
|
230
223
|
class_name = text[2..-3]
|
|
231
|
-
if class_name[0]
|
|
224
|
+
if class_name[0] == '^'
|
|
232
225
|
class_name = class_name[1..-1]
|
|
233
226
|
type = :nonposixclass
|
|
234
227
|
end
|
|
235
228
|
|
|
229
|
+
unless self.class.posix_classes.include?(class_name)
|
|
230
|
+
validation_error(:posix_class, text)
|
|
231
|
+
end
|
|
232
|
+
|
|
236
233
|
emit(type, class_name.to_sym, text)
|
|
237
234
|
};
|
|
238
235
|
|
|
239
236
|
# These are not supported in ruby at the moment. Enable them if they are.
|
|
240
|
-
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
|
237
|
+
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
|
241
238
|
# emit(:set, :collation, copy(data, ts, te))
|
|
242
239
|
# };
|
|
243
|
-
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
|
240
|
+
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
|
244
241
|
# emit(:set, :equivalent, copy(data, ts, te))
|
|
245
242
|
# };
|
|
246
243
|
|
|
@@ -323,7 +320,7 @@
|
|
|
323
320
|
|
|
324
321
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
|
325
322
|
text = copy(data, ts-1, te)
|
|
326
|
-
if text[2]
|
|
323
|
+
if text[2] == '{'
|
|
327
324
|
emit(:escape, :codepoint_list, text)
|
|
328
325
|
else
|
|
329
326
|
emit(:escape, :codepoint, text)
|
|
@@ -419,12 +416,12 @@
|
|
|
419
416
|
|
|
420
417
|
backslash . anchor_char > (backslashed, 3) {
|
|
421
418
|
case text = copy(data, ts, te)
|
|
422
|
-
when '
|
|
423
|
-
when '
|
|
424
|
-
when '
|
|
425
|
-
when '
|
|
426
|
-
when '
|
|
427
|
-
when '
|
|
419
|
+
when '\A'; emit(:anchor, :bos, text)
|
|
420
|
+
when '\z'; emit(:anchor, :eos, text)
|
|
421
|
+
when '\Z'; emit(:anchor, :eos_ob_eol, text)
|
|
422
|
+
when '\b'; emit(:anchor, :word_boundary, text)
|
|
423
|
+
when '\B'; emit(:anchor, :nonword_boundary, text)
|
|
424
|
+
when '\G'; emit(:anchor, :match_start, text)
|
|
428
425
|
end
|
|
429
426
|
};
|
|
430
427
|
|
|
@@ -477,7 +474,7 @@
|
|
|
477
474
|
group_open . group_options >group_opened {
|
|
478
475
|
text = copy(data, ts, te)
|
|
479
476
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
|
480
|
-
|
|
477
|
+
validation_error(:group_option, $1 || "-#{$2}", text)
|
|
481
478
|
end
|
|
482
479
|
emit_options(text)
|
|
483
480
|
};
|
|
@@ -605,7 +602,7 @@
|
|
|
605
602
|
end
|
|
606
603
|
};
|
|
607
604
|
|
|
608
|
-
quantifier_interval
|
|
605
|
+
quantifier_interval {
|
|
609
606
|
emit(:quantifier, :interval, copy(data, ts, te))
|
|
610
607
|
};
|
|
611
608
|
|
|
@@ -686,6 +683,7 @@ class Regexp::Scanner
|
|
|
686
683
|
end
|
|
687
684
|
|
|
688
685
|
# Invalid groupOption. Used for inline options.
|
|
686
|
+
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
|
689
687
|
class InvalidGroupOption < ValidationError
|
|
690
688
|
def initialize(option, text)
|
|
691
689
|
super "Invalid group option #{option} in #{text}"
|
|
@@ -706,6 +704,13 @@ class Regexp::Scanner
|
|
|
706
704
|
end
|
|
707
705
|
end
|
|
708
706
|
|
|
707
|
+
# The POSIX class name was not recognized by the scanner.
|
|
708
|
+
class UnknownPosixClassError < ValidationError
|
|
709
|
+
def initialize(text)
|
|
710
|
+
super "Unknown POSIX class #{text}"
|
|
711
|
+
end
|
|
712
|
+
end
|
|
713
|
+
|
|
709
714
|
# Scans the given regular expression text, or Regexp object and collects the
|
|
710
715
|
# emitted token into an array that gets returned at the end. If a block is
|
|
711
716
|
# given, it gets called for each emitted token.
|
|
@@ -759,14 +764,21 @@ class Regexp::Scanner
|
|
|
759
764
|
end
|
|
760
765
|
|
|
761
766
|
# lazy-load property maps when first needed
|
|
762
|
-
require 'yaml'
|
|
763
|
-
|
|
764
767
|
def self.short_prop_map
|
|
765
|
-
@short_prop_map ||=
|
|
768
|
+
@short_prop_map ||= parse_prop_map('short')
|
|
766
769
|
end
|
|
767
770
|
|
|
768
771
|
def self.long_prop_map
|
|
769
|
-
@long_prop_map ||=
|
|
772
|
+
@long_prop_map ||= parse_prop_map('long')
|
|
773
|
+
end
|
|
774
|
+
|
|
775
|
+
def self.parse_prop_map(name)
|
|
776
|
+
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
|
777
|
+
end
|
|
778
|
+
|
|
779
|
+
def self.posix_classes
|
|
780
|
+
%w[alnum alpha ascii blank cntrl digit graph
|
|
781
|
+
lower print punct space upper word xdigit]
|
|
770
782
|
end
|
|
771
783
|
|
|
772
784
|
# Emits an array with the details of the scanned pattern
|
|
@@ -871,15 +883,16 @@ class Regexp::Scanner
|
|
|
871
883
|
|
|
872
884
|
# Centralizes and unifies the handling of validation related
|
|
873
885
|
# errors.
|
|
874
|
-
def validation_error(type, what, reason)
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
886
|
+
def validation_error(type, what, reason = nil)
|
|
887
|
+
error =
|
|
888
|
+
case type
|
|
889
|
+
when :backref then InvalidBackrefError.new(what, reason)
|
|
890
|
+
when :group then InvalidGroupError.new(what, reason)
|
|
891
|
+
when :group_option then InvalidGroupOption.new(what, reason)
|
|
892
|
+
when :posix_class then UnknownPosixClassError.new(what)
|
|
893
|
+
when :property then UnknownUnicodePropertyError.new(what)
|
|
894
|
+
when :sequence then InvalidSequenceError.new(what, reason)
|
|
895
|
+
end
|
|
883
896
|
|
|
884
897
|
raise error # unless @@config.validation_ignore
|
|
885
898
|
end
|