regexp_parser 2.1.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +94 -6
- data/Gemfile +2 -1
- data/LICENSE +1 -1
- data/README.md +40 -30
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +1 -1
- data/lib/regexp_parser/expression/base.rb +75 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +1 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -2
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +2 -2
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +1 -3
- data/lib/regexp_parser/expression/classes/group.rb +6 -6
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/root.rb +3 -6
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -2
- data/lib/regexp_parser/expression/methods/construct.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +1 -1
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +10 -1
- data/lib/regexp_parser/expression/quantifier.rb +41 -23
- data/lib/regexp_parser/expression/sequence.rb +9 -24
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
- data/lib/regexp_parser/expression/shared.rb +85 -0
- data/lib/regexp_parser/expression/subexpression.rb +11 -8
- data/lib/regexp_parser/expression.rb +10 -132
- data/lib/regexp_parser/lexer.rb +8 -6
- data/lib/regexp_parser/parser.rb +21 -72
- data/lib/regexp_parser/scanner/properties/long.csv +622 -0
- data/lib/regexp_parser/scanner/properties/short.csv +246 -0
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +48 -35
- data/lib/regexp_parser/scanner.rb +735 -801
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +91 -66
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +31 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +717 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +20 -29
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +1 -1
- data/lib/regexp_parser/syntax.rb +1 -1
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +20 -22
- metadata +37 -166
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -104
- data/spec/expression/clone_spec.rb +0 -152
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -108
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/delimiters_spec.rb +0 -68
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -64
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -60
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/options_spec.rb +0 -28
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -68
- data/spec/parser/refcalls_spec.rb +0 -117
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/delimiters_spec.rb +0 -52
- data/spec/scanner/errors_spec.rb +0 -67
- data/spec/scanner/escapes_spec.rb +0 -64
- data/spec/scanner/free_space_spec.rb +0 -165
- data/spec/scanner/groups_spec.rb +0 -61
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -39
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/options_spec.rb +0 -36
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -25
- data/spec/scanner/refcalls_spec.rb +0 -55
- data/spec/scanner/sets_spec.rb +0 -151
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -16
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
@@ -0,0 +1,246 @@
|
|
1
|
+
# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT
|
2
|
+
adlm,adlam
|
3
|
+
aghb,caucasian_albanian
|
4
|
+
ahex,ascii_hex_digit
|
5
|
+
arab,arabic
|
6
|
+
armi,imperial_aramaic
|
7
|
+
armn,armenian
|
8
|
+
avst,avestan
|
9
|
+
bali,balinese
|
10
|
+
bamu,bamum
|
11
|
+
bass,bassa_vah
|
12
|
+
batk,batak
|
13
|
+
beng,bengali
|
14
|
+
bhks,bhaiksuki
|
15
|
+
bidic,bidi_control
|
16
|
+
bopo,bopomofo
|
17
|
+
brah,brahmi
|
18
|
+
brai,braille
|
19
|
+
bugi,buginese
|
20
|
+
buhd,buhid
|
21
|
+
c,other
|
22
|
+
cakm,chakma
|
23
|
+
cans,canadian_aboriginal
|
24
|
+
cari,carian
|
25
|
+
cc,control
|
26
|
+
cf,format
|
27
|
+
cher,cherokee
|
28
|
+
chrs,chorasmian
|
29
|
+
ci,case_ignorable
|
30
|
+
cn,unassigned
|
31
|
+
co,private_use
|
32
|
+
combiningmark,mark
|
33
|
+
copt,coptic
|
34
|
+
cpmn,cypro_minoan
|
35
|
+
cprt,cypriot
|
36
|
+
cs,surrogate
|
37
|
+
cwcf,changes_when_casefolded
|
38
|
+
cwcm,changes_when_casemapped
|
39
|
+
cwl,changes_when_lowercased
|
40
|
+
cwt,changes_when_titlecased
|
41
|
+
cwu,changes_when_uppercased
|
42
|
+
cyrl,cyrillic
|
43
|
+
dep,deprecated
|
44
|
+
deva,devanagari
|
45
|
+
di,default_ignorable_code_point
|
46
|
+
dia,diacritic
|
47
|
+
diak,dives_akuru
|
48
|
+
dogr,dogra
|
49
|
+
dsrt,deseret
|
50
|
+
dupl,duployan
|
51
|
+
ebase,emoji_modifier_base
|
52
|
+
ecomp,emoji_component
|
53
|
+
egyp,egyptian_hieroglyphs
|
54
|
+
elba,elbasan
|
55
|
+
elym,elymaic
|
56
|
+
emod,emoji_modifier
|
57
|
+
epres,emoji_presentation
|
58
|
+
ethi,ethiopic
|
59
|
+
ext,extender
|
60
|
+
geor,georgian
|
61
|
+
glag,glagolitic
|
62
|
+
gong,gunjala_gondi
|
63
|
+
gonm,masaram_gondi
|
64
|
+
goth,gothic
|
65
|
+
gran,grantha
|
66
|
+
grbase,grapheme_base
|
67
|
+
grek,greek
|
68
|
+
grext,grapheme_extend
|
69
|
+
grlink,grapheme_link
|
70
|
+
gujr,gujarati
|
71
|
+
guru,gurmukhi
|
72
|
+
hang,hangul
|
73
|
+
hani,han
|
74
|
+
hano,hanunoo
|
75
|
+
hatr,hatran
|
76
|
+
hebr,hebrew
|
77
|
+
hex,hex_digit
|
78
|
+
hira,hiragana
|
79
|
+
hluw,anatolian_hieroglyphs
|
80
|
+
hmng,pahawh_hmong
|
81
|
+
hmnp,nyiakeng_puachue_hmong
|
82
|
+
hung,old_hungarian
|
83
|
+
idc,id_continue
|
84
|
+
ideo,ideographic
|
85
|
+
ids,id_start
|
86
|
+
idsb,ids_binary_operator
|
87
|
+
idst,ids_trinary_operator
|
88
|
+
ital,old_italic
|
89
|
+
java,javanese
|
90
|
+
joinc,join_control
|
91
|
+
kali,kayah_li
|
92
|
+
kana,katakana
|
93
|
+
khar,kharoshthi
|
94
|
+
khmr,khmer
|
95
|
+
khoj,khojki
|
96
|
+
kits,khitan_small_script
|
97
|
+
knda,kannada
|
98
|
+
kthi,kaithi
|
99
|
+
l,letter
|
100
|
+
lana,tai_tham
|
101
|
+
laoo,lao
|
102
|
+
latn,latin
|
103
|
+
lc,cased_letter
|
104
|
+
lepc,lepcha
|
105
|
+
limb,limbu
|
106
|
+
lina,linear_a
|
107
|
+
linb,linear_b
|
108
|
+
ll,lowercase_letter
|
109
|
+
lm,modifier_letter
|
110
|
+
lo,other_letter
|
111
|
+
loe,logical_order_exception
|
112
|
+
lt,titlecase_letter
|
113
|
+
lu,uppercase_letter
|
114
|
+
lyci,lycian
|
115
|
+
lydi,lydian
|
116
|
+
m,mark
|
117
|
+
mahj,mahajani
|
118
|
+
maka,makasar
|
119
|
+
mand,mandaic
|
120
|
+
mani,manichaean
|
121
|
+
marc,marchen
|
122
|
+
mc,spacing_mark
|
123
|
+
me,enclosing_mark
|
124
|
+
medf,medefaidrin
|
125
|
+
mend,mende_kikakui
|
126
|
+
merc,meroitic_cursive
|
127
|
+
mero,meroitic_hieroglyphs
|
128
|
+
mlym,malayalam
|
129
|
+
mn,nonspacing_mark
|
130
|
+
mong,mongolian
|
131
|
+
mroo,mro
|
132
|
+
mtei,meetei_mayek
|
133
|
+
mult,multani
|
134
|
+
mymr,myanmar
|
135
|
+
n,number
|
136
|
+
nand,nandinagari
|
137
|
+
narb,old_north_arabian
|
138
|
+
nbat,nabataean
|
139
|
+
nchar,noncharacter_code_point
|
140
|
+
nd,decimal_number
|
141
|
+
nkoo,nko
|
142
|
+
nl,letter_number
|
143
|
+
no,other_number
|
144
|
+
nshu,nushu
|
145
|
+
oalpha,other_alphabetic
|
146
|
+
odi,other_default_ignorable_code_point
|
147
|
+
ogam,ogham
|
148
|
+
ogrext,other_grapheme_extend
|
149
|
+
oidc,other_id_continue
|
150
|
+
oids,other_id_start
|
151
|
+
olck,ol_chiki
|
152
|
+
olower,other_lowercase
|
153
|
+
omath,other_math
|
154
|
+
orkh,old_turkic
|
155
|
+
orya,oriya
|
156
|
+
osge,osage
|
157
|
+
osma,osmanya
|
158
|
+
ougr,old_uyghur
|
159
|
+
oupper,other_uppercase
|
160
|
+
p,punctuation
|
161
|
+
palm,palmyrene
|
162
|
+
patsyn,pattern_syntax
|
163
|
+
patws,pattern_white_space
|
164
|
+
pauc,pau_cin_hau
|
165
|
+
pc,connector_punctuation
|
166
|
+
pcm,prepended_concatenation_mark
|
167
|
+
pd,dash_punctuation
|
168
|
+
pe,close_punctuation
|
169
|
+
perm,old_permic
|
170
|
+
pf,final_punctuation
|
171
|
+
phag,phags_pa
|
172
|
+
phli,inscriptional_pahlavi
|
173
|
+
phlp,psalter_pahlavi
|
174
|
+
phnx,phoenician
|
175
|
+
pi,initial_punctuation
|
176
|
+
plrd,miao
|
177
|
+
po,other_punctuation
|
178
|
+
prti,inscriptional_parthian
|
179
|
+
ps,open_punctuation
|
180
|
+
qaac,coptic
|
181
|
+
qaai,inherited
|
182
|
+
qmark,quotation_mark
|
183
|
+
ri,regional_indicator
|
184
|
+
rjng,rejang
|
185
|
+
rohg,hanifi_rohingya
|
186
|
+
runr,runic
|
187
|
+
s,symbol
|
188
|
+
samr,samaritan
|
189
|
+
sarb,old_south_arabian
|
190
|
+
saur,saurashtra
|
191
|
+
sc,currency_symbol
|
192
|
+
sd,soft_dotted
|
193
|
+
sgnw,signwriting
|
194
|
+
shaw,shavian
|
195
|
+
shrd,sharada
|
196
|
+
sidd,siddham
|
197
|
+
sind,khudawadi
|
198
|
+
sinh,sinhala
|
199
|
+
sk,modifier_symbol
|
200
|
+
sm,math_symbol
|
201
|
+
so,other_symbol
|
202
|
+
sogd,sogdian
|
203
|
+
sogo,old_sogdian
|
204
|
+
sora,sora_sompeng
|
205
|
+
soyo,soyombo
|
206
|
+
sterm,sentence_terminal
|
207
|
+
sund,sundanese
|
208
|
+
sylo,syloti_nagri
|
209
|
+
syrc,syriac
|
210
|
+
tagb,tagbanwa
|
211
|
+
takr,takri
|
212
|
+
tale,tai_le
|
213
|
+
talu,new_tai_lue
|
214
|
+
taml,tamil
|
215
|
+
tang,tangut
|
216
|
+
tavt,tai_viet
|
217
|
+
telu,telugu
|
218
|
+
term,terminal_punctuation
|
219
|
+
tfng,tifinagh
|
220
|
+
tglg,tagalog
|
221
|
+
thaa,thaana
|
222
|
+
tibt,tibetan
|
223
|
+
tirh,tirhuta
|
224
|
+
tnsa,tangsa
|
225
|
+
ugar,ugaritic
|
226
|
+
uideo,unified_ideograph
|
227
|
+
vaii,vai
|
228
|
+
vith,vithkuqi
|
229
|
+
vs,variation_selector
|
230
|
+
wara,warang_citi
|
231
|
+
wcho,wancho
|
232
|
+
wspace,white_space
|
233
|
+
xidc,xid_continue
|
234
|
+
xids,xid_start
|
235
|
+
xpeo,old_persian
|
236
|
+
xsux,cuneiform
|
237
|
+
yezi,yezidi
|
238
|
+
yiii,yi
|
239
|
+
z,separator
|
240
|
+
zanb,zanabazar_square
|
241
|
+
zinh,inherited
|
242
|
+
zl,line_separator
|
243
|
+
zp,paragraph_separator
|
244
|
+
zs,space_separator
|
245
|
+
zyyy,common
|
246
|
+
zzzz,unknown
|
@@ -20,7 +20,7 @@
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
21
21
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
|
-
|
23
|
+
validation_error(:property, name) unless token
|
24
24
|
|
25
25
|
self.emit(type, token.to_sym, text)
|
26
26
|
|
@@ -28,13 +28,7 @@
|
|
28
28
|
|
29
29
|
comment = ('#' . [^\n]* . '\n'?);
|
30
30
|
|
31
|
-
|
32
|
-
'cntrl' | 'digit' | 'graph' |
|
33
|
-
'lower' | 'print' | 'punct' |
|
34
|
-
'space' | 'upper' | 'xdigit' |
|
35
|
-
'word' | 'ascii';
|
36
|
-
|
37
|
-
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
31
|
+
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
38
32
|
|
39
33
|
|
40
34
|
# these are not supported in ruby at the moment
|
@@ -74,8 +68,7 @@
|
|
74
68
|
quantity_maximum = ',' . (digit+);
|
75
69
|
quantity_range = (digit+) . ',' . (digit+);
|
76
70
|
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
77
|
-
quantity_maximum | quantity_range ) . range_close
|
78
|
-
quantifier_mode?;
|
71
|
+
quantity_maximum | quantity_range ) . range_close;
|
79
72
|
|
80
73
|
quantifiers = quantifier_greedy | quantifier_reluctant |
|
81
74
|
quantifier_possessive | quantifier_interval;
|
@@ -223,24 +216,28 @@
|
|
223
216
|
fcall character_set;
|
224
217
|
};
|
225
218
|
|
226
|
-
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
219
|
+
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
227
220
|
text = copy(data, ts, te)
|
228
221
|
|
229
222
|
type = :posixclass
|
230
223
|
class_name = text[2..-3]
|
231
|
-
if class_name[0]
|
224
|
+
if class_name[0] == '^'
|
232
225
|
class_name = class_name[1..-1]
|
233
226
|
type = :nonposixclass
|
234
227
|
end
|
235
228
|
|
229
|
+
unless self.class.posix_classes.include?(class_name)
|
230
|
+
validation_error(:posix_class, text)
|
231
|
+
end
|
232
|
+
|
236
233
|
emit(type, class_name.to_sym, text)
|
237
234
|
};
|
238
235
|
|
239
236
|
# These are not supported in ruby at the moment. Enable them if they are.
|
240
|
-
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
237
|
+
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
241
238
|
# emit(:set, :collation, copy(data, ts, te))
|
242
239
|
# };
|
243
|
-
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
240
|
+
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
244
241
|
# emit(:set, :equivalent, copy(data, ts, te))
|
245
242
|
# };
|
246
243
|
|
@@ -323,7 +320,7 @@
|
|
323
320
|
|
324
321
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
325
322
|
text = copy(data, ts-1, te)
|
326
|
-
if text[2]
|
323
|
+
if text[2] == '{'
|
327
324
|
emit(:escape, :codepoint_list, text)
|
328
325
|
else
|
329
326
|
emit(:escape, :codepoint, text)
|
@@ -419,12 +416,12 @@
|
|
419
416
|
|
420
417
|
backslash . anchor_char > (backslashed, 3) {
|
421
418
|
case text = copy(data, ts, te)
|
422
|
-
when '
|
423
|
-
when '
|
424
|
-
when '
|
425
|
-
when '
|
426
|
-
when '
|
427
|
-
when '
|
419
|
+
when '\A'; emit(:anchor, :bos, text)
|
420
|
+
when '\z'; emit(:anchor, :eos, text)
|
421
|
+
when '\Z'; emit(:anchor, :eos_ob_eol, text)
|
422
|
+
when '\b'; emit(:anchor, :word_boundary, text)
|
423
|
+
when '\B'; emit(:anchor, :nonword_boundary, text)
|
424
|
+
when '\G'; emit(:anchor, :match_start, text)
|
428
425
|
end
|
429
426
|
};
|
430
427
|
|
@@ -477,7 +474,7 @@
|
|
477
474
|
group_open . group_options >group_opened {
|
478
475
|
text = copy(data, ts, te)
|
479
476
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
480
|
-
|
477
|
+
validation_error(:group_option, $1 || "-#{$2}", text)
|
481
478
|
end
|
482
479
|
emit_options(text)
|
483
480
|
};
|
@@ -605,7 +602,7 @@
|
|
605
602
|
end
|
606
603
|
};
|
607
604
|
|
608
|
-
quantifier_interval
|
605
|
+
quantifier_interval {
|
609
606
|
emit(:quantifier, :interval, copy(data, ts, te))
|
610
607
|
};
|
611
608
|
|
@@ -686,6 +683,7 @@ class Regexp::Scanner
|
|
686
683
|
end
|
687
684
|
|
688
685
|
# Invalid groupOption. Used for inline options.
|
686
|
+
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
689
687
|
class InvalidGroupOption < ValidationError
|
690
688
|
def initialize(option, text)
|
691
689
|
super "Invalid group option #{option} in #{text}"
|
@@ -706,6 +704,13 @@ class Regexp::Scanner
|
|
706
704
|
end
|
707
705
|
end
|
708
706
|
|
707
|
+
# The POSIX class name was not recognized by the scanner.
|
708
|
+
class UnknownPosixClassError < ValidationError
|
709
|
+
def initialize(text)
|
710
|
+
super "Unknown POSIX class #{text}"
|
711
|
+
end
|
712
|
+
end
|
713
|
+
|
709
714
|
# Scans the given regular expression text, or Regexp object and collects the
|
710
715
|
# emitted token into an array that gets returned at the end. If a block is
|
711
716
|
# given, it gets called for each emitted token.
|
@@ -759,14 +764,21 @@ class Regexp::Scanner
|
|
759
764
|
end
|
760
765
|
|
761
766
|
# lazy-load property maps when first needed
|
762
|
-
require 'yaml'
|
763
|
-
|
764
767
|
def self.short_prop_map
|
765
|
-
@short_prop_map ||=
|
768
|
+
@short_prop_map ||= parse_prop_map('short')
|
766
769
|
end
|
767
770
|
|
768
771
|
def self.long_prop_map
|
769
|
-
@long_prop_map ||=
|
772
|
+
@long_prop_map ||= parse_prop_map('long')
|
773
|
+
end
|
774
|
+
|
775
|
+
def self.parse_prop_map(name)
|
776
|
+
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
777
|
+
end
|
778
|
+
|
779
|
+
def self.posix_classes
|
780
|
+
%w[alnum alpha ascii blank cntrl digit graph
|
781
|
+
lower print punct space upper word xdigit]
|
770
782
|
end
|
771
783
|
|
772
784
|
# Emits an array with the details of the scanned pattern
|
@@ -871,15 +883,16 @@ class Regexp::Scanner
|
|
871
883
|
|
872
884
|
# Centralizes and unifies the handling of validation related
|
873
885
|
# errors.
|
874
|
-
def validation_error(type, what, reason)
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
886
|
+
def validation_error(type, what, reason = nil)
|
887
|
+
error =
|
888
|
+
case type
|
889
|
+
when :backref then InvalidBackrefError.new(what, reason)
|
890
|
+
when :group then InvalidGroupError.new(what, reason)
|
891
|
+
when :group_option then InvalidGroupOption.new(what, reason)
|
892
|
+
when :posix_class then UnknownPosixClassError.new(what)
|
893
|
+
when :property then UnknownUnicodePropertyError.new(what)
|
894
|
+
when :sequence then InvalidSequenceError.new(what, reason)
|
895
|
+
end
|
883
896
|
|
884
897
|
raise error # unless @@config.validation_ignore
|
885
898
|
end
|