regexp_parser 1.7.1 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +157 -1
- data/Gemfile +6 -1
- data/LICENSE +1 -1
- data/README.md +38 -32
- data/Rakefile +18 -27
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/base.rb +123 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +5 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -1
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +0 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +2 -4
- data/lib/regexp_parser/expression/classes/group.rb +28 -3
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/property.rb +1 -3
- data/lib/regexp_parser/expression/classes/root.rb +4 -17
- data/lib/regexp_parser/expression/classes/type.rb +0 -2
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
- data/lib/regexp_parser/expression/quantifier.rb +11 -2
- data/lib/regexp_parser/expression/sequence.rb +3 -20
- data/lib/regexp_parser/expression/subexpression.rb +1 -2
- data/lib/regexp_parser/expression.rb +7 -139
- data/lib/regexp_parser/lexer.rb +13 -11
- data/lib/regexp_parser/parser.rb +325 -344
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/properties/long.csv +604 -0
- data/lib/regexp_parser/scanner/properties/short.csv +242 -0
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +235 -255
- data/lib/regexp_parser/scanner.rb +1324 -1387
- data/lib/regexp_parser/syntax/any.rb +4 -6
- data/lib/regexp_parser/syntax/base.rb +13 -15
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +31 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +696 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -2
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +1 -1
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +10 -0
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +20 -22
- metadata +34 -165
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -94
- data/spec/expression/clone_spec.rb +0 -120
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -100
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/delimiters_spec.rb +0 -68
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -89
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -55
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -52
- data/spec/parser/refcalls_spec.rb +0 -112
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/delimiters_spec.rb +0 -52
- data/spec/scanner/errors_spec.rb +0 -67
- data/spec/scanner/escapes_spec.rb +0 -53
- data/spec/scanner/free_space_spec.rb +0 -133
- data/spec/scanner/groups_spec.rb +0 -52
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -49
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -20
- data/spec/scanner/refcalls_spec.rb +0 -36
- data/spec/scanner/sets_spec.rb +0 -102
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -15
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
@@ -3,6 +3,11 @@
|
|
3
3
|
include re_char_type "char_type.rl";
|
4
4
|
include re_property "property.rl";
|
5
5
|
|
6
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
7
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
8
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
9
|
+
utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
10
|
+
|
6
11
|
dot = '.';
|
7
12
|
backslash = '\\';
|
8
13
|
alternation = '|';
|
@@ -15,13 +20,13 @@
|
|
15
20
|
|
16
21
|
group_open = '(';
|
17
22
|
group_close = ')';
|
18
|
-
|
23
|
+
parentheses = group_open | group_close;
|
19
24
|
|
20
25
|
set_open = '[';
|
21
26
|
set_close = ']';
|
22
27
|
brackets = set_open | set_close;
|
23
28
|
|
24
|
-
comment = ('#' . [^\n]* . '\n');
|
29
|
+
comment = ('#' . [^\n]* . '\n'?);
|
25
30
|
|
26
31
|
class_name_posix = 'alnum' | 'alpha' | 'blank' |
|
27
32
|
'cntrl' | 'digit' | 'graph' |
|
@@ -32,7 +37,7 @@
|
|
32
37
|
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
33
38
|
|
34
39
|
|
35
|
-
# these are not supported in ruby
|
40
|
+
# these are not supported in ruby at the moment
|
36
41
|
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
37
42
|
character_equivalent = '[=' . alpha . '=]';
|
38
43
|
|
@@ -53,6 +58,8 @@
|
|
53
58
|
|
54
59
|
meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
|
55
60
|
|
61
|
+
sequence_char = [CMcux];
|
62
|
+
|
56
63
|
zero_or_one = '?' | '??' | '?+';
|
57
64
|
zero_or_more = '*' | '*?' | '*+';
|
58
65
|
one_or_more = '+' | '+?' | '++';
|
@@ -90,21 +97,26 @@
|
|
90
97
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
91
98
|
|
92
99
|
group_ref = [gk];
|
93
|
-
|
94
|
-
|
95
|
-
group_number = '-'? . [1-9] .
|
100
|
+
group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
101
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
102
|
+
group_number = '-'? . [1-9] . [0-9]*;
|
96
103
|
group_level = [+\-] . [0-9]+;
|
97
104
|
|
98
|
-
group_name = ('<' .
|
105
|
+
group_name = ('<' . group_name_id_ab? . '>') |
|
106
|
+
("'" . group_name_id_sq? . "'");
|
99
107
|
group_lookup = group_name | group_number;
|
100
108
|
|
101
109
|
group_named = ('?' . group_name );
|
102
110
|
|
103
|
-
|
104
|
-
|
111
|
+
group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
|
112
|
+
("'" . group_name_id_sq? . group_level? "'"));
|
113
|
+
group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
|
114
|
+
("'" . group_name_id_sq? . group_level? "'"));
|
105
115
|
|
106
|
-
|
107
|
-
|
116
|
+
group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
|
117
|
+
("'" . group_number . group_level? "'"));
|
118
|
+
group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
|
119
|
+
("'" . ((group_number . group_level?) | '0') "'"));
|
108
120
|
|
109
121
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
110
122
|
|
@@ -115,33 +127,31 @@
|
|
115
127
|
|
116
128
|
# characters that 'break' a literal
|
117
129
|
meta_char = dot | backslash | alternation |
|
118
|
-
curlies |
|
130
|
+
curlies | parentheses | brackets |
|
119
131
|
line_anchor | quantifier_greedy;
|
120
132
|
|
121
133
|
literal_delimiters = ']' | '}';
|
122
134
|
|
123
|
-
ascii_print = ((0x20..0x7e) - meta_char);
|
135
|
+
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
124
136
|
ascii_nonprint = (0x01..0x1f | 0x7f);
|
125
137
|
|
126
|
-
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
127
|
-
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
128
|
-
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
129
|
-
|
130
138
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
131
|
-
|
139
|
+
keep_mark | sequence_char;
|
140
|
+
|
141
|
+
# escapes that also work within a character set
|
142
|
+
set_escape = backslash | brackets | escaped_ascii | property_char |
|
143
|
+
sequence_char | single_codepoint_char_type;
|
132
144
|
|
133
|
-
non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
|
134
|
-
multi_codepoint_char_type | [0-9cCM];
|
135
145
|
|
136
146
|
# EOF error, used where it can be detected
|
137
147
|
action premature_end_error {
|
138
|
-
text =
|
148
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
139
149
|
raise PrematureEndError.new( text )
|
140
150
|
}
|
141
151
|
|
142
152
|
# Invalid sequence error, used from sequences, like escapes and sets
|
143
153
|
action invalid_sequence_error {
|
144
|
-
text =
|
154
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
145
155
|
validation_error(:sequence, 'sequence', text)
|
146
156
|
}
|
147
157
|
|
@@ -156,7 +166,7 @@
|
|
156
166
|
# --------------------------------------------------------------------------
|
157
167
|
character_set := |*
|
158
168
|
set_close > (set_meta, 2) @set_closed {
|
159
|
-
emit(:set, :close,
|
169
|
+
emit(:set, :close, copy(data, ts, te))
|
160
170
|
if in_set?
|
161
171
|
fret;
|
162
172
|
else
|
@@ -165,8 +175,8 @@
|
|
165
175
|
};
|
166
176
|
|
167
177
|
'-]' @set_closed { # special case, emits two tokens
|
168
|
-
emit(:literal, :literal, copy(data, ts
|
169
|
-
emit(:set, :close, copy(data, ts+1
|
178
|
+
emit(:literal, :literal, copy(data, ts, te-1))
|
179
|
+
emit(:set, :close, copy(data, ts+1, te))
|
170
180
|
if in_set?
|
171
181
|
fret;
|
172
182
|
else
|
@@ -175,33 +185,33 @@
|
|
175
185
|
};
|
176
186
|
|
177
187
|
'-&&' { # special case, emits two tokens
|
178
|
-
emit(:literal, :literal, '-'
|
179
|
-
emit(:set, :intersection, '&&'
|
188
|
+
emit(:literal, :literal, '-')
|
189
|
+
emit(:set, :intersection, '&&')
|
180
190
|
};
|
181
191
|
|
182
192
|
'^' {
|
183
|
-
text =
|
193
|
+
text = copy(data, ts, te)
|
184
194
|
if tokens.last[1] == :open
|
185
|
-
emit(:set, :negate, text
|
195
|
+
emit(:set, :negate, text)
|
186
196
|
else
|
187
|
-
emit(:literal, :literal, text
|
197
|
+
emit(:literal, :literal, text)
|
188
198
|
end
|
189
199
|
};
|
190
200
|
|
191
201
|
'-' {
|
192
|
-
text =
|
202
|
+
text = copy(data, ts, te)
|
193
203
|
# ranges cant start with a subset or intersection/negation/range operator
|
194
204
|
if tokens.last[0] == :set
|
195
|
-
emit(:literal, :literal, text
|
205
|
+
emit(:literal, :literal, text)
|
196
206
|
else
|
197
|
-
emit(:set, :range, text
|
207
|
+
emit(:set, :range, text)
|
198
208
|
end
|
199
209
|
};
|
200
210
|
|
201
211
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
202
212
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
203
213
|
'&&' {
|
204
|
-
emit(:set, :intersection,
|
214
|
+
emit(:set, :intersection, copy(data, ts, te))
|
205
215
|
};
|
206
216
|
|
207
217
|
backslash {
|
@@ -209,12 +219,12 @@
|
|
209
219
|
};
|
210
220
|
|
211
221
|
set_open >(open_bracket, 1) >set_opened {
|
212
|
-
emit(:set, :open,
|
222
|
+
emit(:set, :open, copy(data, ts, te))
|
213
223
|
fcall character_set;
|
214
224
|
};
|
215
225
|
|
216
226
|
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
217
|
-
text =
|
227
|
+
text = copy(data, ts, te)
|
218
228
|
|
219
229
|
type = :posixclass
|
220
230
|
class_name = text[2..-3]
|
@@ -223,45 +233,40 @@
|
|
223
233
|
type = :nonposixclass
|
224
234
|
end
|
225
235
|
|
226
|
-
emit(type, class_name.to_sym, text
|
236
|
+
emit(type, class_name.to_sym, text)
|
227
237
|
};
|
228
238
|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
234
|
-
|
235
|
-
};
|
239
|
+
# These are not supported in ruby at the moment. Enable them if they are.
|
240
|
+
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
241
|
+
# emit(:set, :collation, copy(data, ts, te))
|
242
|
+
# };
|
243
|
+
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
244
|
+
# emit(:set, :equivalent, copy(data, ts, te))
|
245
|
+
# };
|
236
246
|
|
237
247
|
meta_char > (set_meta, 1) {
|
238
|
-
emit(:literal, :literal,
|
248
|
+
emit(:literal, :literal, copy(data, ts, te))
|
239
249
|
};
|
240
250
|
|
241
|
-
any
|
242
|
-
|
243
|
-
|
244
|
-
utf8_3_byte |
|
245
|
-
utf8_4_byte {
|
246
|
-
char, *rest = *text(data, ts, te)
|
247
|
-
char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
|
248
|
-
emit(:literal, :literal, char, *rest)
|
251
|
+
any | ascii_nonprint | utf8_multibyte {
|
252
|
+
text = copy(data, ts, te)
|
253
|
+
emit(:literal, :literal, text)
|
249
254
|
};
|
250
255
|
*|;
|
251
256
|
|
252
257
|
# set escapes scanner
|
253
258
|
# --------------------------------------------------------------------------
|
254
259
|
set_escape_sequence := |*
|
255
|
-
|
256
|
-
emit(:escape, :literal, *text(data, ts, te, 1))
|
257
|
-
fret;
|
258
|
-
};
|
259
|
-
|
260
|
-
any > (escaped_set_alpha, 1) {
|
260
|
+
set_escape > (escaped_set_alpha, 2) {
|
261
261
|
fhold;
|
262
262
|
fnext character_set;
|
263
263
|
fcall escape_sequence;
|
264
264
|
};
|
265
|
+
|
266
|
+
any > (escaped_set_alpha, 1) {
|
267
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
268
|
+
fret;
|
269
|
+
};
|
265
270
|
*|;
|
266
271
|
|
267
272
|
|
@@ -269,33 +274,33 @@
|
|
269
274
|
# --------------------------------------------------------------------------
|
270
275
|
escape_sequence := |*
|
271
276
|
[1-9] {
|
272
|
-
text =
|
273
|
-
emit(:backref, :number, text
|
277
|
+
text = copy(data, ts-1, te)
|
278
|
+
emit(:backref, :number, text)
|
274
279
|
fret;
|
275
280
|
};
|
276
281
|
|
277
282
|
octal_sequence {
|
278
|
-
emit(:escape, :octal,
|
283
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
279
284
|
fret;
|
280
285
|
};
|
281
286
|
|
282
287
|
meta_char {
|
283
|
-
case text =
|
284
|
-
when '\.'; emit(:escape, :dot, text
|
285
|
-
when '\|'; emit(:escape, :alternation, text
|
286
|
-
when '\^'; emit(:escape, :bol, text
|
287
|
-
when '\$'; emit(:escape, :eol, text
|
288
|
-
when '\?'; emit(:escape, :zero_or_one, text
|
289
|
-
when '\*'; emit(:escape, :zero_or_more, text
|
290
|
-
when '\+'; emit(:escape, :one_or_more, text
|
291
|
-
when '\('; emit(:escape, :group_open, text
|
292
|
-
when '\)'; emit(:escape, :group_close, text
|
293
|
-
when '\{'; emit(:escape, :interval_open, text
|
294
|
-
when '\}'; emit(:escape, :interval_close, text
|
295
|
-
when '\['; emit(:escape, :set_open, text
|
296
|
-
when '\]'; emit(:escape, :set_close, text
|
288
|
+
case text = copy(data, ts-1, te)
|
289
|
+
when '\.'; emit(:escape, :dot, text)
|
290
|
+
when '\|'; emit(:escape, :alternation, text)
|
291
|
+
when '\^'; emit(:escape, :bol, text)
|
292
|
+
when '\$'; emit(:escape, :eol, text)
|
293
|
+
when '\?'; emit(:escape, :zero_or_one, text)
|
294
|
+
when '\*'; emit(:escape, :zero_or_more, text)
|
295
|
+
when '\+'; emit(:escape, :one_or_more, text)
|
296
|
+
when '\('; emit(:escape, :group_open, text)
|
297
|
+
when '\)'; emit(:escape, :group_close, text)
|
298
|
+
when '\{'; emit(:escape, :interval_open, text)
|
299
|
+
when '\}'; emit(:escape, :interval_close, text)
|
300
|
+
when '\['; emit(:escape, :set_open, text)
|
301
|
+
when '\]'; emit(:escape, :set_close, text)
|
297
302
|
when "\\\\";
|
298
|
-
emit(:escape, :backslash, text
|
303
|
+
emit(:escape, :backslash, text)
|
299
304
|
end
|
300
305
|
fret;
|
301
306
|
};
|
@@ -303,31 +308,31 @@
|
|
303
308
|
escaped_ascii > (escaped_alpha, 7) {
|
304
309
|
# \b is emitted as backspace only when inside a character set, otherwise
|
305
310
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
306
|
-
case text =
|
307
|
-
when '\a'; emit(:escape, :bell, text
|
308
|
-
when '\b'; emit(:escape, :backspace, text
|
309
|
-
when '\e'; emit(:escape, :escape, text
|
310
|
-
when '\f'; emit(:escape, :form_feed, text
|
311
|
-
when '\n'; emit(:escape, :newline, text
|
312
|
-
when '\r'; emit(:escape, :carriage, text
|
313
|
-
when '\t'; emit(:escape, :tab, text
|
314
|
-
when '\v'; emit(:escape, :vertical_tab, text
|
311
|
+
case text = copy(data, ts-1, te)
|
312
|
+
when '\a'; emit(:escape, :bell, text)
|
313
|
+
when '\b'; emit(:escape, :backspace, text)
|
314
|
+
when '\e'; emit(:escape, :escape, text)
|
315
|
+
when '\f'; emit(:escape, :form_feed, text)
|
316
|
+
when '\n'; emit(:escape, :newline, text)
|
317
|
+
when '\r'; emit(:escape, :carriage, text)
|
318
|
+
when '\t'; emit(:escape, :tab, text)
|
319
|
+
when '\v'; emit(:escape, :vertical_tab, text)
|
315
320
|
end
|
316
321
|
fret;
|
317
322
|
};
|
318
323
|
|
319
324
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
320
|
-
text =
|
325
|
+
text = copy(data, ts-1, te)
|
321
326
|
if text[2].chr == '{'
|
322
|
-
emit(:escape, :codepoint_list, text
|
327
|
+
emit(:escape, :codepoint_list, text)
|
323
328
|
else
|
324
|
-
emit(:escape, :codepoint, text
|
329
|
+
emit(:escape, :codepoint, text)
|
325
330
|
end
|
326
331
|
fret;
|
327
332
|
};
|
328
333
|
|
329
|
-
hex_sequence > (escaped_alpha, 5)
|
330
|
-
emit(:escape, :hex,
|
334
|
+
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
335
|
+
emit(:escape, :hex, copy(data, ts-1, te))
|
331
336
|
fret;
|
332
337
|
};
|
333
338
|
|
@@ -357,8 +362,8 @@
|
|
357
362
|
fcall unicode_property;
|
358
363
|
};
|
359
364
|
|
360
|
-
(any -- non_literal_escape) > (escaped_alpha, 1)
|
361
|
-
emit(:escape, :literal,
|
365
|
+
(any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
|
366
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
362
367
|
fret;
|
363
368
|
};
|
364
369
|
*|;
|
@@ -368,9 +373,9 @@
|
|
368
373
|
# --------------------------------------------------------------------------
|
369
374
|
conditional_expression := |*
|
370
375
|
group_lookup . ')' {
|
371
|
-
text =
|
372
|
-
emit(:conditional, :condition, text
|
373
|
-
emit(:conditional, :condition_close, ')'
|
376
|
+
text = copy(data, ts, te-1)
|
377
|
+
emit(:conditional, :condition, text)
|
378
|
+
emit(:conditional, :condition_close, ')')
|
374
379
|
};
|
375
380
|
|
376
381
|
any {
|
@@ -387,39 +392,39 @@
|
|
387
392
|
# Meta characters
|
388
393
|
# ------------------------------------------------------------------------
|
389
394
|
dot {
|
390
|
-
emit(:meta, :dot,
|
395
|
+
emit(:meta, :dot, copy(data, ts, te))
|
391
396
|
};
|
392
397
|
|
393
398
|
alternation {
|
394
399
|
if conditional_stack.last == group_depth
|
395
|
-
emit(:conditional, :separator,
|
400
|
+
emit(:conditional, :separator, copy(data, ts, te))
|
396
401
|
else
|
397
|
-
emit(:meta, :alternation,
|
402
|
+
emit(:meta, :alternation, copy(data, ts, te))
|
398
403
|
end
|
399
404
|
};
|
400
405
|
|
401
406
|
# Anchors
|
402
407
|
# ------------------------------------------------------------------------
|
403
408
|
beginning_of_line {
|
404
|
-
emit(:anchor, :bol,
|
409
|
+
emit(:anchor, :bol, copy(data, ts, te))
|
405
410
|
};
|
406
411
|
|
407
412
|
end_of_line {
|
408
|
-
emit(:anchor, :eol,
|
413
|
+
emit(:anchor, :eol, copy(data, ts, te))
|
409
414
|
};
|
410
415
|
|
411
416
|
backslash . keep_mark > (backslashed, 4) {
|
412
|
-
emit(:keep, :mark,
|
417
|
+
emit(:keep, :mark, copy(data, ts, te))
|
413
418
|
};
|
414
419
|
|
415
420
|
backslash . anchor_char > (backslashed, 3) {
|
416
|
-
case text =
|
417
|
-
when '\\A'; emit(:anchor, :bos, text
|
418
|
-
when '\\z'; emit(:anchor, :eos, text
|
419
|
-
when '\\Z'; emit(:anchor, :eos_ob_eol, text
|
420
|
-
when '\\b'; emit(:anchor, :word_boundary, text
|
421
|
-
when '\\B'; emit(:anchor, :nonword_boundary, text
|
422
|
-
when '\\G'; emit(:anchor, :match_start, text
|
421
|
+
case text = copy(data, ts, te)
|
422
|
+
when '\\A'; emit(:anchor, :bos, text)
|
423
|
+
when '\\z'; emit(:anchor, :eos, text)
|
424
|
+
when '\\Z'; emit(:anchor, :eos_ob_eol, text)
|
425
|
+
when '\\b'; emit(:anchor, :word_boundary, text)
|
426
|
+
when '\\B'; emit(:anchor, :nonword_boundary, text)
|
427
|
+
when '\\G'; emit(:anchor, :match_start, text)
|
423
428
|
end
|
424
429
|
};
|
425
430
|
|
@@ -430,7 +435,7 @@
|
|
430
435
|
# Character sets
|
431
436
|
# ------------------------------------------------------------------------
|
432
437
|
set_open >set_opened {
|
433
|
-
emit(:set, :open,
|
438
|
+
emit(:set, :open, copy(data, ts, te))
|
434
439
|
fcall character_set;
|
435
440
|
};
|
436
441
|
|
@@ -439,12 +444,12 @@
|
|
439
444
|
# (?(condition)Y|N) conditional expression
|
440
445
|
# ------------------------------------------------------------------------
|
441
446
|
conditional {
|
442
|
-
text =
|
447
|
+
text = copy(data, ts, te)
|
443
448
|
|
444
449
|
conditional_stack << group_depth
|
445
450
|
|
446
|
-
emit(:conditional, :open, text[0..-2]
|
447
|
-
emit(:conditional, :condition_open, '('
|
451
|
+
emit(:conditional, :open, text[0..-2])
|
452
|
+
emit(:conditional, :condition_open, '(')
|
448
453
|
fcall conditional_expression;
|
449
454
|
};
|
450
455
|
|
@@ -455,7 +460,7 @@
|
|
455
460
|
# correct closing count.
|
456
461
|
# ------------------------------------------------------------------------
|
457
462
|
group_open . group_comment $group_closed {
|
458
|
-
emit(:group, :comment,
|
463
|
+
emit(:group, :comment, copy(data, ts, te))
|
459
464
|
};
|
460
465
|
|
461
466
|
# Expression options:
|
@@ -470,11 +475,11 @@
|
|
470
475
|
# (?imxdau-imx:subexp) option on/off for subexp
|
471
476
|
# ------------------------------------------------------------------------
|
472
477
|
group_open . group_options >group_opened {
|
473
|
-
text =
|
478
|
+
text = copy(data, ts, te)
|
474
479
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
475
480
|
raise InvalidGroupOption.new($1 || "-#{$2}", text)
|
476
481
|
end
|
477
|
-
emit_options(text
|
482
|
+
emit_options(text)
|
478
483
|
};
|
479
484
|
|
480
485
|
# Assertions
|
@@ -484,11 +489,11 @@
|
|
484
489
|
# (?<!subexp) negative look-behind
|
485
490
|
# ------------------------------------------------------------------------
|
486
491
|
group_open . assertion_type >group_opened {
|
487
|
-
case text =
|
488
|
-
when '(?='; emit(:assertion, :lookahead, text
|
489
|
-
when '(?!'; emit(:assertion, :nlookahead, text
|
490
|
-
when '(?<='; emit(:assertion, :lookbehind, text
|
491
|
-
when '(?<!'; emit(:assertion, :nlookbehind, text
|
492
|
+
case text = copy(data, ts, te)
|
493
|
+
when '(?='; emit(:assertion, :lookahead, text)
|
494
|
+
when '(?!'; emit(:assertion, :nlookahead, text)
|
495
|
+
when '(?<='; emit(:assertion, :lookbehind, text)
|
496
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text)
|
492
497
|
end
|
493
498
|
};
|
494
499
|
|
@@ -501,32 +506,32 @@
|
|
501
506
|
# (subexp) captured group
|
502
507
|
# ------------------------------------------------------------------------
|
503
508
|
group_open . group_type >group_opened {
|
504
|
-
case text =
|
505
|
-
when '(?:'; emit(:group, :passive, text
|
506
|
-
when '(?>'; emit(:group, :atomic, text
|
507
|
-
when '(?~'; emit(:group, :absence, text
|
509
|
+
case text = copy(data, ts, te)
|
510
|
+
when '(?:'; emit(:group, :passive, text)
|
511
|
+
when '(?>'; emit(:group, :atomic, text)
|
512
|
+
when '(?~'; emit(:group, :absence, text)
|
508
513
|
|
509
514
|
when /^\(\?(?:<>|'')/
|
510
515
|
validation_error(:group, 'named group', 'name is empty')
|
511
516
|
|
512
|
-
when /^\(
|
513
|
-
emit(:group, :named_ab, text
|
517
|
+
when /^\(\?<[^>]+>/
|
518
|
+
emit(:group, :named_ab, text)
|
514
519
|
|
515
|
-
when /^\(\?'
|
516
|
-
emit(:group, :named_sq, text
|
520
|
+
when /^\(\?'[^']+'/
|
521
|
+
emit(:group, :named_sq, text)
|
517
522
|
|
518
523
|
end
|
519
524
|
};
|
520
525
|
|
521
526
|
group_open @group_opened {
|
522
|
-
text =
|
523
|
-
emit(:group, :capture, text
|
527
|
+
text = copy(data, ts, te)
|
528
|
+
emit(:group, :capture, text)
|
524
529
|
};
|
525
530
|
|
526
531
|
group_close @group_closed {
|
527
532
|
if conditional_stack.last == group_depth + 1
|
528
533
|
conditional_stack.pop
|
529
|
-
emit(:conditional, :close,
|
534
|
+
emit(:conditional, :close, copy(data, ts, te))
|
530
535
|
else
|
531
536
|
if spacing_stack.length > 1 &&
|
532
537
|
spacing_stack.last[:depth] == group_depth + 1
|
@@ -534,72 +539,42 @@
|
|
534
539
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
535
540
|
end
|
536
541
|
|
537
|
-
emit(:group, :close,
|
542
|
+
emit(:group, :close, copy(data, ts, te))
|
538
543
|
end
|
539
544
|
};
|
540
545
|
|
541
546
|
|
542
547
|
# Group backreference, named and numbered
|
543
548
|
# ------------------------------------------------------------------------
|
544
|
-
backslash . (
|
545
|
-
case text =
|
546
|
-
when /^\\(
|
547
|
-
validation_error(:backref, '
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
when /^\\(
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
emit(:backref, :name_call_sq, text, ts, te)
|
561
|
-
end
|
562
|
-
|
563
|
-
when /^\\([gk])<\d+>/ # angle-brackets
|
564
|
-
if $1 == 'k'
|
565
|
-
emit(:backref, :number_ref_ab, text, ts, te)
|
566
|
-
else
|
567
|
-
emit(:backref, :number_call_ab, text, ts, te)
|
568
|
-
end
|
569
|
-
|
570
|
-
when /^\\([gk])'\d+'/ # single quotes
|
571
|
-
if $1 == 'k'
|
572
|
-
emit(:backref, :number_ref_sq, text, ts, te)
|
573
|
-
else
|
574
|
-
emit(:backref, :number_call_sq, text, ts, te)
|
575
|
-
end
|
576
|
-
|
577
|
-
when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
|
578
|
-
if $1 == 'k'
|
579
|
-
emit(:backref, :number_rel_ref_ab, text, ts, te)
|
580
|
-
else
|
581
|
-
emit(:backref, :number_rel_call_ab, text, ts, te)
|
582
|
-
end
|
583
|
-
|
584
|
-
when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
|
585
|
-
if $1 == 'k'
|
586
|
-
emit(:backref, :number_rel_ref_sq, text, ts, te)
|
587
|
-
else
|
588
|
-
emit(:backref, :number_rel_call_sq, text, ts, te)
|
589
|
-
end
|
590
|
-
|
591
|
-
when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
|
592
|
-
emit(:backref, :name_recursion_ref_ab, text, ts, te)
|
593
|
-
|
594
|
-
when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
|
595
|
-
emit(:backref, :name_recursion_ref_sq, text, ts, te)
|
596
|
-
|
597
|
-
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
598
|
-
emit(:backref, :number_recursion_ref_ab, text, ts, te)
|
599
|
-
|
600
|
-
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
601
|
-
emit(:backref, :number_recursion_ref_sq, text, ts, te)
|
549
|
+
backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
|
550
|
+
case text = copy(data, ts, te)
|
551
|
+
when /^\\k(<>|'')/
|
552
|
+
validation_error(:backref, 'backreference', 'ref ID is empty')
|
553
|
+
when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
|
554
|
+
emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
|
555
|
+
when /^\\k(.)\d+\D$/
|
556
|
+
emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
|
557
|
+
when /^\\k(.)-\d+\D$/
|
558
|
+
emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
|
559
|
+
when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
|
560
|
+
emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
|
561
|
+
when /^\\k(.)-?\d+[+\-]\d+\D$/
|
562
|
+
emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
|
563
|
+
end
|
564
|
+
};
|
602
565
|
|
566
|
+
# Group call, named and numbered
|
567
|
+
# ------------------------------------------------------------------------
|
568
|
+
backslash . (group_name_call | group_number_call) > (backslashed, 4) {
|
569
|
+
case text = copy(data, ts, te)
|
570
|
+
when /^\\g(<>|'')/
|
571
|
+
validation_error(:backref, 'subexpression call', 'ref ID is empty')
|
572
|
+
when /^\\g(.)[^\p{digit}+\->][^+\-]*/
|
573
|
+
emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
|
574
|
+
when /^\\g(.)\d+\D$/
|
575
|
+
emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
|
576
|
+
when /^\\g(.)[+-]\d+/
|
577
|
+
emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
|
603
578
|
end
|
604
579
|
};
|
605
580
|
|
@@ -607,31 +582,31 @@
|
|
607
582
|
# Quantifiers
|
608
583
|
# ------------------------------------------------------------------------
|
609
584
|
zero_or_one {
|
610
|
-
case text =
|
611
|
-
when '?' ; emit(:quantifier, :zero_or_one, text
|
612
|
-
when '??'; emit(:quantifier, :zero_or_one_reluctant, text
|
613
|
-
when '?+'; emit(:quantifier, :zero_or_one_possessive, text
|
585
|
+
case text = copy(data, ts, te)
|
586
|
+
when '?' ; emit(:quantifier, :zero_or_one, text)
|
587
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
|
588
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
|
614
589
|
end
|
615
590
|
};
|
616
591
|
|
617
592
|
zero_or_more {
|
618
|
-
case text =
|
619
|
-
when '*' ; emit(:quantifier, :zero_or_more, text
|
620
|
-
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text
|
621
|
-
when '*+'; emit(:quantifier, :zero_or_more_possessive, text
|
593
|
+
case text = copy(data, ts, te)
|
594
|
+
when '*' ; emit(:quantifier, :zero_or_more, text)
|
595
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
|
596
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
|
622
597
|
end
|
623
598
|
};
|
624
599
|
|
625
600
|
one_or_more {
|
626
|
-
case text =
|
627
|
-
when '+' ; emit(:quantifier, :one_or_more, text
|
628
|
-
when '+?'; emit(:quantifier, :one_or_more_reluctant, text
|
629
|
-
when '++'; emit(:quantifier, :one_or_more_possessive, text
|
601
|
+
case text = copy(data, ts, te)
|
602
|
+
when '+' ; emit(:quantifier, :one_or_more, text)
|
603
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
|
604
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text)
|
630
605
|
end
|
631
606
|
};
|
632
607
|
|
633
608
|
quantifier_interval {
|
634
|
-
emit(:quantifier, :interval,
|
609
|
+
emit(:quantifier, :interval, copy(data, ts, te))
|
635
610
|
};
|
636
611
|
|
637
612
|
# Catch unmatched curly braces as literals
|
@@ -647,15 +622,17 @@
|
|
647
622
|
|
648
623
|
comment {
|
649
624
|
if free_spacing
|
650
|
-
emit(:free_space, :comment,
|
625
|
+
emit(:free_space, :comment, copy(data, ts, te))
|
651
626
|
else
|
652
|
-
|
627
|
+
# consume only the pound sign (#) and backtrack to do regular scanning
|
628
|
+
append_literal(data, ts, ts + 1)
|
629
|
+
fexec ts + 1;
|
653
630
|
end
|
654
631
|
};
|
655
632
|
|
656
633
|
space+ {
|
657
634
|
if free_spacing
|
658
|
-
emit(:free_space, :whitespace,
|
635
|
+
emit(:free_space, :whitespace, copy(data, ts, te))
|
659
636
|
else
|
660
637
|
append_literal(data, ts, te)
|
661
638
|
end
|
@@ -664,11 +641,7 @@
|
|
664
641
|
# Literal: any run of ASCII (pritable or non-printable), and/or UTF-8,
|
665
642
|
# except meta characters.
|
666
643
|
# ------------------------------------------------------------------------
|
667
|
-
(ascii_print -- space)+
|
668
|
-
ascii_nonprint+ |
|
669
|
-
utf8_2_byte+ |
|
670
|
-
utf8_3_byte+ |
|
671
|
-
utf8_4_byte+ {
|
644
|
+
(ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
|
672
645
|
append_literal(data, ts, te)
|
673
646
|
};
|
674
647
|
|
@@ -678,12 +651,14 @@
|
|
678
651
|
# THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
|
679
652
|
# This file was generated from lib/regexp_parser/scanner/scanner.rl
|
680
653
|
|
654
|
+
require 'regexp_parser/error'
|
655
|
+
|
681
656
|
class Regexp::Scanner
|
682
657
|
# General scanner error (catch all)
|
683
|
-
class ScannerError <
|
658
|
+
class ScannerError < Regexp::Parser::Error; end
|
684
659
|
|
685
660
|
# Base for all scanner validation errors
|
686
|
-
class ValidationError <
|
661
|
+
class ValidationError < Regexp::Parser::Error
|
687
662
|
def initialize(reason)
|
688
663
|
super reason
|
689
664
|
end
|
@@ -737,21 +712,16 @@ class Regexp::Scanner
|
|
737
712
|
#
|
738
713
|
# This method may raise errors if a syntax error is encountered.
|
739
714
|
# --------------------------------------------------------------------------
|
740
|
-
def self.scan(input_object, &block)
|
741
|
-
new.scan(input_object, &block)
|
715
|
+
def self.scan(input_object, options: nil, &block)
|
716
|
+
new.scan(input_object, options: options, &block)
|
742
717
|
end
|
743
718
|
|
744
|
-
def scan(input_object, &block)
|
719
|
+
def scan(input_object, options: nil, &block)
|
745
720
|
self.literal = nil
|
746
721
|
stack = []
|
747
722
|
|
748
|
-
|
749
|
-
|
750
|
-
self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
|
751
|
-
else
|
752
|
-
input = input_object
|
753
|
-
self.free_spacing = false
|
754
|
-
end
|
723
|
+
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
724
|
+
self.free_spacing = free_spacing?(input_object, options)
|
755
725
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
756
726
|
|
757
727
|
data = input.unpack("c*") if input.is_a?(String)
|
@@ -763,6 +733,7 @@ class Regexp::Scanner
|
|
763
733
|
self.set_depth = 0
|
764
734
|
self.group_depth = 0
|
765
735
|
self.conditional_stack = []
|
736
|
+
self.char_pos = 0
|
766
737
|
|
767
738
|
%% write data;
|
768
739
|
%% write init;
|
@@ -772,7 +743,7 @@ class Regexp::Scanner
|
|
772
743
|
testEof = testEof
|
773
744
|
|
774
745
|
if cs == re_scanner_error
|
775
|
-
text =
|
746
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
776
747
|
raise ScannerError.new("Scan error at '#{text}'")
|
777
748
|
end
|
778
749
|
|
@@ -788,34 +759,54 @@ class Regexp::Scanner
|
|
788
759
|
end
|
789
760
|
|
790
761
|
# lazy-load property maps when first needed
|
791
|
-
require 'yaml'
|
792
|
-
PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
|
793
|
-
|
794
762
|
def self.short_prop_map
|
795
|
-
@short_prop_map ||=
|
763
|
+
@short_prop_map ||= parse_prop_map('short')
|
796
764
|
end
|
797
765
|
|
798
766
|
def self.long_prop_map
|
799
|
-
@long_prop_map ||=
|
767
|
+
@long_prop_map ||= parse_prop_map('long')
|
768
|
+
end
|
769
|
+
|
770
|
+
def self.parse_prop_map(name)
|
771
|
+
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
800
772
|
end
|
801
773
|
|
802
774
|
# Emits an array with the details of the scanned pattern
|
803
|
-
def emit(type, token, text
|
775
|
+
def emit(type, token, text)
|
804
776
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
805
777
|
|
806
778
|
emit_literal if literal
|
807
779
|
|
780
|
+
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
781
|
+
# end-users, so we keep track of char-based indices and emit those instead.
|
782
|
+
ts_char_pos = char_pos
|
783
|
+
te_char_pos = char_pos + text.length
|
784
|
+
|
808
785
|
if block
|
809
|
-
block.call type, token, text,
|
786
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
810
787
|
end
|
811
788
|
|
812
|
-
tokens << [type, token, text,
|
789
|
+
tokens << [type, token, text, ts_char_pos, te_char_pos]
|
790
|
+
|
791
|
+
self.char_pos = te_char_pos
|
813
792
|
end
|
814
793
|
|
815
794
|
private
|
816
795
|
|
817
796
|
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
|
818
|
-
:group_depth, :set_depth, :conditional_stack
|
797
|
+
:group_depth, :set_depth, :conditional_stack, :char_pos
|
798
|
+
|
799
|
+
def free_spacing?(input_object, options)
|
800
|
+
if options && !input_object.is_a?(String)
|
801
|
+
raise ArgumentError, 'options cannot be supplied unless scanning a String'
|
802
|
+
end
|
803
|
+
|
804
|
+
options = input_object.options if input_object.is_a?(::Regexp)
|
805
|
+
|
806
|
+
return false unless options
|
807
|
+
|
808
|
+
options & Regexp::EXTENDED != 0
|
809
|
+
end
|
819
810
|
|
820
811
|
def in_group?
|
821
812
|
group_depth > 0
|
@@ -826,36 +817,25 @@ class Regexp::Scanner
|
|
826
817
|
end
|
827
818
|
|
828
819
|
# Copy from ts to te from data as text
|
829
|
-
def copy(data,
|
830
|
-
data[
|
831
|
-
end
|
832
|
-
|
833
|
-
# Copy from ts to te from data as text, returning an array with the text
|
834
|
-
# and the offsets used to copy it.
|
835
|
-
def text(data, ts, te, soff = 0)
|
836
|
-
[copy(data, ts-soff..te-1), ts-soff, te]
|
820
|
+
def copy(data, ts, te)
|
821
|
+
data[ts...te].pack('c*').force_encoding('utf-8')
|
837
822
|
end
|
838
823
|
|
839
824
|
# Appends one or more characters to the literal buffer, to be emitted later
|
840
|
-
# by a call to emit_literal.
|
825
|
+
# by a call to emit_literal.
|
841
826
|
def append_literal(data, ts, te)
|
842
827
|
self.literal = literal || []
|
843
|
-
literal <<
|
828
|
+
literal << copy(data, ts, te)
|
844
829
|
end
|
845
830
|
|
846
|
-
# Emits the literal run collected by calls to the append_literal method
|
847
|
-
# using the total start (ts) and end (te) offsets of the run.
|
831
|
+
# Emits the literal run collected by calls to the append_literal method.
|
848
832
|
def emit_literal
|
849
|
-
|
850
|
-
text = literal.map {|t| t[0]}.join
|
851
|
-
|
852
|
-
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
853
|
-
|
833
|
+
text = literal.join
|
854
834
|
self.literal = nil
|
855
|
-
emit(:literal, :literal, text
|
835
|
+
emit(:literal, :literal, text)
|
856
836
|
end
|
857
837
|
|
858
|
-
def emit_options(text
|
838
|
+
def emit_options(text)
|
859
839
|
token = nil
|
860
840
|
|
861
841
|
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
@@ -881,14 +861,14 @@ class Regexp::Scanner
|
|
881
861
|
token = :options_switch
|
882
862
|
end
|
883
863
|
|
884
|
-
emit(:group, token, text
|
864
|
+
emit(:group, token, text)
|
885
865
|
end
|
886
866
|
|
887
867
|
def emit_meta_control_sequence(data, ts, te, token)
|
888
868
|
if data.last < 0x00 || data.last > 0x7F
|
889
869
|
validation_error(:sequence, 'escape', token.to_s)
|
890
870
|
end
|
891
|
-
emit(:escape, token,
|
871
|
+
emit(:escape, token, copy(data, ts-1, te))
|
892
872
|
end
|
893
873
|
|
894
874
|
# Centralizes and unifies the handling of validation related
|