regexp_parser 1.7.0 → 2.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +8 -2
- data/LICENSE +1 -1
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/base.rb +76 -0
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
- data/lib/regexp_parser/expression/classes/group.rb +28 -15
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -19
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
- data/lib/regexp_parser/expression/methods/construct.rb +41 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +47 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
- data/lib/regexp_parser/expression/quantifier.rb +57 -17
- data/lib/regexp_parser/expression/sequence.rb +11 -47
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +111 -0
- data/lib/regexp_parser/expression/subexpression.rb +27 -19
- data/lib/regexp_parser/expression.rb +14 -141
- data/lib/regexp_parser/lexer.rb +83 -41
- data/lib/regexp_parser/parser.rb +371 -429
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +633 -0
- data/lib/regexp_parser/scanner/properties/short.csv +248 -0
- data/lib/regexp_parser/scanner/property.rl +4 -4
- data/lib/regexp_parser/scanner/scanner.rl +303 -368
- data/lib/regexp_parser/scanner.rb +1423 -1674
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +92 -67
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +33 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/token/meta.rb +20 -0
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +19 -23
- metadata +52 -171
- data/CHANGELOG.md +0 -349
- data/README.md +0 -470
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -94
- data/spec/expression/clone_spec.rb +0 -120
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -100
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -89
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -55
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -51
- data/spec/parser/refcalls_spec.rb +0 -112
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/errors_spec.rb +0 -68
- data/spec/scanner/escapes_spec.rb +0 -53
- data/spec/scanner/free_space_spec.rb +0 -133
- data/spec/scanner/groups_spec.rb +0 -52
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -49
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -20
- data/spec/scanner/refcalls_spec.rb +0 -36
- data/spec/scanner/sets_spec.rb +0 -102
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -15
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
- /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
@@ -3,6 +3,11 @@
|
|
3
3
|
include re_char_type "char_type.rl";
|
4
4
|
include re_property "property.rl";
|
5
5
|
|
6
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
7
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
8
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
9
|
+
utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
10
|
+
|
6
11
|
dot = '.';
|
7
12
|
backslash = '\\';
|
8
13
|
alternation = '|';
|
@@ -15,26 +20,15 @@
|
|
15
20
|
|
16
21
|
group_open = '(';
|
17
22
|
group_close = ')';
|
18
|
-
|
23
|
+
parentheses = group_open | group_close;
|
19
24
|
|
20
25
|
set_open = '[';
|
21
26
|
set_close = ']';
|
22
27
|
brackets = set_open | set_close;
|
23
28
|
|
24
|
-
comment = ('#' . [^\n]* . '\n');
|
25
|
-
|
26
|
-
class_name_posix = 'alnum' | 'alpha' | 'blank' |
|
27
|
-
'cntrl' | 'digit' | 'graph' |
|
28
|
-
'lower' | 'print' | 'punct' |
|
29
|
-
'space' | 'upper' | 'xdigit' |
|
30
|
-
'word' | 'ascii';
|
31
|
-
|
32
|
-
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
33
|
-
|
29
|
+
comment = ('#' . [^\n]* . '\n'?);
|
34
30
|
|
35
|
-
|
36
|
-
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
37
|
-
character_equivalent = '[=' . alpha . '=]';
|
31
|
+
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
38
32
|
|
39
33
|
line_anchor = beginning_of_line | end_of_line;
|
40
34
|
anchor_char = [AbBzZG];
|
@@ -53,21 +47,20 @@
|
|
53
47
|
|
54
48
|
meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
|
55
49
|
|
50
|
+
sequence_char = [CMcux];
|
51
|
+
|
56
52
|
zero_or_one = '?' | '??' | '?+';
|
57
53
|
zero_or_more = '*' | '*?' | '*+';
|
58
54
|
one_or_more = '+' | '+?' | '++';
|
59
55
|
|
60
56
|
quantifier_greedy = '?' | '*' | '+';
|
61
|
-
quantifier_reluctant = '??' | '*?' | '+?';
|
62
|
-
quantifier_possessive = '?+' | '*+' | '++';
|
63
|
-
quantifier_mode = '?' | '+';
|
64
|
-
|
65
|
-
quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
|
66
|
-
range_close . quantifier_mode?;
|
67
|
-
|
68
|
-
quantifiers = quantifier_greedy | quantifier_reluctant |
|
69
|
-
quantifier_possessive | quantifier_interval;
|
70
57
|
|
58
|
+
quantity_exact = (digit+);
|
59
|
+
quantity_minimum = (digit+) . ',';
|
60
|
+
quantity_maximum = ',' . (digit+);
|
61
|
+
quantity_range = (digit+) . ',' . (digit+);
|
62
|
+
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
63
|
+
quantity_maximum | quantity_range ) . range_close;
|
71
64
|
|
72
65
|
conditional = '(?(';
|
73
66
|
|
@@ -85,22 +78,22 @@
|
|
85
78
|
# try to treat every other group head as options group, like Ruby
|
86
79
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
87
80
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
group_number = '-'? . [1-9] . ([0-9]+)?;
|
81
|
+
group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
82
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
83
|
+
group_number = '-'? . [0-9]+;
|
92
84
|
group_level = [+\-] . [0-9]+;
|
93
85
|
|
94
|
-
group_name = ('<' .
|
86
|
+
group_name = ('<' . group_name_id_ab? . '>') |
|
87
|
+
("'" . group_name_id_sq? . "'");
|
95
88
|
group_lookup = group_name | group_number;
|
96
89
|
|
97
90
|
group_named = ('?' . group_name );
|
98
91
|
|
99
|
-
|
100
|
-
|
92
|
+
group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
|
93
|
+
("'" . (group_name_id_sq? | group_number) . group_level? "'"));
|
101
94
|
|
102
|
-
|
103
|
-
|
95
|
+
group_ref = 'k' . group_ref_body;
|
96
|
+
group_call = 'g' . group_ref_body;
|
104
97
|
|
105
98
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
106
99
|
|
@@ -111,32 +104,33 @@
|
|
111
104
|
|
112
105
|
# characters that 'break' a literal
|
113
106
|
meta_char = dot | backslash | alternation |
|
114
|
-
curlies |
|
107
|
+
curlies | parentheses | brackets |
|
115
108
|
line_anchor | quantifier_greedy;
|
116
109
|
|
117
|
-
|
118
|
-
ascii_nonprint = (0x01..0x1f | 0x7f);
|
110
|
+
literal_delimiters = ']' | '}';
|
119
111
|
|
120
|
-
|
121
|
-
|
122
|
-
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
112
|
+
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
113
|
+
ascii_nonprint = (0x01..0x1f | 0x7f);
|
123
114
|
|
124
115
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
125
|
-
|
116
|
+
keep_mark | sequence_char;
|
117
|
+
|
118
|
+
# escapes that also work within a character set
|
119
|
+
set_escape = backslash | brackets | escaped_ascii |
|
120
|
+
octal_sequence | property_char |
|
121
|
+
sequence_char | single_codepoint_char_type;
|
126
122
|
|
127
|
-
non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
|
128
|
-
multi_codepoint_char_type | [0-9cCM];
|
129
123
|
|
130
124
|
# EOF error, used where it can be detected
|
131
125
|
action premature_end_error {
|
132
|
-
text =
|
133
|
-
raise PrematureEndError.new(
|
126
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
127
|
+
raise PrematureEndError.new(text)
|
134
128
|
}
|
135
129
|
|
136
130
|
# Invalid sequence error, used from sequences, like escapes and sets
|
137
131
|
action invalid_sequence_error {
|
138
|
-
text =
|
139
|
-
|
132
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
133
|
+
raise ValidationError.for(:sequence, 'sequence', text)
|
140
134
|
}
|
141
135
|
|
142
136
|
# group (nesting) and set open/close actions
|
@@ -150,7 +144,7 @@
|
|
150
144
|
# --------------------------------------------------------------------------
|
151
145
|
character_set := |*
|
152
146
|
set_close > (set_meta, 2) @set_closed {
|
153
|
-
emit(:set, :close,
|
147
|
+
emit(:set, :close, copy(data, ts, te))
|
154
148
|
if in_set?
|
155
149
|
fret;
|
156
150
|
else
|
@@ -159,8 +153,8 @@
|
|
159
153
|
};
|
160
154
|
|
161
155
|
'-]' @set_closed { # special case, emits two tokens
|
162
|
-
emit(:literal, :literal,
|
163
|
-
emit(:set, :close,
|
156
|
+
emit(:literal, :literal, '-')
|
157
|
+
emit(:set, :close, ']')
|
164
158
|
if in_set?
|
165
159
|
fret;
|
166
160
|
else
|
@@ -169,33 +163,32 @@
|
|
169
163
|
};
|
170
164
|
|
171
165
|
'-&&' { # special case, emits two tokens
|
172
|
-
emit(:literal, :literal, '-'
|
173
|
-
emit(:set, :intersection, '&&'
|
166
|
+
emit(:literal, :literal, '-')
|
167
|
+
emit(:set, :intersection, '&&')
|
174
168
|
};
|
175
169
|
|
176
170
|
'^' {
|
177
|
-
|
178
|
-
|
179
|
-
emit(:set, :negate, text, ts, te)
|
171
|
+
if prev_token[1] == :open
|
172
|
+
emit(:set, :negate, '^')
|
180
173
|
else
|
181
|
-
emit(:literal, :literal,
|
174
|
+
emit(:literal, :literal, '^')
|
182
175
|
end
|
183
176
|
};
|
184
177
|
|
185
178
|
'-' {
|
186
|
-
|
187
|
-
#
|
188
|
-
if
|
189
|
-
emit(:literal, :literal,
|
179
|
+
# ranges cant start with the opening bracket, a subset, or
|
180
|
+
# intersection/negation/range operators
|
181
|
+
if prev_token[0] == :set
|
182
|
+
emit(:literal, :literal, '-')
|
190
183
|
else
|
191
|
-
emit(:set, :range,
|
184
|
+
emit(:set, :range, '-')
|
192
185
|
end
|
193
186
|
};
|
194
187
|
|
195
188
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
196
189
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
197
190
|
'&&' {
|
198
|
-
emit(:set, :intersection,
|
191
|
+
emit(:set, :intersection, '&&')
|
199
192
|
};
|
200
193
|
|
201
194
|
backslash {
|
@@ -203,59 +196,60 @@
|
|
203
196
|
};
|
204
197
|
|
205
198
|
set_open >(open_bracket, 1) >set_opened {
|
206
|
-
emit(:set, :open,
|
199
|
+
emit(:set, :open, '[')
|
207
200
|
fcall character_set;
|
208
201
|
};
|
209
202
|
|
210
|
-
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
211
|
-
text =
|
203
|
+
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
204
|
+
text = copy(data, ts, te)
|
212
205
|
|
213
206
|
type = :posixclass
|
214
207
|
class_name = text[2..-3]
|
215
|
-
if class_name[0]
|
208
|
+
if class_name[0] == '^'
|
216
209
|
class_name = class_name[1..-1]
|
217
210
|
type = :nonposixclass
|
218
211
|
end
|
219
212
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
224
|
-
emit(:set, :collation, *text(data, ts, te))
|
225
|
-
};
|
213
|
+
unless self.class.posix_classes.include?(class_name)
|
214
|
+
raise ValidationError.for(:posix_class, text)
|
215
|
+
end
|
226
216
|
|
227
|
-
|
228
|
-
emit(:set, :equivalent, *text(data, ts, te))
|
217
|
+
emit(type, class_name.to_sym, text)
|
229
218
|
};
|
230
219
|
|
231
220
|
meta_char > (set_meta, 1) {
|
232
|
-
emit(:literal, :literal,
|
221
|
+
emit(:literal, :literal, copy(data, ts, te))
|
233
222
|
};
|
234
223
|
|
235
|
-
any
|
236
|
-
|
237
|
-
|
238
|
-
utf8_3_byte |
|
239
|
-
utf8_4_byte {
|
240
|
-
char, *rest = *text(data, ts, te)
|
241
|
-
char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
|
242
|
-
emit(:literal, :literal, char, *rest)
|
224
|
+
any | ascii_nonprint | utf8_multibyte {
|
225
|
+
text = copy(data, ts, te)
|
226
|
+
emit(:literal, :literal, text)
|
243
227
|
};
|
244
228
|
*|;
|
245
229
|
|
246
230
|
# set escapes scanner
|
247
231
|
# --------------------------------------------------------------------------
|
248
232
|
set_escape_sequence := |*
|
249
|
-
|
250
|
-
|
233
|
+
# Special case: in sets, octal sequences have higher priority than backrefs
|
234
|
+
octal_sequence {
|
235
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
251
236
|
fret;
|
252
237
|
};
|
253
238
|
|
254
|
-
|
239
|
+
# Scan all other escapes that work in sets with the generic escape scanner
|
240
|
+
set_escape > (escaped_set_alpha, 2) {
|
255
241
|
fhold;
|
256
242
|
fnext character_set;
|
257
243
|
fcall escape_sequence;
|
258
244
|
};
|
245
|
+
|
246
|
+
# Treat all remaining escapes - those not supported in sets - as literal.
|
247
|
+
# (This currently includes \^, \-, \&, \:, although these could potentially
|
248
|
+
# be meta chars when not escaped, depending on their position in the set.)
|
249
|
+
any > (escaped_set_alpha, 1) {
|
250
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
251
|
+
fret;
|
252
|
+
};
|
259
253
|
*|;
|
260
254
|
|
261
255
|
|
@@ -263,33 +257,40 @@
|
|
263
257
|
# --------------------------------------------------------------------------
|
264
258
|
escape_sequence := |*
|
265
259
|
[1-9] {
|
266
|
-
text =
|
267
|
-
emit(:backref, :number, text
|
260
|
+
text = copy(data, ts-1, te)
|
261
|
+
emit(:backref, :number, text)
|
268
262
|
fret;
|
269
263
|
};
|
270
264
|
|
271
265
|
octal_sequence {
|
272
|
-
emit(:escape, :octal,
|
266
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
267
|
+
fret;
|
268
|
+
};
|
269
|
+
|
270
|
+
[8-9] . [0-9] { # special case, emits two tokens
|
271
|
+
text = copy(data, ts-1, te)
|
272
|
+
emit(:escape, :literal, text[0, 2])
|
273
|
+
emit(:literal, :literal, text[2])
|
273
274
|
fret;
|
274
275
|
};
|
275
276
|
|
276
277
|
meta_char {
|
277
|
-
case text =
|
278
|
-
when '\.'; emit(:escape, :dot, text
|
279
|
-
when '\|'; emit(:escape, :alternation, text
|
280
|
-
when '\^'; emit(:escape, :bol, text
|
281
|
-
when '\$'; emit(:escape, :eol, text
|
282
|
-
when '\?'; emit(:escape, :zero_or_one, text
|
283
|
-
when '\*'; emit(:escape, :zero_or_more, text
|
284
|
-
when '\+'; emit(:escape, :one_or_more, text
|
285
|
-
when '\('; emit(:escape, :group_open, text
|
286
|
-
when '\)'; emit(:escape, :group_close, text
|
287
|
-
when '\{'; emit(:escape, :interval_open, text
|
288
|
-
when '\}'; emit(:escape, :interval_close, text
|
289
|
-
when '\['; emit(:escape, :set_open, text
|
290
|
-
when '\]'; emit(:escape, :set_close, text
|
278
|
+
case text = copy(data, ts-1, te)
|
279
|
+
when '\.'; emit(:escape, :dot, text)
|
280
|
+
when '\|'; emit(:escape, :alternation, text)
|
281
|
+
when '\^'; emit(:escape, :bol, text)
|
282
|
+
when '\$'; emit(:escape, :eol, text)
|
283
|
+
when '\?'; emit(:escape, :zero_or_one, text)
|
284
|
+
when '\*'; emit(:escape, :zero_or_more, text)
|
285
|
+
when '\+'; emit(:escape, :one_or_more, text)
|
286
|
+
when '\('; emit(:escape, :group_open, text)
|
287
|
+
when '\)'; emit(:escape, :group_close, text)
|
288
|
+
when '\{'; emit(:escape, :interval_open, text)
|
289
|
+
when '\}'; emit(:escape, :interval_close, text)
|
290
|
+
when '\['; emit(:escape, :set_open, text)
|
291
|
+
when '\]'; emit(:escape, :set_close, text)
|
291
292
|
when "\\\\";
|
292
|
-
emit(:escape, :backslash, text
|
293
|
+
emit(:escape, :backslash, text)
|
293
294
|
end
|
294
295
|
fret;
|
295
296
|
};
|
@@ -297,31 +298,31 @@
|
|
297
298
|
escaped_ascii > (escaped_alpha, 7) {
|
298
299
|
# \b is emitted as backspace only when inside a character set, otherwise
|
299
300
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
300
|
-
case text =
|
301
|
-
when '\a'; emit(:escape, :bell, text
|
302
|
-
when '\b'; emit(:escape, :backspace, text
|
303
|
-
when '\e'; emit(:escape, :escape, text
|
304
|
-
when '\f'; emit(:escape, :form_feed, text
|
305
|
-
when '\n'; emit(:escape, :newline, text
|
306
|
-
when '\r'; emit(:escape, :carriage, text
|
307
|
-
when '\t'; emit(:escape, :tab, text
|
308
|
-
when '\v'; emit(:escape, :vertical_tab, text
|
301
|
+
case text = copy(data, ts-1, te)
|
302
|
+
when '\a'; emit(:escape, :bell, text)
|
303
|
+
when '\b'; emit(:escape, :backspace, text)
|
304
|
+
when '\e'; emit(:escape, :escape, text)
|
305
|
+
when '\f'; emit(:escape, :form_feed, text)
|
306
|
+
when '\n'; emit(:escape, :newline, text)
|
307
|
+
when '\r'; emit(:escape, :carriage, text)
|
308
|
+
when '\t'; emit(:escape, :tab, text)
|
309
|
+
when '\v'; emit(:escape, :vertical_tab, text)
|
309
310
|
end
|
310
311
|
fret;
|
311
312
|
};
|
312
313
|
|
313
314
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
314
|
-
text =
|
315
|
-
if text[2]
|
316
|
-
emit(:escape, :codepoint_list, text
|
315
|
+
text = copy(data, ts-1, te)
|
316
|
+
if text[2] == '{'
|
317
|
+
emit(:escape, :codepoint_list, text)
|
317
318
|
else
|
318
|
-
emit(:escape, :codepoint, text
|
319
|
+
emit(:escape, :codepoint, text)
|
319
320
|
end
|
320
321
|
fret;
|
321
322
|
};
|
322
323
|
|
323
|
-
hex_sequence > (escaped_alpha, 5)
|
324
|
-
emit(:escape, :hex,
|
324
|
+
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
325
|
+
emit(:escape, :hex, copy(data, ts-1, te))
|
325
326
|
fret;
|
326
327
|
};
|
327
328
|
|
@@ -351,8 +352,8 @@
|
|
351
352
|
fcall unicode_property;
|
352
353
|
};
|
353
354
|
|
354
|
-
(any -- non_literal_escape) > (escaped_alpha, 1)
|
355
|
-
emit(:escape, :literal,
|
355
|
+
(any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
|
356
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
356
357
|
fret;
|
357
358
|
};
|
358
359
|
*|;
|
@@ -362,9 +363,10 @@
|
|
362
363
|
# --------------------------------------------------------------------------
|
363
364
|
conditional_expression := |*
|
364
365
|
group_lookup . ')' {
|
365
|
-
text =
|
366
|
-
|
367
|
-
emit(:conditional, :
|
366
|
+
text = copy(data, ts, te-1)
|
367
|
+
text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
|
368
|
+
emit(:conditional, :condition, text)
|
369
|
+
emit(:conditional, :condition_close, ')')
|
368
370
|
};
|
369
371
|
|
370
372
|
any {
|
@@ -381,46 +383,50 @@
|
|
381
383
|
# Meta characters
|
382
384
|
# ------------------------------------------------------------------------
|
383
385
|
dot {
|
384
|
-
emit(:meta, :dot,
|
386
|
+
emit(:meta, :dot, copy(data, ts, te))
|
385
387
|
};
|
386
388
|
|
387
389
|
alternation {
|
388
390
|
if conditional_stack.last == group_depth
|
389
|
-
emit(:conditional, :separator,
|
391
|
+
emit(:conditional, :separator, copy(data, ts, te))
|
390
392
|
else
|
391
|
-
emit(:meta, :alternation,
|
393
|
+
emit(:meta, :alternation, copy(data, ts, te))
|
392
394
|
end
|
393
395
|
};
|
394
396
|
|
395
397
|
# Anchors
|
396
398
|
# ------------------------------------------------------------------------
|
397
399
|
beginning_of_line {
|
398
|
-
emit(:anchor, :bol,
|
400
|
+
emit(:anchor, :bol, copy(data, ts, te))
|
399
401
|
};
|
400
402
|
|
401
403
|
end_of_line {
|
402
|
-
emit(:anchor, :eol,
|
404
|
+
emit(:anchor, :eol, copy(data, ts, te))
|
403
405
|
};
|
404
406
|
|
405
407
|
backslash . keep_mark > (backslashed, 4) {
|
406
|
-
emit(:keep, :mark,
|
408
|
+
emit(:keep, :mark, copy(data, ts, te))
|
407
409
|
};
|
408
410
|
|
409
411
|
backslash . anchor_char > (backslashed, 3) {
|
410
|
-
case text =
|
411
|
-
when '
|
412
|
-
when '
|
413
|
-
when '
|
414
|
-
when '
|
415
|
-
when '
|
416
|
-
when '
|
412
|
+
case text = copy(data, ts, te)
|
413
|
+
when '\A'; emit(:anchor, :bos, text)
|
414
|
+
when '\z'; emit(:anchor, :eos, text)
|
415
|
+
when '\Z'; emit(:anchor, :eos_ob_eol, text)
|
416
|
+
when '\b'; emit(:anchor, :word_boundary, text)
|
417
|
+
when '\B'; emit(:anchor, :nonword_boundary, text)
|
418
|
+
when '\G'; emit(:anchor, :match_start, text)
|
417
419
|
end
|
418
420
|
};
|
419
421
|
|
422
|
+
literal_delimiters {
|
423
|
+
append_literal(data, ts, te)
|
424
|
+
};
|
425
|
+
|
420
426
|
# Character sets
|
421
427
|
# ------------------------------------------------------------------------
|
422
428
|
set_open >set_opened {
|
423
|
-
emit(:set, :open,
|
429
|
+
emit(:set, :open, copy(data, ts, te))
|
424
430
|
fcall character_set;
|
425
431
|
};
|
426
432
|
|
@@ -429,23 +435,22 @@
|
|
429
435
|
# (?(condition)Y|N) conditional expression
|
430
436
|
# ------------------------------------------------------------------------
|
431
437
|
conditional {
|
432
|
-
text =
|
438
|
+
text = copy(data, ts, te)
|
433
439
|
|
434
440
|
conditional_stack << group_depth
|
435
441
|
|
436
|
-
emit(:conditional, :open, text[0..-2]
|
437
|
-
emit(:conditional, :condition_open, '('
|
442
|
+
emit(:conditional, :open, text[0..-2])
|
443
|
+
emit(:conditional, :condition_open, '(')
|
438
444
|
fcall conditional_expression;
|
439
445
|
};
|
440
446
|
|
441
447
|
|
442
448
|
# (?#...) comments: parsed as a single expression, without introducing a
|
443
449
|
# new nesting level. Comments may not include parentheses, escaped or not.
|
444
|
-
# special case for close
|
445
|
-
# correct closing count.
|
450
|
+
# special case for close to get the correct closing count.
|
446
451
|
# ------------------------------------------------------------------------
|
447
|
-
group_open . group_comment
|
448
|
-
emit(:group, :comment,
|
452
|
+
(group_open . group_comment) @group_closed {
|
453
|
+
emit(:group, :comment, copy(data, ts, te))
|
449
454
|
};
|
450
455
|
|
451
456
|
# Expression options:
|
@@ -459,12 +464,12 @@
|
|
459
464
|
#
|
460
465
|
# (?imxdau-imx:subexp) option on/off for subexp
|
461
466
|
# ------------------------------------------------------------------------
|
462
|
-
group_open . group_options >group_opened {
|
463
|
-
text =
|
467
|
+
(group_open . group_options) >group_opened {
|
468
|
+
text = copy(data, ts, te)
|
464
469
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
465
|
-
raise
|
470
|
+
raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
|
466
471
|
end
|
467
|
-
emit_options(text
|
472
|
+
emit_options(text)
|
468
473
|
};
|
469
474
|
|
470
475
|
# Assertions
|
@@ -473,12 +478,12 @@
|
|
473
478
|
# (?<=subexp) look-behind
|
474
479
|
# (?<!subexp) negative look-behind
|
475
480
|
# ------------------------------------------------------------------------
|
476
|
-
group_open . assertion_type >group_opened {
|
477
|
-
case text =
|
478
|
-
when '(?='; emit(:assertion, :lookahead, text
|
479
|
-
when '(?!'; emit(:assertion, :nlookahead, text
|
480
|
-
when '(?<='; emit(:assertion, :lookbehind, text
|
481
|
-
when '(?<!'; emit(:assertion, :nlookbehind, text
|
481
|
+
(group_open . assertion_type) >group_opened {
|
482
|
+
case text = copy(data, ts, te)
|
483
|
+
when '(?='; emit(:assertion, :lookahead, text)
|
484
|
+
when '(?!'; emit(:assertion, :nlookahead, text)
|
485
|
+
when '(?<='; emit(:assertion, :lookbehind, text)
|
486
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text)
|
482
487
|
end
|
483
488
|
};
|
484
489
|
|
@@ -490,106 +495,78 @@
|
|
490
495
|
# (?'name'subexp) named group (single quoted version)
|
491
496
|
# (subexp) captured group
|
492
497
|
# ------------------------------------------------------------------------
|
493
|
-
group_open . group_type >group_opened {
|
494
|
-
case text =
|
495
|
-
when '(?:'; emit(:group, :passive, text
|
496
|
-
when '(?>'; emit(:group, :atomic, text
|
497
|
-
when '(?~'; emit(:group, :absence, text
|
498
|
+
(group_open . group_type) >group_opened {
|
499
|
+
case text = copy(data, ts, te)
|
500
|
+
when '(?:'; emit(:group, :passive, text)
|
501
|
+
when '(?>'; emit(:group, :atomic, text)
|
502
|
+
when '(?~'; emit(:group, :absence, text)
|
498
503
|
|
499
504
|
when /^\(\?(?:<>|'')/
|
500
|
-
|
505
|
+
raise ValidationError.for(:group, 'named group', 'name is empty')
|
501
506
|
|
502
|
-
when /^\(
|
503
|
-
emit(:group, :named_ab, text
|
507
|
+
when /^\(\?<[^>]+>/
|
508
|
+
emit(:group, :named_ab, text)
|
504
509
|
|
505
|
-
when /^\(\?'
|
506
|
-
emit(:group, :named_sq, text
|
510
|
+
when /^\(\?'[^']+'/
|
511
|
+
emit(:group, :named_sq, text)
|
507
512
|
|
508
513
|
end
|
509
514
|
};
|
510
515
|
|
511
516
|
group_open @group_opened {
|
512
|
-
text =
|
513
|
-
emit(:group, :capture, text
|
517
|
+
text = copy(data, ts, te)
|
518
|
+
emit(:group, :capture, text)
|
514
519
|
};
|
515
520
|
|
516
521
|
group_close @group_closed {
|
517
522
|
if conditional_stack.last == group_depth + 1
|
518
523
|
conditional_stack.pop
|
519
|
-
emit(:conditional, :close,
|
520
|
-
|
524
|
+
emit(:conditional, :close, ')')
|
525
|
+
elsif group_depth >= 0
|
521
526
|
if spacing_stack.length > 1 &&
|
522
527
|
spacing_stack.last[:depth] == group_depth + 1
|
523
528
|
spacing_stack.pop
|
524
529
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
525
530
|
end
|
526
531
|
|
527
|
-
emit(:group, :close,
|
532
|
+
emit(:group, :close, ')')
|
533
|
+
else
|
534
|
+
raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
|
528
535
|
end
|
529
536
|
};
|
530
537
|
|
531
538
|
|
532
539
|
# Group backreference, named and numbered
|
533
540
|
# ------------------------------------------------------------------------
|
534
|
-
backslash . (
|
535
|
-
case text =
|
536
|
-
when /^\\([
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
emit(:backref, :name_call_sq, text, ts, te)
|
551
|
-
end
|
552
|
-
|
553
|
-
when /^\\([gk])<\d+>/ # angle-brackets
|
554
|
-
if $1 == 'k'
|
555
|
-
emit(:backref, :number_ref_ab, text, ts, te)
|
556
|
-
else
|
557
|
-
emit(:backref, :number_call_ab, text, ts, te)
|
558
|
-
end
|
559
|
-
|
560
|
-
when /^\\([gk])'\d+'/ # single quotes
|
561
|
-
if $1 == 'k'
|
562
|
-
emit(:backref, :number_ref_sq, text, ts, te)
|
563
|
-
else
|
564
|
-
emit(:backref, :number_call_sq, text, ts, te)
|
565
|
-
end
|
566
|
-
|
567
|
-
when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
|
568
|
-
if $1 == 'k'
|
569
|
-
emit(:backref, :number_rel_ref_ab, text, ts, te)
|
570
|
-
else
|
571
|
-
emit(:backref, :number_rel_call_ab, text, ts, te)
|
572
|
-
end
|
573
|
-
|
574
|
-
when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
|
575
|
-
if $1 == 'k'
|
576
|
-
emit(:backref, :number_rel_ref_sq, text, ts, te)
|
577
|
-
else
|
578
|
-
emit(:backref, :number_rel_call_sq, text, ts, te)
|
579
|
-
end
|
580
|
-
|
581
|
-
when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
|
582
|
-
emit(:backref, :name_recursion_ref_ab, text, ts, te)
|
583
|
-
|
584
|
-
when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
|
585
|
-
emit(:backref, :name_recursion_ref_sq, text, ts, te)
|
586
|
-
|
587
|
-
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
588
|
-
emit(:backref, :number_recursion_ref_ab, text, ts, te)
|
589
|
-
|
590
|
-
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
591
|
-
emit(:backref, :number_recursion_ref_sq, text, ts, te)
|
541
|
+
backslash . (group_ref) > (backslashed, 4) {
|
542
|
+
case text = copy(data, ts, te)
|
543
|
+
when /^\\k(.)[^0-9\-][^+\-]*['>]$/
|
544
|
+
emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
|
545
|
+
when /^\\k(.)0*[1-9]\d*['>]$/
|
546
|
+
emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
|
547
|
+
when /^\\k(.)-0*[1-9]\d*['>]$/
|
548
|
+
emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
|
549
|
+
when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
|
550
|
+
emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
|
551
|
+
when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
|
552
|
+
emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
|
553
|
+
else
|
554
|
+
raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
|
555
|
+
end
|
556
|
+
};
|
592
557
|
|
558
|
+
# Group call, named and numbered
|
559
|
+
# ------------------------------------------------------------------------
|
560
|
+
backslash . (group_call) > (backslashed, 4) {
|
561
|
+
case text = copy(data, ts, te)
|
562
|
+
when /^\\g(.)[^0-9+\-].*['>]$/
|
563
|
+
emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
|
564
|
+
when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
|
565
|
+
emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
|
566
|
+
when /^\\g(.)[+-]0*[1-9]\d*/
|
567
|
+
emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
|
568
|
+
else
|
569
|
+
raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
|
593
570
|
end
|
594
571
|
};
|
595
572
|
|
@@ -597,31 +574,36 @@
|
|
597
574
|
# Quantifiers
|
598
575
|
# ------------------------------------------------------------------------
|
599
576
|
zero_or_one {
|
600
|
-
case text =
|
601
|
-
when '?' ; emit(:quantifier, :zero_or_one, text
|
602
|
-
when '??'; emit(:quantifier, :zero_or_one_reluctant, text
|
603
|
-
when '?+'; emit(:quantifier, :zero_or_one_possessive, text
|
577
|
+
case text = copy(data, ts, te)
|
578
|
+
when '?' ; emit(:quantifier, :zero_or_one, text)
|
579
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
|
580
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
|
604
581
|
end
|
605
582
|
};
|
606
583
|
|
607
584
|
zero_or_more {
|
608
|
-
case text =
|
609
|
-
when '*' ; emit(:quantifier, :zero_or_more, text
|
610
|
-
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text
|
611
|
-
when '*+'; emit(:quantifier, :zero_or_more_possessive, text
|
585
|
+
case text = copy(data, ts, te)
|
586
|
+
when '*' ; emit(:quantifier, :zero_or_more, text)
|
587
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
|
588
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
|
612
589
|
end
|
613
590
|
};
|
614
591
|
|
615
592
|
one_or_more {
|
616
|
-
case text =
|
617
|
-
when '+' ; emit(:quantifier, :one_or_more, text
|
618
|
-
when '+?'; emit(:quantifier, :one_or_more_reluctant, text
|
619
|
-
when '++'; emit(:quantifier, :one_or_more_possessive, text
|
593
|
+
case text = copy(data, ts, te)
|
594
|
+
when '+' ; emit(:quantifier, :one_or_more, text)
|
595
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
|
596
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text)
|
620
597
|
end
|
621
598
|
};
|
622
599
|
|
623
|
-
quantifier_interval
|
624
|
-
emit(:quantifier, :interval,
|
600
|
+
quantifier_interval {
|
601
|
+
emit(:quantifier, :interval, copy(data, ts, te))
|
602
|
+
};
|
603
|
+
|
604
|
+
# Catch unmatched curly braces as literals
|
605
|
+
range_open {
|
606
|
+
append_literal(data, ts, te)
|
625
607
|
};
|
626
608
|
|
627
609
|
# Escaped sequences
|
@@ -632,15 +614,17 @@
|
|
632
614
|
|
633
615
|
comment {
|
634
616
|
if free_spacing
|
635
|
-
emit(:free_space, :comment,
|
617
|
+
emit(:free_space, :comment, copy(data, ts, te))
|
636
618
|
else
|
637
|
-
|
619
|
+
# consume only the pound sign (#) and backtrack to do regular scanning
|
620
|
+
append_literal(data, ts, ts + 1)
|
621
|
+
fexec ts + 1;
|
638
622
|
end
|
639
623
|
};
|
640
624
|
|
641
625
|
space+ {
|
642
626
|
if free_spacing
|
643
|
-
emit(:free_space, :whitespace,
|
627
|
+
emit(:free_space, :whitespace, copy(data, ts, te))
|
644
628
|
else
|
645
629
|
append_literal(data, ts, te)
|
646
630
|
end
|
@@ -649,105 +633,47 @@
|
|
649
633
|
# Literal: any run of ASCII (pritable or non-printable), and/or UTF-8,
|
650
634
|
# except meta characters.
|
651
635
|
# ------------------------------------------------------------------------
|
652
|
-
(ascii_print -- space)+
|
653
|
-
ascii_nonprint+ |
|
654
|
-
utf8_2_byte+ |
|
655
|
-
utf8_3_byte+ |
|
656
|
-
utf8_4_byte+ {
|
636
|
+
(ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
|
657
637
|
append_literal(data, ts, te)
|
658
638
|
};
|
659
639
|
|
660
640
|
*|;
|
661
641
|
}%%
|
662
642
|
|
663
|
-
|
664
|
-
|
643
|
+
require 'regexp_parser/scanner/errors/scanner_error'
|
644
|
+
require 'regexp_parser/scanner/errors/premature_end_error'
|
645
|
+
require 'regexp_parser/scanner/errors/validation_error'
|
665
646
|
|
666
647
|
class Regexp::Scanner
|
667
|
-
# General scanner error (catch all)
|
668
|
-
class ScannerError < StandardError; end
|
669
|
-
|
670
|
-
# Base for all scanner validation errors
|
671
|
-
class ValidationError < StandardError
|
672
|
-
def initialize(reason)
|
673
|
-
super reason
|
674
|
-
end
|
675
|
-
end
|
676
|
-
|
677
|
-
# Unexpected end of pattern
|
678
|
-
class PrematureEndError < ScannerError
|
679
|
-
def initialize(where = '')
|
680
|
-
super "Premature end of pattern at #{where}"
|
681
|
-
end
|
682
|
-
end
|
683
|
-
|
684
|
-
# Invalid sequence format. Used for escape sequences, mainly.
|
685
|
-
class InvalidSequenceError < ValidationError
|
686
|
-
def initialize(what = 'sequence', where = '')
|
687
|
-
super "Invalid #{what} at #{where}"
|
688
|
-
end
|
689
|
-
end
|
690
|
-
|
691
|
-
# Invalid group. Used for named groups.
|
692
|
-
class InvalidGroupError < ValidationError
|
693
|
-
def initialize(what, reason)
|
694
|
-
super "Invalid #{what}, #{reason}."
|
695
|
-
end
|
696
|
-
end
|
697
|
-
|
698
|
-
# Invalid groupOption. Used for inline options.
|
699
|
-
class InvalidGroupOption < ValidationError
|
700
|
-
def initialize(option, text)
|
701
|
-
super "Invalid group option #{option} in #{text}"
|
702
|
-
end
|
703
|
-
end
|
704
|
-
|
705
|
-
# Invalid back reference. Used for name a number refs/calls.
|
706
|
-
class InvalidBackrefError < ValidationError
|
707
|
-
def initialize(what, reason)
|
708
|
-
super "Invalid back reference #{what}, #{reason}"
|
709
|
-
end
|
710
|
-
end
|
711
|
-
|
712
|
-
# The property name was not recognized by the scanner.
|
713
|
-
class UnknownUnicodePropertyError < ValidationError
|
714
|
-
def initialize(name)
|
715
|
-
super "Unknown unicode character property name #{name}"
|
716
|
-
end
|
717
|
-
end
|
718
|
-
|
719
648
|
# Scans the given regular expression text, or Regexp object and collects the
|
720
649
|
# emitted token into an array that gets returned at the end. If a block is
|
721
650
|
# given, it gets called for each emitted token.
|
722
651
|
#
|
723
652
|
# This method may raise errors if a syntax error is encountered.
|
724
653
|
# --------------------------------------------------------------------------
|
725
|
-
def self.scan(input_object, &block)
|
726
|
-
new.scan(input_object, &block)
|
654
|
+
def self.scan(input_object, options: nil, collect_tokens: true, &block)
|
655
|
+
new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
|
727
656
|
end
|
728
657
|
|
729
|
-
def scan(input_object, &block)
|
730
|
-
self.
|
658
|
+
def scan(input_object, options: nil, collect_tokens: true, &block)
|
659
|
+
self.collect_tokens = collect_tokens
|
660
|
+
self.literal_run = nil
|
731
661
|
stack = []
|
732
662
|
|
733
|
-
|
734
|
-
|
735
|
-
self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
|
736
|
-
else
|
737
|
-
input = input_object
|
738
|
-
self.free_spacing = false
|
739
|
-
end
|
663
|
+
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
664
|
+
self.free_spacing = free_spacing?(input_object, options)
|
740
665
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
741
666
|
|
742
|
-
data = input.unpack("c*")
|
667
|
+
data = input.unpack("c*")
|
743
668
|
eof = data.length
|
744
669
|
|
745
670
|
self.tokens = []
|
746
|
-
self.block =
|
671
|
+
self.block = block
|
747
672
|
|
748
673
|
self.set_depth = 0
|
749
674
|
self.group_depth = 0
|
750
675
|
self.conditional_stack = []
|
676
|
+
self.char_pos = 0
|
751
677
|
|
752
678
|
%% write data;
|
753
679
|
%% write init;
|
@@ -757,7 +683,7 @@ class Regexp::Scanner
|
|
757
683
|
testEof = testEof
|
758
684
|
|
759
685
|
if cs == re_scanner_error
|
760
|
-
text =
|
686
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
761
687
|
raise ScannerError.new("Scan error at '#{text}'")
|
762
688
|
end
|
763
689
|
|
@@ -767,40 +693,76 @@ class Regexp::Scanner
|
|
767
693
|
"[#{set_depth}]") if in_set?
|
768
694
|
|
769
695
|
# when the entire expression is a literal run
|
770
|
-
emit_literal if
|
696
|
+
emit_literal if literal_run
|
771
697
|
|
772
698
|
tokens
|
773
699
|
end
|
774
700
|
|
775
701
|
# lazy-load property maps when first needed
|
776
|
-
require 'yaml'
|
777
|
-
PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
|
778
|
-
|
779
702
|
def self.short_prop_map
|
780
|
-
@short_prop_map ||=
|
703
|
+
@short_prop_map ||= parse_prop_map('short')
|
781
704
|
end
|
782
705
|
|
783
706
|
def self.long_prop_map
|
784
|
-
@long_prop_map ||=
|
707
|
+
@long_prop_map ||= parse_prop_map('long')
|
708
|
+
end
|
709
|
+
|
710
|
+
def self.parse_prop_map(name)
|
711
|
+
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
712
|
+
end
|
713
|
+
|
714
|
+
def self.posix_classes
|
715
|
+
%w[alnum alpha ascii blank cntrl digit graph
|
716
|
+
lower print punct space upper word xdigit]
|
785
717
|
end
|
786
718
|
|
787
719
|
# Emits an array with the details of the scanned pattern
|
788
|
-
def emit(type, token, text
|
720
|
+
def emit(type, token, text)
|
789
721
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
790
722
|
|
791
|
-
emit_literal if
|
723
|
+
emit_literal if literal_run
|
724
|
+
|
725
|
+
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
726
|
+
# end-users, so we keep track of char-based indices and emit those instead.
|
727
|
+
ts_char_pos = char_pos
|
728
|
+
te_char_pos = char_pos + text.length
|
729
|
+
|
730
|
+
tok = [type, token, text, ts_char_pos, te_char_pos]
|
731
|
+
|
732
|
+
self.prev_token = tok
|
733
|
+
|
734
|
+
self.char_pos = te_char_pos
|
792
735
|
|
793
736
|
if block
|
794
|
-
block.call type, token, text,
|
737
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
738
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
|
739
|
+
tokens << tok if collect_tokens
|
740
|
+
elsif collect_tokens
|
741
|
+
tokens << tok
|
795
742
|
end
|
796
|
-
|
797
|
-
tokens << [type, token, text, ts, te]
|
798
743
|
end
|
799
744
|
|
745
|
+
attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
|
746
|
+
|
800
747
|
private
|
801
748
|
|
802
|
-
attr_accessor :
|
803
|
-
:
|
749
|
+
attr_accessor :block,
|
750
|
+
:collect_tokens, :tokens, :prev_token,
|
751
|
+
:free_spacing, :spacing_stack,
|
752
|
+
:group_depth, :set_depth, :conditional_stack,
|
753
|
+
:char_pos
|
754
|
+
|
755
|
+
def free_spacing?(input_object, options)
|
756
|
+
if options && !input_object.is_a?(String)
|
757
|
+
raise ArgumentError, 'options cannot be supplied unless scanning a String'
|
758
|
+
end
|
759
|
+
|
760
|
+
options = input_object.options if input_object.is_a?(::Regexp)
|
761
|
+
|
762
|
+
return false unless options
|
763
|
+
|
764
|
+
options & Regexp::EXTENDED != 0
|
765
|
+
end
|
804
766
|
|
805
767
|
def in_group?
|
806
768
|
group_depth > 0
|
@@ -811,36 +773,24 @@ class Regexp::Scanner
|
|
811
773
|
end
|
812
774
|
|
813
775
|
# Copy from ts to te from data as text
|
814
|
-
def copy(data,
|
815
|
-
data[
|
816
|
-
end
|
817
|
-
|
818
|
-
# Copy from ts to te from data as text, returning an array with the text
|
819
|
-
# and the offsets used to copy it.
|
820
|
-
def text(data, ts, te, soff = 0)
|
821
|
-
[copy(data, ts-soff..te-1), ts-soff, te]
|
776
|
+
def copy(data, ts, te)
|
777
|
+
data[ts...te].pack('c*').force_encoding('utf-8')
|
822
778
|
end
|
823
779
|
|
824
780
|
# Appends one or more characters to the literal buffer, to be emitted later
|
825
|
-
# by a call to emit_literal.
|
781
|
+
# by a call to emit_literal.
|
826
782
|
def append_literal(data, ts, te)
|
827
|
-
self.
|
828
|
-
literal << text(data, ts, te)
|
783
|
+
(self.literal_run ||= []) << copy(data, ts, te)
|
829
784
|
end
|
830
785
|
|
831
|
-
# Emits the literal run collected by calls to the append_literal method
|
832
|
-
# using the total start (ts) and end (te) offsets of the run.
|
786
|
+
# Emits the literal run collected by calls to the append_literal method.
|
833
787
|
def emit_literal
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
838
|
-
|
839
|
-
self.literal = nil
|
840
|
-
emit(:literal, :literal, text, ts, te)
|
788
|
+
text = literal_run.join
|
789
|
+
self.literal_run = nil
|
790
|
+
emit(:literal, :literal, text)
|
841
791
|
end
|
842
792
|
|
843
|
-
def emit_options(text
|
793
|
+
def emit_options(text)
|
844
794
|
token = nil
|
845
795
|
|
846
796
|
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
@@ -866,28 +816,13 @@ class Regexp::Scanner
|
|
866
816
|
token = :options_switch
|
867
817
|
end
|
868
818
|
|
869
|
-
emit(:group, token, text
|
819
|
+
emit(:group, token, text)
|
870
820
|
end
|
871
821
|
|
872
822
|
def emit_meta_control_sequence(data, ts, te, token)
|
873
823
|
if data.last < 0x00 || data.last > 0x7F
|
874
|
-
|
875
|
-
end
|
876
|
-
emit(:escape, token, *text(data, ts, te, 1))
|
877
|
-
end
|
878
|
-
|
879
|
-
# Centralizes and unifies the handling of validation related
|
880
|
-
# errors.
|
881
|
-
def validation_error(type, what, reason)
|
882
|
-
case type
|
883
|
-
when :group
|
884
|
-
error = InvalidGroupError.new(what, reason)
|
885
|
-
when :backref
|
886
|
-
error = InvalidBackrefError.new(what, reason)
|
887
|
-
when :sequence
|
888
|
-
error = InvalidSequenceError.new(what, reason)
|
824
|
+
raise ValidationError.for(:sequence, 'escape', token.to_s)
|
889
825
|
end
|
890
|
-
|
891
|
-
raise error # unless @@config.validation_ignore
|
826
|
+
emit(:escape, token, copy(data, ts-1, te))
|
892
827
|
end
|
893
828
|
end # module Regexp::Scanner
|