yarp 0.12.0 → 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -8
- data/CONTRIBUTING.md +2 -2
- data/Makefile +5 -5
- data/README.md +11 -12
- data/config.yml +6 -2
- data/docs/build_system.md +21 -21
- data/docs/building.md +4 -4
- data/docs/configuration.md +25 -21
- data/docs/design.md +2 -2
- data/docs/encoding.md +17 -17
- data/docs/fuzzing.md +4 -4
- data/docs/heredocs.md +3 -3
- data/docs/mapping.md +94 -94
- data/docs/ripper.md +4 -4
- data/docs/ruby_api.md +11 -11
- data/docs/serialization.md +17 -16
- data/docs/testing.md +6 -6
- data/ext/prism/api_node.c +4725 -0
- data/ext/{yarp → prism}/api_pack.c +82 -82
- data/ext/{yarp → prism}/extconf.rb +13 -13
- data/ext/{yarp → prism}/extension.c +175 -168
- data/ext/prism/extension.h +18 -0
- data/include/prism/ast.h +1932 -0
- data/include/prism/defines.h +45 -0
- data/include/prism/diagnostic.h +231 -0
- data/include/{yarp/enc/yp_encoding.h → prism/enc/pm_encoding.h} +40 -40
- data/include/prism/node.h +41 -0
- data/include/prism/pack.h +141 -0
- data/include/{yarp → prism}/parser.h +143 -142
- data/include/prism/regexp.h +19 -0
- data/include/prism/unescape.h +48 -0
- data/include/prism/util/pm_buffer.h +51 -0
- data/include/{yarp/util/yp_char.h → prism/util/pm_char.h} +20 -20
- data/include/{yarp/util/yp_constant_pool.h → prism/util/pm_constant_pool.h} +26 -22
- data/include/{yarp/util/yp_list.h → prism/util/pm_list.h} +21 -21
- data/include/prism/util/pm_memchr.h +14 -0
- data/include/{yarp/util/yp_newline_list.h → prism/util/pm_newline_list.h} +11 -11
- data/include/prism/util/pm_state_stack.h +24 -0
- data/include/{yarp/util/yp_string.h → prism/util/pm_string.h} +20 -20
- data/include/prism/util/pm_string_list.h +25 -0
- data/include/{yarp/util/yp_strpbrk.h → prism/util/pm_strpbrk.h} +7 -7
- data/include/prism/version.h +4 -0
- data/include/prism.h +82 -0
- data/lib/prism/compiler.rb +465 -0
- data/lib/prism/debug.rb +157 -0
- data/lib/{yarp/desugar_visitor.rb → prism/desugar_compiler.rb} +4 -2
- data/lib/prism/dispatcher.rb +2051 -0
- data/lib/prism/dsl.rb +750 -0
- data/lib/{yarp → prism}/ffi.rb +66 -67
- data/lib/{yarp → prism}/lex_compat.rb +40 -43
- data/lib/{yarp/mutation_visitor.rb → prism/mutation_compiler.rb} +3 -3
- data/lib/{yarp → prism}/node.rb +2012 -2593
- data/lib/prism/node_ext.rb +55 -0
- data/lib/prism/node_inspector.rb +68 -0
- data/lib/{yarp → prism}/pack.rb +1 -1
- data/lib/{yarp → prism}/parse_result/comments.rb +1 -1
- data/lib/{yarp → prism}/parse_result/newlines.rb +1 -1
- data/lib/prism/parse_result.rb +266 -0
- data/lib/{yarp → prism}/pattern.rb +14 -14
- data/lib/{yarp → prism}/ripper_compat.rb +5 -5
- data/lib/{yarp → prism}/serialize.rb +12 -7
- data/lib/prism/visitor.rb +470 -0
- data/lib/prism.rb +64 -0
- data/lib/yarp.rb +2 -614
- data/src/diagnostic.c +213 -208
- data/src/enc/pm_big5.c +52 -0
- data/src/enc/pm_euc_jp.c +58 -0
- data/src/enc/{yp_gbk.c → pm_gbk.c} +16 -16
- data/src/enc/pm_shift_jis.c +56 -0
- data/src/enc/{yp_tables.c → pm_tables.c} +69 -69
- data/src/enc/{yp_unicode.c → pm_unicode.c} +40 -40
- data/src/enc/pm_windows_31j.c +56 -0
- data/src/node.c +1293 -1233
- data/src/pack.c +247 -247
- data/src/prettyprint.c +1479 -1479
- data/src/{yarp.c → prism.c} +5205 -5083
- data/src/regexp.c +132 -132
- data/src/serialize.c +1121 -1121
- data/src/token_type.c +169 -167
- data/src/unescape.c +106 -87
- data/src/util/pm_buffer.c +103 -0
- data/src/util/{yp_char.c → pm_char.c} +72 -72
- data/src/util/{yp_constant_pool.c → pm_constant_pool.c} +85 -64
- data/src/util/{yp_list.c → pm_list.c} +10 -10
- data/src/util/{yp_memchr.c → pm_memchr.c} +6 -4
- data/src/util/{yp_newline_list.c → pm_newline_list.c} +21 -21
- data/src/util/{yp_state_stack.c → pm_state_stack.c} +4 -4
- data/src/util/{yp_string.c → pm_string.c} +38 -38
- data/src/util/pm_string_list.c +29 -0
- data/src/util/{yp_strncasecmp.c → pm_strncasecmp.c} +1 -1
- data/src/util/{yp_strpbrk.c → pm_strpbrk.c} +8 -8
- data/yarp.gemspec +68 -59
- metadata +70 -61
- data/ext/yarp/api_node.c +0 -4728
- data/ext/yarp/extension.h +0 -18
- data/include/yarp/ast.h +0 -1929
- data/include/yarp/defines.h +0 -45
- data/include/yarp/diagnostic.h +0 -226
- data/include/yarp/node.h +0 -42
- data/include/yarp/pack.h +0 -141
- data/include/yarp/regexp.h +0 -19
- data/include/yarp/unescape.h +0 -44
- data/include/yarp/util/yp_buffer.h +0 -51
- data/include/yarp/util/yp_memchr.h +0 -14
- data/include/yarp/util/yp_state_stack.h +0 -24
- data/include/yarp/util/yp_string_list.h +0 -25
- data/include/yarp/version.h +0 -4
- data/include/yarp.h +0 -82
- data/src/enc/yp_big5.c +0 -52
- data/src/enc/yp_euc_jp.c +0 -58
- data/src/enc/yp_shift_jis.c +0 -56
- data/src/enc/yp_windows_31j.c +0 -56
- data/src/util/yp_buffer.c +0 -101
- data/src/util/yp_string_list.c +0 -29
data/src/regexp.c
CHANGED
@@ -1,19 +1,19 @@
|
|
1
|
-
#include "
|
1
|
+
#include "prism/regexp.h"
|
2
2
|
|
3
3
|
// This is the parser that is going to handle parsing regular expressions.
|
4
4
|
typedef struct {
|
5
5
|
const uint8_t *start;
|
6
6
|
const uint8_t *cursor;
|
7
7
|
const uint8_t *end;
|
8
|
-
|
8
|
+
pm_string_list_t *named_captures;
|
9
9
|
bool encoding_changed;
|
10
|
-
|
11
|
-
}
|
10
|
+
pm_encoding_t *encoding;
|
11
|
+
} pm_regexp_parser_t;
|
12
12
|
|
13
13
|
// This initializes a new parser with the given source.
|
14
14
|
static void
|
15
|
-
|
16
|
-
*parser = (
|
15
|
+
pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
|
16
|
+
*parser = (pm_regexp_parser_t) {
|
17
17
|
.start = start,
|
18
18
|
.cursor = start,
|
19
19
|
.end = end,
|
@@ -25,23 +25,23 @@ yp_regexp_parser_init(yp_regexp_parser_t *parser, const uint8_t *start, const ui
|
|
25
25
|
|
26
26
|
// This appends a new string to the list of named captures.
|
27
27
|
static void
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
28
|
+
pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
29
|
+
pm_string_t string;
|
30
|
+
pm_string_shared_init(&string, start, end);
|
31
|
+
pm_string_list_append(parser->named_captures, &string);
|
32
|
+
pm_string_free(&string);
|
33
33
|
}
|
34
34
|
|
35
35
|
// Returns true if the next character is the end of the source.
|
36
36
|
static inline bool
|
37
|
-
|
37
|
+
pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
|
38
38
|
return parser->cursor >= parser->end;
|
39
39
|
}
|
40
40
|
|
41
41
|
// Optionally accept a char and consume it if it exists.
|
42
42
|
static inline bool
|
43
|
-
|
44
|
-
if (!
|
43
|
+
pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
|
44
|
+
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
45
45
|
parser->cursor++;
|
46
46
|
return true;
|
47
47
|
}
|
@@ -50,8 +50,8 @@ yp_regexp_char_accept(yp_regexp_parser_t *parser, uint8_t value) {
|
|
50
50
|
|
51
51
|
// Expect a character to be present and consume it.
|
52
52
|
static inline bool
|
53
|
-
|
54
|
-
if (!
|
53
|
+
pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
|
54
|
+
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
55
55
|
parser->cursor++;
|
56
56
|
return true;
|
57
57
|
}
|
@@ -60,12 +60,12 @@ yp_regexp_char_expect(yp_regexp_parser_t *parser, uint8_t value) {
|
|
60
60
|
|
61
61
|
// This advances the current token to the next instance of the given character.
|
62
62
|
static bool
|
63
|
-
|
64
|
-
if (
|
63
|
+
pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
|
64
|
+
if (pm_regexp_char_is_eof(parser)) {
|
65
65
|
return false;
|
66
66
|
}
|
67
67
|
|
68
|
-
const uint8_t *end = (const uint8_t *)
|
68
|
+
const uint8_t *end = (const uint8_t *) pm_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding);
|
69
69
|
if (end == NULL) {
|
70
70
|
return false;
|
71
71
|
}
|
@@ -106,41 +106,41 @@ yp_regexp_char_find(yp_regexp_parser_t *parser, uint8_t value) {
|
|
106
106
|
// Note that by the time we've hit this function, the lbrace has already been
|
107
107
|
// consumed so we're in the start state.
|
108
108
|
static bool
|
109
|
-
|
109
|
+
pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
|
110
110
|
const uint8_t *savepoint = parser->cursor;
|
111
111
|
|
112
112
|
enum {
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
} state =
|
113
|
+
PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
|
114
|
+
PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
|
115
|
+
PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
|
116
|
+
PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
|
117
|
+
} state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
|
118
118
|
|
119
119
|
while (1) {
|
120
120
|
switch (state) {
|
121
|
-
case
|
121
|
+
case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
|
122
122
|
switch (*parser->cursor) {
|
123
123
|
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
124
124
|
parser->cursor++;
|
125
|
-
state =
|
125
|
+
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
|
126
126
|
break;
|
127
127
|
case ',':
|
128
128
|
parser->cursor++;
|
129
|
-
state =
|
129
|
+
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
|
130
130
|
break;
|
131
131
|
default:
|
132
132
|
parser->cursor = savepoint;
|
133
133
|
return true;
|
134
134
|
}
|
135
135
|
break;
|
136
|
-
case
|
136
|
+
case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
|
137
137
|
switch (*parser->cursor) {
|
138
138
|
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
139
139
|
parser->cursor++;
|
140
140
|
break;
|
141
141
|
case ',':
|
142
142
|
parser->cursor++;
|
143
|
-
state =
|
143
|
+
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
|
144
144
|
break;
|
145
145
|
case '}':
|
146
146
|
parser->cursor++;
|
@@ -150,18 +150,18 @@ yp_regexp_parse_range_quantifier(yp_regexp_parser_t *parser) {
|
|
150
150
|
return true;
|
151
151
|
}
|
152
152
|
break;
|
153
|
-
case
|
153
|
+
case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
|
154
154
|
switch (*parser->cursor) {
|
155
155
|
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
156
156
|
parser->cursor++;
|
157
|
-
state =
|
157
|
+
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
|
158
158
|
break;
|
159
159
|
default:
|
160
160
|
parser->cursor = savepoint;
|
161
161
|
return true;
|
162
162
|
}
|
163
163
|
break;
|
164
|
-
case
|
164
|
+
case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
|
165
165
|
switch (*parser->cursor) {
|
166
166
|
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
167
167
|
parser->cursor++;
|
@@ -187,7 +187,7 @@ yp_regexp_parse_range_quantifier(yp_regexp_parser_t *parser) {
|
|
187
187
|
// | <empty>
|
188
188
|
// ;
|
189
189
|
static bool
|
190
|
-
|
190
|
+
pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
|
191
191
|
switch (*parser->cursor) {
|
192
192
|
case '*':
|
193
193
|
case '+':
|
@@ -196,7 +196,7 @@ yp_regexp_parse_quantifier(yp_regexp_parser_t *parser) {
|
|
196
196
|
return true;
|
197
197
|
case '{':
|
198
198
|
parser->cursor++;
|
199
|
-
return
|
199
|
+
return pm_regexp_parse_range_quantifier(parser);
|
200
200
|
default:
|
201
201
|
// In this case there is no quantifier.
|
202
202
|
return true;
|
@@ -206,37 +206,37 @@ yp_regexp_parse_quantifier(yp_regexp_parser_t *parser) {
|
|
206
206
|
// match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
|
207
207
|
// ;
|
208
208
|
static bool
|
209
|
-
|
210
|
-
if (!
|
209
|
+
pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
|
210
|
+
if (!pm_regexp_char_expect(parser, ':')) {
|
211
211
|
return false;
|
212
212
|
}
|
213
213
|
|
214
|
-
|
214
|
+
pm_regexp_char_accept(parser, '^');
|
215
215
|
|
216
216
|
return (
|
217
|
-
|
218
|
-
|
219
|
-
|
217
|
+
pm_regexp_char_find(parser, ':') &&
|
218
|
+
pm_regexp_char_expect(parser, ']') &&
|
219
|
+
pm_regexp_char_expect(parser, ']')
|
220
220
|
);
|
221
221
|
}
|
222
222
|
|
223
223
|
// Forward declaration because character sets can be nested.
|
224
224
|
static bool
|
225
|
-
|
225
|
+
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
|
226
226
|
|
227
227
|
// match-char-set : '[' '^'? (match-range | match-char)* ']'
|
228
228
|
// ;
|
229
229
|
static bool
|
230
|
-
|
231
|
-
|
230
|
+
pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
|
231
|
+
pm_regexp_char_accept(parser, '^');
|
232
232
|
|
233
|
-
while (!
|
233
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
|
234
234
|
switch (*parser->cursor++) {
|
235
235
|
case '[':
|
236
|
-
|
236
|
+
pm_regexp_parse_lbracket(parser);
|
237
237
|
break;
|
238
238
|
case '\\':
|
239
|
-
if (!
|
239
|
+
if (!pm_regexp_char_is_eof(parser)) {
|
240
240
|
parser->cursor++;
|
241
241
|
}
|
242
242
|
break;
|
@@ -246,78 +246,78 @@ yp_regexp_parse_character_set(yp_regexp_parser_t *parser) {
|
|
246
246
|
}
|
247
247
|
}
|
248
248
|
|
249
|
-
return
|
249
|
+
return pm_regexp_char_expect(parser, ']');
|
250
250
|
}
|
251
251
|
|
252
252
|
// A left bracket can either mean a POSIX class or a character set.
|
253
253
|
static bool
|
254
|
-
|
254
|
+
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
|
255
255
|
const uint8_t *reset = parser->cursor;
|
256
256
|
|
257
257
|
if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
|
258
258
|
parser->cursor++;
|
259
|
-
if (
|
259
|
+
if (pm_regexp_parse_posix_class(parser)) return true;
|
260
260
|
|
261
261
|
parser->cursor = reset;
|
262
262
|
}
|
263
263
|
|
264
|
-
return
|
264
|
+
return pm_regexp_parse_character_set(parser);
|
265
265
|
}
|
266
266
|
|
267
267
|
// Forward declaration here since parsing groups needs to go back up the grammar
|
268
268
|
// to parse expressions within them.
|
269
269
|
static bool
|
270
|
-
|
270
|
+
pm_regexp_parse_expression(pm_regexp_parser_t *parser);
|
271
271
|
|
272
272
|
// These are the states of the options that are configurable on the regular
|
273
273
|
// expression (or from within a group).
|
274
274
|
typedef enum {
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
}
|
275
|
+
PM_REGEXP_OPTION_STATE_INVALID,
|
276
|
+
PM_REGEXP_OPTION_STATE_TOGGLEABLE,
|
277
|
+
PM_REGEXP_OPTION_STATE_ADDABLE,
|
278
|
+
PM_REGEXP_OPTION_STATE_ADDED,
|
279
|
+
PM_REGEXP_OPTION_STATE_REMOVED
|
280
|
+
} pm_regexp_option_state_t;
|
281
281
|
|
282
282
|
// These are the options that are configurable on the regular expression (or
|
283
283
|
// from within a group).
|
284
|
-
#define
|
285
|
-
#define
|
286
|
-
#define
|
284
|
+
#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
|
285
|
+
#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
|
286
|
+
#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
|
287
287
|
|
288
288
|
// This is the set of options that are configurable on the regular expression.
|
289
289
|
typedef struct {
|
290
|
-
uint8_t values[
|
291
|
-
}
|
290
|
+
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
|
291
|
+
} pm_regexp_options_t;
|
292
292
|
|
293
293
|
// Initialize a new set of options to their default values.
|
294
294
|
static void
|
295
|
-
|
296
|
-
memset(options,
|
297
|
-
options->values['i' -
|
298
|
-
options->values['m' -
|
299
|
-
options->values['x' -
|
300
|
-
options->values['d' -
|
301
|
-
options->values['a' -
|
302
|
-
options->values['u' -
|
295
|
+
pm_regexp_options_init(pm_regexp_options_t *options) {
|
296
|
+
memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
|
297
|
+
options->values['i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
|
298
|
+
options->values['m' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
|
299
|
+
options->values['x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
|
300
|
+
options->values['d' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
301
|
+
options->values['a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
302
|
+
options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
303
303
|
}
|
304
304
|
|
305
305
|
// Attempt to add the given option to the set of options. Returns true if it was
|
306
306
|
// added, false if it was already present.
|
307
307
|
static bool
|
308
|
-
|
309
|
-
if (key >=
|
310
|
-
key = (uint8_t) (key -
|
308
|
+
pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
|
309
|
+
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
310
|
+
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
311
311
|
|
312
312
|
switch (options->values[key]) {
|
313
|
-
case
|
314
|
-
case
|
313
|
+
case PM_REGEXP_OPTION_STATE_INVALID:
|
314
|
+
case PM_REGEXP_OPTION_STATE_REMOVED:
|
315
315
|
return false;
|
316
|
-
case
|
317
|
-
case
|
318
|
-
options->values[key] =
|
316
|
+
case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
|
317
|
+
case PM_REGEXP_OPTION_STATE_ADDABLE:
|
318
|
+
options->values[key] = PM_REGEXP_OPTION_STATE_ADDED;
|
319
319
|
return true;
|
320
|
-
case
|
320
|
+
case PM_REGEXP_OPTION_STATE_ADDED:
|
321
321
|
return true;
|
322
322
|
}
|
323
323
|
}
|
@@ -328,18 +328,18 @@ yp_regexp_options_add(yp_regexp_options_t *options, uint8_t key) {
|
|
328
328
|
// Attempt to remove the given option from the set of options. Returns true if
|
329
329
|
// it was removed, false if it was already absent.
|
330
330
|
static bool
|
331
|
-
|
332
|
-
if (key >=
|
333
|
-
key = (uint8_t) (key -
|
331
|
+
pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
|
332
|
+
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
333
|
+
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
334
334
|
|
335
335
|
switch (options->values[key]) {
|
336
|
-
case
|
337
|
-
case
|
336
|
+
case PM_REGEXP_OPTION_STATE_INVALID:
|
337
|
+
case PM_REGEXP_OPTION_STATE_ADDABLE:
|
338
338
|
return false;
|
339
|
-
case
|
340
|
-
case
|
341
|
-
case
|
342
|
-
options->values[key] =
|
339
|
+
case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
|
340
|
+
case PM_REGEXP_OPTION_STATE_ADDED:
|
341
|
+
case PM_REGEXP_OPTION_STATE_REMOVED:
|
342
|
+
options->values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
|
343
343
|
return true;
|
344
344
|
}
|
345
345
|
}
|
@@ -368,14 +368,14 @@ yp_regexp_options_remove(yp_regexp_options_t *options, uint8_t key) {
|
|
368
368
|
// * (?imxdau-imx:subexp) - turn on and off configuration for an expression
|
369
369
|
//
|
370
370
|
static bool
|
371
|
-
|
371
|
+
pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
372
372
|
// First, parse any options for the group.
|
373
|
-
if (
|
374
|
-
if (
|
373
|
+
if (pm_regexp_char_accept(parser, '?')) {
|
374
|
+
if (pm_regexp_char_is_eof(parser)) {
|
375
375
|
return false;
|
376
376
|
}
|
377
|
-
|
378
|
-
|
377
|
+
pm_regexp_options_t options;
|
378
|
+
pm_regexp_options_init(&options);
|
379
379
|
|
380
380
|
switch (*parser->cursor) {
|
381
381
|
case '#': { // inline comments
|
@@ -403,10 +403,10 @@ yp_regexp_parse_group(yp_regexp_parser_t *parser) {
|
|
403
403
|
// Here we can take the fast path and use memchr to find the
|
404
404
|
// next ) because we are safe checking backward for \ since
|
405
405
|
// it cannot be a trailing character.
|
406
|
-
bool found =
|
406
|
+
bool found = pm_regexp_char_find(parser, ')');
|
407
407
|
|
408
408
|
while (found && (parser->start <= parser->cursor - 2) && (*(parser->cursor - 2) == '\\')) {
|
409
|
-
found =
|
409
|
+
found = pm_regexp_char_find(parser, ')');
|
410
410
|
}
|
411
411
|
|
412
412
|
return found;
|
@@ -421,7 +421,7 @@ yp_regexp_parse_group(yp_regexp_parser_t *parser) {
|
|
421
421
|
break;
|
422
422
|
case '<':
|
423
423
|
parser->cursor++;
|
424
|
-
if (
|
424
|
+
if (pm_regexp_char_is_eof(parser)) {
|
425
425
|
return false;
|
426
426
|
}
|
427
427
|
|
@@ -432,37 +432,37 @@ yp_regexp_parse_group(yp_regexp_parser_t *parser) {
|
|
432
432
|
break;
|
433
433
|
default: { // named capture group
|
434
434
|
const uint8_t *start = parser->cursor;
|
435
|
-
if (!
|
435
|
+
if (!pm_regexp_char_find(parser, '>')) {
|
436
436
|
return false;
|
437
437
|
}
|
438
|
-
|
438
|
+
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
|
439
439
|
break;
|
440
440
|
}
|
441
441
|
}
|
442
442
|
break;
|
443
443
|
case '\'': { // named capture group
|
444
444
|
const uint8_t *start = ++parser->cursor;
|
445
|
-
if (!
|
445
|
+
if (!pm_regexp_char_find(parser, '\'')) {
|
446
446
|
return false;
|
447
447
|
}
|
448
448
|
|
449
|
-
|
449
|
+
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
|
450
450
|
break;
|
451
451
|
}
|
452
452
|
case '(': // conditional expression
|
453
|
-
if (!
|
453
|
+
if (!pm_regexp_char_find(parser, ')')) {
|
454
454
|
return false;
|
455
455
|
}
|
456
456
|
break;
|
457
457
|
case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options
|
458
|
-
while (!
|
459
|
-
if (!
|
458
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
|
459
|
+
if (!pm_regexp_options_add(&options, *parser->cursor)) {
|
460
460
|
return false;
|
461
461
|
}
|
462
462
|
parser->cursor++;
|
463
463
|
}
|
464
464
|
|
465
|
-
if (
|
465
|
+
if (pm_regexp_char_is_eof(parser)) {
|
466
466
|
return false;
|
467
467
|
}
|
468
468
|
|
@@ -473,14 +473,14 @@ yp_regexp_parse_group(yp_regexp_parser_t *parser) {
|
|
473
473
|
/* fallthrough */
|
474
474
|
case '-':
|
475
475
|
parser->cursor++;
|
476
|
-
while (!
|
477
|
-
if (!
|
476
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
|
477
|
+
if (!pm_regexp_options_remove(&options, *parser->cursor)) {
|
478
478
|
return false;
|
479
479
|
}
|
480
480
|
parser->cursor++;
|
481
481
|
}
|
482
482
|
|
483
|
-
if (
|
483
|
+
if (pm_regexp_char_is_eof(parser)) {
|
484
484
|
return false;
|
485
485
|
}
|
486
486
|
break;
|
@@ -490,15 +490,15 @@ yp_regexp_parse_group(yp_regexp_parser_t *parser) {
|
|
490
490
|
}
|
491
491
|
|
492
492
|
// Now, parse the expressions within this group.
|
493
|
-
while (!
|
494
|
-
if (!
|
493
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
|
494
|
+
if (!pm_regexp_parse_expression(parser)) {
|
495
495
|
return false;
|
496
496
|
}
|
497
|
-
|
497
|
+
pm_regexp_char_accept(parser, '|');
|
498
498
|
}
|
499
499
|
|
500
500
|
// Finally, make sure we have a closing parenthesis.
|
501
|
-
return
|
501
|
+
return pm_regexp_char_expect(parser, ')');
|
502
502
|
}
|
503
503
|
|
504
504
|
// item : anchor
|
@@ -512,35 +512,35 @@ yp_regexp_parse_group(yp_regexp_parser_t *parser) {
|
|
512
512
|
// | quantified
|
513
513
|
// ;
|
514
514
|
static bool
|
515
|
-
|
515
|
+
pm_regexp_parse_item(pm_regexp_parser_t *parser) {
|
516
516
|
switch (*parser->cursor++) {
|
517
517
|
case '^':
|
518
518
|
case '$':
|
519
519
|
return true;
|
520
520
|
case '\\':
|
521
|
-
if (!
|
521
|
+
if (!pm_regexp_char_is_eof(parser)) {
|
522
522
|
parser->cursor++;
|
523
523
|
}
|
524
|
-
return
|
524
|
+
return pm_regexp_parse_quantifier(parser);
|
525
525
|
case '(':
|
526
|
-
return
|
526
|
+
return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
|
527
527
|
case '[':
|
528
|
-
return
|
528
|
+
return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
|
529
529
|
default:
|
530
|
-
return
|
530
|
+
return pm_regexp_parse_quantifier(parser);
|
531
531
|
}
|
532
532
|
}
|
533
533
|
|
534
534
|
// expression : item+
|
535
535
|
// ;
|
536
536
|
static bool
|
537
|
-
|
538
|
-
if (!
|
537
|
+
pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
|
538
|
+
if (!pm_regexp_parse_item(parser)) {
|
539
539
|
return false;
|
540
540
|
}
|
541
541
|
|
542
|
-
while (!
|
543
|
-
if (!
|
542
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
|
543
|
+
if (!pm_regexp_parse_item(parser)) {
|
544
544
|
return false;
|
545
545
|
}
|
546
546
|
}
|
@@ -553,28 +553,28 @@ yp_regexp_parse_expression(yp_regexp_parser_t *parser) {
|
|
553
553
|
// | expression '|' pattern
|
554
554
|
// ;
|
555
555
|
static bool
|
556
|
-
|
556
|
+
pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
|
557
557
|
return (
|
558
558
|
(
|
559
559
|
// Exit early if the pattern is empty.
|
560
|
-
|
560
|
+
pm_regexp_char_is_eof(parser) ||
|
561
561
|
// Parse the first expression in the pattern.
|
562
|
-
|
562
|
+
pm_regexp_parse_expression(parser)
|
563
563
|
) &&
|
564
564
|
(
|
565
565
|
// Return now if we've parsed the entire pattern.
|
566
|
-
|
566
|
+
pm_regexp_char_is_eof(parser) ||
|
567
567
|
// Otherwise, we should have a pipe character.
|
568
|
-
(
|
568
|
+
(pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser))
|
569
569
|
)
|
570
570
|
);
|
571
571
|
}
|
572
572
|
|
573
573
|
// Parse a regular expression and extract the names of all of the named capture
|
574
574
|
// groups.
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
return
|
575
|
+
PRISM_EXPORTED_FUNCTION bool
|
576
|
+
pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
|
577
|
+
pm_regexp_parser_t parser;
|
578
|
+
pm_regexp_parser_init(&parser, source, source + size, named_captures, encoding_changed, encoding);
|
579
|
+
return pm_regexp_parse_pattern(&parser);
|
580
580
|
}
|