prism 0.15.1 → 0.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +35 -1
- data/Makefile +12 -0
- data/README.md +3 -1
- data/config.yml +66 -50
- data/docs/configuration.md +2 -0
- data/docs/fuzzing.md +1 -1
- data/docs/javascript.md +90 -0
- data/docs/releasing.md +27 -0
- data/docs/ruby_api.md +2 -0
- data/docs/serialization.md +28 -29
- data/ext/prism/api_node.c +856 -826
- data/ext/prism/api_pack.c +20 -9
- data/ext/prism/extension.c +494 -119
- data/ext/prism/extension.h +1 -1
- data/include/prism/ast.h +3157 -747
- data/include/prism/defines.h +40 -8
- data/include/prism/diagnostic.h +36 -3
- data/include/prism/enc/pm_encoding.h +119 -28
- data/include/prism/node.h +38 -30
- data/include/prism/options.h +204 -0
- data/include/prism/pack.h +44 -33
- data/include/prism/parser.h +445 -199
- data/include/prism/prettyprint.h +26 -0
- data/include/prism/regexp.h +16 -2
- data/include/prism/util/pm_buffer.h +102 -18
- data/include/prism/util/pm_char.h +162 -48
- data/include/prism/util/pm_constant_pool.h +128 -34
- data/include/prism/util/pm_list.h +68 -38
- data/include/prism/util/pm_memchr.h +18 -3
- data/include/prism/util/pm_newline_list.h +71 -28
- data/include/prism/util/pm_state_stack.h +25 -7
- data/include/prism/util/pm_string.h +115 -27
- data/include/prism/util/pm_string_list.h +25 -6
- data/include/prism/util/pm_strncasecmp.h +32 -0
- data/include/prism/util/pm_strpbrk.h +31 -17
- data/include/prism/version.h +28 -3
- data/include/prism.h +229 -36
- data/lib/prism/compiler.rb +5 -5
- data/lib/prism/debug.rb +43 -13
- data/lib/prism/desugar_compiler.rb +1 -1
- data/lib/prism/dispatcher.rb +27 -26
- data/lib/prism/dsl.rb +16 -16
- data/lib/prism/ffi.rb +138 -61
- data/lib/prism/lex_compat.rb +26 -16
- data/lib/prism/mutation_compiler.rb +11 -11
- data/lib/prism/node.rb +426 -227
- data/lib/prism/node_ext.rb +23 -16
- data/lib/prism/node_inspector.rb +1 -1
- data/lib/prism/pack.rb +79 -40
- data/lib/prism/parse_result/comments.rb +7 -2
- data/lib/prism/parse_result/newlines.rb +4 -0
- data/lib/prism/parse_result.rb +157 -21
- data/lib/prism/pattern.rb +14 -3
- data/lib/prism/ripper_compat.rb +28 -10
- data/lib/prism/serialize.rb +935 -307
- data/lib/prism/visitor.rb +9 -5
- data/lib/prism.rb +20 -2
- data/prism.gemspec +11 -2
- data/rbi/prism.rbi +7305 -0
- data/rbi/prism_static.rbi +196 -0
- data/sig/prism.rbs +4468 -0
- data/sig/prism_static.rbs +123 -0
- data/src/diagnostic.c +56 -53
- data/src/enc/pm_big5.c +1 -0
- data/src/enc/pm_euc_jp.c +1 -0
- data/src/enc/pm_gbk.c +1 -0
- data/src/enc/pm_shift_jis.c +1 -0
- data/src/enc/pm_tables.c +316 -80
- data/src/enc/pm_unicode.c +54 -9
- data/src/enc/pm_windows_31j.c +1 -0
- data/src/node.c +357 -345
- data/src/options.c +170 -0
- data/src/prettyprint.c +7697 -1643
- data/src/prism.c +1964 -1125
- data/src/regexp.c +153 -95
- data/src/serialize.c +432 -397
- data/src/token_type.c +3 -1
- data/src/util/pm_buffer.c +88 -23
- data/src/util/pm_char.c +103 -57
- data/src/util/pm_constant_pool.c +52 -22
- data/src/util/pm_list.c +12 -4
- data/src/util/pm_memchr.c +5 -3
- data/src/util/pm_newline_list.c +25 -63
- data/src/util/pm_state_stack.c +9 -3
- data/src/util/pm_string.c +95 -85
- data/src/util/pm_string_list.c +14 -15
- data/src/util/pm_strncasecmp.c +10 -3
- data/src/util/pm_strpbrk.c +25 -19
- metadata +12 -3
- data/docs/prism.png +0 -0
data/src/regexp.c
CHANGED
@@ -1,16 +1,31 @@
|
|
1
1
|
#include "prism/regexp.h"
|
2
2
|
|
3
|
-
|
3
|
+
/**
|
4
|
+
* This is the parser that is going to handle parsing regular expressions.
|
5
|
+
*/
|
4
6
|
typedef struct {
|
7
|
+
/** A pointer to the start of the source that we are parsing. */
|
5
8
|
const uint8_t *start;
|
9
|
+
|
10
|
+
/** A pointer to the current position in the source. */
|
6
11
|
const uint8_t *cursor;
|
12
|
+
|
13
|
+
/** A pointer to the end of the source that we are parsing. */
|
7
14
|
const uint8_t *end;
|
15
|
+
|
16
|
+
/** A list of named captures that we've found. */
|
8
17
|
pm_string_list_t *named_captures;
|
18
|
+
|
19
|
+
/** Whether the encoding has changed from the default. */
|
9
20
|
bool encoding_changed;
|
21
|
+
|
22
|
+
/** The encoding of the source. */
|
10
23
|
pm_encoding_t *encoding;
|
11
24
|
} pm_regexp_parser_t;
|
12
25
|
|
13
|
-
|
26
|
+
/**
|
27
|
+
* This initializes a new parser with the given source.
|
28
|
+
*/
|
14
29
|
static void
|
15
30
|
pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
|
16
31
|
*parser = (pm_regexp_parser_t) {
|
@@ -23,7 +38,9 @@ pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const ui
|
|
23
38
|
};
|
24
39
|
}
|
25
40
|
|
26
|
-
|
41
|
+
/**
|
42
|
+
* This appends a new string to the list of named captures.
|
43
|
+
*/
|
27
44
|
static void
|
28
45
|
pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
29
46
|
pm_string_t string;
|
@@ -32,13 +49,17 @@ pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start,
|
|
32
49
|
pm_string_free(&string);
|
33
50
|
}
|
34
51
|
|
35
|
-
|
52
|
+
/**
|
53
|
+
* Returns true if the next character is the end of the source.
|
54
|
+
*/
|
36
55
|
static inline bool
|
37
56
|
pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
|
38
57
|
return parser->cursor >= parser->end;
|
39
58
|
}
|
40
59
|
|
41
|
-
|
60
|
+
/**
|
61
|
+
* Optionally accept a char and consume it if it exists.
|
62
|
+
*/
|
42
63
|
static inline bool
|
43
64
|
pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
|
44
65
|
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
@@ -48,7 +69,9 @@ pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
|
|
48
69
|
return false;
|
49
70
|
}
|
50
71
|
|
51
|
-
|
72
|
+
/**
|
73
|
+
* Expect a character to be present and consume it.
|
74
|
+
*/
|
52
75
|
static inline bool
|
53
76
|
pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
|
54
77
|
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
@@ -58,7 +81,9 @@ pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
|
|
58
81
|
return false;
|
59
82
|
}
|
60
83
|
|
61
|
-
|
84
|
+
/**
|
85
|
+
* This advances the current token to the next instance of the given character.
|
86
|
+
*/
|
62
87
|
static bool
|
63
88
|
pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
|
64
89
|
if (pm_regexp_char_is_eof(parser)) {
|
@@ -74,37 +99,39 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
|
|
74
99
|
return true;
|
75
100
|
}
|
76
101
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
102
|
+
/**
|
103
|
+
* Range quantifiers are a special class of quantifiers that look like
|
104
|
+
*
|
105
|
+
* * {digit}
|
106
|
+
* * {digit,}
|
107
|
+
* * {digit,digit}
|
108
|
+
* * {,digit}
|
109
|
+
*
|
110
|
+
* Unfortunately, if there are any spaces in between, then this just becomes a
|
111
|
+
* regular character match expression and we have to backtrack. So when this
|
112
|
+
* function first starts running, we'll create a "save" point and then attempt
|
113
|
+
* to parse the quantifier. If it fails, we'll restore the save point and
|
114
|
+
* return.
|
115
|
+
*
|
116
|
+
* To properly track everything, we're going to build a little state machine.
|
117
|
+
* It looks something like the following:
|
118
|
+
*
|
119
|
+
* ┌───────┐ ┌─────────┐ ────────────┐
|
120
|
+
* ──── lbrace ───> │ start │ ──── digit ───> │ minimum │ │
|
121
|
+
* └───────┘ └─────────┘ <─── digit ─┘
|
122
|
+
* │ │ │
|
123
|
+
* ┌───────┐ │ │ rbrace
|
124
|
+
* │ comma │ <───── comma ┌──── comma ───────┘ │
|
125
|
+
* └───────┘ V V
|
126
|
+
* │ ┌─────────┐ ┌─────────┐
|
127
|
+
* └── digit ──> │ maximum │ ── rbrace ──> │| final |│
|
128
|
+
* └─────────┘ └─────────┘
|
129
|
+
* │ ^
|
130
|
+
* └─ digit ─┘
|
131
|
+
*
|
132
|
+
* Note that by the time we've hit this function, the lbrace has already been
|
133
|
+
* consumed so we're in the start state.
|
134
|
+
*/
|
108
135
|
static bool
|
109
136
|
pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
|
110
137
|
const uint8_t *savepoint = parser->cursor;
|
@@ -180,14 +207,18 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
|
|
180
207
|
return true;
|
181
208
|
}
|
182
209
|
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
210
|
+
/**
|
211
|
+
* quantifier : star-quantifier
|
212
|
+
* | plus-quantifier
|
213
|
+
* | optional-quantifier
|
214
|
+
* | range-quantifier
|
215
|
+
* | <empty>
|
216
|
+
* ;
|
217
|
+
*/
|
189
218
|
static bool
|
190
219
|
pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
|
220
|
+
if (pm_regexp_char_is_eof(parser)) return true;
|
221
|
+
|
191
222
|
switch (*parser->cursor) {
|
192
223
|
case '*':
|
193
224
|
case '+':
|
@@ -203,8 +234,10 @@ pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
|
|
203
234
|
}
|
204
235
|
}
|
205
236
|
|
206
|
-
|
207
|
-
|
237
|
+
/**
|
238
|
+
* match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
|
239
|
+
* ;
|
240
|
+
*/
|
208
241
|
static bool
|
209
242
|
pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
|
210
243
|
if (!pm_regexp_char_expect(parser, ':')) {
|
@@ -224,8 +257,10 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
|
|
224
257
|
static bool
|
225
258
|
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
|
226
259
|
|
227
|
-
|
228
|
-
|
260
|
+
/**
|
261
|
+
* match-char-set : '[' '^'? (match-range | match-char)* ']'
|
262
|
+
* ;
|
263
|
+
*/
|
229
264
|
static bool
|
230
265
|
pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
|
231
266
|
pm_regexp_char_accept(parser, '^');
|
@@ -249,7 +284,9 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
|
|
249
284
|
return pm_regexp_char_expect(parser, ']');
|
250
285
|
}
|
251
286
|
|
252
|
-
|
287
|
+
/**
|
288
|
+
* A left bracket can either mean a POSIX class or a character set.
|
289
|
+
*/
|
253
290
|
static bool
|
254
291
|
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
|
255
292
|
const uint8_t *reset = parser->cursor;
|
@@ -269,8 +306,10 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
|
|
269
306
|
static bool
|
270
307
|
pm_regexp_parse_expression(pm_regexp_parser_t *parser);
|
271
308
|
|
272
|
-
|
273
|
-
|
309
|
+
/**
|
310
|
+
* These are the states of the options that are configurable on the regular
|
311
|
+
* expression (or from within a group).
|
312
|
+
*/
|
274
313
|
typedef enum {
|
275
314
|
PM_REGEXP_OPTION_STATE_INVALID,
|
276
315
|
PM_REGEXP_OPTION_STATE_TOGGLEABLE,
|
@@ -281,16 +320,22 @@ typedef enum {
|
|
281
320
|
|
282
321
|
// These are the options that are configurable on the regular expression (or
|
283
322
|
// from within a group).
|
323
|
+
|
284
324
|
#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
|
285
325
|
#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
|
286
326
|
#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
|
287
327
|
|
288
|
-
|
328
|
+
/**
|
329
|
+
* This is the set of options that are configurable on the regular expression.
|
330
|
+
*/
|
289
331
|
typedef struct {
|
332
|
+
/** The current state of each option. */
|
290
333
|
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
|
291
334
|
} pm_regexp_options_t;
|
292
335
|
|
293
|
-
|
336
|
+
/**
|
337
|
+
* Initialize a new set of options to their default values.
|
338
|
+
*/
|
294
339
|
static void
|
295
340
|
pm_regexp_options_init(pm_regexp_options_t *options) {
|
296
341
|
memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
|
@@ -302,8 +347,10 @@ pm_regexp_options_init(pm_regexp_options_t *options) {
|
|
302
347
|
options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
303
348
|
}
|
304
349
|
|
305
|
-
|
306
|
-
|
350
|
+
/**
|
351
|
+
* Attempt to add the given option to the set of options. Returns true if it was
|
352
|
+
* added, false if it was already present.
|
353
|
+
*/
|
307
354
|
static bool
|
308
355
|
pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
|
309
356
|
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
@@ -325,8 +372,10 @@ pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
|
|
325
372
|
return false;
|
326
373
|
}
|
327
374
|
|
328
|
-
|
329
|
-
|
375
|
+
/**
|
376
|
+
* Attempt to remove the given option from the set of options. Returns true if
|
377
|
+
* it was removed, false if it was already absent.
|
378
|
+
*/
|
330
379
|
static bool
|
331
380
|
pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
|
332
381
|
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
@@ -347,26 +396,27 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
|
|
347
396
|
return false;
|
348
397
|
}
|
349
398
|
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
399
|
+
/**
|
400
|
+
* Groups can have quite a few different patterns for syntax. They basically
|
401
|
+
* just wrap a set of expressions, but they can potentially have options after a
|
402
|
+
* question mark. If there _isn't_ a question mark, then it's just a set of
|
403
|
+
* expressions. If there _is_, then here are the options:
|
404
|
+
*
|
405
|
+
* * (?#...) - inline comments
|
406
|
+
* * (?:subexp) - non-capturing group
|
407
|
+
* * (?=subexp) - positive lookahead
|
408
|
+
* * (?!subexp) - negative lookahead
|
409
|
+
* * (?>subexp) - atomic group
|
410
|
+
* * (?~subexp) - absence operator
|
411
|
+
* * (?<=subexp) - positive lookbehind
|
412
|
+
* * (?<!subexp) - negative lookbehind
|
413
|
+
* * (?<name>subexp) - named capturing group
|
414
|
+
* * (?'name'subexp) - named capturing group
|
415
|
+
* * (?(cond)yes-subexp) - conditional expression
|
416
|
+
* * (?(cond)yes-subexp|no-subexp) - conditional expression
|
417
|
+
* * (?imxdau-imx) - turn on and off configuration
|
418
|
+
* * (?imxdau-imx:subexp) - turn on and off configuration for an expression
|
419
|
+
*/
|
370
420
|
static bool
|
371
421
|
pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
372
422
|
// First, parse any options for the group.
|
@@ -501,16 +551,18 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
501
551
|
return pm_regexp_char_expect(parser, ')');
|
502
552
|
}
|
503
553
|
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
554
|
+
/**
|
555
|
+
* item : anchor
|
556
|
+
* | match-posix-class
|
557
|
+
* | match-char-set
|
558
|
+
* | match-char-class
|
559
|
+
* | match-char-prop
|
560
|
+
* | match-char
|
561
|
+
* | match-any
|
562
|
+
* | group
|
563
|
+
* | quantified
|
564
|
+
* ;
|
565
|
+
*/
|
514
566
|
static bool
|
515
567
|
pm_regexp_parse_item(pm_regexp_parser_t *parser) {
|
516
568
|
switch (*parser->cursor++) {
|
@@ -531,8 +583,10 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
|
|
531
583
|
}
|
532
584
|
}
|
533
585
|
|
534
|
-
|
535
|
-
|
586
|
+
/**
|
587
|
+
* expression : item+
|
588
|
+
* ;
|
589
|
+
*/
|
536
590
|
static bool
|
537
591
|
pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
|
538
592
|
if (!pm_regexp_parse_item(parser)) {
|
@@ -548,10 +602,12 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
|
|
548
602
|
return true;
|
549
603
|
}
|
550
604
|
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
605
|
+
/**
|
606
|
+
* pattern : EOF
|
607
|
+
* | expression EOF
|
608
|
+
* | expression '|' pattern
|
609
|
+
* ;
|
610
|
+
*/
|
555
611
|
static bool
|
556
612
|
pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
|
557
613
|
return (
|
@@ -570,8 +626,10 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
|
|
570
626
|
);
|
571
627
|
}
|
572
628
|
|
573
|
-
|
574
|
-
|
629
|
+
/**
|
630
|
+
* Parse a regular expression and extract the names of all of the named capture
|
631
|
+
* groups.
|
632
|
+
*/
|
575
633
|
PRISM_EXPORTED_FUNCTION bool
|
576
634
|
pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
|
577
635
|
pm_regexp_parser_t parser;
|