prism 0.15.1 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +35 -1
- data/Makefile +12 -0
- data/README.md +3 -1
- data/config.yml +66 -50
- data/docs/configuration.md +2 -0
- data/docs/fuzzing.md +1 -1
- data/docs/javascript.md +90 -0
- data/docs/releasing.md +27 -0
- data/docs/ruby_api.md +2 -0
- data/docs/serialization.md +28 -29
- data/ext/prism/api_node.c +856 -826
- data/ext/prism/api_pack.c +20 -9
- data/ext/prism/extension.c +494 -119
- data/ext/prism/extension.h +1 -1
- data/include/prism/ast.h +3157 -747
- data/include/prism/defines.h +40 -8
- data/include/prism/diagnostic.h +36 -3
- data/include/prism/enc/pm_encoding.h +119 -28
- data/include/prism/node.h +38 -30
- data/include/prism/options.h +204 -0
- data/include/prism/pack.h +44 -33
- data/include/prism/parser.h +445 -199
- data/include/prism/prettyprint.h +26 -0
- data/include/prism/regexp.h +16 -2
- data/include/prism/util/pm_buffer.h +102 -18
- data/include/prism/util/pm_char.h +162 -48
- data/include/prism/util/pm_constant_pool.h +128 -34
- data/include/prism/util/pm_list.h +68 -38
- data/include/prism/util/pm_memchr.h +18 -3
- data/include/prism/util/pm_newline_list.h +71 -28
- data/include/prism/util/pm_state_stack.h +25 -7
- data/include/prism/util/pm_string.h +115 -27
- data/include/prism/util/pm_string_list.h +25 -6
- data/include/prism/util/pm_strncasecmp.h +32 -0
- data/include/prism/util/pm_strpbrk.h +31 -17
- data/include/prism/version.h +28 -3
- data/include/prism.h +229 -36
- data/lib/prism/compiler.rb +5 -5
- data/lib/prism/debug.rb +43 -13
- data/lib/prism/desugar_compiler.rb +1 -1
- data/lib/prism/dispatcher.rb +27 -26
- data/lib/prism/dsl.rb +16 -16
- data/lib/prism/ffi.rb +138 -61
- data/lib/prism/lex_compat.rb +26 -16
- data/lib/prism/mutation_compiler.rb +11 -11
- data/lib/prism/node.rb +426 -227
- data/lib/prism/node_ext.rb +23 -16
- data/lib/prism/node_inspector.rb +1 -1
- data/lib/prism/pack.rb +79 -40
- data/lib/prism/parse_result/comments.rb +7 -2
- data/lib/prism/parse_result/newlines.rb +4 -0
- data/lib/prism/parse_result.rb +157 -21
- data/lib/prism/pattern.rb +14 -3
- data/lib/prism/ripper_compat.rb +28 -10
- data/lib/prism/serialize.rb +935 -307
- data/lib/prism/visitor.rb +9 -5
- data/lib/prism.rb +20 -2
- data/prism.gemspec +11 -2
- data/rbi/prism.rbi +7305 -0
- data/rbi/prism_static.rbi +196 -0
- data/sig/prism.rbs +4468 -0
- data/sig/prism_static.rbs +123 -0
- data/src/diagnostic.c +56 -53
- data/src/enc/pm_big5.c +1 -0
- data/src/enc/pm_euc_jp.c +1 -0
- data/src/enc/pm_gbk.c +1 -0
- data/src/enc/pm_shift_jis.c +1 -0
- data/src/enc/pm_tables.c +316 -80
- data/src/enc/pm_unicode.c +54 -9
- data/src/enc/pm_windows_31j.c +1 -0
- data/src/node.c +357 -345
- data/src/options.c +170 -0
- data/src/prettyprint.c +7697 -1643
- data/src/prism.c +1964 -1125
- data/src/regexp.c +153 -95
- data/src/serialize.c +432 -397
- data/src/token_type.c +3 -1
- data/src/util/pm_buffer.c +88 -23
- data/src/util/pm_char.c +103 -57
- data/src/util/pm_constant_pool.c +52 -22
- data/src/util/pm_list.c +12 -4
- data/src/util/pm_memchr.c +5 -3
- data/src/util/pm_newline_list.c +25 -63
- data/src/util/pm_state_stack.c +9 -3
- data/src/util/pm_string.c +95 -85
- data/src/util/pm_string_list.c +14 -15
- data/src/util/pm_strncasecmp.c +10 -3
- data/src/util/pm_strpbrk.c +25 -19
- metadata +12 -3
- data/docs/prism.png +0 -0
data/src/regexp.c
CHANGED
@@ -1,16 +1,31 @@
|
|
1
1
|
#include "prism/regexp.h"
|
2
2
|
|
3
|
-
|
3
|
+
/**
|
4
|
+
* This is the parser that is going to handle parsing regular expressions.
|
5
|
+
*/
|
4
6
|
typedef struct {
|
7
|
+
/** A pointer to the start of the source that we are parsing. */
|
5
8
|
const uint8_t *start;
|
9
|
+
|
10
|
+
/** A pointer to the current position in the source. */
|
6
11
|
const uint8_t *cursor;
|
12
|
+
|
13
|
+
/** A pointer to the end of the source that we are parsing. */
|
7
14
|
const uint8_t *end;
|
15
|
+
|
16
|
+
/** A list of named captures that we've found. */
|
8
17
|
pm_string_list_t *named_captures;
|
18
|
+
|
19
|
+
/** Whether the encoding has changed from the default. */
|
9
20
|
bool encoding_changed;
|
21
|
+
|
22
|
+
/** The encoding of the source. */
|
10
23
|
pm_encoding_t *encoding;
|
11
24
|
} pm_regexp_parser_t;
|
12
25
|
|
13
|
-
|
26
|
+
/**
|
27
|
+
* This initializes a new parser with the given source.
|
28
|
+
*/
|
14
29
|
static void
|
15
30
|
pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
|
16
31
|
*parser = (pm_regexp_parser_t) {
|
@@ -23,7 +38,9 @@ pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const ui
|
|
23
38
|
};
|
24
39
|
}
|
25
40
|
|
26
|
-
|
41
|
+
/**
|
42
|
+
* This appends a new string to the list of named captures.
|
43
|
+
*/
|
27
44
|
static void
|
28
45
|
pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
29
46
|
pm_string_t string;
|
@@ -32,13 +49,17 @@ pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start,
|
|
32
49
|
pm_string_free(&string);
|
33
50
|
}
|
34
51
|
|
35
|
-
|
52
|
+
/**
|
53
|
+
* Returns true if the cursor has reached the end of the source.
|
54
|
+
*/
|
36
55
|
static inline bool
|
37
56
|
pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
|
38
57
|
return parser->cursor >= parser->end;
|
39
58
|
}
|
40
59
|
|
41
|
-
|
60
|
+
/**
|
61
|
+
* Optionally accept a char and consume it if it exists.
|
62
|
+
*/
|
42
63
|
static inline bool
|
43
64
|
pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
|
44
65
|
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
@@ -48,7 +69,9 @@ pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
|
|
48
69
|
return false;
|
49
70
|
}
|
50
71
|
|
51
|
-
|
72
|
+
/**
|
73
|
+
* Expect a character to be present and consume it.
|
74
|
+
*/
|
52
75
|
static inline bool
|
53
76
|
pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
|
54
77
|
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
@@ -58,7 +81,9 @@ pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
|
|
58
81
|
return false;
|
59
82
|
}
|
60
83
|
|
61
|
-
|
84
|
+
/**
|
85
|
+
* This advances the cursor to the next instance of the given character.
|
86
|
+
*/
|
62
87
|
static bool
|
63
88
|
pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
|
64
89
|
if (pm_regexp_char_is_eof(parser)) {
|
@@ -74,37 +99,39 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
|
|
74
99
|
return true;
|
75
100
|
}
|
76
101
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
102
|
+
/**
|
103
|
+
* Range quantifiers are a special class of quantifiers that look like
|
104
|
+
*
|
105
|
+
* * {digit}
|
106
|
+
* * {digit,}
|
107
|
+
* * {digit,digit}
|
108
|
+
* * {,digit}
|
109
|
+
*
|
110
|
+
* Unfortunately, if there are any spaces in between, then this just becomes a
|
111
|
+
* regular character match expression and we have to backtrack. So when this
|
112
|
+
* function first starts running, we'll create a "save" point and then attempt
|
113
|
+
* to parse the quantifier. If it fails, we'll restore the save point and
|
114
|
+
* return.
|
115
|
+
*
|
116
|
+
* To properly track everything, we're going to build a little state machine.
|
117
|
+
* It looks something like the following:
|
118
|
+
*
|
119
|
+
* ┌───────┐ ┌─────────┐ ────────────┐
|
120
|
+
* ──── lbrace ───> │ start │ ──── digit ───> │ minimum │ │
|
121
|
+
* └───────┘ └─────────┘ <─── digit ─┘
|
122
|
+
* │ │ │
|
123
|
+
* ┌───────┐ │ │ rbrace
|
124
|
+
* │ comma │ <───── comma ┌──── comma ───────┘ │
|
125
|
+
* └───────┘ V V
|
126
|
+
* │ ┌─────────┐ ┌─────────┐
|
127
|
+
* └── digit ──> │ maximum │ ── rbrace ──> │| final |│
|
128
|
+
* └─────────┘ └─────────┘
|
129
|
+
* │ ^
|
130
|
+
* └─ digit ─┘
|
131
|
+
*
|
132
|
+
* Note that by the time we've hit this function, the lbrace has already been
|
133
|
+
* consumed so we're in the start state.
|
134
|
+
*/
|
108
135
|
static bool
|
109
136
|
pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
|
110
137
|
const uint8_t *savepoint = parser->cursor;
|
@@ -180,14 +207,18 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
|
|
180
207
|
return true;
|
181
208
|
}
|
182
209
|
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
210
|
+
/**
|
211
|
+
* quantifier : star-quantifier
|
212
|
+
* | plus-quantifier
|
213
|
+
* | optional-quantifier
|
214
|
+
* | range-quantifier
|
215
|
+
* | <empty>
|
216
|
+
* ;
|
217
|
+
*/
|
189
218
|
static bool
|
190
219
|
pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
|
220
|
+
if (pm_regexp_char_is_eof(parser)) return true;
|
221
|
+
|
191
222
|
switch (*parser->cursor) {
|
192
223
|
case '*':
|
193
224
|
case '+':
|
@@ -203,8 +234,10 @@ pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
|
|
203
234
|
}
|
204
235
|
}
|
205
236
|
|
206
|
-
|
207
|
-
|
237
|
+
/**
|
238
|
+
* match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
|
239
|
+
* ;
|
240
|
+
*/
|
208
241
|
static bool
|
209
242
|
pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
|
210
243
|
if (!pm_regexp_char_expect(parser, ':')) {
|
@@ -224,8 +257,10 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
|
|
224
257
|
static bool
|
225
258
|
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
|
226
259
|
|
227
|
-
|
228
|
-
|
260
|
+
/**
|
261
|
+
* match-char-set : '[' '^'? (match-range | match-char)* ']'
|
262
|
+
* ;
|
263
|
+
*/
|
229
264
|
static bool
|
230
265
|
pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
|
231
266
|
pm_regexp_char_accept(parser, '^');
|
@@ -249,7 +284,9 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
|
|
249
284
|
return pm_regexp_char_expect(parser, ']');
|
250
285
|
}
|
251
286
|
|
252
|
-
|
287
|
+
/**
|
288
|
+
* A left bracket can either mean a POSIX class or a character set.
|
289
|
+
*/
|
253
290
|
static bool
|
254
291
|
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
|
255
292
|
const uint8_t *reset = parser->cursor;
|
@@ -269,8 +306,10 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
|
|
269
306
|
static bool
|
270
307
|
pm_regexp_parse_expression(pm_regexp_parser_t *parser);
|
271
308
|
|
272
|
-
|
273
|
-
|
309
|
+
/**
|
310
|
+
* These are the states of the options that are configurable on the regular
|
311
|
+
* expression (or from within a group).
|
312
|
+
*/
|
274
313
|
typedef enum {
|
275
314
|
PM_REGEXP_OPTION_STATE_INVALID,
|
276
315
|
PM_REGEXP_OPTION_STATE_TOGGLEABLE,
|
@@ -281,16 +320,22 @@ typedef enum {
|
|
281
320
|
|
282
321
|
// These are the options that are configurable on the regular expression (or
|
283
322
|
// from within a group).
|
323
|
+
|
284
324
|
#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
|
285
325
|
#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
|
286
326
|
#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
|
287
327
|
|
288
|
-
|
328
|
+
/**
|
329
|
+
* This is the set of options that are configurable on the regular expression.
|
330
|
+
*/
|
289
331
|
typedef struct {
|
332
|
+
/** The current state of each option. */
|
290
333
|
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
|
291
334
|
} pm_regexp_options_t;
|
292
335
|
|
293
|
-
|
336
|
+
/**
|
337
|
+
* Initialize a new set of options to their default values.
|
338
|
+
*/
|
294
339
|
static void
|
295
340
|
pm_regexp_options_init(pm_regexp_options_t *options) {
|
296
341
|
memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
|
@@ -302,8 +347,10 @@ pm_regexp_options_init(pm_regexp_options_t *options) {
|
|
302
347
|
options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
303
348
|
}
|
304
349
|
|
305
|
-
|
306
|
-
|
350
|
+
/**
|
351
|
+
* Attempt to add the given option to the set of options. Returns true if it was
|
352
|
+
* added, false if it was already present.
|
353
|
+
*/
|
307
354
|
static bool
|
308
355
|
pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
|
309
356
|
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
@@ -325,8 +372,10 @@ pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
|
|
325
372
|
return false;
|
326
373
|
}
|
327
374
|
|
328
|
-
|
329
|
-
|
375
|
+
/**
|
376
|
+
* Attempt to remove the given option from the set of options. Returns true if
|
377
|
+
* it was removed, false if it was already absent.
|
378
|
+
*/
|
330
379
|
static bool
|
331
380
|
pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
|
332
381
|
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
@@ -347,26 +396,27 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
|
|
347
396
|
return false;
|
348
397
|
}
|
349
398
|
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
399
|
+
/**
|
400
|
+
* Groups can have quite a few different patterns for syntax. They basically
|
401
|
+
* just wrap a set of expressions, but they can potentially have options after a
|
402
|
+
* question mark. If there _isn't_ a question mark, then it's just a set of
|
403
|
+
* expressions. If there _is_, then here are the options:
|
404
|
+
*
|
405
|
+
* * (?#...) - inline comments
|
406
|
+
* * (?:subexp) - non-capturing group
|
407
|
+
* * (?=subexp) - positive lookahead
|
408
|
+
* * (?!subexp) - negative lookahead
|
409
|
+
* * (?>subexp) - atomic group
|
410
|
+
* * (?~subexp) - absence operator
|
411
|
+
* * (?<=subexp) - positive lookbehind
|
412
|
+
* * (?<!subexp) - negative lookbehind
|
413
|
+
* * (?<name>subexp) - named capturing group
|
414
|
+
* * (?'name'subexp) - named capturing group
|
415
|
+
* * (?(cond)yes-subexp) - conditional expression
|
416
|
+
* * (?(cond)yes-subexp|no-subexp) - conditional expression
|
417
|
+
* * (?imxdau-imx) - turn on and off configuration
|
418
|
+
* * (?imxdau-imx:subexp) - turn on and off configuration for an expression
|
419
|
+
*/
|
370
420
|
static bool
|
371
421
|
pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
372
422
|
// First, parse any options for the group.
|
@@ -501,16 +551,18 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
501
551
|
return pm_regexp_char_expect(parser, ')');
|
502
552
|
}
|
503
553
|
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
554
|
+
/**
|
555
|
+
* item : anchor
|
556
|
+
* | match-posix-class
|
557
|
+
* | match-char-set
|
558
|
+
* | match-char-class
|
559
|
+
* | match-char-prop
|
560
|
+
* | match-char
|
561
|
+
* | match-any
|
562
|
+
* | group
|
563
|
+
* | quantified
|
564
|
+
* ;
|
565
|
+
*/
|
514
566
|
static bool
|
515
567
|
pm_regexp_parse_item(pm_regexp_parser_t *parser) {
|
516
568
|
switch (*parser->cursor++) {
|
@@ -531,8 +583,10 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
|
|
531
583
|
}
|
532
584
|
}
|
533
585
|
|
534
|
-
|
535
|
-
|
586
|
+
/**
|
587
|
+
* expression : item+
|
588
|
+
* ;
|
589
|
+
*/
|
536
590
|
static bool
|
537
591
|
pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
|
538
592
|
if (!pm_regexp_parse_item(parser)) {
|
@@ -548,10 +602,12 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
|
|
548
602
|
return true;
|
549
603
|
}
|
550
604
|
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
605
|
+
/**
|
606
|
+
* pattern : EOF
|
607
|
+
* | expression EOF
|
608
|
+
* | expression '|' pattern
|
609
|
+
* ;
|
610
|
+
*/
|
555
611
|
static bool
|
556
612
|
pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
|
557
613
|
return (
|
@@ -570,8 +626,10 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
|
|
570
626
|
);
|
571
627
|
}
|
572
628
|
|
573
|
-
|
574
|
-
|
629
|
+
/**
|
630
|
+
* Parse a regular expression and extract the names of all of the named capture
|
631
|
+
* groups.
|
632
|
+
*/
|
575
633
|
PRISM_EXPORTED_FUNCTION bool
|
576
634
|
pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
|
577
635
|
pm_regexp_parser_t parser;
|