c_lexer 2.5.1.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
# Build configuration for the `lexer` C extension.
require 'mkmf'

# Warning policy first, then language standard / tuning / optimisation flags.
[
  ' -Wall -Werror -Wno-declaration-after-statement ',
  ' --std=c99 -march=native -mtune=native -O2 '
].each { |flags| $CFLAGS << flags }

create_makefile 'lexer'
data/ext/lexer/lexer.h ADDED
@@ -0,0 +1,308 @@
1
+ #ifndef LEXER_H
2
+ #define LEXER_H
3
+
4
+ typedef struct lexer_state lexer_state;
5
+ #include "literal.h"
6
+ define_stack_type(lit_stack, literal, {0});
7
+
8
+ struct lexer_state {
9
+ uint cs; /* DFA state */
10
+ long p; /* stream position */
11
+ long pe; /* end-of-stream position */
12
+
13
+ int *cs_stack;
14
+ int cs_stack_top;
15
+ int cs_stack_size;
16
+
17
+ VALUE token_queue;
18
+ VALUE static_env;
19
+
20
+ VALUE source_buffer;
21
+ VALUE source; /* source code string */
22
+ VALUE source_pts; /* source as a codepoint array */
23
+ VALUE encoding;
24
+
25
+ VALUE tokens;
26
+ VALUE comments;
27
+
28
+ stack_state cond;
29
+ stack_state cmdarg;
30
+ ss_stack cond_stack; /* for saving 'cond' */
31
+ ss_stack cmdarg_stack; /* for saving 'cmdarg' */
32
+
33
+ VALUE lambda_stack;
34
+ int paren_nest;
35
+ lit_stack literal_stack;
36
+
37
+ int version;
38
+ int in_kwarg;
39
+ int force_utf32;
40
+
41
+ VALUE diagnostics;
42
+
43
+ long newline_s; /* position of last newline encountered */
44
+ long eq_begin_s;
45
+ long herebody_s;
46
+ long escape_s;
47
+
48
+ int dedent_level;
49
+
50
+ VALUE escape;
51
+
52
+ uint cs_before_block_comment;
53
+ };
54
+
55
+ static void lexer_mark(void*);
56
+ static void lexer_dealloc(void*);
57
+ static VALUE lexer_reset(int, VALUE*, VALUE);
58
+
59
+ static void emit_token(lexer_state*, VALUE, VALUE, long, long);
60
+ static void emit_comment(lexer_state*, long, long);
61
+ static void emit_do(lexer_state*, int, long, long);
62
+
63
+ static VALUE tok(lexer_state*, long, long);
64
+ static VALUE range(lexer_state*, long, long);
65
+ static void diagnostic(lexer_state*, VALUE, VALUE, VALUE, VALUE, VALUE);
66
+ static int get_codepoint(lexer_state*, long);
67
+ static int arg_or_cmdarg(int);
68
+ static int is_nthref(VALUE);
69
+ static int is_backref(VALUE);
70
+ static int is_capitalized(VALUE);
71
+ static int is_regexp_metachar(VALUE);
72
+ static int eof_codepoint(int);
73
+ static VALUE find_unknown_options(VALUE);
74
+ static int bad_cvar_name(VALUE);
75
+ static int bad_ivar_name(VALUE);
76
+ static int find_8_or_9(VALUE str);
77
+ static void emit_int(lexer_state*, VALUE, long, long);
78
+ static void emit_rational(lexer_state*, VALUE, long, long);
79
+ static void emit_complex(lexer_state*, VALUE, long, long);
80
+ static void emit_complex_rational(lexer_state*, VALUE, long, long);
81
+ static void emit_float(lexer_state*, VALUE, long, long);
82
+ static void emit_complex_float(lexer_state*, VALUE, long, long);
83
+ static void emit_int_followed_by_if(lexer_state*, VALUE, long, long);
84
+ static void emit_int_followed_by_rescue(lexer_state*, VALUE, long, long);
85
+ static void emit_float_followed_by_if(lexer_state*, VALUE, long, long);
86
+ static void emit_float_followed_by_rescue(lexer_state*, VALUE, long, long);
87
+ static int push_literal(lexer_state*, VALUE, VALUE, long, long, int, int, int);
88
+ static int pop_literal(lexer_state*);
89
+ static VALUE array_last(VALUE);
90
+ static VALUE unescape_char(char);
91
+ static VALUE escape_char(VALUE);
92
+ static inline int str_start_with_p(VALUE, const char*);
93
+ static inline int str_end_with_p(VALUE, const char*);
94
+ static inline void force_encoding(VALUE, VALUE);
95
+
96
/* Emits a token of `type` covering [ts, te); expects `state`, `ts` and `te`
 * to be in scope where the macro is expanded. */
#define emit(type) \
  emit_token(state, type, tok(state, ts, te), ts, te)
97
+
98
/* Defines `lexer_get_<name>(self)`, returning the VALUE field `name` of the
 * lexer_state wrapped by `self`. */
#define def_lexer_attr_reader(name)                     \
  static VALUE lexer_get_ ## name(VALUE self) {         \
    lexer_state *state;                                 \
    Data_Get_Struct(self, lexer_state, state);          \
    return state->name; }

/* Defines `lexer_set_<name>(self, val)`, storing `val` in the field `name`
 * and returning it.  The last line deliberately has no trailing backslash:
 * a dangling continuation would silently pull the next line into the macro. */
#define def_lexer_attr_writer(name)                          \
  static VALUE lexer_set_ ## name(VALUE self, VALUE val) {   \
    lexer_state *state;                                      \
    Data_Get_Struct(self, lexer_state, state);               \
    return state->name = val; }

/* Defines both the reader and the writer for `name`. */
#define def_lexer_attribute(name) \
  def_lexer_attr_reader(name)     \
  def_lexer_attr_writer(name)
113
+
114
/* Ruby Integer zero. */
#define Qzero INT2NUM(0)

/*
 * Stores the Symbol whose text is the variable's own name into the global
 * VALUE `name` and registers its address with the GC.
 *
 * Wrapped in do { } while (0) so the two statements behave as one statement
 * (e.g. in an unbraced `if`); invoke as `init_symbol(foo);`.
 */
#define init_symbol(name)                \
  do {                                   \
    name = ID2SYM(rb_intern(#name));     \
    rb_gc_register_address(&name);       \
  } while (0)
119
+
120
+ VALUE k__ENCODING__;
121
+ VALUE k__FILE__;
122
+ VALUE k__LINE__;
123
+ VALUE kALIAS;
124
+ VALUE kAND;
125
+ VALUE kBEGIN;
126
+ VALUE klBEGIN;
127
+ VALUE kBREAK;
128
+ VALUE kCASE;
129
+ VALUE kCLASS;
130
+ VALUE kDEF;
131
+ VALUE kDEFINED;
132
+ VALUE kDO;
133
+ VALUE kDO_BLOCK;
134
+ VALUE kDO_COND;
135
+ VALUE kDO_LAMBDA;
136
+ VALUE kELSE;
137
+ VALUE kELSIF;
138
+ VALUE kEND;
139
+ VALUE klEND;
140
+ VALUE kENSURE;
141
+ VALUE kFALSE;
142
+ VALUE kFOR;
143
+ VALUE kIF;
144
+ VALUE kIF_MOD;
145
+ VALUE kIN;
146
+ VALUE kMODULE;
147
+ VALUE kNEXT;
148
+ VALUE kNIL;
149
+ VALUE kNOT;
150
+ VALUE kOR;
151
+ VALUE kREDO;
152
+ VALUE kRESCUE;
153
+ VALUE kRESCUE_MOD;
154
+ VALUE kRETRY;
155
+ VALUE kRETURN;
156
+ VALUE kSELF;
157
+ VALUE kSUPER;
158
+ VALUE kTHEN;
159
+ VALUE kTRUE;
160
+ VALUE kUNDEF;
161
+ VALUE kUNLESS;
162
+ VALUE kUNLESS_MOD;
163
+ VALUE kUNTIL;
164
+ VALUE kUNTIL_MOD;
165
+ VALUE kWHEN;
166
+ VALUE kWHILE;
167
+ VALUE kWHILE_MOD;
168
+ VALUE kYIELD;
169
+
170
+ VALUE tAMPER;
171
+ VALUE tAMPER2;
172
+ VALUE tANDDOT;
173
+ VALUE tANDOP;
174
+ VALUE tAREF;
175
+ VALUE tASET;
176
+ VALUE tASSOC;
177
+ VALUE tBACK_REF;
178
+ VALUE tBACK_REF2;
179
+ VALUE tBANG;
180
+ VALUE tCARET;
181
+ VALUE tCHARACTER;
182
+ VALUE tCMP;
183
+ VALUE tCOLON;
184
+ VALUE tCOLON2;
185
+ VALUE tCOLON3;
186
+ VALUE tCOMMA;
187
+ VALUE tCOMMENT;
188
+ VALUE tCONSTANT;
189
+ VALUE tCVAR;
190
+ VALUE tDIVIDE;
191
+ VALUE tDOT;
192
+ VALUE tDOT2;
193
+ VALUE tDOT3;
194
+ VALUE tDSTAR;
195
+ VALUE tEH;
196
+ VALUE tEQ;
197
+ VALUE tEQL;
198
+ VALUE tEQQ;
199
+ VALUE tFID;
200
+ VALUE tFLOAT;
201
+ VALUE tGEQ;
202
+ VALUE tGT;
203
+ VALUE tGVAR;
204
+ VALUE tIDENTIFIER;
205
+ VALUE tIMAGINARY;
206
+ VALUE tINTEGER;
207
+ VALUE tIVAR;
208
+ VALUE tLABEL;
209
+ VALUE tLABEL_END;
210
+ VALUE tLAMBDA;
211
+ VALUE tLAMBEG;
212
+ VALUE tLBRACE;
213
+ VALUE tLBRACE_ARG;
214
+ VALUE tLBRACK;
215
+ VALUE tLBRACK2;
216
+ VALUE tLCURLY;
217
+ VALUE tLEQ;
218
+ VALUE tLPAREN;
219
+ VALUE tLPAREN_ARG;
220
+ VALUE tLPAREN2;
221
+ VALUE tLSHFT;
222
+ VALUE tLT;
223
+ VALUE tMATCH;
224
+ VALUE tMINUS;
225
+ VALUE tNEQ;
226
+ VALUE tNL;
227
+ VALUE tNMATCH;
228
+ VALUE tNTH_REF;
229
+ VALUE tOP_ASGN;
230
+ VALUE tOROP;
231
+ VALUE tPERCENT;
232
+ VALUE tPIPE;
233
+ VALUE tPLUS;
234
+ VALUE tPOW;
235
+ VALUE tQWORDS_BEG;
236
+ VALUE tQSYMBOLS_BEG;
237
+ VALUE tRATIONAL;
238
+ VALUE tRBRACK;
239
+ VALUE tRCURLY;
240
+ VALUE tREGEXP_BEG;
241
+ VALUE tREGEXP_OPT;
242
+ VALUE tRPAREN;
243
+ VALUE tRSHFT;
244
+ VALUE tSEMI;
245
+ VALUE tSPACE;
246
+ VALUE tSTAR;
247
+ VALUE tSTAR2;
248
+ VALUE tSTRING;
249
+ VALUE tSTRING_BEG;
250
+ VALUE tSTRING_CONTENT;
251
+ VALUE tSTRING_DBEG;
252
+ VALUE tSTRING_DEND;
253
+ VALUE tSTRING_DVAR;
254
+ VALUE tSTRING_END;
255
+ VALUE tSYMBEG;
256
+ VALUE tSYMBOL;
257
+ VALUE tSYMBOLS_BEG;
258
+ VALUE tTILDE;
259
+ VALUE tUMINUS;
260
+ VALUE tUNARY_NUM;
261
+ VALUE tUPLUS;
262
+ VALUE tWORDS_BEG;
263
+ VALUE tXSTRING_BEG;
264
+
265
+ VALUE comment_klass;
266
+ VALUE diagnostic_klass;
267
+ VALUE range_klass;
268
+
269
+ VALUE severity_error;
270
+ VALUE fatal;
271
+ VALUE warning;
272
+
273
+ VALUE ambiguous_literal;
274
+ VALUE ambiguous_prefix;
275
+ VALUE bare_backslash;
276
+ VALUE character;
277
+ VALUE cvar_name;
278
+ VALUE embedded_document;
279
+ VALUE empty_numeric;
280
+ VALUE escape_eof;
281
+ VALUE incomplete_escape;
282
+ VALUE invalid_escape;
283
+ VALUE invalid_escape_use;
284
+ VALUE invalid_hex_escape;
285
+ VALUE invalid_octal;
286
+ VALUE invalid_unicode_escape;
287
+ VALUE ivar_name;
288
+ VALUE heredoc_id_ends_with_nl;
289
+ VALUE heredoc_id_has_newline;
290
+ VALUE no_dot_digit_literal;
291
+ VALUE prefix;
292
+ VALUE regexp_options;
293
+ VALUE string_eof;
294
+ VALUE trailing_in_number;
295
+ VALUE unexpected;
296
+ VALUE unexpected_percent_str;
297
+ VALUE unicode_point_too_large;
298
+ VALUE unterminated_unicode;
299
+
300
+ VALUE empty_array;
301
+ VALUE blank_string;
302
+ VALUE newline;
303
+ VALUE escaped_newline;
304
+ VALUE utf8_encoding;
305
+ VALUE cr_then_anything_to_eol;
306
+ VALUE crs_to_eol;
307
+
308
+ #endif
@@ -0,0 +1,2522 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #include <ruby/re.h>
4
+
5
+ #include <stdint.h>
6
+ #include <stdio.h>
7
+
8
+ #include "stack.h"
9
+ #include "stack_state.h"
10
+ #include "lexer.h"
11
+
12
/* Declares `lexer_state *s` and points it at the struct wrapped by the Ruby
 * object `l`.  Expands to a declaration plus a statement (with its own
 * trailing semicolon), so it is only usable at block scope. */
#define INIT_LEXER_STATE(l, s) lexer_state *s; Data_Get_Struct(l, lexer_state, s);

/* True when a static environment is set and it answers `declared?(name)`
 * truthily.  Requires `state` in scope.  The expansion and its argument are
 * parenthesised so the macro composes safely with `!`, `||`, `?:` etc. */
#define STATIC_ENV_DECLARED(name) \
  (state->static_env != Qnil && \
   RTEST(rb_funcall(state->static_env, rb_intern("declared?"), 1, (name))))
15
+
16
+ #include "cmdarg.h"
17
+ #include "cond.h"
18
+
19
+ #include "literal/methods.h"
20
+ #include "emit_tables.h"
21
+
22
+ %%machine lex;
23
+ %%write data;
24
+
25
+ static VALUE lexer_alloc(VALUE klass)
26
+ {
27
+ lexer_state *state = xmalloc(sizeof(lexer_state));
28
+
29
+ state->cs = state->p = state->pe = 0;
30
+ state->paren_nest = 0;
31
+
32
+ state->cs_stack = xmalloc(4 * sizeof(int));
33
+ state->cs_stack_top = 0;
34
+ state->cs_stack_size = 4;
35
+
36
+ state->source_buffer = Qnil;
37
+ state->source = Qnil;
38
+ state->source_pts = Qnil;
39
+ state->token_queue = Qnil;
40
+ state->static_env = Qnil;
41
+ state->lambda_stack = Qnil;
42
+ state->diagnostics = Qnil;
43
+ state->tokens = Qnil;
44
+ state->comments = Qnil;
45
+ state->encoding = Qnil;
46
+ state->escape = Qnil;
47
+
48
+ ss_stack_init(&state->cond_stack);
49
+ ss_stack_init(&state->cmdarg_stack);
50
+ lit_stack_init(&state->literal_stack);
51
+
52
+ return Data_Wrap_Struct(klass, lexer_mark, lexer_dealloc, state);
53
+ }
54
+
55
+ static void lexer_mark(void *ptr)
56
+ {
57
+ lexer_state *state = ptr;
58
+ rb_gc_mark(state->source_buffer);
59
+ rb_gc_mark(state->source);
60
+ rb_gc_mark(state->source_pts);
61
+ rb_gc_mark(state->token_queue);
62
+ rb_gc_mark(state->static_env);
63
+ rb_gc_mark(state->lambda_stack);
64
+ rb_gc_mark(state->diagnostics);
65
+ rb_gc_mark(state->tokens);
66
+ rb_gc_mark(state->comments);
67
+ rb_gc_mark(state->encoding);
68
+ rb_gc_mark(state->escape);
69
+
70
+ for (literal *lit = state->literal_stack.bottom; lit < state->literal_stack.top; lit++) {
71
+ rb_gc_mark(lit->buffer);
72
+ rb_gc_mark(lit->start_tok);
73
+ rb_gc_mark(lit->start_delim);
74
+ rb_gc_mark(lit->end_delim);
75
+ rb_gc_mark(lit->delimiter);
76
+ }
77
+ }
78
+
79
+ static void lexer_dealloc(void *ptr)
80
+ {
81
+ lexer_state *state = ptr;
82
+ ss_stack_dealloc(&state->cond_stack);
83
+ ss_stack_dealloc(&state->cmdarg_stack);
84
+ lit_stack_dealloc(&state->literal_stack);
85
+ xfree(ptr);
86
+ }
87
+
88
+ static VALUE lexer_initialize(VALUE self, VALUE version)
89
+ {
90
+ INIT_LEXER_STATE(self, state);
91
+
92
+ state->version = NUM2INT(version);
93
+
94
+ return lexer_reset(0, NULL, self);
95
+ }
96
+
97
+ static VALUE lexer_reset(int argc, VALUE *argv, VALUE self)
98
+ {
99
+ INIT_LEXER_STATE(self, state);
100
+
101
+ VALUE reset_state;
102
+ rb_scan_args(argc, argv, "01", &reset_state);
103
+ if (NIL_P(reset_state))
104
+ reset_state = Qtrue;
105
+
106
+ if (RTEST(reset_state)) {
107
+ state->cs = lex_en_line_begin;
108
+
109
+ state->cond = 0;
110
+ state->cmdarg = 0;
111
+ ss_stack_clear(&state->cond_stack);
112
+ ss_stack_clear(&state->cmdarg_stack);
113
+ }
114
+
115
+ state->force_utf32 = 0;
116
+
117
+ state->source = Qnil;
118
+ state->source_pts = Qnil;
119
+ state->encoding = Qnil;
120
+
121
+ state->p = 0;
122
+ // @ts is a local variable
123
+ // @te is a local variable
124
+ // @act is a local variable
125
+
126
+ // @stack is handled on prepush
127
+ // @top is handled on prepush
128
+
129
+ // Lexer state
130
+ state->token_queue = rb_ary_new();
131
+ lit_stack_clear(&state->literal_stack);
132
+
133
+ state->eq_begin_s = 0;
134
+ // @sharp_s is a local variable
135
+
136
+ state->newline_s = 0;
137
+
138
+ // @num_base is a local variable
139
+ // @num_digits_s is a local variable
140
+ // @num_suffix_s is a local variable
141
+ // @num_xfrm is a local variable
142
+
143
+ state->escape_s = 0;
144
+ state->escape = Qnil;
145
+
146
+ state->herebody_s = 0;
147
+
148
+ state->paren_nest = 0;
149
+ state->lambda_stack = rb_ary_new();
150
+
151
+ state->dedent_level = -1;
152
+
153
+ // @command_state is a local variable
154
+
155
+ state->in_kwarg = 0;
156
+
157
+ state->cs_before_block_comment = lex_en_line_begin;
158
+
159
+ return self;
160
+ }
161
+
162
+ static VALUE lexer_set_source_buffer(VALUE self, VALUE buffer)
163
+ {
164
+ INIT_LEXER_STATE(self, state);
165
+
166
+ state->source_buffer = buffer;
167
+
168
+ if (RTEST(buffer)) {
169
+ state->source = rb_funcall(buffer, rb_intern("source"), 0);
170
+ state->encoding = rb_obj_encoding(state->source);
171
+
172
+ if (state->encoding == utf8_encoding) {
173
+ state->source_pts = rb_funcall(state->source, rb_intern("unpack"), 1, rb_str_new2("U*"));
174
+ } else {
175
+ state->source_pts = rb_funcall(state->source, rb_intern("unpack"), 1, rb_str_new2("C*"));
176
+ }
177
+
178
+ state->pe = RARRAY_LEN(state->source_pts) + 2; /* pretend there is a null at the end */
179
+
180
+ VALUE source_pt = rb_ary_entry(state->source_pts, 0);
181
+ if (source_pt != Qnil && NUM2INT(source_pt) == 0xfeff) {
182
+ state->p = 1;
183
+ }
184
+ } else {
185
+ state->source = Qnil;
186
+ state->source_pts = Qnil;
187
+ state->encoding = Qnil;
188
+ state->pe = 0;
189
+ }
190
+
191
+ return self;
192
+ }
193
+
194
+ static VALUE lexer_get_state(VALUE self)
195
+ {
196
+ INIT_LEXER_STATE(self, state);
197
+
198
+ switch (state->cs) {
199
+ case lex_en_line_begin: return ID2SYM(rb_intern("line_begin"));
200
+ case lex_en_expr_dot: return ID2SYM(rb_intern("expr_dot"));
201
+ case lex_en_expr_fname: return ID2SYM(rb_intern("expr_fname"));
202
+ case lex_en_expr_value: return ID2SYM(rb_intern("expr_value"));
203
+ case lex_en_expr_beg: return ID2SYM(rb_intern("expr_beg"));
204
+ case lex_en_expr_mid: return ID2SYM(rb_intern("expr_mid"));
205
+ case lex_en_expr_arg: return ID2SYM(rb_intern("expr_arg"));
206
+ case lex_en_expr_cmdarg: return ID2SYM(rb_intern("expr_cmdarg"));
207
+ case lex_en_expr_end: return ID2SYM(rb_intern("expr_end"));
208
+ case lex_en_expr_endarg: return ID2SYM(rb_intern("expr_endarg"));
209
+ case lex_en_expr_endfn: return ID2SYM(rb_intern("expr_endfn"));
210
+ case lex_en_expr_labelarg: return ID2SYM(rb_intern("expr_labelarg"));
211
+
212
+ case lex_en_interp_string: return ID2SYM(rb_intern("interp_string"));
213
+ case lex_en_interp_words: return ID2SYM(rb_intern("interp_words"));
214
+ case lex_en_plain_string: return ID2SYM(rb_intern("plain_string"));
215
+ case lex_en_plain_words: return ID2SYM(rb_intern("plain_words"));
216
+ default:
217
+ rb_raise(rb_eRuntimeError, "Lexer state variable is borked");
218
+ }
219
+ }
220
+
221
+ static VALUE lexer_set_state(VALUE self, VALUE state_sym)
222
+ {
223
+ INIT_LEXER_STATE(self, state);
224
+ const char *state_name = rb_id2name(SYM2ID(state_sym));
225
+
226
+ if (strcmp(state_name, "line_begin") == 0)
227
+ state->cs = lex_en_line_begin;
228
+ else if (strcmp(state_name, "expr_dot") == 0)
229
+ state->cs = lex_en_expr_dot;
230
+ else if (strcmp(state_name, "expr_fname") == 0)
231
+ state->cs = lex_en_expr_fname;
232
+ else if (strcmp(state_name, "expr_value") == 0)
233
+ state->cs = lex_en_expr_value;
234
+ else if (strcmp(state_name, "expr_beg") == 0)
235
+ state->cs = lex_en_expr_beg;
236
+ else if (strcmp(state_name, "expr_mid") == 0)
237
+ state->cs = lex_en_expr_mid;
238
+ else if (strcmp(state_name, "expr_arg") == 0)
239
+ state->cs = lex_en_expr_arg;
240
+ else if (strcmp(state_name, "expr_cmdarg") == 0)
241
+ state->cs = lex_en_expr_cmdarg;
242
+ else if (strcmp(state_name, "expr_end") == 0)
243
+ state->cs = lex_en_expr_end;
244
+ else if (strcmp(state_name, "expr_endarg") == 0)
245
+ state->cs = lex_en_expr_endarg;
246
+ else if (strcmp(state_name, "expr_endfn") == 0)
247
+ state->cs = lex_en_expr_endfn;
248
+ else if (strcmp(state_name, "expr_labelarg") == 0)
249
+ state->cs = lex_en_expr_labelarg;
250
+
251
+ else if (strcmp(state_name, "interp_string") == 0)
252
+ state->cs = lex_en_interp_string;
253
+ else if (strcmp(state_name, "interp_words") == 0)
254
+ state->cs = lex_en_interp_words;
255
+ else if (strcmp(state_name, "plain_string") == 0)
256
+ state->cs = lex_en_plain_string;
257
+ else if (strcmp(state_name, "plain_words") == 0)
258
+ state->cs = lex_en_plain_words;
259
+ else
260
+ rb_raise(rb_eArgError, "Invalid state: %s", state_name);
261
+
262
+ return state_sym;
263
+ }
264
+
265
+ static VALUE lexer_push_cmdarg(VALUE self)
266
+ {
267
+ INIT_LEXER_STATE(self, state);
268
+ ss_stack_push(&state->cmdarg_stack, state->cmdarg);
269
+ state->cmdarg = 0;
270
+ return Qnil;
271
+ }
272
+
273
+ static VALUE lexer_pop_cmdarg(VALUE self)
274
+ {
275
+ INIT_LEXER_STATE(self, state);
276
+ state->cmdarg = ss_stack_pop(&state->cmdarg_stack);
277
+ return Qnil;
278
+ }
279
+
280
+ static VALUE lexer_push_cond(VALUE self)
281
+ {
282
+ INIT_LEXER_STATE(self, state);
283
+ ss_stack_push(&state->cond_stack, state->cond);
284
+ state->cond = 0;
285
+ return Qnil;
286
+ }
287
+
288
+ static VALUE lexer_pop_cond(VALUE self)
289
+ {
290
+ INIT_LEXER_STATE(self, state);
291
+ state->cond = ss_stack_pop(&state->cond_stack);
292
+ return Qnil;
293
+ }
294
+
295
+ static VALUE lexer_get_in_kwarg(VALUE self)
296
+ {
297
+ INIT_LEXER_STATE(self, state);
298
+ return state->in_kwarg ? Qtrue : Qfalse;
299
+ }
300
+
301
+ static VALUE lexer_set_in_kwarg(VALUE self, VALUE val)
302
+ {
303
+ INIT_LEXER_STATE(self, state);
304
+ state->in_kwarg = RTEST(val) ? 1 : 0;
305
+ return val;
306
+ }
307
+
308
+ static VALUE lexer_get_dedent_level(VALUE self)
309
+ {
310
+ INIT_LEXER_STATE(self, state);
311
+ int result = state->dedent_level;
312
+ state->dedent_level = -1;
313
+ if (result == -1)
314
+ return Qnil;
315
+ else
316
+ return INT2NUM(result);
317
+ }
318
+
319
+ static VALUE lexer_set_force_utf32(VALUE self, VALUE arg)
320
+ {
321
+ return arg;
322
+ }
323
+
324
+ static VALUE lexer_advance(VALUE self)
325
+ {
326
+ int cs, act = 0, top, command_state;
327
+ int num_base = 0;
328
+ long p, pe, eof, ts = 0, te = 0, tm = 0, sharp_s = 0, heredoc_e = 0;
329
+ long num_digits_s = 0, num_suffix_s = 0;
330
+ void (*num_xfrm)(lexer_state*, VALUE, long, long); /* numeric suffix-induced transformation */
331
+ lexer_state *state;
332
+ int *stack;
333
+ VALUE ident_tok = Qnil;
334
+ long ident_ts = 0, ident_te = 0;
335
+ long numeric_s = 0;
336
+ Data_Get_Struct(self, lexer_state, state);
337
+
338
+ if (RARRAY_LEN(state->token_queue) > 0)
339
+ return rb_ary_shift(state->token_queue);
340
+
341
+ cs = state->cs;
342
+ p = state->p;
343
+ pe = eof = state->pe;
344
+ stack = state->cs_stack;
345
+ top = state->cs_stack_top;
346
+
347
+ command_state = (cs == lex_en_expr_value || cs == lex_en_line_begin);
348
+
349
+ %%{
350
+ write exec;
351
+ }%%
352
+
353
+ state->p = p;
354
+ state->cs = cs;
355
+ state->cs_stack_top = top;
356
+
357
+ if (RARRAY_LEN(state->token_queue) > 0) {
358
+ return rb_ary_shift(state->token_queue);
359
+ } else if (cs == lex_error) {
360
+ VALUE info = rb_ary_new3(2, rb_str_new2("$error"), range(state, p - 1, p));
361
+ VALUE token = rb_ary_new3(2, Qfalse, info);
362
+ return token;
363
+ } else {
364
+ VALUE info = rb_ary_new3(2, rb_str_new2("$eof"), range(state, eof - 2, eof - 2));
365
+ VALUE token = rb_ary_new3(2, Qfalse, info);
366
+ return token;
367
+ }
368
+ }
369
+
370
+ static inline void force_encoding(VALUE str, VALUE enc)
371
+ {
372
+ rb_enc_associate(str, rb_to_encoding(enc));
373
+ }
374
+
375
+ static void emit_token(lexer_state *state, VALUE type, VALUE value, long start, long end)
376
+ {
377
+ VALUE info = rb_ary_new3(2, value, range(state, start, end));
378
+ VALUE token = rb_ary_new3(2, type, info);
379
+
380
+ rb_ary_push(state->token_queue, token);
381
+
382
+ if (state->tokens != Qnil)
383
+ rb_ary_push(state->tokens, token);
384
+ }
385
+
386
+ static void emit_comment(lexer_state *state, long start, long end)
387
+ {
388
+ VALUE rng = Qnil;
389
+
390
+ if (state->tokens != Qnil) {
391
+ rng = range(state, start, end);
392
+
393
+ VALUE info = rb_ary_new3(2, tok(state, start, end), rng);
394
+ VALUE token = rb_ary_new3(2, tCOMMENT, info);
395
+ rb_ary_push(state->tokens, token);
396
+ }
397
+
398
+ if (state->comments != Qnil) {
399
+ if (rng == Qnil)
400
+ rng = range(state, start, end);
401
+ VALUE comment = rb_class_new_instance(1, &rng, comment_klass);
402
+ rb_ary_push(state->comments, comment);
403
+ }
404
+ }
405
+
406
+ static void emit_do(lexer_state *state, int do_block, long ts, long te)
407
+ {
408
+ if (stack_state_active(&state->cond))
409
+ emit(kDO_COND);
410
+ else if (stack_state_active(&state->cmdarg) || do_block)
411
+ emit(kDO_BLOCK);
412
+ else
413
+ emit(kDO);
414
+ }
415
+
416
+ static VALUE tok(lexer_state *state, long start, long end)
417
+ {
418
+ return rb_str_substr(state->source, start, end - start);
419
+ }
420
+
421
+ static VALUE range(lexer_state *state, long start, long end)
422
+ {
423
+ VALUE args[3];
424
+ args[0] = state->source_buffer;
425
+ args[1] = INT2NUM(start);
426
+ args[2] = INT2NUM(end);
427
+ return rb_class_new_instance(3, args, range_klass);
428
+ }
429
+
430
+ static void diagnostic(lexer_state *state, VALUE type, VALUE reason,
431
+ VALUE arguments, VALUE loc, VALUE hilights)
432
+ {
433
+ VALUE args[5];
434
+ args[0] = type;
435
+ args[1] = reason;
436
+ args[2] = arguments;
437
+ args[3] = loc;
438
+ args[4] = hilights;
439
+ VALUE diagnostic = rb_class_new_instance(5, args, diagnostic_klass);
440
+ rb_funcall(state->diagnostics, rb_intern("process"), 1, diagnostic);
441
+ }
442
+
443
+ static int get_codepoint(lexer_state *state, long p)
444
+ {
445
+ if (p >= RARRAY_LEN(state->source_pts))
446
+ return 0;
447
+ else
448
+ return NUM2INT(rb_ary_entry(state->source_pts, p));
449
+ }
450
+
451
+ static int arg_or_cmdarg(int command_state)
452
+ {
453
+ if (command_state) {
454
+ return lex_en_expr_cmdarg;
455
+ } else {
456
+ return lex_en_expr_arg;
457
+ }
458
+ }
459
+
460
+ static int is_nthref(VALUE str)
461
+ {
462
+ char c;
463
+ char *p = RSTRING_PTR(str);
464
+
465
+ if (*p++ != '$') return 0;
466
+
467
+ c = *p++;
468
+ if (c < '1' || c > '9') return 0;
469
+
470
+ while ((c = *p++)) {
471
+ if (c < '0' || c > '9') return 0;
472
+ }
473
+
474
+ return 1;
475
+ }
476
+
477
+ static int is_backref(VALUE str)
478
+ {
479
+ char c;
480
+ char *p = RSTRING_PTR(str);
481
+
482
+ if (*p++ != '$') return 0;
483
+
484
+ c = *p++;
485
+ if (c != '&' && c != '`' && c != '\'' && c != '+') return 0;
486
+
487
+ return *p == '\0'; /* are we at end of string? */
488
+ }
489
+
490
+ static int is_capitalized(VALUE str)
491
+ {
492
+ char *p = RSTRING_PTR(str);
493
+ return *p >= 'A' && *p <= 'Z';
494
+ }
495
+
496
+ static int is_regexp_metachar(VALUE str)
497
+ {
498
+ char c = *RSTRING_PTR(str);
499
+ return c == '\\' || c == '$' || c == '(' || c == ')' || c == '*' ||
500
+ c == '+' || c == '.' || c == '<' || c == '>' || c == '?' ||
501
+ c == '[' || c == ']' || c == '^' || c == '{' || c == '|' ||
502
+ c == '}';
503
+ }
504
+
505
/* Returns 1 for the codepoints treated as end of input (0x00, 0x04, 0x1a)
 * and 0 for everything else. */
static int eof_codepoint(int codepoint)
{
  switch (codepoint) {
  case 0x00:
  case 0x04:
  case 0x1a:
    return 1;
  default:
    return 0;
  }
}
509
+
510
+ static VALUE find_unknown_options(VALUE str)
511
+ {
512
+ char c, *p = RSTRING_PTR(str);
513
+ VALUE result = Qnil;
514
+
515
+ while ((c = *p++)) {
516
+ if (c != 'i' && c != 'm' && c != 'x' && c != 'o' && c != 'u' && c != 'e' &&
517
+ c != 's' && c != 'n') {
518
+ if (result == Qnil) {
519
+ result = rb_str_new(&c, 1);
520
+ } else {
521
+ rb_str_concat(result, rb_str_new(&c, 1));
522
+ }
523
+ }
524
+ }
525
+
526
+ return result;
527
+ }
528
+
529
+ static int bad_cvar_name(VALUE str)
530
+ {
531
+ char *p = RSTRING_PTR(str);
532
+
533
+ if (*p++ != '@') return 0;
534
+ if (*p++ != '@') return 0;
535
+ return *p >= '0' && *p <= '9';
536
+ }
537
+
538
+ static int bad_ivar_name(VALUE str)
539
+ {
540
+ char *p = RSTRING_PTR(str);
541
+
542
+ if (*p++ != '@') return 0;
543
+ return *p >= '0' && *p <= '9';
544
+ }
545
+
546
+ static int find_8_or_9(VALUE str)
547
+ {
548
+ int idx = 0;
549
+ char *p = RSTRING_PTR(str);
550
+
551
+ while (*p) {
552
+ if (*p == '8' || *p == '9')
553
+ return idx;
554
+ idx++;
555
+ p++;
556
+ }
557
+
558
+ return -1;
559
+ }
560
+
561
+ static void emit_int(lexer_state *state, VALUE val, long start, long end)
562
+ {
563
+ emit_token(state, tINTEGER, val, start, end);
564
+ }
565
+
566
+ static void emit_rational(lexer_state *state, VALUE val, long start, long end)
567
+ {
568
+ emit_token(state, tRATIONAL, rb_Rational1(val), start, end);
569
+ }
570
+
571
+ static void emit_complex(lexer_state *state, VALUE val, long start, long end)
572
+ {
573
+ emit_token(state, tIMAGINARY, rb_Complex(Qzero, val), start, end);
574
+ }
575
+
576
+ static void emit_complex_rational(lexer_state *state, VALUE val, long start, long end)
577
+ {
578
+ emit_token(state, tIMAGINARY, rb_Complex(Qzero, rb_Rational1(val)), start, end);
579
+ }
580
+
581
+ static void emit_float(lexer_state *state, VALUE val, long start, long end)
582
+ {
583
+ emit_token(state, tFLOAT, rb_Float(val), start, end);
584
+ }
585
+
586
+ static void emit_complex_float(lexer_state *state, VALUE val, long start, long end)
587
+ {
588
+ emit_token(state, tIMAGINARY, rb_Complex(Qzero, rb_Float(val)), start, end);
589
+ }
590
+
591
+ static void emit_int_followed_by_if(lexer_state *state, VALUE val, long start, long end)
592
+ {
593
+ emit_token(state, tINTEGER, val, start, end);
594
+ }
595
+
596
+ static void emit_int_followed_by_rescue(lexer_state *state, VALUE val, long start, long end)
597
+ {
598
+ emit_token(state, tINTEGER, val, start, end);
599
+ }
600
+
601
+ static void emit_float_followed_by_if(lexer_state *state, VALUE val, long start, long end)
602
+ {
603
+ emit_token(state, tFLOAT, rb_Float(val), start, end);
604
+ }
605
+ static void emit_float_followed_by_rescue(lexer_state *state, VALUE val, long start, long end)
606
+ {
607
+ emit_token(state, tFLOAT, rb_Float(val), start, end);
608
+ }
609
+
610
+ static int next_state_for_literal(literal *lit) {
611
+ if (literal_words_p(lit) && literal_backslash_delimited_p(lit)) {
612
+ if (lit->interpolate) {
613
+ return lex_en_interp_backslash_delimited_words;
614
+ } else {
615
+ return lex_en_plain_backslash_delimited_words;
616
+ }
617
+ } else if (literal_words_p(lit) && !literal_backslash_delimited_p(lit)) {
618
+ if (lit->interpolate) {
619
+ return lex_en_interp_words;
620
+ } else {
621
+ return lex_en_plain_words;
622
+ }
623
+ } else if (!literal_words_p(lit) && literal_backslash_delimited_p(lit)) {
624
+ if (lit->interpolate) {
625
+ return lex_en_interp_backslash_delimited;
626
+ } else {
627
+ return lex_en_plain_backslash_delimited;
628
+ }
629
+ } else {
630
+ if (lit->interpolate) {
631
+ return lex_en_interp_string;
632
+ } else {
633
+ return lex_en_plain_string;
634
+ }
635
+ }
636
+ }
637
+
638
+ static int push_literal(lexer_state *state, VALUE str_type, VALUE delimiter,
639
+ long str_s, long heredoc_e, int indent, int dedent_body,
640
+ int label_allowed)
641
+ {
642
+ literal lit;
643
+ literal_init(&lit, state, str_type, delimiter, str_s, heredoc_e, indent,
644
+ dedent_body, label_allowed);
645
+ lit_stack_push(&state->literal_stack, lit);
646
+
647
+ return next_state_for_literal(&lit);
648
+ }
649
+
650
+ static int pop_literal(lexer_state *state)
651
+ {
652
+ literal old_literal = lit_stack_pop(&state->literal_stack);
653
+
654
+ state->dedent_level = old_literal.dedent_level;
655
+
656
+ if (old_literal.start_tok == tREGEXP_BEG) {
657
+ return lex_en_regexp_modifiers;
658
+ } else {
659
+ return lex_en_expr_end;
660
+ }
661
+ }
662
+
663
+ static VALUE array_last(VALUE array)
664
+ {
665
+ long len = RARRAY_LEN(array);
666
+ if (len == 0)
667
+ return Qnil;
668
+ else
669
+ return rb_ary_entry(array, len - 1);
670
+ }
671
+
672
+ static VALUE unescape_char(char c)
673
+ {
674
+ switch (c) {
675
+ case 'a': return rb_str_new("\a", 1);
676
+ case 'b': return rb_str_new("\b", 1);
677
+ case 'e': return rb_str_new("\e", 1);
678
+ case 'f': return rb_str_new("\f", 1);
679
+ case 'n': return rb_str_new("\n", 1);
680
+ case 'r': return rb_str_new("\r", 1);
681
+ case 's': return rb_str_new(" ", 1);
682
+ case 't': return rb_str_new("\t", 1);
683
+ case 'v': return rb_str_new("\v", 1);
684
+ default: return Qnil;
685
+ }
686
+ }
687
+
688
+ static VALUE escape_char(VALUE str)
689
+ {
690
+ char c = *RSTRING_PTR(str);
691
+
692
+ switch (c) {
693
+ case '\f': return rb_str_new("\\f", 2);
694
+ case '\n': return rb_str_new("\\n", 2);
695
+ case '\r': return rb_str_new("\\r", 2);
696
+ case ' ': return rb_str_new("\\s", 2);
697
+ case '\t': return rb_str_new("\\t", 2);
698
+ case '\v': return rb_str_new("\\v", 2);
699
+ default: return Qnil;
700
+ }
701
+ }
702
+
703
+ // assumes that pattern contains only one char
704
+ static inline int str_start_with_p(VALUE str, const char *pattern)
705
+ {
706
+ if (RSTRING_LEN(str) == 0) {
707
+ return 0;
708
+ }
709
+ return (memcmp(RSTRING_PTR(str), pattern, 1) == 0);
710
+ }
711
+
712
+ // assumes that pattern contains only one char
713
+ static inline int str_end_with_p(VALUE str, const char *pattern)
714
+ {
715
+ if (RSTRING_LEN(str) == 0) {
716
+ return 0;
717
+ }
718
+ return (memcmp(RSTRING_PTR(str) + RSTRING_LEN(str) - 1, pattern, 1) == 0);
719
+ }
720
+
721
+ def_lexer_attribute(diagnostics);
722
+ def_lexer_attribute(static_env);
723
+ def_lexer_attribute(tokens);
724
+ def_lexer_attribute(comments);
725
+ def_lexer_attribute(encoding);
726
+
727
+ def_lexer_attr_reader(source_buffer);
728
+
729
+ void Init_lexer()
730
+ {
731
+ init_symbol(k__ENCODING__);
732
+ init_symbol(k__FILE__);
733
+ init_symbol(k__LINE__);
734
+ init_symbol(kALIAS);
735
+ init_symbol(kAND);
736
+ init_symbol(kBEGIN);
737
+ init_symbol(klBEGIN);
738
+ init_symbol(kBREAK);
739
+ init_symbol(kCASE);
740
+ init_symbol(kCLASS);
741
+ init_symbol(kDEF);
742
+ init_symbol(kDEFINED);
743
+ init_symbol(kDO);
744
+ init_symbol(kDO_BLOCK);
745
+ init_symbol(kDO_COND);
746
+ init_symbol(kDO_LAMBDA);
747
+ init_symbol(kELSE);
748
+ init_symbol(kELSIF);
749
+ init_symbol(kEND);
750
+ init_symbol(klEND);
751
+ init_symbol(kENSURE);
752
+ init_symbol(kFALSE);
753
+ init_symbol(kFOR);
754
+ init_symbol(kIF);
755
+ init_symbol(kIF_MOD);
756
+ init_symbol(kIN);
757
+ init_symbol(kMODULE);
758
+ init_symbol(kNEXT);
759
+ init_symbol(kNIL);
760
+ init_symbol(kNOT);
761
+ init_symbol(kOR);
762
+ init_symbol(kREDO);
763
+ init_symbol(kRESCUE);
764
+ init_symbol(kRESCUE_MOD);
765
+ init_symbol(kRETRY);
766
+ init_symbol(kRETURN);
767
+ init_symbol(kSELF);
768
+ init_symbol(kSUPER);
769
+ init_symbol(kTHEN);
770
+ init_symbol(kTRUE);
771
+ init_symbol(kUNDEF);
772
+ init_symbol(kUNLESS);
773
+ init_symbol(kUNLESS_MOD);
774
+ init_symbol(kUNTIL);
775
+ init_symbol(kUNTIL_MOD);
776
+ init_symbol(kWHEN);
777
+ init_symbol(kWHILE);
778
+ init_symbol(kWHILE_MOD);
779
+ init_symbol(kYIELD);
780
+
781
+ init_symbol(tAMPER);
782
+ init_symbol(tAMPER2);
783
+ init_symbol(tANDDOT);
784
+ init_symbol(tANDOP);
785
+ init_symbol(tAREF);
786
+ init_symbol(tASET);
787
+ init_symbol(tASSOC);
788
+ init_symbol(tBACK_REF);
789
+ init_symbol(tBACK_REF2);
790
+ init_symbol(tBANG);
791
+ init_symbol(tCARET);
792
+ init_symbol(tCHARACTER);
793
+ init_symbol(tCMP);
794
+ init_symbol(tCOLON);
795
+ init_symbol(tCOLON2);
796
+ init_symbol(tCOLON3);
797
+ init_symbol(tCOMMA);
798
+ init_symbol(tCOMMENT);
799
+ init_symbol(tCONSTANT);
800
+ init_symbol(tCVAR);
801
+ init_symbol(tDIVIDE);
802
+ init_symbol(tDOT);
803
+ init_symbol(tDOT2);
804
+ init_symbol(tDOT3);
805
+ init_symbol(tDSTAR);
806
+ init_symbol(tEH);
807
+ init_symbol(tEQ);
808
+ init_symbol(tEQL);
809
+ init_symbol(tEQQ);
810
+ init_symbol(tFID);
811
+ init_symbol(tFLOAT);
812
+ init_symbol(tGEQ);
813
+ init_symbol(tGT);
814
+ init_symbol(tGVAR);
815
+ init_symbol(tIDENTIFIER);
816
+ init_symbol(tIMAGINARY);
817
+ init_symbol(tINTEGER);
818
+ init_symbol(tIVAR);
819
+ init_symbol(tLABEL);
820
+ init_symbol(tLABEL_END);
821
+ init_symbol(tLAMBDA);
822
+ init_symbol(tLAMBEG);
823
+ init_symbol(tLBRACE);
824
+ init_symbol(tLBRACE_ARG);
825
+ init_symbol(tLBRACK);
826
+ init_symbol(tLBRACK2);
827
+ init_symbol(tLCURLY);
828
+ init_symbol(tLEQ);
829
+ init_symbol(tLPAREN);
830
+ init_symbol(tLPAREN_ARG);
831
+ init_symbol(tLPAREN2);
832
+ init_symbol(tLSHFT);
833
+ init_symbol(tLT);
834
+ init_symbol(tMATCH);
835
+ init_symbol(tMINUS);
836
+ init_symbol(tNEQ);
837
+ init_symbol(tNL);
838
+ init_symbol(tNMATCH);
839
+ init_symbol(tNTH_REF);
840
+ init_symbol(tOP_ASGN);
841
+ init_symbol(tOROP);
842
+ init_symbol(tPERCENT);
843
+ init_symbol(tPIPE);
844
+ init_symbol(tPLUS);
845
+ init_symbol(tPOW);
846
+ init_symbol(tQWORDS_BEG);
847
+ init_symbol(tQSYMBOLS_BEG);
848
+ init_symbol(tRATIONAL);
849
+ init_symbol(tRBRACK);
850
+ init_symbol(tRCURLY);
851
+ init_symbol(tREGEXP_BEG);
852
+ init_symbol(tREGEXP_OPT);
853
+ init_symbol(tRPAREN);
854
+ init_symbol(tRSHFT);
855
+ init_symbol(tSEMI);
856
+ init_symbol(tSPACE);
857
+ init_symbol(tSTAR);
858
+ init_symbol(tSTAR2);
859
+ init_symbol(tSTRING);
860
+ init_symbol(tSTRING_BEG);
861
+ init_symbol(tSTRING_CONTENT);
862
+ init_symbol(tSTRING_DBEG);
863
+ init_symbol(tSTRING_DEND);
864
+ init_symbol(tSTRING_DVAR);
865
+ init_symbol(tSTRING_END);
866
+ init_symbol(tSYMBEG);
867
+ init_symbol(tSYMBOL);
868
+ init_symbol(tSYMBOLS_BEG);
869
+ init_symbol(tTILDE);
870
+ init_symbol(tUMINUS);
871
+ init_symbol(tUNARY_NUM);
872
+ init_symbol(tUPLUS);
873
+ init_symbol(tWORDS_BEG);
874
+ init_symbol(tXSTRING_BEG);
875
+
876
+ severity_error = ID2SYM(rb_intern("error"));
877
+ rb_gc_register_address(&severity_error);
878
+ init_symbol(fatal);
879
+ init_symbol(warning);
880
+
881
+ init_symbol(ambiguous_literal);
882
+ init_symbol(ambiguous_prefix);
883
+ init_symbol(bare_backslash);
884
+ init_symbol(character);
885
+ init_symbol(cvar_name);
886
+ init_symbol(embedded_document);
887
+ init_symbol(empty_numeric);
888
+ init_symbol(escape_eof);
889
+ init_symbol(incomplete_escape);
890
+ init_symbol(invalid_escape);
891
+ init_symbol(invalid_escape_use);
892
+ init_symbol(invalid_hex_escape);
893
+ init_symbol(invalid_octal);
894
+ init_symbol(invalid_unicode_escape);
895
+ init_symbol(ivar_name);
896
+ init_symbol(heredoc_id_ends_with_nl);
897
+ init_symbol(heredoc_id_has_newline);
898
+ init_symbol(no_dot_digit_literal);
899
+ init_symbol(prefix);
900
+ init_symbol(regexp_options);
901
+ init_symbol(string_eof);
902
+ init_symbol(trailing_in_number);
903
+ init_symbol(unexpected);
904
+ init_symbol(unexpected_percent_str);
905
+ init_symbol(unicode_point_too_large);
906
+ init_symbol(unterminated_unicode);
907
+
908
+ VALUE m_Parser = rb_define_module("Parser");
909
+ VALUE c_Lexer = rb_define_class_under(m_Parser, "CLexer", rb_cObject);
910
+
911
+ rb_define_alloc_func(c_Lexer, lexer_alloc);
912
+
913
+ rb_define_method(c_Lexer, "initialize", lexer_initialize, 1);
914
+ rb_define_method(c_Lexer, "advance", lexer_advance, 0);
915
+ rb_define_method(c_Lexer, "reset", lexer_reset, -1);
916
+
917
+ rb_define_method(c_Lexer, "push_cmdarg", lexer_push_cmdarg, 0);
918
+ rb_define_method(c_Lexer, "pop_cmdarg", lexer_pop_cmdarg, 0);
919
+ rb_define_method(c_Lexer, "push_cond", lexer_push_cond, 0);
920
+ rb_define_method(c_Lexer, "pop_cond", lexer_pop_cond, 0);
921
+
922
+ rb_define_method(c_Lexer, "push_cmdarg_state", lexer_push_cmdarg_state, 1);
923
+ rb_define_method(c_Lexer, "pop_cmdarg_state", lexer_pop_cmdarg_state, 0);
924
+ rb_define_method(c_Lexer, "lexpop_cmdarg_state", lexer_lexpop_cmdarg_state, 0);
925
+ rb_define_method(c_Lexer, "clear_cmdarg_state", lexer_clear_cmdarg_state, 0);
926
+ rb_define_method(c_Lexer, "cmdarg_state_empty?", lexer_cmdarg_state_empty_p, 0);
927
+ rb_define_method(c_Lexer, "cmdarg_state_value", lexer_cmdarg_state_value, 0);
928
+ rb_define_method(c_Lexer, "set_cmdarg_state", lexer_set_cmdarg_state, 1);
929
+
930
+ rb_define_method(c_Lexer, "push_cond_state", lexer_push_cond_state, 1);
931
+ rb_define_method(c_Lexer, "pop_cond_state", lexer_pop_cond_state, 0);
932
+ rb_define_method(c_Lexer, "lexpop_cond_state", lexer_lexpop_cond_state, 0);
933
+ rb_define_method(c_Lexer, "clear_cond_state", lexer_clear_cond_state, 0);
934
+ rb_define_method(c_Lexer, "cond_state_empty?", lexer_cond_state_empty_p, 0);
935
+ rb_define_method(c_Lexer, "cond_state_value", lexer_cond_state_value, 0);
936
+ rb_define_method(c_Lexer, "set_cond_state", lexer_set_cond_state, 1);
937
+
938
+ rb_define_method(c_Lexer, "state", lexer_get_state, 0);
939
+ rb_define_method(c_Lexer, "state=", lexer_set_state, 1);
940
+ rb_define_method(c_Lexer, "in_kwarg", lexer_get_in_kwarg, 0);
941
+ rb_define_method(c_Lexer, "in_kwarg=", lexer_set_in_kwarg, 1);
942
+ rb_define_method(c_Lexer, "diagnostics", lexer_get_diagnostics, 0);
943
+ rb_define_method(c_Lexer, "diagnostics=", lexer_set_diagnostics, 1);
944
+ rb_define_method(c_Lexer, "static_env", lexer_get_static_env, 0);
945
+ rb_define_method(c_Lexer, "static_env=", lexer_set_static_env, 1);
946
+ rb_define_method(c_Lexer, "tokens", lexer_get_tokens, 0);
947
+ rb_define_method(c_Lexer, "tokens=", lexer_set_tokens, 1);
948
+ rb_define_method(c_Lexer, "comments", lexer_get_comments, 0);
949
+ rb_define_method(c_Lexer, "comments=", lexer_set_comments, 1);
950
+ rb_define_method(c_Lexer, "encoding", lexer_get_encoding, 0);
951
+ rb_define_method(c_Lexer, "encoding=", lexer_set_encoding, 1);
952
+ rb_define_method(c_Lexer, "dedent_level", lexer_get_dedent_level, 0);
953
+ rb_define_method(c_Lexer, "source_buffer", lexer_get_source_buffer, 0);
954
+ rb_define_method(c_Lexer, "source_buffer=", lexer_set_source_buffer, 1);
955
+ rb_define_method(c_Lexer, "force_utf32=", lexer_set_force_utf32, 1);
956
+
957
+ VALUE m_Source = rb_const_get(m_Parser, rb_intern("Source"));
958
+ comment_klass = rb_const_get(m_Source, rb_intern("Comment"));
959
+ diagnostic_klass = rb_const_get(m_Parser, rb_intern("Diagnostic"));
960
+ range_klass = rb_const_get(m_Source, rb_intern("Range"));
961
+
962
+ empty_array = rb_obj_freeze(rb_ary_new2(0));
963
+ rb_gc_register_address(&empty_array);
964
+ blank_string = rb_obj_freeze(rb_str_new2(""));
965
+ rb_gc_register_address(&blank_string);
966
+ newline = rb_obj_freeze(rb_str_new2("\n"));
967
+ rb_gc_register_address(&newline);
968
+ escaped_newline = rb_obj_freeze(rb_str_new2("\\\n"));
969
+ rb_gc_register_address(&escaped_newline);
970
+
971
+ if (rb_const_defined(rb_cObject, rb_intern("Encoding"))) {
972
+ VALUE encoding = rb_const_get(rb_cObject, rb_intern("Encoding"));
973
+ utf8_encoding = rb_const_get(encoding, rb_intern("UTF_8"));
974
+ rb_gc_register_address(&utf8_encoding);
975
+ } else {
976
+ utf8_encoding = Qnil;
977
+ }
978
+
979
+ VALUE regex_str = rb_str_new2("\\r.*$");
980
+ cr_then_anything_to_eol = rb_class_new_instance(1, &regex_str, rb_cRegexp);
981
+ rb_gc_register_address(&cr_then_anything_to_eol);
982
+ regex_str = rb_str_new2("\\r+$");
983
+ crs_to_eol = rb_class_new_instance(1, &regex_str, rb_cRegexp);
984
+ rb_gc_register_address(&crs_to_eol);
985
+ }
986
+
987
+ %%{
988
+ alphtype int;
989
+ getkey (get_codepoint(state, p));
990
+
991
prepush {
  /*
   * Runs before Ragel pushes a state onto the call stack (fcall): make sure
   * there is room for one more entry. The buffer is doubled in place with
   * REALLOC_N, which preserves the existing entries and checks the byte
   * count for overflow. A zero-sized stack gets a small initial capacity,
   * since doubling zero would never make room.
   */
  if (state->cs_stack_top == state->cs_stack_size) {
    int grown_size = state->cs_stack_size > 0 ? state->cs_stack_size * 2 : 16;

    REALLOC_N(state->cs_stack, int, grown_size);
    /* `stack` is the alias the generated code indexes; keep it in sync. */
    stack = state->cs_stack;
    state->cs_stack_size = grown_size;
  }
}
1001
+
1002
+ action do_nl { state->newline_s = p; }
1003
+
1004
+ c_nl = '\n' $ do_nl;
1005
+ c_space = [ \t\r\f\v];
1006
+ c_space_nl = c_space | c_nl;
1007
+
1008
+ c_eof = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
1009
+ c_eol = c_nl | c_eof;
1010
+ c_any = any - c_eof;
1011
+
1012
+ c_nl_zlen = c_nl | zlen;
1013
+ c_line = any - c_nl_zlen;
1014
+
1015
+ c_unicode = c_any - 0x00..0x7f;
1016
+ c_upper = [A-Z];
1017
+ c_lower = [a-z_] | c_unicode;
1018
+ c_alpha = c_lower | c_upper;
1019
+ c_alnum = c_alpha | [0-9];
1020
+
1021
+ action do_eof { fhold; fbreak; }
1022
+
1023
+ operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' | '!@' ;
1024
+ operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' |
1025
+ '*' | '/' | '**' | '~' | '<<' | '>>' | '%' ;
1026
+ operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' |
1027
+ '<' | '<=' | '>' | '>=' | '<=>' | '=>' ;
1028
+
1029
+ punctuation_begin = '-' | '+' | '::' | '(' | '[' |
1030
+ '*' | '**' | '&' ;
1031
+ punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' |
1032
+ '::' | '?' | ':' | '.' | '..' | '...' ;
1033
+
1034
+ keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ;
1035
+ keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ;
1036
+ keyword_with_fname = 'def' | 'undef' | 'alias' ;
1037
+ keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' |
1038
+ 'for' | 'in' | 'do' | 'when' | 'begin' | 'class' |
1039
+ 'and' | 'or' ;
1040
+ keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ;
1041
+ keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' |
1042
+ 'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' |
1043
+ '__LINE__' | '__ENCODING__';
1044
+
1045
+ keyword = keyword_with_value | keyword_with_mid |
1046
+ keyword_with_end | keyword_with_arg |
1047
+ keyword_with_fname | keyword_modifier ;
1048
+
1049
+ constant = c_upper c_alnum*;
1050
+ bareword = c_alpha c_alnum*;
1051
+
1052
+ call_or_var = c_lower c_alnum*;
1053
+ class_var = '@@' bareword;
1054
+ instance_var = '@' bareword;
1055
+ global_var = '$'
1056
+ ( bareword | digit+
1057
+ | [`'+~*$&?!@/\\;,.=:<>"] # '
1058
+ | '-' c_alnum
1059
+ )
1060
+ ;
1061
+
1062
+ class_var_v = '@@' c_alnum+;
1063
+ instance_var_v = '@' c_alnum+;
1064
+
1065
+ label = bareword [?!]? ':';
1066
+
1067
+ int_hex = ( xdigit+ '_' )* xdigit* '_'? ;
1068
+ int_dec = ( digit+ '_' )* digit* '_'? ;
1069
+ int_bin = ( [01]+ '_' )* [01]* '_'? ;
1070
+
1071
+ flo_int = [1-9] [0-9]* ( '_' digit+ )* | '0';
1072
+ flo_frac = '.' ( digit+ '_' )* digit+;
1073
+ flo_pow = [eE] [+\-]? ( digit+ '_' )* digit+;
1074
+
1075
+ int_suffix =
1076
+ '' % { num_xfrm = emit_int; numeric_s = ts; }
1077
+ | 'r' % { num_xfrm = emit_rational; numeric_s = ts; }
1078
+ | 'i' % { num_xfrm = emit_complex; numeric_s = ts; }
1079
+ | 'ri' % { num_xfrm = emit_complex_rational; numeric_s = ts; }
1080
+ | 'if' % { num_xfrm = emit_int_followed_by_if; numeric_s = ts; p -= 2; }
1081
+ | 'rescue' % { num_xfrm = emit_int_followed_by_rescue; numeric_s = ts; p -= 6; } ;
1082
+
1083
+ flo_pow_suffix =
1084
+ '' % { num_xfrm = emit_float; numeric_s = ts; }
1085
+ | 'i' % { num_xfrm = emit_complex_float; numeric_s = ts; }
1086
+ | 'if' % { num_xfrm = emit_float_followed_by_if; numeric_s = ts; p -= 2; };
1087
+
1088
+ flo_suffix =
1089
+ flo_pow_suffix
1090
+ | 'r' % { num_xfrm = emit_rational; numeric_s = ts; }
1091
+ | 'ri' % { num_xfrm = emit_complex_rational; numeric_s = ts; }
1092
+ | 'rescue' % { num_xfrm = emit_float_followed_by_rescue; numeric_s = ts; p -= 6; };
1093
+
1094
+ escaped_nl = "\\" c_nl;
1095
+
1096
action unicode_points {
  /*
   * Builds state->escape from the text between state->escape_s + 2 and
   * p - 1, i.e. the body of a braced unicode escape: every run of hex
   * digits is converted to a character in utf8_encoding and appended.
   * For versions below 24, leading, doubled and trailing blanks are
   * reported as invalid_unicode_escape diagnostics first.
   */
  state->escape = rb_str_new2("");

  VALUE body = tok(state, state->escape_s + 2, p - 1);
  /* Source position of the piece currently being examined. */
  long cursor = state->escape_s + 2;

  VALUE pattern;

  if (state->version < 24) {
    if (str_start_with_p(body, " ") || str_start_with_p(body, "\t")) {
      VALUE leading = range(state, state->escape_s + 2, state->escape_s + 3);
      diagnostic(state, severity_error, invalid_unicode_escape, Qnil,
                 leading, empty_array);
    }

    pattern = rb_reg_regcomp(rb_str_new2("[ \\t]{2}"));
    VALUE doubled_at = rb_funcall(body, rb_intern("index"), 1, pattern);

    if (RTEST(doubled_at)) {
      long where = cursor + NUM2INT(doubled_at) + 1;
      diagnostic(state, severity_error, invalid_unicode_escape, Qnil,
                 range(state, where, where), empty_array);
    }

    if (str_end_with_p(body, " ") || str_end_with_p(body, "\t")) {
      VALUE trailing = range(state, p - 1, p);
      diagnostic(state, severity_error, invalid_unicode_escape, Qnil,
                 trailing, empty_array);
    }
  }

  /* Each scan result is a pair: [hex digits or nil, blanks or nil]. */
  pattern = rb_reg_regcomp(rb_str_new2("([0-9a-fA-F]+)|([ \t]+)"));

  VALUE pieces = rb_funcall(body, rb_intern("scan"), 1, pattern);
  long piece_count = RARRAY_LEN(pieces);

  for (long idx = 0; idx < piece_count; idx++) {
    VALUE piece = RARRAY_AREF(pieces, idx);
    VALUE hex_digits = RARRAY_AREF(piece, 0);
    VALUE blanks = RARRAY_AREF(piece, 1);

    if (RTEST(blanks)) {
      /* Separator only: just move the cursor past it. */
      cursor += RSTRING_LEN(blanks);
      continue;
    }

    VALUE value = rb_str_to_inum(hex_digits, 16, 0);
    if (NUM2INT(value) >= 0x110000) {
      /* Stop at the first codepoint outside the Unicode range. */
      diagnostic(state, severity_error, unicode_point_too_large, Qnil,
                 range(state, cursor, cursor + RSTRING_LEN(hex_digits)), empty_array);
      break;
    }

    VALUE character_str = rb_funcall(value, rb_intern("chr"), 1, utf8_encoding);
    state->escape = rb_str_plus(state->escape, character_str);
    cursor += RSTRING_LEN(hex_digits);
  }
}
1150
+
1151
action unescape_char {
  /*
   * Resolves a single-character escape. The codepoint just before p is
   * looked up with the unescape_char() helper; when that yields nil the
   * character itself, sliced from the source buffer and tagged with the
   * lexer encoding, becomes the escape value.
   *
   * The codepoint is kept as an int and only handed to the helper when it
   * fits in one byte. Narrowing a larger codepoint to char would alias an
   * unrelated character (for example U+016E would be looked up as 'n').
   */
  int escaped_cp = NUM2INT(rb_ary_entry(state->source_pts, p - 1));

  if (escaped_cp >= 0 && escaped_cp <= 0xff) {
    char c = (char) escaped_cp;
    state->escape = unescape_char(c);
  } else {
    state->escape = Qnil;
  }

  if (state->escape == Qnil) {
    /* p is a long, so convert the position with LONG2NUM. */
    VALUE codepoint = rb_funcall(state->source_buffer, rb_intern("slice"), 1, LONG2NUM(p - 1));
    state->escape = codepoint;
    force_encoding(codepoint, state->encoding);
  }
}
1161
+
1162
+ action invalid_complex_escape {
1163
+ diagnostic(state, fatal, invalid_escape, Qnil, range(state, ts, te),
1164
+ empty_array);
1165
+ }
1166
+
1167
action slash_c_char {
  /*
   * Replaces state->escape with a one-byte string holding its first byte
   * masked with 0x9f, then tags it with the lexer encoding.
   */
  const char *escape_bytes = RSTRING_PTR(state->escape);
  char masked = escape_bytes[0] & 0x9f;

  state->escape = rb_str_new(&masked, 1);
  force_encoding(state->escape, state->encoding);
}
1172
+
1173
action slash_m_char {
  /*
   * Replaces state->escape with a one-byte string holding its first byte
   * with the high bit (0x80) set, then tags it with the lexer encoding.
   */
  const char *escape_bytes = RSTRING_PTR(state->escape);
  char with_meta = escape_bytes[0] | 0x80;

  state->escape = rb_str_new(&with_meta, 1);
  force_encoding(state->escape, state->encoding);
}
1178
+
1179
+ maybe_escaped_char = (
1180
+ '\\' c_any %unescape_char
1181
+ | ( c_any - [\\] ) % { state->escape = rb_str_substr(state->source, p - 1, 1); }
1182
+ );
1183
+
1184
+ maybe_escaped_ctrl_char = (
1185
+ '\\' c_any %unescape_char %slash_c_char
1186
+ | '?' % { state->escape = rb_str_new2("\x7f"); }
1187
+ | ( c_any - [\\?] ) % { state->escape = rb_str_substr(state->source, p - 1, 1); } %slash_c_char
1188
+ );
1189
+
1190
# Everything that may follow a backslash inside an interpolating literal.
# Successful alternatives leave the decoded text in state->escape; the
# malformed ones report a fatal diagnostic instead. state->escape_s is the
# position recorded by e_bs when the backslash was consumed.
escape = (
    # One to three octal digits.
    [0-7]{1,3} % {
      VALUE token = tok(state, state->escape_s, p);
      /*
       * Reduce modulo 256 while the value is still an int (it can be as
       * large as 0777) and store it in an unsigned char, so the narrowing
       * is well defined whether or not plain char is signed.
       */
      unsigned char byte = (unsigned char) (NUM2INT(rb_str_to_inum(token, 8, 0)) % 0x100);
      state->escape = rb_str_new((const char *) &byte, 1);
      force_encoding(state->escape, state->encoding);
    }

    # x followed by one or two hex digits.
  | 'x' xdigit{1,2} % {
      VALUE token = tok(state, state->escape_s + 1, p);
      /* At most 0xff; unsigned char keeps the conversion well defined. */
      unsigned char byte = (unsigned char) NUM2INT(rb_str_to_inum(token, 16, 0));
      state->escape = rb_str_new((const char *) &byte, 1);
      force_encoding(state->escape, state->encoding);
    }

    # x followed by something that is not a hex digit.
  | 'x' ( c_any - xdigit )
    % {
      diagnostic(state, fatal, invalid_hex_escape, Qnil,
                 range(state, state->escape_s - 1, p + 2), empty_array);
    }

    # u followed by exactly four hex digits.
  | 'u' xdigit{4} % {
      VALUE token = tok(state, state->escape_s + 1, p);
      int i = NUM2INT(rb_str_to_inum(token, 16, 0));
      state->escape = rb_enc_uint_chr(i, rb_to_encoding(utf8_encoding));
    }

    # u followed by fewer than four hex digits.
  | 'u' xdigit{0,3} % {
      diagnostic(state, fatal, invalid_unicode_escape, Qnil,
                 range(state, state->escape_s - 1, p), empty_array);
    }

    # Braced form containing neither hex digits nor blanks.
  | 'u{' ( c_any - xdigit - [ \t}] )* '}' % {
      diagnostic(state, fatal, invalid_unicode_escape, Qnil,
                 range(state, state->escape_s - 1, p), empty_array);
    }

    # Braced form with blank-separated codepoints of up to six digits each.
  | 'u{' [ \t]* ( xdigit{1,6} [ \t]+ )*
    (
      ( xdigit{1,6} [ \t]* '}'
        %unicode_points
      )
      |
      ( xdigit* ( c_any - xdigit - [ \t}] )+ '}'
        | ( c_any - [ \t}] )* c_eof
        | xdigit{7,}
      ) % {
        diagnostic(state, fatal, invalid_unicode_escape, Qnil,
                   range(state, p - 1, p), empty_array);
      }
    )

    # Control character.
  | ( 'C-' | 'c' ) escaped_nl?
    maybe_escaped_ctrl_char

    # Meta character.
  | 'M-' escaped_nl?
    maybe_escaped_char
    %slash_m_char

    # Control and meta combined, in either order.
  | ( ( 'C-' | 'c' ) escaped_nl? '\\M-'
    | 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl?
    maybe_escaped_ctrl_char
    %slash_m_char

    # Incomplete control/meta prefixes.
  | 'C' c_any %invalid_complex_escape
  | 'M' c_any %invalid_complex_escape
  | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape

    # Any other single escaped character.
  | ( c_any - [0-7xuCMc] ) %unescape_char

    # Backslash directly before end of input.
  | c_eof % {
      diagnostic(state, fatal, escape_eof, Qnil, range(state, p - 1, p),
                 empty_array);
    }
);
1266
+
1267
+ e_bs = '\\' % {
1268
+ state->escape_s = p;
1269
+ state->escape = Qnil;
1270
+ };
1271
+
1272
+ e_heredoc_nl = c_nl % {
1273
+ if (state->herebody_s) {
1274
+ p = state->herebody_s;
1275
+ state->herebody_s = 0;
1276
+ }
1277
+ };
1278
+
1279
action extend_string {
  /*
   * Handles one ordinary character inside a literal. The token text is
   * either appended to the literal on top of the literal stack, or, when
   * literal_nest_and_try_closing() reports that it closes the literal,
   * the literal is popped and the scanner breaks out with the next state
   * chosen below.
   */
  VALUE string = tok(state, ts, te);

  /* Two characters of lookahead, only for version >= 22 with cond inactive. */
  VALUE lookahead = (state->version >= 22 && !stack_state_active(&state->cond))
    ? tok(state, te, te + 2)
    : Qnil;

  literal *current_literal = lit_stack_top(&state->literal_stack);

  if (current_literal->heredoc_e ||
      !literal_nest_and_try_closing(current_literal, string, ts, te, lookahead)) {
    literal_extend_string(current_literal, string, ts, te);
  } else {
    VALUE last_token = array_last(state->token_queue);

    if (rb_ary_entry(last_token, 0) != tLABEL_END) {
      fnext *pop_literal(state);
    } else {
      /* The literal ended as a label: step one character further. */
      p += 1;
      pop_literal(state);
      fnext expr_labelarg;
    }

    fbreak;
  }
}
1305
+
1306
action extend_string_escaped {
  /*
   * Appends an escape sequence to the literal on top of the literal stack.
   * Depending on the kind of literal, what gets appended is the raw token
   * text, the raw text with backslash-newline pairs removed, the escaped
   * character alone, or the decoded value in state->escape.
   */
  literal *current_literal = lit_stack_top(&state->literal_stack);
  VALUE escaped_char = rb_str_substr(state->source, state->escape_s, 1);

  VALUE chunk;
  int strip_escaped_newlines = 0;

  if (literal_munge_escape_p(current_literal, escaped_char)) {
    /* Regexp metacharacters keep their backslash; others drop it. */
    chunk = (literal_regexp_p(current_literal) && is_regexp_metachar(escaped_char))
      ? tok(state, ts, te)
      : escaped_char;
  } else if (literal_regexp_p(current_literal)) {
    chunk = tok(state, ts, te);
    strip_escaped_newlines = 1;
  } else if (literal_heredoc_p(current_literal) && newline_char_p(escaped_char)) {
    /* Squiggly heredocs keep the escaped newline verbatim. */
    strip_escaped_newlines = !literal_squiggly_heredoc_p(current_literal);
    chunk = tok(state, ts, te);
  } else if (state->escape == Qnil) {
    chunk = tok(state, ts, te);
  } else {
    chunk = state->escape;
  }

  if (strip_escaped_newlines) {
    rb_funcall(chunk, rb_intern("gsub!"), 2, escaped_newline, blank_string);
  }

  literal_extend_string(current_literal, chunk, ts, te);
}
1336
+
1337
action extend_string_eol {
  /*
   * Handles an end-of-line (or end-of-input) character inside a literal.
   * Reports string_eof when the token ends at pe, tries to close the
   * literal (heredocs are matched on the whole line just read), and
   * otherwise appends the line ending to the literal.
   */
  literal *current_literal = lit_stack_top(&state->literal_stack);
  long literal_start = current_literal->str_s;

  if (te == pe) {
    diagnostic(state, fatal, string_eof, Qnil,
               range(state, literal_start, literal_start + 1), empty_array);
  }

  if (!literal_heredoc_p(current_literal)) {
    if (literal_nest_and_try_closing(current_literal, tok(state, ts, te), ts, te, Qnil)) {
      fnext *pop_literal(state); fbreak;
    }

    /* A pending heredoc body: continue scanning from where it starts. */
    if (state->herebody_s) {
      p = state->herebody_s - 1;
      state->herebody_s = 0;
    }
  } else {
    VALUE heredoc_line = tok(state, state->herebody_s, ts);
    rb_funcall(heredoc_line, rb_intern("gsub!"), 2, crs_to_eol, blank_string);

    if (state->version >= 18 && state->version <= 20) {
      rb_funcall(heredoc_line, rb_intern("gsub!"), 2, cr_then_anything_to_eol, blank_string);
    }

    if (literal_nest_and_try_closing(current_literal, heredoc_line, state->herebody_s, ts, Qnil)) {
      /* Terminator line found: resume just before heredoc_e. */
      state->herebody_s = te;
      p = current_literal->heredoc_e - 1;
      fnext *pop_literal(state); fbreak;
    }

    /* fbreak above leaves the action, so this runs only when not closed. */
    literal_infer_indent_level(current_literal, heredoc_line);
    state->herebody_s = te;
  }

  if (literal_words_p(current_literal) && !eof_codepoint(get_codepoint(state, p))) {
    literal_extend_space(current_literal, ts, te);
  } else {
    literal_extend_string(current_literal, tok(state, ts, te), ts, te);
    literal_flush_string(current_literal);
  }
}
1380
+
1381
+ action extend_string_space {
1382
+ literal *current_literal = lit_stack_top(&state->literal_stack);
1383
+ literal_extend_space(current_literal, ts, te);
1384
+ }
1385
+
1386
+ interp_var = '#' ( global_var | class_var_v | instance_var_v );
1387
+
1388
+ action extend_interp_var {
1389
+ literal *current_literal = lit_stack_top(&state->literal_stack);
1390
+ literal_flush_string(current_literal);
1391
+ literal_extend_content(current_literal);
1392
+
1393
+ emit_token(state, tSTRING_DVAR, Qnil, ts, ts + 1);
1394
+
1395
+ p = ts;
1396
+ fcall expr_variable;
1397
+ }
1398
+
1399
+ interp_code = '#{';
1400
+
1401
+ e_lbrace = '{' % {
1402
+ stack_state_push(&state->cond, 0);
1403
+ stack_state_push(&state->cmdarg, 0);
1404
+
1405
+ literal *current_literal = lit_stack_top(&state->literal_stack);
1406
+ if (current_literal != NULL) {
1407
+ literal_start_interp_brace(current_literal);
1408
+ }
1409
+ };
1410
+
1411
e_rbrace = '}' % {
  /*
   * A closing brace. When a literal is on the literal stack and this brace
   * ends its interpolation, emit the closing token, restore the heredoc
   * body position saved on the literal, and hand control back to the
   * literal scanner.
   */
  literal *current_literal = lit_stack_top(&state->literal_stack);
  if (current_literal != NULL &&
      literal_end_interp_brace_and_try_closing(current_literal)) {
    if (state->version == 18 || state->version == 19) {
      emit_token(state, tRCURLY, rb_str_new2("}"), p - 1, p);
      /*
       * Only versions 18 and 19 reach this point, so the former
       * `version < 24` test was always true and its else branch
       * (stack_state_pop) could never run.
       */
      stack_state_lexpop(&state->cond);
      stack_state_lexpop(&state->cmdarg);
    } else {
      emit_token(state, tSTRING_DEND, rb_str_new2("}"), p - 1, p);
    }

    if (current_literal->herebody_s) {
      state->herebody_s = current_literal->herebody_s;
    }

    fhold;
    fnext *next_state_for_literal(current_literal);
    fbreak;
  }
};
1438
+
1439
+ action extend_interp_code {
1440
+ literal *current_literal = lit_stack_top(&state->literal_stack);
1441
+ literal_flush_string(current_literal);
1442
+ literal_extend_content(current_literal);
1443
+
1444
+ emit_token(state, tSTRING_DBEG, rb_str_new2("#{"), ts, te);
1445
+
1446
+ if (current_literal->heredoc_e) {
1447
+ current_literal->herebody_s = state->herebody_s;
1448
+ state->herebody_s = 0;
1449
+ }
1450
+
1451
+ literal_start_interp_brace(current_literal);
1452
+ fnext expr_value;
1453
+ fbreak;
1454
+ }
1455
+
1456
+ interp_words := |*
1457
+ interp_code => extend_interp_code;
1458
+ interp_var => extend_interp_var;
1459
+ e_bs escape => extend_string_escaped;
1460
+ c_space+ => extend_string_space;
1461
+ c_eol => extend_string_eol;
1462
+ c_any => extend_string;
1463
+ *|;
1464
+
1465
+ interp_string := |*
1466
+ interp_code => extend_interp_code;
1467
+ interp_var => extend_interp_var;
1468
+ e_bs escape => extend_string_escaped;
1469
+ c_eol => extend_string_eol;
1470
+ c_any => extend_string;
1471
+ *|;
1472
+
1473
+ plain_words := |*
1474
+ e_bs c_any => extend_string_escaped;
1475
+ c_space+ => extend_string_space;
1476
+ c_eol => extend_string_eol;
1477
+ c_any => extend_string;
1478
+ *|;
1479
+
1480
+ plain_string := |*
1481
+ '\\' c_nl => extend_string_eol;
1482
+ e_bs c_any => extend_string_escaped;
1483
+ c_eol => extend_string_eol;
1484
+ c_any => extend_string;
1485
+ *|;
1486
+
1487
+ interp_backslash_delimited := |*
1488
+ interp_code => extend_interp_code;
1489
+ interp_var => extend_interp_var;
1490
+ c_eol => extend_string_eol;
1491
+ c_any => extend_string;
1492
+ *|;
1493
+
1494
+ plain_backslash_delimited := |*
1495
+ c_eol => extend_string_eol;
1496
+ c_any => extend_string;
1497
+ *|;
1498
+
1499
+ interp_backslash_delimited_words := |*
1500
+ interp_code => extend_interp_code;
1501
+ interp_var => extend_interp_var;
1502
+ c_space+ => extend_string_space;
1503
+ c_eol => extend_string_eol;
1504
+ c_any => extend_string;
1505
+ *|;
1506
+
1507
+ plain_backslash_delimited_words := |*
1508
+ c_space+ => extend_string_space;
1509
+ c_eol => extend_string_eol;
1510
+ c_any => extend_string;
1511
+ *|;
1512
+
1513
+ regexp_modifiers := |*
1514
+ [A-Za-z]+
1515
+ => {
1516
+ VALUE unknown_options = find_unknown_options(tok(state, ts, te));
1517
+
1518
+ if (unknown_options != Qnil) {
1519
+ VALUE hash = rb_hash_new();
1520
+ rb_hash_aset(hash, ID2SYM(rb_intern("options")), unknown_options);
1521
+ diagnostic(state, severity_error, regexp_options, hash,
1522
+ range(state, ts, te), empty_array);
1523
+ }
1524
+
1525
+ emit(tREGEXP_OPT);
1526
+ fnext expr_end;
1527
+ fbreak;
1528
+ };
1529
+
1530
+ any
1531
+ => {
1532
+ emit_token(state, tREGEXP_OPT, tok(state, ts, te - 1), ts, te - 1);
1533
+ fhold;
1534
+ fgoto expr_end;
1535
+ };
1536
+ *|;
1537
+
1538
+ w_space =
1539
+ c_space+
1540
+ | '\\' e_heredoc_nl
1541
+ ;
1542
+
1543
+ w_comment =
1544
+ '#' %{ sharp_s = p - 1; }
1545
+ c_line* %{ emit_comment(state, sharp_s, p == pe ? p - 2 : p); }
1546
+ ;
1547
+
1548
+ w_space_comment =
1549
+ w_space
1550
+ | w_comment
1551
+ ;
1552
+
1553
+ w_newline =
1554
+ e_heredoc_nl;
1555
+
1556
+ w_any =
1557
+ w_space
1558
+ | w_comment
1559
+ | w_newline
1560
+ ;
1561
+
1562
+ ambiguous_fid_suffix =
1563
+ [?!] %{ tm = p; } |
1564
+ [?!]'=' %{ tm = p - 2; }
1565
+ ;
1566
+
1567
+ ambiguous_ident_suffix =
1568
+ ambiguous_fid_suffix |
1569
+ '=' %{ tm = p; } |
1570
+ '==' %{ tm = p - 2; } |
1571
+ '=~' %{ tm = p - 2; } |
1572
+ '=>' %{ tm = p - 2; } |
1573
+ '===' %{ tm = p - 3; }
1574
+ ;
1575
+
1576
+ ambiguous_symbol_suffix =
1577
+ ambiguous_ident_suffix |
1578
+ '==>' %{ tm = p - 2; }
1579
+ ;
1580
+
1581
+ ambiguous_const_suffix =
1582
+ '::' %{ tm = p - 2; }
1583
+ ;
1584
+
1585
+ e_lbrack = '[' % {
1586
+ stack_state_push(&state->cond, 0);
1587
+ stack_state_push(&state->cmdarg, 0);
1588
+ };
1589
+
1590
+ e_lparen = '(' % {
1591
+ stack_state_push(&state->cond, 0);
1592
+ stack_state_push(&state->cmdarg, 0);
1593
+ state->paren_nest += 1;
1594
+ };
1595
+
1596
+ e_rparen = ')' % {
1597
+ state->paren_nest -= 1;
1598
+ };
1599
+
1600
+ action local_ident {
1601
+ VALUE str = tok(state, ts, te);
1602
+ emit(tIDENTIFIER);
1603
+
1604
+ if (STATIC_ENV_DECLARED(str)) {
1605
+ fnext expr_endfn; fbreak;
1606
+ } else {
1607
+ fnext *arg_or_cmdarg(command_state); fbreak;
1608
+ }
1609
+ }
1610
+
1611
# Scans one variable reference (global, class or instance variable), emits
# its token, then returns to the state on top of the call stack. Entered
# with fcall.
expr_variable := |*
    global_var => {
      VALUE name = tok(state, ts, te);

      if (!is_nthref(name)) {
        if (is_backref(name)) {
          emit(tBACK_REF);
        } else {
          emit(tGVAR);
        }
      } else {
        /* Numbered reference: the token value is the number after the sigil. */
        VALUE number = rb_str_to_inum(tok(state, ts + 1, te), 10, 0);
        emit_token(state, tNTH_REF, number, ts, te);
      }

      fnext *stack[--top]; fbreak;
    };

    class_var_v => {
      VALUE name = tok(state, ts, te);

      if (bad_cvar_name(name)) {
        /* Report the offending name but still emit the token. */
        VALUE details = rb_hash_new();
        rb_hash_aset(details, ID2SYM(rb_intern("name")), name);
        diagnostic(state, severity_error, cvar_name, details,
                   range(state, ts, te), empty_array);
      }

      emit(tCVAR);
      fnext *stack[--top]; fbreak;
    };

    instance_var_v => {
      VALUE name = tok(state, ts, te);

      if (bad_ivar_name(name)) {
        /* Report the offending name but still emit the token. */
        VALUE details = rb_hash_new();
        rb_hash_aset(details, ID2SYM(rb_intern("name")), name);
        diagnostic(state, severity_error, ivar_name, details,
                   range(state, ts, te), empty_array);
      }

      emit(tIVAR);
      fnext *stack[--top]; fbreak;
    };
*|;
1653
+
1654
+ expr_fname := |*
1655
+ keyword
1656
+ => { emit_table_KEYWORDS_BEGIN(state, tok(state, ts, te), ts, te);
1657
+ fnext expr_endfn; fbreak; };
1658
+
1659
+ constant => { emit(tCONSTANT); fnext expr_endfn; fbreak; };
1660
+
1661
+ bareword [?=!]? => { emit(tIDENTIFIER); fnext expr_endfn; fbreak; };
1662
+
1663
+ global_var => { p = ts - 1; fnext expr_end; fcall expr_variable; };
1664
+
1665
+ operator_fname |
1666
+ operator_arithmetic |
1667
+ operator_rest
1668
+ => { emit_table_PUNCTUATION(state, tok(state, ts, te), ts, te);
1669
+ fnext expr_endfn; fbreak; };
1670
+
1671
+ '::' => { fhold; fhold; fgoto expr_end; };
1672
+
1673
+ ':' => { fhold; fgoto expr_beg; };
1674
+
1675
+ '%s' c_any
1676
+ => {
1677
+ if (state->version == 23) {
1678
+ VALUE type = rb_str_substr(state->source, ts, te - ts - 1);
1679
+ VALUE delimiter = rb_str_substr(state->source, te - 1, 1);
1680
+ if (delimiter == Qnil)
1681
+ delimiter = blank_string;
1682
+
1683
+ fgoto *push_literal(state, type, delimiter, ts, 0, 0, 0, 0);
1684
+ } else {
1685
+ p = ts - 1;
1686
+ fgoto expr_end;
1687
+ }
1688
+ };
1689
+
1690
+ w_any;
1691
+
1692
+ c_any => { fhold; fgoto expr_end; };
1693
+
1694
+ c_eof => do_eof;
1695
+ *|;
1696
+
1697
+ expr_endfn := |*
1698
+ label ( any - ':' ) => {
1699
+ emit_token(state, tLABEL, tok(state, ts, te - 2), ts, te - 1);
1700
+ fhold; fnext expr_labelarg; fbreak;
1701
+ };
1702
+
1703
+ w_space_comment;
1704
+
1705
+ c_any => { fhold; fgoto expr_end; };
1706
+
1707
+ c_eof => do_eof;
1708
+ *|;
1709
+
1710
+ expr_dot := |*
1711
+ constant => { emit(tCONSTANT); fnext *arg_or_cmdarg(command_state); fbreak; };
1712
+
1713
+ call_or_var => { emit(tIDENTIFIER); fnext *arg_or_cmdarg(command_state); fbreak; };
1714
+
1715
+ bareword ambiguous_fid_suffix
1716
+ => { emit_token(state, tFID, tok(state, ts, tm), ts, tm);
1717
+ fnext *arg_or_cmdarg(command_state); p = tm - 1; fbreak; };
1718
+
1719
+ operator_fname |
1720
+ operator_arithmetic |
1721
+ operator_rest
1722
+ => { emit_table_PUNCTUATION(state, tok(state, ts, te), ts, te);
1723
+ fnext expr_arg; fbreak; };
1724
+
1725
+ w_any;
1726
+
1727
+ c_any
1728
+ => { fhold; fgoto expr_end; };
1729
+
1730
+ c_eof => do_eof;
1731
+ *|;
1732
+
1733
+ # expr_arg scanner: looks at the next token and the whitespace in front of
+ # it, then either emits an opening bracket token itself or rewinds and
+ # continues in expr_beg or expr_end.
+ expr_arg := |*
1733
+ # Parenthesis after whitespace: tLPAREN2 and expr_value when version is 18,
+ # otherwise tLPAREN_ARG and expr_beg. Only the last matched character
+ # (te - 1 .. te) is part of the token range.
+ w_space+ e_lparen => {
1734
+ if (state->version == 18) {
1735
+ emit_token(state, tLPAREN2, rb_str_new2("("), te - 1, te);
1736
+ fnext expr_value; fbreak;
1737
+ } else {
1738
+ emit_token(state, tLPAREN_ARG, rb_str_new2("("), te - 1, te);
1739
+ fnext expr_beg; fbreak;
1740
+ }
1741
+ };
1742
+
1743
+ e_lparen => { emit(tLPAREN2); fnext expr_beg; fbreak; };
1744
+
1745
+ w_space+ e_lbrack => {
1746
+ emit_token(state, tLBRACK, rb_str_new2("["), te - 1, te);
1747
+ fnext expr_beg; fbreak;
1748
+ };
1749
+
1750
+ # Brace: tLAMBEG when the last lambda_stack entry equals paren_nest (the
+ # entry is popped), otherwise tLCURLY.
+ w_space* e_lbrace => {
1751
+ VALUE val = array_last(state->lambda_stack);
1752
+ if (val != Qnil && NUM2INT(val) == state->paren_nest) {
1753
+ rb_ary_pop(state->lambda_stack);
1754
+ emit_token(state, tLAMBEG, rb_str_new2("{"), te - 1, te);
1755
+ } else {
1756
+ emit_token(state, tLCURLY, rb_str_new2("{"), te - 1, te);
1757
+ }
1758
+ fnext expr_value; fbreak;
1759
+ };
1760
+
1761
+ '?' c_space_nl => { p = ts - 1; fgoto expr_end; };
1762
+
1763
+ w_space* '?' => { fhold; fgoto expr_beg; };
1764
+
1765
+ # tm records the position right after the whitespace run. A warning is
+ # issued only when the codepoint at tm is a slash; in every case the text
+ # from tm onwards is rescanned by expr_beg.
+ w_space+ %{ tm = p; }
1766
+ ( [%/] ( c_any - c_space_nl - '=' ) | '<<' ) => {
1767
+ if (NUM2INT(rb_ary_entry(state->source_pts, tm)) == '/') {
1768
+ diagnostic(state, warning, ambiguous_literal, Qnil,
1769
+ range(state, tm, tm + 1), empty_array);
1770
+ }
1771
+
1772
+ p = tm - 1;
1773
+ fgoto expr_beg;
1774
+ };
1775
+
1776
+ # Operator right after whitespace: warn with the operator text stored under
+ # the prefix key, then rescan from tm in expr_beg.
+ w_space+ %{ tm = p; } ( '+' | '-' | '*' | '&' | '**' ) => {
1777
+ VALUE hash = rb_hash_new();
1778
+ VALUE str = tok(state, tm, te);
1779
+ rb_hash_aset(hash, prefix, str);
1780
+ diagnostic(state, warning, ambiguous_prefix, hash, range(state, tm, te),
1781
+ empty_array);
1782
+
1783
+ p = tm - 1;
1784
+ fgoto expr_beg;
1785
+ };
1786
+
1787
+ w_space+ '::' => { fhold; fhold; fgoto expr_beg; };
1788
+
1789
+ w_space* ':' => { fhold; fgoto expr_beg; };
1790
+
1791
+ w_space+ label => { p = ts - 1; fgoto expr_beg; };
1792
+
1793
+ w_space+ %{ tm = p; } '?' c_space_nl => { p = tm - 1; fgoto expr_end; };
1794
+
1795
+ # These alternatives all rewind to ts and let expr_end scan the input.
+ w_space* operator_arithmetic
1796
+ ( '=' | c_space_nl )? |
1797
+ w_space* keyword_modifier |
1798
+ w_space* '&.' |
1799
+ w_space* punctuation_end
1800
+ => {
1801
+ p = ts - 1;
1802
+ fgoto expr_end;
1803
+ };
1804
+
1805
+ w_space;
1806
+
1807
+ w_comment => { fgoto expr_end; };
1808
+
1809
+ w_newline => { fhold; fgoto expr_end; };
1810
+
1811
+ c_any => { fhold; fgoto expr_beg; };
1812
+
1813
+ c_eof => do_eof;
1814
+ *|;
1816
+
1817
+ # expr_cmdarg scanner: emits tLPAREN_ARG for a parenthesis that follows
+ # whitespace and kDO or kDO_COND for do; everything else is rewound to ts and
+ # rescanned by expr_arg.
+ expr_cmdarg := |*
1817
+ w_space+ e_lparen
1818
+ => {
1819
+ emit_token(state, tLPAREN_ARG, rb_str_new2("("), te - 1, te);
1820
+ if (state->version == 18) {
1821
+ fnext expr_value; fbreak;
1822
+ } else {
1823
+ fnext expr_beg; fbreak;
1824
+ }
1825
+ };
1826
+
1827
+ w_space* 'do'
1828
+ => {
1829
+ /* kDO_COND is chosen while the cond stack state is active; the token range is the last two matched characters. */
+ if (stack_state_active(&state->cond)) {
1830
+ emit_token(state, kDO_COND, rb_str_new2("do"), te - 2, te);
1831
+ } else {
1832
+ emit_token(state, kDO, rb_str_new2("do"), te - 2, te);
1833
+ }
1834
+ fnext expr_value; fbreak;
1835
+ };
1836
+
1837
+ c_any |
1838
+ w_space* bareword |
1839
+ w_space* label
1840
+ => { p = ts - 1;
1841
+ fgoto expr_arg; };
1842
+
1843
+ c_eof => do_eof;
1844
+ *|;
1846
+
1847
+ # expr_endarg scanner: a brace becomes tLAMBEG or tLBRACE_ARG and do is
+ # emitted through emit_do; both continue in expr_value. Whitespace and
+ # comments are consumed; any other character is rescanned by expr_end.
+ expr_endarg := |*
1847
+ e_lbrace => {
1848
+ /* tLAMBEG when the last lambda_stack entry equals paren_nest; that entry is popped. */
+ VALUE val = array_last(state->lambda_stack);
1849
+ if (val != Qnil && NUM2INT(val) == state->paren_nest) {
1850
+ rb_ary_pop(state->lambda_stack);
1851
+ emit_token(state, tLAMBEG, rb_str_new2("{"), te - 1, te);
1852
+ } else {
1853
+ emit_token(state, tLBRACE_ARG, rb_str_new2("{"), te - 1, te);
1854
+ }
1855
+ fnext expr_value; fbreak;
1856
+ };
1857
+
1858
+ 'do' => { emit_do(state, 1, ts, te); fnext expr_value; fbreak; };
1859
+
1860
+ w_space_comment;
1861
+
1862
+ c_any
1863
+ => { fhold; fgoto expr_end; };
1864
+
1865
+ c_eof => do_eof;
1866
+ *|;
1868
+
1869
+ # expr_mid scanner: modifier keywords are emitted from the KEYWORDS table and
+ # continue in expr_beg; a bareword is rewound and rescanned by expr_beg; a
+ # newline is handed back to expr_end; any other character goes to expr_beg.
+ expr_mid := |*
1869
+ keyword_modifier
1870
+ => { emit_table_KEYWORDS(state, tok(state, ts, te), ts, te);
1871
+ fnext expr_beg; fbreak; };
1872
+
1873
+ bareword => { p = ts - 1; fgoto expr_beg; };
1874
+
1875
+ w_space_comment;
1876
+
1877
+ w_newline => { fhold; fgoto expr_end; };
1878
+
1879
+ c_any => { fhold; fgoto expr_beg; };
1880
+
1881
+ c_eof => do_eof;
1882
+ *|;
1884
+
1885
+ # expr_beg scanner. This first group of rules handles a sign in front of a
+ # digit, a star, and literals opened by a slash or a percent sign.
+ expr_beg := |*
1885
+ # Only the sign character (ts .. ts + 1) is emitted as tUNARY_NUM.
+ [+\-] w_any* [0-9] => {
1886
+ emit_token(state, tUNARY_NUM, tok(state, ts, ts + 1), ts, ts + 1);
1887
+ fhold; fnext expr_end; fbreak;
1888
+ };
1889
+
1890
+ '*' => { emit(tSTAR); fbreak; };
1891
+
1892
+ # The slash is passed as both the literal type and its delimiter; the
+ # character after it is handed back with fhold.
+ '/' c_any => {
1893
+ VALUE delimiter = rb_str_substr(state->source, ts, 1);
1894
+ fhold; fgoto *push_literal(state, delimiter, delimiter, ts, 0, 0, 0, 0);
1895
+ };
1896
+
1897
+ # Percent sign followed by a non-letter: the type is the percent sign and the
+ # delimiter is the last matched character.
+ '%' ( any - [A-Za-z] ) => {
1898
+ VALUE type = rb_str_substr(state->source, ts, 1);
1899
+ VALUE delimiter = rb_str_substr(state->source, te - 1, 1);
1900
+ if (delimiter == Qnil)
1901
+ delimiter = blank_string;
1902
+
1903
+ fgoto *push_literal(state, type, delimiter, ts, 0, 0, 0, 0);
1904
+ };
1905
+
1906
+ # Percent sign followed by letters: the type is everything except the last
+ # matched character, which is the delimiter.
+ '%' [A-Za-z]+ c_any => {
1907
+ VALUE type = rb_str_substr(state->source, ts, te - ts - 1);
1908
+ VALUE delimiter = rb_str_substr(state->source, te - 1, 1);
1909
+ if (delimiter == Qnil)
1910
+ delimiter = blank_string;
1911
+
1912
+ fgoto *push_literal(state, type, delimiter, ts, 0, 0, 0, 0);
1913
+ };
1915
+
1916
+ # A percent sign directly followed by end of input is reported as a fatal
+ # string_eof diagnostic.
+ '%' c_eof => {
1916
+ diagnostic(state, fatal, string_eof, Qnil,
1917
+ range(state, ts, ts + 1), empty_array);
1918
+ };
1919
+
1920
+ # Heredoc start. heredoc_e records the end of the heredoc identifier, and
+ # herebody_s is set to the position after the line end unless it is already
+ # non-zero.
+ '<<' [~\-]?
1921
+ ( '"' ( any - '"' )* '"'
1922
+ | "'" ( any - "'" )* "'"
1923
+ | "`" ( any - "`" )* "`"
1924
+ | bareword ) % { heredoc_e = p; }
1925
+ c_line* c_nl % { if (!state->herebody_s) state->herebody_s = p; } => {
1926
+
1927
+ VALUE heredoc = tok(state, ts, heredoc_e);
1928
+ VALUE type;
1929
+ char *cp = RSTRING_PTR(heredoc);
1930
+ int indent = 0, dedent_body = 0;
1931
+ long rng_s = ts, rng_e = heredoc_e;
1932
+
1933
+ /* Skip the two leading characters and an optional flag: a dash sets indent, a tilde sets indent and dedent_body. */
+ if (cp[2] == '-') {
1934
+ indent = 1;
1935
+ cp += 3;
1936
+ rng_s += 3;
1937
+ } else if (cp[2] == '~') {
1938
+ dedent_body = indent = 1;
1939
+ cp += 3;
1940
+ rng_s += 3;
1941
+ } else {
1942
+ cp += 2;
1943
+ rng_s += 2;
1944
+ }
1945
+
1946
+ /* A quoted identifier: the quote character becomes the third character of the literal type and both quotes are cut from the delimiter range. An unquoted identifier uses the double-quote type. */
+ if (*cp == '"' || *cp == '\'' || *cp == '`') {
1947
+ char type_str[3];
1948
+ type_str[0] = '<';
1949
+ type_str[1] = '<';
1950
+ type_str[2] = *cp;
1951
+
1952
+ cp += 1;
1953
+ rng_s += 1;
1954
+ rng_e -= 1;
1955
+
1956
+ type = rb_str_new(type_str, 3);
1957
+ } else {
1958
+ type = rb_str_new2("<<\"");
1959
+ }
1960
+
1961
+ VALUE delimiter = tok(state, rng_s, rng_e);
1962
+
1963
+ /* From version 24 on, a newline inside the identifier is a warning when the identifier ends with it (the identifier is then right-stripped) and a fatal diagnostic otherwise. */
+ if (state->version >= 24) {
1964
+ if (NUM2INT(rb_funcall(delimiter, rb_intern("count"), 1, newline)) > 0) {
1965
+ if (str_end_with_p(delimiter, "\n")) {
1966
+ diagnostic(state, warning, heredoc_id_ends_with_nl, Qnil,
1967
+ range(state, ts, ts + 1), empty_array);
1968
+
1969
+ delimiter = rb_funcall(delimiter, rb_intern("rstrip"), 0);
1970
+ } else {
1971
+ diagnostic(state, fatal, heredoc_id_has_newline, Qnil,
1972
+ range(state, ts, ts + 1), empty_array);
1973
+ }
1974
+ }
1975
+ }
1976
+
1977
+ /* For versions 18 through 22 the tilde form is not started as a heredoc: only tLSHFT is emitted and scanning resumes right after it in expr_beg. Otherwise the literal is pushed and p jumps to just before herebody_s. */
+ if (dedent_body && state->version >= 18 && state->version <= 22) {
1978
+ emit_token(state, tLSHFT, rb_str_new2("<<"), ts, ts + 2);
1979
+ p = ts + 1;
1980
+ fnext expr_beg; fbreak;
1981
+ } else {
1982
+ fnext *push_literal(state, type, delimiter, ts, heredoc_e, indent,
1983
+ dedent_body, 0);
1984
+ p = state->herebody_s - 1;
1985
+ }
1986
+ };
1988
+
1989
+ # Symbol rules. A colon followed by a doubled operator emits only the colon
+ # as tSYMBEG and continues in expr_fname with the operator handed back.
+ ':' ('&&' | '||') => {
1989
+ fhold; fhold;
1990
+ emit_token(state, tSYMBEG, tok(state, ts, ts + 1), ts, ts + 1);
1991
+ fgoto expr_fname;
1992
+ };
1993
+
1994
+ # Colon followed by a quote starts a literal whose type is the matched text
+ # and whose delimiter is the quote character.
+ ':' ['"] => { /* ' */
1995
+ VALUE type = tok(state, ts, te);
1996
+ VALUE delimiter = tok(state, te - 1, te);
1997
+ fgoto *push_literal(state, type, delimiter, ts, 0, 0, 0, 0);
1998
+ };
1999
+
2000
+ # The symbol value is the single character after the colon; the trailing at
+ # sign is part of the token range only.
+ ':' [!~] '@'
2001
+ => {
2002
+ emit_token(state, tSYMBOL, tok(state, ts + 1, ts + 2), ts, te);
2003
+ fnext expr_end; fbreak;
2004
+ };
2005
+
2006
+ # The symbol ends at tm; scanning resumes there.
+ ':' bareword ambiguous_symbol_suffix => {
2007
+ emit_token(state, tSYMBOL, tok(state, ts + 1, tm), ts, tm);
2008
+ p = tm - 1;
2009
+ fnext expr_end; fbreak;
2010
+ };
2011
+
2012
+ ':' ( bareword | global_var | class_var | instance_var |
2013
+ operator_fname | operator_arithmetic | operator_rest ) => {
2014
+ emit_token(state, tSYMBOL, tok(state, ts + 1, te), ts, te);
2015
+ fnext expr_end; fbreak;
2016
+ };
2018
+
2019
+ # Character literal. The value is state->escape when it is not nil,
+ # otherwise the text after the question mark. Version 18 emits tINTEGER with
+ # the first byte of the value; other versions emit tCHARACTER.
+ '?' ( e_bs ( escape - ( '\u{' (xdigit+ [ \t]+)+ xdigit+ '}' ))
2019
+ | (c_any - c_space_nl - e_bs) % { state->escape = Qnil; }
2020
+ ) => {
2021
+ VALUE value = state->escape;
2022
+ if (value == Qnil)
2023
+ value = tok(state, ts + 1, te);
2024
+
2025
+ if (state->version == 18)
2026
+ emit_token(state, tINTEGER, rb_funcall(value, rb_intern("getbyte"), 1, INT2NUM(0)), ts, te);
2027
+ else
2028
+ emit_token(state, tCHARACTER, value, ts, te);
2029
+
2030
+ fnext expr_end; fbreak;
2031
+ };
2033
+
2034
+ # A question mark followed by whitespace: warn with the escaped form of the
+ # whitespace character stored under the escape key, then rewind to ts and
+ # let expr_end scan the question mark.
+ '?' c_space_nl => {
2034
+ /* ts is a character position (the same positions index source_pts and are passed to rb_str_substr in this scanner), so the character must be fetched by character index; rb_str_subseq takes a byte offset and reads the wrong byte once a multibyte character precedes ts. */
+ VALUE escape = escape_char(rb_str_substr(state->source, ts + 1, 1));
2035
+ VALUE hash = rb_hash_new();
2036
+ rb_hash_aset(hash, ID2SYM(rb_intern("escape")), escape);
2037
+ diagnostic(state, warning, invalid_escape_use, hash,
2038
+ range(state, ts, te), empty_array);
2039
+
2040
+ p = ts - 1;
2041
+ fgoto expr_end;
2042
+ };
2044
+
2045
+ # A question mark directly followed by end of input is reported as a fatal
+ # incomplete_escape diagnostic.
+ '?' c_eof => {
2045
+ diagnostic(state, fatal, incomplete_escape, Qnil,
2046
+ range(state, ts, ts + 1), empty_array);
2047
+ };
2048
+
2049
+ # A question mark followed by an identifier-like run is rewound to ts and
+ # rescanned by expr_end.
+ '?' [A-Za-z_] bareword => { p = ts - 1; fgoto expr_end; };
2051
+
2052
+ # Opening brace: tLAMBEG when the last lambda_stack entry equals paren_nest
+ # (the entry is popped), otherwise tLBRACE. The scanner state is not changed.
+ e_lbrace => {
2052
+ VALUE val = array_last(state->lambda_stack);
2053
+ if (val != Qnil && NUM2INT(val) == state->paren_nest) {
2054
+ rb_ary_pop(state->lambda_stack);
2055
+ emit(tLAMBEG);
2056
+ } else {
2057
+ emit(tLBRACE);
2058
+ }
2059
+ fbreak;
2060
+ };
2061
+
2062
+ e_lbrack => {
2063
+ emit_token(state, tLBRACK, tok(state, ts, te), ts, te);
2064
+ fbreak;
2065
+ };
2066
+
2067
+ e_lparen => {
2068
+ emit_token(state, tLPAREN, tok(state, ts, te), ts, te);
2069
+ fbreak;
2070
+ };
2071
+
2072
+ punctuation_begin
2073
+ => { emit_table_PUNCTUATION_BEGIN(state, tok(state, ts, te), ts, te);
2074
+ fbreak; };
2075
+
2076
+ # tm marks the end of the keyword; an optional arrow after it is matched but
+ # handed back, so only the keyword itself is emitted as kRESCUE.
+ 'rescue' %{ tm = p; } '=>'? => {
2077
+ emit_token(state, kRESCUE, tok(state, ts, tm), ts, tm);
2078
+ p = tm - 1;
2079
+ fnext expr_mid; fbreak;
2080
+ };
2081
+
2082
+ keyword_modifier
2083
+ => { emit_table_KEYWORDS_BEGIN(state, tok(state, ts, te), ts, te);
2084
+ fnext expr_value; fbreak; };
2086
+
2087
+ # Label handling. Version 18 emits the text up to te - 2 as tCONSTANT or
+ # tIDENTIFIER and picks the next state from the static environment; other
+ # versions emit tLABEL and continue in expr_labelarg.
+ label ( any - ':' )
2087
+ => {
2088
+ fhold;
2089
+
2090
+ if (state->version == 18) {
2091
+ VALUE ident = tok(state, ts, te - 2);
2092
+
2093
+ emit_token(state, is_capitalized(ident) ? tCONSTANT : tIDENTIFIER,
2094
+ ident, ts, te - 2);
2095
+ /* Second fhold: one more matched character is handed back, in addition to the lookahead character. */
+ fhold;
2096
+
2097
+ if (STATIC_ENV_DECLARED(ident)) {
2098
+ fnext expr_end;
2099
+ } else {
2100
+ fnext *arg_or_cmdarg(command_state);
2101
+ }
2102
+ } else {
2103
+ emit_token(state, tLABEL, tok(state, ts, te - 2), ts, te - 1);
2104
+ fnext expr_labelarg;
2105
+ }
2106
+
2107
+ fbreak;
2108
+ };
2109
+
2110
+ bareword ambiguous_ident_suffix | keyword => { p = ts - 1; fgoto expr_end; };
2111
+
2112
+ call_or_var => local_ident;
2113
+
2114
+ # Identifier followed by whitespace and a parenthesis: the identifier text
+ # and range are captured when the identifier ends, then only the identifier
+ # is emitted and scanning resumes right after it.
+ (call_or_var - keyword)
2115
+ % { ident_tok = tok(state, ts, te); ident_ts = ts; ident_te = te; }
2116
+ w_space+ '('
2117
+ => {
2118
+ emit_token(state, tIDENTIFIER, ident_tok, ident_ts, ident_te);
2119
+ p = ident_te - 1;
2120
+
2121
+ /* A name declared in the static environment continues in expr_endfn only for versions below 25; otherwise expr_cmdarg. */
+ if (STATIC_ENV_DECLARED(ident_tok) && state->version < 25) {
2122
+ fnext expr_endfn;
2123
+ } else {
2124
+ fnext expr_cmdarg;
2125
+ }
2126
+
2127
+ fbreak;
2128
+ };
2130
+
2131
+ # Remaining expr_beg rules: whitespace is consumed, a block comment start is
+ # delegated to line_begin, and everything else is rescanned by expr_end.
+ w_any;
2131
+
2132
+ e_heredoc_nl '=begin' ( c_space | c_nl_zlen ) => {
2133
+ /* Rewind to ts and save the current DFA state in cs_before_block_comment before jumping to line_begin. */
+ p = ts - 1;
2134
+ state->cs_before_block_comment = state->cs;
2135
+ fgoto line_begin;
2136
+ };
2137
+
2138
+ operator_arithmetic '=' |
2139
+ operator_rest |
2140
+ punctuation_end |
2141
+ c_any
2142
+ => { p = ts - 1; fgoto expr_end; };
2143
+
2144
+ c_eof => do_eof;
2145
+ *|;
2147
+
2148
+ # expr_labelarg scanner: whitespace and comments are consumed. A newline is
+ # handed back to expr_end while in_kwarg is set and goes to line_begin
+ # otherwise; any other character is rescanned by expr_beg.
+ expr_labelarg := |*
2148
+ w_space_comment;
2149
+
2150
+ w_newline => {
2151
+ if (state->in_kwarg) {
2152
+ fhold; fgoto expr_end;
2153
+ } else {
2154
+ fgoto line_begin;
2155
+ }
2156
+ };
2157
+
2158
+ c_any => { fhold; fgoto expr_beg; };
2159
+
2160
+ c_eof => do_eof;
2161
+ *|;
2163
+
2164
+ # expr_value scanner: a label is rewound and rescanned by expr_end; a quote
+ # starts a literal whose type and delimiter are both the quote character; a
+ # newline goes to line_begin; any other character is rescanned by expr_beg.
+ expr_value := |*
2164
+ label (any - ':') => { p = ts - 1; fgoto expr_end; };
2165
+
2166
+ ['"] => { /* ' */
2167
+ VALUE type = tok(state, ts, te);
2168
+ fgoto *push_literal(state, type, type, ts, 0, 0, 0, 0);
2169
+ };
2170
+
2171
+ w_space_comment;
2172
+
2173
+ w_newline => { fgoto line_begin; };
2174
+
2175
+ c_any => { fhold; fgoto expr_beg; };
2176
+
2177
+ c_eof => do_eof;
2178
+ *|;
2180
+
2181
+ # expr_end scanner. This first group handles the lambda arrow and the brace
+ # or do that may open a lambda body.
+ expr_end := |*
2181
+ # The arrow pushes the current paren_nest onto lambda_stack so that a later
+ # brace or do at the same nesting level can be recognised as the body start.
+ '->' => {
2182
+ emit_token(state, tLAMBDA, tok(state, ts, ts + 2), ts, ts + 2);
2183
+ rb_ary_push(state->lambda_stack, INT2NUM(state->paren_nest));
2184
+ fnext expr_endfn; fbreak;
2185
+ };
2186
+
2187
+ e_lbrace => {
2188
+ VALUE val = array_last(state->lambda_stack);
2189
+ if (val != Qnil && NUM2INT(val) == state->paren_nest) {
2190
+ rb_ary_pop(state->lambda_stack);
2191
+ emit(tLAMBEG);
2192
+ } else {
2193
+ emit(tLCURLY);
2194
+ }
2195
+ fnext expr_value; fbreak;
2196
+ };
2197
+
2198
+ 'do' => {
2199
+ /* kDO_LAMBDA when the last lambda_stack entry equals paren_nest; otherwise emit_do chooses the token. */
+ VALUE val = array_last(state->lambda_stack);
2200
+ if (val != Qnil && NUM2INT(val) == state->paren_nest) {
2201
+ rb_ary_pop(state->lambda_stack);
2202
+ emit(kDO_LAMBDA);
2203
+ } else {
2204
+ emit_do(state, 0, ts, te);
2205
+ }
2206
+ fnext expr_value; fbreak;
2207
+ };
2209
+
2210
+ # Keyword rules. Each keyword class is emitted through the KEYWORDS table;
+ # the classes differ only in the state that scanning continues in.
+ keyword_with_fname
2210
+ => { emit_table_KEYWORDS(state, tok(state, ts, te), ts, te);
2211
+ fnext expr_fname; fbreak; };
2212
+
2213
+ # Emits two tokens: kCLASS for the first five characters and tLSHFT for the
+ # last two.
+ 'class' w_any* '<<'
2214
+ => { emit_token(state, kCLASS, rb_str_new2("class"), ts, ts + 5);
2215
+ emit_token(state, tLSHFT, rb_str_new2("<<"), te - 2, te);
2216
+ fnext expr_value; fbreak; };
2217
+
2218
+ keyword_modifier
2219
+ => { emit_table_KEYWORDS(state, tok(state, ts, te), ts, te);
2220
+ fnext expr_beg; fbreak; };
2221
+
2222
+ keyword_with_value
2223
+ => { emit_table_KEYWORDS(state, tok(state, ts, te), ts, te);
2224
+ fnext expr_value; fbreak; };
2225
+
2226
+ keyword_with_mid
2227
+ => { emit_table_KEYWORDS(state, tok(state, ts, te), ts, te);
2228
+ fnext expr_mid; fbreak; };
2229
+
2230
+ keyword_with_arg
2231
+ => {
2232
+ VALUE keyword = tok(state, ts, te);
2233
+ emit_table_KEYWORDS(state, keyword, ts, te);
2234
+
2235
+ /* NOTE(review): strcmp assumes RSTRING_PTR(keyword) is NUL-terminated single-byte text; confirm this holds for strings returned by tok. */
+ if (state->version == 18 && strcmp(RSTRING_PTR(keyword), "not") == 0) {
2236
+ fnext expr_beg; fbreak;
2237
+ } else {
2238
+ fnext expr_arg; fbreak;
2239
+ }
2240
+ };
2241
+
2242
+ # Version 18 emits this word as tIDENTIFIER and picks the next state from the
+ # static environment; other versions emit k__ENCODING__ and leave the state
+ # unchanged.
+ '__ENCODING__' => {
2243
+ if (state->version == 18) {
2244
+ VALUE str = tok(state, ts, te);
2245
+ emit(tIDENTIFIER);
2246
+
2247
+ if (STATIC_ENV_DECLARED(str)) {
2248
+ fnext expr_end;
2249
+ } else {
2250
+ fnext *arg_or_cmdarg(command_state);
2251
+ }
2252
+ } else {
2253
+ emit(k__ENCODING__);
2254
+ }
2255
+ fbreak;
2256
+ };
2257
+
2258
+ keyword_with_end
2259
+ => { emit_table_KEYWORDS(state, tok(state, ts, te), ts, te);
2260
+ fbreak; };
2262
+
2263
+ # Integer literal. Each alternative records the numeric base and where the
+ # digits start (after the prefix, or at ts when there is no prefix letter);
+ # num_suffix_s marks where the digits end and the suffix begins.
+ ( '0' [Xx] %{ num_base = 16; num_digits_s = p; } int_hex
2263
+ | '0' [Dd] %{ num_base = 10; num_digits_s = p; } int_dec
2264
+ | '0' [Oo] %{ num_base = 8; num_digits_s = p; } int_dec
2265
+ | '0' [Bb] %{ num_base = 2; num_digits_s = p; } int_bin
2266
+ | [1-9] digit* '_'? %{ num_base = 10; num_digits_s = ts; } int_dec
2267
+ | '0' digit* '_'? %{ num_base = 8; num_digits_s = ts; } int_dec
2268
+ ) %{ num_suffix_s = p; } int_suffix
2269
+ => {
2270
+ int invalid_idx;
2271
+ VALUE digits = tok(state, num_digits_s, num_suffix_s);
2272
+
2273
+ /* Checks, in order: trailing underscore; empty digits (replaced by 0 for base 8 on version 18, an error otherwise); an 8 or 9 inside a base 8 literal. */
+ if (NUM2INT(rb_ary_entry(state->source_pts, num_suffix_s - 1)) == '_') {
2274
+ VALUE hash = rb_hash_new();
2275
+ rb_hash_aset(hash, character, rb_str_new2("_"));
2276
+ diagnostic(state, severity_error, trailing_in_number, hash,
2277
+ range(state, te - 1, te), empty_array);
2278
+ } else if (RSTRING_LEN(digits) == 0 && num_base == 8 && state->version == 18) {
2279
+ digits = rb_str_new2("0");
2280
+ } else if (RSTRING_LEN(digits) == 0) {
2281
+ diagnostic(state, severity_error, empty_numeric, Qnil,
2282
+ range(state, ts, te), empty_array);
2283
+ } else if (num_base == 8 && (invalid_idx = find_8_or_9(digits)) != -1) {
2284
+ long invalid_s = num_digits_s + invalid_idx;
2285
+ diagnostic(state, severity_error, invalid_octal, Qnil,
2286
+ range(state, invalid_s, invalid_s + 1), empty_array);
2287
+ }
2288
+
2289
+ VALUE integer = rb_str_to_inum(digits, num_base, 0);
2290
+ /* Versions 18 through 20 emit a plain tINTEGER that ends before the suffix and resume scanning at the suffix; later versions pass the value to num_xfrm. */
+ /* NOTE(review): the token start used here is numeric_s, while the float rules of this scanner pass ts; numeric_s is defined outside this block, so confirm it equals ts at this point. */
+ if (state->version >= 18 && state->version <= 20) {
2291
+ emit_token(state, tINTEGER, integer, numeric_s, num_suffix_s);
2292
+ p = num_suffix_s - 1;
2293
+ } else {
2294
+ num_xfrm(state, integer, numeric_s, te);
2295
+ }
2296
+
2297
+ fbreak;
2298
+ };
2300
+
2301
+ # Float rules. A fraction without an integer part is reported as a
+ # no_dot_digit_literal error.
+ flo_frac flo_pow?
2301
+ => {
2302
+ diagnostic(state, severity_error, no_dot_digit_literal, Qnil,
2303
+ range(state, ts, te), empty_array);
2304
+ };
2305
+
2306
+ # Integer followed by a bare exponent letter: an error on versions 18
+ # through 20; otherwise the integer is emitted and the letter is handed back.
+ flo_int [eE]
2307
+ => {
2308
+ if (state->version >= 18 && state->version <= 20) {
2309
+ VALUE hash = rb_hash_new();
2310
+ rb_hash_aset(hash, character, tok(state, te - 1, te));
2311
+ diagnostic(state, severity_error, trailing_in_number, hash,
2312
+ range(state, te - 1, te), empty_array);
2313
+ } else {
2314
+ VALUE integer = rb_str_to_inum(tok(state, ts, te - 1), 10, 0);
2315
+ emit_token(state, tINTEGER, integer, ts, te - 1);
2316
+ fhold; fbreak;
2317
+ }
2318
+ };
2319
+
2320
+ # Same as the previous rule, but for a number with a fraction: the value is
+ # obtained by calling to_f on the text and emitted as tFLOAT.
+ flo_int flo_frac [eE]
2321
+ => {
2322
+ if (state->version >= 18 && state->version <= 20) {
2323
+ VALUE hash = rb_hash_new();
2324
+ rb_hash_aset(hash, character, tok(state, te - 1, te));
2325
+ diagnostic(state, severity_error, trailing_in_number, hash,
2326
+ range(state, te - 1, te), empty_array);
2327
+ } else {
2328
+ VALUE fval = rb_funcall(tok(state, ts, te - 1), rb_intern("to_f"), 0);
2329
+ emit_token(state, tFLOAT, fval, ts, te - 1);
2330
+ fhold; fbreak;
2331
+ }
2332
+ };
2333
+
2334
+ # Complete float literal; num_suffix_s marks where the suffix begins.
+ flo_int
2335
+ ( flo_frac? flo_pow %{ num_suffix_s = p; } flo_pow_suffix
2336
+ | flo_frac %{ num_suffix_s = p; } flo_suffix
2337
+ )
2338
+ => {
2339
+ VALUE digits = tok(state, ts, num_suffix_s);
2340
+
2341
+ /* Versions 18 through 20 emit tFLOAT ending before the suffix and resume scanning at the suffix; later versions hand the digit text to num_xfrm. */
+ if (state->version >= 18 && state->version <= 20) {
2342
+ VALUE fval = rb_Float(digits);
2343
+ emit_token(state, tFLOAT, fval, ts, num_suffix_s);
2344
+ p = num_suffix_s - 1;
2345
+ } else {
2346
+ num_xfrm(state, digits, ts, te);
2347
+ }
2348
+ fbreak;
2349
+ };
2351
+
2352
+ # A backtick or quote starts a literal; unlike the other push_literal calls
+ # in this file, the last argument is 1 here.
+ '`' | ['"] => { /* ' */
2352
+ VALUE type = tok(state, ts, te);
2353
+ VALUE delimiter = tok(state, te - 1, te);
2354
+ fgoto *push_literal(state, type, delimiter, ts, 0, 0, 0, 1);
2355
+ };
2356
+
2357
+ constant => { emit(tCONSTANT); fnext *arg_or_cmdarg(command_state); fbreak; };
2358
+
2359
+ # The constant ends at tm; scanning resumes there without a state change.
+ constant ambiguous_const_suffix => {
2360
+ emit_token(state, tCONSTANT, tok(state, ts, tm), ts, tm);
2361
+ p = tm - 1;
2362
+ fbreak;
2363
+ };
2364
+
2365
+ # Variables are rewound to ts and scanned by expr_variable through fcall.
+ global_var | class_var_v | instance_var_v
2366
+ => { p = ts - 1; fcall expr_variable; };
2367
+
2368
+ '.' | '&.' | '::'
2369
+ => { emit_table_PUNCTUATION(state, tok(state, ts, te), ts, te);
2370
+ fnext expr_dot; fbreak; };
2371
+
2372
+ call_or_var => local_ident;
2373
+
2374
+ # When tm equals te the whole match is emitted as tFID; otherwise only the
+ # text up to tm is emitted as tIDENTIFIER and scanning resumes at tm.
+ bareword ambiguous_fid_suffix => {
2375
+ if (tm == te) {
2376
+ emit(tFID);
2377
+ } else {
2378
+ emit_token(state, tIDENTIFIER, tok(state, ts, tm), ts, tm);
2379
+ p = tm - 1;
2380
+ }
2381
+ fnext expr_arg; fbreak;
2382
+ };
2383
+
2384
+ ( operator_arithmetic | operator_rest ) - ( '|' | '~' | '!' )
2385
+ => {
2386
+ emit_table_PUNCTUATION(state, tok(state, ts, te), ts, te);
2387
+ fnext expr_value; fbreak;
2388
+ };
2389
+
2390
+ ( e_lparen | '|' | '~' | '!' )
2391
+ => { emit_table_PUNCTUATION(state, tok(state, ts, te), ts, te);
2392
+ fnext expr_beg; fbreak; };
2394
+
2395
+ # Closing bracket rules. Each one emits its token and then pops the cond and
+ # cmdarg stack states: with lexpop below version 24, with pop from 24 on.
+ # The brace and square bracket also pick the next state: expr_end from
+ # version 25 on, expr_endarg before; the parenthesis leaves it unchanged.
+ e_rbrace => {
2395
+ emit(tRCURLY);
2396
+
2397
+ if (state->version < 24) {
2398
+ stack_state_lexpop(&state->cond);
2399
+ stack_state_lexpop(&state->cmdarg);
2400
+ } else {
2401
+ stack_state_pop(&state->cond);
2402
+ stack_state_pop(&state->cmdarg);
2403
+ }
2404
+
2405
+ if (state->version >= 25) {
2406
+ fnext expr_end;
2407
+ } else {
2408
+ fnext expr_endarg;
2409
+ }
2410
+
2411
+ fbreak;
2412
+ };
2413
+
2414
+ e_rparen => {
2415
+ emit(tRPAREN);
2416
+
2417
+ if (state->version < 24) {
2418
+ stack_state_lexpop(&state->cond);
2419
+ stack_state_lexpop(&state->cmdarg);
2420
+ } else {
2421
+ stack_state_pop(&state->cond);
2422
+ stack_state_pop(&state->cmdarg);
2423
+ }
2424
+
2425
+ fbreak;
2426
+ };
2427
+
2428
+ ']' => {
2429
+ emit(tRBRACK);
2430
+
2431
+ if (state->version < 24) {
2432
+ stack_state_lexpop(&state->cond);
2433
+ stack_state_lexpop(&state->cmdarg);
2434
+ } else {
2435
+ stack_state_pop(&state->cond);
2436
+ stack_state_pop(&state->cmdarg);
2437
+ }
2438
+
2439
+ if (state->version >= 25) {
2440
+ fnext expr_end;
2441
+ } else {
2442
+ fnext expr_endarg;
2443
+ }
2444
+
2445
+ fbreak;
2446
+ };
2448
+
2449
+ # Remaining expr_end rules. The assignment operator token carries the
+ # operator text without its trailing equals sign, while the range covers it.
+ operator_arithmetic '='
2449
+ => { emit_token(state, tOP_ASGN, tok(state, ts, te - 1), ts, te);
2450
+ fnext expr_beg; fbreak; };
2451
+
2452
+ '?' => { emit(tEH); fnext expr_value; fbreak; };
2453
+
2454
+ e_lbrack => { emit(tLBRACK2); fnext expr_beg; fbreak; };
2455
+
2456
+ punctuation_end
2457
+ => { emit_table_PUNCTUATION(state, tok(state, ts, te), ts, te);
2458
+ fnext expr_beg; fbreak; };
2459
+
2460
+ w_space_comment;
2461
+
2462
+ w_newline => { fgoto leading_dot; };
2463
+
2464
+ ';' => { emit(tSEMI); fnext expr_value; fbreak; };
2465
+
2466
+ # A backslash followed by a non-newline character is reported as a
+ # bare_backslash error on the backslash; the following character is handed
+ # back.
+ '\\' c_line {
2467
+ diagnostic(state, severity_error, bare_backslash, Qnil,
2468
+ range(state, ts, ts + 1), empty_array);
2469
+ fhold;
2470
+ };
2471
+
2472
+ # Any other character is a fatal unexpected diagnostic. The character is
+ # reported in its inspected form with the first and last characters of that
+ # form removed.
+ c_any
2473
+ => {
2474
+ VALUE hash = rb_hash_new();
2475
+ VALUE str = rb_str_inspect(tok(state, ts, te));
2476
+ rb_hash_aset(hash, character, rb_str_substr(str, 1, NUM2INT(rb_str_length(str)) - 2));
2477
+ diagnostic(state, fatal, unexpected, hash, range(state, ts, te), empty_array);
2478
+ };
2479
+
2480
+ c_eof => do_eof;
2481
+ *|;
2483
+
2484
+ # leading_dot scanner, entered from expr_end after a newline. When the next
+ # line starts (after optional spaces) with a dot or safe-navigation operator,
+ # scanning resumes at that operator in expr_end and no tNL is emitted.
+ # Otherwise tNL is emitted for the position saved in newline_s and the
+ # character is rescanned by line_begin.
+ leading_dot := |*
2484
+ c_space* %{ tm = p; } ('.' | '&.') => { p = tm - 1; fgoto expr_end; };
2485
+
2486
+ any => {
2487
+ emit_token(state, tNL, Qnil, state->newline_s, state->newline_s + 1);
2488
+ fhold; fnext line_begin; fbreak;
2489
+ };
2490
+ *|;
2492
+
2493
+ # line_comment scanner: consumes the lines of a block comment. The closing
+ # line emits one comment spanning from eq_begin_s to te and returns to the
+ # state saved in cs_before_block_comment. Reaching a zero-length match
+ # without a closing line is a fatal embedded_document diagnostic located at
+ # the six characters starting at eq_begin_s.
+ line_comment := |*
2493
+ '=end' c_line* c_nl_zlen => {
2494
+ emit_comment(state, state->eq_begin_s, te);
2495
+ fgoto *state->cs_before_block_comment;
2496
+ };
2497
+
2498
+ c_line* c_nl;
2499
+
2500
+ c_line* zlen => {
2501
+ diagnostic(state, fatal, embedded_document, Qnil,
2502
+ range(state, state->eq_begin_s, state->eq_begin_s + 6),
2503
+ empty_array);
2504
+ };
2505
+ *|;
2507
+
2508
+ # line_begin scanner: whitespace is consumed; a block comment start records
+ # its position in eq_begin_s and continues in line_comment; any other
+ # character is rescanned by expr_value.
+ line_begin := |*
2508
+ w_any;
2509
+
2510
+ '=begin' ( c_space | c_nl_zlen ) => {
2511
+ state->eq_begin_s = ts;
2512
+ fgoto line_comment;
2513
+ };
2514
+
2515
+ # The end marker moves p to pe - 3. NOTE(review): presumably this makes the
+ # scanner stop at the end of input; confirm against the main scan loop.
+ '__END__' ( c_eol - zlen ) => { p = pe - 3; };
2516
+
2517
+ c_any => { fhold; fgoto expr_value; };
2518
+
2519
+ c_eof => do_eof;
2520
+ *|;
2522
+ }%%