c_lexer 2.5.1.0.pre1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.gitmodules +3 -0
- data/.travis.yml +7 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +53 -0
- data/README.md +26 -0
- data/Rakefile +51 -0
- data/bin/c-ruby-parse +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/c_lexer.gemspec +36 -0
- data/ext/lexer/cmdarg.h +47 -0
- data/ext/lexer/cond.h +47 -0
- data/ext/lexer/emit_tables.h +177 -0
- data/ext/lexer/extconf.rb +5 -0
- data/ext/lexer/lexer.h +308 -0
- data/ext/lexer/lexer.rl +2522 -0
- data/ext/lexer/literal.h +62 -0
- data/ext/lexer/literal/methods.h +371 -0
- data/ext/lexer/literal/type.h +28 -0
- data/ext/lexer/stack.h +41 -0
- data/ext/lexer/stack_state.h +48 -0
- data/lib/c_lexer.rb +80 -0
- data/lib/c_lexer/version.rb +3 -0
- metadata +195 -0
data/ext/lexer/literal.h
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
#ifndef LITERAL_H
|
2
|
+
#define LITERAL_H
|
3
|
+
|
4
|
+
#include "literal/type.h"
|
5
|
+
typedef struct literal literal;
|
6
|
+
#include "lexer.h"
|
7
|
+
|
8
|
+
struct literal {
|
9
|
+
struct lexer_state *lexer;
|
10
|
+
VALUE buffer;
|
11
|
+
long buffer_s;
|
12
|
+
long buffer_e;
|
13
|
+
|
14
|
+
int nesting;
|
15
|
+
|
16
|
+
str_type str_type;
|
17
|
+
VALUE start_tok;
|
18
|
+
int interpolate;
|
19
|
+
|
20
|
+
VALUE start_delim;
|
21
|
+
VALUE end_delim;
|
22
|
+
VALUE delimiter;
|
23
|
+
|
24
|
+
long heredoc_e;
|
25
|
+
long str_s;
|
26
|
+
long herebody_s;
|
27
|
+
|
28
|
+
int indent;
|
29
|
+
int label_allowed;
|
30
|
+
int interp_braces;
|
31
|
+
int space_emitted;
|
32
|
+
int monolithic;
|
33
|
+
|
34
|
+
int dedent_body;
|
35
|
+
int dedent_level;
|
36
|
+
};
|
37
|
+
|
38
|
+
static void literal_init(literal*, lexer_state*, VALUE, VALUE, long, long, int, int, int);
|
39
|
+
static str_type literal_string_to_str_type(VALUE);
|
40
|
+
static VALUE literal_str_type_to_string(str_type);
|
41
|
+
static void literal_set_start_tok_and_interpolate(literal*, str_type);
|
42
|
+
static VALUE literal_get_start_delim(VALUE);
|
43
|
+
static VALUE literal_get_end_delim(VALUE);
|
44
|
+
static int literal_munge_escape_p(literal*, VALUE);
|
45
|
+
static int literal_nest_and_try_closing(literal*, VALUE, long, long, VALUE);
|
46
|
+
static void literal_emit_start_tok(literal*);
|
47
|
+
static void literal_start_interp_brace(literal*);
|
48
|
+
static int literal_end_interp_brace_and_try_closing(literal*);
|
49
|
+
static void literal_extend_string(literal*, VALUE, long, long);
|
50
|
+
static void literal_flush_string(literal*);
|
51
|
+
static void literal_extend_content(literal*);
|
52
|
+
static void literal_extend_space(literal*, long, long);
|
53
|
+
static int literal_words_p(literal*);
|
54
|
+
static int literal_regexp_p(literal*);
|
55
|
+
static int literal_heredoc_p(literal*);
|
56
|
+
static int literal_squiggly_heredoc_p(literal*);
|
57
|
+
static int newline_char_p(VALUE);
|
58
|
+
static void literal_infer_indent_level(literal*, VALUE);
|
59
|
+
static int next_state_for_literal(literal*);
|
60
|
+
static void literal_clear_buffer(literal*);
|
61
|
+
|
62
|
+
#endif
|
@@ -0,0 +1,371 @@
|
|
1
|
+
/*
 * Prepare *self for a literal that starts at position str_s.
 *
 * str_type is the literal's opening text (for example a quote or a %-form)
 * as a Ruby String; delimiter is its delimiter String.  An unrecognised
 * opening is reported through diagnostic() with severity_error.  Unless the
 * literal is "monolithic" (a plain '...' or "..." string that is not a
 * heredoc), its start token is emitted before returning.
 */
static void literal_init(literal *self, lexer_state *lexer, VALUE str_type,
                         VALUE delimiter, long str_s, long heredoc_e, int indent,
                         int dedent_body, int label_allowed)
{
  int plain_quote;

  self->lexer    = lexer;
  self->nesting  = 1;
  self->str_type = literal_string_to_str_type(str_type);

  if (self->str_type == INVALID) {
    VALUE args = rb_hash_new();
    rb_hash_aset(args, ID2SYM(rb_intern("type")), str_type);
    diagnostic(lexer, severity_error, unexpected_percent_str, args,
               range(lexer, str_s, str_s + 2), empty_array);
  }

  literal_set_start_tok_and_interpolate(self, self->str_type);

  self->str_s       = str_s;
  self->start_delim = literal_get_start_delim(delimiter);
  self->end_delim   = literal_get_end_delim(delimiter);
  self->delimiter   = delimiter;

  self->herebody_s    = 0;
  self->heredoc_e     = heredoc_e;
  self->indent        = indent;
  self->label_allowed = label_allowed;
  self->dedent_body   = dedent_body;
  self->dedent_level  = -1;  /* nothing measured yet */
  self->interp_braces = 0;
  self->space_emitted = 1;

  /* Plain quoted strings outside heredocs start out as a single-token
   * candidate; their start token is deferred until it turns out to be
   * needed (see literal_flush_string()). */
  plain_quote = self->str_type == SINGLE_QUOTE ||
                self->str_type == DOUBLE_QUOTE;
  self->monolithic = self->start_tok == tSTRING_BEG && plain_quote && !heredoc_e;

  literal_clear_buffer(self);

  if (self->monolithic) {
    return;
  }
  literal_emit_start_tok(self);
}
|
43
|
+
|
44
|
+
/*
 * Decode the opening text of a literal into a str_type.
 *
 * Only the leading one to three bytes of str are inspected.  Anything that
 * is not a recognised opening yields INVALID.
 */
static str_type literal_string_to_str_type(VALUE str)
{
  const char *s = RSTRING_PTR(str);
  char lead = s[0];

  if (lead == '"')  return DOUBLE_QUOTE;
  if (lead == '\'') return SINGLE_QUOTE;
  if (lead == '/')  return SLASH;
  if (lead == '`')  return BACKTICK;

  if (lead == '%') {
    switch (s[1]) {
    case '\0': return BARE_PERCENT;  /* "%" on its own */
    case 'q':  return PERCENT_Q;
    case 'Q':  return BIG_PERCENT_Q;
    case 'w':  return PERCENT_W;
    case 'W':  return BIG_PERCENT_W;
    case 'i':  return PERCENT_I;
    case 'I':  return BIG_PERCENT_I;
    case 's':  return PERCENT_S;
    case 'r':  return PERCENT_R;
    case 'x':  return PERCENT_X;
    default:   break;
    }
    return INVALID;
  }

  if (lead == ':') {
    if (s[1] == '"')  return SYM_DOUBLE_QUOT;
    if (s[1] == '\'') return SYM_SINGLE_QUOT;
    return INVALID;
  }

  /* Heredoc openers: "<<" followed by the quote character. */
  if (lead == '<' && s[1] == '<') {
    switch (s[2]) {
    case '"':  return LSHFT_DOUBLE_QUOT;
    case '\'': return LSHFT_SINGLE_QUOT;
    case '`':  return LSHFT_BACKTICK;
    default:   break;
    }
  }

  return INVALID;
}
|
86
|
+
|
87
|
+
/*
 * Return a new Ruby String holding the source spelling of stype, or Qnil
 * when stype has no spelling (INVALID or any value outside the enum).
 */
static VALUE literal_str_type_to_string(str_type stype)
{
  static const char *const spellings[] = {
    [SINGLE_QUOTE]      = "'",
    [DOUBLE_QUOTE]      = "\"",
    [PERCENT_Q]         = "%q",
    [BIG_PERCENT_Q]     = "%Q",
    [LSHFT_SINGLE_QUOT] = "<<'",
    [LSHFT_DOUBLE_QUOT] = "<<\"",
    [BARE_PERCENT]      = "%",
    [PERCENT_W]         = "%w",
    [BIG_PERCENT_W]     = "%W",
    [PERCENT_I]         = "%i",
    [BIG_PERCENT_I]     = "%I",
    [SYM_SINGLE_QUOT]   = ":'",
    [SYM_DOUBLE_QUOT]   = ":\"",
    [PERCENT_S]         = "%s",
    [SLASH]             = "/",
    [PERCENT_R]         = "%r",
    [PERCENT_X]         = "%x",
    [BACKTICK]          = "`",
    [LSHFT_BACKTICK]    = "<<`",
  };
  int known = (int)(sizeof spellings / sizeof spellings[0]);
  int idx = (int)stype;

  if (idx < 0 || idx >= known || !spellings[idx]) {
    return Qnil;
  }
  return rb_str_new2(spellings[idx]);
}
|
112
|
+
|
113
|
+
/*
 * Derive the start token and the interpolate flag of lit from its kind.
 *
 * lit   - literal whose start_tok and interpolate fields are written
 * stype - literal kind
 *
 * For a kind without a start token (INVALID), start_tok becomes Qnil and
 * interpolate is set to 0, so the field never holds an indeterminate value
 * for later readers.
 */
static void literal_set_start_tok_and_interpolate(literal *lit, str_type stype)
{
  switch (stype) {
  case SINGLE_QUOTE:
  case PERCENT_Q:
  case LSHFT_SINGLE_QUOT:
    lit->start_tok = tSTRING_BEG;
    lit->interpolate = 0;
    break;
  case DOUBLE_QUOTE:
  case BIG_PERCENT_Q:
  case BARE_PERCENT:
  case LSHFT_DOUBLE_QUOT:
    lit->start_tok = tSTRING_BEG;
    lit->interpolate = 1;
    break;
  case PERCENT_W:
    lit->start_tok = tQWORDS_BEG;
    lit->interpolate = 0;
    break;
  case BIG_PERCENT_W:
    lit->start_tok = tWORDS_BEG;
    lit->interpolate = 1;
    break;
  case PERCENT_I:
    lit->start_tok = tQSYMBOLS_BEG;
    lit->interpolate = 0;
    break;
  case BIG_PERCENT_I:
    lit->start_tok = tSYMBOLS_BEG;
    lit->interpolate = 1;
    break;
  case SYM_SINGLE_QUOT:
  case PERCENT_S:
    lit->start_tok = tSYMBEG;
    lit->interpolate = 0;
    break;
  case SYM_DOUBLE_QUOT:
    lit->start_tok = tSYMBEG;
    lit->interpolate = 1;
    break;
  case SLASH:
  case PERCENT_R:
    lit->start_tok = tREGEXP_BEG;
    lit->interpolate = 1;
    break;
  case PERCENT_X:
  case BACKTICK:
  case LSHFT_BACKTICK:
    lit->start_tok = tXSTRING_BEG;
    lit->interpolate = 1;
    break;
  default:
    lit->start_tok = Qnil;
    /* Previously left unset on this path. */
    lit->interpolate = 0;
    break;
  }
}
|
170
|
+
|
171
|
+
/*
 * Return str when it is an opening delimiter that has a distinct closing
 * counterpart; return Qnil when literal_get_end_delim() hands back str
 * itself, i.e. the same string both opens and closes the literal.
 */
static VALUE literal_get_start_delim(VALUE str)
{
  /* Identity comparison: the default path of literal_get_end_delim()
   * returns its argument unchanged. */
  return literal_get_end_delim(str) == str ? Qnil : str;
}
|
179
|
+
|
180
|
+
/*
 * Return the closing delimiter that pairs with str.  Bracket-like openers
 * map to a new String holding the matching closer; any other delimiter is
 * returned as-is (the very same object).
 */
static VALUE literal_get_end_delim(VALUE str)
{
  const char *closer;

  switch (RSTRING_PTR(str)[0]) {
  case '(': closer = ")"; break;
  case '[': closer = "]"; break;
  case '{': closer = "}"; break;
  case '<': closer = ">"; break;
  default:  return str;
  }

  return rb_str_new2(closer);
}
|
192
|
+
|
193
|
+
/* Return 1 when lit starts with one of the word/symbol list tokens, else 0. */
static int literal_words_p(literal *lit)
{
  VALUE tok = lit->start_tok;

  if (tok == tWORDS_BEG)    return 1;
  if (tok == tQWORDS_BEG)   return 1;
  if (tok == tSYMBOLS_BEG)  return 1;
  if (tok == tQSYMBOLS_BEG) return 1;
  return 0;
}
|
198
|
+
|
199
|
+
/* Return 1 when lit starts with tREGEXP_BEG, else 0. */
static int literal_regexp_p(literal *lit)
{
  if (lit->start_tok == tREGEXP_BEG) {
    return 1;
  }
  return 0;
}
|
203
|
+
|
204
|
+
/* Return 1 when lit is a heredoc (its heredoc_e is non-zero), else 0. */
static int literal_heredoc_p(literal *lit)
{
  return lit->heredoc_e != 0;
}
|
208
|
+
|
209
|
+
/* Return 1 when lit is a heredoc whose dedent_body flag is set, else 0. */
static int literal_squiggly_heredoc_p(literal *lit)
{
  if (!literal_heredoc_p(lit)) {
    return 0;
  }
  return lit->dedent_body != 0;
}
|
213
|
+
|
214
|
+
/* Return 1 when the closing delimiter of lit begins with a backslash. */
static int literal_backslash_delimited_p(literal *lit)
{
  const char *closer = RSTRING_PTR(lit->end_delim);

  return closer[0] == '\\';
}
|
218
|
+
|
219
|
+
/*
 * Return 1 when character falls in one of the following groups, else 0:
 *   - whitespace, while lit is a word/symbol list;
 *   - a backslash;
 *   - a String equal to either delimiter of lit.
 * Only the first byte of character is examined for the first two groups.
 */
static int literal_munge_escape_p(literal *lit, VALUE character)
{
  const char *c = RSTRING_PTR(character);

  if (literal_words_p(lit)) {
    switch (*c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\v':
    case '\f':
      return 1;
    default:
      break;
    }
  }

  if (*c == '\\') {
    return 1;
  }
  if (RTEST(rb_equal(character, lit->start_delim))) {
    return 1;
  }
  if (RTEST(rb_equal(character, lit->end_delim))) {
    return 1;
  }
  return 0;
}
|
234
|
+
|
235
|
+
/*
 * Account for a delimiter seen at [ts, te) and close the literal when its
 * nesting level drops to zero.
 *
 * An opening delimiter deepens the nesting; a closing one (compared after
 * left-stripping when lit->indent is set) reduces it.  On closing, exactly
 * one of tLABEL_END, tSTRING or tSTRING_END is emitted.  lookahead is either
 * Qnil or a String holding the text after the delimiter; it is used to
 * detect a quoted label.
 *
 * Returns 1 when the literal was closed, 0 otherwise.
 */
static int literal_nest_and_try_closing(literal *lit, VALUE delimiter, long ts, long te, VALUE lookahead)
{
  int quoted_label = 0;

  if (lit->start_delim != Qnil && rb_equal(lit->start_delim, delimiter)) {
    lit->nesting++;
  } else {
    VALUE candidate = lit->indent
                        ? rb_funcall(delimiter, rb_intern("lstrip"), 0)
                        : delimiter;

    if (rb_equal(lit->end_delim, candidate)) {
      lit->nesting--;
    }
  }

  if (lit->nesting != 0) {
    return 0;
  }

  if (literal_words_p(lit)) {
    literal_extend_space(lit, ts, ts);
  }

  if (lookahead != Qnil) {
    const char *next = RSTRING_PTR(lookahead);

    /* A single ':' (not "::") right after the closing quote of a plain
     * string makes it a label, when labels are allowed here. */
    quoted_label = next[0] == ':' && next[1] != ':' &&
                   lit->label_allowed && lit->start_tok == tSTRING_BEG;
  }

  if (quoted_label) {
    literal_flush_string(lit);
    /* te + 1 extends the token over the ':' as well. */
    emit_token(lit->lexer, tLABEL_END, lit->end_delim, ts, te + 1);
  } else if (lit->monolithic) {
    emit_token(lit->lexer, tSTRING, lit->buffer, lit->str_s, te);
  } else {
    if (!lit->heredoc_e) {
      literal_flush_string(lit);
    }
    emit_token(lit->lexer, tSTRING_END, lit->end_delim, ts, te);
  }

  return 1;
}
|
274
|
+
|
275
|
+
/*
 * Measure the leading indentation of line and lower lit->dedent_level to it
 * when it is smaller than the value recorded so far (or nothing has been
 * recorded yet).  A space counts as one column; a tab advances to the next
 * multiple of 8.  Lines made up only of spaces and tabs leave dedent_level
 * untouched.  Does nothing unless lit->dedent_body is set.
 */
static void literal_infer_indent_level(literal *lit, VALUE line)
{
  const char *cursor;
  int width = 0;

  if (!lit->dedent_body) {
    return;
  }

  for (cursor = RSTRING_PTR(line); *cursor != '\0'; cursor++) {
    switch (*cursor) {
    case ' ':
      width++;
      continue;
    case '\t':
      width += 8 - width % 8;
      continue;
    default:
      break;
    }

    /* First non-blank character: width is this line's indentation. */
    if (lit->dedent_level == -1 || width < lit->dedent_level) {
      lit->dedent_level = width;
    }
    return;
  }
}
|
296
|
+
|
297
|
+
/* Record one more open interpolation brace for lit. */
static void literal_start_interp_brace(literal *lit)
{
  lit->interp_braces++;
}
|
301
|
+
|
302
|
+
/*
 * Record one closed interpolation brace for lit.
 * Returns 1 when no open braces remain afterwards, 0 otherwise.
 */
static int literal_end_interp_brace_and_try_closing(literal *lit)
{
  lit->interp_braces -= 1;
  return lit->interp_braces == 0;
}
|
306
|
+
|
307
|
+
/*
 * Append str, found at [ts, te), to the pending content of lit.
 * buffer_s keeps the start of the first chunk since the last clear (0 is
 * used as the "not set" marker); buffer_e follows the end of the latest one.
 */
static void literal_extend_string(literal *lit, VALUE str, long ts, long te)
{
  if (lit->buffer_s == 0) {
    lit->buffer_s = ts;
  }
  lit->buffer_e = te;

  rb_str_concat(lit->buffer, str);
}
|
315
|
+
|
316
|
+
/*
 * Emit the pending content of lit, if any, as a tSTRING_CONTENT token and
 * reset the buffer.  A literal that was still monolithic first gets its
 * deferred start token and stops being monolithic.
 */
static void literal_flush_string(literal *lit)
{
  if (lit->monolithic) {
    literal_emit_start_tok(lit);
    lit->monolithic = 0;
  }

  if (RSTRING_LEN(lit->buffer) <= 0) {
    return;  /* nothing buffered */
  }

  emit_token(lit->lexer, tSTRING_CONTENT, lit->buffer, lit->buffer_s, lit->buffer_e);
  literal_clear_buffer(lit);
  lit->space_emitted = 0;
}
|
329
|
+
|
330
|
+
/*
 * Clear the space_emitted flag of lit so that the next
 * literal_extend_space() call emits a tSPACE token again.
 */
static void literal_extend_content(literal *lit)
{
  lit->space_emitted = 0;
}
|
334
|
+
|
335
|
+
/*
 * Flush the pending content of lit, then emit a tSPACE token for [ts, te)
 * unless one has already been emitted since the last content.
 */
static void literal_extend_space(literal *lit, long ts, long te)
{
  literal_flush_string(lit);

  if (lit->space_emitted) {
    return;
  }

  emit_token(lit->lexer, tSPACE, Qnil, ts, te);
  lit->space_emitted = 1;
}
|
344
|
+
|
345
|
+
/*
 * Replace the content buffer of lit with a fresh empty String tagged with
 * the lexer's encoding, and reset both buffer positions to 0.
 */
static void literal_clear_buffer(literal *lit)
{
  lit->buffer = rb_str_new2("");
  force_encoding(lit->buffer, lit->lexer->encoding);

  lit->buffer_s = lit->buffer_e = 0;
}
|
353
|
+
|
354
|
+
/*
 * Emit the start token of lit.
 *
 * The token value is the spelling of the literal kind; for %-forms the
 * delimiter is appended to it.  The token spans from lit->str_s to
 * lit->heredoc_e for heredocs, and otherwise to str_s plus the length of
 * the spelling.
 *
 * A kind without a spelling (INVALID) emits nothing: in that case
 * literal_str_type_to_string() returns Qnil, and RSTRING_PTR must not be
 * applied to a non-String value.
 * NOTE(review): literal_init() reports INVALID through diagnostic() before
 * getting here; confirm whether that call ever returns.
 */
static void literal_emit_start_tok(literal *lit)
{
  VALUE spelling = literal_str_type_to_string(lit->str_type);
  long str_e;

  if (spelling == Qnil) {
    return;
  }

  if (*RSTRING_PTR(spelling) == '%') {
    rb_str_concat(spelling, lit->delimiter);
  }

  str_e = lit->heredoc_e;
  if (str_e == 0) {
    str_e = lit->str_s + NUM2INT(rb_str_length(spelling));
  }

  emit_token(lit->lexer, lit->start_tok, spelling, lit->str_s, str_e);
}
|
367
|
+
|
368
|
+
/*
 * Return 1 when str equals the one-character String "\n", else 0.
 *
 * Defined as static inline to agree with the static prototype in literal.h.
 * A plain `inline` definition only gets internal linkage because that
 * prototype is visible; without it, no external definition would be emitted.
 */
static inline int newline_char_p(VALUE str)
{
  return RTEST(rb_str_equal(str, rb_str_new2("\n")));
}
|
@@ -0,0 +1,28 @@
|
|
1
|
+
#ifndef LITERAL_TYPE_H
#define LITERAL_TYPE_H

/*
 * Kinds of string-like literal, named after the text that opens them.
 * The include guard keeps a second inclusion of this header from
 * redefining the enum.
 */
typedef enum {
  SINGLE_QUOTE,       /* '   */
  DOUBLE_QUOTE,       /* "   */
  PERCENT_Q,          /* %q  */
  BIG_PERCENT_Q,      /* %Q  */
  LSHFT_SINGLE_QUOT,  /* <<' */
  LSHFT_DOUBLE_QUOT,  /* <<" */
  BARE_PERCENT,       /* %   */

  PERCENT_W,          /* %w  */
  BIG_PERCENT_W,      /* %W  */

  PERCENT_I,          /* %i  */
  BIG_PERCENT_I,      /* %I  */

  SYM_SINGLE_QUOT,    /* :'  */
  SYM_DOUBLE_QUOT,    /* :"  */
  PERCENT_S,          /* %s  */

  SLASH,              /* / for regexp */
  PERCENT_R,          /* %r  */

  PERCENT_X,          /* %x for xstr */
  BACKTICK,           /* `   */
  LSHFT_BACKTICK,     /* <<` */

  INVALID             /* not a recognised literal opening */
} str_type;

#endif /* LITERAL_TYPE_H */
|