c_lexer 2.5.1.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.gitmodules +3 -0
- data/.travis.yml +7 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +53 -0
- data/README.md +26 -0
- data/Rakefile +51 -0
- data/bin/c-ruby-parse +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/c_lexer.gemspec +36 -0
- data/ext/lexer/cmdarg.h +47 -0
- data/ext/lexer/cond.h +47 -0
- data/ext/lexer/emit_tables.h +177 -0
- data/ext/lexer/extconf.rb +5 -0
- data/ext/lexer/lexer.h +308 -0
- data/ext/lexer/lexer.rl +2522 -0
- data/ext/lexer/literal.h +62 -0
- data/ext/lexer/literal/methods.h +371 -0
- data/ext/lexer/literal/type.h +28 -0
- data/ext/lexer/stack.h +41 -0
- data/ext/lexer/stack_state.h +48 -0
- data/lib/c_lexer.rb +80 -0
- data/lib/c_lexer/version.rb +3 -0
- metadata +195 -0
data/ext/lexer/literal.h
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
#ifndef LITERAL_H
|
2
|
+
#define LITERAL_H
|
3
|
+
|
4
|
+
#include "literal/type.h"
|
5
|
+
typedef struct literal literal;
|
6
|
+
#include "lexer.h"
|
7
|
+
|
8
|
+
struct literal {
|
9
|
+
struct lexer_state *lexer;
|
10
|
+
VALUE buffer;
|
11
|
+
long buffer_s;
|
12
|
+
long buffer_e;
|
13
|
+
|
14
|
+
int nesting;
|
15
|
+
|
16
|
+
str_type str_type;
|
17
|
+
VALUE start_tok;
|
18
|
+
int interpolate;
|
19
|
+
|
20
|
+
VALUE start_delim;
|
21
|
+
VALUE end_delim;
|
22
|
+
VALUE delimiter;
|
23
|
+
|
24
|
+
long heredoc_e;
|
25
|
+
long str_s;
|
26
|
+
long herebody_s;
|
27
|
+
|
28
|
+
int indent;
|
29
|
+
int label_allowed;
|
30
|
+
int interp_braces;
|
31
|
+
int space_emitted;
|
32
|
+
int monolithic;
|
33
|
+
|
34
|
+
int dedent_body;
|
35
|
+
int dedent_level;
|
36
|
+
};
|
37
|
+
|
38
|
+
static void literal_init(literal*, lexer_state*, VALUE, VALUE, long, long, int, int, int);
|
39
|
+
static str_type literal_string_to_str_type(VALUE);
|
40
|
+
static VALUE literal_str_type_to_string(str_type);
|
41
|
+
static void literal_set_start_tok_and_interpolate(literal*, str_type);
|
42
|
+
static VALUE literal_get_start_delim(VALUE);
|
43
|
+
static VALUE literal_get_end_delim(VALUE);
|
44
|
+
static int literal_munge_escape_p(literal*, VALUE);
|
45
|
+
static int literal_nest_and_try_closing(literal*, VALUE, long, long, VALUE);
|
46
|
+
static void literal_emit_start_tok(literal*);
|
47
|
+
static void literal_start_interp_brace(literal*);
|
48
|
+
static int literal_end_interp_brace_and_try_closing(literal*);
|
49
|
+
static void literal_extend_string(literal*, VALUE, long, long);
|
50
|
+
static void literal_flush_string(literal*);
|
51
|
+
static void literal_extend_content(literal*);
|
52
|
+
static void literal_extend_space(literal*, long, long);
|
53
|
+
static int literal_words_p(literal*);
|
54
|
+
static int literal_regexp_p(literal*);
|
55
|
+
static int literal_heredoc_p(literal*);
|
56
|
+
static int literal_squiggly_heredoc_p(literal*);
|
57
|
+
static int newline_char_p(VALUE);
|
58
|
+
static void literal_infer_indent_level(literal*, VALUE);
|
59
|
+
static int next_state_for_literal(literal*);
|
60
|
+
static void literal_clear_buffer(literal*);
|
61
|
+
|
62
|
+
#endif
|
@@ -0,0 +1,371 @@
|
|
1
|
+
/*
 * Initialise `self` for a literal that opens with the text `str_type`
 * and uses `delimiter`.
 *
 * Reports an unexpected_percent_str error through diagnostic() when the
 * opening text maps to INVALID. Unless the literal is monolithic (a plain
 * '...' or "..." string that is not a heredoc), the start token is emitted
 * right away.
 */
static void literal_init(literal *self, lexer_state *lexer, VALUE str_type,
                         VALUE delimiter, long str_s, long heredoc_e, int indent,
                         int dedent_body, int label_allowed)
{
  int plain_quotes;

  self->lexer    = lexer;
  self->nesting  = 1;
  self->str_type = literal_string_to_str_type(str_type);

  if (self->str_type == INVALID) {
    VALUE info = rb_hash_new();
    rb_hash_aset(info, ID2SYM(rb_intern("type")), str_type);
    diagnostic(lexer, severity_error, unexpected_percent_str, info,
               range(lexer, str_s, str_s + 2), empty_array);
  }

  literal_set_start_tok_and_interpolate(self, self->str_type);

  /* Positions and flags copied straight from the arguments. */
  self->str_s         = str_s;
  self->heredoc_e     = heredoc_e;
  self->indent        = indent;
  self->label_allowed = label_allowed;
  self->dedent_body   = dedent_body;

  /* Delimiters. */
  self->start_delim = literal_get_start_delim(delimiter);
  self->end_delim   = literal_get_end_delim(delimiter);
  self->delimiter   = delimiter;

  /* Fixed starting values. */
  self->herebody_s    = 0;
  self->dedent_level  = -1;
  self->interp_braces = 0;
  self->space_emitted = 1;

  plain_quotes = (self->str_type == SINGLE_QUOTE ||
                  self->str_type == DOUBLE_QUOTE);
  self->monolithic = (self->start_tok == tSTRING_BEG &&
                      plain_quotes &&
                      !heredoc_e);

  literal_clear_buffer(self);

  if (self->monolithic) {
    return;
  }
  literal_emit_start_tok(self);
}
|
43
|
+
|
44
|
+
/*
 * Map the opening text of a literal (for example "%q", ":'" or "<<\"")
 * to its str_type. Returns INVALID for anything unrecognised.
 *
 * Only the first three bytes are inspected, and each one is read only if
 * it lies inside the string (per RSTRING_LEN); a missing byte is treated
 * as '\0'. A lone "%" therefore still maps to BARE_PERCENT, without
 * relying on a NUL byte being present after the string data.
 */
static str_type literal_string_to_str_type(VALUE str)
{
  const char *p = RSTRING_PTR(str);
  long len = RSTRING_LEN(str);
  char c0 = len > 0 ? p[0] : '\0';
  char c1 = len > 1 ? p[1] : '\0';
  char c2 = len > 2 ? p[2] : '\0';

  switch (c0) {
  case '"':  return DOUBLE_QUOTE;
  case '\'': return SINGLE_QUOTE;
  case '/':  return SLASH;
  case '`':  return BACKTICK;

  case '%':
    switch (c1) {
    case 'q':  return PERCENT_Q;
    case 'Q':  return BIG_PERCENT_Q;
    case '\0': return BARE_PERCENT;
    case 'w':  return PERCENT_W;
    case 'W':  return BIG_PERCENT_W;
    case 'i':  return PERCENT_I;
    case 'I':  return BIG_PERCENT_I;
    case 's':  return PERCENT_S;
    case 'r':  return PERCENT_R;
    case 'x':  return PERCENT_X;
    default:   return INVALID;
    }

  case ':':
    switch (c1) {
    case '"':  return SYM_DOUBLE_QUOT;
    case '\'': return SYM_SINGLE_QUOT;
    default:   return INVALID;
    }

  case '<':
    /* Heredoc openers are "<<" followed by the quote character. */
    if (c1 != '<') {
      return INVALID;
    }
    switch (c2) {
    case '"':  return LSHFT_DOUBLE_QUOT;
    case '\'': return LSHFT_SINGLE_QUOT;
    case '`':  return LSHFT_BACKTICK;
    default:   return INVALID;
    }

  default:
    return INVALID;
  }
}
|
86
|
+
|
87
|
+
/*
 * Return a new Ruby string holding the opening text for `stype`
 * (without any delimiter), or Qnil when the kind has no textual form.
 */
static VALUE literal_str_type_to_string(str_type stype)
{
  const char *text = NULL;

  switch (stype) {
  case SINGLE_QUOTE:      text = "'";    break;
  case DOUBLE_QUOTE:      text = "\"";   break;
  case PERCENT_Q:         text = "%q";   break;
  case BIG_PERCENT_Q:     text = "%Q";   break;
  case LSHFT_SINGLE_QUOT: text = "<<'";  break;
  case LSHFT_DOUBLE_QUOT: text = "<<\""; break;
  case BARE_PERCENT:      text = "%";    break;
  case PERCENT_W:         text = "%w";   break;
  case BIG_PERCENT_W:     text = "%W";   break;
  case PERCENT_I:         text = "%i";   break;
  case BIG_PERCENT_I:     text = "%I";   break;
  case SYM_SINGLE_QUOT:   text = ":'";   break;
  case SYM_DOUBLE_QUOT:   text = ":\"";  break;
  case PERCENT_S:         text = "%s";   break;
  case SLASH:             text = "/";    break;
  case PERCENT_R:         text = "%r";   break;
  case PERCENT_X:         text = "%x";   break;
  case BACKTICK:          text = "`";    break;
  case LSHFT_BACKTICK:    text = "<<`";  break;
  default:                               break;
  }

  if (text == NULL) {
    return Qnil;
  }
  return rb_str_new2(text);
}
|
112
|
+
|
113
|
+
/*
 * Fill in lit->start_tok and lit->interpolate for the literal kind `stype`.
 *
 * Unrecognised kinds (including INVALID) get start_tok = Qnil and
 * interpolate = 0, so both fields are always initialised when this
 * function returns.
 */
static void literal_set_start_tok_and_interpolate(literal *lit, str_type stype)
{
  switch (stype) {
  case SINGLE_QUOTE:
  case PERCENT_Q:
  case LSHFT_SINGLE_QUOT:
    lit->start_tok = tSTRING_BEG;
    lit->interpolate = 0;
    break;
  case DOUBLE_QUOTE:
  case BIG_PERCENT_Q:
  case BARE_PERCENT:
  case LSHFT_DOUBLE_QUOT:
    lit->start_tok = tSTRING_BEG;
    lit->interpolate = 1;
    break;
  case PERCENT_W:
    lit->start_tok = tQWORDS_BEG;
    lit->interpolate = 0;
    break;
  case BIG_PERCENT_W:
    lit->start_tok = tWORDS_BEG;
    lit->interpolate = 1;
    break;
  case PERCENT_I:
    lit->start_tok = tQSYMBOLS_BEG;
    lit->interpolate = 0;
    break;
  case BIG_PERCENT_I:
    lit->start_tok = tSYMBOLS_BEG;
    lit->interpolate = 1;
    break;
  case SYM_SINGLE_QUOT:
  case PERCENT_S:
    lit->start_tok = tSYMBEG;
    lit->interpolate = 0;
    break;
  case SYM_DOUBLE_QUOT:
    lit->start_tok = tSYMBEG;
    lit->interpolate = 1;
    break;
  case SLASH:
  case PERCENT_R:
    lit->start_tok = tREGEXP_BEG;
    lit->interpolate = 1;
    break;
  case PERCENT_X:
  case BACKTICK:
  case LSHFT_BACKTICK:
    lit->start_tok = tXSTRING_BEG;
    lit->interpolate = 1;
    break;
  default:
    lit->start_tok = Qnil;
    /* Previously left unset here, so later reads saw an indeterminate value. */
    lit->interpolate = 0;
    break;
  }
}
|
170
|
+
|
171
|
+
/*
 * Return `str` when it has a distinct closing counterpart (that is, when
 * literal_get_end_delim() returns a different object), otherwise Qnil.
 */
static VALUE literal_get_start_delim(VALUE str)
{
  VALUE closing = literal_get_end_delim(str);

  return (closing != str) ? str : Qnil;
}
|
179
|
+
|
180
|
+
/*
 * Return the closing delimiter matching `str`: a new string ")", "]", "}"
 * or ">" when `str` starts with the corresponding opening bracket, and
 * `str` itself in every other case.
 */
static VALUE literal_get_end_delim(VALUE str)
{
  const char opener = *RSTRING_PTR(str);
  const char *closer;

  if (opener == '(') {
    closer = ")";
  } else if (opener == '[') {
    closer = "]";
  } else if (opener == '{') {
    closer = "}";
  } else if (opener == '<') {
    closer = ">";
  } else {
    return str;
  }

  return rb_str_new2(closer);
}
|
192
|
+
|
193
|
+
/* Non-zero when the literal opened with one of the word/symbol list tokens. */
static int literal_words_p(literal *lit)
{
  VALUE tok = lit->start_tok;

  if (tok == tWORDS_BEG || tok == tQWORDS_BEG) {
    return 1;
  }
  return tok == tSYMBOLS_BEG || tok == tQSYMBOLS_BEG;
}
|
198
|
+
|
199
|
+
/* 1 when the literal's start token is tREGEXP_BEG, 0 otherwise. */
static int literal_regexp_p(literal *lit)
{
  return (lit->start_tok == tREGEXP_BEG) ? 1 : 0;
}
|
203
|
+
|
204
|
+
/* 1 when heredoc_e is non-zero (the literal is a heredoc), 0 otherwise. */
static int literal_heredoc_p(literal *lit)
{
  return lit->heredoc_e != 0;
}
|
208
|
+
|
209
|
+
/* 1 when the literal is a heredoc whose body is to be dedented, else 0. */
static int literal_squiggly_heredoc_p(literal *lit)
{
  if (!literal_heredoc_p(lit)) {
    return 0;
  }
  return lit->dedent_body != 0;
}
|
213
|
+
|
214
|
+
/* 1 when the closing delimiter begins with a backslash, 0 otherwise. */
static int literal_backslash_delimited_p(literal *lit)
{
  const char *closing = RSTRING_PTR(lit->end_delim);

  return closing[0] == '\\';
}
|
218
|
+
|
219
|
+
/*
 * Decide whether `character` gets special escape handling inside this
 * literal. Returns 1 when
 *   - the literal is a word list and `character` starts with whitespace, or
 *   - `character` starts with a backslash, or
 *   - `character` equals the start or end delimiter;
 * and 0 otherwise.
 */
static int literal_munge_escape_p(literal *lit, VALUE character)
{
  const char first = *RSTRING_PTR(character);

  if (literal_words_p(lit)) {
    switch (first) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\v':
    case '\f':
      return 1;
    default:
      break;
    }
  }

  if (first == '\\') {
    return 1;
  }
  if (RTEST(rb_equal(character, lit->start_delim))) {
    return 1;
  }
  return RTEST(rb_equal(character, lit->end_delim)) ? 1 : 0;
}
|
234
|
+
|
235
|
+
/*
 * Account for `delimiter` (found at ts..te) and close the literal if it
 * brings the nesting depth to zero.
 *
 * An opening delimiter increases the depth; a closing one (compared after
 * lstrip when lit->indent is set) decreases it. On close, exactly one of
 * tLABEL_END, tSTRING or tSTRING_END is emitted:
 *   - tLABEL_END when `lookahead` starts with a single ':' and the literal
 *     is a label-capable tSTRING_BEG string,
 *   - tSTRING with the whole buffer when the literal is still monolithic,
 *   - tSTRING_END otherwise (pending content is flushed first, except for
 *     heredocs).
 *
 * Returns 1 when the literal was closed, 0 when it is still open.
 */
static int literal_nest_and_try_closing(literal *lit, VALUE delimiter, long ts, long te, VALUE lookahead)
{
  int is_label = 0;

  if (lit->start_delim != Qnil && RTEST(rb_equal(lit->start_delim, delimiter))) {
    lit->nesting++;
  } else {
    VALUE candidate = delimiter;

    if (lit->indent) {
      candidate = rb_funcall(delimiter, rb_intern("lstrip"), 0);
    }
    if (RTEST(rb_equal(lit->end_delim, candidate))) {
      lit->nesting--;
    }
  }

  if (lit->nesting != 0) {
    return 0;
  }

  if (literal_words_p(lit)) {
    literal_extend_space(lit, ts, ts);
  }

  if (lookahead != Qnil) {
    const char *ahead = RSTRING_PTR(lookahead);

    is_label = ahead[0] == ':' && ahead[1] != ':' &&
               lit->label_allowed && lit->start_tok == tSTRING_BEG;
  }

  if (is_label) {
    literal_flush_string(lit);
    emit_token(lit->lexer, tLABEL_END, lit->end_delim, ts, te + 1);
    return 1;
  }

  if (lit->monolithic) {
    emit_token(lit->lexer, tSTRING, lit->buffer, lit->str_s, te);
    return 1;
  }

  if (!lit->heredoc_e) {
    literal_flush_string(lit);
  }
  emit_token(lit->lexer, tSTRING_END, lit->end_delim, ts, te);
  return 1;
}
|
274
|
+
|
275
|
+
/*
 * Measure the leading indentation of `line` and lower lit->dedent_level to
 * it when it is smaller (or when no level has been recorded yet, i.e. -1).
 *
 * A space counts as one column and a tab advances to the next multiple of
 * eight. Lines made up only of spaces and tabs leave dedent_level
 * untouched. Does nothing when lit->dedent_body is not set.
 */
static void literal_infer_indent_level(literal *lit, VALUE line)
{
  const char *cursor;
  int width = 0;

  if (!lit->dedent_body) {
    return;
  }

  for (cursor = RSTRING_PTR(line); *cursor != '\0'; cursor++) {
    switch (*cursor) {
    case ' ':
      width++;
      break;
    case '\t':
      width += 8 - (width % 8);
      break;
    default:
      /* First non-blank character: record the indentation and stop. */
      if (lit->dedent_level == -1 || width < lit->dedent_level) {
        lit->dedent_level = width;
      }
      return;
    }
  }
}
|
296
|
+
|
297
|
+
/* Record one more open interpolation brace. */
static void literal_start_interp_brace(literal *lit)
{
  lit->interp_braces++;
}
|
301
|
+
|
302
|
+
/*
 * Record one closed interpolation brace.
 * Returns 1 when no interpolation braces remain open, 0 otherwise.
 */
static int literal_end_interp_brace_and_try_closing(literal *lit)
{
  lit->interp_braces -= 1;
  return lit->interp_braces == 0;
}
|
306
|
+
|
307
|
+
/*
 * Append `str` (found at ts..te) to the pending buffer.
 * buffer_s keeps the start of the first chunk (0 means "not set yet");
 * buffer_e always moves to the end of the latest chunk.
 */
static void literal_extend_string(literal *lit, VALUE str, long ts, long te)
{
  if (lit->buffer_s == 0) {
    lit->buffer_s = ts;
  }
  lit->buffer_e = te;

  rb_str_concat(lit->buffer, str);
}
|
315
|
+
|
316
|
+
/*
 * Emit whatever content is pending as a tSTRING_CONTENT token.
 *
 * A literal that was still monolithic first gets its start token emitted
 * and stops being monolithic. If the buffer is empty nothing else
 * happens; otherwise the buffer is emitted, reset, and space_emitted is
 * cleared.
 */
static void literal_flush_string(literal *lit)
{
  if (lit->monolithic) {
    literal_emit_start_tok(lit);
    lit->monolithic = 0;
  }

  if (RSTRING_LEN(lit->buffer) <= 0) {
    return;
  }

  emit_token(lit->lexer, tSTRING_CONTENT, lit->buffer, lit->buffer_s, lit->buffer_e);
  literal_clear_buffer(lit);
  lit->space_emitted = 0;
}
|
329
|
+
|
330
|
+
/* Clear space_emitted so that a following space is emitted again. */
static void literal_extend_content(literal *lit)
{
  lit->space_emitted = 0;
}
|
334
|
+
|
335
|
+
/*
 * Flush pending content, then emit a tSPACE token for ts..te unless one
 * has already been emitted since the last content.
 */
static void literal_extend_space(literal *lit, long ts, long te)
{
  literal_flush_string(lit);

  if (lit->space_emitted) {
    return;
  }

  emit_token(lit->lexer, tSPACE, Qnil, ts, te);
  lit->space_emitted = 1;
}
|
344
|
+
|
345
|
+
/*
 * Replace the pending buffer with a fresh empty string in the lexer's
 * encoding and reset both buffer positions to 0.
 */
static void literal_clear_buffer(literal *lit)
{
  lit->buffer = rb_str_new2("");
  force_encoding(lit->buffer, lit->lexer->encoding);

  lit->buffer_e = 0;
  lit->buffer_s = 0;
}
|
353
|
+
|
354
|
+
/*
 * Emit the token that opens this literal.
 *
 * The token value is the opening text for lit->str_type, with the
 * delimiter appended for the %-forms. The token spans from str_s to
 * heredoc_e for heredocs, and from str_s to str_s plus the length of the
 * opening text otherwise.
 *
 * literal_str_type_to_string() returns Qnil for kinds it does not know
 * (such as INVALID). There is no opening text to emit in that case, so
 * the function returns without emitting instead of dereferencing Qnil.
 */
static void literal_emit_start_tok(literal *lit)
{
  VALUE str_type = literal_str_type_to_string(lit->str_type);
  long str_e;

  if (NIL_P(str_type)) {
    return;
  }

  if (*RSTRING_PTR(str_type) == '%') {
    rb_str_concat(str_type, lit->delimiter);
  }

  str_e = lit->heredoc_e;
  if (str_e == 0) {
    /* NUM2LONG keeps the arithmetic in the same width as the positions. */
    str_e = lit->str_s + NUM2LONG(rb_str_length(str_type));
  }

  emit_token(lit->lexer, lit->start_tok, str_type, lit->str_s, str_e);
}
|
367
|
+
|
368
|
+
/*
 * Return 1 when `str` equals the one-character string "\n", 0 otherwise.
 *
 * Declared `static inline` like the other helpers in this file, so the
 * function has internal linkage on its own and does not depend on an
 * earlier `static` prototype being visible at the point of definition.
 */
static inline int newline_char_p(VALUE str)
{
  return RTEST(rb_str_equal(str, rb_str_new2("\n")));
}
|
@@ -0,0 +1,28 @@
|
|
1
|
+
/*
 * Kinds of string-like literals, identified by their opening text.
 * INVALID is the last enumerator and marks an unrecognised opener.
 */
typedef enum {
  /* strings */
  SINGLE_QUOTE = 0,   /* '   */
  DOUBLE_QUOTE,       /* "   */
  PERCENT_Q,          /* %q  */
  BIG_PERCENT_Q,      /* %Q  */
  LSHFT_SINGLE_QUOT,  /* <<' */
  LSHFT_DOUBLE_QUOT,  /* <<" */
  BARE_PERCENT,       /* %   */

  /* word lists */
  PERCENT_W,          /* %w  */
  BIG_PERCENT_W,      /* %W  */

  /* symbol lists */
  PERCENT_I,          /* %i  */
  BIG_PERCENT_I,      /* %I  */

  /* symbols */
  SYM_SINGLE_QUOT,    /* :'  */
  SYM_DOUBLE_QUOT,    /* :"  */
  PERCENT_S,          /* %s  */

  /* regular expressions */
  SLASH,              /* /   */
  PERCENT_R,          /* %r  */

  /* xstrings */
  PERCENT_X,          /* %x  */
  BACKTICK,           /* `   */
  LSHFT_BACKTICK,     /* <<` */

  INVALID
} str_type;
|