herb 0.4.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Makefile +6 -3
- data/README.md +1 -1
- data/ext/herb/error_helpers.c +1 -1
- data/ext/herb/error_helpers.h +1 -1
- data/ext/herb/extension.c +20 -4
- data/ext/herb/nodes.c +70 -41
- data/ext/herb/nodes.h +1 -1
- data/lib/herb/ast/nodes.rb +149 -91
- data/lib/herb/cli.rb +19 -6
- data/lib/herb/errors.rb +1 -1
- data/lib/herb/version.rb +1 -1
- data/lib/herb/visitor.rb +11 -6
- data/sig/herb/ast/nodes.rbs +65 -38
- data/sig/herb/visitor.rbs +6 -3
- data/sig/serialized_ast_errors.rbs +1 -1
- data/sig/serialized_ast_nodes.rbs +15 -10
- data/src/analyze.c +2 -1
- data/src/ast_nodes.c +79 -36
- data/src/ast_pretty_print.c +21 -13
- data/src/errors.c +1 -1
- data/src/herb.c +2 -2
- data/src/include/ast_nodes.h +23 -15
- data/src/include/ast_pretty_print.h +1 -1
- data/src/include/errors.h +1 -1
- data/src/include/herb.h +2 -1
- data/src/include/lexer_peek_helpers.h +21 -0
- data/src/include/parser.h +18 -1
- data/src/include/parser_helpers.h +9 -0
- data/src/include/token_struct.h +7 -1
- data/src/include/version.h +1 -1
- data/src/lexer.c +21 -1
- data/src/lexer_peek_helpers.c +77 -0
- data/src/main.c +2 -2
- data/src/parser.c +532 -99
- data/src/parser_helpers.c +46 -0
- data/src/token.c +6 -0
- data/src/visitor.c +39 -6
- metadata +2 -2
data/src/parser.c
CHANGED
@@ -6,6 +6,7 @@
|
|
6
6
|
#include "include/errors.h"
|
7
7
|
#include "include/html_util.h"
|
8
8
|
#include "include/lexer.h"
|
9
|
+
#include "include/lexer_peek_helpers.h"
|
9
10
|
#include "include/parser_helpers.h"
|
10
11
|
#include "include/token.h"
|
11
12
|
#include "include/token_matchers.h"
|
@@ -17,22 +18,81 @@
|
|
17
18
|
#include <strings.h>
|
18
19
|
|
19
20
|
static void parser_parse_in_data_state(parser_T* parser, array_T* children, array_T* errors);
|
21
|
+
static void parser_parse_foreign_content(parser_T* parser, array_T* children, array_T* errors);
|
20
22
|
static AST_ERB_CONTENT_NODE_T* parser_parse_erb_tag(parser_T* parser);
|
23
|
+
static void parser_handle_whitespace(parser_T* parser, token_T* whitespace_token, array_T* children);
|
24
|
+
static void parser_consume_whitespace(parser_T* parser, array_T* children);
|
25
|
+
static void parser_skip_erb_content(lexer_T* lexer);
|
26
|
+
static bool parser_lookahead_erb_is_attribute(lexer_T* lexer);
|
27
|
+
static void parser_handle_erb_in_open_tag(parser_T* parser, array_T* children);
|
28
|
+
static void parser_handle_whitespace_in_open_tag(parser_T* parser, array_T* children);
|
21
29
|
|
22
30
|
size_t parser_sizeof(void) {
|
23
31
|
return sizeof(struct PARSER_STRUCT);
|
24
32
|
}
|
25
33
|
|
26
|
-
parser_T* parser_init(lexer_T* lexer) {
|
34
|
+
parser_T* parser_init(lexer_T* lexer, parser_options_T* options) {
|
27
35
|
parser_T* parser = calloc(1, parser_sizeof());
|
28
36
|
|
29
37
|
parser->lexer = lexer;
|
30
38
|
parser->current_token = lexer_next_token(lexer);
|
31
39
|
parser->open_tags_stack = array_init(16);
|
40
|
+
parser->state = PARSER_STATE_DATA;
|
41
|
+
parser->foreign_content_type = FOREIGN_CONTENT_UNKNOWN;
|
42
|
+
|
43
|
+
if (options) {
|
44
|
+
parser->options = calloc(1, sizeof(parser_options_T));
|
45
|
+
parser->options->track_whitespace = options->track_whitespace;
|
46
|
+
} else {
|
47
|
+
parser->options = NULL;
|
48
|
+
}
|
32
49
|
|
33
50
|
return parser;
|
34
51
|
}
|
35
52
|
|
53
|
+
static AST_CDATA_NODE_T* parser_parse_cdata(parser_T* parser) {
|
54
|
+
array_T* errors = array_init(8);
|
55
|
+
array_T* children = array_init(8);
|
56
|
+
buffer_T content = buffer_new();
|
57
|
+
|
58
|
+
token_T* tag_opening = parser_consume_expected(parser, TOKEN_CDATA_START, errors);
|
59
|
+
position_T* start = position_copy(parser->current_token->location->start);
|
60
|
+
|
61
|
+
while (token_is_none_of(parser, TOKEN_CDATA_END, TOKEN_EOF)) {
|
62
|
+
if (token_is(parser, TOKEN_ERB_START)) {
|
63
|
+
parser_append_literal_node_from_buffer(parser, &content, children, start);
|
64
|
+
AST_ERB_CONTENT_NODE_T* erb_node = parser_parse_erb_tag(parser);
|
65
|
+
array_append(children, erb_node);
|
66
|
+
position_free(start);
|
67
|
+
start = position_copy(parser->current_token->location->start);
|
68
|
+
continue;
|
69
|
+
}
|
70
|
+
|
71
|
+
token_T* token = parser_advance(parser);
|
72
|
+
buffer_append(&content, token->value);
|
73
|
+
token_free(token);
|
74
|
+
}
|
75
|
+
|
76
|
+
parser_append_literal_node_from_buffer(parser, &content, children, start);
|
77
|
+
token_T* tag_closing = parser_consume_expected(parser, TOKEN_CDATA_END, errors);
|
78
|
+
|
79
|
+
AST_CDATA_NODE_T* cdata = ast_cdata_node_init(
|
80
|
+
tag_opening,
|
81
|
+
children,
|
82
|
+
tag_closing,
|
83
|
+
tag_opening->location->start,
|
84
|
+
tag_closing->location->end,
|
85
|
+
errors
|
86
|
+
);
|
87
|
+
|
88
|
+
position_free(start);
|
89
|
+
buffer_free(&content);
|
90
|
+
token_free(tag_opening);
|
91
|
+
token_free(tag_closing);
|
92
|
+
|
93
|
+
return cdata;
|
94
|
+
}
|
95
|
+
|
36
96
|
static AST_HTML_COMMENT_NODE_T* parser_parse_html_comment(parser_T* parser) {
|
37
97
|
array_T* errors = array_init(8);
|
38
98
|
array_T* children = array_init(8);
|
@@ -125,6 +185,54 @@ static AST_HTML_DOCTYPE_NODE_T* parser_parse_html_doctype(parser_T* parser) {
|
|
125
185
|
return doctype;
|
126
186
|
}
|
127
187
|
|
188
|
+
static AST_XML_DECLARATION_NODE_T* parser_parse_xml_declaration(parser_T* parser) {
|
189
|
+
array_T* errors = array_init(8);
|
190
|
+
array_T* children = array_init(8);
|
191
|
+
buffer_T content = buffer_new();
|
192
|
+
|
193
|
+
token_T* tag_opening = parser_consume_expected(parser, TOKEN_XML_DECLARATION, errors);
|
194
|
+
|
195
|
+
position_T* start = position_copy(parser->current_token->location->start);
|
196
|
+
|
197
|
+
while (token_is_none_of(parser, TOKEN_XML_DECLARATION_END, TOKEN_EOF)) {
|
198
|
+
if (token_is(parser, TOKEN_ERB_START)) {
|
199
|
+
parser_append_literal_node_from_buffer(parser, &content, children, start);
|
200
|
+
|
201
|
+
AST_ERB_CONTENT_NODE_T* erb_node = parser_parse_erb_tag(parser);
|
202
|
+
array_append(children, erb_node);
|
203
|
+
|
204
|
+
position_free(start);
|
205
|
+
start = position_copy(parser->current_token->location->start);
|
206
|
+
|
207
|
+
continue;
|
208
|
+
}
|
209
|
+
|
210
|
+
token_T* token = parser_advance(parser);
|
211
|
+
buffer_append(&content, token->value);
|
212
|
+
token_free(token);
|
213
|
+
}
|
214
|
+
|
215
|
+
parser_append_literal_node_from_buffer(parser, &content, children, start);
|
216
|
+
|
217
|
+
token_T* tag_closing = parser_consume_expected(parser, TOKEN_XML_DECLARATION_END, errors);
|
218
|
+
|
219
|
+
AST_XML_DECLARATION_NODE_T* xml_declaration = ast_xml_declaration_node_init(
|
220
|
+
tag_opening,
|
221
|
+
children,
|
222
|
+
tag_closing,
|
223
|
+
tag_opening->location->start,
|
224
|
+
tag_closing->location->end,
|
225
|
+
errors
|
226
|
+
);
|
227
|
+
|
228
|
+
position_free(start);
|
229
|
+
token_free(tag_opening);
|
230
|
+
token_free(tag_closing);
|
231
|
+
buffer_free(&content);
|
232
|
+
|
233
|
+
return xml_declaration;
|
234
|
+
}
|
235
|
+
|
128
236
|
static AST_HTML_TEXT_NODE_T* parser_parse_text_content(parser_T* parser, array_T* document_errors) {
|
129
237
|
position_T* start = position_copy(parser->current_token->location->start);
|
130
238
|
|
@@ -185,96 +293,58 @@ static AST_HTML_TEXT_NODE_T* parser_parse_text_content(parser_T* parser, array_T
|
|
185
293
|
|
186
294
|
static AST_HTML_ATTRIBUTE_NAME_NODE_T* parser_parse_html_attribute_name(parser_T* parser) {
|
187
295
|
array_T* errors = array_init(8);
|
296
|
+
array_T* children = array_init(8);
|
297
|
+
buffer_T buffer = buffer_new();
|
298
|
+
position_T* start = position_copy(parser->current_token->location->start);
|
188
299
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
token_free(at_token);
|
300
|
+
while (token_is_none_of(
|
301
|
+
parser,
|
302
|
+
TOKEN_EQUALS,
|
303
|
+
TOKEN_WHITESPACE,
|
304
|
+
TOKEN_NEWLINE,
|
305
|
+
TOKEN_HTML_TAG_END,
|
306
|
+
TOKEN_HTML_TAG_SELF_CLOSE,
|
307
|
+
TOKEN_EOF
|
308
|
+
)) {
|
309
|
+
if (token_is(parser, TOKEN_ERB_START)) {
|
310
|
+
parser_append_literal_node_from_buffer(parser, &buffer, children, start);
|
202
311
|
|
203
|
-
|
204
|
-
|
205
|
-
} else {
|
206
|
-
first_token = parser_consume_if_present(parser, TOKEN_IDENTIFIER);
|
312
|
+
AST_ERB_CONTENT_NODE_T* erb_node = parser_parse_erb_tag(parser);
|
313
|
+
array_append(children, erb_node);
|
207
314
|
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
return attribute_name;
|
315
|
+
position_free(start);
|
316
|
+
start = position_copy(parser->current_token->location->start);
|
317
|
+
continue;
|
212
318
|
}
|
213
|
-
}
|
214
|
-
|
215
|
-
buffer_T name_buffer = buffer_new();
|
216
|
-
|
217
|
-
position_T* start_position;
|
218
319
|
|
219
|
-
|
220
|
-
buffer_append(&
|
221
|
-
|
222
|
-
} else {
|
223
|
-
start_position = position_copy(first_token->location->start);
|
320
|
+
token_T* token = parser_advance(parser);
|
321
|
+
buffer_append(&buffer, token->value);
|
322
|
+
token_free(token);
|
224
323
|
}
|
225
324
|
|
226
|
-
|
227
|
-
|
228
|
-
position_T* end_position = position_copy(first_token->location->end);
|
229
|
-
size_t range_end = first_token->range->to;
|
230
|
-
|
231
|
-
while (parser->current_token->type == TOKEN_CHARACTER && parser->current_token->value
|
232
|
-
&& strcmp(parser->current_token->value, ".") == 0) {
|
233
|
-
|
234
|
-
token_T* dot_token = parser_advance(parser);
|
235
|
-
|
236
|
-
buffer_append(&name_buffer, dot_token->value);
|
237
|
-
position_free(end_position);
|
238
|
-
|
239
|
-
end_position = position_copy(dot_token->location->end);
|
240
|
-
range_end = dot_token->range->to;
|
241
|
-
|
242
|
-
token_free(dot_token);
|
325
|
+
parser_append_literal_node_from_buffer(parser, &buffer, children, start);
|
243
326
|
|
244
|
-
|
245
|
-
|
327
|
+
position_T* node_start = NULL;
|
328
|
+
position_T* node_end = NULL;
|
246
329
|
|
247
|
-
|
248
|
-
|
330
|
+
if (children->size > 0) {
|
331
|
+
AST_NODE_T* first_child = array_get(children, 0);
|
332
|
+
AST_NODE_T* last_child = array_get(children, children->size - 1);
|
249
333
|
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
}
|
334
|
+
node_start = position_copy(first_child->location->start);
|
335
|
+
node_end = position_copy(last_child->location->end);
|
336
|
+
} else {
|
337
|
+
node_start = position_copy(parser->current_token->location->start);
|
338
|
+
node_end = position_copy(parser->current_token->location->start);
|
256
339
|
}
|
257
340
|
|
258
|
-
token_T* combined_token = calloc(1, sizeof(token_T));
|
259
|
-
combined_token->value = herb_strdup(name_buffer.value);
|
260
|
-
combined_token->type = TOKEN_IDENTIFIER;
|
261
|
-
combined_token->location =
|
262
|
-
location_from(start_position->line, start_position->column, end_position->line, end_position->column);
|
263
|
-
|
264
|
-
size_t range_start = at_token != NULL ? at_token->range->from : first_token->range->from;
|
265
|
-
combined_token->range = range_init(range_start, range_end);
|
266
|
-
|
267
341
|
AST_HTML_ATTRIBUTE_NAME_NODE_T* attribute_name =
|
268
|
-
ast_html_attribute_name_node_init(
|
269
|
-
|
270
|
-
buffer_free(&name_buffer);
|
271
|
-
position_free(start_position);
|
272
|
-
position_free(end_position);
|
273
|
-
token_free(first_token);
|
342
|
+
ast_html_attribute_name_node_init(children, node_start, node_end, errors);
|
274
343
|
|
275
|
-
|
276
|
-
|
277
|
-
|
344
|
+
position_free(start);
|
345
|
+
position_free(node_start);
|
346
|
+
position_free(node_end);
|
347
|
+
buffer_free(&buffer);
|
278
348
|
|
279
349
|
return attribute_name;
|
280
350
|
}
|
@@ -304,9 +374,87 @@ static AST_HTML_ATTRIBUTE_VALUE_NODE_T* parser_parse_quoted_html_attribute_value
|
|
304
374
|
continue;
|
305
375
|
}
|
306
376
|
|
377
|
+
if (token_is(parser, TOKEN_BACKSLASH)) {
|
378
|
+
lexer_state_snapshot_T saved_state = lexer_save_state(parser->lexer);
|
379
|
+
|
380
|
+
token_T* next_token = lexer_next_token(parser->lexer);
|
381
|
+
|
382
|
+
if (next_token && next_token->type == TOKEN_QUOTE && opening_quote != NULL
|
383
|
+
&& strcmp(next_token->value, opening_quote->value) == 0) {
|
384
|
+
buffer_append(&buffer, parser->current_token->value);
|
385
|
+
buffer_append(&buffer, next_token->value);
|
386
|
+
|
387
|
+
token_free(parser->current_token);
|
388
|
+
token_free(next_token);
|
389
|
+
|
390
|
+
parser->current_token = lexer_next_token(parser->lexer);
|
391
|
+
continue;
|
392
|
+
} else {
|
393
|
+
lexer_restore_state(parser->lexer, saved_state);
|
394
|
+
|
395
|
+
if (next_token) { token_free(next_token); }
|
396
|
+
}
|
397
|
+
}
|
398
|
+
|
307
399
|
buffer_append(&buffer, parser->current_token->value);
|
308
400
|
token_free(parser->current_token);
|
401
|
+
|
402
|
+
parser->current_token = lexer_next_token(parser->lexer);
|
403
|
+
}
|
404
|
+
|
405
|
+
if (token_is(parser, TOKEN_QUOTE) && opening_quote != NULL
|
406
|
+
&& strcmp(parser->current_token->value, opening_quote->value) == 0) {
|
407
|
+
lexer_state_snapshot_T saved_state = lexer_save_state(parser->lexer);
|
408
|
+
|
409
|
+
token_T* potential_closing = parser->current_token;
|
309
410
|
parser->current_token = lexer_next_token(parser->lexer);
|
411
|
+
|
412
|
+
if (token_is(parser, TOKEN_IDENTIFIER) || token_is(parser, TOKEN_CHARACTER)) {
|
413
|
+
append_unexpected_error(
|
414
|
+
"Unescaped quote character in attribute value",
|
415
|
+
"escaped quote (\\') or different quote style (\")",
|
416
|
+
opening_quote->value,
|
417
|
+
potential_closing->location->start,
|
418
|
+
potential_closing->location->end,
|
419
|
+
errors
|
420
|
+
);
|
421
|
+
|
422
|
+
lexer_restore_state(parser->lexer, saved_state);
|
423
|
+
|
424
|
+
token_free(parser->current_token);
|
425
|
+
parser->current_token = potential_closing;
|
426
|
+
|
427
|
+
buffer_append(&buffer, parser->current_token->value);
|
428
|
+
token_free(parser->current_token);
|
429
|
+
parser->current_token = lexer_next_token(parser->lexer);
|
430
|
+
|
431
|
+
while (!token_is(parser, TOKEN_EOF)
|
432
|
+
&& !(
|
433
|
+
token_is(parser, TOKEN_QUOTE) && opening_quote != NULL
|
434
|
+
&& strcmp(parser->current_token->value, opening_quote->value) == 0
|
435
|
+
)) {
|
436
|
+
if (token_is(parser, TOKEN_ERB_START)) {
|
437
|
+
parser_append_literal_node_from_buffer(parser, &buffer, children, start);
|
438
|
+
|
439
|
+
array_append(children, parser_parse_erb_tag(parser));
|
440
|
+
|
441
|
+
position_free(start);
|
442
|
+
start = position_copy(parser->current_token->location->start);
|
443
|
+
|
444
|
+
continue;
|
445
|
+
}
|
446
|
+
|
447
|
+
buffer_append(&buffer, parser->current_token->value);
|
448
|
+
token_free(parser->current_token);
|
449
|
+
|
450
|
+
parser->current_token = lexer_next_token(parser->lexer);
|
451
|
+
}
|
452
|
+
} else {
|
453
|
+
token_free(parser->current_token);
|
454
|
+
parser->current_token = potential_closing;
|
455
|
+
|
456
|
+
lexer_restore_state(parser->lexer, saved_state);
|
457
|
+
}
|
310
458
|
}
|
311
459
|
|
312
460
|
parser_append_literal_node_from_buffer(parser, &buffer, children, start);
|
@@ -383,6 +531,30 @@ static AST_HTML_ATTRIBUTE_VALUE_NODE_T* parser_parse_html_attribute_value(parser
|
|
383
531
|
// <div id="home">
|
384
532
|
if (token_is(parser, TOKEN_QUOTE)) { return parser_parse_quoted_html_attribute_value(parser, children, errors); }
|
385
533
|
|
534
|
+
if (token_is(parser, TOKEN_BACKTICK)) {
|
535
|
+
token_T* token = parser_advance(parser);
|
536
|
+
position_T* start = position_copy(token->location->start);
|
537
|
+
position_T* end = position_copy(token->location->end);
|
538
|
+
|
539
|
+
append_unexpected_error(
|
540
|
+
"Invalid quote character for HTML attribute",
|
541
|
+
"single quote (') or double quote (\")",
|
542
|
+
"backtick (`)",
|
543
|
+
start,
|
544
|
+
end,
|
545
|
+
errors
|
546
|
+
);
|
547
|
+
|
548
|
+
AST_HTML_ATTRIBUTE_VALUE_NODE_T* value =
|
549
|
+
ast_html_attribute_value_node_init(NULL, children, NULL, false, start, end, errors);
|
550
|
+
|
551
|
+
position_free(start);
|
552
|
+
position_free(end);
|
553
|
+
token_free(token);
|
554
|
+
|
555
|
+
return value;
|
556
|
+
}
|
557
|
+
|
386
558
|
token_T* token = parser_advance(parser);
|
387
559
|
|
388
560
|
append_unexpected_error(
|
@@ -412,9 +584,86 @@ static AST_HTML_ATTRIBUTE_VALUE_NODE_T* parser_parse_html_attribute_value(parser
|
|
412
584
|
static AST_HTML_ATTRIBUTE_NODE_T* parser_parse_html_attribute(parser_T* parser) {
|
413
585
|
AST_HTML_ATTRIBUTE_NAME_NODE_T* attribute_name = parser_parse_html_attribute_name(parser);
|
414
586
|
|
587
|
+
if (parser->options && parser->options->track_whitespace) {
|
588
|
+
bool has_equals = (parser->current_token->type == TOKEN_EQUALS)
|
589
|
+
|| lexer_peek_for_token_type_after_whitespace(parser->lexer, TOKEN_EQUALS);
|
590
|
+
|
591
|
+
if (has_equals) {
|
592
|
+
buffer_T equals_buffer = buffer_new();
|
593
|
+
position_T* equals_start = NULL;
|
594
|
+
position_T* equals_end = NULL;
|
595
|
+
size_t range_start = 0;
|
596
|
+
size_t range_end = 0;
|
597
|
+
|
598
|
+
while (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
|
599
|
+
token_T* whitespace = parser_advance(parser);
|
600
|
+
|
601
|
+
if (equals_start == NULL) {
|
602
|
+
equals_start = position_copy(whitespace->location->start);
|
603
|
+
range_start = whitespace->range->from;
|
604
|
+
}
|
605
|
+
|
606
|
+
buffer_append(&equals_buffer, whitespace->value);
|
607
|
+
token_free(whitespace);
|
608
|
+
}
|
609
|
+
|
610
|
+
token_T* equals = parser_advance(parser);
|
611
|
+
|
612
|
+
if (equals_start == NULL) {
|
613
|
+
equals_start = position_copy(equals->location->start);
|
614
|
+
range_start = equals->range->from;
|
615
|
+
}
|
616
|
+
|
617
|
+
buffer_append(&equals_buffer, equals->value);
|
618
|
+
equals_end = position_copy(equals->location->end);
|
619
|
+
range_end = equals->range->to;
|
620
|
+
token_free(equals);
|
621
|
+
|
622
|
+
while (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
|
623
|
+
token_T* whitespace = parser_advance(parser);
|
624
|
+
buffer_append(&equals_buffer, whitespace->value);
|
625
|
+
equals_end = position_copy(whitespace->location->end);
|
626
|
+
range_end = whitespace->range->to;
|
627
|
+
token_free(whitespace);
|
628
|
+
}
|
629
|
+
|
630
|
+
token_T* equals_with_whitespace = calloc(1, sizeof(token_T));
|
631
|
+
equals_with_whitespace->type = TOKEN_EQUALS;
|
632
|
+
equals_with_whitespace->value = herb_strdup(equals_buffer.value);
|
633
|
+
equals_with_whitespace->location = location_init(equals_start, equals_end);
|
634
|
+
equals_with_whitespace->range = range_init(range_start, range_end);
|
635
|
+
|
636
|
+
buffer_free(&equals_buffer);
|
637
|
+
|
638
|
+
AST_HTML_ATTRIBUTE_VALUE_NODE_T* attribute_value = parser_parse_html_attribute_value(parser);
|
639
|
+
|
640
|
+
return ast_html_attribute_node_init(
|
641
|
+
attribute_name,
|
642
|
+
equals_with_whitespace,
|
643
|
+
attribute_value,
|
644
|
+
attribute_name->base.location->start,
|
645
|
+
attribute_value->base.location->end,
|
646
|
+
NULL
|
647
|
+
);
|
648
|
+
} else {
|
649
|
+
return ast_html_attribute_node_init(
|
650
|
+
attribute_name,
|
651
|
+
NULL,
|
652
|
+
NULL,
|
653
|
+
attribute_name->base.location->start,
|
654
|
+
attribute_name->base.location->end,
|
655
|
+
NULL
|
656
|
+
);
|
657
|
+
}
|
658
|
+
} else {
|
659
|
+
parser_consume_whitespace(parser, NULL);
|
660
|
+
}
|
661
|
+
|
415
662
|
token_T* equals = parser_consume_if_present(parser, TOKEN_EQUALS);
|
416
663
|
|
417
664
|
if (equals != NULL) {
|
665
|
+
parser_consume_whitespace(parser, NULL);
|
666
|
+
|
418
667
|
AST_HTML_ATTRIBUTE_VALUE_NODE_T* attribute_value = parser_parse_html_attribute_value(parser);
|
419
668
|
|
420
669
|
AST_HTML_ATTRIBUTE_NODE_T* attribute_node = ast_html_attribute_node_init(
|
@@ -441,30 +690,103 @@ static AST_HTML_ATTRIBUTE_NODE_T* parser_parse_html_attribute(parser_T* parser)
|
|
441
690
|
);
|
442
691
|
}
|
443
692
|
|
444
|
-
static
|
445
|
-
|
446
|
-
array_T* children = array_init(8);
|
693
|
+
static void parser_skip_erb_content(lexer_T* lexer) {
|
694
|
+
token_T* token = NULL;
|
447
695
|
|
448
|
-
|
449
|
-
|
696
|
+
do {
|
697
|
+
token = lexer_next_token(lexer);
|
450
698
|
|
451
|
-
|
452
|
-
|
699
|
+
if (token->type == TOKEN_ERB_END) {
|
700
|
+
token_free(token);
|
701
|
+
break;
|
702
|
+
}
|
453
703
|
|
454
|
-
|
455
|
-
|
704
|
+
token_free(token);
|
705
|
+
} while (true);
|
706
|
+
}
|
707
|
+
|
708
|
+
static bool parser_lookahead_erb_is_attribute(lexer_T* lexer) {
|
709
|
+
token_T* after = NULL;
|
710
|
+
|
711
|
+
do {
|
712
|
+
after = lexer_next_token(lexer);
|
713
|
+
|
714
|
+
if (after->type == TOKEN_EQUALS) {
|
715
|
+
token_free(after);
|
716
|
+
return true;
|
717
|
+
}
|
718
|
+
|
719
|
+
if (after->type == TOKEN_WHITESPACE || after->type == TOKEN_NEWLINE) {
|
720
|
+
token_free(after);
|
456
721
|
continue;
|
457
722
|
}
|
458
723
|
|
459
|
-
|
724
|
+
if (after->type == TOKEN_IDENTIFIER || after->type == TOKEN_CHARACTER || after->type == TOKEN_DASH
|
725
|
+
|| after->type == TOKEN_ERB_START) {
|
460
726
|
|
461
|
-
|
462
|
-
|
727
|
+
if (after->type == TOKEN_ERB_START) {
|
728
|
+
token_free(after);
|
729
|
+
parser_skip_erb_content(lexer);
|
730
|
+
} else {
|
731
|
+
token_free(after);
|
732
|
+
}
|
463
733
|
continue;
|
464
734
|
}
|
465
735
|
|
466
|
-
|
467
|
-
|
736
|
+
token_free(after);
|
737
|
+
return false;
|
738
|
+
|
739
|
+
} while (true);
|
740
|
+
}
|
741
|
+
|
742
|
+
static void parser_handle_erb_in_open_tag(parser_T* parser, array_T* children) {
|
743
|
+
bool is_output_tag = parser->current_token->value && strlen(parser->current_token->value) >= 3
|
744
|
+
&& strncmp(parser->current_token->value, "<%=", 3) == 0;
|
745
|
+
|
746
|
+
if (!is_output_tag) {
|
747
|
+
array_append(children, parser_parse_erb_tag(parser));
|
748
|
+
|
749
|
+
return;
|
750
|
+
}
|
751
|
+
|
752
|
+
lexer_T lexer_copy = *parser->lexer;
|
753
|
+
|
754
|
+
token_T* erb_start = lexer_next_token(&lexer_copy);
|
755
|
+
token_free(erb_start);
|
756
|
+
parser_skip_erb_content(&lexer_copy);
|
757
|
+
|
758
|
+
bool looks_like_attribute = parser_lookahead_erb_is_attribute(&lexer_copy);
|
759
|
+
|
760
|
+
if (looks_like_attribute) {
|
761
|
+
array_append(children, parser_parse_html_attribute(parser));
|
762
|
+
} else {
|
763
|
+
array_append(children, parser_parse_erb_tag(parser));
|
764
|
+
}
|
765
|
+
}
|
766
|
+
|
767
|
+
static void parser_handle_whitespace_in_open_tag(parser_T* parser, array_T* children) {
|
768
|
+
token_T* whitespace = parser_consume_if_present(parser, TOKEN_WHITESPACE);
|
769
|
+
|
770
|
+
if (whitespace != NULL) {
|
771
|
+
parser_handle_whitespace(parser, whitespace, children);
|
772
|
+
return;
|
773
|
+
}
|
774
|
+
|
775
|
+
token_T* newline = parser_consume_if_present(parser, TOKEN_NEWLINE);
|
776
|
+
|
777
|
+
if (newline != NULL) { parser_handle_whitespace(parser, newline, children); }
|
778
|
+
}
|
779
|
+
|
780
|
+
static AST_HTML_OPEN_TAG_NODE_T* parser_parse_html_open_tag(parser_T* parser) {
|
781
|
+
array_T* errors = array_init(8);
|
782
|
+
array_T* children = array_init(8);
|
783
|
+
|
784
|
+
token_T* tag_start = parser_consume_expected(parser, TOKEN_HTML_TAG_START, errors);
|
785
|
+
token_T* tag_name = parser_consume_expected(parser, TOKEN_IDENTIFIER, errors);
|
786
|
+
|
787
|
+
while (token_is_none_of(parser, TOKEN_HTML_TAG_END, TOKEN_HTML_TAG_SELF_CLOSE, TOKEN_EOF)) {
|
788
|
+
if (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
|
789
|
+
parser_handle_whitespace_in_open_tag(parser, children);
|
468
790
|
continue;
|
469
791
|
}
|
470
792
|
|
@@ -473,6 +795,11 @@ static AST_HTML_OPEN_TAG_NODE_T* parser_parse_html_open_tag(parser_T* parser) {
|
|
473
795
|
continue;
|
474
796
|
}
|
475
797
|
|
798
|
+
if (parser->current_token->type == TOKEN_ERB_START) {
|
799
|
+
parser_handle_erb_in_open_tag(parser, children);
|
800
|
+
continue;
|
801
|
+
}
|
802
|
+
|
476
803
|
if (parser->current_token->type == TOKEN_AT) {
|
477
804
|
array_append(children, parser_parse_html_attribute(parser));
|
478
805
|
continue;
|
@@ -526,14 +853,15 @@ static AST_HTML_OPEN_TAG_NODE_T* parser_parse_html_open_tag(parser_T* parser) {
|
|
526
853
|
|
527
854
|
static AST_HTML_CLOSE_TAG_NODE_T* parser_parse_html_close_tag(parser_T* parser) {
|
528
855
|
array_T* errors = array_init(8);
|
856
|
+
array_T* children = array_init(8);
|
529
857
|
|
530
858
|
token_T* tag_opening = parser_consume_expected(parser, TOKEN_HTML_TAG_START_CLOSE, errors);
|
859
|
+
|
860
|
+
parser_consume_whitespace(parser, children);
|
861
|
+
|
531
862
|
token_T* tag_name = parser_consume_expected(parser, TOKEN_IDENTIFIER, errors);
|
532
863
|
|
533
|
-
|
534
|
-
token_T* whitespace = parser_advance(parser);
|
535
|
-
token_free(whitespace);
|
536
|
-
}
|
864
|
+
parser_consume_whitespace(parser, children);
|
537
865
|
|
538
866
|
token_T* tag_closing = parser_consume_expected(parser, TOKEN_HTML_TAG_END, errors);
|
539
867
|
|
@@ -557,6 +885,7 @@ static AST_HTML_CLOSE_TAG_NODE_T* parser_parse_html_close_tag(parser_T* parser)
|
|
557
885
|
AST_HTML_CLOSE_TAG_NODE_T* close_tag = ast_html_close_tag_node_init(
|
558
886
|
tag_opening,
|
559
887
|
tag_name,
|
888
|
+
children,
|
560
889
|
tag_closing,
|
561
890
|
tag_opening->location->start,
|
562
891
|
tag_closing->location->end,
|
@@ -596,7 +925,13 @@ static AST_HTML_ELEMENT_NODE_T* parser_parse_html_regular_element(
|
|
596
925
|
|
597
926
|
parser_push_open_tag(parser, open_tag->tag_name);
|
598
927
|
|
599
|
-
|
928
|
+
if (open_tag->tag_name->value && parser_is_foreign_content_tag(open_tag->tag_name->value)) {
|
929
|
+
foreign_content_type_T content_type = parser_get_foreign_content_type(open_tag->tag_name->value);
|
930
|
+
parser_enter_foreign_content(parser, content_type);
|
931
|
+
parser_parse_foreign_content(parser, body, errors);
|
932
|
+
} else {
|
933
|
+
parser_parse_in_data_state(parser, body, errors);
|
934
|
+
}
|
600
935
|
|
601
936
|
if (!token_is(parser, TOKEN_HTML_TAG_START_CLOSE)) { return parser_handle_missing_close_tag(open_tag, body, errors); }
|
602
937
|
|
@@ -633,9 +968,7 @@ static AST_HTML_ELEMENT_NODE_T* parser_parse_html_element(parser_T* parser) {
|
|
633
968
|
AST_HTML_OPEN_TAG_NODE_T* open_tag = parser_parse_html_open_tag(parser);
|
634
969
|
|
635
970
|
// <tag />
|
636
|
-
if (open_tag->is_void
|
637
|
-
return parser_parse_html_self_closing_element(parser, open_tag);
|
638
|
-
}
|
971
|
+
if (open_tag->is_void) { return parser_parse_html_self_closing_element(parser, open_tag); }
|
639
972
|
|
640
973
|
// <tag>, in void element list, and not inside an <svg> element
|
641
974
|
if (!open_tag->is_void && is_void_element(open_tag->tag_name->value) && !parser_in_svg_context(parser)) {
|
@@ -687,6 +1020,68 @@ static AST_ERB_CONTENT_NODE_T* parser_parse_erb_tag(parser_T* parser) {
|
|
687
1020
|
return erb_node;
|
688
1021
|
}
|
689
1022
|
|
1023
|
+
static void parser_parse_foreign_content(parser_T* parser, array_T* children, array_T* errors) {
|
1024
|
+
buffer_T content = buffer_new();
|
1025
|
+
position_T* start = position_copy(parser->current_token->location->start);
|
1026
|
+
const char* expected_closing_tag = parser_get_foreign_content_closing_tag(parser->foreign_content_type);
|
1027
|
+
|
1028
|
+
if (expected_closing_tag == NULL) {
|
1029
|
+
parser_exit_foreign_content(parser);
|
1030
|
+
position_free(start);
|
1031
|
+
buffer_free(&content);
|
1032
|
+
|
1033
|
+
return;
|
1034
|
+
}
|
1035
|
+
|
1036
|
+
while (!token_is(parser, TOKEN_EOF)) {
|
1037
|
+
if (token_is(parser, TOKEN_ERB_START)) {
|
1038
|
+
parser_append_literal_node_from_buffer(parser, &content, children, start);
|
1039
|
+
|
1040
|
+
AST_ERB_CONTENT_NODE_T* erb_node = parser_parse_erb_tag(parser);
|
1041
|
+
array_append(children, erb_node);
|
1042
|
+
|
1043
|
+
position_free(start);
|
1044
|
+
start = position_copy(parser->current_token->location->start);
|
1045
|
+
|
1046
|
+
continue;
|
1047
|
+
}
|
1048
|
+
|
1049
|
+
if (token_is(parser, TOKEN_HTML_TAG_START_CLOSE)) {
|
1050
|
+
lexer_state_snapshot_T saved_state = lexer_save_state(parser->lexer);
|
1051
|
+
|
1052
|
+
token_T* next_token = lexer_next_token(parser->lexer);
|
1053
|
+
bool is_potential_match = false;
|
1054
|
+
|
1055
|
+
if (next_token && next_token->type == TOKEN_IDENTIFIER && next_token->value) {
|
1056
|
+
is_potential_match = parser_is_expected_closing_tag_name(next_token->value, parser->foreign_content_type);
|
1057
|
+
}
|
1058
|
+
|
1059
|
+
lexer_restore_state(parser->lexer, saved_state);
|
1060
|
+
|
1061
|
+
if (next_token) { token_free(next_token); }
|
1062
|
+
|
1063
|
+
if (is_potential_match) {
|
1064
|
+
parser_append_literal_node_from_buffer(parser, &content, children, start);
|
1065
|
+
parser_exit_foreign_content(parser);
|
1066
|
+
|
1067
|
+
position_free(start);
|
1068
|
+
buffer_free(&content);
|
1069
|
+
|
1070
|
+
return;
|
1071
|
+
}
|
1072
|
+
}
|
1073
|
+
|
1074
|
+
token_T* token = parser_advance(parser);
|
1075
|
+
buffer_append(&content, token->value);
|
1076
|
+
token_free(token);
|
1077
|
+
}
|
1078
|
+
|
1079
|
+
parser_append_literal_node_from_buffer(parser, &content, children, start);
|
1080
|
+
parser_exit_foreign_content(parser);
|
1081
|
+
position_free(start);
|
1082
|
+
buffer_free(&content);
|
1083
|
+
}
|
1084
|
+
|
690
1085
|
static void parser_parse_in_data_state(parser_T* parser, array_T* children, array_T* errors) {
|
691
1086
|
while (token_is_none_of(parser, TOKEN_HTML_TAG_START_CLOSE, TOKEN_EOF)) {
|
692
1087
|
if (token_is(parser, TOKEN_ERB_START)) {
|
@@ -699,6 +1094,16 @@ static void parser_parse_in_data_state(parser_T* parser, array_T* children, arra
|
|
699
1094
|
continue;
|
700
1095
|
}
|
701
1096
|
|
1097
|
+
if (token_is(parser, TOKEN_XML_DECLARATION)) {
|
1098
|
+
array_append(children, parser_parse_xml_declaration(parser));
|
1099
|
+
continue;
|
1100
|
+
}
|
1101
|
+
|
1102
|
+
if (token_is(parser, TOKEN_CDATA_START)) {
|
1103
|
+
array_append(children, parser_parse_cdata(parser));
|
1104
|
+
continue;
|
1105
|
+
}
|
1106
|
+
|
702
1107
|
if (token_is(parser, TOKEN_HTML_COMMENT_START)) {
|
703
1108
|
array_append(children, parser_parse_html_comment(parser));
|
704
1109
|
continue;
|
@@ -808,12 +1213,40 @@ AST_DOCUMENT_NODE_T* parser_parse(parser_T* parser) {
|
|
808
1213
|
return parser_parse_document(parser);
|
809
1214
|
}
|
810
1215
|
|
1216
|
+
static void parser_handle_whitespace(parser_T* parser, token_T* whitespace_token, array_T* children) {
|
1217
|
+
if (parser->options && parser->options->track_whitespace) {
|
1218
|
+
array_T* errors = array_init(8);
|
1219
|
+
AST_WHITESPACE_NODE_T* whitespace_node = ast_whitespace_node_init(
|
1220
|
+
whitespace_token,
|
1221
|
+
whitespace_token->location->start,
|
1222
|
+
whitespace_token->location->end,
|
1223
|
+
errors
|
1224
|
+
);
|
1225
|
+
array_append(children, whitespace_node);
|
1226
|
+
}
|
1227
|
+
|
1228
|
+
token_free(whitespace_token);
|
1229
|
+
}
|
1230
|
+
|
1231
|
+
static void parser_consume_whitespace(parser_T* parser, array_T* children) {
|
1232
|
+
while (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
|
1233
|
+
token_T* whitespace = parser_advance(parser);
|
1234
|
+
|
1235
|
+
if (parser->options && parser->options->track_whitespace && children != NULL) {
|
1236
|
+
parser_handle_whitespace(parser, whitespace, children);
|
1237
|
+
} else {
|
1238
|
+
token_free(whitespace);
|
1239
|
+
}
|
1240
|
+
}
|
1241
|
+
}
|
1242
|
+
|
811
1243
|
void parser_free(parser_T* parser) {
|
812
1244
|
if (parser == NULL) { return; }
|
813
1245
|
|
814
1246
|
if (parser->lexer != NULL) { lexer_free(parser->lexer); }
|
815
1247
|
if (parser->current_token != NULL) { token_free(parser->current_token); }
|
816
1248
|
if (parser->open_tags_stack != NULL) { array_free(&parser->open_tags_stack); }
|
1249
|
+
if (parser->options != NULL) { free(parser->options); }
|
817
1250
|
|
818
1251
|
free(parser);
|
819
1252
|
}
|