herb 0.4.3-x86_64-darwin → 0.6.0-x86_64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/src/parser.c CHANGED
@@ -6,6 +6,7 @@
6
6
  #include "include/errors.h"
7
7
  #include "include/html_util.h"
8
8
  #include "include/lexer.h"
9
+ #include "include/lexer_peek_helpers.h"
9
10
  #include "include/parser_helpers.h"
10
11
  #include "include/token.h"
11
12
  #include "include/token_matchers.h"
@@ -17,22 +18,81 @@
17
18
  #include <strings.h>
18
19
 
19
20
  static void parser_parse_in_data_state(parser_T* parser, array_T* children, array_T* errors);
21
+ static void parser_parse_foreign_content(parser_T* parser, array_T* children, array_T* errors);
20
22
  static AST_ERB_CONTENT_NODE_T* parser_parse_erb_tag(parser_T* parser);
23
+ static void parser_handle_whitespace(parser_T* parser, token_T* whitespace_token, array_T* children);
24
+ static void parser_consume_whitespace(parser_T* parser, array_T* children);
25
+ static void parser_skip_erb_content(lexer_T* lexer);
26
+ static bool parser_lookahead_erb_is_attribute(lexer_T* lexer);
27
+ static void parser_handle_erb_in_open_tag(parser_T* parser, array_T* children);
28
+ static void parser_handle_whitespace_in_open_tag(parser_T* parser, array_T* children);
21
29
 
22
30
  size_t parser_sizeof(void) {
23
31
  return sizeof(struct PARSER_STRUCT);
24
32
  }
25
33
 
26
- parser_T* parser_init(lexer_T* lexer) {
34
+ parser_T* parser_init(lexer_T* lexer, parser_options_T* options) {
27
35
  parser_T* parser = calloc(1, parser_sizeof());
28
36
 
29
37
  parser->lexer = lexer;
30
38
  parser->current_token = lexer_next_token(lexer);
31
39
  parser->open_tags_stack = array_init(16);
40
+ parser->state = PARSER_STATE_DATA;
41
+ parser->foreign_content_type = FOREIGN_CONTENT_UNKNOWN;
42
+
43
+ if (options) {
44
+ parser->options = calloc(1, sizeof(parser_options_T));
45
+ parser->options->track_whitespace = options->track_whitespace;
46
+ } else {
47
+ parser->options = NULL;
48
+ }
32
49
 
33
50
  return parser;
34
51
  }
35
52
 
53
+ static AST_CDATA_NODE_T* parser_parse_cdata(parser_T* parser) {
54
+ array_T* errors = array_init(8);
55
+ array_T* children = array_init(8);
56
+ buffer_T content = buffer_new();
57
+
58
+ token_T* tag_opening = parser_consume_expected(parser, TOKEN_CDATA_START, errors);
59
+ position_T* start = position_copy(parser->current_token->location->start);
60
+
61
+ while (token_is_none_of(parser, TOKEN_CDATA_END, TOKEN_EOF)) {
62
+ if (token_is(parser, TOKEN_ERB_START)) {
63
+ parser_append_literal_node_from_buffer(parser, &content, children, start);
64
+ AST_ERB_CONTENT_NODE_T* erb_node = parser_parse_erb_tag(parser);
65
+ array_append(children, erb_node);
66
+ position_free(start);
67
+ start = position_copy(parser->current_token->location->start);
68
+ continue;
69
+ }
70
+
71
+ token_T* token = parser_advance(parser);
72
+ buffer_append(&content, token->value);
73
+ token_free(token);
74
+ }
75
+
76
+ parser_append_literal_node_from_buffer(parser, &content, children, start);
77
+ token_T* tag_closing = parser_consume_expected(parser, TOKEN_CDATA_END, errors);
78
+
79
+ AST_CDATA_NODE_T* cdata = ast_cdata_node_init(
80
+ tag_opening,
81
+ children,
82
+ tag_closing,
83
+ tag_opening->location->start,
84
+ tag_closing->location->end,
85
+ errors
86
+ );
87
+
88
+ position_free(start);
89
+ buffer_free(&content);
90
+ token_free(tag_opening);
91
+ token_free(tag_closing);
92
+
93
+ return cdata;
94
+ }
95
+
36
96
  static AST_HTML_COMMENT_NODE_T* parser_parse_html_comment(parser_T* parser) {
37
97
  array_T* errors = array_init(8);
38
98
  array_T* children = array_init(8);
@@ -125,6 +185,54 @@ static AST_HTML_DOCTYPE_NODE_T* parser_parse_html_doctype(parser_T* parser) {
125
185
  return doctype;
126
186
  }
127
187
 
188
+ static AST_XML_DECLARATION_NODE_T* parser_parse_xml_declaration(parser_T* parser) {
189
+ array_T* errors = array_init(8);
190
+ array_T* children = array_init(8);
191
+ buffer_T content = buffer_new();
192
+
193
+ token_T* tag_opening = parser_consume_expected(parser, TOKEN_XML_DECLARATION, errors);
194
+
195
+ position_T* start = position_copy(parser->current_token->location->start);
196
+
197
+ while (token_is_none_of(parser, TOKEN_XML_DECLARATION_END, TOKEN_EOF)) {
198
+ if (token_is(parser, TOKEN_ERB_START)) {
199
+ parser_append_literal_node_from_buffer(parser, &content, children, start);
200
+
201
+ AST_ERB_CONTENT_NODE_T* erb_node = parser_parse_erb_tag(parser);
202
+ array_append(children, erb_node);
203
+
204
+ position_free(start);
205
+ start = position_copy(parser->current_token->location->start);
206
+
207
+ continue;
208
+ }
209
+
210
+ token_T* token = parser_advance(parser);
211
+ buffer_append(&content, token->value);
212
+ token_free(token);
213
+ }
214
+
215
+ parser_append_literal_node_from_buffer(parser, &content, children, start);
216
+
217
+ token_T* tag_closing = parser_consume_expected(parser, TOKEN_XML_DECLARATION_END, errors);
218
+
219
+ AST_XML_DECLARATION_NODE_T* xml_declaration = ast_xml_declaration_node_init(
220
+ tag_opening,
221
+ children,
222
+ tag_closing,
223
+ tag_opening->location->start,
224
+ tag_closing->location->end,
225
+ errors
226
+ );
227
+
228
+ position_free(start);
229
+ token_free(tag_opening);
230
+ token_free(tag_closing);
231
+ buffer_free(&content);
232
+
233
+ return xml_declaration;
234
+ }
235
+
128
236
  static AST_HTML_TEXT_NODE_T* parser_parse_text_content(parser_T* parser, array_T* document_errors) {
129
237
  position_T* start = position_copy(parser->current_token->location->start);
130
238
 
@@ -185,96 +293,58 @@ static AST_HTML_TEXT_NODE_T* parser_parse_text_content(parser_T* parser, array_T
185
293
 
186
294
  static AST_HTML_ATTRIBUTE_NAME_NODE_T* parser_parse_html_attribute_name(parser_T* parser) {
187
295
  array_T* errors = array_init(8);
296
+ array_T* children = array_init(8);
297
+ buffer_T buffer = buffer_new();
298
+ position_T* start = position_copy(parser->current_token->location->start);
188
299
 
189
- token_T* at_token = parser_consume_if_present(parser, TOKEN_AT);
190
- token_T* first_token = NULL;
191
-
192
- if (at_token != NULL) {
193
- first_token = parser_consume_if_present(parser, TOKEN_IDENTIFIER);
194
-
195
- if (first_token == NULL) {
196
- parser_append_unexpected_token_error(parser, TOKEN_IDENTIFIER, errors);
197
-
198
- AST_HTML_ATTRIBUTE_NAME_NODE_T* attribute_name =
199
- ast_html_attribute_name_node_init(at_token, at_token->location->start, at_token->location->end, errors);
200
-
201
- token_free(at_token);
300
+ while (token_is_none_of(
301
+ parser,
302
+ TOKEN_EQUALS,
303
+ TOKEN_WHITESPACE,
304
+ TOKEN_NEWLINE,
305
+ TOKEN_HTML_TAG_END,
306
+ TOKEN_HTML_TAG_SELF_CLOSE,
307
+ TOKEN_EOF
308
+ )) {
309
+ if (token_is(parser, TOKEN_ERB_START)) {
310
+ parser_append_literal_node_from_buffer(parser, &buffer, children, start);
202
311
 
203
- return attribute_name;
204
- }
205
- } else {
206
- first_token = parser_consume_if_present(parser, TOKEN_IDENTIFIER);
312
+ AST_ERB_CONTENT_NODE_T* erb_node = parser_parse_erb_tag(parser);
313
+ array_append(children, erb_node);
207
314
 
208
- if (first_token == NULL) {
209
- parser_append_unexpected_token_error(parser, TOKEN_IDENTIFIER, errors);
210
- AST_HTML_ATTRIBUTE_NAME_NODE_T* attribute_name = ast_html_attribute_name_node_init(NULL, NULL, NULL, errors);
211
- return attribute_name;
315
+ position_free(start);
316
+ start = position_copy(parser->current_token->location->start);
317
+ continue;
212
318
  }
213
- }
214
-
215
- buffer_T name_buffer = buffer_new();
216
-
217
- position_T* start_position;
218
319
 
219
- if (at_token != NULL) {
220
- buffer_append(&name_buffer, at_token->value);
221
- start_position = position_copy(at_token->location->start);
222
- } else {
223
- start_position = position_copy(first_token->location->start);
320
+ token_T* token = parser_advance(parser);
321
+ buffer_append(&buffer, token->value);
322
+ token_free(token);
224
323
  }
225
324
 
226
- buffer_append(&name_buffer, first_token->value);
227
-
228
- position_T* end_position = position_copy(first_token->location->end);
229
- size_t range_end = first_token->range->to;
230
-
231
- while (parser->current_token->type == TOKEN_CHARACTER && parser->current_token->value
232
- && strcmp(parser->current_token->value, ".") == 0) {
233
-
234
- token_T* dot_token = parser_advance(parser);
235
-
236
- buffer_append(&name_buffer, dot_token->value);
237
- position_free(end_position);
238
-
239
- end_position = position_copy(dot_token->location->end);
240
- range_end = dot_token->range->to;
241
-
242
- token_free(dot_token);
325
+ parser_append_literal_node_from_buffer(parser, &buffer, children, start);
243
326
 
244
- if (parser->current_token->type == TOKEN_IDENTIFIER) {
245
- token_T* next_identifier = parser_advance(parser);
327
+ position_T* node_start = NULL;
328
+ position_T* node_end = NULL;
246
329
 
247
- buffer_append(&name_buffer, next_identifier->value);
248
- position_free(end_position);
330
+ if (children->size > 0) {
331
+ AST_NODE_T* first_child = array_get(children, 0);
332
+ AST_NODE_T* last_child = array_get(children, children->size - 1);
249
333
 
250
- end_position = position_copy(next_identifier->location->end);
251
- range_end = next_identifier->range->to;
252
- token_free(next_identifier);
253
- } else {
254
- break;
255
- }
334
+ node_start = position_copy(first_child->location->start);
335
+ node_end = position_copy(last_child->location->end);
336
+ } else {
337
+ node_start = position_copy(parser->current_token->location->start);
338
+ node_end = position_copy(parser->current_token->location->start);
256
339
  }
257
340
 
258
- token_T* combined_token = calloc(1, sizeof(token_T));
259
- combined_token->value = herb_strdup(name_buffer.value);
260
- combined_token->type = TOKEN_IDENTIFIER;
261
- combined_token->location =
262
- location_from(start_position->line, start_position->column, end_position->line, end_position->column);
263
-
264
- size_t range_start = at_token != NULL ? at_token->range->from : first_token->range->from;
265
- combined_token->range = range_init(range_start, range_end);
266
-
267
341
  AST_HTML_ATTRIBUTE_NAME_NODE_T* attribute_name =
268
- ast_html_attribute_name_node_init(combined_token, start_position, end_position, errors);
269
-
270
- buffer_free(&name_buffer);
271
- position_free(start_position);
272
- position_free(end_position);
273
- token_free(first_token);
342
+ ast_html_attribute_name_node_init(children, node_start, node_end, errors);
274
343
 
275
- if (at_token != NULL) { token_free(at_token); }
276
-
277
- token_free(combined_token);
344
+ position_free(start);
345
+ position_free(node_start);
346
+ position_free(node_end);
347
+ buffer_free(&buffer);
278
348
 
279
349
  return attribute_name;
280
350
  }
@@ -304,9 +374,87 @@ static AST_HTML_ATTRIBUTE_VALUE_NODE_T* parser_parse_quoted_html_attribute_value
304
374
  continue;
305
375
  }
306
376
 
377
+ if (token_is(parser, TOKEN_BACKSLASH)) {
378
+ lexer_state_snapshot_T saved_state = lexer_save_state(parser->lexer);
379
+
380
+ token_T* next_token = lexer_next_token(parser->lexer);
381
+
382
+ if (next_token && next_token->type == TOKEN_QUOTE && opening_quote != NULL
383
+ && strcmp(next_token->value, opening_quote->value) == 0) {
384
+ buffer_append(&buffer, parser->current_token->value);
385
+ buffer_append(&buffer, next_token->value);
386
+
387
+ token_free(parser->current_token);
388
+ token_free(next_token);
389
+
390
+ parser->current_token = lexer_next_token(parser->lexer);
391
+ continue;
392
+ } else {
393
+ lexer_restore_state(parser->lexer, saved_state);
394
+
395
+ if (next_token) { token_free(next_token); }
396
+ }
397
+ }
398
+
307
399
  buffer_append(&buffer, parser->current_token->value);
308
400
  token_free(parser->current_token);
401
+
402
+ parser->current_token = lexer_next_token(parser->lexer);
403
+ }
404
+
405
+ if (token_is(parser, TOKEN_QUOTE) && opening_quote != NULL
406
+ && strcmp(parser->current_token->value, opening_quote->value) == 0) {
407
+ lexer_state_snapshot_T saved_state = lexer_save_state(parser->lexer);
408
+
409
+ token_T* potential_closing = parser->current_token;
309
410
  parser->current_token = lexer_next_token(parser->lexer);
411
+
412
+ if (token_is(parser, TOKEN_IDENTIFIER) || token_is(parser, TOKEN_CHARACTER)) {
413
+ append_unexpected_error(
414
+ "Unescaped quote character in attribute value",
415
+ "escaped quote (\\') or different quote style (\")",
416
+ opening_quote->value,
417
+ potential_closing->location->start,
418
+ potential_closing->location->end,
419
+ errors
420
+ );
421
+
422
+ lexer_restore_state(parser->lexer, saved_state);
423
+
424
+ token_free(parser->current_token);
425
+ parser->current_token = potential_closing;
426
+
427
+ buffer_append(&buffer, parser->current_token->value);
428
+ token_free(parser->current_token);
429
+ parser->current_token = lexer_next_token(parser->lexer);
430
+
431
+ while (!token_is(parser, TOKEN_EOF)
432
+ && !(
433
+ token_is(parser, TOKEN_QUOTE) && opening_quote != NULL
434
+ && strcmp(parser->current_token->value, opening_quote->value) == 0
435
+ )) {
436
+ if (token_is(parser, TOKEN_ERB_START)) {
437
+ parser_append_literal_node_from_buffer(parser, &buffer, children, start);
438
+
439
+ array_append(children, parser_parse_erb_tag(parser));
440
+
441
+ position_free(start);
442
+ start = position_copy(parser->current_token->location->start);
443
+
444
+ continue;
445
+ }
446
+
447
+ buffer_append(&buffer, parser->current_token->value);
448
+ token_free(parser->current_token);
449
+
450
+ parser->current_token = lexer_next_token(parser->lexer);
451
+ }
452
+ } else {
453
+ token_free(parser->current_token);
454
+ parser->current_token = potential_closing;
455
+
456
+ lexer_restore_state(parser->lexer, saved_state);
457
+ }
310
458
  }
311
459
 
312
460
  parser_append_literal_node_from_buffer(parser, &buffer, children, start);
@@ -383,6 +531,30 @@ static AST_HTML_ATTRIBUTE_VALUE_NODE_T* parser_parse_html_attribute_value(parser
383
531
  // <div id="home">
384
532
  if (token_is(parser, TOKEN_QUOTE)) { return parser_parse_quoted_html_attribute_value(parser, children, errors); }
385
533
 
534
+ if (token_is(parser, TOKEN_BACKTICK)) {
535
+ token_T* token = parser_advance(parser);
536
+ position_T* start = position_copy(token->location->start);
537
+ position_T* end = position_copy(token->location->end);
538
+
539
+ append_unexpected_error(
540
+ "Invalid quote character for HTML attribute",
541
+ "single quote (') or double quote (\")",
542
+ "backtick (`)",
543
+ start,
544
+ end,
545
+ errors
546
+ );
547
+
548
+ AST_HTML_ATTRIBUTE_VALUE_NODE_T* value =
549
+ ast_html_attribute_value_node_init(NULL, children, NULL, false, start, end, errors);
550
+
551
+ position_free(start);
552
+ position_free(end);
553
+ token_free(token);
554
+
555
+ return value;
556
+ }
557
+
386
558
  token_T* token = parser_advance(parser);
387
559
 
388
560
  append_unexpected_error(
@@ -412,9 +584,86 @@ static AST_HTML_ATTRIBUTE_VALUE_NODE_T* parser_parse_html_attribute_value(parser
412
584
  static AST_HTML_ATTRIBUTE_NODE_T* parser_parse_html_attribute(parser_T* parser) {
413
585
  AST_HTML_ATTRIBUTE_NAME_NODE_T* attribute_name = parser_parse_html_attribute_name(parser);
414
586
 
587
+ if (parser->options && parser->options->track_whitespace) {
588
+ bool has_equals = (parser->current_token->type == TOKEN_EQUALS)
589
+ || lexer_peek_for_token_type_after_whitespace(parser->lexer, TOKEN_EQUALS);
590
+
591
+ if (has_equals) {
592
+ buffer_T equals_buffer = buffer_new();
593
+ position_T* equals_start = NULL;
594
+ position_T* equals_end = NULL;
595
+ size_t range_start = 0;
596
+ size_t range_end = 0;
597
+
598
+ while (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
599
+ token_T* whitespace = parser_advance(parser);
600
+
601
+ if (equals_start == NULL) {
602
+ equals_start = position_copy(whitespace->location->start);
603
+ range_start = whitespace->range->from;
604
+ }
605
+
606
+ buffer_append(&equals_buffer, whitespace->value);
607
+ token_free(whitespace);
608
+ }
609
+
610
+ token_T* equals = parser_advance(parser);
611
+
612
+ if (equals_start == NULL) {
613
+ equals_start = position_copy(equals->location->start);
614
+ range_start = equals->range->from;
615
+ }
616
+
617
+ buffer_append(&equals_buffer, equals->value);
618
+ equals_end = position_copy(equals->location->end);
619
+ range_end = equals->range->to;
620
+ token_free(equals);
621
+
622
+ while (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
623
+ token_T* whitespace = parser_advance(parser);
624
+ buffer_append(&equals_buffer, whitespace->value);
625
+ equals_end = position_copy(whitespace->location->end);
626
+ range_end = whitespace->range->to;
627
+ token_free(whitespace);
628
+ }
629
+
630
+ token_T* equals_with_whitespace = calloc(1, sizeof(token_T));
631
+ equals_with_whitespace->type = TOKEN_EQUALS;
632
+ equals_with_whitespace->value = herb_strdup(equals_buffer.value);
633
+ equals_with_whitespace->location = location_init(equals_start, equals_end);
634
+ equals_with_whitespace->range = range_init(range_start, range_end);
635
+
636
+ buffer_free(&equals_buffer);
637
+
638
+ AST_HTML_ATTRIBUTE_VALUE_NODE_T* attribute_value = parser_parse_html_attribute_value(parser);
639
+
640
+ return ast_html_attribute_node_init(
641
+ attribute_name,
642
+ equals_with_whitespace,
643
+ attribute_value,
644
+ attribute_name->base.location->start,
645
+ attribute_value->base.location->end,
646
+ NULL
647
+ );
648
+ } else {
649
+ return ast_html_attribute_node_init(
650
+ attribute_name,
651
+ NULL,
652
+ NULL,
653
+ attribute_name->base.location->start,
654
+ attribute_name->base.location->end,
655
+ NULL
656
+ );
657
+ }
658
+ } else {
659
+ parser_consume_whitespace(parser, NULL);
660
+ }
661
+
415
662
  token_T* equals = parser_consume_if_present(parser, TOKEN_EQUALS);
416
663
 
417
664
  if (equals != NULL) {
665
+ parser_consume_whitespace(parser, NULL);
666
+
418
667
  AST_HTML_ATTRIBUTE_VALUE_NODE_T* attribute_value = parser_parse_html_attribute_value(parser);
419
668
 
420
669
  AST_HTML_ATTRIBUTE_NODE_T* attribute_node = ast_html_attribute_node_init(
@@ -441,30 +690,103 @@ static AST_HTML_ATTRIBUTE_NODE_T* parser_parse_html_attribute(parser_T* parser)
441
690
  );
442
691
  }
443
692
 
444
- static AST_HTML_OPEN_TAG_NODE_T* parser_parse_html_open_tag(parser_T* parser) {
445
- array_T* errors = array_init(8);
446
- array_T* children = array_init(8);
693
+ static void parser_skip_erb_content(lexer_T* lexer) {
694
+ token_T* token = NULL;
447
695
 
448
- token_T* tag_start = parser_consume_expected(parser, TOKEN_HTML_TAG_START, errors);
449
- token_T* tag_name = parser_consume_expected(parser, TOKEN_IDENTIFIER, errors);
696
+ do {
697
+ token = lexer_next_token(lexer);
450
698
 
451
- while (token_is_none_of(parser, TOKEN_HTML_TAG_END, TOKEN_HTML_TAG_SELF_CLOSE, TOKEN_EOF)) {
452
- token_T* whitespace = parser_consume_if_present(parser, TOKEN_WHITESPACE);
699
+ if (token->type == TOKEN_ERB_END) {
700
+ token_free(token);
701
+ break;
702
+ }
453
703
 
454
- if (whitespace != NULL) {
455
- token_free(whitespace);
704
+ token_free(token);
705
+ } while (true);
706
+ }
707
+
708
+ static bool parser_lookahead_erb_is_attribute(lexer_T* lexer) {
709
+ token_T* after = NULL;
710
+
711
+ do {
712
+ after = lexer_next_token(lexer);
713
+
714
+ if (after->type == TOKEN_EQUALS) {
715
+ token_free(after);
716
+ return true;
717
+ }
718
+
719
+ if (after->type == TOKEN_WHITESPACE || after->type == TOKEN_NEWLINE) {
720
+ token_free(after);
456
721
  continue;
457
722
  }
458
723
 
459
- token_T* newline = parser_consume_if_present(parser, TOKEN_NEWLINE);
724
+ if (after->type == TOKEN_IDENTIFIER || after->type == TOKEN_CHARACTER || after->type == TOKEN_DASH
725
+ || after->type == TOKEN_ERB_START) {
460
726
 
461
- if (newline != NULL) {
462
- token_free(newline);
727
+ if (after->type == TOKEN_ERB_START) {
728
+ token_free(after);
729
+ parser_skip_erb_content(lexer);
730
+ } else {
731
+ token_free(after);
732
+ }
463
733
  continue;
464
734
  }
465
735
 
466
- if (parser->current_token->type == TOKEN_ERB_START) {
467
- array_append(children, parser_parse_erb_tag(parser));
736
+ token_free(after);
737
+ return false;
738
+
739
+ } while (true);
740
+ }
741
+
742
+ static void parser_handle_erb_in_open_tag(parser_T* parser, array_T* children) {
743
+ bool is_output_tag = parser->current_token->value && strlen(parser->current_token->value) >= 3
744
+ && strncmp(parser->current_token->value, "<%=", 3) == 0;
745
+
746
+ if (!is_output_tag) {
747
+ array_append(children, parser_parse_erb_tag(parser));
748
+
749
+ return;
750
+ }
751
+
752
+ lexer_T lexer_copy = *parser->lexer;
753
+
754
+ token_T* erb_start = lexer_next_token(&lexer_copy);
755
+ token_free(erb_start);
756
+ parser_skip_erb_content(&lexer_copy);
757
+
758
+ bool looks_like_attribute = parser_lookahead_erb_is_attribute(&lexer_copy);
759
+
760
+ if (looks_like_attribute) {
761
+ array_append(children, parser_parse_html_attribute(parser));
762
+ } else {
763
+ array_append(children, parser_parse_erb_tag(parser));
764
+ }
765
+ }
766
+
767
+ static void parser_handle_whitespace_in_open_tag(parser_T* parser, array_T* children) {
768
+ token_T* whitespace = parser_consume_if_present(parser, TOKEN_WHITESPACE);
769
+
770
+ if (whitespace != NULL) {
771
+ parser_handle_whitespace(parser, whitespace, children);
772
+ return;
773
+ }
774
+
775
+ token_T* newline = parser_consume_if_present(parser, TOKEN_NEWLINE);
776
+
777
+ if (newline != NULL) { parser_handle_whitespace(parser, newline, children); }
778
+ }
779
+
780
+ static AST_HTML_OPEN_TAG_NODE_T* parser_parse_html_open_tag(parser_T* parser) {
781
+ array_T* errors = array_init(8);
782
+ array_T* children = array_init(8);
783
+
784
+ token_T* tag_start = parser_consume_expected(parser, TOKEN_HTML_TAG_START, errors);
785
+ token_T* tag_name = parser_consume_expected(parser, TOKEN_IDENTIFIER, errors);
786
+
787
+ while (token_is_none_of(parser, TOKEN_HTML_TAG_END, TOKEN_HTML_TAG_SELF_CLOSE, TOKEN_EOF)) {
788
+ if (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
789
+ parser_handle_whitespace_in_open_tag(parser, children);
468
790
  continue;
469
791
  }
470
792
 
@@ -473,6 +795,11 @@ static AST_HTML_OPEN_TAG_NODE_T* parser_parse_html_open_tag(parser_T* parser) {
473
795
  continue;
474
796
  }
475
797
 
798
+ if (parser->current_token->type == TOKEN_ERB_START) {
799
+ parser_handle_erb_in_open_tag(parser, children);
800
+ continue;
801
+ }
802
+
476
803
  if (parser->current_token->type == TOKEN_AT) {
477
804
  array_append(children, parser_parse_html_attribute(parser));
478
805
  continue;
@@ -526,14 +853,15 @@ static AST_HTML_OPEN_TAG_NODE_T* parser_parse_html_open_tag(parser_T* parser) {
526
853
 
527
854
  static AST_HTML_CLOSE_TAG_NODE_T* parser_parse_html_close_tag(parser_T* parser) {
528
855
  array_T* errors = array_init(8);
856
+ array_T* children = array_init(8);
529
857
 
530
858
  token_T* tag_opening = parser_consume_expected(parser, TOKEN_HTML_TAG_START_CLOSE, errors);
859
+
860
+ parser_consume_whitespace(parser, children);
861
+
531
862
  token_T* tag_name = parser_consume_expected(parser, TOKEN_IDENTIFIER, errors);
532
863
 
533
- while (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
534
- token_T* whitespace = parser_advance(parser);
535
- token_free(whitespace);
536
- }
864
+ parser_consume_whitespace(parser, children);
537
865
 
538
866
  token_T* tag_closing = parser_consume_expected(parser, TOKEN_HTML_TAG_END, errors);
539
867
 
@@ -557,6 +885,7 @@ static AST_HTML_CLOSE_TAG_NODE_T* parser_parse_html_close_tag(parser_T* parser)
557
885
  AST_HTML_CLOSE_TAG_NODE_T* close_tag = ast_html_close_tag_node_init(
558
886
  tag_opening,
559
887
  tag_name,
888
+ children,
560
889
  tag_closing,
561
890
  tag_opening->location->start,
562
891
  tag_closing->location->end,
@@ -596,7 +925,13 @@ static AST_HTML_ELEMENT_NODE_T* parser_parse_html_regular_element(
596
925
 
597
926
  parser_push_open_tag(parser, open_tag->tag_name);
598
927
 
599
- parser_parse_in_data_state(parser, body, errors);
928
+ if (open_tag->tag_name->value && parser_is_foreign_content_tag(open_tag->tag_name->value)) {
929
+ foreign_content_type_T content_type = parser_get_foreign_content_type(open_tag->tag_name->value);
930
+ parser_enter_foreign_content(parser, content_type);
931
+ parser_parse_foreign_content(parser, body, errors);
932
+ } else {
933
+ parser_parse_in_data_state(parser, body, errors);
934
+ }
600
935
 
601
936
  if (!token_is(parser, TOKEN_HTML_TAG_START_CLOSE)) { return parser_handle_missing_close_tag(open_tag, body, errors); }
602
937
 
@@ -633,9 +968,7 @@ static AST_HTML_ELEMENT_NODE_T* parser_parse_html_element(parser_T* parser) {
633
968
  AST_HTML_OPEN_TAG_NODE_T* open_tag = parser_parse_html_open_tag(parser);
634
969
 
635
970
  // <tag />
636
- if (open_tag->is_void || ast_node_is((AST_NODE_T*) open_tag, AST_HTML_SELF_CLOSE_TAG_NODE)) {
637
- return parser_parse_html_self_closing_element(parser, open_tag);
638
- }
971
+ if (open_tag->is_void) { return parser_parse_html_self_closing_element(parser, open_tag); }
639
972
 
640
973
  // <tag>, in void element list, and not in inside an <svg> element
641
974
  if (!open_tag->is_void && is_void_element(open_tag->tag_name->value) && !parser_in_svg_context(parser)) {
@@ -687,6 +1020,68 @@ static AST_ERB_CONTENT_NODE_T* parser_parse_erb_tag(parser_T* parser) {
687
1020
  return erb_node;
688
1021
  }
689
1022
 
1023
+ static void parser_parse_foreign_content(parser_T* parser, array_T* children, array_T* errors) {
1024
+ buffer_T content = buffer_new();
1025
+ position_T* start = position_copy(parser->current_token->location->start);
1026
+ const char* expected_closing_tag = parser_get_foreign_content_closing_tag(parser->foreign_content_type);
1027
+
1028
+ if (expected_closing_tag == NULL) {
1029
+ parser_exit_foreign_content(parser);
1030
+ position_free(start);
1031
+ buffer_free(&content);
1032
+
1033
+ return;
1034
+ }
1035
+
1036
+ while (!token_is(parser, TOKEN_EOF)) {
1037
+ if (token_is(parser, TOKEN_ERB_START)) {
1038
+ parser_append_literal_node_from_buffer(parser, &content, children, start);
1039
+
1040
+ AST_ERB_CONTENT_NODE_T* erb_node = parser_parse_erb_tag(parser);
1041
+ array_append(children, erb_node);
1042
+
1043
+ position_free(start);
1044
+ start = position_copy(parser->current_token->location->start);
1045
+
1046
+ continue;
1047
+ }
1048
+
1049
+ if (token_is(parser, TOKEN_HTML_TAG_START_CLOSE)) {
1050
+ lexer_state_snapshot_T saved_state = lexer_save_state(parser->lexer);
1051
+
1052
+ token_T* next_token = lexer_next_token(parser->lexer);
1053
+ bool is_potential_match = false;
1054
+
1055
+ if (next_token && next_token->type == TOKEN_IDENTIFIER && next_token->value) {
1056
+ is_potential_match = parser_is_expected_closing_tag_name(next_token->value, parser->foreign_content_type);
1057
+ }
1058
+
1059
+ lexer_restore_state(parser->lexer, saved_state);
1060
+
1061
+ if (next_token) { token_free(next_token); }
1062
+
1063
+ if (is_potential_match) {
1064
+ parser_append_literal_node_from_buffer(parser, &content, children, start);
1065
+ parser_exit_foreign_content(parser);
1066
+
1067
+ position_free(start);
1068
+ buffer_free(&content);
1069
+
1070
+ return;
1071
+ }
1072
+ }
1073
+
1074
+ token_T* token = parser_advance(parser);
1075
+ buffer_append(&content, token->value);
1076
+ token_free(token);
1077
+ }
1078
+
1079
+ parser_append_literal_node_from_buffer(parser, &content, children, start);
1080
+ parser_exit_foreign_content(parser);
1081
+ position_free(start);
1082
+ buffer_free(&content);
1083
+ }
1084
+
690
1085
  static void parser_parse_in_data_state(parser_T* parser, array_T* children, array_T* errors) {
691
1086
  while (token_is_none_of(parser, TOKEN_HTML_TAG_START_CLOSE, TOKEN_EOF)) {
692
1087
  if (token_is(parser, TOKEN_ERB_START)) {
@@ -699,6 +1094,16 @@ static void parser_parse_in_data_state(parser_T* parser, array_T* children, arra
699
1094
  continue;
700
1095
  }
701
1096
 
1097
+ if (token_is(parser, TOKEN_XML_DECLARATION)) {
1098
+ array_append(children, parser_parse_xml_declaration(parser));
1099
+ continue;
1100
+ }
1101
+
1102
+ if (token_is(parser, TOKEN_CDATA_START)) {
1103
+ array_append(children, parser_parse_cdata(parser));
1104
+ continue;
1105
+ }
1106
+
702
1107
  if (token_is(parser, TOKEN_HTML_COMMENT_START)) {
703
1108
  array_append(children, parser_parse_html_comment(parser));
704
1109
  continue;
@@ -808,12 +1213,40 @@ AST_DOCUMENT_NODE_T* parser_parse(parser_T* parser) {
808
1213
  return parser_parse_document(parser);
809
1214
  }
810
1215
 
1216
+ static void parser_handle_whitespace(parser_T* parser, token_T* whitespace_token, array_T* children) {
1217
+ if (parser->options && parser->options->track_whitespace) {
1218
+ array_T* errors = array_init(8);
1219
+ AST_WHITESPACE_NODE_T* whitespace_node = ast_whitespace_node_init(
1220
+ whitespace_token,
1221
+ whitespace_token->location->start,
1222
+ whitespace_token->location->end,
1223
+ errors
1224
+ );
1225
+ array_append(children, whitespace_node);
1226
+ }
1227
+
1228
+ token_free(whitespace_token);
1229
+ }
1230
+
1231
+ static void parser_consume_whitespace(parser_T* parser, array_T* children) {
1232
+ while (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
1233
+ token_T* whitespace = parser_advance(parser);
1234
+
1235
+ if (parser->options && parser->options->track_whitespace && children != NULL) {
1236
+ parser_handle_whitespace(parser, whitespace, children);
1237
+ } else {
1238
+ token_free(whitespace);
1239
+ }
1240
+ }
1241
+ }
1242
+
811
1243
  void parser_free(parser_T* parser) {
812
1244
  if (parser == NULL) { return; }
813
1245
 
814
1246
  if (parser->lexer != NULL) { lexer_free(parser->lexer); }
815
1247
  if (parser->current_token != NULL) { token_free(parser->current_token); }
816
1248
  if (parser->open_tags_stack != NULL) { array_free(&parser->open_tags_stack); }
1249
+ if (parser->options != NULL) { free(parser->options); }
817
1250
 
818
1251
  free(parser);
819
1252
  }