herb 0.5.0-x86-linux-musl → 0.6.0-x86-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/src/parser.c CHANGED
@@ -6,6 +6,7 @@
6
6
  #include "include/errors.h"
7
7
  #include "include/html_util.h"
8
8
  #include "include/lexer.h"
9
+ #include "include/lexer_peek_helpers.h"
9
10
  #include "include/parser_helpers.h"
10
11
  #include "include/token.h"
11
12
  #include "include/token_matchers.h"
@@ -19,12 +20,18 @@
19
20
  static void parser_parse_in_data_state(parser_T* parser, array_T* children, array_T* errors);
20
21
  static void parser_parse_foreign_content(parser_T* parser, array_T* children, array_T* errors);
21
22
  static AST_ERB_CONTENT_NODE_T* parser_parse_erb_tag(parser_T* parser);
23
+ static void parser_handle_whitespace(parser_T* parser, token_T* whitespace_token, array_T* children);
24
+ static void parser_consume_whitespace(parser_T* parser, array_T* children);
25
+ static void parser_skip_erb_content(lexer_T* lexer);
26
+ static bool parser_lookahead_erb_is_attribute(lexer_T* lexer);
27
+ static void parser_handle_erb_in_open_tag(parser_T* parser, array_T* children);
28
+ static void parser_handle_whitespace_in_open_tag(parser_T* parser, array_T* children);
22
29
 
23
30
  size_t parser_sizeof(void) {
24
31
  return sizeof(struct PARSER_STRUCT);
25
32
  }
26
33
 
27
- parser_T* parser_init(lexer_T* lexer) {
34
+ parser_T* parser_init(lexer_T* lexer, parser_options_T* options) {
28
35
  parser_T* parser = calloc(1, parser_sizeof());
29
36
 
30
37
  parser->lexer = lexer;
@@ -33,9 +40,59 @@ parser_T* parser_init(lexer_T* lexer) {
33
40
  parser->state = PARSER_STATE_DATA;
34
41
  parser->foreign_content_type = FOREIGN_CONTENT_UNKNOWN;
35
42
 
43
+ if (options) {
44
+ parser->options = calloc(1, sizeof(parser_options_T));
45
+ parser->options->track_whitespace = options->track_whitespace;
46
+ } else {
47
+ parser->options = NULL;
48
+ }
49
+
36
50
  return parser;
37
51
  }
38
52
 
53
+ static AST_CDATA_NODE_T* parser_parse_cdata(parser_T* parser) {
54
+ array_T* errors = array_init(8);
55
+ array_T* children = array_init(8);
56
+ buffer_T content = buffer_new();
57
+
58
+ token_T* tag_opening = parser_consume_expected(parser, TOKEN_CDATA_START, errors);
59
+ position_T* start = position_copy(parser->current_token->location->start);
60
+
61
+ while (token_is_none_of(parser, TOKEN_CDATA_END, TOKEN_EOF)) {
62
+ if (token_is(parser, TOKEN_ERB_START)) {
63
+ parser_append_literal_node_from_buffer(parser, &content, children, start);
64
+ AST_ERB_CONTENT_NODE_T* erb_node = parser_parse_erb_tag(parser);
65
+ array_append(children, erb_node);
66
+ position_free(start);
67
+ start = position_copy(parser->current_token->location->start);
68
+ continue;
69
+ }
70
+
71
+ token_T* token = parser_advance(parser);
72
+ buffer_append(&content, token->value);
73
+ token_free(token);
74
+ }
75
+
76
+ parser_append_literal_node_from_buffer(parser, &content, children, start);
77
+ token_T* tag_closing = parser_consume_expected(parser, TOKEN_CDATA_END, errors);
78
+
79
+ AST_CDATA_NODE_T* cdata = ast_cdata_node_init(
80
+ tag_opening,
81
+ children,
82
+ tag_closing,
83
+ tag_opening->location->start,
84
+ tag_closing->location->end,
85
+ errors
86
+ );
87
+
88
+ position_free(start);
89
+ buffer_free(&content);
90
+ token_free(tag_opening);
91
+ token_free(tag_closing);
92
+
93
+ return cdata;
94
+ }
95
+
39
96
  static AST_HTML_COMMENT_NODE_T* parser_parse_html_comment(parser_T* parser) {
40
97
  array_T* errors = array_init(8);
41
98
  array_T* children = array_init(8);
@@ -128,6 +185,54 @@ static AST_HTML_DOCTYPE_NODE_T* parser_parse_html_doctype(parser_T* parser) {
128
185
  return doctype;
129
186
  }
130
187
 
188
+ static AST_XML_DECLARATION_NODE_T* parser_parse_xml_declaration(parser_T* parser) {
189
+ array_T* errors = array_init(8);
190
+ array_T* children = array_init(8);
191
+ buffer_T content = buffer_new();
192
+
193
+ token_T* tag_opening = parser_consume_expected(parser, TOKEN_XML_DECLARATION, errors);
194
+
195
+ position_T* start = position_copy(parser->current_token->location->start);
196
+
197
+ while (token_is_none_of(parser, TOKEN_XML_DECLARATION_END, TOKEN_EOF)) {
198
+ if (token_is(parser, TOKEN_ERB_START)) {
199
+ parser_append_literal_node_from_buffer(parser, &content, children, start);
200
+
201
+ AST_ERB_CONTENT_NODE_T* erb_node = parser_parse_erb_tag(parser);
202
+ array_append(children, erb_node);
203
+
204
+ position_free(start);
205
+ start = position_copy(parser->current_token->location->start);
206
+
207
+ continue;
208
+ }
209
+
210
+ token_T* token = parser_advance(parser);
211
+ buffer_append(&content, token->value);
212
+ token_free(token);
213
+ }
214
+
215
+ parser_append_literal_node_from_buffer(parser, &content, children, start);
216
+
217
+ token_T* tag_closing = parser_consume_expected(parser, TOKEN_XML_DECLARATION_END, errors);
218
+
219
+ AST_XML_DECLARATION_NODE_T* xml_declaration = ast_xml_declaration_node_init(
220
+ tag_opening,
221
+ children,
222
+ tag_closing,
223
+ tag_opening->location->start,
224
+ tag_closing->location->end,
225
+ errors
226
+ );
227
+
228
+ position_free(start);
229
+ token_free(tag_opening);
230
+ token_free(tag_closing);
231
+ buffer_free(&content);
232
+
233
+ return xml_declaration;
234
+ }
235
+
131
236
  static AST_HTML_TEXT_NODE_T* parser_parse_text_content(parser_T* parser, array_T* document_errors) {
132
237
  position_T* start = position_copy(parser->current_token->location->start);
133
238
 
@@ -188,96 +293,58 @@ static AST_HTML_TEXT_NODE_T* parser_parse_text_content(parser_T* parser, array_T
188
293
 
189
294
  static AST_HTML_ATTRIBUTE_NAME_NODE_T* parser_parse_html_attribute_name(parser_T* parser) {
190
295
  array_T* errors = array_init(8);
296
+ array_T* children = array_init(8);
297
+ buffer_T buffer = buffer_new();
298
+ position_T* start = position_copy(parser->current_token->location->start);
191
299
 
192
- token_T* at_token = parser_consume_if_present(parser, TOKEN_AT);
193
- token_T* first_token = NULL;
194
-
195
- if (at_token != NULL) {
196
- first_token = parser_consume_if_present(parser, TOKEN_IDENTIFIER);
197
-
198
- if (first_token == NULL) {
199
- parser_append_unexpected_token_error(parser, TOKEN_IDENTIFIER, errors);
200
-
201
- AST_HTML_ATTRIBUTE_NAME_NODE_T* attribute_name =
202
- ast_html_attribute_name_node_init(at_token, at_token->location->start, at_token->location->end, errors);
203
-
204
- token_free(at_token);
300
+ while (token_is_none_of(
301
+ parser,
302
+ TOKEN_EQUALS,
303
+ TOKEN_WHITESPACE,
304
+ TOKEN_NEWLINE,
305
+ TOKEN_HTML_TAG_END,
306
+ TOKEN_HTML_TAG_SELF_CLOSE,
307
+ TOKEN_EOF
308
+ )) {
309
+ if (token_is(parser, TOKEN_ERB_START)) {
310
+ parser_append_literal_node_from_buffer(parser, &buffer, children, start);
205
311
 
206
- return attribute_name;
207
- }
208
- } else {
209
- first_token = parser_consume_if_present(parser, TOKEN_IDENTIFIER);
312
+ AST_ERB_CONTENT_NODE_T* erb_node = parser_parse_erb_tag(parser);
313
+ array_append(children, erb_node);
210
314
 
211
- if (first_token == NULL) {
212
- parser_append_unexpected_token_error(parser, TOKEN_IDENTIFIER, errors);
213
- AST_HTML_ATTRIBUTE_NAME_NODE_T* attribute_name = ast_html_attribute_name_node_init(NULL, NULL, NULL, errors);
214
- return attribute_name;
315
+ position_free(start);
316
+ start = position_copy(parser->current_token->location->start);
317
+ continue;
215
318
  }
216
- }
217
-
218
- buffer_T name_buffer = buffer_new();
219
319
 
220
- position_T* start_position;
221
-
222
- if (at_token != NULL) {
223
- buffer_append(&name_buffer, at_token->value);
224
- start_position = position_copy(at_token->location->start);
225
- } else {
226
- start_position = position_copy(first_token->location->start);
320
+ token_T* token = parser_advance(parser);
321
+ buffer_append(&buffer, token->value);
322
+ token_free(token);
227
323
  }
228
324
 
229
- buffer_append(&name_buffer, first_token->value);
230
-
231
- position_T* end_position = position_copy(first_token->location->end);
232
- size_t range_end = first_token->range->to;
233
-
234
- while (parser->current_token->type == TOKEN_CHARACTER && parser->current_token->value
235
- && strcmp(parser->current_token->value, ".") == 0) {
236
-
237
- token_T* dot_token = parser_advance(parser);
238
-
239
- buffer_append(&name_buffer, dot_token->value);
240
- position_free(end_position);
241
-
242
- end_position = position_copy(dot_token->location->end);
243
- range_end = dot_token->range->to;
244
-
245
- token_free(dot_token);
325
+ parser_append_literal_node_from_buffer(parser, &buffer, children, start);
246
326
 
247
- if (parser->current_token->type == TOKEN_IDENTIFIER) {
248
- token_T* next_identifier = parser_advance(parser);
327
+ position_T* node_start = NULL;
328
+ position_T* node_end = NULL;
249
329
 
250
- buffer_append(&name_buffer, next_identifier->value);
251
- position_free(end_position);
330
+ if (children->size > 0) {
331
+ AST_NODE_T* first_child = array_get(children, 0);
332
+ AST_NODE_T* last_child = array_get(children, children->size - 1);
252
333
 
253
- end_position = position_copy(next_identifier->location->end);
254
- range_end = next_identifier->range->to;
255
- token_free(next_identifier);
256
- } else {
257
- break;
258
- }
334
+ node_start = position_copy(first_child->location->start);
335
+ node_end = position_copy(last_child->location->end);
336
+ } else {
337
+ node_start = position_copy(parser->current_token->location->start);
338
+ node_end = position_copy(parser->current_token->location->start);
259
339
  }
260
340
 
261
- token_T* combined_token = calloc(1, sizeof(token_T));
262
- combined_token->value = herb_strdup(name_buffer.value);
263
- combined_token->type = TOKEN_IDENTIFIER;
264
- combined_token->location =
265
- location_from(start_position->line, start_position->column, end_position->line, end_position->column);
266
-
267
- size_t range_start = at_token != NULL ? at_token->range->from : first_token->range->from;
268
- combined_token->range = range_init(range_start, range_end);
269
-
270
341
  AST_HTML_ATTRIBUTE_NAME_NODE_T* attribute_name =
271
- ast_html_attribute_name_node_init(combined_token, start_position, end_position, errors);
272
-
273
- buffer_free(&name_buffer);
274
- position_free(start_position);
275
- position_free(end_position);
276
- token_free(first_token);
277
-
278
- if (at_token != NULL) { token_free(at_token); }
342
+ ast_html_attribute_name_node_init(children, node_start, node_end, errors);
279
343
 
280
- token_free(combined_token);
344
+ position_free(start);
345
+ position_free(node_start);
346
+ position_free(node_end);
347
+ buffer_free(&buffer);
281
348
 
282
349
  return attribute_name;
283
350
  }
@@ -307,11 +374,89 @@ static AST_HTML_ATTRIBUTE_VALUE_NODE_T* parser_parse_quoted_html_attribute_value
307
374
  continue;
308
375
  }
309
376
 
377
+ if (token_is(parser, TOKEN_BACKSLASH)) {
378
+ lexer_state_snapshot_T saved_state = lexer_save_state(parser->lexer);
379
+
380
+ token_T* next_token = lexer_next_token(parser->lexer);
381
+
382
+ if (next_token && next_token->type == TOKEN_QUOTE && opening_quote != NULL
383
+ && strcmp(next_token->value, opening_quote->value) == 0) {
384
+ buffer_append(&buffer, parser->current_token->value);
385
+ buffer_append(&buffer, next_token->value);
386
+
387
+ token_free(parser->current_token);
388
+ token_free(next_token);
389
+
390
+ parser->current_token = lexer_next_token(parser->lexer);
391
+ continue;
392
+ } else {
393
+ lexer_restore_state(parser->lexer, saved_state);
394
+
395
+ if (next_token) { token_free(next_token); }
396
+ }
397
+ }
398
+
310
399
  buffer_append(&buffer, parser->current_token->value);
311
400
  token_free(parser->current_token);
401
+
312
402
  parser->current_token = lexer_next_token(parser->lexer);
313
403
  }
314
404
 
405
+ if (token_is(parser, TOKEN_QUOTE) && opening_quote != NULL
406
+ && strcmp(parser->current_token->value, opening_quote->value) == 0) {
407
+ lexer_state_snapshot_T saved_state = lexer_save_state(parser->lexer);
408
+
409
+ token_T* potential_closing = parser->current_token;
410
+ parser->current_token = lexer_next_token(parser->lexer);
411
+
412
+ if (token_is(parser, TOKEN_IDENTIFIER) || token_is(parser, TOKEN_CHARACTER)) {
413
+ append_unexpected_error(
414
+ "Unescaped quote character in attribute value",
415
+ "escaped quote (\\') or different quote style (\")",
416
+ opening_quote->value,
417
+ potential_closing->location->start,
418
+ potential_closing->location->end,
419
+ errors
420
+ );
421
+
422
+ lexer_restore_state(parser->lexer, saved_state);
423
+
424
+ token_free(parser->current_token);
425
+ parser->current_token = potential_closing;
426
+
427
+ buffer_append(&buffer, parser->current_token->value);
428
+ token_free(parser->current_token);
429
+ parser->current_token = lexer_next_token(parser->lexer);
430
+
431
+ while (!token_is(parser, TOKEN_EOF)
432
+ && !(
433
+ token_is(parser, TOKEN_QUOTE) && opening_quote != NULL
434
+ && strcmp(parser->current_token->value, opening_quote->value) == 0
435
+ )) {
436
+ if (token_is(parser, TOKEN_ERB_START)) {
437
+ parser_append_literal_node_from_buffer(parser, &buffer, children, start);
438
+
439
+ array_append(children, parser_parse_erb_tag(parser));
440
+
441
+ position_free(start);
442
+ start = position_copy(parser->current_token->location->start);
443
+
444
+ continue;
445
+ }
446
+
447
+ buffer_append(&buffer, parser->current_token->value);
448
+ token_free(parser->current_token);
449
+
450
+ parser->current_token = lexer_next_token(parser->lexer);
451
+ }
452
+ } else {
453
+ token_free(parser->current_token);
454
+ parser->current_token = potential_closing;
455
+
456
+ lexer_restore_state(parser->lexer, saved_state);
457
+ }
458
+ }
459
+
315
460
  parser_append_literal_node_from_buffer(parser, &buffer, children, start);
316
461
  position_free(start);
317
462
  buffer_free(&buffer);
@@ -439,18 +584,85 @@ static AST_HTML_ATTRIBUTE_VALUE_NODE_T* parser_parse_html_attribute_value(parser
439
584
  static AST_HTML_ATTRIBUTE_NODE_T* parser_parse_html_attribute(parser_T* parser) {
440
585
  AST_HTML_ATTRIBUTE_NAME_NODE_T* attribute_name = parser_parse_html_attribute_name(parser);
441
586
 
442
- while (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
443
- token_T* whitespace = parser_advance(parser);
444
- token_free(whitespace);
587
+ if (parser->options && parser->options->track_whitespace) {
588
+ bool has_equals = (parser->current_token->type == TOKEN_EQUALS)
589
+ || lexer_peek_for_token_type_after_whitespace(parser->lexer, TOKEN_EQUALS);
590
+
591
+ if (has_equals) {
592
+ buffer_T equals_buffer = buffer_new();
593
+ position_T* equals_start = NULL;
594
+ position_T* equals_end = NULL;
595
+ size_t range_start = 0;
596
+ size_t range_end = 0;
597
+
598
+ while (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
599
+ token_T* whitespace = parser_advance(parser);
600
+
601
+ if (equals_start == NULL) {
602
+ equals_start = position_copy(whitespace->location->start);
603
+ range_start = whitespace->range->from;
604
+ }
605
+
606
+ buffer_append(&equals_buffer, whitespace->value);
607
+ token_free(whitespace);
608
+ }
609
+
610
+ token_T* equals = parser_advance(parser);
611
+
612
+ if (equals_start == NULL) {
613
+ equals_start = position_copy(equals->location->start);
614
+ range_start = equals->range->from;
615
+ }
616
+
617
+ buffer_append(&equals_buffer, equals->value);
618
+ equals_end = position_copy(equals->location->end);
619
+ range_end = equals->range->to;
620
+ token_free(equals);
621
+
622
+ while (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
623
+ token_T* whitespace = parser_advance(parser);
624
+ buffer_append(&equals_buffer, whitespace->value);
625
+ equals_end = position_copy(whitespace->location->end);
626
+ range_end = whitespace->range->to;
627
+ token_free(whitespace);
628
+ }
629
+
630
+ token_T* equals_with_whitespace = calloc(1, sizeof(token_T));
631
+ equals_with_whitespace->type = TOKEN_EQUALS;
632
+ equals_with_whitespace->value = herb_strdup(equals_buffer.value);
633
+ equals_with_whitespace->location = location_init(equals_start, equals_end);
634
+ equals_with_whitespace->range = range_init(range_start, range_end);
635
+
636
+ buffer_free(&equals_buffer);
637
+
638
+ AST_HTML_ATTRIBUTE_VALUE_NODE_T* attribute_value = parser_parse_html_attribute_value(parser);
639
+
640
+ return ast_html_attribute_node_init(
641
+ attribute_name,
642
+ equals_with_whitespace,
643
+ attribute_value,
644
+ attribute_name->base.location->start,
645
+ attribute_value->base.location->end,
646
+ NULL
647
+ );
648
+ } else {
649
+ return ast_html_attribute_node_init(
650
+ attribute_name,
651
+ NULL,
652
+ NULL,
653
+ attribute_name->base.location->start,
654
+ attribute_name->base.location->end,
655
+ NULL
656
+ );
657
+ }
658
+ } else {
659
+ parser_consume_whitespace(parser, NULL);
445
660
  }
446
661
 
447
662
  token_T* equals = parser_consume_if_present(parser, TOKEN_EQUALS);
448
663
 
449
664
  if (equals != NULL) {
450
- while (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
451
- token_T* whitespace = parser_advance(parser);
452
- token_free(whitespace);
453
- }
665
+ parser_consume_whitespace(parser, NULL);
454
666
 
455
667
  AST_HTML_ATTRIBUTE_VALUE_NODE_T* attribute_value = parser_parse_html_attribute_value(parser);
456
668
 
@@ -478,30 +690,103 @@ static AST_HTML_ATTRIBUTE_NODE_T* parser_parse_html_attribute(parser_T* parser)
478
690
  );
479
691
  }
480
692
 
481
- static AST_HTML_OPEN_TAG_NODE_T* parser_parse_html_open_tag(parser_T* parser) {
482
- array_T* errors = array_init(8);
483
- array_T* children = array_init(8);
693
+ static void parser_skip_erb_content(lexer_T* lexer) {
694
+ token_T* token = NULL;
484
695
 
485
- token_T* tag_start = parser_consume_expected(parser, TOKEN_HTML_TAG_START, errors);
486
- token_T* tag_name = parser_consume_expected(parser, TOKEN_IDENTIFIER, errors);
696
+ do {
697
+ token = lexer_next_token(lexer);
487
698
 
488
- while (token_is_none_of(parser, TOKEN_HTML_TAG_END, TOKEN_HTML_TAG_SELF_CLOSE, TOKEN_EOF)) {
489
- token_T* whitespace = parser_consume_if_present(parser, TOKEN_WHITESPACE);
699
+ if (token->type == TOKEN_ERB_END) {
700
+ token_free(token);
701
+ break;
702
+ }
490
703
 
491
- if (whitespace != NULL) {
492
- token_free(whitespace);
704
+ token_free(token);
705
+ } while (true);
706
+ }
707
+
708
+ static bool parser_lookahead_erb_is_attribute(lexer_T* lexer) {
709
+ token_T* after = NULL;
710
+
711
+ do {
712
+ after = lexer_next_token(lexer);
713
+
714
+ if (after->type == TOKEN_EQUALS) {
715
+ token_free(after);
716
+ return true;
717
+ }
718
+
719
+ if (after->type == TOKEN_WHITESPACE || after->type == TOKEN_NEWLINE) {
720
+ token_free(after);
493
721
  continue;
494
722
  }
495
723
 
496
- token_T* newline = parser_consume_if_present(parser, TOKEN_NEWLINE);
724
+ if (after->type == TOKEN_IDENTIFIER || after->type == TOKEN_CHARACTER || after->type == TOKEN_DASH
725
+ || after->type == TOKEN_ERB_START) {
497
726
 
498
- if (newline != NULL) {
499
- token_free(newline);
727
+ if (after->type == TOKEN_ERB_START) {
728
+ token_free(after);
729
+ parser_skip_erb_content(lexer);
730
+ } else {
731
+ token_free(after);
732
+ }
500
733
  continue;
501
734
  }
502
735
 
503
- if (parser->current_token->type == TOKEN_ERB_START) {
504
- array_append(children, parser_parse_erb_tag(parser));
736
+ token_free(after);
737
+ return false;
738
+
739
+ } while (true);
740
+ }
741
+
742
+ static void parser_handle_erb_in_open_tag(parser_T* parser, array_T* children) {
743
+ bool is_output_tag = parser->current_token->value && strlen(parser->current_token->value) >= 3
744
+ && strncmp(parser->current_token->value, "<%=", 3) == 0;
745
+
746
+ if (!is_output_tag) {
747
+ array_append(children, parser_parse_erb_tag(parser));
748
+
749
+ return;
750
+ }
751
+
752
+ lexer_T lexer_copy = *parser->lexer;
753
+
754
+ token_T* erb_start = lexer_next_token(&lexer_copy);
755
+ token_free(erb_start);
756
+ parser_skip_erb_content(&lexer_copy);
757
+
758
+ bool looks_like_attribute = parser_lookahead_erb_is_attribute(&lexer_copy);
759
+
760
+ if (looks_like_attribute) {
761
+ array_append(children, parser_parse_html_attribute(parser));
762
+ } else {
763
+ array_append(children, parser_parse_erb_tag(parser));
764
+ }
765
+ }
766
+
767
+ static void parser_handle_whitespace_in_open_tag(parser_T* parser, array_T* children) {
768
+ token_T* whitespace = parser_consume_if_present(parser, TOKEN_WHITESPACE);
769
+
770
+ if (whitespace != NULL) {
771
+ parser_handle_whitespace(parser, whitespace, children);
772
+ return;
773
+ }
774
+
775
+ token_T* newline = parser_consume_if_present(parser, TOKEN_NEWLINE);
776
+
777
+ if (newline != NULL) { parser_handle_whitespace(parser, newline, children); }
778
+ }
779
+
780
+ static AST_HTML_OPEN_TAG_NODE_T* parser_parse_html_open_tag(parser_T* parser) {
781
+ array_T* errors = array_init(8);
782
+ array_T* children = array_init(8);
783
+
784
+ token_T* tag_start = parser_consume_expected(parser, TOKEN_HTML_TAG_START, errors);
785
+ token_T* tag_name = parser_consume_expected(parser, TOKEN_IDENTIFIER, errors);
786
+
787
+ while (token_is_none_of(parser, TOKEN_HTML_TAG_END, TOKEN_HTML_TAG_SELF_CLOSE, TOKEN_EOF)) {
788
+ if (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
789
+ parser_handle_whitespace_in_open_tag(parser, children);
505
790
  continue;
506
791
  }
507
792
 
@@ -510,6 +795,11 @@ static AST_HTML_OPEN_TAG_NODE_T* parser_parse_html_open_tag(parser_T* parser) {
510
795
  continue;
511
796
  }
512
797
 
798
+ if (parser->current_token->type == TOKEN_ERB_START) {
799
+ parser_handle_erb_in_open_tag(parser, children);
800
+ continue;
801
+ }
802
+
513
803
  if (parser->current_token->type == TOKEN_AT) {
514
804
  array_append(children, parser_parse_html_attribute(parser));
515
805
  continue;
@@ -563,14 +853,15 @@ static AST_HTML_OPEN_TAG_NODE_T* parser_parse_html_open_tag(parser_T* parser) {
563
853
 
564
854
  static AST_HTML_CLOSE_TAG_NODE_T* parser_parse_html_close_tag(parser_T* parser) {
565
855
  array_T* errors = array_init(8);
856
+ array_T* children = array_init(8);
566
857
 
567
858
  token_T* tag_opening = parser_consume_expected(parser, TOKEN_HTML_TAG_START_CLOSE, errors);
859
+
860
+ parser_consume_whitespace(parser, children);
861
+
568
862
  token_T* tag_name = parser_consume_expected(parser, TOKEN_IDENTIFIER, errors);
569
863
 
570
- while (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
571
- token_T* whitespace = parser_advance(parser);
572
- token_free(whitespace);
573
- }
864
+ parser_consume_whitespace(parser, children);
574
865
 
575
866
  token_T* tag_closing = parser_consume_expected(parser, TOKEN_HTML_TAG_END, errors);
576
867
 
@@ -594,6 +885,7 @@ static AST_HTML_CLOSE_TAG_NODE_T* parser_parse_html_close_tag(parser_T* parser)
594
885
  AST_HTML_CLOSE_TAG_NODE_T* close_tag = ast_html_close_tag_node_init(
595
886
  tag_opening,
596
887
  tag_name,
888
+ children,
597
889
  tag_closing,
598
890
  tag_opening->location->start,
599
891
  tag_closing->location->end,
@@ -676,9 +968,7 @@ static AST_HTML_ELEMENT_NODE_T* parser_parse_html_element(parser_T* parser) {
676
968
  AST_HTML_OPEN_TAG_NODE_T* open_tag = parser_parse_html_open_tag(parser);
677
969
 
678
970
  // <tag />
679
- if (open_tag->is_void || ast_node_is((AST_NODE_T*) open_tag, AST_HTML_SELF_CLOSE_TAG_NODE)) {
680
- return parser_parse_html_self_closing_element(parser, open_tag);
681
- }
971
+ if (open_tag->is_void) { return parser_parse_html_self_closing_element(parser, open_tag); }
682
972
 
683
973
  // <tag>, in void element list, and not in inside an <svg> element
684
974
  if (!open_tag->is_void && is_void_element(open_tag->tag_name->value) && !parser_in_svg_context(parser)) {
@@ -757,15 +1047,7 @@ static void parser_parse_foreign_content(parser_T* parser, array_T* children, ar
757
1047
  }
758
1048
 
759
1049
  if (token_is(parser, TOKEN_HTML_TAG_START_CLOSE)) {
760
- size_t saved_position = parser->lexer->current_position;
761
- size_t saved_line = parser->lexer->current_line;
762
- size_t saved_column = parser->lexer->current_column;
763
- size_t saved_previous_position = parser->lexer->previous_position;
764
- size_t saved_previous_line = parser->lexer->previous_line;
765
- size_t saved_previous_column = parser->lexer->previous_column;
766
-
767
- char saved_char = parser->lexer->current_character;
768
- lexer_state_T saved_state = parser->lexer->state;
1050
+ lexer_state_snapshot_T saved_state = lexer_save_state(parser->lexer);
769
1051
 
770
1052
  token_T* next_token = lexer_next_token(parser->lexer);
771
1053
  bool is_potential_match = false;
@@ -774,14 +1056,7 @@ static void parser_parse_foreign_content(parser_T* parser, array_T* children, ar
774
1056
  is_potential_match = parser_is_expected_closing_tag_name(next_token->value, parser->foreign_content_type);
775
1057
  }
776
1058
 
777
- parser->lexer->current_position = saved_position;
778
- parser->lexer->current_line = saved_line;
779
- parser->lexer->current_column = saved_column;
780
- parser->lexer->previous_position = saved_previous_position;
781
- parser->lexer->previous_line = saved_previous_line;
782
- parser->lexer->previous_column = saved_previous_column;
783
- parser->lexer->current_character = saved_char;
784
- parser->lexer->state = saved_state;
1059
+ lexer_restore_state(parser->lexer, saved_state);
785
1060
 
786
1061
  if (next_token) { token_free(next_token); }
787
1062
 
@@ -819,6 +1094,16 @@ static void parser_parse_in_data_state(parser_T* parser, array_T* children, arra
819
1094
  continue;
820
1095
  }
821
1096
 
1097
+ if (token_is(parser, TOKEN_XML_DECLARATION)) {
1098
+ array_append(children, parser_parse_xml_declaration(parser));
1099
+ continue;
1100
+ }
1101
+
1102
+ if (token_is(parser, TOKEN_CDATA_START)) {
1103
+ array_append(children, parser_parse_cdata(parser));
1104
+ continue;
1105
+ }
1106
+
822
1107
  if (token_is(parser, TOKEN_HTML_COMMENT_START)) {
823
1108
  array_append(children, parser_parse_html_comment(parser));
824
1109
  continue;
@@ -928,12 +1213,40 @@ AST_DOCUMENT_NODE_T* parser_parse(parser_T* parser) {
928
1213
  return parser_parse_document(parser);
929
1214
  }
930
1215
 
1216
+ static void parser_handle_whitespace(parser_T* parser, token_T* whitespace_token, array_T* children) {
1217
+ if (parser->options && parser->options->track_whitespace) {
1218
+ array_T* errors = array_init(8);
1219
+ AST_WHITESPACE_NODE_T* whitespace_node = ast_whitespace_node_init(
1220
+ whitespace_token,
1221
+ whitespace_token->location->start,
1222
+ whitespace_token->location->end,
1223
+ errors
1224
+ );
1225
+ array_append(children, whitespace_node);
1226
+ }
1227
+
1228
+ token_free(whitespace_token);
1229
+ }
1230
+
1231
+ static void parser_consume_whitespace(parser_T* parser, array_T* children) {
1232
+ while (token_is_any_of(parser, TOKEN_WHITESPACE, TOKEN_NEWLINE)) {
1233
+ token_T* whitespace = parser_advance(parser);
1234
+
1235
+ if (parser->options && parser->options->track_whitespace && children != NULL) {
1236
+ parser_handle_whitespace(parser, whitespace, children);
1237
+ } else {
1238
+ token_free(whitespace);
1239
+ }
1240
+ }
1241
+ }
1242
+
931
1243
  void parser_free(parser_T* parser) {
932
1244
  if (parser == NULL) { return; }
933
1245
 
934
1246
  if (parser->lexer != NULL) { lexer_free(parser->lexer); }
935
1247
  if (parser->current_token != NULL) { token_free(parser->current_token); }
936
1248
  if (parser->open_tags_stack != NULL) { array_free(&parser->open_tags_stack); }
1249
+ if (parser->options != NULL) { free(parser->options); }
937
1250
 
938
1251
  free(parser);
939
1252
  }