herb 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/src/include/errors.h CHANGED
@@ -1,5 +1,5 @@
1
1
  // NOTE: This file is generated by the templates/template.rb script and should not
2
- // be modified manually. See /Users/marcoroth/Development/herb-release-0.5.0/templates/src/include/errors.h.erb
2
+ // be modified manually. See /Users/marcoroth/Development/herb-release-0.6.0/templates/src/include/errors.h.erb
3
3
 
4
4
  #ifndef HERB_ERRORS_H
5
5
  #define HERB_ERRORS_H
data/src/include/herb.h CHANGED
@@ -5,6 +5,7 @@
5
5
  #include "ast_node.h"
6
6
  #include "buffer.h"
7
7
  #include "extract.h"
8
+ #include "parser.h"
8
9
 
9
10
  #include <stdint.h>
10
11
 
@@ -18,7 +19,7 @@ void herb_lex_json_to_buffer(const char* source, buffer_T* output);
18
19
  array_T* herb_lex(const char* source);
19
20
  array_T* herb_lex_file(const char* path);
20
21
 
21
- AST_DOCUMENT_NODE_T* herb_parse(const char* source);
22
+ AST_DOCUMENT_NODE_T* herb_parse(const char* source, parser_options_T* options);
22
23
 
23
24
  const char* herb_version(void);
24
25
  const char* herb_prism_version(void);
data/src/include/lexer_peek_helpers.h CHANGED
@@ -2,13 +2,28 @@
2
2
  #define HERB_LEXER_PEEK_HELPERS_H
3
3
 
4
4
  #include "lexer_struct.h"
5
+ #include "token_struct.h"
5
6
 
6
7
  #include <stdbool.h>
7
8
  #include <stdio.h>
8
9
  #include <stdlib.h>
9
10
 
11
+ typedef struct {
12
+ size_t position;
13
+ size_t line;
14
+ size_t column;
15
+ size_t previous_position;
16
+ size_t previous_line;
17
+ size_t previous_column;
18
+ char current_character;
19
+ lexer_state_T state;
20
+ } lexer_state_snapshot_T;
21
+
10
22
  char lexer_peek(const lexer_T* lexer, int offset);
11
23
  bool lexer_peek_for_doctype(const lexer_T* lexer, int offset);
24
+ bool lexer_peek_for_xml_declaration(const lexer_T* lexer, int offset);
25
+ bool lexer_peek_for_cdata_start(const lexer_T* lexer, int offset);
26
+ bool lexer_peek_for_cdata_end(const lexer_T* lexer, int offset);
12
27
 
13
28
  bool lexer_peek_for_html_comment_start(const lexer_T* lexer, int offset);
14
29
  bool lexer_peek_for_html_comment_end(const lexer_T* lexer, int offset);
@@ -20,4 +35,10 @@ bool lexer_peek_erb_end(const lexer_T* lexer, int offset);
20
35
 
21
36
  char lexer_backtrack(const lexer_T* lexer, int offset);
22
37
 
38
+ bool lexer_peek_for_token_type_after_whitespace(lexer_T* lexer, token_type_T token_type);
39
+ bool lexer_peek_for_close_tag_start(const lexer_T* lexer, int offset);
40
+
41
+ lexer_state_snapshot_T lexer_save_state(lexer_T* lexer);
42
+ void lexer_restore_state(lexer_T* lexer, lexer_state_snapshot_T snapshot);
43
+
23
44
  #endif
data/src/include/parser.h CHANGED
@@ -15,15 +15,20 @@ typedef enum {
15
15
 
16
16
  typedef enum { PARSER_STATE_DATA, PARSER_STATE_FOREIGN_CONTENT } parser_state_T;
17
17
 
18
+ typedef struct PARSER_OPTIONS_STRUCT {
19
+ bool track_whitespace;
20
+ } parser_options_T;
21
+
18
22
  typedef struct PARSER_STRUCT {
19
23
  lexer_T* lexer;
20
24
  token_T* current_token;
21
25
  array_T* open_tags_stack;
22
26
  parser_state_T state;
23
27
  foreign_content_type_T foreign_content_type;
28
+ parser_options_T* options;
24
29
  } parser_T;
25
30
 
26
- parser_T* parser_init(lexer_T* lexer);
31
+ parser_T* parser_init(lexer_T* lexer, parser_options_T* options);
27
32
 
28
33
  AST_DOCUMENT_NODE_T* parser_parse(parser_T* parser);
29
34
 
data/src/include/token_struct.h CHANGED
@@ -10,7 +10,11 @@ typedef enum {
10
10
  TOKEN_NEWLINE, // \n
11
11
  TOKEN_IDENTIFIER,
12
12
 
13
- TOKEN_HTML_DOCTYPE, // <!DOCTYPE, <!doctype, <!DoCtYpE, <!dOcTyPe
13
+ TOKEN_HTML_DOCTYPE, // <!DOCTYPE, <!doctype, <!DoCtYpE, <!dOcTyPe
14
+ TOKEN_XML_DECLARATION, // <?xml
15
+ TOKEN_XML_DECLARATION_END, // ?>
16
+ TOKEN_CDATA_START, // <![CDATA[
17
+ TOKEN_CDATA_END, // ]]>
14
18
 
15
19
  TOKEN_HTML_TAG_START, // <
16
20
  TOKEN_HTML_TAG_START_CLOSE, // </
@@ -29,6 +33,7 @@ typedef enum {
29
33
  TOKEN_EQUALS, // =
30
34
  TOKEN_QUOTE, // ", '
31
35
  TOKEN_BACKTICK, // `
36
+ TOKEN_BACKSLASH, // backslash
32
37
  TOKEN_DASH, // -
33
38
  TOKEN_UNDERSCORE, // _
34
39
  TOKEN_EXCLAMATION, // !
data/src/include/version.h CHANGED
@@ -1,6 +1,6 @@
1
1
  #ifndef HERB_VERSION_H
2
2
  #define HERB_VERSION_H
3
3
 
4
- #define HERB_VERSION "0.5.0"
4
+ #define HERB_VERSION "0.6.0"
5
5
 
6
6
  #endif
data/src/lexer.c CHANGED
@@ -290,13 +290,21 @@ token_T* lexer_next_token(lexer_T* lexer) {
290
290
  return lexer_advance_with_next(lexer, strlen("<!DOCTYPE"), TOKEN_HTML_DOCTYPE);
291
291
  }
292
292
 
293
+ if (lexer_peek_for_xml_declaration(lexer, 0)) {
294
+ return lexer_advance_with_next(lexer, strlen("<?xml"), TOKEN_XML_DECLARATION);
295
+ }
296
+
297
+ if (lexer_peek_for_cdata_start(lexer, 0)) {
298
+ return lexer_advance_with_next(lexer, strlen("<![CDATA["), TOKEN_CDATA_START);
299
+ }
300
+
293
301
  if (isalnum(lexer_peek(lexer, 1))) { return lexer_advance_current(lexer, TOKEN_HTML_TAG_START); }
294
302
 
295
303
  if (lexer_peek_for_html_comment_start(lexer, 0)) {
296
304
  return lexer_advance_with(lexer, "<!--", TOKEN_HTML_COMMENT_START);
297
305
  }
298
306
 
299
- if (lexer_peek(lexer, 1) == '/' && isalnum(lexer_peek(lexer, 2))) {
307
+ if (lexer_peek_for_close_tag_start(lexer, 0)) {
300
308
  return lexer_advance_with(lexer, "</", TOKEN_HTML_TAG_START_CLOSE);
301
309
  }
302
310
 
@@ -308,11 +316,21 @@ token_T* lexer_next_token(lexer_T* lexer) {
308
316
  return token ? token : lexer_advance_current(lexer, TOKEN_SLASH);
309
317
  }
310
318
 
319
+ case '?': {
320
+ token_T* token = lexer_match_and_advance(lexer, "?>", TOKEN_XML_DECLARATION_END);
321
+ return token ? token : lexer_advance_current(lexer, TOKEN_CHARACTER);
322
+ }
323
+
311
324
  case '-': {
312
325
  token_T* token = lexer_match_and_advance(lexer, "-->", TOKEN_HTML_COMMENT_END);
313
326
  return token ? token : lexer_advance_current(lexer, TOKEN_DASH);
314
327
  }
315
328
 
329
+ case ']': {
330
+ token_T* token = lexer_match_and_advance(lexer, "]]>", TOKEN_CDATA_END);
331
+ return token ? token : lexer_advance_current(lexer, TOKEN_CHARACTER);
332
+ }
333
+
316
334
  case '>': return lexer_advance_current(lexer, TOKEN_HTML_TAG_END);
317
335
  case '_': return lexer_advance_current(lexer, TOKEN_UNDERSCORE);
318
336
  case ':': return lexer_advance_current(lexer, TOKEN_COLON);
@@ -326,6 +344,7 @@ token_T* lexer_next_token(lexer_T* lexer) {
326
344
  case '"':
327
345
  case '\'': return lexer_advance_current(lexer, TOKEN_QUOTE);
328
346
  case '`': return lexer_advance_current(lexer, TOKEN_BACKTICK);
347
+ case '\\': return lexer_advance_current(lexer, TOKEN_BACKSLASH);
329
348
 
330
349
  default: {
331
350
  if (isalnum(lexer->current_character)) { return lexer_parse_identifier(lexer); }
data/src/lexer_peek_helpers.c CHANGED
@@ -1,6 +1,8 @@
1
1
  #include "include/lexer_peek_helpers.h"
2
+ #include "include/lexer.h"
2
3
  #include "include/lexer_struct.h"
3
4
  #include "include/macros.h"
5
+ #include "include/token.h"
4
6
 
5
7
  #include <ctype.h>
6
8
  #include <stdbool.h>
@@ -31,6 +33,18 @@ bool lexer_peek_for_doctype(const lexer_T* lexer, const int offset) {
31
33
  return lexer_peek_for(lexer, offset, "<!DOCTYPE", true);
32
34
  }
33
35
 
36
+ bool lexer_peek_for_xml_declaration(const lexer_T* lexer, const int offset) {
37
+ return lexer_peek_for(lexer, offset, "<?xml", true);
38
+ }
39
+
40
+ bool lexer_peek_for_cdata_start(const lexer_T* lexer, const int offset) {
41
+ return lexer_peek_for(lexer, offset, "<![CDATA[", false);
42
+ }
43
+
44
+ bool lexer_peek_for_cdata_end(const lexer_T* lexer, const int offset) {
45
+ return lexer_peek_for(lexer, offset, "]]>", false);
46
+ }
47
+
34
48
  bool lexer_peek_for_html_comment_start(const lexer_T* lexer, const int offset) {
35
49
  return lexer_peek_for(lexer, offset, "<!--", false);
36
50
  }
@@ -57,3 +71,66 @@ bool lexer_peek_erb_end(const lexer_T* lexer, const int offset) {
57
71
  || lexer_peek_erb_percent_close_tag(lexer, offset)
58
72
  );
59
73
  }
74
+
75
+ bool lexer_peek_for_token_type_after_whitespace(lexer_T* lexer, token_type_T token_type) {
76
+ size_t saved_position = lexer->current_position;
77
+ size_t saved_line = lexer->current_line;
78
+ size_t saved_column = lexer->current_column;
79
+ char saved_character = lexer->current_character;
80
+
81
+ token_T* token = lexer_next_token(lexer);
82
+
83
+ while (token && (token->type == TOKEN_WHITESPACE || token->type == TOKEN_NEWLINE)) {
84
+ token_free(token);
85
+ token = lexer_next_token(lexer);
86
+ }
87
+
88
+ bool result = (token && token->type == token_type);
89
+
90
+ if (token) { token_free(token); }
91
+
92
+ lexer->current_position = saved_position;
93
+ lexer->current_line = saved_line;
94
+ lexer->current_column = saved_column;
95
+ lexer->current_character = saved_character;
96
+
97
+ return result;
98
+ }
99
+
100
+ bool lexer_peek_for_close_tag_start(const lexer_T* lexer, const int offset) {
101
+ if (lexer_peek(lexer, offset) != '<' || lexer_peek(lexer, offset + 1) != '/') { return false; }
102
+
103
+ int pos = offset + 2;
104
+
105
+ while (lexer_peek(lexer, pos) == ' ' || lexer_peek(lexer, pos) == '\t' || lexer_peek(lexer, pos) == '\n'
106
+ || lexer_peek(lexer, pos) == '\r') {
107
+ pos++;
108
+ }
109
+
110
+ char c = lexer_peek(lexer, pos);
111
+
112
+ return isalpha(c) || c == '_';
113
+ }
114
+
115
+ lexer_state_snapshot_T lexer_save_state(lexer_T* lexer) {
116
+ lexer_state_snapshot_T snapshot = { .position = lexer->current_position,
117
+ .line = lexer->current_line,
118
+ .column = lexer->current_column,
119
+ .previous_position = lexer->previous_position,
120
+ .previous_line = lexer->previous_line,
121
+ .previous_column = lexer->previous_column,
122
+ .current_character = lexer->current_character,
123
+ .state = lexer->state };
124
+ return snapshot;
125
+ }
126
+
127
+ void lexer_restore_state(lexer_T* lexer, lexer_state_snapshot_T snapshot) {
128
+ lexer->current_position = snapshot.position;
129
+ lexer->current_line = snapshot.line;
130
+ lexer->current_column = snapshot.column;
131
+ lexer->previous_position = snapshot.previous_position;
132
+ lexer->previous_line = snapshot.previous_line;
133
+ lexer->previous_column = snapshot.previous_column;
134
+ lexer->current_character = snapshot.current_character;
135
+ lexer->state = snapshot.state;
136
+ }
data/src/main.c CHANGED
@@ -63,7 +63,7 @@ int main(const int argc, char* argv[]) {
63
63
  clock_gettime(CLOCK_MONOTONIC, &start);
64
64
 
65
65
  if (strcmp(argv[1], "visit") == 0) {
66
- AST_DOCUMENT_NODE_T* root = herb_parse(source);
66
+ AST_DOCUMENT_NODE_T* root = herb_parse(source, NULL);
67
67
  clock_gettime(CLOCK_MONOTONIC, &end);
68
68
 
69
69
  herb_analyze_parse_tree(root, source);
@@ -105,7 +105,7 @@ int main(const int argc, char* argv[]) {
105
105
  }
106
106
 
107
107
  if (strcmp(argv[1], "parse") == 0) {
108
- AST_DOCUMENT_NODE_T* root = herb_parse(source);
108
+ AST_DOCUMENT_NODE_T* root = herb_parse(source, NULL);
109
109
  clock_gettime(CLOCK_MONOTONIC, &end);
110
110
 
111
111
  ast_pretty_print_node((AST_NODE_T*) root, 0, 0, &output);