herb 0.7.5 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Makefile +8 -5
- data/config.yml +26 -6
- data/ext/herb/error_helpers.c +57 -3
- data/ext/herb/error_helpers.h +1 -1
- data/ext/herb/extconf.rb +1 -0
- data/ext/herb/extension.c +10 -24
- data/ext/herb/extension_helpers.c +3 -3
- data/ext/herb/extension_helpers.h +1 -1
- data/ext/herb/nodes.c +72 -37
- data/herb.gemspec +0 -2
- data/lib/herb/ast/helpers.rb +11 -0
- data/lib/herb/ast/node.rb +15 -6
- data/lib/herb/ast/nodes.rb +609 -392
- data/lib/herb/cli.rb +31 -0
- data/lib/herb/colors.rb +82 -0
- data/lib/herb/engine/compiler.rb +140 -14
- data/lib/herb/engine/debug_visitor.rb +1 -5
- data/lib/herb/engine/parser_error_overlay.rb +1 -1
- data/lib/herb/engine.rb +8 -14
- data/lib/herb/errors.rb +166 -56
- data/lib/herb/location.rb +2 -2
- data/lib/herb/project.rb +86 -21
- data/lib/herb/token.rb +14 -2
- data/lib/herb/version.rb +1 -1
- data/lib/herb.rb +1 -0
- data/sig/herb/ast/helpers.rbs +3 -0
- data/sig/herb/ast/node.rbs +12 -5
- data/sig/herb/ast/nodes.rbs +124 -62
- data/sig/herb/colors.rbs +35 -0
- data/sig/herb/engine/compiler.rbs +23 -1
- data/sig/herb/errors.rbs +74 -20
- data/sig/herb/token.rbs +8 -0
- data/sig/herb_c_extension.rbs +1 -1
- data/sig/serialized_ast_errors.rbs +8 -0
- data/src/analyze.c +420 -171
- data/src/analyze_helpers.c +5 -0
- data/src/analyze_missing_end.c +147 -0
- data/src/analyze_transform.c +196 -0
- data/src/analyzed_ruby.c +23 -2
- data/src/ast_node.c +5 -5
- data/src/ast_nodes.c +179 -179
- data/src/ast_pretty_print.c +232 -232
- data/src/element_source.c +7 -6
- data/src/errors.c +246 -126
- data/src/extract.c +92 -34
- data/src/herb.c +37 -49
- data/src/html_util.c +34 -96
- data/src/include/analyze.h +10 -2
- data/src/include/analyze_helpers.h +3 -0
- data/src/include/analyzed_ruby.h +4 -2
- data/src/include/ast_node.h +2 -2
- data/src/include/ast_nodes.h +67 -66
- data/src/include/ast_pretty_print.h +2 -2
- data/src/include/element_source.h +3 -1
- data/src/include/errors.h +30 -14
- data/src/include/extract.h +4 -4
- data/src/include/herb.h +6 -7
- data/src/include/html_util.h +4 -5
- data/src/include/lexer.h +1 -3
- data/src/include/lexer_peek_helpers.h +14 -14
- data/src/include/lexer_struct.h +3 -2
- data/src/include/macros.h +4 -0
- data/src/include/parser.h +12 -6
- data/src/include/parser_helpers.h +25 -15
- data/src/include/pretty_print.h +38 -28
- data/src/include/token.h +5 -8
- data/src/include/utf8.h +3 -2
- data/src/include/util/hb_arena.h +31 -0
- data/src/include/util/hb_arena_debug.h +8 -0
- data/src/include/util/hb_array.h +33 -0
- data/src/include/util/hb_buffer.h +34 -0
- data/src/include/util/hb_string.h +29 -0
- data/src/include/util/hb_system.h +9 -0
- data/src/include/util.h +3 -14
- data/src/include/version.h +1 -1
- data/src/include/visitor.h +1 -1
- data/src/io.c +7 -4
- data/src/lexer.c +61 -88
- data/src/lexer_peek_helpers.c +35 -37
- data/src/main.c +19 -23
- data/src/parser.c +282 -201
- data/src/parser_helpers.c +46 -40
- data/src/parser_match_tags.c +316 -0
- data/src/pretty_print.c +82 -106
- data/src/token.c +18 -65
- data/src/utf8.c +4 -4
- data/src/util/hb_arena.c +179 -0
- data/src/util/hb_arena_debug.c +237 -0
- data/src/{array.c → util/hb_array.c} +26 -27
- data/src/util/hb_buffer.c +203 -0
- data/src/util/hb_string.c +85 -0
- data/src/util/hb_system.c +30 -0
- data/src/util.c +29 -99
- data/src/visitor.c +54 -54
- data/templates/ext/herb/error_helpers.c.erb +3 -3
- data/templates/ext/herb/error_helpers.h.erb +1 -1
- data/templates/ext/herb/nodes.c.erb +11 -6
- data/templates/java/error_helpers.c.erb +75 -0
- data/templates/java/error_helpers.h.erb +20 -0
- data/templates/java/nodes.c.erb +97 -0
- data/templates/java/nodes.h.erb +23 -0
- data/templates/java/org/herb/ast/Errors.java.erb +121 -0
- data/templates/java/org/herb/ast/NodeVisitor.java.erb +14 -0
- data/templates/java/org/herb/ast/Nodes.java.erb +220 -0
- data/templates/java/org/herb/ast/Visitor.java.erb +56 -0
- data/templates/javascript/packages/node/extension/error_helpers.cpp.erb +8 -8
- data/templates/javascript/packages/node/extension/error_helpers.h.erb +1 -1
- data/templates/javascript/packages/node/extension/nodes.cpp.erb +9 -9
- data/templates/javascript/packages/node/extension/nodes.h.erb +1 -1
- data/templates/lib/herb/ast/nodes.rb.erb +28 -16
- data/templates/lib/herb/errors.rb.erb +17 -12
- data/templates/rust/src/ast/nodes.rs.erb +220 -0
- data/templates/rust/src/errors.rs.erb +216 -0
- data/templates/rust/src/nodes.rs.erb +374 -0
- data/templates/src/analyze_missing_end.c.erb +36 -0
- data/templates/src/analyze_transform.c.erb +24 -0
- data/templates/src/ast_nodes.c.erb +14 -14
- data/templates/src/ast_pretty_print.c.erb +36 -36
- data/templates/src/errors.c.erb +31 -31
- data/templates/src/include/ast_nodes.h.erb +10 -9
- data/templates/src/include/ast_pretty_print.h.erb +2 -2
- data/templates/src/include/errors.h.erb +6 -6
- data/templates/src/parser_match_tags.c.erb +38 -0
- data/templates/src/visitor.c.erb +4 -4
- data/templates/template.rb +22 -3
- data/templates/wasm/error_helpers.cpp.erb +9 -9
- data/templates/wasm/error_helpers.h.erb +1 -1
- data/templates/wasm/nodes.cpp.erb +9 -9
- data/templates/wasm/nodes.h.erb +1 -1
- data/vendor/prism/Rakefile +4 -1
- data/vendor/prism/config.yml +2 -1
- data/vendor/prism/include/prism/ast.h +31 -1
- data/vendor/prism/include/prism/diagnostic.h +1 -0
- data/vendor/prism/include/prism/version.h +3 -3
- data/vendor/prism/src/diagnostic.c +3 -1
- data/vendor/prism/src/prism.c +130 -71
- data/vendor/prism/src/util/pm_string.c +6 -8
- data/vendor/prism/templates/include/prism/ast.h.erb +2 -0
- data/vendor/prism/templates/java/org/prism/Loader.java.erb +2 -2
- data/vendor/prism/templates/javascript/src/deserialize.js.erb +2 -2
- data/vendor/prism/templates/lib/prism/serialize.rb.erb +2 -2
- data/vendor/prism/templates/sig/prism.rbs.erb +4 -0
- data/vendor/prism/templates/src/diagnostic.c.erb +1 -0
- metadata +34 -20
- data/lib/herb/libherb/array.rb +0 -51
- data/lib/herb/libherb/ast_node.rb +0 -50
- data/lib/herb/libherb/buffer.rb +0 -56
- data/lib/herb/libherb/extract_result.rb +0 -20
- data/lib/herb/libherb/lex_result.rb +0 -32
- data/lib/herb/libherb/libherb.rb +0 -52
- data/lib/herb/libherb/parse_result.rb +0 -20
- data/lib/herb/libherb/token.rb +0 -46
- data/lib/herb/libherb.rb +0 -35
- data/src/buffer.c +0 -241
- data/src/include/array.h +0 -33
- data/src/include/buffer.h +0 -39
- data/src/include/json.h +0 -28
- data/src/include/memory.h +0 -12
- data/src/json.c +0 -205
- data/src/memory.c +0 -53
data/src/lexer.c
CHANGED
|
@@ -1,24 +1,21 @@
|
|
|
1
|
-
#include "include/buffer.h"
|
|
2
1
|
#include "include/lexer_peek_helpers.h"
|
|
3
2
|
#include "include/token.h"
|
|
4
3
|
#include "include/utf8.h"
|
|
5
4
|
#include "include/util.h"
|
|
5
|
+
#include "include/util/hb_buffer.h"
|
|
6
|
+
#include "include/util/hb_string.h"
|
|
6
7
|
|
|
7
8
|
#include <ctype.h>
|
|
8
9
|
#include <string.h>
|
|
9
10
|
|
|
10
11
|
#define LEXER_STALL_LIMIT 5
|
|
11
12
|
|
|
12
|
-
static size_t lexer_sizeof(void) {
|
|
13
|
-
return sizeof(struct LEXER_STRUCT);
|
|
14
|
-
}
|
|
15
|
-
|
|
16
13
|
static bool lexer_eof(const lexer_T* lexer) {
|
|
17
14
|
return lexer->current_character == '\0' || lexer->stalled;
|
|
18
15
|
}
|
|
19
16
|
|
|
20
17
|
static bool lexer_has_more_characters(const lexer_T* lexer) {
|
|
21
|
-
return lexer->current_position < lexer->source_length;
|
|
18
|
+
return lexer->current_position < lexer->source.length;
|
|
22
19
|
}
|
|
23
20
|
|
|
24
21
|
static bool lexer_stalled(lexer_T* lexer) {
|
|
@@ -34,17 +31,16 @@ static bool lexer_stalled(lexer_T* lexer) {
|
|
|
34
31
|
return lexer->stalled;
|
|
35
32
|
}
|
|
36
33
|
|
|
37
|
-
lexer_T* lexer_init(const char* source) {
|
|
38
|
-
if (source
|
|
39
|
-
|
|
40
|
-
|
|
34
|
+
void lexer_init(lexer_T* lexer, const char* source) {
|
|
35
|
+
if (source != NULL) {
|
|
36
|
+
lexer->source = hb_string(source);
|
|
37
|
+
} else {
|
|
38
|
+
lexer->source = hb_string("");
|
|
39
|
+
}
|
|
41
40
|
|
|
41
|
+
lexer->current_character = lexer->source.data[0];
|
|
42
42
|
lexer->state = STATE_DATA;
|
|
43
43
|
|
|
44
|
-
lexer->source = source;
|
|
45
|
-
lexer->source_length = (uint32_t) strlen(source);
|
|
46
|
-
lexer->current_character = source[0];
|
|
47
|
-
|
|
48
44
|
lexer->current_line = 1;
|
|
49
45
|
lexer->current_column = 0;
|
|
50
46
|
lexer->current_position = 0;
|
|
@@ -56,8 +52,6 @@ lexer_T* lexer_init(const char* source) {
|
|
|
56
52
|
lexer->stall_counter = 0;
|
|
57
53
|
lexer->last_position = 0;
|
|
58
54
|
lexer->stalled = false;
|
|
59
|
-
|
|
60
|
-
return lexer;
|
|
61
55
|
}
|
|
62
56
|
|
|
63
57
|
token_T* lexer_error(lexer_T* lexer, const char* message) {
|
|
@@ -73,7 +67,7 @@ token_T* lexer_error(lexer_T* lexer, const char* message) {
|
|
|
73
67
|
lexer->current_column
|
|
74
68
|
);
|
|
75
69
|
|
|
76
|
-
return token_init(error_message, TOKEN_ERROR, lexer);
|
|
70
|
+
return token_init(hb_string(error_message), TOKEN_ERROR, lexer);
|
|
77
71
|
}
|
|
78
72
|
|
|
79
73
|
static void lexer_advance(lexer_T* lexer) {
|
|
@@ -81,7 +75,7 @@ static void lexer_advance(lexer_T* lexer) {
|
|
|
81
75
|
if (!is_newline(lexer->current_character)) { lexer->current_column++; }
|
|
82
76
|
|
|
83
77
|
lexer->current_position++;
|
|
84
|
-
lexer->current_character = lexer->source[lexer->current_position];
|
|
78
|
+
lexer->current_character = lexer->source.data[lexer->current_position];
|
|
85
79
|
}
|
|
86
80
|
}
|
|
87
81
|
|
|
@@ -93,11 +87,11 @@ static void lexer_advance_utf8_bytes(lexer_T* lexer, int byte_count) {
|
|
|
93
87
|
|
|
94
88
|
lexer->current_position += byte_count;
|
|
95
89
|
|
|
96
|
-
if (lexer->current_position >= lexer->source_length) {
|
|
97
|
-
lexer->current_position = lexer->source_length;
|
|
90
|
+
if (lexer->current_position >= lexer->source.length) {
|
|
91
|
+
lexer->current_position = lexer->source.length;
|
|
98
92
|
lexer->current_character = '\0';
|
|
99
93
|
} else {
|
|
100
|
-
lexer->current_character = lexer->source[lexer->current_position];
|
|
94
|
+
lexer->current_character = lexer->source.data[lexer->current_position];
|
|
101
95
|
}
|
|
102
96
|
}
|
|
103
97
|
}
|
|
@@ -108,65 +102,50 @@ static void lexer_advance_by(lexer_T* lexer, const size_t count) {
|
|
|
108
102
|
}
|
|
109
103
|
}
|
|
110
104
|
|
|
111
|
-
static token_T* lexer_advance_with(lexer_T* lexer,
|
|
112
|
-
lexer_advance_by(lexer,
|
|
105
|
+
static token_T* lexer_advance_with(lexer_T* lexer, hb_string_T value, const token_type_T type) {
|
|
106
|
+
lexer_advance_by(lexer, value.length);
|
|
113
107
|
return token_init(value, type, lexer);
|
|
114
108
|
}
|
|
115
109
|
|
|
116
110
|
static token_T* lexer_advance_with_next(lexer_T* lexer, size_t count, token_type_T type) {
|
|
117
|
-
|
|
118
|
-
if (!collected) { return NULL; }
|
|
111
|
+
uint32_t start_position = lexer->current_position;
|
|
119
112
|
|
|
120
113
|
for (size_t i = 0; i < count; i++) {
|
|
121
|
-
collected[i] = lexer->current_character;
|
|
122
114
|
lexer_advance(lexer);
|
|
123
115
|
}
|
|
124
116
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
token_T* token = token_init(collected, type, lexer);
|
|
128
|
-
free(collected);
|
|
117
|
+
token_T* token = token_init(hb_string_range(lexer->source, start_position, lexer->current_position), type, lexer);
|
|
129
118
|
|
|
130
119
|
return token;
|
|
131
120
|
}
|
|
132
121
|
|
|
133
122
|
static token_T* lexer_advance_current(lexer_T* lexer, const token_type_T type) {
|
|
134
|
-
|
|
123
|
+
char buffer[2];
|
|
124
|
+
buffer[0] = lexer->current_character;
|
|
125
|
+
buffer[1] = '\0';
|
|
126
|
+
|
|
127
|
+
return lexer_advance_with(lexer, hb_string(buffer), type);
|
|
135
128
|
}
|
|
136
129
|
|
|
137
130
|
static token_T* lexer_advance_utf8_character(lexer_T* lexer, const token_type_T type) {
|
|
138
|
-
int char_byte_length = utf8_sequence_length(lexer->source, lexer->current_position, lexer->source_length);
|
|
139
|
-
|
|
131
|
+
int char_byte_length = utf8_sequence_length(lexer->source.data, lexer->current_position, lexer->source.length);
|
|
140
132
|
if (char_byte_length <= 1) { return lexer_advance_current(lexer, type); }
|
|
141
|
-
|
|
142
|
-
char* utf8_char = malloc(char_byte_length + 1);
|
|
143
|
-
|
|
144
|
-
if (!utf8_char) { return lexer_advance_current(lexer, type); }
|
|
133
|
+
uint32_t start_position = lexer->current_position;
|
|
145
134
|
|
|
146
135
|
for (int i = 0; i < char_byte_length; i++) {
|
|
147
|
-
if (lexer->current_position + i >= lexer->source_length) {
|
|
148
|
-
free(utf8_char);
|
|
149
|
-
return lexer_advance_current(lexer, type);
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
utf8_char[i] = lexer->source[lexer->current_position + i];
|
|
136
|
+
if (lexer->current_position + i >= lexer->source.length) { return lexer_advance_current(lexer, type); }
|
|
153
137
|
}
|
|
154
138
|
|
|
155
|
-
utf8_char[char_byte_length] = '\0';
|
|
156
|
-
|
|
157
139
|
lexer_advance_utf8_bytes(lexer, char_byte_length);
|
|
158
140
|
|
|
159
|
-
token_T* token = token_init(
|
|
160
|
-
|
|
161
|
-
free(utf8_char);
|
|
141
|
+
token_T* token = token_init(hb_string_range(lexer->source, start_position, lexer->current_position), type, lexer);
|
|
162
142
|
|
|
163
143
|
return token;
|
|
164
144
|
}
|
|
165
145
|
|
|
166
|
-
static token_T* lexer_match_and_advance(lexer_T* lexer, const char* value, const token_type_T type) {
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
}
|
|
146
|
+
static token_T* lexer_match_and_advance(lexer_T* lexer, hb_string_T value, const token_type_T type) {
|
|
147
|
+
hb_string_T remaining_source = hb_string_slice(lexer->source, lexer->current_position);
|
|
148
|
+
if (hb_string_starts_with(remaining_source, value)) { return lexer_advance_with(lexer, value, type); }
|
|
170
149
|
|
|
171
150
|
return NULL;
|
|
172
151
|
}
|
|
@@ -174,35 +153,31 @@ static token_T* lexer_match_and_advance(lexer_T* lexer, const char* value, const
|
|
|
174
153
|
// ===== Specialized Parsers
|
|
175
154
|
|
|
176
155
|
static token_T* lexer_parse_whitespace(lexer_T* lexer) {
|
|
177
|
-
|
|
156
|
+
uint32_t start_position = lexer->current_position;
|
|
178
157
|
|
|
179
158
|
while (isspace(lexer->current_character) && lexer->current_character != '\n' && lexer->current_character != '\r'
|
|
180
159
|
&& !lexer_eof(lexer)) {
|
|
181
|
-
buffer_append_char(&buffer, lexer->current_character);
|
|
182
160
|
lexer_advance(lexer);
|
|
183
161
|
}
|
|
184
162
|
|
|
185
|
-
token_T* token =
|
|
186
|
-
|
|
187
|
-
buffer_free(&buffer);
|
|
163
|
+
token_T* token =
|
|
164
|
+
token_init(hb_string_range(lexer->source, start_position, lexer->current_position), TOKEN_WHITESPACE, lexer);
|
|
188
165
|
|
|
189
166
|
return token;
|
|
190
167
|
}
|
|
191
168
|
|
|
192
169
|
static token_T* lexer_parse_identifier(lexer_T* lexer) {
|
|
193
|
-
|
|
170
|
+
uint32_t start_position = lexer->current_position;
|
|
194
171
|
|
|
195
172
|
while ((isalnum(lexer->current_character) || lexer->current_character == '-' || lexer->current_character == '_'
|
|
196
173
|
|| lexer->current_character == ':')
|
|
197
174
|
&& !lexer_peek_for_html_comment_end(lexer, 0) && !lexer_eof(lexer)) {
|
|
198
175
|
|
|
199
|
-
buffer_append_char(&buffer, lexer->current_character);
|
|
200
176
|
lexer_advance(lexer);
|
|
201
177
|
}
|
|
202
178
|
|
|
203
|
-
token_T* token =
|
|
204
|
-
|
|
205
|
-
buffer_free(&buffer);
|
|
179
|
+
token_T* token =
|
|
180
|
+
token_init(hb_string_range(lexer->source, start_position, lexer->current_position), TOKEN_IDENTIFIER, lexer);
|
|
206
181
|
|
|
207
182
|
return token;
|
|
208
183
|
}
|
|
@@ -210,7 +185,8 @@ static token_T* lexer_parse_identifier(lexer_T* lexer) {
|
|
|
210
185
|
// ===== ERB Parsing
|
|
211
186
|
|
|
212
187
|
static token_T* lexer_parse_erb_open(lexer_T* lexer) {
|
|
213
|
-
|
|
188
|
+
hb_string_T erb_patterns[] = { hb_string("<%=="), hb_string("<%%="), hb_string("<%="), hb_string("<%#"),
|
|
189
|
+
hb_string("<%-"), hb_string("<%%"), hb_string("<%") };
|
|
214
190
|
|
|
215
191
|
lexer->state = STATE_ERB_CONTENT;
|
|
216
192
|
|
|
@@ -223,14 +199,18 @@ static token_T* lexer_parse_erb_open(lexer_T* lexer) {
|
|
|
223
199
|
}
|
|
224
200
|
|
|
225
201
|
static token_T* lexer_parse_erb_content(lexer_T* lexer) {
|
|
226
|
-
|
|
202
|
+
uint32_t start_position = lexer->current_position;
|
|
227
203
|
|
|
228
204
|
while (!lexer_peek_erb_end(lexer, 0)) {
|
|
229
205
|
if (lexer_eof(lexer)) {
|
|
230
|
-
|
|
231
|
-
|
|
206
|
+
token_T* token = token_init(
|
|
207
|
+
hb_string_range(lexer->source, start_position, lexer->current_position),
|
|
208
|
+
TOKEN_ERROR,
|
|
209
|
+
lexer
|
|
210
|
+
); // Handle unexpected EOF
|
|
232
211
|
|
|
233
|
-
|
|
212
|
+
return token;
|
|
213
|
+
}
|
|
234
214
|
|
|
235
215
|
if (is_newline(lexer->current_character)) {
|
|
236
216
|
lexer->current_line++;
|
|
@@ -240,14 +220,13 @@ static token_T* lexer_parse_erb_content(lexer_T* lexer) {
|
|
|
240
220
|
}
|
|
241
221
|
|
|
242
222
|
lexer->current_position++;
|
|
243
|
-
lexer->current_character = lexer->source[lexer->current_position];
|
|
223
|
+
lexer->current_character = lexer->source.data[lexer->current_position];
|
|
244
224
|
}
|
|
245
225
|
|
|
246
226
|
lexer->state = STATE_ERB_CLOSE;
|
|
247
227
|
|
|
248
|
-
token_T* token =
|
|
249
|
-
|
|
250
|
-
buffer_free(&buffer);
|
|
228
|
+
token_T* token =
|
|
229
|
+
token_init(hb_string_range(lexer->source, start_position, lexer->current_position), TOKEN_ERB_CONTENT, lexer);
|
|
251
230
|
|
|
252
231
|
return token;
|
|
253
232
|
}
|
|
@@ -255,17 +234,17 @@ static token_T* lexer_parse_erb_content(lexer_T* lexer) {
|
|
|
255
234
|
static token_T* lexer_parse_erb_close(lexer_T* lexer) {
|
|
256
235
|
lexer->state = STATE_DATA;
|
|
257
236
|
|
|
258
|
-
if (lexer_peek_erb_percent_close_tag(lexer, 0)) { return lexer_advance_with(lexer, "%%>", TOKEN_ERB_END); }
|
|
259
|
-
if (lexer_peek_erb_equals_close_tag(lexer, 0)) { return lexer_advance_with(lexer, "=%>", TOKEN_ERB_END); }
|
|
260
|
-
if (lexer_peek_erb_dash_close_tag(lexer, 0)) { return lexer_advance_with(lexer, "-%>", TOKEN_ERB_END); }
|
|
237
|
+
if (lexer_peek_erb_percent_close_tag(lexer, 0)) { return lexer_advance_with(lexer, hb_string("%%>"), TOKEN_ERB_END); }
|
|
238
|
+
if (lexer_peek_erb_equals_close_tag(lexer, 0)) { return lexer_advance_with(lexer, hb_string("=%>"), TOKEN_ERB_END); }
|
|
239
|
+
if (lexer_peek_erb_dash_close_tag(lexer, 0)) { return lexer_advance_with(lexer, hb_string("-%>"), TOKEN_ERB_END); }
|
|
261
240
|
|
|
262
|
-
return lexer_advance_with(lexer, "%>", TOKEN_ERB_END);
|
|
241
|
+
return lexer_advance_with(lexer, hb_string("%>"), TOKEN_ERB_END);
|
|
263
242
|
}
|
|
264
243
|
|
|
265
244
|
// ===== Tokenizing Function
|
|
266
245
|
|
|
267
246
|
token_T* lexer_next_token(lexer_T* lexer) {
|
|
268
|
-
if (lexer_eof(lexer)) { return token_init("", TOKEN_EOF, lexer); }
|
|
247
|
+
if (lexer_eof(lexer)) { return token_init(hb_string(""), TOKEN_EOF, lexer); }
|
|
269
248
|
if (lexer_stalled(lexer)) { return lexer_error(lexer, "Lexer stalled after 5 iterations"); }
|
|
270
249
|
|
|
271
250
|
if (lexer->state == STATE_ERB_CONTENT) { return lexer_parse_erb_content(lexer); }
|
|
@@ -302,33 +281,33 @@ token_T* lexer_next_token(lexer_T* lexer) {
|
|
|
302
281
|
if (isalnum(lexer_peek(lexer, 1))) { return lexer_advance_current(lexer, TOKEN_HTML_TAG_START); }
|
|
303
282
|
|
|
304
283
|
if (lexer_peek_for_html_comment_start(lexer, 0)) {
|
|
305
|
-
return lexer_advance_with(lexer, "<!--", TOKEN_HTML_COMMENT_START);
|
|
284
|
+
return lexer_advance_with(lexer, hb_string("<!--"), TOKEN_HTML_COMMENT_START);
|
|
306
285
|
}
|
|
307
286
|
|
|
308
287
|
if (lexer_peek_for_close_tag_start(lexer, 0)) {
|
|
309
|
-
return lexer_advance_with(lexer, "</", TOKEN_HTML_TAG_START_CLOSE);
|
|
288
|
+
return lexer_advance_with(lexer, hb_string("</"), TOKEN_HTML_TAG_START_CLOSE);
|
|
310
289
|
}
|
|
311
290
|
|
|
312
291
|
return lexer_advance_current(lexer, TOKEN_LT);
|
|
313
292
|
}
|
|
314
293
|
|
|
315
294
|
case '/': {
|
|
316
|
-
token_T* token = lexer_match_and_advance(lexer, "/>", TOKEN_HTML_TAG_SELF_CLOSE);
|
|
295
|
+
token_T* token = lexer_match_and_advance(lexer, hb_string("/>"), TOKEN_HTML_TAG_SELF_CLOSE);
|
|
317
296
|
return token ? token : lexer_advance_current(lexer, TOKEN_SLASH);
|
|
318
297
|
}
|
|
319
298
|
|
|
320
299
|
case '?': {
|
|
321
|
-
token_T* token = lexer_match_and_advance(lexer, "?>", TOKEN_XML_DECLARATION_END);
|
|
300
|
+
token_T* token = lexer_match_and_advance(lexer, hb_string("?>"), TOKEN_XML_DECLARATION_END);
|
|
322
301
|
return token ? token : lexer_advance_current(lexer, TOKEN_CHARACTER);
|
|
323
302
|
}
|
|
324
303
|
|
|
325
304
|
case '-': {
|
|
326
|
-
token_T* token = lexer_match_and_advance(lexer, "-->", TOKEN_HTML_COMMENT_END);
|
|
305
|
+
token_T* token = lexer_match_and_advance(lexer, hb_string("-->"), TOKEN_HTML_COMMENT_END);
|
|
327
306
|
return token ? token : lexer_advance_current(lexer, TOKEN_DASH);
|
|
328
307
|
}
|
|
329
308
|
|
|
330
309
|
case ']': {
|
|
331
|
-
token_T* token = lexer_match_and_advance(lexer, "]]>", TOKEN_CDATA_END);
|
|
310
|
+
token_T* token = lexer_match_and_advance(lexer, hb_string("]]>"), TOKEN_CDATA_END);
|
|
332
311
|
return token ? token : lexer_advance_current(lexer, TOKEN_CHARACTER);
|
|
333
312
|
}
|
|
334
313
|
|
|
@@ -354,9 +333,3 @@ token_T* lexer_next_token(lexer_T* lexer) {
|
|
|
354
333
|
}
|
|
355
334
|
}
|
|
356
335
|
}
|
|
357
|
-
|
|
358
|
-
void lexer_free(lexer_T* lexer) {
|
|
359
|
-
if (lexer == NULL) { return; }
|
|
360
|
-
|
|
361
|
-
free(lexer);
|
|
362
|
-
}
|
data/src/lexer_peek_helpers.c
CHANGED
|
@@ -3,73 +3,71 @@
|
|
|
3
3
|
#include "include/lexer_struct.h"
|
|
4
4
|
#include "include/macros.h"
|
|
5
5
|
#include "include/token.h"
|
|
6
|
+
#include "include/util/hb_string.h"
|
|
6
7
|
|
|
7
8
|
#include <ctype.h>
|
|
8
9
|
#include <stdbool.h>
|
|
9
10
|
|
|
10
|
-
char lexer_backtrack(const lexer_T* lexer,
|
|
11
|
-
return lexer->source[MAX(lexer->current_position - offset, 0)];
|
|
11
|
+
char lexer_backtrack(const lexer_T* lexer, uint32_t offset) {
|
|
12
|
+
return lexer->source.data[MAX(lexer->current_position - offset, 0)];
|
|
12
13
|
}
|
|
13
14
|
|
|
14
|
-
char lexer_peek(const lexer_T* lexer,
|
|
15
|
-
return lexer->source[MIN(lexer->current_position + offset, lexer->source_length)];
|
|
15
|
+
char lexer_peek(const lexer_T* lexer, uint32_t offset) {
|
|
16
|
+
return lexer->source.data[MIN(lexer->current_position + offset, lexer->source.length)];
|
|
16
17
|
}
|
|
17
18
|
|
|
18
|
-
bool lexer_peek_for(const lexer_T* lexer,
|
|
19
|
-
|
|
20
|
-
|
|
19
|
+
bool lexer_peek_for(const lexer_T* lexer, uint32_t offset, hb_string_T pattern, const bool case_insensitive) {
|
|
20
|
+
hb_string_T remaining_source = hb_string_slice(lexer->source, lexer->current_position + offset);
|
|
21
|
+
remaining_source.length = MIN(pattern.length, remaining_source.length);
|
|
21
22
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
}
|
|
23
|
+
if (case_insensitive) {
|
|
24
|
+
return hb_string_equals_case_insensitive(remaining_source, pattern);
|
|
25
|
+
} else {
|
|
26
|
+
return hb_string_equals(remaining_source, pattern);
|
|
27
27
|
}
|
|
28
|
-
|
|
29
|
-
return true;
|
|
30
28
|
}
|
|
31
29
|
|
|
32
|
-
bool lexer_peek_for_doctype(const lexer_T* lexer,
|
|
33
|
-
return lexer_peek_for(lexer, offset, "<!DOCTYPE", true);
|
|
30
|
+
bool lexer_peek_for_doctype(const lexer_T* lexer, uint32_t offset) {
|
|
31
|
+
return lexer_peek_for(lexer, offset, hb_string("<!DOCTYPE"), true);
|
|
34
32
|
}
|
|
35
33
|
|
|
36
|
-
bool lexer_peek_for_xml_declaration(const lexer_T* lexer,
|
|
37
|
-
return lexer_peek_for(lexer, offset, "<?xml", true);
|
|
34
|
+
bool lexer_peek_for_xml_declaration(const lexer_T* lexer, uint32_t offset) {
|
|
35
|
+
return lexer_peek_for(lexer, offset, hb_string("<?xml"), true);
|
|
38
36
|
}
|
|
39
37
|
|
|
40
|
-
bool lexer_peek_for_cdata_start(const lexer_T* lexer,
|
|
41
|
-
return lexer_peek_for(lexer, offset, "<![CDATA[", false);
|
|
38
|
+
bool lexer_peek_for_cdata_start(const lexer_T* lexer, uint32_t offset) {
|
|
39
|
+
return lexer_peek_for(lexer, offset, hb_string("<![CDATA["), false);
|
|
42
40
|
}
|
|
43
41
|
|
|
44
|
-
bool lexer_peek_for_cdata_end(const lexer_T* lexer,
|
|
45
|
-
return lexer_peek_for(lexer, offset, "]]>", false);
|
|
42
|
+
bool lexer_peek_for_cdata_end(const lexer_T* lexer, uint32_t offset) {
|
|
43
|
+
return lexer_peek_for(lexer, offset, hb_string("]]>"), false);
|
|
46
44
|
}
|
|
47
45
|
|
|
48
|
-
bool lexer_peek_for_html_comment_start(const lexer_T* lexer,
|
|
49
|
-
return lexer_peek_for(lexer, offset, "<!--", false);
|
|
46
|
+
bool lexer_peek_for_html_comment_start(const lexer_T* lexer, uint32_t offset) {
|
|
47
|
+
return lexer_peek_for(lexer, offset, hb_string("<!--"), false);
|
|
50
48
|
}
|
|
51
49
|
|
|
52
|
-
bool lexer_peek_for_html_comment_end(const lexer_T* lexer,
|
|
53
|
-
return lexer_peek_for(lexer, offset, "-->", false);
|
|
50
|
+
bool lexer_peek_for_html_comment_end(const lexer_T* lexer, uint32_t offset) {
|
|
51
|
+
return lexer_peek_for(lexer, offset, hb_string("-->"), false);
|
|
54
52
|
}
|
|
55
53
|
|
|
56
|
-
bool lexer_peek_erb_close_tag(const lexer_T* lexer,
|
|
57
|
-
return lexer_peek_for(lexer, offset, "%>", false);
|
|
54
|
+
bool lexer_peek_erb_close_tag(const lexer_T* lexer, uint32_t offset) {
|
|
55
|
+
return lexer_peek_for(lexer, offset, hb_string("%>"), false);
|
|
58
56
|
}
|
|
59
57
|
|
|
60
|
-
bool lexer_peek_erb_dash_close_tag(const lexer_T* lexer,
|
|
61
|
-
return lexer_peek_for(lexer, offset, "-%>", false);
|
|
58
|
+
bool lexer_peek_erb_dash_close_tag(const lexer_T* lexer, uint32_t offset) {
|
|
59
|
+
return lexer_peek_for(lexer, offset, hb_string("-%>"), false);
|
|
62
60
|
}
|
|
63
61
|
|
|
64
|
-
bool lexer_peek_erb_percent_close_tag(const lexer_T* lexer,
|
|
65
|
-
return lexer_peek_for(lexer, offset, "%%>", false);
|
|
62
|
+
bool lexer_peek_erb_percent_close_tag(const lexer_T* lexer, uint32_t offset) {
|
|
63
|
+
return lexer_peek_for(lexer, offset, hb_string("%%>"), false);
|
|
66
64
|
}
|
|
67
65
|
|
|
68
|
-
bool lexer_peek_erb_equals_close_tag(const lexer_T* lexer,
|
|
69
|
-
return lexer_peek_for(lexer, offset, "=%>", false);
|
|
66
|
+
bool lexer_peek_erb_equals_close_tag(const lexer_T* lexer, uint32_t offset) {
|
|
67
|
+
return lexer_peek_for(lexer, offset, hb_string("=%>"), false);
|
|
70
68
|
}
|
|
71
69
|
|
|
72
|
-
bool lexer_peek_erb_end(const lexer_T* lexer,
|
|
70
|
+
bool lexer_peek_erb_end(const lexer_T* lexer, uint32_t offset) {
|
|
73
71
|
return (
|
|
74
72
|
lexer_peek_erb_close_tag(lexer, offset) || lexer_peek_erb_dash_close_tag(lexer, offset)
|
|
75
73
|
|| lexer_peek_erb_percent_close_tag(lexer, offset) || lexer_peek_erb_equals_close_tag(lexer, offset)
|
|
@@ -103,10 +101,10 @@ bool lexer_peek_for_token_type_after_whitespace(lexer_T* lexer, token_type_T tok
|
|
|
103
101
|
return result;
|
|
104
102
|
}
|
|
105
103
|
|
|
106
|
-
bool lexer_peek_for_close_tag_start(const lexer_T* lexer,
|
|
104
|
+
bool lexer_peek_for_close_tag_start(const lexer_T* lexer, uint32_t offset) {
|
|
107
105
|
if (lexer_peek(lexer, offset) != '<' || lexer_peek(lexer, offset + 1) != '/') { return false; }
|
|
108
106
|
|
|
109
|
-
|
|
107
|
+
uint32_t pos = offset + 2;
|
|
110
108
|
|
|
111
109
|
while (lexer_peek(lexer, pos) == ' ' || lexer_peek(lexer, pos) == '\t' || lexer_peek(lexer, pos) == '\n'
|
|
112
110
|
|| lexer_peek(lexer, pos) == '\r') {
|
data/src/main.c
CHANGED
|
@@ -4,11 +4,11 @@
|
|
|
4
4
|
#include "include/ast_node.h"
|
|
5
5
|
#include "include/ast_nodes.h"
|
|
6
6
|
#include "include/ast_pretty_print.h"
|
|
7
|
-
#include "include/buffer.h"
|
|
8
7
|
#include "include/extract.h"
|
|
9
8
|
#include "include/herb.h"
|
|
10
9
|
#include "include/io.h"
|
|
11
10
|
#include "include/ruby_parser.h"
|
|
11
|
+
#include "include/util/hb_buffer.h"
|
|
12
12
|
|
|
13
13
|
#include <stdio.h>
|
|
14
14
|
#include <string.h>
|
|
@@ -39,7 +39,6 @@ int main(const int argc, char* argv[]) {
|
|
|
39
39
|
printf("Herb 🌿 Powerful and seamless HTML-aware ERB parsing and tooling.\n\n");
|
|
40
40
|
|
|
41
41
|
printf("./herb lex [file] - Lex a file\n");
|
|
42
|
-
printf("./herb lex_json [file] - Lex a file and return the result as json.\n");
|
|
43
42
|
printf("./herb parse [file] - Parse a file\n");
|
|
44
43
|
printf("./herb ruby [file] - Extract Ruby from a file\n");
|
|
45
44
|
printf("./herb html [file] - Extract HTML from a file\n");
|
|
@@ -53,9 +52,9 @@ int main(const int argc, char* argv[]) {
|
|
|
53
52
|
return 1;
|
|
54
53
|
}
|
|
55
54
|
|
|
56
|
-
|
|
55
|
+
hb_buffer_T output;
|
|
57
56
|
|
|
58
|
-
if (!
|
|
57
|
+
if (!hb_buffer_init(&output, 4096)) { return 1; }
|
|
59
58
|
|
|
60
59
|
char* source = herb_read_file(argv[2]);
|
|
61
60
|
|
|
@@ -74,7 +73,7 @@ int main(const int argc, char* argv[]) {
|
|
|
74
73
|
print_time_diff(start, end, "visiting");
|
|
75
74
|
|
|
76
75
|
ast_node_free((AST_NODE_T*) root);
|
|
77
|
-
|
|
76
|
+
free(output.value);
|
|
78
77
|
free(source);
|
|
79
78
|
|
|
80
79
|
return 0;
|
|
@@ -87,18 +86,7 @@ int main(const int argc, char* argv[]) {
|
|
|
87
86
|
printf("%s\n", output.value);
|
|
88
87
|
print_time_diff(start, end, "lexing");
|
|
89
88
|
|
|
90
|
-
|
|
91
|
-
free(source);
|
|
92
|
-
|
|
93
|
-
return 0;
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
if (strcmp(argv[1], "lex_json") == 0) {
|
|
97
|
-
herb_lex_json_to_buffer(source, &output);
|
|
98
|
-
|
|
99
|
-
printf("%s\n", output.value);
|
|
100
|
-
|
|
101
|
-
buffer_free(&output);
|
|
89
|
+
free(output.value);
|
|
102
90
|
free(source);
|
|
103
91
|
|
|
104
92
|
return 0;
|
|
@@ -106,15 +94,23 @@ int main(const int argc, char* argv[]) {
|
|
|
106
94
|
|
|
107
95
|
if (strcmp(argv[1], "parse") == 0) {
|
|
108
96
|
AST_DOCUMENT_NODE_T* root = herb_parse(source, NULL);
|
|
97
|
+
|
|
98
|
+
herb_analyze_parse_tree(root, source);
|
|
99
|
+
|
|
109
100
|
clock_gettime(CLOCK_MONOTONIC, &end);
|
|
110
101
|
|
|
111
|
-
|
|
112
|
-
|
|
102
|
+
int silent = 0;
|
|
103
|
+
if (argc > 3 && strcmp(argv[3], "--silent") == 0) { silent = 1; }
|
|
104
|
+
|
|
105
|
+
if (!silent) {
|
|
106
|
+
ast_pretty_print_node((AST_NODE_T*) root, 0, 0, &output);
|
|
107
|
+
printf("%s\n", output.value);
|
|
113
108
|
|
|
114
|
-
|
|
109
|
+
print_time_diff(start, end, "parsing");
|
|
110
|
+
}
|
|
115
111
|
|
|
116
112
|
ast_node_free((AST_NODE_T*) root);
|
|
117
|
-
|
|
113
|
+
free(output.value);
|
|
118
114
|
free(source);
|
|
119
115
|
|
|
120
116
|
return 0;
|
|
@@ -127,7 +123,7 @@ int main(const int argc, char* argv[]) {
|
|
|
127
123
|
printf("%s\n", output.value);
|
|
128
124
|
print_time_diff(start, end, "extracting Ruby");
|
|
129
125
|
|
|
130
|
-
|
|
126
|
+
free(output.value);
|
|
131
127
|
free(source);
|
|
132
128
|
|
|
133
129
|
return 0;
|
|
@@ -140,7 +136,7 @@ int main(const int argc, char* argv[]) {
|
|
|
140
136
|
printf("%s\n", output.value);
|
|
141
137
|
print_time_diff(start, end, "extracting HTML");
|
|
142
138
|
|
|
143
|
-
|
|
139
|
+
free(output.value);
|
|
144
140
|
free(source);
|
|
145
141
|
|
|
146
142
|
return 0;
|