@ast-grep/lang-html 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +24 -0
- package/index.d.ts +10 -0
- package/index.js +9 -0
- package/package.json +46 -0
- package/postinstall.js +4 -0
- package/prebuilds/prebuild-Linux-X64/parser.so +0 -0
- package/prebuilds/prebuild-Windows-X64/parser.so +0 -0
- package/prebuilds/prebuild-macOS-ARM64/parser.so +0 -0
- package/src/grammar.json +501 -0
- package/src/node-types.json +318 -0
- package/src/parser.c +2300 -0
- package/src/scanner.c +362 -0
- package/src/tag.h +385 -0
- package/src/tree_sitter/alloc.h +54 -0
- package/src/tree_sitter/array.h +290 -0
- package/src/tree_sitter/parser.h +266 -0
- package/type.d.ts +284 -0
package/src/scanner.c
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
#include "tag.h"
|
|
2
|
+
#include "tree_sitter/parser.h"
|
|
3
|
+
|
|
4
|
+
#include <wctype.h>
|
|
5
|
+
|
|
6
|
+
/*
 * External token kinds this scanner reports through lexer->result_symbol and
 * uses as indices into valid_symbols.
 *
 * NOTE(review): per tree-sitter's external-scanner convention the order here
 * is expected to mirror the grammar's `externals` list -- confirm against
 * grammar.json before reordering.
 */
enum TokenType {
    START_TAG_NAME = 0,         /* name of an ordinary opening tag */
    SCRIPT_START_TAG_NAME,      /* opening tag whose type is SCRIPT */
    STYLE_START_TAG_NAME,       /* opening tag whose type is STYLE */
    END_TAG_NAME,               /* closing tag matching the top of the tag stack */
    ERRONEOUS_END_TAG_NAME,     /* closing tag that does not match the top of the stack */
    SELF_CLOSING_TAG_DELIMITER, /* the "/>" sequence */
    IMPLICIT_END_TAG,           /* zero-width token emitted when a tag is popped implicitly */
    RAW_TEXT,                   /* contents scanned up to "</SCRIPT" or "</STYLE" */
    COMMENT,                    /* "<!-- ... -->" */
};
|
|
17
|
+
|
|
18
|
+
typedef struct {
|
|
19
|
+
Array(Tag) tags;
|
|
20
|
+
} Scanner;
|
|
21
|
+
|
|
22
|
+
/* Larger of two values. Function-like macro: the winning argument is
 * evaluated twice, so do not pass expressions with side effects. */
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
|
|
23
|
+
|
|
24
|
+
/* Consume the current lookahead character (calls the lexer's advance
 * callback with its second argument set to false). */
static inline void advance(TSLexer *lexer) {
    lexer->advance(lexer, false);
}
|
|
25
|
+
|
|
26
|
+
/* Step past the current lookahead character via the lexer's advance callback
 * with its second argument set to true; scan() uses this for leading
 * whitespace. */
static inline void skip(TSLexer *lexer) {
    lexer->advance(lexer, true);
}
|
|
27
|
+
|
|
28
|
+
/*
 * Write the scanner's tag stack into `buffer` and return the number of bytes
 * used.
 *
 * Layout:
 *   bytes [0,2)  uint16 count of tags actually written (filled in last)
 *   bytes [2,4)  uint16 total tag count (clamped to UINT16_MAX)
 *   then one record per written tag:
 *     - 1 byte tag type
 *     - for CUSTOM tags only: 1 byte name length (clamped to UINT8_MAX)
 *       followed by that many name bytes
 *
 * Writing stops early once the next record would not fit below
 * TREE_SITTER_SERIALIZATION_BUFFER_SIZE; the two counts then differ.
 */
static unsigned serialize(Scanner *scanner, char *buffer) {
    uint16_t tag_count = scanner->tags.size > UINT16_MAX ? UINT16_MAX : scanner->tags.size;
    uint16_t serialized_tag_count = 0;

    // Leave room for serialized_tag_count at offset 0; it is only known
    // after the loop below has run.
    unsigned size = sizeof(tag_count);
    memcpy(&buffer[size], &tag_count, sizeof(tag_count));
    size += sizeof(tag_count);

    for (; serialized_tag_count < tag_count; serialized_tag_count++) {
        Tag tag = scanner->tags.contents[serialized_tag_count];
        if (tag.type == CUSTOM) {
            unsigned name_length = tag.custom_tag_name.size;
            if (name_length > UINT8_MAX) {
                name_length = UINT8_MAX;
            }
            if (size + 2 + name_length >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
                break;
            }
            buffer[size++] = (char)tag.type;
            buffer[size++] = (char)name_length;
            // The name is a length-counted byte array, not a C string, so it
            // must be copied with memcpy: strncpy would stop at an embedded
            // zero byte and pad the remainder, corrupting the stored name.
            if (name_length > 0) {
                memcpy(&buffer[size], tag.custom_tag_name.contents, name_length);
            }
            size += name_length;
        } else {
            if (size + 1 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
                break;
            }
            buffer[size++] = (char)tag.type;
        }
    }

    memcpy(&buffer[0], &serialized_tag_count, sizeof(serialized_tag_count));
    return size;
}
|
|
61
|
+
|
|
62
|
+
/*
 * Rebuild the scanner's tag stack from a buffer produced by serialize().
 *
 * Any existing tags are freed first. The buffer starts with two uint16
 * values (number of tag records present, total number of tags), followed by
 * the records: a type byte and, for CUSTOM tags, a length byte plus the name
 * bytes. If fewer records are present than the total, the stack is padded
 * with tag_new() placeholders so its depth still matches.
 *
 * Every read is checked against `length`; a buffer that is too short for its
 * header leaves the stack empty, and a record that runs past the end stops
 * record parsing (the remainder is padded like any other missing record).
 */
static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
    for (unsigned i = 0; i < scanner->tags.size; i++) {
        tag_free(&scanner->tags.contents[i]);
    }
    array_clear(&scanner->tags);

    uint16_t tag_count = 0;
    uint16_t serialized_tag_count = 0;
    const unsigned header_size = sizeof(serialized_tag_count) + sizeof(tag_count);
    if (length < header_size) {
        return;
    }

    unsigned size = 0;
    memcpy(&serialized_tag_count, &buffer[size], sizeof(serialized_tag_count));
    size += sizeof(serialized_tag_count);

    memcpy(&tag_count, &buffer[size], sizeof(tag_count));
    size += sizeof(tag_count);

    array_reserve(&scanner->tags, tag_count);
    if (tag_count == 0) {
        return;
    }

    unsigned iter = 0;
    for (; iter < serialized_tag_count; iter++) {
        if (size >= length) {
            break;
        }

        // Read bytes as unsigned so a type value above 127 survives the
        // round trip on platforms where plain char is signed.
        const unsigned char type_byte = (unsigned char)buffer[size];
        unsigned name_offset = size + 1;
        unsigned name_length = 0;
        if ((TagType)type_byte == CUSTOM) {
            if (name_offset >= length) {
                break;
            }
            name_length = (unsigned char)buffer[name_offset++];
            if (name_length > length - name_offset) {
                break;
            }
        }

        Tag tag = tag_new();
        tag.type = (TagType)type_byte;
        if (tag.type == CUSTOM) {
            array_reserve(&tag.custom_tag_name, name_length);
            tag.custom_tag_name.size = name_length;
            // Skip the copy for an empty name: the destination may be NULL.
            if (name_length > 0) {
                memcpy(tag.custom_tag_name.contents, &buffer[name_offset], name_length);
            }
        }
        size = name_offset + name_length;
        array_push(&scanner->tags, tag);
    }

    // Add empty tags if we didn't read enough: the buffer had no more room
    // but the scanner held more tags.
    for (; iter < tag_count; iter++) {
        array_push(&scanner->tags, tag_new());
    }
}
|
|
102
|
+
|
|
103
|
+
/*
 * Consume a run of tag-name characters (alphanumerics, '-' and ':') and
 * return them upper-cased. The returned String may be empty; ownership
 * passes to the caller.
 */
static String scan_tag_name(TSLexer *lexer) {
    String name = array_new();
    for (;;) {
        const bool is_name_char = iswalnum(lexer->lookahead) ||
                                  lexer->lookahead == '-' ||
                                  lexer->lookahead == ':';
        if (!is_name_char) {
            break;
        }
        array_push(&name, towupper(lexer->lookahead));
        advance(lexer);
    }
    return name;
}
|
|
111
|
+
|
|
112
|
+
/*
 * Scan the remainder of a comment. scan() calls this after consuming "<!",
 * so the next two characters must be "--". The comment ends at the first '>'
 * that is immediately preceded by at least two '-' characters.
 *
 * Returns true (with result_symbol = COMMENT and the token end marked after
 * the '>') on success, false if the opening dashes are missing or the input
 * ends before a terminator is found.
 */
static bool scan_comment(TSLexer *lexer) {
    // The two dashes that open the comment body.
    for (int expected = 2; expected > 0; expected--) {
        if (lexer->lookahead != '-') {
            return false;
        }
        advance(lexer);
    }

    // Number of consecutive '-' seen immediately before the current character.
    unsigned trailing_dashes = 0;
    for (; lexer->lookahead; advance(lexer)) {
        if (lexer->lookahead == '-') {
            trailing_dashes++;
            continue;
        }
        if (lexer->lookahead == '>' && trailing_dashes >= 2) {
            lexer->result_symbol = COMMENT;
            advance(lexer);
            lexer->mark_end(lexer);
            return true;
        }
        // Any other character (including a '>' without "--") breaks the run.
        trailing_dashes = 0;
    }
    return false;
}
|
|
142
|
+
|
|
143
|
+
/*
 * Scan raw text up to (but not including) the closing delimiter of the
 * element on top of the tag stack: "</SCRIPT" when that tag is SCRIPT,
 * "</STYLE" otherwise. Matching is case-insensitive.
 *
 * The token end is only moved past characters known not to belong to the
 * delimiter, so the delimiter itself is left for the parser. Returns false
 * when the tag stack is empty, true otherwise (with result_symbol =
 * RAW_TEXT).
 */
static bool scan_raw_text(Scanner *scanner, TSLexer *lexer) {
    if (scanner->tags.size == 0) {
        return false;
    }

    lexer->mark_end(lexer);

    const char *end_delimiter = array_back(&scanner->tags)->type == SCRIPT ? "</SCRIPT" : "</STYLE";
    const unsigned delimiter_length = (unsigned)strlen(end_delimiter);

    unsigned delimiter_index = 0;
    while (lexer->lookahead) {
        if (towupper(lexer->lookahead) == end_delimiter[delimiter_index]) {
            delimiter_index++;
            if (delimiter_index == delimiter_length) {
                break;
            }
            advance(lexer);
        } else if (delimiter_index > 0) {
            // A partial match just failed. The characters consumed so far are
            // ordinary text, so include them in the token, then re-test the
            // current character as a possible delimiter start instead of
            // skipping it (otherwise "<</script" would never terminate).
            lexer->mark_end(lexer);
            delimiter_index = 0;
        } else {
            advance(lexer);
            lexer->mark_end(lexer);
        }
    }

    lexer->result_symbol = RAW_TEXT;
    return true;
}
|
|
170
|
+
|
|
171
|
+
/* Remove the top tag from the scanner's stack and release it with
 * tag_free(). Callers in this file check that the stack is non-empty. */
static void pop_tag(Scanner *scanner) {
    Tag top = array_pop(&scanner->tags);
    tag_free(&top);
}
|
|
175
|
+
|
|
176
|
+
/*
 * Decide whether the element on top of the tag stack must be closed
 * implicitly before the upcoming tag (or end of input) is handled.
 *
 * scan() calls this with the lexer positioned either just after '<' or at a
 * zero lookahead. When an implicit end is required, the top tag is popped,
 * result_symbol is set to IMPLICIT_END_TAG and true is returned; otherwise
 * the stack is left untouched and false is returned.
 */
static bool scan_implicit_end_tag(Scanner *scanner, TSLexer *lexer) {
    Tag *parent = NULL;
    if (scanner->tags.size != 0) {
        parent = array_back(&scanner->tags);
    }

    const bool closing = lexer->lookahead == '/';
    if (closing) {
        advance(lexer);
    } else if (parent != NULL && tag_is_void(parent)) {
        // A parent for which tag_is_void() holds is ended before anything
        // other than a closing tag.
        pop_tag(scanner);
        lexer->result_symbol = IMPLICIT_END_TAG;
        return true;
    }

    String name = scan_tag_name(lexer);
    if (name.size == 0 && !lexer->eof(lexer)) {
        array_delete(&name);
        return false;
    }

    Tag upcoming = tag_for_name(name);
    bool emit_implicit_end = false;

    if (closing) {
        // If the tag correctly closes the topmost element there is nothing
        // implicit to do. Otherwise, dig deeper and queue implicit end tags
        // (to be nice in the case of malformed HTML).
        const bool closes_top =
            scanner->tags.size > 0 && tag_eq(array_back(&scanner->tags), &upcoming);
        if (!closes_top) {
            for (unsigned depth = scanner->tags.size; depth > 0 && !emit_implicit_end; depth--) {
                emit_implicit_end = scanner->tags.contents[depth - 1].type == upcoming.type;
            }
        }
    } else if (parent != NULL) {
        const bool parent_is_document_level =
            parent->type == HTML || parent->type == HEAD || parent->type == BODY;
        emit_implicit_end = !tag_can_contain(parent, &upcoming) ||
                            (parent_is_document_level && lexer->eof(lexer));
    }

    if (emit_implicit_end) {
        pop_tag(scanner);
        lexer->result_symbol = IMPLICIT_END_TAG;
    }
    tag_free(&upcoming);
    return emit_implicit_end;
}
|
|
232
|
+
|
|
233
|
+
/*
 * Scan the name of an opening tag, push the corresponding Tag onto the
 * stack, and report which kind of start-tag name was seen
 * (SCRIPT_START_TAG_NAME, STYLE_START_TAG_NAME or START_TAG_NAME).
 * Returns false, without touching the stack, if no name characters follow.
 */
static bool scan_start_tag_name(Scanner *scanner, TSLexer *lexer) {
    String name = scan_tag_name(lexer);
    if (name.size == 0) {
        array_delete(&name);
        return false;
    }

    Tag opened = tag_for_name(name);
    array_push(&scanner->tags, opened);

    if (opened.type == SCRIPT) {
        lexer->result_symbol = SCRIPT_START_TAG_NAME;
    } else if (opened.type == STYLE) {
        lexer->result_symbol = STYLE_START_TAG_NAME;
    } else {
        lexer->result_symbol = START_TAG_NAME;
    }
    return true;
}
|
|
255
|
+
|
|
256
|
+
/*
 * Scan the name of a closing tag. If it equals the tag on top of the stack,
 * that tag is popped and END_TAG_NAME is reported; otherwise the stack is
 * left alone and ERRONEOUS_END_TAG_NAME is reported. Returns false only when
 * no name characters follow.
 */
static bool scan_end_tag_name(Scanner *scanner, TSLexer *lexer) {
    String name = scan_tag_name(lexer);
    if (name.size == 0) {
        array_delete(&name);
        return false;
    }

    Tag closing = tag_for_name(name);
    const bool matches_top =
        scanner->tags.size > 0 && tag_eq(array_back(&scanner->tags), &closing);
    if (matches_top) {
        pop_tag(scanner);
    }
    lexer->result_symbol = matches_top ? END_TAG_NAME : ERRONEOUS_END_TAG_NAME;

    tag_free(&closing);
    return true;
}
|
|
275
|
+
|
|
276
|
+
/*
 * Scan the "/>" that ends a self-closing tag. scan() calls this with the
 * lookahead at '/'. On success the token is reported as
 * SELF_CLOSING_TAG_DELIMITER and, if the stack is non-empty, the tag that
 * was just opened is popped.
 *
 * result_symbol is set on every successful return, including when the stack
 * is empty, so a true result never leaves the symbol unassigned.
 */
static bool scan_self_closing_tag_delimiter(Scanner *scanner, TSLexer *lexer) {
    advance(lexer);
    if (lexer->lookahead != '>') {
        return false;
    }
    advance(lexer);

    lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
    if (scanner->tags.size > 0) {
        pop_tag(scanner);
    }
    return true;
}
|
|
288
|
+
|
|
289
|
+
/*
 * Main dispatch for the external scanner: choose a sub-scanner from the set
 * of currently valid symbols and the next character.
 *
 *  - Raw text is scanned when RAW_TEXT is valid and neither tag-name symbol is.
 *  - Otherwise leading whitespace is skipped, then:
 *      '<'  : "<!" starts a comment; else an implicit end tag may be emitted
 *      '\0' : an implicit end tag may be emitted
 *      '/'  : a self-closing delimiter may be scanned
 *      else : a start or end tag name, when one is valid and RAW_TEXT is not
 *
 * Returns true when a token was recognised.
 */
static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
    const bool tag_name_valid = valid_symbols[START_TAG_NAME] || valid_symbols[END_TAG_NAME];

    if (valid_symbols[RAW_TEXT] && !tag_name_valid) {
        return scan_raw_text(scanner, lexer);
    }

    while (iswspace(lexer->lookahead)) {
        skip(lexer);
    }

    if (lexer->lookahead == '<') {
        // Mark the end before consuming '<' so an implicit end tag is zero-width.
        lexer->mark_end(lexer);
        advance(lexer);

        if (lexer->lookahead == '!') {
            advance(lexer);
            return scan_comment(lexer);
        }
        if (valid_symbols[IMPLICIT_END_TAG]) {
            return scan_implicit_end_tag(scanner, lexer);
        }
        return false;
    }

    if (lexer->lookahead == '\0') {
        if (valid_symbols[IMPLICIT_END_TAG]) {
            return scan_implicit_end_tag(scanner, lexer);
        }
        return false;
    }

    if (lexer->lookahead == '/') {
        if (valid_symbols[SELF_CLOSING_TAG_DELIMITER]) {
            return scan_self_closing_tag_delimiter(scanner, lexer);
        }
        return false;
    }

    if (!tag_name_valid || valid_symbols[RAW_TEXT]) {
        return false;
    }
    if (valid_symbols[START_TAG_NAME]) {
        return scan_start_tag_name(scanner, lexer);
    }
    return scan_end_tag_name(scanner, lexer);
}
|
|
334
|
+
|
|
335
|
+
void *tree_sitter_html_external_scanner_create() {
|
|
336
|
+
Scanner *scanner = (Scanner *)ts_calloc(1, sizeof(Scanner));
|
|
337
|
+
return scanner;
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
/* Entry point: forward to scan() with the payload viewed as a Scanner. */
bool tree_sitter_html_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
    return scan((Scanner *)payload, lexer, valid_symbols);
}
|
|
344
|
+
|
|
345
|
+
unsigned tree_sitter_html_external_scanner_serialize(void *payload, char *buffer) {
|
|
346
|
+
Scanner *scanner = (Scanner *)payload;
|
|
347
|
+
return serialize(scanner, buffer);
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
void tree_sitter_html_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
|
|
351
|
+
Scanner *scanner = (Scanner *)payload;
|
|
352
|
+
deserialize(scanner, buffer, length);
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
void tree_sitter_html_external_scanner_destroy(void *payload) {
|
|
356
|
+
Scanner *scanner = (Scanner *)payload;
|
|
357
|
+
for (unsigned i = 0; i < scanner->tags.size; i++) {
|
|
358
|
+
tag_free(&scanner->tags.contents[i]);
|
|
359
|
+
}
|
|
360
|
+
array_delete(&scanner->tags);
|
|
361
|
+
ts_free(scanner);
|
|
362
|
+
}
|