ruxml 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,490 @@
1
#include "parser.hpp"

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
6
+
7
+ void parser_init(Parser *parser) {
8
+ parser->line = 1;
9
+ parser->col = 1;
10
+
11
+ for (int i = 0; i < 128; i++) parser->tag_initial_map[i] = LA_INVALID;
12
+ for (int i = 128; i < 256; i++) parser->tag_initial_map[i] = LA_IDENTIFIER; // UTF-8 characters are allowed
13
+ parser->tag_initial_map[' '] = LA_WHITESPACE;
14
+ parser->tag_initial_map['\r'] = LA_WHITESPACE;
15
+ parser->tag_initial_map['\t'] = LA_WHITESPACE;
16
+ parser->tag_initial_map['\n'] = LA_NEWLINE;
17
+ parser->tag_initial_map['\''] = LA_VALUE;
18
+ parser->tag_initial_map['"'] = LA_VALUE;
19
+ parser->tag_initial_map['<'] = LA_ONE_CHAR;
20
+ parser->tag_initial_map['>'] = LA_ONE_CHAR;
21
+ parser->tag_initial_map['='] = LA_ONE_CHAR;
22
+ parser->tag_initial_map[':'] = LA_ONE_CHAR;
23
+ parser->tag_initial_map['/'] = LA_TWO_CHAR_END;
24
+ parser->tag_initial_map['?'] = LA_TWO_CHAR_END;
25
+ for (int c = 'a'; c <= 'z'; c++) parser->tag_initial_map[c] = LA_IDENTIFIER;
26
+ for (int c = 'A'; c <= 'Z'; c++) parser->tag_initial_map[c] = LA_IDENTIFIER;
27
+ parser->tag_initial_map['_'] = LA_IDENTIFIER;
28
+
29
+ for (int i = 0; i < 128; i++) parser->identifier_map[i] = 0;
30
+ for (int i = 128; i < 256; i++) parser->identifier_map[i] = 1; // UTF-8 characters are allowed
31
+ for (int c = 'a'; c <= 'z'; c++) parser->identifier_map[c] = 1;
32
+ for (int c = 'A'; c <= 'Z'; c++) parser->identifier_map[c] = 1;
33
+ for (int c = '0'; c <= '9'; c++) parser->identifier_map[c] = 1;
34
+ parser->identifier_map['-'] = 1;
35
+ parser->identifier_map['_'] = 1;
36
+ parser->identifier_map['.'] = 1;
37
+ }
38
+
39
+ bool parser_open_memory(Parser *parser, String name, const char *memory, int64_t offset, int64_t length) {
40
+ parser->source_type = PST_MEMORY;
41
+ parser->source = name;
42
+ parser->buffer = (char *) memory + offset;
43
+ parser->length = length;
44
+ parser->ptr = parser->buffer;
45
+ parser->end_ptr = parser->buffer + parser->length;
46
+ return true;
47
+ }
48
+
49
+ bool parser_open_file_mmap(Parser *parser, String filename, int64_t offset, int64_t length) {
50
+ parser->source_type = PST_MMAP;
51
+ parser->source = filename;
52
+
53
+ char *cpath = str_to_zstr(filename);
54
+ int fd = open(cpath, O_RDWR);
55
+ raw_free(cpath);
56
+
57
+ if (fd < 0) {
58
+ fprintf(stderr, "\nCould not open file: %.*s\n", str_prt(filename));
59
+ return false;
60
+ }
61
+
62
+ if (length == 0) {
63
+ struct stat stat_result;
64
+ int err = fstat(fd, &stat_result);
65
+ if (err < 0) {
66
+ fprintf(stderr, "\nCould not open file: %.*s\n", str_prt(filename));
67
+ return false;
68
+ }
69
+ parser->length = stat_result.st_size - offset;
70
+ } else {
71
+ parser->length = length;
72
+ }
73
+
74
+ parser->buffer = (char *) mmap(nullptr, parser->length, PROT_READ, MAP_SHARED, fd, offset);
75
+ if (parser->buffer == MAP_FAILED) {
76
+ fprintf(stderr, "Could not memory map file: %.*s\n", str_prt(filename));
77
+ return false;
78
+ }
79
+
80
+ parser->ptr = parser->buffer;
81
+ parser->end_ptr = parser->buffer + parser->length;
82
+
83
+ return true;
84
+ }
85
+
86
+ void parser_destroy(Parser *parser) {
87
+ if (parser->source_type == PST_MMAP) {
88
+ munmap(parser->buffer, parser->length);
89
+ }
90
+ }
91
+
92
+ inline bool scan_value(Parser *parser, Token *token) {
93
+ auto start = parser->ptr;
94
+ auto end = parser->ptr + 1;
95
+ while (end != parser->end_ptr && *end != *start) end++;
96
+ if (end == parser->end_ptr) return false;
97
+ end++;
98
+
99
+ int64_t length = end - start;
100
+ token->type = TOK_VALUE;
101
+ token->text.length = length;
102
+ token->text.data = start;
103
+
104
+ parser->col += length;
105
+ parser->ptr = end;
106
+ return true;
107
+ }
108
+
109
+ inline void scan_identifier(Parser *parser, Token *token) {
110
+ auto start = parser->ptr;
111
+ auto end = parser->ptr;
112
+ while (end != parser->end_ptr && parser->identifier_map[(unsigned char) *end]) end++;
113
+
114
+ int64_t length = end - start;
115
+ token->type = TOK_IDENTIFIER;
116
+ token->text.length = length;
117
+ token->text.data = start;
118
+
119
+ parser->col += length;
120
+ parser->ptr = end;
121
+ }
122
+
123
+ void scan_text(Parser *parser, Token *token) {
124
+ auto start = parser->ptr;
125
+ auto end = parser->ptr;
126
+ auto line_start = parser->ptr;
127
+ while (end != parser->end_ptr && *end != '<') {
128
+ if (*end == '\n') {
129
+ line_start = end + 1;
130
+ parser->line++;
131
+ parser->col = 1;
132
+ }
133
+ end++;
134
+ }
135
+
136
+ token->type = TOK_TEXT;
137
+ token->text.length = end - start;
138
+ token->text.data = start;
139
+
140
+ parser->col += end - line_start;
141
+ parser->ptr = end;
142
+ }
143
+
144
+ bool scan_comment_start(Parser *parser, Token *token) {
145
+ if (parser->ptr == parser->end_ptr || *parser->ptr != '<') return false;
146
+ parser->ptr++;
147
+ if (parser->ptr == parser->end_ptr || *parser->ptr != '!') return false;
148
+ parser->ptr++;
149
+ if (parser->ptr == parser->end_ptr || *parser->ptr != '-') return false;
150
+ parser->ptr++;
151
+ if (parser->ptr == parser->end_ptr || *parser->ptr != '-') return false;
152
+ parser->ptr++;
153
+
154
+ token->type = TOK_COMMENT_START;
155
+ parser->col += 4;
156
+ return true;
157
+ }
158
+
159
+ bool scan_comment_end(Parser *parser, Token *token) {
160
+ if (parser->ptr == parser->end_ptr || *parser->ptr != '-') return false;
161
+ parser->ptr++;
162
+ if (parser->ptr == parser->end_ptr || *parser->ptr != '-') return false;
163
+ parser->ptr++;
164
+ if (parser->ptr == parser->end_ptr || *parser->ptr != '>') return false;
165
+ parser->ptr++;
166
+
167
+ token->type = TOK_COMMENT_END;
168
+ parser->col += 3;
169
+ return true;
170
+ }
171
+
172
+ void scan_comment(Parser *parser, Token *token) {
173
+ auto start = parser->ptr;
174
+ auto end = parser->ptr;
175
+ auto line_start = parser->ptr;
176
+ while (end != parser->end_ptr) {
177
+ if (*end == '\n') {
178
+ line_start = end + 1;
179
+ parser->line++;
180
+ parser->col = 1;
181
+ }
182
+ if (*end == '-') {
183
+ auto check_end_ptr = end + 1;
184
+ if (check_end_ptr != parser->end_ptr && *check_end_ptr == '-') {
185
+ break;
186
+ }
187
+ }
188
+ end++;
189
+ }
190
+
191
+ token->type = TOK_TEXT;
192
+ token->text.length = end - start;
193
+ token->text.data = start;
194
+
195
+ parser->col += end - line_start;
196
+ parser->ptr = end;
197
+ }
198
+
199
+ Token read_token(Parser *parser) {
200
+ while (parser->ptr != parser->end_ptr) {
201
+ Token token = {};
202
+ token.line = parser->line;
203
+ token.c0 = parser->col;
204
+ token.offset = parser->ptr - parser->buffer;
205
+
206
+ auto c = *parser->ptr;
207
+ if (parser->mode == LM_TAG) {
208
+ auto state = parser->tag_initial_map[(unsigned char) c];
209
+ if (state == LA_WHITESPACE) {
210
+ parser->col++;
211
+ parser->ptr++;
212
+ continue; // Ignore white space in a tag
213
+ } else if (state == LA_NEWLINE) {
214
+ parser->line++;
215
+ parser->col = 1;
216
+ parser->ptr++;
217
+ continue; // Ignore white space in a tag
218
+ } else if (state == LA_ONE_CHAR) {
219
+ token.type = (TokenType) c;
220
+ parser->col++;
221
+ parser->ptr++;
222
+ parser->mode = (token.type == TOK_R_ANGLED) ? LM_OUT : LM_TAG;
223
+ } else if (state == LA_TWO_CHAR_END) {
224
+ auto c2 = *(parser->ptr + 1);
225
+ if (c2 == '>') {
226
+ token.type = TOKEN2(parser->ptr);
227
+ parser->col += 2;
228
+ parser->ptr += 2;
229
+ parser->mode = LM_OUT;
230
+ } else {
231
+ print_error_start(parser, token);
232
+ printf("Didn't expect '%c' to be followed by '%c'\n", c, c2);
233
+ return token;
234
+ }
235
+ } else if (state == LA_IDENTIFIER) {
236
+ scan_identifier(parser, &token);
237
+ } else if (state == LA_VALUE) {
238
+ if (!scan_value(parser, &token)) return token;
239
+ } else {
240
+ print_error_start(parser, token);
241
+ printf("Invalid character '%c'\n", c);
242
+ return token;
243
+ }
244
+ } else if (parser->mode == LM_OUT) {
245
+ if (c == '<') {
246
+ auto c2 = *(parser->ptr + 1);
247
+ if (c2 == '?' || c2 == '/') {
248
+ token.type = TOKEN2(parser->ptr);
249
+ token.c1 = parser->col + 1;
250
+ parser->col += 2;
251
+ parser->ptr += 2;
252
+ parser->mode = LM_TAG;
253
+ } else if (c2 == '!') {
254
+ if (!scan_comment_start(parser, &token)) return token;
255
+ parser->mode = LM_COMMENT;
256
+ } else {
257
+ token.type = TOK_L_ANGLED;
258
+ token.c1 = parser->col;
259
+ parser->col++;
260
+ parser->ptr++;
261
+ parser->mode = LM_TAG;
262
+ }
263
+ } else {
264
+ scan_text(parser, &token);
265
+ }
266
+ } else {
267
+ if (c == '-' && *(parser->ptr + 1) == '-') {
268
+ if (!scan_comment_end(parser, &token)) return token;
269
+ parser->mode = LM_OUT;
270
+ }
271
+ if (!token.type) scan_comment(parser, &token);
272
+ }
273
+
274
+ return token;
275
+ }
276
+
277
+ Token token = {};
278
+ token.line = parser->line;
279
+ token.c0 = parser->col;
280
+ token.offset = parser->ptr - parser->buffer;
281
+ return token;
282
+ }
283
+
284
+ void print_error_start(Parser *parser, Token token) {
285
+ parser->errored = true;
286
+ printf("%.*s:%li:%li - ", str_prt(parser->source), token.line, token.c0);
287
+ }
288
+
289
+ void print_token_type(TokenType type) {
290
+ if (type == TOK_INVALID) {
291
+ printf("INVALID");
292
+ } else if (type < TOK_ONE_CHAR) {
293
+ printf("'%c'", type);
294
+ } else if (type < TOK_TWO_CHAR) {
295
+ printf("'%c%c'", type & 0x7F, (type >> 7) & 0x7F);
296
+ } else if (type == TOK_COMMENT_START) {
297
+ printf("'<!--'");
298
+ } else if (type == TOK_COMMENT_END) {
299
+ printf("'-->'");
300
+ } else if (type == TOK_IDENTIFIER) {
301
+ printf("identifier");
302
+ } else if (type == TOK_VALUE) {
303
+ printf("value");
304
+ } else if (type == TOK_TEXT) {
305
+ printf("text");
306
+ }
307
+ }
308
+
309
+ void print_token_type(TokenType type, String str) {
310
+ if (type == TOK_INVALID) {
311
+ printf("INVALID");
312
+ } else if (type < TOK_ONE_CHAR) {
313
+ printf("'%c'", type);
314
+ } else if (type < TOK_TWO_CHAR) {
315
+ printf("'%c%c'", type & 0x7F, (type >> 7) & 0x7F);
316
+ } else if (type == TOK_COMMENT_START) {
317
+ printf("'<!--'");
318
+ } else if (type == TOK_COMMENT_END) {
319
+ printf("'-->'");
320
+ } else if (type == TOK_IDENTIFIER) {
321
+ printf("identifier: %.*s", str_prt(str));
322
+ } else if (type == TOK_VALUE) {
323
+ printf("value: %.*s", str_prt(str));
324
+ } else if (type == TOK_TEXT) {
325
+ printf("text: %.*s", str_prt(str));
326
+ }
327
+ }
328
+
329
+ bool expect_type(Parser *parser, TokenType type) {
330
+ Token token = parser->token;
331
+ if (token.type == type) return true;
332
+ print_error_start(parser, token);
333
+ printf("Expected ");
334
+ print_token_type(type);
335
+ if (token.type) {
336
+ printf(" but got ");
337
+ print_token_type(token.type);
338
+ }
339
+ printf("\n");
340
+ return false;
341
+ }
342
+
343
+ void print_token(Token token) {
344
+ printf("%li:%li: ", token.line, token.c0);
345
+ print_token_type(token.type, token.text);
346
+ printf("\n");
347
+ }
348
+
349
+ Node parse_xml_header(Parser *parser) {
350
+ auto start_token = peek_token(parser);
351
+ auto token = get_token(parser);
352
+ if (!expect_type(parser, TOK_IDENTIFIER)) return {};
353
+
354
+ Node node = {};
355
+ node.type = NODE_XML_HEADER;
356
+ node.line = start_token.line;
357
+ node.c0 = start_token.c0;
358
+ node.c1 = start_token.c1;
359
+ node.offset = token.offset;
360
+ node.depth = parser->depth;
361
+
362
+ while (!parser->done) {
363
+ token = get_token(parser);
364
+ if (token.type == TOK_TAG_XML_END) break;
365
+ }
366
+ expect_type(parser, TOK_TAG_XML_END);
367
+
368
+ node.c1 = token.c1;
369
+ return node;
370
+ }
371
+
372
+ Node parse_element_begin(Parser *parser) {
373
+ auto start_token = peek_token(parser);
374
+ auto token = get_token(parser);
375
+ if (!expect_type(parser, TOK_IDENTIFIER)) return {};
376
+
377
+ Node node = {};
378
+ node.type = NODE_ELEMENT_BEGIN;
379
+ node.line = start_token.line;
380
+ node.c0 = start_token.c0;
381
+ node.offset = token.offset;
382
+ node.depth = parser->depth;
383
+ node.text = token.text;
384
+
385
+ while (!parser->done) {
386
+ token = get_token(parser);
387
+ if (token.type == TOK_TAG_SELF_CLOSE) {
388
+ node.self_closing = true;
389
+ break;
390
+ }
391
+ if (token.type == TOK_R_ANGLED) break;
392
+ }
393
+
394
+ node.c1 = token.c1;
395
+
396
+ if (!node.self_closing) parser->depth++;
397
+ return node;
398
+ }
399
+
400
+ Node parse_element_end(Parser *parser) {
401
+ auto start_token = peek_token(parser);
402
+ auto token = get_token(parser);
403
+ if (!expect_type(parser, TOK_IDENTIFIER)) return {};
404
+
405
+ Node node = {};
406
+ node.type = NODE_ELEMENT_END;
407
+ node.line = start_token.line;
408
+ node.c0 = start_token.c0;
409
+ node.offset = start_token.offset;
410
+ node.depth = parser->depth - 1;
411
+ node.text = token.text;
412
+
413
+ while (!parser->done) {
414
+ token = get_token(parser);
415
+ if (token.type == TOK_R_ANGLED) break;
416
+ }
417
+
418
+ node.c1 = token.c1;
419
+
420
+ parser->depth--;
421
+ return node;
422
+ }
423
+
424
+ Node parse_text(Parser *parser) {
425
+ if (!expect_type(parser, TOK_TEXT)) return {};
426
+
427
+ auto token = peek_token(parser);
428
+ Node node = {};
429
+ node.type = NODE_TEXT;
430
+ node.line = token.line;
431
+ node.c0 = token.c0;
432
+ node.c1 = token.c1;
433
+ node.offset = token.offset;
434
+ node.depth = parser->depth;
435
+ node.text = token.text;
436
+ return node;
437
+ }
438
+
439
+ Node parse_comment(Parser *parser) {
440
+ auto start_token = peek_token(parser);
441
+ Node node = {};
442
+ node.line = start_token.line;
443
+ node.c0 = start_token.c0;
444
+ node.offset = start_token.offset;
445
+
446
+ auto token = get_token(parser);
447
+ if (!expect_type(parser, TOK_TEXT)) return node;
448
+ node.depth = parser->depth;
449
+ node.text = token.text;
450
+
451
+ auto end_token = get_token(parser);
452
+ if (!expect_type(parser, TOK_COMMENT_END)) return node;
453
+ node.type = NODE_COMMENT;
454
+ node.c1 = end_token.c1;
455
+ return node;
456
+ }
457
+
458
+ Node read_node(Parser *parser) {
459
+ if (parser->done || parser->errored) return {};
460
+
461
+ auto token = get_token(parser);
462
+ if (token.type == TOK_TAG_XML_START) {
463
+ return parse_xml_header(parser);
464
+ } else if (token.type == TOK_TAG_START_CLOSE) {
465
+ return parse_element_end(parser);
466
+ } else if (token.type == TOK_L_ANGLED) {
467
+ return parse_element_begin(parser);
468
+ } else if (token.type == TOK_COMMENT_START) {
469
+ return parse_comment(parser);
470
+ } else if (token.type == TOK_INVALID) {
471
+ return {};
472
+ } else {
473
+ return parse_text(parser);
474
+ }
475
+ }
476
+
477
+ void print_node(Node node) {
478
+ printf("%li:%li: [%li] ", node.line, node.c0, node.depth);
479
+ if (node.type == NODE_ELEMENT_BEGIN) {
480
+ printf("Begin: '%.*s' %s\n", str_prt(node.text), node.self_closing ? "self-closing" : "");
481
+ } else if (node.type == NODE_ELEMENT_END) {
482
+ printf("End: '%.*s'\n", str_prt(node.text));
483
+ } else if (node.type == NODE_TEXT) {
484
+ printf("Text: '%.*s'\n", str_prt(node.text));
485
+ } else if (node.type == NODE_COMMENT) {
486
+ printf("Comment: '%.*s'\n", str_prt(node.text));
487
+ } else if (node.type == NODE_XML_HEADER) {
488
+ printf("XML header\n");
489
+ }
490
+ }