minihtml 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,549 @@
1
+ #include "ruby.h"
2
+ #include "ruby/encoding.h"
3
+ #include <stdint.h>
4
+ #include <string.h>
5
+ #include <ctype.h>
6
+ #include <stdlib.h>
7
+
8
+ #include "minihtml_scanner.h"
9
+
10
#define EOF_CP (-1)

/**
 * next_utf8_cp - Decode the next UTF-8 code point from a byte stream.
 *
 * @p:   Pointer to a pointer to the current byte. On success it is advanced
 *       past the whole encoded sequence. On a malformed or truncated
 *       sequence it is advanced past the lead byte only.
 * @end: Pointer to one past the last valid byte in the buffer.
 *
 * Returns:
 *   - A Unicode scalar value (U+0000..U+10FFFF, surrogates excluded) when
 *     decoding succeeds.
 *   - 0xFFFD (replacement character) when the sequence is malformed:
 *     invalid lead byte, missing or invalid continuation byte, input that
 *     ends in the middle of a sequence, overlong encoding, surrogate, or a
 *     value above U+10FFFF.
 *   - EOF_CP only when *@p has reached or passed @end, i.e. no input is left.
 *
 * Behavior:
 *   - Handles 1-4 byte UTF-8 sequences and never reads at or beyond @end.
 *   - EOF_CP is never returned while unread bytes remain, so once a caller
 *     sees EOF_CP every later call also returns EOF_CP.
 *   - Because only the lead byte is consumed on error, a broken sequence can
 *     never swallow a following well-formed byte (for example a '<' that
 *     comes right after a stray 0xC3).
 *   - NOTE(review): one replacement per undecodable byte looks like it matches
 *     how Ruby counts characters in a string with broken UTF-8, which matters
 *     because code point offsets are later handed to rb_str_substr - confirm.
 *   - Does not allocate memory; operates directly on the caller's buffer.
 *
 * Example:
 *   const uint8_t *ptr = buf, *end = buf + len;
 *   int cp;
 *   while ((cp = next_utf8_cp(&ptr, end)) != EOF_CP) {
 *       if (cp == 0xFFFD) { handle invalid sequence }
 *       else { process code point }
 *   }
 */
static inline int next_utf8_cp(const uint8_t **p, const uint8_t *end) {
    if (*p >= end) return EOF_CP;

    const uint8_t *cur = *p;
    const uint8_t lead = *cur++;

    // Commit the lead byte now: every error return below leaves *p here.
    *p = cur;

    if (lead < 0x80) return lead;

    int trail = 0;  // number of continuation bytes the lead byte announces
    int cp = 0;     // payload bits gathered so far
    int min_cp = 0; // smallest value that is not an overlong encoding

    if ((lead & 0xE0) == 0xC0) {
        trail = 1;
        cp = lead & 0x1F;
        min_cp = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
        trail = 2;
        cp = lead & 0x0F;
        min_cp = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
        trail = 3;
        cp = lead & 0x07;
        min_cp = 0x10000;
    } else {
        return 0xFFFD; // stray continuation byte or invalid lead byte
    }

    for (int i = 0; i < trail; i++, cur++) {
        // Truncated input and a non-continuation byte are both malformed.
        if (cur >= end || (*cur & 0xC0) != 0x80) return 0xFFFD;
        cp = (cp << 6) | (*cur & 0x3F);
    }

    if (cp < min_cp) return 0xFFFD;                   // overlong encoding
    if (cp >= 0xD800 && cp <= 0xDFFF) return 0xFFFD;  // surrogate
    if (cp > 0x10FFFF) return 0xFFFD;                 // beyond Unicode range

    *p = cur;
    return cp;
}
98
+
99
/* True when v is an ASCII letter (A-Z or a-z); false for anything else,
 * including EOF_CP and non-ASCII code points. */
static inline bool scanner_is_letter(const int v) {
    // Setting bit 0x20 maps 'A'..'Z' onto 'a'..'z' and leaves lowercase alone.
    const int folded = v | 0x20;
    return folded >= 'a' && folded <= 'z';
}
102
+
103
+ static inline bool scanner_is_space(const int v) {
104
+ return v == SPACE || v == CARRIAGE_RETURN || v == FORM_FEED || v == NEWLINE || v == HORIZONTAL_TAB || v ==
105
+ VERTICAL_TAB;
106
+ }
107
+
108
/* True when v may appear in a tag identifier: an ASCII letter, an ASCII
 * digit, '_', '.' or ':'. EOF_CP and every non-ASCII code point yield false.
 *
 * The digit test is an explicit range check rather than isdigit(): v is a
 * decoded code point that can be as large as 0x10FFFF, and the <ctype.h>
 * classifiers are only defined for values representable as unsigned char
 * (or EOF). */
static inline bool scanner_is_tag_ident(const int v) {
    if ((v >= 'A' && v <= 'Z') || (v >= 'a' && v <= 'z')) return true;
    if (v >= '0' && v <= '9') return true;
    return v == '_' || v == '.' || v == ':';
}
111
+
112
+ static void scanner_free(void *ptr) {
113
+ xfree(ptr);
114
+ }
115
+
116
+ static size_t scanner_memsize(const void *ptr) {
117
+ return sizeof(scanner_t);
118
+ }
119
+
120
+ static void scanner_mark(void *ptr) {
121
+ const scanner_t *scanner = ptr;
122
+ if (scanner->str) rb_gc_mark(scanner->str);
123
+
124
+ if (scanner->tokens) rb_gc_mark(scanner->tokens);
125
+
126
+ if (scanner->errors) rb_gc_mark(scanner->errors);
127
+ }
128
+
129
+ static const rb_data_type_t scanner_type = {
130
+ "MiniHTML::Scanner",
131
+ {scanner_mark, scanner_free, scanner_memsize},
132
+ 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
133
+ };
134
+
135
+ static VALUE scanner_alloc(const VALUE klass) {
136
+ scanner_t *t = ALLOC(scanner_t);
137
+ memset(t, 0, sizeof(scanner_t));
138
+ return TypedData_Wrap_Struct(klass, &scanner_type, t);
139
+ }
140
+
141
+ static VALUE scanner_initialize(VALUE self, VALUE str) {
142
+ Check_Type(str, T_STRING);
143
+ scanner_t *t;
144
+ TypedData_Get_Struct(self, scanner_t, &scanner_type, t);
145
+
146
+ str = rb_str_dup(str);
147
+ t->str = str;
148
+ t->p = (const uint8_t *) RSTRING_PTR(str);
149
+ t->end = t->p + RSTRING_LEN(str);
150
+ t->idx_cp = 0;
151
+ t->tokens = rb_ary_new();
152
+ t->errors = rb_ary_new();
153
+ t->line = 1;
154
+ t->col = 1;
155
+
156
+ // prime lookahead
157
+ t->look[0] = next_utf8_cp(&t->p, t->end);
158
+ t->look[1] = next_utf8_cp(&t->p, t->end);
159
+ t->look[2] = next_utf8_cp(&t->p, t->end);
160
+ t->look[3] = next_utf8_cp(&t->p, t->end);
161
+
162
+ return self;
163
+ }
164
+
165
+ static inline void rotate(scanner_t *t) {
166
+ t->look[0] = t->look[1];
167
+ t->look[1] = t->look[2];
168
+ t->look[2] = t->look[3];
169
+ t->look[3] = next_utf8_cp(&t->p, t->end);
170
+ }
171
+
172
+ static VALUE scanner_start_token(scanner_t *t) {
173
+ t->start_token_offset = t->idx_cp;
174
+ t->start_token_line = t->line;
175
+ t->start_token_column = t->col;
176
+ return Qnil;
177
+ }
178
+
179
+ static void scanner_consume(scanner_t *t) {
180
+ const int v = t->look[0];
181
+ if (v == EOF_CP) return;
182
+
183
+ t->idx_cp += 1;
184
+ if (v == NEWLINE) {
185
+ t->line += 1;
186
+ t->col = 1;
187
+ } else {
188
+ t->col += 1;
189
+ }
190
+
191
+ rotate(t);
192
+ }
193
+
194
+ static void scanner_consume_tag_ident(scanner_t *t) {
195
+ while (scanner_is_tag_ident(t->look[0])) {
196
+ scanner_consume(t);
197
+ }
198
+ }
199
+
200
+ static void scanner_amend_last_token_kind(const scanner_t *t, const VALUE newKind) {
201
+ const VALUE v = rb_ary_entry(t->tokens, -1);
202
+ rb_hash_aset(v, sym_kind, newKind);
203
+ }
204
+
205
+ static void scanner_push_token_simple(const scanner_t *t, const VALUE type) {
206
+ const VALUE h = rb_hash_new();
207
+ rb_hash_aset(h, sym_kind, type);
208
+ rb_hash_aset(h, sym_start_line, LONG2FIX(t->start_token_line));
209
+ rb_hash_aset(h, sym_start_column, LONG2FIX(t->start_token_column));
210
+ rb_hash_aset(h, sym_start_offset, LONG2FIX(t->start_token_offset));
211
+ rb_hash_aset(h, sym_end_line, LONG2FIX(t->line));
212
+ rb_hash_aset(h, sym_end_column, LONG2FIX(t->col));
213
+ rb_hash_aset(h, sym_end_offset, LONG2FIX(t->idx_cp));
214
+ rb_ary_push(t->tokens, h);
215
+ }
216
+
217
+ static inline void scanner_consume_spaces(scanner_t *t) {
218
+ while (scanner_is_space(t->look[0])) {
219
+ scanner_consume(t);
220
+ }
221
+ }
222
+
223
+ static void scanner_consume_executable(scanner_t *t) {
224
+ scanner_consume(t); // {
225
+ scanner_consume(t); // {
226
+ scanner_start_token(t);
227
+
228
+ int bracketLevel = 0;
229
+ while (t->look[0] != EOF_CP) {
230
+ if (bracketLevel == 0 && t->look[0] == CURLY_RIGHT && t->look[1] == CURLY_RIGHT) {
231
+ scanner_push_token_simple(t, sym_executable);
232
+ scanner_consume(t); // }
233
+ scanner_consume(t); // }
234
+ return;
235
+ }
236
+
237
+ if (t->look[0] == CURLY_LEFT) bracketLevel++;
238
+ if (t->look[0] == CURLY_RIGHT) bracketLevel--;
239
+
240
+ scanner_consume(t);
241
+ }
242
+ char errStr[128] = {0};
243
+ snprintf(errStr, 127, "Unmatched {{ block at line %lu, column %lu, offset %lu", t->line, t->col, t->idx_cp);
244
+ rb_ary_push(t->errors, rb_str_new_cstr(errStr));
245
+ }
246
+
247
+ static void scanner_set_string_quote_value(const scanner_t *t, const char quoteChar) {
248
+ const VALUE token = rb_ary_entry(t->tokens, -1);
249
+ const char value[2] = {quoteChar, 0};
250
+ rb_hash_aset(token, sym_quote_char, rb_str_new_cstr(value));
251
+ }
252
+
253
+ static void scanner_consume_string(scanner_t *t) {
254
+ const int quoteChar = t->look[0];
255
+ scanner_consume(t); // " or '
256
+ scanner_start_token(t);
257
+ while (t->look[0] != EOF) {
258
+ if (t->look[0] == INVERTED_SOLIDUS && t->look[1] == quoteChar) {
259
+ scanner_consume(t); // '\'
260
+ scanner_consume(t); // quoteChar
261
+ } else if (t->look[0] == quoteChar) {
262
+ scanner_push_token_simple(t, sym_string);
263
+ scanner_set_string_quote_value(t, (char)quoteChar);
264
+ scanner_consume(t);
265
+ return;
266
+ } else if (t->look[0] == CURLY_LEFT && t->look[1] == CURLY_LEFT) {
267
+ scanner_push_token_simple(t, sym_string_interpolation);
268
+ scanner_set_string_quote_value(t, (char)quoteChar);
269
+ scanner_consume_executable(t);
270
+ scanner_amend_last_token_kind(t, sym_interpolated_executable);
271
+ scanner_start_token(t);
272
+ } else {
273
+ scanner_consume(t);
274
+ }
275
+ }
276
+
277
+ char errStr[128] = {0};
278
+ snprintf(errStr, 127, "Unterminated string value at line %lu, column %lu, offset %lu", t->line, t->col, t->idx_cp);
279
+ rb_ary_push(t->errors, rb_str_new_cstr(errStr));
280
+ scanner_push_token_simple(t, sym_string);
281
+ scanner_set_string_quote_value(t, (char)quoteChar);
282
+ }
283
+
284
/* True when p may appear in an attribute name: an ASCII letter, an ASCII
 * digit, '-', '_' or '.'. */
static inline bool scanner_is_attr_ident(const int p) {
    if (p >= '0' && p <= '9') return true;
    if (p >= 'a' && p <= 'z') return true;
    if (p >= 'A' && p <= 'Z') return true;

    switch (p) {
        case '-':
        case '_':
        case '.':
            return true;
        default:
            return false;
    }
}
290
+
291
+ static void scanner_consume_attr_name(scanner_t *t) {
292
+ while (scanner_is_attr_ident(t->look[0])) {
293
+ scanner_consume(t);
294
+ }
295
+ }
296
+
297
+ static void scanner_consume_attr(scanner_t *t) {
298
+ scanner_start_token(t);
299
+ scanner_consume_attr_name(t);
300
+ scanner_push_token_simple(t, sym_attr_key);
301
+ scanner_consume_spaces(t);
302
+ if (t->look[0] == EQUAL) {
303
+ scanner_start_token(t);
304
+ scanner_consume(t); // =
305
+ scanner_push_token_simple(t, sym_equal);
306
+ scanner_consume_spaces(t);
307
+ switch (t->look[0]) {
308
+ case APOSTROPHE:
309
+ case QUOTE:
310
+ scanner_consume_string(t);
311
+ break;
312
+ case CURLY_LEFT:
313
+ if (t->look[1] == CURLY_LEFT) {
314
+ scanner_consume_executable(t);
315
+ break;
316
+ }
317
+ default:
318
+ scanner_start_token(t);
319
+ scanner_consume_tag_ident(t);
320
+ scanner_push_token_simple(t, sym_attr_value_literal);
321
+ break;
322
+ }
323
+ }
324
+ }
325
+
326
+ static void scanner_consume_comment_tag(scanner_t *t) {
327
+ while (t->look[0] != EOF) {
328
+ if (t->look[0] == MINUS_HYPHEN && t->look[1] == MINUS_HYPHEN && t->look[2] == ANGLED_RIGHT) {
329
+ scanner_consume(t); // -
330
+ scanner_consume(t); // -
331
+ scanner_consume(t); // >
332
+ scanner_push_token_simple(t, sym_tag_comment_end);
333
+ return;
334
+ }
335
+ scanner_consume(t);
336
+ }
337
+
338
+ // If we reach this point, it's an error.
339
+ char errStr[128] = {0};
340
+ snprintf(errStr, 127, "Unterminated comment tag at line %lu, column %lu, offset %lu", t->line, t->col, t->idx_cp);
341
+ rb_ary_push(t->errors, rb_str_new_cstr(errStr));
342
+ scanner_push_token_simple(t, sym_tag_comment_end);
343
+ }
344
+
345
+ static void scanner_scan_open_tag(scanner_t *t) {
346
+ if (t->look[1] == '!' && t->look[2] == '-' && t->look[3] == '-') {
347
+ scanner_start_token(t);
348
+ scanner_consume(t); // <
349
+ scanner_consume(t); // !
350
+ scanner_consume(t); // -
351
+ scanner_consume(t); // -
352
+ scanner_push_token_simple(t, sym_tag_begin);
353
+ scanner_consume_comment_tag(t);
354
+ return;
355
+ }
356
+
357
+ if (scanner_is_letter(t->look[1])) {
358
+ scanner_start_token(t);
359
+ scanner_consume(t); // <
360
+ scanner_consume_tag_ident(t); // \w
361
+ scanner_push_token_simple(t, sym_tag_begin);
362
+
363
+ scanner_consume_spaces(t);
364
+ while (scanner_is_letter(t->look[0])) {
365
+ scanner_consume_attr(t);
366
+ scanner_consume_spaces(t);
367
+ }
368
+ return;
369
+ }
370
+
371
+ if (t->look[1] == '/') {
372
+ scanner_start_token(t);
373
+ scanner_consume(t); // <
374
+ scanner_consume(t); // /
375
+
376
+ if (scanner_is_tag_ident(t->look[0])) {
377
+ scanner_consume_tag_ident(t);
378
+ }
379
+ scanner_push_token_simple(t, sym_tag_closing_start);
380
+ scanner_consume_spaces(t);
381
+ while (scanner_is_tag_ident(t->look[0])) {
382
+ scanner_consume_attr(t);
383
+ scanner_consume_spaces(t);
384
+ }
385
+ scanner_consume_spaces(t);
386
+
387
+ if (t->look[0] == ANGLED_RIGHT) {
388
+ scanner_start_token(t);
389
+ scanner_consume(t); // >
390
+ scanner_push_token_simple(t, sym_tag_closing_end);
391
+ }
392
+ }
393
+ }
394
+
395
+ static void scanner_consume_literal(scanner_t *t) {
396
+ scanner_start_token(t);
397
+ bool next = true;
398
+ while (next) {
399
+ switch (t->look[0]) {
400
+ case ANGLED_LEFT:
401
+ next = false;
402
+ break;
403
+ case CURLY_LEFT:
404
+ if (t->look[1] == CURLY_LEFT) {
405
+ next = false;
406
+ break;
407
+ }
408
+ scanner_consume(t);
409
+ break;
410
+ case EOF:
411
+ next = false;
412
+ break;
413
+ default:
414
+ scanner_consume(t);
415
+ }
416
+ }
417
+ scanner_push_token_simple(t, sym_literal);
418
+ }
419
+
420
+ static VALUE scanner_scan_token(scanner_t *t) {
421
+ switch (t->look[0]) {
422
+ case ANGLED_LEFT:
423
+ scanner_scan_open_tag(t);
424
+ break;
425
+ case ANGLED_RIGHT:
426
+ scanner_start_token(t);
427
+ scanner_consume(t); // >
428
+ scanner_push_token_simple(t, sym_right_angled);
429
+ break;
430
+ case SOLIDUS:
431
+ if (t->look[1] == '>') {
432
+ scanner_start_token(t);
433
+ scanner_consume(t); // '/'
434
+ scanner_consume(t); // >
435
+ scanner_push_token_simple(t, sym_tag_end);
436
+ break;
437
+ }
438
+
439
+ scanner_consume_literal(t);
440
+ break;
441
+ case CURLY_LEFT:
442
+ if (t->look[1] == CURLY_LEFT) {
443
+ scanner_consume_executable(t);
444
+ } else {
445
+ scanner_consume_literal(t);
446
+ }
447
+ break;
448
+ case EOF:
449
+ break;
450
+ default:
451
+ scanner_consume_literal(t);
452
+ break;
453
+ }
454
+
455
+ return Qnil;
456
+ }
457
+
458
+ static VALUE scanner_tokens(const VALUE self) {
459
+ scanner_t *t;
460
+ TypedData_Get_Struct(self, scanner_t, &scanner_type, t);
461
+ return t->tokens;
462
+ }
463
+
464
+ static VALUE scanner_errors(const VALUE self) {
465
+ scanner_t *t;
466
+ TypedData_Get_Struct(self, scanner_t, &scanner_type, t);
467
+ return t->errors;
468
+ }
469
+
470
+ static VALUE scanner_stats(const VALUE self) {
471
+ scanner_t *t;
472
+ TypedData_Get_Struct(self, scanner_t, &scanner_type, t);
473
+ const VALUE h = rb_hash_new();
474
+ rb_hash_aset(h, sym_line, LONG2NUM(t->line));
475
+ rb_hash_aset(h, sym_column, LONG2NUM(t->col));
476
+ rb_hash_aset(h, sym_offset, LONG2NUM(t->idx_cp));
477
+ return h;
478
+ }
479
+
480
+ static VALUE scanner_at_eof(const VALUE self) {
481
+ scanner_t *t;
482
+ TypedData_Get_Struct(self, scanner_t, &scanner_type, t);
483
+ return t->look[0] == EOF ? Qtrue : Qfalse;
484
+ }
485
+
486
+ static void scanner_hydrate_tokens(const scanner_t *t) {
487
+ const long len = RARRAY_LEN(t->tokens);
488
+ for (long i = 0; i < len; i++) {
489
+ const VALUE v = rb_ary_entry(t->tokens, i);
490
+ const long startOffset = NUM2LONG(rb_hash_aref(v, sym_start_offset));
491
+ const long endOffset = NUM2LONG(rb_hash_aref(v, sym_end_offset));
492
+ const long strLen = endOffset - startOffset;
493
+ if (strLen < 0) {
494
+ rb_raise(rb_eRuntimeError, "invalid offset boundaries %ld -> %ld", startOffset, endOffset);
495
+ }
496
+ rb_hash_aset(v, sym_literal, rb_str_substr(t->str, startOffset, strLen));
497
+ }
498
+ }
499
+
500
+ static VALUE scanner_tokenize(const VALUE self) {
501
+ scanner_t *t;
502
+ TypedData_Get_Struct(self, scanner_t, &scanner_type, t);
503
+ while (t->look[0] != EOF) {
504
+ scanner_scan_token(t);
505
+ }
506
+ scanner_hydrate_tokens(t);
507
+ return t->tokens;
508
+ }
509
+
510
+ RUBY_FUNC_EXPORTED void Init_minihtml_scanner(void) {
511
+ VALUE rb_mMiniHTML = rb_define_module("MiniHTML");
512
+ VALUE rb_cScanner = rb_define_class_under(rb_mMiniHTML, "Scanner", rb_cObject);
513
+
514
+ INITIALIZE_REUSABLE_SYMBOL(line);
515
+ INITIALIZE_REUSABLE_SYMBOL(column);
516
+ INITIALIZE_REUSABLE_SYMBOL(offset);
517
+ INITIALIZE_REUSABLE_SYMBOL(start_line);
518
+ INITIALIZE_REUSABLE_SYMBOL(start_column);
519
+ INITIALIZE_REUSABLE_SYMBOL(start_offset);
520
+ INITIALIZE_REUSABLE_SYMBOL(end_line);
521
+ INITIALIZE_REUSABLE_SYMBOL(end_column);
522
+ INITIALIZE_REUSABLE_SYMBOL(end_offset);
523
+ INITIALIZE_REUSABLE_SYMBOL(new);
524
+ INITIALIZE_REUSABLE_SYMBOL(literal);
525
+ INITIALIZE_REUSABLE_SYMBOL(self_closing);
526
+ INITIALIZE_REUSABLE_SYMBOL(tag_begin);
527
+ INITIALIZE_REUSABLE_SYMBOL(tag_end);
528
+ INITIALIZE_REUSABLE_SYMBOL(tag_closing_start);
529
+ INITIALIZE_REUSABLE_SYMBOL(tag_closing_end);
530
+ INITIALIZE_REUSABLE_SYMBOL(right_angled);
531
+ INITIALIZE_REUSABLE_SYMBOL(attr_key);
532
+ INITIALIZE_REUSABLE_SYMBOL(attr_value_literal);
533
+ INITIALIZE_REUSABLE_SYMBOL(kind);
534
+ INITIALIZE_REUSABLE_SYMBOL(string);
535
+ INITIALIZE_REUSABLE_SYMBOL(string_interpolation);
536
+ INITIALIZE_REUSABLE_SYMBOL(interpolated_executable);
537
+ INITIALIZE_REUSABLE_SYMBOL(executable);
538
+ INITIALIZE_REUSABLE_SYMBOL(equal);
539
+ INITIALIZE_REUSABLE_SYMBOL(quote_char);
540
+ INITIALIZE_REUSABLE_SYMBOL(tag_comment_end);
541
+
542
+ rb_define_alloc_func(rb_cScanner, scanner_alloc);
543
+ rb_define_method(rb_cScanner, "initialize", scanner_initialize, 1);
544
+ rb_define_method(rb_cScanner, "tokens", scanner_tokens, 0);
545
+ rb_define_method(rb_cScanner, "errors", scanner_errors, 0);
546
+ rb_define_method(rb_cScanner, "stats", scanner_stats, 0);
547
+ rb_define_method(rb_cScanner, "eof?", scanner_at_eof, 0);
548
+ rb_define_method(rb_cScanner, "tokenize", scanner_tokenize, 0);
549
+ }
@@ -0,0 +1,73 @@
1
+ #ifndef MINIHTML_H
2
+ #define MINIHTML_H 1
3
+
4
+ #include "ruby.h"
5
+
6
/* Named code points the scanner compares its lookahead against. */

/* Whitespace */
#define SPACE            ' '
#define CARRIAGE_RETURN  '\r'
#define FORM_FEED        '\f'
#define NEWLINE          0x0A
#define HORIZONTAL_TAB   '\t'
#define VERTICAL_TAB     '\v'

/* Markup delimiters */
#define ANGLED_LEFT      '<'
#define ANGLED_RIGHT     '>'
#define SOLIDUS          '/'
#define CURLY_LEFT       '{'
#define CURLY_RIGHT      '}'
#define BANG             '!'
#define MINUS_HYPHEN     '-'
#define UNDERSCORE       '_'
#define EQUAL            '='

/* Quoting and escaping */
#define APOSTROPHE       '\''
#define QUOTE            '"'
#define INVERTED_SOLIDUS '\\'

/* Identifier punctuation */
#define PERIOD           '.'
#define COLON            ':'
26
+
27
+ #define DEFINE_REUSABLE_SYMBOL(name) static ID id_type_##name; static VALUE sym_##name;
28
+ #define INITIALIZE_REUSABLE_SYMBOL(name) id_type_##name = rb_intern(#name); sym_##name = ID2SYM(id_type_##name);
29
+
30
+ DEFINE_REUSABLE_SYMBOL(line);
31
+ DEFINE_REUSABLE_SYMBOL(column);
32
+ DEFINE_REUSABLE_SYMBOL(offset);
33
+ DEFINE_REUSABLE_SYMBOL(start_line);
34
+ DEFINE_REUSABLE_SYMBOL(start_column);
35
+ DEFINE_REUSABLE_SYMBOL(start_offset);
36
+ DEFINE_REUSABLE_SYMBOL(end_line);
37
+ DEFINE_REUSABLE_SYMBOL(end_column);
38
+ DEFINE_REUSABLE_SYMBOL(end_offset);
39
+ DEFINE_REUSABLE_SYMBOL(new);
40
+ DEFINE_REUSABLE_SYMBOL(literal);
41
+ DEFINE_REUSABLE_SYMBOL(self_closing);
42
+ DEFINE_REUSABLE_SYMBOL(tag_begin);
43
+ DEFINE_REUSABLE_SYMBOL(tag_end);
44
+ DEFINE_REUSABLE_SYMBOL(tag_closing_start);
45
+ DEFINE_REUSABLE_SYMBOL(tag_closing_end);
46
+ DEFINE_REUSABLE_SYMBOL(right_angled);
47
+ DEFINE_REUSABLE_SYMBOL(attr_key);
48
+ DEFINE_REUSABLE_SYMBOL(attr_value_literal);
49
+ DEFINE_REUSABLE_SYMBOL(kind);
50
+ DEFINE_REUSABLE_SYMBOL(string);
51
+ DEFINE_REUSABLE_SYMBOL(string_interpolation);
52
+ DEFINE_REUSABLE_SYMBOL(interpolated_executable);
53
+ DEFINE_REUSABLE_SYMBOL(executable);
54
+ DEFINE_REUSABLE_SYMBOL(equal);
55
+ DEFINE_REUSABLE_SYMBOL(quote_char);
56
+ DEFINE_REUSABLE_SYMBOL(tag_comment_end);
57
+
58
+ typedef struct {
59
+ VALUE str;
60
+ VALUE tokens;
61
+ VALUE errors;
62
+ const uint8_t *p;
63
+ const uint8_t *end;
64
+ long idx_cp;
65
+ int look[4];
66
+ long line;
67
+ long col;
68
+ long start_token_offset;
69
+ long start_token_line;
70
+ long start_token_column;
71
+ } scanner_t;
72
+
73
+ #endif /* MINIHTML_H */
@@ -0,0 +1,7 @@
1
# frozen_string_literal: true

# Build configuration for a native extension: running this file with mkmf
# writes the Makefile used to compile it.
require "mkmf"

# Compile with hidden symbol visibility by default.
append_cflags("-fvisibility=hidden")

# NOTE(review): the target here is "minihtml_token_stream", while the C source
# shown alongside this file exports Init_minihtml_scanner. Confirm that this
# extconf belongs to a different extension directory than that source.
create_makefile("minihtml/minihtml_token_stream")