minicss 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/.ruby-version +1 -0
- data/README.md +124 -5
- data/Rakefile +25 -1
- data/benchmarks/bootstrap-4.css +8975 -0
- data/benchmarks/bootstrap-4.min.css +7 -0
- data/benchmarks/ruby_prof.rb +41 -0
- data/benchmarks/selectors.rb +41 -0
- data/benchmarks/stylesheet.rb +75 -0
- data/ext/minicss_scanner/extconf.rb +7 -0
- data/ext/minicss_scanner/minicss_scanner.c +1436 -0
- data/ext/minicss_scanner/minicss_scanner.h +132 -0
- data/ext/minicss_token_stream/extconf.rb +5 -0
- data/ext/minicss_token_stream/minicss_token_stream.c +195 -0
- data/lib/minicss/css/ast/function.rb +1 -1
- data/lib/minicss/css/parser.rb +50 -54
- data/lib/minicss/css/refinements.rb +21 -63
- data/lib/minicss/css/token_stream.rb +1 -54
- data/lib/minicss/css/tokenizer.rb +13 -552
- data/lib/minicss/css.rb +1 -1
- data/lib/minicss/sel.rb +10 -8
- data/lib/minicss/serializer.rb +48 -50
- data/lib/minicss/version.rb +1 -1
- metadata +19 -6
|
@@ -0,0 +1,1436 @@
|
|
|
1
|
+
#include "ruby.h"
|
|
2
|
+
#include "ruby/encoding.h"
|
|
3
|
+
#include <stdint.h>
|
|
4
|
+
#include <string.h>
|
|
5
|
+
|
|
6
|
+
#include "minicss_scanner.h"
|
|
7
|
+
|
|
8
|
+
#include <ctype.h>
|
|
9
|
+
#include <stdlib.h>
|
|
10
|
+
|
|
11
|
+
static VALUE cPosition;
|
|
12
|
+
static VALUE cToken;
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
// ASCII helpers ---------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
/**
 * is_digit - Test whether a code point is an ASCII decimal digit.
 *
 * @cp: Code point to classify.
 *
 * Returns 1 when @cp lies in '0'..'9' (U+0030..U+0039), otherwise 0.
 */
static inline int is_digit(const int cp) {
    if (cp < 0x30) {
        return 0;
    }
    return cp <= 0x39;
}
|
|
25
|
+
|
|
26
|
+
/**
 * is_hex - Test whether a code point is an ASCII hexadecimal digit.
 *
 * @cp: Code point to classify.
 *
 * Returns 1 for '0'..'9', 'A'..'F' and 'a'..'f'; 0 for everything else.
 */
static inline int is_hex(const int cp) {
    if (cp >= 0x30 && cp <= 0x39) {
        return 1; // '0'..'9'
    }
    if (cp >= 0x41 && cp <= 0x46) {
        return 1; // 'A'..'F'
    }
    return cp >= 0x61 && cp <= 0x66; // 'a'..'f'
}
|
|
38
|
+
|
|
39
|
+
/**
 * is_upper - Test whether a code point is an uppercase ASCII letter.
 *
 * @cp: Code point to classify.
 *
 * Returns 1 when @cp lies in 'A'..'Z' (U+0041..U+005A), otherwise 0.
 */
static inline int is_upper(const int cp) {
    if (cp < 0x41) {
        return 0;
    }
    return cp <= 0x5A;
}
|
|
49
|
+
|
|
50
|
+
/**
 * is_lower - Test whether a code point is a lowercase ASCII letter.
 *
 * @cp: Code point to classify.
 *
 * Returns 1 when @cp lies in 'a'..'z' (U+0061..U+007A), otherwise 0.
 */
static inline int is_lower(const int cp) {
    if (cp < 0x61) {
        return 0;
    }
    return cp <= 0x7A;
}
|
|
60
|
+
|
|
61
|
+
/**
 * is_letter - Test whether a code point is an ASCII letter of either case.
 *
 * @cp: Code point to classify.
 *
 * Returns 1 for 'A'..'Z' and 'a'..'z', otherwise 0.
 */
static inline int is_letter(const int cp) {
    const int upper = cp >= 0x41 && cp <= 0x5A;
    const int lower = cp >= 0x61 && cp <= 0x7A;
    return upper || lower;
}
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* is_newline - Check if code point is a newline.
|
|
74
|
+
*
|
|
75
|
+
* @cp: Unicode code point.
|
|
76
|
+
*
|
|
77
|
+
* Returns non-zero if @cp equals NEWLINE (implementation-defined value).
|
|
78
|
+
*/
|
|
79
|
+
static inline int is_newline(const int cp) {
|
|
80
|
+
return cp == NEWLINE;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
/**
|
|
84
|
+
* is_ws - Check if code point is a CSS whitespace character.
|
|
85
|
+
*
|
|
86
|
+
* @cp: Unicode code point.
|
|
87
|
+
*
|
|
88
|
+
* Returns non-zero if @cp is NEWLINE, horizontal tab (U+0009), or space (U+0020).
|
|
89
|
+
*/
|
|
90
|
+
static inline int is_ws(const int cp) {
|
|
91
|
+
return cp == NEWLINE || cp == 0x09 || cp == 0x20;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
/**
 * ident_start - Test whether a code point may begin a CSS identifier.
 *
 * @cp: Code point to classify.
 *
 * Returns true when @cp is an ASCII letter, an underscore ('_', U+005F),
 * or any code point at or above U+0080; false otherwise.
 */
static inline bool ident_start(const int cp) {
    if (cp == 0x5F) {
        return true; // '_'
    }
    if (cp >= 0x80) {
        return true; // non-ASCII
    }
    return (cp >= 0x41 && cp <= 0x5A) || (cp >= 0x61 && cp <= 0x7A);
}
|
|
105
|
+
|
|
106
|
+
/**
 * ident_point - Test whether a code point may continue a CSS identifier.
 *
 * @cp: Code point to classify.
 *
 * Returns 1 when @cp is an identifier-start code point (ASCII letter,
 * '_', or >= U+0080), an ASCII digit, or a hyphen-minus ('-', U+002D);
 * otherwise 0.
 */
static inline int ident_point(const int cp) {
    if (cp == 0x2D || cp == 0x5F || cp >= 0x80) {
        return 1;
    }
    if (cp >= 0x30 && cp <= 0x39) {
        return 1;
    }
    return (cp >= 0x41 && cp <= 0x5A) || (cp >= 0x61 && cp <= 0x7A);
}
|
|
117
|
+
|
|
118
|
+
/**
|
|
119
|
+
* next_utf8_cp - Decode the next UTF-8 code point from a byte stream.
|
|
120
|
+
*
|
|
121
|
+
* @p: Pointer to a pointer to the current byte. This pointer will be
|
|
122
|
+
* advanced past the consumed sequence on success or error.
|
|
123
|
+
* @end: Pointer to one past the last valid byte in the buffer.
|
|
124
|
+
*
|
|
125
|
+
* Returns:
|
|
126
|
+
* - A valid Unicode code point (U+0000..U+10FFFF) if decoding succeeds.
|
|
127
|
+
* - 0xFFFD (replacement character) if the sequence is malformed
|
|
128
|
+
* (invalid continuation, overlong encoding, surrogate, or out-of-range).
|
|
129
|
+
* - EOF_CP if @p has reached or passed @end (no more input).
|
|
130
|
+
*
|
|
131
|
+
* Behavior:
|
|
132
|
+
* - Handles 1–4 byte UTF-8 sequences.
|
|
133
|
+
* - Performs boundary checks to avoid reading past @end.
|
|
134
|
+
* - Rejects overlong encodings and surrogate code points (U+D800..U+DFFF).
|
|
135
|
+
* - Does not allocate memory; operates directly on caller-provided buffer.
|
|
136
|
+
*
|
|
137
|
+
* Example:
|
|
138
|
+
* const uint8_t *ptr = buf, *end = buf + len;
|
|
139
|
+
* int cp;
|
|
140
|
+
* while ((cp = next_utf8_cp(&ptr, end)) != EOF_CP) {
|
|
141
|
+
* if (cp == 0xFFFD) { handle invalid sequence }
|
|
142
|
+
* else { process code point }
|
|
143
|
+
* }
|
|
144
|
+
*/
|
|
145
|
+
static inline int next_utf8_cp(const uint8_t **p, const uint8_t *end) {
|
|
146
|
+
if (*p >= end) return EOF_CP;
|
|
147
|
+
|
|
148
|
+
const uint8_t b0 = **p;
|
|
149
|
+
(*p)++;
|
|
150
|
+
|
|
151
|
+
if (b0 < 0x80) return b0;
|
|
152
|
+
|
|
153
|
+
if ((b0 & 0xE0) == 0xC0) {
|
|
154
|
+
// 2 bytes
|
|
155
|
+
if (*p >= end) return EOF_CP;
|
|
156
|
+
|
|
157
|
+
const uint8_t b1 = **p;
|
|
158
|
+
(*p)++;
|
|
159
|
+
if ((b1 & 0xC0) != 0x80) return 0xFFFD;
|
|
160
|
+
|
|
161
|
+
const int cp = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
|
|
162
|
+
return cp < 0x80 ? 0xFFFD : cp;
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
if ((b0 & 0xF0) == 0xE0) {
|
|
166
|
+
// 3 bytes
|
|
167
|
+
if (*p + 1 >= end) return EOF_CP;
|
|
168
|
+
|
|
169
|
+
const uint8_t b1 = **p;
|
|
170
|
+
const uint8_t b2 = *(*p + 1);
|
|
171
|
+
(*p) += 2;
|
|
172
|
+
if ((b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80) return 0xFFFD;
|
|
173
|
+
|
|
174
|
+
const int cp = ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
|
|
175
|
+
|
|
176
|
+
// exclude surrogates
|
|
177
|
+
if (cp >= 0xD800 && cp <= 0xDFFF) return 0xFFFD;
|
|
178
|
+
|
|
179
|
+
return cp < 0x800 ? 0xFFFD : cp;
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
if ((b0 & 0xF8) == 0xF0) {
|
|
183
|
+
// 4 bytes
|
|
184
|
+
if (*p + 2 >= end) return EOF_CP;
|
|
185
|
+
|
|
186
|
+
const uint8_t b1 = **p;
|
|
187
|
+
const uint8_t b2 = *(*p + 1);
|
|
188
|
+
const uint8_t b3 = *(*p + 2);
|
|
189
|
+
(*p) += 3;
|
|
190
|
+
|
|
191
|
+
if ((b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80) return 0xFFFD;
|
|
192
|
+
|
|
193
|
+
const int cp = ((b0 & 0x07) << 18)
|
|
194
|
+
| ((b1 & 0x3F) << 12)
|
|
195
|
+
| ((b2 & 0x3F) << 6)
|
|
196
|
+
| (b3 & 0x3F);
|
|
197
|
+
if (cp > 0x10FFFF) return 0xFFFD;
|
|
198
|
+
|
|
199
|
+
return cp < 0x10000 ? 0xFFFD : cp;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
return 0xFFFD; // invalid leading byte
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
static double u32_chars_to_double(const uint32_t *buf, const size_t len) {
|
|
206
|
+
if (len == 0) return 0.0;
|
|
207
|
+
|
|
208
|
+
// stack buffer for small numbers, fallback if huge
|
|
209
|
+
char tmp_stack[64];
|
|
210
|
+
char *tmp;
|
|
211
|
+
if (len < sizeof(tmp_stack)) {
|
|
212
|
+
tmp = tmp_stack;
|
|
213
|
+
} else {
|
|
214
|
+
tmp = ALLOC_N(char, len + 1);
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
size_t out_len = 0;
|
|
218
|
+
|
|
219
|
+
for (size_t i = 0; i < len; i++) {
|
|
220
|
+
uint32_t cp = buf[i];
|
|
221
|
+
if ((cp >= '0' && cp <= '9') || cp == '+' || cp == '-' || cp == '.') {
|
|
222
|
+
tmp[out_len++] = (char) cp;
|
|
223
|
+
} else {
|
|
224
|
+
break; // stop on first invalid character
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
tmp[out_len] = '\0';
|
|
229
|
+
|
|
230
|
+
// normalize "-.34" -> "-0.34" and "+.34" -> "+0.34"
|
|
231
|
+
if ((tmp[0] == '-' || tmp[0] == '+') && tmp[1] == '.') {
|
|
232
|
+
// shift right to make room for '0'
|
|
233
|
+
memmove(&tmp[2], &tmp[1], out_len - 1);
|
|
234
|
+
tmp[1] = '0';
|
|
235
|
+
out_len++;
|
|
236
|
+
tmp[out_len] = '\0';
|
|
237
|
+
}
|
|
238
|
+
// also handle ".34" -> "0.34"
|
|
239
|
+
else if (tmp[0] == '.') {
|
|
240
|
+
memmove(&tmp[1], &tmp[0], out_len);
|
|
241
|
+
tmp[0] = '0';
|
|
242
|
+
out_len++;
|
|
243
|
+
tmp[out_len] = '\0';
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
char *end_ptr;
|
|
247
|
+
errno = 0;
|
|
248
|
+
double val = strtod(tmp, &end_ptr);
|
|
249
|
+
|
|
250
|
+
// Must consume entire string, otherwise reject
|
|
251
|
+
if (end_ptr != tmp + out_len) {
|
|
252
|
+
val = 0.0;
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
if (tmp != tmp_stack)
|
|
256
|
+
xfree(tmp);
|
|
257
|
+
return val;
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
/**
|
|
261
|
+
* rb_str_cat_codepoint - Append a Unicode code point to a Ruby String.
|
|
262
|
+
*
|
|
263
|
+
* @str: Ruby String VALUE (must be mutable).
|
|
264
|
+
* @cp: Unicode code point to append (U+0000..U+10FFFF).
|
|
265
|
+
*
|
|
266
|
+
* Behavior:
|
|
267
|
+
* - Encodes @cp into its UTF-8 representation (1–4 bytes).
|
|
268
|
+
* - Writes the encoded bytes into a temporary buffer.
|
|
269
|
+
* - Calls rb_str_cat to append the buffer contents to @str.
|
|
270
|
+
*
|
|
271
|
+
* Encoding rules:
|
|
272
|
+
* - cp < 0x80 → 1 byte: 0xxxxxxx
|
|
273
|
+
* - cp < 0x800 → 2 bytes: 110xxxxx 10xxxxxx
|
|
274
|
+
* - cp < 0x10000 → 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
|
|
275
|
+
* - cp <= 0x10FFFF → 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
276
|
+
*
|
|
277
|
+
* Notes:
|
|
278
|
+
* - Does not validate @cp for invalid ranges (e.g. UTF-16 surrogates).
|
|
279
|
+
* Caller must ensure only valid code points are passed.
|
|
280
|
+
* - Designed for efficiency inside the tokenizer and other code
|
|
281
|
+
* that needs to build Ruby strings from decoded code points.
|
|
282
|
+
*
|
|
283
|
+
* Example:
|
|
284
|
+
* VALUE out = rb_str_new("", 0);
|
|
285
|
+
* rb_str_cat_codepoint(out, 'A'); // appends "A"
|
|
286
|
+
* rb_str_cat_codepoint(out, 0x03C0); // appends "π"
|
|
287
|
+
*/
|
|
288
|
+
static void rb_str_cat_codepoint(const VALUE str, const uint32_t cp) {
|
|
289
|
+
char buf[4];
|
|
290
|
+
int n = 0;
|
|
291
|
+
|
|
292
|
+
if (cp < 0x80) {
|
|
293
|
+
buf[0] = (char) cp;
|
|
294
|
+
n = 1;
|
|
295
|
+
} else if (cp < 0x800) {
|
|
296
|
+
buf[0] = (char) (0xC0 | (cp >> 6));
|
|
297
|
+
buf[1] = (char) (0x80 | (cp & 0x3F));
|
|
298
|
+
n = 2;
|
|
299
|
+
} else if (cp < 0x10000) {
|
|
300
|
+
buf[0] = (char) (0xE0 | (cp >> 12));
|
|
301
|
+
buf[1] = (char) (0x80 | ((cp >> 6) & 0x3F));
|
|
302
|
+
buf[2] = (char) (0x80 | (cp & 0x3F));
|
|
303
|
+
n = 3;
|
|
304
|
+
} else {
|
|
305
|
+
buf[0] = (char) (0xF0 | (cp >> 18));
|
|
306
|
+
buf[1] = (char) (0x80 | ((cp >> 12) & 0x3F));
|
|
307
|
+
buf[2] = (char) (0x80 | ((cp >> 6) & 0x3F));
|
|
308
|
+
buf[3] = (char) (0x80 | (cp & 0x3F));
|
|
309
|
+
n = 4;
|
|
310
|
+
}
|
|
311
|
+
rb_str_cat(str, buf, n);
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
// Append U+FFFD REPLACEMENT CHARACTER (UTF-8 bytes EF BF BD) to the Ruby String `str`.
#define APPEND_REPLACEMENT(str) rb_str_cat((str), "\xEF\xBF\xBD", 3)
|
|
315
|
+
|
|
316
|
+
// Scanner allocation, memory and gc stuff -------------------------------------
|
|
317
|
+
|
|
318
|
+
/**
|
|
319
|
+
* scanner_free - Free a scanner_t.
|
|
320
|
+
*
|
|
321
|
+
* Called by Ruby’s GC when a Scanner object is collected.
|
|
322
|
+
* Uses `xfree` to release the memory allocated for the scanner_t.
|
|
323
|
+
*/
|
|
324
|
+
static void scanner_free(void *ptr) {
|
|
325
|
+
xfree(ptr);
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
/**
|
|
329
|
+
* scanner_memsize - Report memory size of a scanner_t.
|
|
330
|
+
*
|
|
331
|
+
* Required by Ruby’s GC for accounting purposes.
|
|
332
|
+
* Always returns sizeof(scanner_t).
|
|
333
|
+
*/
|
|
334
|
+
static size_t scanner_memsize(const void *ptr) {
|
|
335
|
+
return sizeof(scanner_t);
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
static void scanner_mark(void *ptr) {
|
|
339
|
+
scanner_t *sc = ptr;
|
|
340
|
+
if (sc->owner) {
|
|
341
|
+
rb_gc_mark(sc->owner);
|
|
342
|
+
}
|
|
343
|
+
if (sc->tokens) {
|
|
344
|
+
rb_gc_mark(sc->tokens);
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
/**
|
|
349
|
+
* scanner_type - Data type definition for MiniCSS::CSS::Scanner.
|
|
350
|
+
*
|
|
351
|
+
* Provides Ruby with type information, free function, and
|
|
352
|
+
* memory size function for scanner_t objects.
|
|
353
|
+
*
|
|
354
|
+
* RUBY_TYPED_FREE_IMMEDIATELY indicates the object can be
|
|
355
|
+
* immediately freed without finalizer deferral.
|
|
356
|
+
*/
|
|
357
|
+
static const rb_data_type_t scanner_type = {
|
|
358
|
+
"MiniCSS::CSS::Scanner",
|
|
359
|
+
{scanner_mark, scanner_free, scanner_memsize,},
|
|
360
|
+
0, 0, RUBY_TYPED_FREE_IMMEDIATELY
|
|
361
|
+
};
|
|
362
|
+
|
|
363
|
+
/**
|
|
364
|
+
* scanner_alloc - Allocate and wrap a scanner_t.
|
|
365
|
+
*
|
|
366
|
+
* @klass: The Ruby class (MiniCSS::CSS::Scanner).
|
|
367
|
+
*
|
|
368
|
+
* Allocates a zero-initialized scanner_t and wraps it as a
|
|
369
|
+
* Ruby object of type @klass. The object is associated with
|
|
370
|
+
* `scanner_type` so Ruby’s GC can manage its lifetime.
|
|
371
|
+
*
|
|
372
|
+
* Returns a Ruby VALUE wrapping the scanner_t pointer.
|
|
373
|
+
*/
|
|
374
|
+
static VALUE scanner_alloc(const VALUE klass) {
|
|
375
|
+
scanner_t *sc = ALLOC(scanner_t);
|
|
376
|
+
memset(sc, 0, sizeof(scanner_t));
|
|
377
|
+
return TypedData_Wrap_Struct(klass, &scanner_type, sc);
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
/**
|
|
381
|
+
* scanner_initialize - Ruby constructor for MiniCSS::CSS::Scanner
|
|
382
|
+
*
|
|
383
|
+
* Ruby signature:
|
|
384
|
+
* Scanner#initialize(str)
|
|
385
|
+
*
|
|
386
|
+
* @self: The Ruby scanner object being initialized.
|
|
387
|
+
* @str: A Ruby String containing UTF-8 CSS source text.
|
|
388
|
+
*
|
|
389
|
+
* Behavior:
|
|
390
|
+
* - Validates that @str is a Ruby String (raises TypeError otherwise).
|
|
391
|
+
* - Duplicates @str to ensure scanner owns a stable buffer.
|
|
392
|
+
* - Initializes scanner_t fields:
|
|
393
|
+
* * str = duplicated Ruby String (kept alive by GC)
|
|
394
|
+
* * p = pointer to start of string bytes
|
|
395
|
+
* * end = pointer to one past last byte
|
|
396
|
+
* * idx_cp = 0 (start of code point stream)
|
|
397
|
+
* * line = 1 (1-based for error reporting)
|
|
398
|
+
* * col = 1 (1-based for error reporting)
|
|
399
|
+
* - Primes the lookahead buffer (look[0..3]) by decoding
|
|
400
|
+
* the first four UTF-8 code points from the string.
|
|
401
|
+
*
|
|
402
|
+
* Returns:
|
|
403
|
+
* The initialized Ruby object (self).
|
|
404
|
+
*
|
|
405
|
+
* Notes:
|
|
406
|
+
* - After initialization, the scanner is ready to be used
|
|
407
|
+
* by tokenizer methods, with look[0] representing the next
|
|
408
|
+
* code point to consume.
|
|
409
|
+
* - The lookahead mechanism allows token classification
|
|
410
|
+
* without repeatedly decoding UTF-8.
|
|
411
|
+
*/
|
|
412
|
+
static VALUE scanner_initialize(const VALUE self, VALUE str, VALUE allow_unicode_ranges) {
|
|
413
|
+
Check_Type(str, T_STRING);
|
|
414
|
+
scanner_t *sc;
|
|
415
|
+
TypedData_Get_Struct(self, scanner_t, &scanner_type, sc);
|
|
416
|
+
|
|
417
|
+
str = rb_str_dup(str);
|
|
418
|
+
sc->str = str;
|
|
419
|
+
sc->p = (const uint8_t *) RSTRING_PTR(str);
|
|
420
|
+
sc->end = sc->p + RSTRING_LEN(str);
|
|
421
|
+
sc->idx_cp = 0;
|
|
422
|
+
sc->line = 1;
|
|
423
|
+
sc->col = 1;
|
|
424
|
+
sc->allow_unicode_ranges = RTEST(allow_unicode_ranges);
|
|
425
|
+
sc->tokens = rb_ary_new();
|
|
426
|
+
|
|
427
|
+
// prime lookahead
|
|
428
|
+
sc->look[0] = next_utf8_cp(&sc->p, sc->end);
|
|
429
|
+
sc->look[1] = next_utf8_cp(&sc->p, sc->end);
|
|
430
|
+
sc->look[2] = next_utf8_cp(&sc->p, sc->end);
|
|
431
|
+
sc->look[3] = next_utf8_cp(&sc->p, sc->end);
|
|
432
|
+
|
|
433
|
+
return self;
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
// -----------------------------------------------------------------------------
|
|
437
|
+
|
|
438
|
+
/**
|
|
439
|
+
* rotate - Advance lookahead buffer after consuming one code point.
|
|
440
|
+
*
|
|
441
|
+
* @sc: Scanner state.
|
|
442
|
+
*
|
|
443
|
+
* Shifts the lookahead window left by one:
|
|
444
|
+
* - look[0] takes the value of look[1]
|
|
445
|
+
* - look[1] takes the value of look[2]
|
|
446
|
+
* - look[2] takes the value of look[3]
|
|
447
|
+
* - look[3] is filled with the next UTF-8 code point from the input
|
|
448
|
+
*
|
|
449
|
+
* Effectively, this "consumes" look[0] and refreshes the buffer so
|
|
450
|
+
* future peeks remain valid without re-decoding the input stream.
|
|
451
|
+
*
|
|
452
|
+
* Typical usage:
|
|
453
|
+
* - Called after a token consumes the current code point.
|
|
454
|
+
* - Keeps `look[]` in sync with `sc->p` as the scanner progresses.
|
|
455
|
+
*/
|
|
456
|
+
static inline void rotate(scanner_t *sc) {
|
|
457
|
+
sc->look[0] = sc->look[1];
|
|
458
|
+
sc->look[1] = sc->look[2];
|
|
459
|
+
sc->look[2] = sc->look[3];
|
|
460
|
+
sc->look[3] = next_utf8_cp(&sc->p, sc->end);
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
/**
 * scanner_start_token - Remember where the token being scanned begins.
 *
 * @sc: Scanner state.
 *
 * Copies the current code point offset, line and column into the
 * start_token_* fields, which the push-token helpers read later.
 *
 * Always returns Qnil.
 */
static VALUE scanner_start_token(scanner_t *sc) {
    sc->start_token_column = sc->col;
    sc->start_token_line = sc->line;
    sc->start_token_offset = sc->idx_cp;
    return Qnil;
}
|
|
469
|
+
|
|
470
|
+
/**
 * scanner_consume - Consume the current code point (look[0]).
 *
 * @sc: Scanner state.
 *
 * Does nothing at end of input. Otherwise advances the code point offset,
 * updates line/column (a NEWLINE starts a new line at column 1, anything
 * else moves one column right) and slides the lookahead window.
 */
static void scanner_consume(scanner_t *sc) {
    const int current = sc->look[0];

    if (current != EOF_CP) {
        sc->idx_cp++;

        if (current != NEWLINE) {
            sc->col++;
        } else {
            sc->line++;
            sc->col = 1;
        }

        rotate(sc);
    }
}
|
|
484
|
+
|
|
485
|
+
// -----------------------------------------------------------------------------
|
|
486
|
+
|
|
487
|
+
/**
 * decode_escape - Consume the body of a CSS escape and return its value.
 *
 * @sc: Scanner state; look[0] is the code point right after the backslash
 *      (callers consume the backslash before calling).
 *
 * Cases:
 *   - End of input: nothing is consumed, returns 0xFFFD.
 *   - Hex escape: consumes up to six hex digits, then one trailing
 *     whitespace code point if present. Returns the value, or 0xFFFD when
 *     it is zero, a surrogate, or above U+10FFFF.
 *   - Any other code point except NEWLINE: consumed and returned as-is.
 *   - NEWLINE: nothing is consumed, returns 0xFFFD.
 */
static uint32_t decode_escape(scanner_t *sc) {
    const int first = sc->look[0];

    if (first == EOF_CP) {
        return 0xFFFD;
    }

    if (is_hex(first)) {
        uint32_t value = 0;

        for (int digits = 0; digits < 6 && is_hex(sc->look[0]); digits++) {
            const int h = sc->look[0];
            scanner_consume(sc);

            // h is a hex digit: either '0'..'9' or a letter of either case.
            const uint32_t nibble = (h <= '9')
                ? (uint32_t) (h - '0')
                : (uint32_t) ((h | 0x20) - 'a' + 10);
            value = (value << 4) | nibble;
        }

        if (is_ws(sc->look[0])) {
            scanner_consume(sc);
        }

        const int surrogate = value >= 0xD800 && value <= 0xDFFF;
        if (value == 0 || value > 0x10FFFF || surrogate) {
            return 0xFFFD;
        }
        return value;
    }

    if (is_newline(first)) {
        return 0xFFFD; // backslash + newline
    }

    scanner_consume(sc);
    return first;
}
|
|
528
|
+
|
|
529
|
+
/**
 * scanner_valid_escape_c - Check whether two code points start a valid escape.
 *
 * @cp1: First code point.
 * @cp2: Second code point.
 *
 * Returns true when @cp1 is a backslash (U+005C) and @cp2 is not NEWLINE.
 *
 * The parameters are `int` because callers pass full code points from the
 * lookahead buffer. Narrowing them to `char` keeps only the low byte, so an
 * unrelated code point such as U+015C would compare equal to the backslash.
 */
static bool scanner_valid_escape_c(const int cp1, const int cp2) {
    return cp1 == 0x5C /* REVERSE_SOLIDUS */ && !is_newline(cp2);
}

// Shorthand: valid-escape check on the next two code points of a local `sc`.
#define scanner_valid_escape scanner_valid_escape_c(sc->look[0], sc->look[1])
|
|
534
|
+
|
|
535
|
+
static bool scanner_ident_sequence_start(const scanner_t *sc) {
|
|
536
|
+
const int p0 = sc->look[0], p1 = sc->look[1], p2 = sc->look[2];
|
|
537
|
+
if (p0 == 0x2D /* '-' */) {
|
|
538
|
+
if (ident_start(p1) || p1 == 0x2D) return true;
|
|
539
|
+
if (p1 == 0x5C /* '\' */ && !is_newline(p2)) return true;
|
|
540
|
+
return false;
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
if (ident_start(p0)) {
|
|
544
|
+
return true;
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
if (p0 == 0x5C) {
|
|
548
|
+
return !is_newline(p1);
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
return false;
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
static VALUE scanner_unicode_range_start(const scanner_t *sc) {
|
|
555
|
+
const int p0 = sc->look[0], p1 = sc->look[1], p2 = sc->look[2];
|
|
556
|
+
if ((p0 == 0x75 || p0 == 0x55) && p1 == 0x2B /* '+' */) {
|
|
557
|
+
if (p2 == 0x3F /* '?' */ || is_hex(p2)) return Qtrue;
|
|
558
|
+
}
|
|
559
|
+
return Qfalse;
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
/**
 * scanner_consume_ident_sequence - Consume an identifier sequence.
 *
 * @sc: Scanner state.
 *
 * Consumes identifier code points and escapes for as long as they
 * continue and returns them as a new UTF-8 Ruby String:
 *   - identifier code points are appended unchanged;
 *   - a backslash at the very end of input is consumed and appended as
 *     U+FFFD;
 *   - a backslash not followed by NEWLINE is consumed and replaced by the
 *     value decode_escape produces;
 *   - anything else (including a backslash followed by NEWLINE) ends the
 *     sequence without being consumed.
 */
static VALUE scanner_consume_ident_sequence(scanner_t *sc) {
    const VALUE out = rb_utf8_str_new("", 0);

    for (int cur = sc->look[0]; cur != EOF_CP; cur = sc->look[0]) {
        if (ident_point(cur)) {
            rb_str_cat_codepoint(out, cur);
            scanner_consume(sc);
            continue;
        }

        if (cur != 0x5C /* '\' */) {
            break; // not part of an identifier
        }

        const int next = sc->look[1];
        if (next == EOF_CP) {
            scanner_consume(sc); // consume '\'
            APPEND_REPLACEMENT(out);
            continue;
        }
        if (is_newline(next)) {
            break; // backslash + newline is not a valid escape
        }

        scanner_consume(sc); // consume '\'
        rb_str_cat_codepoint(out, decode_escape(sc));
    }

    return out;
}
|
|
596
|
+
|
|
597
|
+
/**
 * scanner_consume_escaped_code_point - Consume an escape body and return
 * its value as a Ruby Integer.
 *
 * @sc: Scanner state; look[0] is the code point right after the backslash.
 *
 * Thin wrapper around decode_escape, which implements the exact rules
 * (hex escapes of up to six digits plus one optional trailing whitespace,
 * literal escapes, U+FFFD for end of input, NEWLINE, zero, surrogates and
 * values above U+10FFFF). Keeping a single implementation prevents the two
 * entry points from drifting apart.
 *
 * Returns the code point boxed with INT2NUM; never nil.
 */
static VALUE scanner_consume_escaped_code_point(scanner_t *sc) {
    return INT2NUM((int) decode_escape(sc));
}
|
|
638
|
+
|
|
639
|
+
static void scanner_push_token_opts(const scanner_t *sc, const VALUE type, const VALUE opts) {
|
|
640
|
+
const VALUE pos_start = rb_funcall(cPosition, id_type_new, 3,
|
|
641
|
+
LONG2NUM(sc->start_token_offset),
|
|
642
|
+
LONG2NUM(sc->start_token_line),
|
|
643
|
+
LONG2NUM(sc->start_token_column));
|
|
644
|
+
const VALUE pos_end = rb_funcall(cPosition, id_type_new, 3,
|
|
645
|
+
LONG2NUM(sc->idx_cp),
|
|
646
|
+
LONG2NUM(sc->line),
|
|
647
|
+
LONG2NUM(sc->col));
|
|
648
|
+
|
|
649
|
+
const VALUE tokenArgv[4] = {type, pos_start, pos_end, opts};
|
|
650
|
+
const VALUE token = rb_funcallv_kw(cToken, id_type_new, 4, tokenArgv,
|
|
651
|
+
RB_PASS_KEYWORDS);
|
|
652
|
+
rb_ary_push(sc->tokens, token);
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
static void scanner_push_token_simple(const scanner_t *sc, const VALUE type) {
|
|
656
|
+
const VALUE pos_start = rb_funcall(cPosition, id_type_new, 3,
|
|
657
|
+
LONG2NUM(sc->start_token_offset),
|
|
658
|
+
LONG2NUM(sc->start_token_line),
|
|
659
|
+
LONG2NUM(sc->start_token_column));
|
|
660
|
+
const VALUE pos_end = rb_funcall(cPosition, id_type_new, 3,
|
|
661
|
+
LONG2NUM(sc->idx_cp),
|
|
662
|
+
LONG2NUM(sc->line),
|
|
663
|
+
LONG2NUM(sc->col));
|
|
664
|
+
|
|
665
|
+
const VALUE tokenArgv[4] = {type, pos_start, pos_end};
|
|
666
|
+
const VALUE token = rb_funcallv(cToken, id_type_new, 3, tokenArgv);
|
|
667
|
+
rb_ary_push(sc->tokens, token);
|
|
668
|
+
}
|
|
669
|
+
|
|
670
|
+
/**
 * scanner_consume_unicode_range - Consume a unicode-range token.
 *
 * @sc: Scanner state; look[0..1] are the "U+" / "u+" prefix.
 *
 * Accepted forms (after the prefix):
 *   - up to six hex digits followed by '?' wildcards, six characters in
 *     total; each '?' counts as 0 for the start and F for the end;
 *   - up to six hex digits, optionally followed by '-' and up to six more
 *     hex digits giving the end of the range. Without the second part the
 *     end equals the start.
 *
 * Pushes a unicode-range token whose options hash holds the start and end
 * values. The token's start position is whatever the caller recorded; this
 * function does not call scanner_start_token.
 */
static void scanner_consume_unicode_range(scanner_t *sc) {
    // consume 'U+' or 'u+'
    scanner_consume(sc);
    scanner_consume(sc);

    uint32_t first = 0;
    uint32_t last = 0;
    int taken = 0;     // characters consumed so far (digits + wildcards)
    int wildcard = 0;  // set once a '?' has been seen

    // leading hex digits contribute the same nibble to both bounds
    while (taken < 6 && is_hex(sc->look[0])) {
        const int h = sc->look[0];
        const uint32_t nibble = (uint32_t) (is_digit(h) ? h - '0' : (h | 0x20) - 'a' + 10);
        first = (first << 4) | nibble;
        last = (last << 4) | nibble;
        taken++;
        scanner_consume(sc);
    }

    // wildcards: 0 for the start bound, F for the end bound
    while (taken < 6 && sc->look[0] == 0x3F /* '?' */) {
        first <<= 4;
        last = (last << 4) | 0xF;
        wildcard = 1;
        taken++;
        scanner_consume(sc);
    }

    // explicit "-END" part, only allowed when no wildcard was used
    if (!wildcard && sc->look[0] == '-' && is_hex(sc->look[1])) {
        scanner_consume(sc); // consume '-'
        last = 0;
        for (int n = 0; n < 6 && is_hex(sc->look[0]); n++) {
            const int h = sc->look[0];
            const uint32_t nibble = (uint32_t) (is_digit(h) ? h - '0' : (h | 0x20) - 'a' + 10);
            last = (last << 4) | nibble;
            scanner_consume(sc);
        }
    }

    const VALUE hash = rb_hash_new();
    rb_hash_aset(hash, sym_start, UINT2NUM(first));
    rb_hash_aset(hash, sym_end, UINT2NUM(last));
    scanner_push_token_opts(sc, sym_unicode_range, hash);
}
|
|
750
|
+
|
|
751
|
+
/**
 * scanner_consume_comments - Skip every comment at the current position.
 *
 * @sc: Scanner state.
 *
 * While the input continues with a comment opener, consumes through the
 * matching closer and looks for another comment directly after it. An
 * unterminated comment consumes the rest of the input. No token is pushed.
 */
static void scanner_consume_comments(scanner_t *sc) {
    while (sc->look[0] == 0x2F /* '/' */ && sc->look[1] == 0x2A /* '*' */) {
        // consume the opener
        scanner_consume(sc);
        scanner_consume(sc);

        // scan until the closer or EOF
        while (!(sc->look[0] == 0x2A /* '*' */ && sc->look[1] == 0x2F /* '/' */)) {
            if (sc->look[0] == EOF_CP) {
                return; // unterminated comment: stop
            }
            scanner_consume(sc);
        }

        // consume the closer
        scanner_consume(sc);
        scanner_consume(sc);
    }
}
|
|
777
|
+
|
|
778
|
+
/**
 * scanner_consume_whitespace - Consume a run of whitespace and comments.
 *
 * @sc: Scanner state.
 *
 * Comments count as whitespace. The token start is recorded when the
 * first whitespace code point or comment is found; if anything was
 * consumed, one whitespace token covering the whole run is pushed.
 * Nothing is pushed when the input starts with neither.
 */
static void scanner_consume_whitespace(scanner_t *sc) {
    int started = 0;

    for (;;) {
        const int at_comment =
            sc->look[0] == 0x2F /* '/' */ && sc->look[1] == 0x2A /* '*' */;

        if (!at_comment && !is_ws(sc->look[0])) {
            break; // neither a comment nor whitespace
        }

        if (!started) {
            scanner_start_token(sc);
            started = 1;
        }

        if (at_comment) {
            scanner_consume_comments(sc);
        } else {
            scanner_consume(sc);
        }
    }

    if (started) {
        scanner_push_token_simple(sc, sym_whitespace);
    }
}
|
|
806
|
+
|
|
807
|
+
/**
 * scanner_consume_string_token_c - Consume a quoted string token.
 *
 * @sc:            Scanner state; look[0] is the first code point of the
 *                 string contents.
 * @closing_token: Code point that terminates the string.
 *
 * Records the token start, then reads code points until @closing_token or
 * end of input:
 *   - backslash at end of input: the backslash is dropped;
 *   - backslash + NEWLINE: both are dropped (line continuation);
 *   - backslash + anything else: decoded with decode_escape and appended;
 *   - an unescaped NEWLINE: a bad-string token is pushed and the function
 *     returns without consuming the NEWLINE;
 *   - any other code point is appended unchanged.
 *
 * On termination a string token is pushed with the decoded literal and
 * the quote character. The token is pushed before the closing quote is
 * consumed, so its end position does not include the quote.
 *
 * Notes:
 *   - Once the EOF and NEWLINE cases are ruled out, a backslash always
 *     starts a valid escape, so no further validity check is made. This
 *     keeps full code points from being compared after narrowing to a
 *     single byte, which misclassified escapes of code points whose low
 *     byte equals NEWLINE.
 *   - The escape value is used directly instead of being boxed into a
 *     Ruby Integer and unboxed again.
 */
static void scanner_consume_string_token_c(scanner_t *sc, const int closing_token) {
    scanner_start_token(sc);

    const VALUE str = rb_str_new("", 0);
    rb_enc_associate_index(str, rb_utf8_encindex());

    for (;;) {
        const int char_cp = sc->look[0];
        if (char_cp == EOF_CP || char_cp == closing_token) {
            break; // end of input or closing quote
        }

        if (char_cp == REVERSE_SOLIDUS) {
            const int p1 = sc->look[1];

            scanner_consume(sc); // the backslash is consumed in every case

            if (p1 == EOF_CP) {
                continue; // trailing backslash: drop it
            }
            if (is_newline(p1)) {
                scanner_consume(sc); // escaped newline: drop it too
                continue;
            }

            rb_str_cat_codepoint(str, decode_escape(sc));
            continue;
        }

        if (is_newline(char_cp)) {
            scanner_push_token_simple(sc, sym_bad_string);
            return;
        }

        // normal consume
        scanner_consume(sc);
        rb_str_cat_codepoint(str, (uint32_t) char_cp);
    }

    // quoting = [closing_token].pack("U*")
    const VALUE quoting = rb_utf8_str_new("", 0);
    rb_str_cat_codepoint(quoting, (uint32_t) closing_token);

    // push_token(:string, literal: str, quoting: quoting)
    const VALUE kwargs = rb_hash_new();
    rb_hash_aset(kwargs, sym_literal, str);
    rb_hash_aset(kwargs, sym_quoting, quoting);
    scanner_push_token_opts(sc, sym_string, kwargs);

    if (sc->look[0] != EOF_CP) {
        scanner_consume(sc); // closing quote
    }
}
|
|
891
|
+
|
|
892
|
+
/**
 * struct numeric_repr - Result of reading a numeric literal.
 *
 * @type:  'i' for an integer, 'n' for any other number.
 * @sign:  '+' or '-' when the literal carried an explicit sign, '\0' otherwise.
 * @value: Numeric value of the literal.
 */
typedef struct numeric_repr {
    char type;
    char sign;
    double value;
} numeric_repr_t;
|
|
897
|
+
|
|
898
|
+
/**
 * scanner_read_numeric_token - Read a numeric literal from the input.
 *
 * @sc:  Scanner positioned at the first code point of the number
 *       (sign, digit, or '.').
 * @out: Receives the sign character, the 'i'/'n' type marker and the value.
 *
 * Consumes an optional sign, integer digits, an optional fraction and an
 * optional exponent. @out->type starts as 'i' and becomes 'n' as soon as a
 * fraction or an exponent is present.
 *
 * The mantissa and exponent are collected into fixed-size scratch buffers
 * before being handed to u32_chars_to_double(). Every write is bounds-checked
 * so that an arbitrarily long literal in the stylesheet cannot overrun the
 * stack:
 *   - repeated leading zeros are not stored (they do not change the value);
 *   - integer digits that do not fit are counted and applied afterwards as a
 *     power-of-ten scale;
 *   - fraction digits that do not fit are dropped, which only loses precision
 *     far beyond what a double can hold. Known limit: a fraction with more
 *     than ~126 leading zeros reads as 0.
 * The whole literal is always consumed from the input, whatever is stored.
 */
static void scanner_read_numeric_token(scanner_t *sc, numeric_repr_t *out) {
    enum {
        PART_CAP = 128,           // capacity of each scratch buffer, in code points
        DROPPED_DIGITS_MAX = 4096 // saturation point; 10^4096 is already infinite
    };

    uint32_t number_part[PART_CAP];
    int number_part_idx = 0;
    uint32_t exponent_part[PART_CAP];
    int exponent_part_idx = 0;
    int dropped_int_digits = 0; // integer digits that did not fit in number_part

    scanner_start_token(sc);
    out->type = 'i';
    out->sign = 0x00;

    // Sign
    if (sc->look[0] == PLUS_SIGN || sc->look[0] == HYPHEN_MINUS) {
        out->sign = (char) sc->look[0];
        scanner_consume(sc);
        number_part[number_part_idx++] = (unsigned char) out->sign;
    }

    // Integer digits
    bool int_digit_stored = false;
    bool int_nonzero_seen = false;
    while (is_digit(sc->look[0])) {
        const uint32_t digit = sc->look[0];
        if (digit != 0x30 /* '0' */) {
            int_nonzero_seen = true;
        }

        if (!int_nonzero_seen && int_digit_stored) {
            // Redundant leading zero: one stored zero is enough.
        } else if (number_part_idx < PART_CAP) {
            number_part[number_part_idx++] = digit;
            int_digit_stored = true;
        } else if (dropped_int_digits < DROPPED_DIGITS_MAX) {
            dropped_int_digits++;
        }
        scanner_consume(sc);
    }

    // Fraction
    if (sc->look[0] == FULL_STOP && is_digit(sc->look[1])) {
        // Keep the fraction only when the integer part was stored in full and
        // there is room for the '.' plus at least one digit; otherwise the
        // stored text would end in a dangling '.' or be misaligned.
        const bool keep_fraction =
            dropped_int_digits == 0 && number_part_idx + 1 < PART_CAP;

        if (keep_fraction) {
            number_part[number_part_idx++] = sc->look[0]; // the `.`
        }
        scanner_consume(sc);

        while (is_digit(sc->look[0])) {
            if (keep_fraction && number_part_idx < PART_CAP) {
                number_part[number_part_idx++] = sc->look[0];
            }
            scanner_consume(sc);
        }
        out->type = 'n';
    }

    // Exponent: 'e' or 'E', an optional sign, then at least one digit.
    const uint32_t p = sc->look[0];
    const uint32_t p1 = sc->look[1];
    const uint32_t p2 = sc->look[2];
    if (
        (p == 0x45 || p == 0x65) &&
        (((p1 == HYPHEN_MINUS || p1 == PLUS_SIGN) && is_digit((int) p2)) || is_digit((int) p1))
    ) {
        scanner_consume(sc); // consume 'e' or 'E'
        if (sc->look[0] == HYPHEN_MINUS || sc->look[0] == PLUS_SIGN) {
            exponent_part[exponent_part_idx++] = sc->look[0];
            scanner_consume(sc);
        }

        bool exp_digit_stored = false;
        bool exp_nonzero_seen = false;
        while (is_digit(sc->look[0])) {
            const uint32_t digit = sc->look[0];
            if (digit != 0x30 /* '0' */) {
                exp_nonzero_seen = true;
            }

            // Same leading-zero rule as the integer part. Significant digits
            // that do not fit are dropped: an exponent that long is out of
            // double range either way.
            const bool redundant_zero = !exp_nonzero_seen && exp_digit_stored;
            if (!redundant_zero && exponent_part_idx < PART_CAP) {
                exponent_part[exponent_part_idx++] = digit;
                exp_digit_stored = true;
            }
            scanner_consume(sc);
        }
        out->type = 'n';
    }

    double value = u32_chars_to_double(number_part, number_part_idx);

    // Zero stays zero whatever the scale; skipping the multiplication avoids
    // 0 * inf (NaN) for inputs such as `0e999`.
    if (value != 0.0) {
        if (dropped_int_digits != 0) {
            value *= pow(10.0, (double) dropped_int_digits);
        }
        if (exponent_part_idx != 0) {
            const double exponent = u32_chars_to_double(exponent_part, exponent_part_idx);
            // Scale in double precision: the single-precision powf() is
            // already inexact at 1e11 and overflows above 1e38.
            value *= pow(10.0, exponent);
        }
    }

    out->value = value;
}
|
|
959
|
+
|
|
960
|
+
/**
 * scanner_consume_numeric_token - Read a number and push the matching token.
 *
 * @sc: Scanner positioned at the start of a numeric literal.
 *
 * Builds an options hash with `value:`, `type:` (:integer or :number) and,
 * when the literal had an explicit sign, `sign_character:`. The token pushed
 * is :dimension (with `unit:`) when an identifier follows the number,
 * :percentage when a '%' follows (the '%' is consumed), and :number otherwise.
 */
static void scanner_consume_numeric_token(scanner_t *sc) {
    numeric_repr_t repr;
    scanner_read_numeric_token(sc, &repr);

    const VALUE opts = rb_hash_new();
    rb_hash_aset(opts, sym_value, DBL2NUM(repr.value));

    const VALUE number_kind = (repr.type == 'i') ? sym_integer : sym_number;
    rb_hash_aset(opts, sym_type, number_kind);

    if (repr.sign != 0x00) {
        rb_hash_aset(opts, sym_sign_character, rb_utf8_str_new(&repr.sign, 1));
    }

    if (scanner_ident_sequence_start(sc)) {
        rb_hash_aset(opts, sym_unit, scanner_consume_ident_sequence(sc));
        scanner_push_token_opts(sc, sym_dimension, opts);
        return;
    }

    if (sc->look[0] == PERCENTAGE_SIGN) {
        scanner_consume(sc);
        scanner_push_token_opts(sc, sym_percentage, opts);
        return;
    }

    scanner_push_token_opts(sc, sym_number, opts);
}
|
|
982
|
+
|
|
983
|
+
/**
 * is_url - Check whether a C string is exactly "url", ignoring ASCII case.
 *
 * @str: NUL-terminated string to test.
 *
 * Returns true only for a three-character string matching "url"
 * case-insensitively. Reading stops at the first mismatch, so the
 * terminator of a shorter string is never passed.
 */
static inline bool is_url(const char *str) {
    static const char expected[] = "url";

    for (int i = 0; i < 3; i++) {
        // A NUL byte lowers to 0, which never matches, so short strings
        // bail out here.
        if (tolower((unsigned char) str[i]) != expected[i]) {
            return false;
        }
    }
    return str[3] == '\0';
}
|
|
990
|
+
|
|
991
|
+
/**
 * scanner_consume_ident_like_token - Read an identifier and push the token
 * it turns out to be.
 *
 * @sc: Scanner positioned at the start of an identifier sequence.
 *
 * Outcomes, in order of precedence:
 *   - name is "url" (any case) followed by '(': the '(' is consumed and all
 *     but the last whitespace code point after it is skipped. If a quote
 *     comes next (optionally after one whitespace), a :function token is
 *     pushed; otherwise the rest is handed to scanner_consume_url_token().
 *   - any other name followed by '(': the '(' is consumed and a :function
 *     token is pushed with `literal:` (name plus "(") and `name:`.
 *   - otherwise: an :ident token is pushed with `literal:`.
 */
static void scanner_consume_ident_like_token(scanner_t *sc) {
    scanner_start_token(sc);

    VALUE name = scanner_consume_ident_sequence(sc);
    const char *name_cstr = rb_string_value_cstr(&name);

    const bool looks_like_url =
        rb_str_strlen(name) >= 3 && is_url(name_cstr) && sc->look[0] == LEFT_PARENTHESIS;

    if (looks_like_url) {
        scanner_consume(sc); // consume '('

        // Leave at most one whitespace code point in front of the argument.
        while (is_ws(sc->look[0]) && is_ws(sc->look[1])) {
            scanner_consume(sc);
        }

        const bool quote_now =
            sc->look[0] == QUOTATION_MARK || sc->look[0] == APOSTROPHE;
        const bool quote_after_ws =
            is_ws(sc->look[0]) &&
            (sc->look[1] == QUOTATION_MARK || sc->look[1] == APOSTROPHE);

        if (!(quote_now || quote_after_ws)) {
            scanner_consume_url_token(sc);
            return;
        }

        // NOTE(review): this function token carries `value:` while the
        // generic branch below carries `literal:`/`name:` — confirm that the
        // consumer of :function tokens expects both shapes.
        const VALUE url_fn_opts = rb_hash_new();
        rb_hash_aset(url_fn_opts, sym_value, name);
        scanner_push_token_opts(sc, sym_function, url_fn_opts);
        return;
    }

    if (sc->look[0] == LEFT_PARENTHESIS) {
        scanner_consume(sc);

        const VALUE with_paren = rb_str_dup(name);
        rb_str_cat_cstr(with_paren, "(");

        const VALUE fn_opts = rb_hash_new();
        rb_hash_aset(fn_opts, sym_literal, with_paren);
        rb_hash_aset(fn_opts, sym_name, name);
        scanner_push_token_opts(sc, sym_function, fn_opts);
        return;
    }

    const VALUE ident_opts = rb_hash_new();
    rb_hash_aset(ident_opts, sym_literal, name);
    scanner_push_token_opts(sc, sym_ident, ident_opts);
}
|
|
1030
|
+
|
|
1031
|
+
/**
 * scanner_consume_url_token - Read the body of an unquoted url(...).
 *
 * @sc: Scanner positioned just after "url(".
 *
 * Skips leading whitespace, then accumulates code points into a UTF-8 Ruby
 * string until ')' (consumed) or end of input, and pushes a :url token with
 * `value:`. Whitespace inside the body is only accepted when it is followed
 * by ')' or end of input. A quote, '(', a code point <= 0x1F, 0x7F, a
 * backslash that does not start a valid escape, or whitespace followed by
 * more content makes the URL bad: the remainder is skipped with
 * scanner_consume_bad_url() and a :bad_url token is pushed.
 */
static void scanner_consume_url_token(scanner_t *sc) {
    scanner_start_token(sc);

    const VALUE url = rb_str_new("", 0);
    rb_enc_associate_index(url, rb_utf8_encindex());

    // Leading whitespace is not part of the value.
    while (is_ws(sc->look[0])) {
        scanner_consume(sc);
    }

    bool malformed = false;

    for (;;) {
        const int cur = sc->look[0];

        if (cur == 0x29 /* ')' */) {
            scanner_consume(sc);
            break;
        }

        if (cur == EOF_CP) {
            break;
        }

        if (is_ws(cur)) {
            do {
                scanner_consume(sc);
            } while (is_ws(sc->look[0]));

            if (sc->look[0] == 0x29 /* ')' */) {
                scanner_consume(sc);
            } else if (sc->look[0] != EOF_CP) {
                // Content after inner whitespace is not allowed.
                malformed = true;
            }
            break;
        }

        if (cur == QUOTATION_MARK || cur == APOSTROPHE || cur == 0x28 /* '(' */ ||
            cur <= 0x1F || cur == 0x7F) {
            // Quotes, '(' and non-printable code points.
            malformed = true;
            break;
        }

        if (cur == REVERSE_SOLIDUS) {
            if (scanner_valid_escape) {
                scanner_consume(sc); // consume '\'
                const VALUE escaped = scanner_consume_escaped_code_point(sc);
                if (!NIL_P(escaped)) {
                    const uint32_t escaped_cp = NUM2UINT(escaped);
                    rb_str_cat_codepoint(url, escaped_cp);
                }
                continue;
            } else {
                malformed = true;
                break;
            }
        }

        // Ordinary code point.
        scanner_consume(sc);
        rb_str_cat_codepoint(url, cur);
    }

    if (malformed) {
        scanner_consume_bad_url(sc);
        scanner_push_token_simple(sc, sym_bad_url);
        return;
    }

    const VALUE opts = rb_hash_new();
    rb_hash_aset(opts, sym_value, url);
    scanner_push_token_opts(sc, sym_url, opts);
}
|
|
1111
|
+
|
|
1112
|
+
/**
 * scanner_consume_bad_url - Skip the remainder of a malformed url(...).
 *
 * @sc: Scanner positioned somewhere inside the malformed URL.
 *
 * Consumes input up to and including the next ')' or until end of input.
 * Valid escapes are consumed as a unit so that an escaped ')' does not end
 * the skip early. No token is pushed here; callers push :bad_url themselves.
 */
static void scanner_consume_bad_url(scanner_t *sc) {
    // End of input is tested against EOF_CP, the sentinel the rest of the
    // scanner uses for the lookahead; stdio's EOF is only guaranteed to be
    // some negative int.
    while (sc->look[0] != EOF_CP) {
        if (sc->look[0] == RIGHT_PARENTHESIS) {
            scanner_consume(sc);
            return;
        }

        if (scanner_valid_escape) {
            scanner_consume(sc); // the '\'
            scanner_consume_escaped_code_point(sc);
        } else {
            scanner_consume(sc);
        }
    }
}
|
|
1127
|
+
|
|
1128
|
+
static VALUE scanner_consume_token(const VALUE self) {
|
|
1129
|
+
scanner_t *sc;
|
|
1130
|
+
TypedData_Get_Struct(self, scanner_t, &scanner_type, sc);
|
|
1131
|
+
|
|
1132
|
+
scanner_consume_comments(sc);
|
|
1133
|
+
scanner_consume_whitespace(sc);
|
|
1134
|
+
|
|
1135
|
+
switch (sc->look[0]) {
|
|
1136
|
+
case QUOTATION_MARK:
|
|
1137
|
+
case APOSTROPHE: {
|
|
1138
|
+
const int p = sc->look[0];
|
|
1139
|
+
scanner_consume(sc);
|
|
1140
|
+
scanner_consume_string_token_c(sc, p);
|
|
1141
|
+
return Qtrue;
|
|
1142
|
+
}
|
|
1143
|
+
|
|
1144
|
+
case NUMBER_SIGN: {
|
|
1145
|
+
scanner_start_token(sc);
|
|
1146
|
+
const char cp1 = (char) sc->look[1], cp2 = (char) sc->look[2];
|
|
1147
|
+
|
|
1148
|
+
if (ident_point(sc->look[1]) || scanner_valid_escape_c(cp1, cp2)) {
|
|
1149
|
+
scanner_consume(sc);
|
|
1150
|
+
const VALUE flag = ident_start(sc->look[0]) ? rb_intern("id") : rb_intern("unrestricted");
|
|
1151
|
+
const VALUE value = scanner_consume_ident_sequence(sc);
|
|
1152
|
+
VALUE newValue = rb_str_new_cstr("#");
|
|
1153
|
+
newValue = rb_str_append(newValue, value);
|
|
1154
|
+
|
|
1155
|
+
const VALUE push_token_opts = rb_hash_new();
|
|
1156
|
+
rb_hash_aset(push_token_opts, sym_flag, flag);
|
|
1157
|
+
rb_hash_aset(push_token_opts, sym_literal, newValue);
|
|
1158
|
+
scanner_push_token_opts(sc, sym_hash, push_token_opts);
|
|
1159
|
+
return Qnil;
|
|
1160
|
+
}
|
|
1161
|
+
scanner_consume(sc);
|
|
1162
|
+
scanner_push_token_simple(sc, sym_delim);
|
|
1163
|
+
return Qnil;
|
|
1164
|
+
}
|
|
1165
|
+
|
|
1166
|
+
case LEFT_PARENTHESIS: {
|
|
1167
|
+
scanner_start_token(sc);
|
|
1168
|
+
scanner_consume(sc);
|
|
1169
|
+
scanner_push_token_simple(sc, sym_left_parenthesis);
|
|
1170
|
+
return Qnil;
|
|
1171
|
+
}
|
|
1172
|
+
|
|
1173
|
+
case RIGHT_PARENTHESIS: {
|
|
1174
|
+
scanner_start_token(sc);
|
|
1175
|
+
scanner_consume(sc);
|
|
1176
|
+
scanner_push_token_simple(sc, sym_right_parenthesis);
|
|
1177
|
+
return Qnil;
|
|
1178
|
+
}
|
|
1179
|
+
|
|
1180
|
+
case PLUS_SIGN: {
|
|
1181
|
+
const char p1 = (char) sc->look[1], p2 = (char) sc->look[2];
|
|
1182
|
+
if (is_digit(p1) || p1 == FULL_STOP && is_digit(p2)) {
|
|
1183
|
+
scanner_consume_numeric_token(sc);
|
|
1184
|
+
return Qnil;
|
|
1185
|
+
}
|
|
1186
|
+
|
|
1187
|
+
scanner_start_token(sc);
|
|
1188
|
+
scanner_consume(sc);
|
|
1189
|
+
scanner_push_token_simple(sc, sym_delim);
|
|
1190
|
+
return Qnil;
|
|
1191
|
+
}
|
|
1192
|
+
|
|
1193
|
+
case COMMA: {
|
|
1194
|
+
scanner_start_token(sc);
|
|
1195
|
+
scanner_consume(sc);
|
|
1196
|
+
scanner_push_token_simple(sc, sym_comma);
|
|
1197
|
+
return Qnil;
|
|
1198
|
+
}
|
|
1199
|
+
|
|
1200
|
+
case HYPHEN_MINUS: {
|
|
1201
|
+
if (is_digit(sc->look[1]) || (sc->look[1] == FULL_STOP && is_digit(sc->look[2]))) {
|
|
1202
|
+
scanner_consume_numeric_token(sc);
|
|
1203
|
+
return Qnil;
|
|
1204
|
+
}
|
|
1205
|
+
|
|
1206
|
+
if (sc->look[1] == HYPHEN_MINUS && sc->look[2] == GREATER_THAN) {
|
|
1207
|
+
scanner_start_token(sc);
|
|
1208
|
+
scanner_consume(sc); // -
|
|
1209
|
+
scanner_consume(sc); // -
|
|
1210
|
+
scanner_consume(sc); // >
|
|
1211
|
+
scanner_push_token_simple(sc, sym_cdc);
|
|
1212
|
+
return Qnil;
|
|
1213
|
+
}
|
|
1214
|
+
|
|
1215
|
+
if (scanner_ident_sequence_start(sc)) {
|
|
1216
|
+
scanner_consume_ident_like_token(sc);
|
|
1217
|
+
return Qnil;
|
|
1218
|
+
}
|
|
1219
|
+
|
|
1220
|
+
scanner_start_token(sc);
|
|
1221
|
+
scanner_consume(sc);
|
|
1222
|
+
scanner_push_token_simple(sc, sym_delim);
|
|
1223
|
+
return Qnil;
|
|
1224
|
+
}
|
|
1225
|
+
|
|
1226
|
+
case FULL_STOP: {
|
|
1227
|
+
if (is_digit(sc->look[1])) {
|
|
1228
|
+
scanner_consume_numeric_token(sc);
|
|
1229
|
+
return Qnil;
|
|
1230
|
+
}
|
|
1231
|
+
|
|
1232
|
+
scanner_start_token(sc);
|
|
1233
|
+
scanner_consume(sc);
|
|
1234
|
+
scanner_push_token_simple(sc, sym_delim);
|
|
1235
|
+
return Qnil;
|
|
1236
|
+
}
|
|
1237
|
+
|
|
1238
|
+
case COLON: {
|
|
1239
|
+
scanner_start_token(sc);
|
|
1240
|
+
scanner_consume(sc);
|
|
1241
|
+
scanner_push_token_simple(sc, sym_colon);
|
|
1242
|
+
return Qnil;
|
|
1243
|
+
}
|
|
1244
|
+
|
|
1245
|
+
case SEMICOLON: {
|
|
1246
|
+
scanner_start_token(sc);
|
|
1247
|
+
scanner_consume(sc);
|
|
1248
|
+
scanner_push_token_simple(sc, sym_semicolon);
|
|
1249
|
+
return Qnil;
|
|
1250
|
+
}
|
|
1251
|
+
|
|
1252
|
+
case LESS_THAN: {
|
|
1253
|
+
scanner_start_token(sc);
|
|
1254
|
+
if (sc->look[0] == LESS_THAN &&
|
|
1255
|
+
sc->look[1] == EXCLAMATION_MARK &&
|
|
1256
|
+
sc->look[2] == HYPHEN_MINUS &&
|
|
1257
|
+
sc->look[3] == HYPHEN_MINUS) {
|
|
1258
|
+
scanner_consume(sc); // <
|
|
1259
|
+
scanner_consume(sc); // !
|
|
1260
|
+
scanner_consume(sc); // -
|
|
1261
|
+
scanner_consume(sc); // -
|
|
1262
|
+
scanner_push_token_simple(sc, sym_cdo);
|
|
1263
|
+
return Qnil;
|
|
1264
|
+
}
|
|
1265
|
+
|
|
1266
|
+
scanner_start_token(sc);
|
|
1267
|
+
scanner_consume(sc);
|
|
1268
|
+
scanner_push_token_simple(sc, sym_delim);
|
|
1269
|
+
return Qnil;
|
|
1270
|
+
}
|
|
1271
|
+
|
|
1272
|
+
case COMMERCIAL_AT: {
|
|
1273
|
+
scanner_start_token(sc);
|
|
1274
|
+
scanner_consume(sc);
|
|
1275
|
+
|
|
1276
|
+
if (scanner_ident_sequence_start(sc)) {
|
|
1277
|
+
const VALUE raw_val = scanner_consume_ident_sequence(sc);
|
|
1278
|
+
const VALUE val = rb_str_new_cstr("@");
|
|
1279
|
+
rb_str_append(val, raw_val);
|
|
1280
|
+
const VALUE push_opts = rb_hash_new();
|
|
1281
|
+
rb_hash_aset(push_opts, sym_literal, val);
|
|
1282
|
+
scanner_push_token_opts(sc, sym_at_keyword, push_opts);
|
|
1283
|
+
return Qnil;
|
|
1284
|
+
}
|
|
1285
|
+
|
|
1286
|
+
scanner_push_token_simple(sc, sym_delim);
|
|
1287
|
+
return Qnil;
|
|
1288
|
+
}
|
|
1289
|
+
|
|
1290
|
+
case LEFT_SQUARE_BRACKET: {
|
|
1291
|
+
scanner_start_token(sc);
|
|
1292
|
+
scanner_consume(sc);
|
|
1293
|
+
scanner_push_token_simple(sc, sym_left_square_bracket);
|
|
1294
|
+
return Qnil;
|
|
1295
|
+
}
|
|
1296
|
+
|
|
1297
|
+
case REVERSE_SOLIDUS: {
|
|
1298
|
+
if (scanner_valid_escape) {
|
|
1299
|
+
scanner_consume_ident_like_token(sc);
|
|
1300
|
+
return Qnil;
|
|
1301
|
+
}
|
|
1302
|
+
|
|
1303
|
+
scanner_start_token(sc);
|
|
1304
|
+
scanner_consume(sc);
|
|
1305
|
+
scanner_push_token_simple(sc, sym_delim);
|
|
1306
|
+
return Qnil;
|
|
1307
|
+
}
|
|
1308
|
+
|
|
1309
|
+
case RIGHT_SQUARE_BRACKET: {
|
|
1310
|
+
scanner_start_token(sc);
|
|
1311
|
+
scanner_consume(sc);
|
|
1312
|
+
scanner_push_token_simple(sc, sym_right_square_bracket);
|
|
1313
|
+
return Qnil;
|
|
1314
|
+
}
|
|
1315
|
+
|
|
1316
|
+
case LEFT_CURLY: {
|
|
1317
|
+
scanner_start_token(sc);
|
|
1318
|
+
scanner_consume(sc);
|
|
1319
|
+
scanner_push_token_simple(sc, sym_left_curly);
|
|
1320
|
+
return Qnil;
|
|
1321
|
+
}
|
|
1322
|
+
|
|
1323
|
+
case RIGHT_CURLY: {
|
|
1324
|
+
scanner_start_token(sc);
|
|
1325
|
+
scanner_consume(sc);
|
|
1326
|
+
scanner_push_token_simple(sc, sym_right_curly);
|
|
1327
|
+
return Qnil;
|
|
1328
|
+
}
|
|
1329
|
+
|
|
1330
|
+
case 0x75:
|
|
1331
|
+
case 0x55:
|
|
1332
|
+
if (sc->allow_unicode_ranges && scanner_unicode_range_start(sc) == Qtrue) {
|
|
1333
|
+
scanner_consume_unicode_range(sc);
|
|
1334
|
+
return Qnil;
|
|
1335
|
+
}
|
|
1336
|
+
|
|
1337
|
+
scanner_consume_ident_like_token(sc);
|
|
1338
|
+
return Qnil;
|
|
1339
|
+
|
|
1340
|
+
default: {
|
|
1341
|
+
if (sc->look[0] == EOF) {
|
|
1342
|
+
return Qnil;
|
|
1343
|
+
}
|
|
1344
|
+
|
|
1345
|
+
if (is_ws(sc->look[0])) {
|
|
1346
|
+
scanner_consume_whitespace(sc);
|
|
1347
|
+
return Qnil;
|
|
1348
|
+
}
|
|
1349
|
+
|
|
1350
|
+
if (is_digit(sc->look[0])) {
|
|
1351
|
+
scanner_consume_numeric_token(sc);
|
|
1352
|
+
return Qnil;
|
|
1353
|
+
}
|
|
1354
|
+
|
|
1355
|
+
if (ident_start(sc->look[0])) {
|
|
1356
|
+
scanner_consume_ident_like_token(sc);
|
|
1357
|
+
return Qnil;
|
|
1358
|
+
}
|
|
1359
|
+
|
|
1360
|
+
scanner_start_token(sc);
|
|
1361
|
+
scanner_consume(sc);
|
|
1362
|
+
scanner_push_token_simple(sc, sym_delim);
|
|
1363
|
+
return Qnil;
|
|
1364
|
+
}
|
|
1365
|
+
}
|
|
1366
|
+
}
|
|
1367
|
+
|
|
1368
|
+
static VALUE scanner_tokens(const VALUE self) {
|
|
1369
|
+
scanner_t *sc;
|
|
1370
|
+
TypedData_Get_Struct(self, scanner_t, &scanner_type, sc);
|
|
1371
|
+
return sc->tokens;
|
|
1372
|
+
}
|
|
1373
|
+
|
|
1374
|
+
static VALUE scanner_at_eof(const VALUE self) {
|
|
1375
|
+
scanner_t *sc;
|
|
1376
|
+
TypedData_Get_Struct(self, scanner_t, &scanner_type, sc);
|
|
1377
|
+
return sc->look[0] == EOF ? Qtrue : Qfalse;
|
|
1378
|
+
}
|
|
1379
|
+
|
|
1380
|
+
void Init_minicss_scanner(void) {
|
|
1381
|
+
const VALUE mMiniCSS = rb_define_module("MiniCSS");
|
|
1382
|
+
const VALUE mCSS = rb_define_module_under(mMiniCSS, "CSS");
|
|
1383
|
+
const VALUE cScanner = rb_define_class_under(mCSS, "Scanner", rb_cObject);
|
|
1384
|
+
|
|
1385
|
+
cPosition = rb_path2class("MiniCSS::CSS::Position");
|
|
1386
|
+
cToken = rb_path2class("MiniCSS::CSS::Token");
|
|
1387
|
+
|
|
1388
|
+
// Initialize values reused across functions
|
|
1389
|
+
INITIALIZE_REUSABLE_SYMBOL(line);
|
|
1390
|
+
INITIALIZE_REUSABLE_SYMBOL(column);
|
|
1391
|
+
INITIALIZE_REUSABLE_SYMBOL(offset);
|
|
1392
|
+
INITIALIZE_REUSABLE_SYMBOL(whitespace);
|
|
1393
|
+
INITIALIZE_REUSABLE_SYMBOL(start);
|
|
1394
|
+
INITIALIZE_REUSABLE_SYMBOL(end);
|
|
1395
|
+
INITIALIZE_REUSABLE_SYMBOL(unicode_range);
|
|
1396
|
+
INITIALIZE_REUSABLE_SYMBOL(new);
|
|
1397
|
+
INITIALIZE_REUSABLE_SYMBOL(bad_string);
|
|
1398
|
+
INITIALIZE_REUSABLE_SYMBOL(literal);
|
|
1399
|
+
INITIALIZE_REUSABLE_SYMBOL(quoting);
|
|
1400
|
+
INITIALIZE_REUSABLE_SYMBOL(string);
|
|
1401
|
+
INITIALIZE_REUSABLE_SYMBOL(value);
|
|
1402
|
+
INITIALIZE_REUSABLE_SYMBOL(type);
|
|
1403
|
+
INITIALIZE_REUSABLE_SYMBOL(integer);
|
|
1404
|
+
INITIALIZE_REUSABLE_SYMBOL(number);
|
|
1405
|
+
INITIALIZE_REUSABLE_SYMBOL(sign_character);
|
|
1406
|
+
INITIALIZE_REUSABLE_SYMBOL(unit);
|
|
1407
|
+
INITIALIZE_REUSABLE_SYMBOL(dimension);
|
|
1408
|
+
INITIALIZE_REUSABLE_SYMBOL(percentage);
|
|
1409
|
+
INITIALIZE_REUSABLE_SYMBOL(function);
|
|
1410
|
+
INITIALIZE_REUSABLE_SYMBOL(name);
|
|
1411
|
+
INITIALIZE_REUSABLE_SYMBOL(ident);
|
|
1412
|
+
INITIALIZE_REUSABLE_SYMBOL(url);
|
|
1413
|
+
INITIALIZE_REUSABLE_SYMBOL(bad_url);
|
|
1414
|
+
INITIALIZE_REUSABLE_SYMBOL(flag);
|
|
1415
|
+
INITIALIZE_REUSABLE_SYMBOL(hash);
|
|
1416
|
+
INITIALIZE_REUSABLE_SYMBOL(delim);
|
|
1417
|
+
INITIALIZE_REUSABLE_SYMBOL(left_parenthesis);
|
|
1418
|
+
INITIALIZE_REUSABLE_SYMBOL(right_parenthesis);
|
|
1419
|
+
INITIALIZE_REUSABLE_SYMBOL(comma);
|
|
1420
|
+
INITIALIZE_REUSABLE_SYMBOL(cdc);
|
|
1421
|
+
INITIALIZE_REUSABLE_SYMBOL(colon);
|
|
1422
|
+
INITIALIZE_REUSABLE_SYMBOL(semicolon);
|
|
1423
|
+
INITIALIZE_REUSABLE_SYMBOL(cdo);
|
|
1424
|
+
INITIALIZE_REUSABLE_SYMBOL(at_keyword);
|
|
1425
|
+
INITIALIZE_REUSABLE_SYMBOL(left_square_bracket);
|
|
1426
|
+
INITIALIZE_REUSABLE_SYMBOL(right_square_bracket);
|
|
1427
|
+
INITIALIZE_REUSABLE_SYMBOL(left_curly);
|
|
1428
|
+
INITIALIZE_REUSABLE_SYMBOL(right_curly);
|
|
1429
|
+
|
|
1430
|
+
rb_define_alloc_func(cScanner, scanner_alloc);
|
|
1431
|
+
rb_define_method(cScanner, "initialize", scanner_initialize, 2);
|
|
1432
|
+
|
|
1433
|
+
rb_define_method(cScanner, "eof?", scanner_at_eof, 0);
|
|
1434
|
+
rb_define_method(cScanner, "tokens", scanner_tokens, 0);
|
|
1435
|
+
rb_define_method(cScanner, "consume_token", scanner_consume_token, 0);
|
|
1436
|
+
}
|