yarp 0.8.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +48 -1
- data/Makefile +5 -1
- data/README.md +4 -3
- data/config.yml +461 -150
- data/docs/configuration.md +1 -0
- data/docs/encoding.md +5 -5
- data/docs/ruby_api.md +2 -0
- data/docs/serialization.md +3 -3
- data/docs/testing.md +2 -2
- data/ext/yarp/api_node.c +810 -199
- data/ext/yarp/extension.c +94 -31
- data/ext/yarp/extension.h +2 -2
- data/include/yarp/ast.h +653 -150
- data/include/yarp/defines.h +2 -1
- data/include/yarp/diagnostic.h +3 -3
- data/include/yarp/enc/yp_encoding.h +10 -10
- data/include/yarp/node.h +10 -0
- data/include/yarp/parser.h +19 -19
- data/include/yarp/regexp.h +1 -1
- data/include/yarp/unescape.h +7 -5
- data/include/yarp/util/yp_buffer.h +3 -0
- data/include/yarp/util/yp_char.h +16 -16
- data/include/yarp/util/yp_constant_pool.h +2 -2
- data/include/yarp/util/yp_newline_list.h +7 -4
- data/include/yarp/util/yp_string.h +4 -4
- data/include/yarp/util/yp_string_list.h +0 -3
- data/include/yarp/util/yp_strpbrk.h +1 -1
- data/include/yarp/version.h +2 -2
- data/include/yarp.h +14 -3
- data/lib/yarp/desugar_visitor.rb +204 -0
- data/lib/yarp/ffi.rb +27 -1
- data/lib/yarp/lex_compat.rb +93 -25
- data/lib/yarp/mutation_visitor.rb +683 -0
- data/lib/yarp/node.rb +3121 -597
- data/lib/yarp/serialize.rb +198 -126
- data/lib/yarp.rb +53 -7
- data/src/diagnostic.c +1 -1
- data/src/enc/yp_big5.c +15 -42
- data/src/enc/yp_euc_jp.c +16 -43
- data/src/enc/yp_gbk.c +19 -46
- data/src/enc/yp_shift_jis.c +16 -43
- data/src/enc/yp_tables.c +36 -38
- data/src/enc/yp_unicode.c +20 -25
- data/src/enc/yp_windows_31j.c +16 -43
- data/src/node.c +1444 -836
- data/src/prettyprint.c +324 -103
- data/src/regexp.c +21 -21
- data/src/serialize.c +429 -276
- data/src/token_type.c +2 -2
- data/src/unescape.c +184 -136
- data/src/util/yp_buffer.c +7 -2
- data/src/util/yp_char.c +34 -34
- data/src/util/yp_constant_pool.c +4 -4
- data/src/util/yp_memchr.c +1 -1
- data/src/util/yp_newline_list.c +14 -3
- data/src/util/yp_string.c +22 -20
- data/src/util/yp_string_list.c +0 -6
- data/src/util/yp_strncasecmp.c +3 -6
- data/src/util/yp_strpbrk.c +8 -8
- data/src/yarp.c +1504 -615
- data/yarp.gemspec +3 -1
- metadata +4 -2
data/src/enc/yp_unicode.c
CHANGED
@@ -10,7 +10,7 @@ typedef uint32_t yp_unicode_codepoint_t;
|
|
10
10
|
// this table is different from other encodings where we used a lookup table
|
11
11
|
// because the indices of those tables are the byte representations, not the
|
12
12
|
// codepoints themselves.
|
13
|
-
|
13
|
+
uint8_t yp_encoding_unicode_table[256] = {
|
14
14
|
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
15
15
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
16
16
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
@@ -2220,7 +2220,7 @@ static const uint8_t yp_utf_8_dfa[] = {
|
|
2220
2220
|
};
|
2221
2221
|
|
2222
2222
|
static yp_unicode_codepoint_t
|
2223
|
-
yp_utf_8_codepoint(const
|
2223
|
+
yp_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
|
2224
2224
|
assert(n >= 1);
|
2225
2225
|
size_t maximum = (size_t) n;
|
2226
2226
|
|
@@ -2228,7 +2228,7 @@ yp_utf_8_codepoint(const unsigned char *c, ptrdiff_t n, size_t *width) {
|
|
2228
2228
|
uint32_t state = 0;
|
2229
2229
|
|
2230
2230
|
for (size_t index = 0; index < 4 && index < maximum; index++) {
|
2231
|
-
uint32_t byte =
|
2231
|
+
uint32_t byte = b[index];
|
2232
2232
|
uint32_t type = yp_utf_8_dfa[byte];
|
2233
2233
|
|
2234
2234
|
codepoint = (state != 0) ?
|
@@ -2247,60 +2247,55 @@ yp_utf_8_codepoint(const unsigned char *c, ptrdiff_t n, size_t *width) {
|
|
2247
2247
|
}
|
2248
2248
|
|
2249
2249
|
static size_t
|
2250
|
-
yp_encoding_utf_8_char_width(const
|
2250
|
+
yp_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
|
2251
2251
|
size_t width;
|
2252
|
-
|
2253
|
-
|
2254
|
-
yp_utf_8_codepoint(v, n, &width);
|
2252
|
+
yp_utf_8_codepoint(b, n, &width);
|
2255
2253
|
return width;
|
2256
2254
|
}
|
2257
2255
|
|
2258
2256
|
size_t
|
2259
|
-
yp_encoding_utf_8_alpha_char(const
|
2260
|
-
|
2261
|
-
|
2262
|
-
return (yp_encoding_unicode_table[*v] & YP_ENCODING_ALPHABETIC_BIT) ? 1 : 0;
|
2257
|
+
yp_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
2258
|
+
if (*b < 0x80) {
|
2259
|
+
return (yp_encoding_unicode_table[*b] & YP_ENCODING_ALPHABETIC_BIT) ? 1 : 0;
|
2263
2260
|
}
|
2264
2261
|
|
2265
2262
|
size_t width;
|
2266
|
-
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(
|
2263
|
+
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(b, n, &width);
|
2267
2264
|
|
2268
2265
|
if (codepoint <= 0xFF) {
|
2269
|
-
return (yp_encoding_unicode_table[(
|
2266
|
+
return (yp_encoding_unicode_table[(uint8_t) codepoint] & YP_ENCODING_ALPHABETIC_BIT) ? width : 0;
|
2270
2267
|
} else {
|
2271
2268
|
return yp_unicode_codepoint_match(codepoint, unicode_alpha_codepoints, UNICODE_ALPHA_CODEPOINTS_LENGTH) ? width : 0;
|
2272
2269
|
}
|
2273
2270
|
}
|
2274
2271
|
|
2275
2272
|
size_t
|
2276
|
-
yp_encoding_utf_8_alnum_char(const
|
2277
|
-
|
2278
|
-
|
2279
|
-
return (yp_encoding_unicode_table[*v] & (YP_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
|
2273
|
+
yp_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
2274
|
+
if (*b < 0x80) {
|
2275
|
+
return (yp_encoding_unicode_table[*b] & (YP_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
|
2280
2276
|
}
|
2281
2277
|
|
2282
2278
|
size_t width;
|
2283
|
-
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(
|
2279
|
+
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(b, n, &width);
|
2284
2280
|
|
2285
2281
|
if (codepoint <= 0xFF) {
|
2286
|
-
return (yp_encoding_unicode_table[(
|
2282
|
+
return (yp_encoding_unicode_table[(uint8_t) codepoint] & (YP_ENCODING_ALPHANUMERIC_BIT)) ? width : 0;
|
2287
2283
|
} else {
|
2288
2284
|
return yp_unicode_codepoint_match(codepoint, unicode_alnum_codepoints, UNICODE_ALNUM_CODEPOINTS_LENGTH) ? width : 0;
|
2289
2285
|
}
|
2290
2286
|
}
|
2291
2287
|
|
2292
2288
|
static bool
|
2293
|
-
yp_encoding_utf_8_isupper_char(const
|
2294
|
-
|
2295
|
-
|
2296
|
-
return (yp_encoding_unicode_table[*v] & YP_ENCODING_UPPERCASE_BIT) ? true : false;
|
2289
|
+
yp_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
2290
|
+
if (*b < 0x80) {
|
2291
|
+
return (yp_encoding_unicode_table[*b] & YP_ENCODING_UPPERCASE_BIT) ? true : false;
|
2297
2292
|
}
|
2298
2293
|
|
2299
2294
|
size_t width;
|
2300
|
-
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(
|
2295
|
+
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(b, n, &width);
|
2301
2296
|
|
2302
2297
|
if (codepoint <= 0xFF) {
|
2303
|
-
return (yp_encoding_unicode_table[(
|
2298
|
+
return (yp_encoding_unicode_table[(uint8_t) codepoint] & YP_ENCODING_UPPERCASE_BIT) ? true : false;
|
2304
2299
|
} else {
|
2305
2300
|
return yp_unicode_codepoint_match(codepoint, unicode_isupper_codepoints, UNICODE_ISUPPER_CODEPOINTS_LENGTH) ? true : false;
|
2306
2301
|
}
|
data/src/enc/yp_windows_31j.c
CHANGED
@@ -1,73 +1,46 @@
|
|
1
1
|
#include "yarp/enc/yp_encoding.h"
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
static yp_windows_31j_codepoint_t
|
6
|
-
yp_windows_31j_codepoint(const char *c, ptrdiff_t n, size_t *width) {
|
7
|
-
const unsigned char *uc = (const unsigned char *) c;
|
8
|
-
|
3
|
+
static size_t
|
4
|
+
yp_encoding_windows_31j_char_width(const uint8_t *b, ptrdiff_t n) {
|
9
5
|
// These are the single byte characters.
|
10
|
-
if (*
|
11
|
-
|
12
|
-
return *uc;
|
6
|
+
if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) {
|
7
|
+
return 1;
|
13
8
|
}
|
14
9
|
|
15
10
|
// These are the double byte characters.
|
16
11
|
if (
|
17
12
|
(n > 1) &&
|
18
|
-
((
|
19
|
-
(
|
13
|
+
((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) &&
|
14
|
+
(b[1] >= 0x40 && b[1] <= 0xFC)
|
20
15
|
) {
|
21
|
-
|
22
|
-
return (yp_windows_31j_codepoint_t) (uc[0] << 8 | uc[1]);
|
16
|
+
return 2;
|
23
17
|
}
|
24
18
|
|
25
|
-
*width = 0;
|
26
19
|
return 0;
|
27
20
|
}
|
28
21
|
|
29
22
|
static size_t
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
return width;
|
35
|
-
}
|
36
|
-
|
37
|
-
static size_t
|
38
|
-
yp_encoding_windows_31j_alpha_char(const char *c, ptrdiff_t n) {
|
39
|
-
size_t width;
|
40
|
-
yp_windows_31j_codepoint_t codepoint = yp_windows_31j_codepoint(c, n, &width);
|
41
|
-
|
42
|
-
if (width == 1) {
|
43
|
-
const char value = (const char) codepoint;
|
44
|
-
return yp_encoding_ascii_alpha_char(&value, n);
|
23
|
+
yp_encoding_windows_31j_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
24
|
+
if (yp_encoding_windows_31j_char_width(b, n) == 1) {
|
25
|
+
return yp_encoding_ascii_alpha_char(b, n);
|
45
26
|
} else {
|
46
27
|
return 0;
|
47
28
|
}
|
48
29
|
}
|
49
30
|
|
50
31
|
static size_t
|
51
|
-
yp_encoding_windows_31j_alnum_char(const
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
if (width == 1) {
|
56
|
-
const char value = (const char) codepoint;
|
57
|
-
return yp_encoding_ascii_alnum_char(&value, n);
|
32
|
+
yp_encoding_windows_31j_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
33
|
+
if (yp_encoding_windows_31j_char_width(b, n) == 1) {
|
34
|
+
return yp_encoding_ascii_alnum_char(b, n);
|
58
35
|
} else {
|
59
36
|
return 0;
|
60
37
|
}
|
61
38
|
}
|
62
39
|
|
63
40
|
static bool
|
64
|
-
yp_encoding_windows_31j_isupper_char(const
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
if (width == 1) {
|
69
|
-
const char value = (const char) codepoint;
|
70
|
-
return yp_encoding_ascii_isupper_char(&value, n);
|
41
|
+
yp_encoding_windows_31j_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
42
|
+
if (yp_encoding_windows_31j_char_width(b, n) == 1) {
|
43
|
+
return yp_encoding_ascii_isupper_char(b, n);
|
71
44
|
} else {
|
72
45
|
return false;
|
73
46
|
}
|