yarp 0.12.0 → 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -8
- data/CONTRIBUTING.md +2 -2
- data/Makefile +5 -5
- data/README.md +11 -12
- data/config.yml +6 -2
- data/docs/build_system.md +21 -21
- data/docs/building.md +4 -4
- data/docs/configuration.md +25 -21
- data/docs/design.md +2 -2
- data/docs/encoding.md +17 -17
- data/docs/fuzzing.md +4 -4
- data/docs/heredocs.md +3 -3
- data/docs/mapping.md +94 -94
- data/docs/ripper.md +4 -4
- data/docs/ruby_api.md +11 -11
- data/docs/serialization.md +17 -16
- data/docs/testing.md +6 -6
- data/ext/prism/api_node.c +4725 -0
- data/ext/{yarp → prism}/api_pack.c +82 -82
- data/ext/{yarp → prism}/extconf.rb +13 -13
- data/ext/{yarp → prism}/extension.c +175 -168
- data/ext/prism/extension.h +18 -0
- data/include/prism/ast.h +1932 -0
- data/include/prism/defines.h +45 -0
- data/include/prism/diagnostic.h +231 -0
- data/include/{yarp/enc/yp_encoding.h → prism/enc/pm_encoding.h} +40 -40
- data/include/prism/node.h +41 -0
- data/include/prism/pack.h +141 -0
- data/include/{yarp → prism}/parser.h +143 -142
- data/include/prism/regexp.h +19 -0
- data/include/prism/unescape.h +48 -0
- data/include/prism/util/pm_buffer.h +51 -0
- data/include/{yarp/util/yp_char.h → prism/util/pm_char.h} +20 -20
- data/include/{yarp/util/yp_constant_pool.h → prism/util/pm_constant_pool.h} +26 -22
- data/include/{yarp/util/yp_list.h → prism/util/pm_list.h} +21 -21
- data/include/prism/util/pm_memchr.h +14 -0
- data/include/{yarp/util/yp_newline_list.h → prism/util/pm_newline_list.h} +11 -11
- data/include/prism/util/pm_state_stack.h +24 -0
- data/include/{yarp/util/yp_string.h → prism/util/pm_string.h} +20 -20
- data/include/prism/util/pm_string_list.h +25 -0
- data/include/{yarp/util/yp_strpbrk.h → prism/util/pm_strpbrk.h} +7 -7
- data/include/prism/version.h +4 -0
- data/include/prism.h +82 -0
- data/lib/prism/compiler.rb +465 -0
- data/lib/prism/debug.rb +157 -0
- data/lib/{yarp/desugar_visitor.rb → prism/desugar_compiler.rb} +4 -2
- data/lib/prism/dispatcher.rb +2051 -0
- data/lib/prism/dsl.rb +750 -0
- data/lib/{yarp → prism}/ffi.rb +66 -67
- data/lib/{yarp → prism}/lex_compat.rb +40 -43
- data/lib/{yarp/mutation_visitor.rb → prism/mutation_compiler.rb} +3 -3
- data/lib/{yarp → prism}/node.rb +2012 -2593
- data/lib/prism/node_ext.rb +55 -0
- data/lib/prism/node_inspector.rb +68 -0
- data/lib/{yarp → prism}/pack.rb +1 -1
- data/lib/{yarp → prism}/parse_result/comments.rb +1 -1
- data/lib/{yarp → prism}/parse_result/newlines.rb +1 -1
- data/lib/prism/parse_result.rb +266 -0
- data/lib/{yarp → prism}/pattern.rb +14 -14
- data/lib/{yarp → prism}/ripper_compat.rb +5 -5
- data/lib/{yarp → prism}/serialize.rb +12 -7
- data/lib/prism/visitor.rb +470 -0
- data/lib/prism.rb +64 -0
- data/lib/yarp.rb +2 -614
- data/src/diagnostic.c +213 -208
- data/src/enc/pm_big5.c +52 -0
- data/src/enc/pm_euc_jp.c +58 -0
- data/src/enc/{yp_gbk.c → pm_gbk.c} +16 -16
- data/src/enc/pm_shift_jis.c +56 -0
- data/src/enc/{yp_tables.c → pm_tables.c} +69 -69
- data/src/enc/{yp_unicode.c → pm_unicode.c} +40 -40
- data/src/enc/pm_windows_31j.c +56 -0
- data/src/node.c +1293 -1233
- data/src/pack.c +247 -247
- data/src/prettyprint.c +1479 -1479
- data/src/{yarp.c → prism.c} +5205 -5083
- data/src/regexp.c +132 -132
- data/src/serialize.c +1121 -1121
- data/src/token_type.c +169 -167
- data/src/unescape.c +106 -87
- data/src/util/pm_buffer.c +103 -0
- data/src/util/{yp_char.c → pm_char.c} +72 -72
- data/src/util/{yp_constant_pool.c → pm_constant_pool.c} +85 -64
- data/src/util/{yp_list.c → pm_list.c} +10 -10
- data/src/util/{yp_memchr.c → pm_memchr.c} +6 -4
- data/src/util/{yp_newline_list.c → pm_newline_list.c} +21 -21
- data/src/util/{yp_state_stack.c → pm_state_stack.c} +4 -4
- data/src/util/{yp_string.c → pm_string.c} +38 -38
- data/src/util/pm_string_list.c +29 -0
- data/src/util/{yp_strncasecmp.c → pm_strncasecmp.c} +1 -1
- data/src/util/{yp_strpbrk.c → pm_strpbrk.c} +8 -8
- data/yarp.gemspec +68 -59
- metadata +70 -61
- data/ext/yarp/api_node.c +0 -4728
- data/ext/yarp/extension.h +0 -18
- data/include/yarp/ast.h +0 -1929
- data/include/yarp/defines.h +0 -45
- data/include/yarp/diagnostic.h +0 -226
- data/include/yarp/node.h +0 -42
- data/include/yarp/pack.h +0 -141
- data/include/yarp/regexp.h +0 -19
- data/include/yarp/unescape.h +0 -44
- data/include/yarp/util/yp_buffer.h +0 -51
- data/include/yarp/util/yp_memchr.h +0 -14
- data/include/yarp/util/yp_state_stack.h +0 -24
- data/include/yarp/util/yp_string_list.h +0 -25
- data/include/yarp/version.h +0 -4
- data/include/yarp.h +0 -82
- data/src/enc/yp_big5.c +0 -52
- data/src/enc/yp_euc_jp.c +0 -58
- data/src/enc/yp_shift_jis.c +0 -56
- data/src/enc/yp_windows_31j.c +0 -56
- data/src/util/yp_buffer.c +0 -101
- data/src/util/yp_string_list.c +0 -29
@@ -1,16 +1,16 @@
|
|
1
1
|
// Note that the UTF-8 decoding code is based on Bjoern Hoehrmann's UTF-8 DFA
|
2
2
|
// decoder. See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
3
3
|
|
4
|
-
#include "yarp/enc/yp_encoding.h"
|
4
|
+
#include "prism/enc/pm_encoding.h"
|
5
5
|
|
6
|
-
typedef uint32_t yp_unicode_codepoint_t;
|
6
|
+
typedef uint32_t pm_unicode_codepoint_t;
|
7
7
|
|
8
8
|
// Each element of the following table contains a bitfield that indicates a
|
9
9
|
// piece of information about the corresponding unicode codepoint. Note that
|
10
10
|
// this table is different from other encodings where we used a lookup table
|
11
11
|
// because the indices of those tables are the byte representations, not the
|
12
12
|
// codepoints themselves.
|
13
|
-
const uint8_t yp_encoding_unicode_table[256] = {
|
13
|
+
const uint8_t pm_encoding_unicode_table[256] = {
|
14
14
|
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
15
15
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
16
16
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
|
@@ -31,7 +31,7 @@ const uint8_t yp_encoding_unicode_table[256] = {
|
|
31
31
|
};
|
32
32
|
|
33
33
|
#define UNICODE_ALPHA_CODEPOINTS_LENGTH 1450
|
34
|
-
static const yp_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEPOINTS_LENGTH] = {
|
34
|
+
static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEPOINTS_LENGTH] = {
|
35
35
|
0x100, 0x2C1,
|
36
36
|
0x2C6, 0x2D1,
|
37
37
|
0x2E0, 0x2E4,
|
@@ -760,7 +760,7 @@ static const yp_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
|
|
760
760
|
};
|
761
761
|
|
762
762
|
#define UNICODE_ALNUM_CODEPOINTS_LENGTH 1528
|
763
|
-
static const yp_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEPOINTS_LENGTH] = {
|
763
|
+
static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEPOINTS_LENGTH] = {
|
764
764
|
0x100, 0x2C1,
|
765
765
|
0x2C6, 0x2D1,
|
766
766
|
0x2E0, 0x2E4,
|
@@ -1528,7 +1528,7 @@ static const yp_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
|
|
1528
1528
|
};
|
1529
1529
|
|
1530
1530
|
#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1296
|
1531
|
-
static const yp_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_CODEPOINTS_LENGTH] = {
|
1531
|
+
static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_CODEPOINTS_LENGTH] = {
|
1532
1532
|
0x100, 0x100,
|
1533
1533
|
0x102, 0x102,
|
1534
1534
|
0x104, 0x104,
|
@@ -2180,7 +2180,7 @@ static const yp_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
|
|
2180
2180
|
};
|
2181
2181
|
|
2182
2182
|
static bool
|
2183
|
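-
yp_unicode_codepoint_match(yp_unicode_codepoint_t codepoint, const yp_unicode_codepoint_t *codepoints, size_t size) {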
|
2183
|
+
pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, const pm_unicode_codepoint_t *codepoints, size_t size) {
|
2184
2184
|
size_t start = 0;
|
2185
2185
|
size_t end = size;
|
2186
2186
|
|
@@ -2202,7 +2202,7 @@ yp_unicode_codepoint_match(yp_unicode_codepoint_t codepoint, const yp_unicode_co
|
|
2202
2202
|
return false;
|
2203
2203
|
}
|
2204
2204
|
|
2205
|
-
static const uint8_t yp_utf_8_dfa[] = {
|
2205
|
+
static const uint8_t pm_utf_8_dfa[] = {
|
2206
2206
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
|
2207
2207
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
|
2208
2208
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
|
@@ -2219,8 +2219,8 @@ static const uint8_t yp_utf_8_dfa[] = {
|
|
2219
2219
|
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
|
2220
2220
|
};
|
2221
2221
|
|
2222
|
-
static yp_unicode_codepoint_t
|
2223
|
-
yp_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
|
2222
|
+
static pm_unicode_codepoint_t
|
2223
|
+
pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
|
2224
2224
|
assert(n >= 1);
|
2225
2225
|
size_t maximum = (size_t) n;
|
2226
2226
|
|
@@ -2229,16 +2229,16 @@ yp_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
|
|
2229
2229
|
|
2230
2230
|
for (size_t index = 0; index < 4 && index < maximum; index++) {
|
2231
2231
|
uint32_t byte = b[index];
|
2232
|
-
uint32_t type = yp_utf_8_dfa[byte];
|
2232
|
+
uint32_t type = pm_utf_8_dfa[byte];
|
2233
2233
|
|
2234
2234
|
codepoint = (state != 0) ?
|
2235
2235
|
(byte & 0x3fu) | (codepoint << 6) :
|
2236
2236
|
(0xffu >> type) & (byte);
|
2237
2237
|
|
2238
|
-
state = yp_utf_8_dfa[256 + (state * 16) + type];
|
2238
|
+
state = pm_utf_8_dfa[256 + (state * 16) + type];
|
2239
2239
|
if (!state) {
|
2240
2240
|
*width = index + 1;
|
2241
|
-
return (yp_unicode_codepoint_t) codepoint;
|
2241
|
+
return (pm_unicode_codepoint_t) codepoint;
|
2242
2242
|
}
|
2243
2243
|
}
|
2244
2244
|
|
@@ -2247,57 +2247,57 @@ yp_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
|
|
2247
2247
|
}
|
2248
2248
|
|
2249
2249
|
static size_t
|
2250
|
-
yp_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
|
2250
|
+
pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
|
2251
2251
|
size_t width;
|
2252
|
-
yp_utf_8_codepoint(b, n, &width);
|
2252
|
+
pm_utf_8_codepoint(b, n, &width);
|
2253
2253
|
return width;
|
2254
2254
|
}
|
2255
2255
|
|
2256
2256
|
size_t
|
2257
|
-
yp_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
2257
|
+
pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
2258
2258
|
if (*b < 0x80) {
|
2259
|
-
return (yp_encoding_unicode_table[*b] & YP_ENCODING_ALPHABETIC_BIT) ? 1 : 0;
|
2259
|
+
return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0;
|
2260
2260
|
}
|
2261
2261
|
|
2262
2262
|
size_t width;
|
2263
|
-
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(b, n, &width);
|
2263
|
+
pm_unicode_codepoint_t codepoint = pm_utf_8_codepoint(b, n, &width);
|
2264
2264
|
|
2265
2265
|
if (codepoint <= 0xFF) {
|
2266
|
-
return (yp_encoding_unicode_table[(uint8_t) codepoint] & YP_ENCODING_ALPHABETIC_BIT) ? width : 0;
|
2266
|
+
return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_ALPHABETIC_BIT) ? width : 0;
|
2267
2267
|
} else {
|
2268
|
-
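return yp_unicode_codepoint_match(codepoint, unicode_alpha_codepoints, UNICODE_ALPHA_CODEPOINTS_LENGTH) ? width : 0;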
|
2268
|
+
return pm_unicode_codepoint_match(codepoint, unicode_alpha_codepoints, UNICODE_ALPHA_CODEPOINTS_LENGTH) ? width : 0;
|
2269
2269
|
}
|
2270
2270
|
}
|
2271
2271
|
|
2272
2272
|
size_t
|
2273
|
-
yp_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
2273
|
+
pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
2274
2274
|
if (*b < 0x80) {
|
2275
|
-
return (yp_encoding_unicode_table[*b] & (YP_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
|
2275
|
+
return (pm_encoding_unicode_table[*b] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
|
2276
2276
|
}
|
2277
2277
|
|
2278
2278
|
size_t width;
|
2279
|
-
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(b, n, &width);
|
2279
|
+
pm_unicode_codepoint_t codepoint = pm_utf_8_codepoint(b, n, &width);
|
2280
2280
|
|
2281
2281
|
if (codepoint <= 0xFF) {
|
2282
|
-
return (yp_encoding_unicode_table[(uint8_t) codepoint] & (YP_ENCODING_ALPHANUMERIC_BIT)) ? width : 0;
|
2282
|
+
return (pm_encoding_unicode_table[(uint8_t) codepoint] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? width : 0;
|
2283
2283
|
} else {
|
2284
|
-
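return yp_unicode_codepoint_match(codepoint, unicode_alnum_codepoints, UNICODE_ALNUM_CODEPOINTS_LENGTH) ? width : 0;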
|
2284
|
+
return pm_unicode_codepoint_match(codepoint, unicode_alnum_codepoints, UNICODE_ALNUM_CODEPOINTS_LENGTH) ? width : 0;
|
2285
2285
|
}
|
2286
2286
|
}
|
2287
2287
|
|
2288
2288
|
static bool
|
2289
|
-
yp_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
2289
|
+
pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
2290
2290
|
if (*b < 0x80) {
|
2291
|
-
return (yp_encoding_unicode_table[*b] & YP_ENCODING_UPPERCASE_BIT) ? true : false;
|
2291
|
+
return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false;
|
2292
2292
|
}
|
2293
2293
|
|
2294
2294
|
size_t width;
|
2295
|
-
yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(b, n, &width);
|
2295
|
+
pm_unicode_codepoint_t codepoint = pm_utf_8_codepoint(b, n, &width);
|
2296
2296
|
|
2297
2297
|
if (codepoint <= 0xFF) {
|
2298
|
-
return (yp_encoding_unicode_table[(uint8_t) codepoint] & YP_ENCODING_UPPERCASE_BIT) ? true : false;
|
2298
|
+
return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false;
|
2299
2299
|
} else {
|
2300
|
-
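return yp_unicode_codepoint_match(codepoint, unicode_isupper_codepoints, UNICODE_ISUPPER_CODEPOINTS_LENGTH) ? true : false;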
|
2300
|
+
return pm_unicode_codepoint_match(codepoint, unicode_isupper_codepoints, UNICODE_ISUPPER_CODEPOINTS_LENGTH) ? true : false;
|
2301
2301
|
}
|
2302
2302
|
}
|
2303
2303
|
|
@@ -2305,20 +2305,20 @@ yp_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
|
2305
2305
|
#undef UNICODE_ALNUM_CODEPOINTS_LENGTH
|
2306
2306
|
#undef UNICODE_ISUPPER_CODEPOINTS_LENGTH
|
2307
2307
|
|
2308
|
-
yp_encoding_t yp_encoding_utf_8 = {
|
2308
|
+
pm_encoding_t pm_encoding_utf_8 = {
|
2309
2309
|
.name = "utf-8",
|
2310
|
-
.char_width = yp_encoding_utf_8_char_width,
|
2311
|
-
.alnum_char = yp_encoding_utf_8_alnum_char,
|
2312
|
-
.alpha_char = yp_encoding_utf_8_alpha_char,
|
2313
|
-
.isupper_char = yp_encoding_utf_8_isupper_char,
|
2310
|
+
.char_width = pm_encoding_utf_8_char_width,
|
2311
|
+
.alnum_char = pm_encoding_utf_8_alnum_char,
|
2312
|
+
.alpha_char = pm_encoding_utf_8_alpha_char,
|
2313
|
+
.isupper_char = pm_encoding_utf_8_isupper_char,
|
2314
2314
|
.multibyte = true
|
2315
2315
|
};
|
2316
2316
|
|
2317
|
-
yp_encoding_t yp_encoding_utf8_mac = {
|
2317
|
+
pm_encoding_t pm_encoding_utf8_mac = {
|
2318
2318
|
.name = "utf8-mac",
|
2319
|
-
.char_width = yp_encoding_utf_8_char_width,
|
2320
|
-
.alnum_char = yp_encoding_utf_8_alnum_char,
|
2321
|
-
.alpha_char = yp_encoding_utf_8_alpha_char,
|
2322
|
-
.isupper_char = yp_encoding_utf_8_isupper_char,
|
2319
|
+
.char_width = pm_encoding_utf_8_char_width,
|
2320
|
+
.alnum_char = pm_encoding_utf_8_alnum_char,
|
2321
|
+
.alpha_char = pm_encoding_utf_8_alpha_char,
|
2322
|
+
.isupper_char = pm_encoding_utf_8_isupper_char,
|
2323
2323
|
.multibyte = true
|
2324
2324
|
};
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#include "prism/enc/pm_encoding.h"
|
2
|
+
|
3
|
+
static size_t
|
4
|
+
pm_encoding_windows_31j_char_width(const uint8_t *b, ptrdiff_t n) {
|
5
|
+
// These are the single byte characters.
|
6
|
+
if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) {
|
7
|
+
return 1;
|
8
|
+
}
|
9
|
+
|
10
|
+
// These are the double byte characters.
|
11
|
+
if (
|
12
|
+
(n > 1) &&
|
13
|
+
((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) &&
|
14
|
+
(b[1] >= 0x40 && b[1] <= 0xFC)
|
15
|
+
) {
|
16
|
+
return 2;
|
17
|
+
}
|
18
|
+
|
19
|
+
return 0;
|
20
|
+
}
|
21
|
+
|
22
|
+
static size_t
|
23
|
+
pm_encoding_windows_31j_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
24
|
+
if (pm_encoding_windows_31j_char_width(b, n) == 1) {
|
25
|
+
return pm_encoding_ascii_alpha_char(b, n);
|
26
|
+
} else {
|
27
|
+
return 0;
|
28
|
+
}
|
29
|
+
}
|
30
|
+
|
31
|
+
static size_t
|
32
|
+
pm_encoding_windows_31j_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
33
|
+
if (pm_encoding_windows_31j_char_width(b, n) == 1) {
|
34
|
+
return pm_encoding_ascii_alnum_char(b, n);
|
35
|
+
} else {
|
36
|
+
return 0;
|
37
|
+
}
|
38
|
+
}
|
39
|
+
|
40
|
+
static bool
|
41
|
+
pm_encoding_windows_31j_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
42
|
+
if (pm_encoding_windows_31j_char_width(b, n) == 1) {
|
43
|
+
return pm_encoding_ascii_isupper_char(b, n);
|
44
|
+
} else {
|
45
|
+
return false;
|
46
|
+
}
|
47
|
+
}
|
48
|
+
|
49
|
+
pm_encoding_t pm_encoding_windows_31j = {
|
50
|
+
.name = "windows-31j",
|
51
|
+
.char_width = pm_encoding_windows_31j_char_width,
|
52
|
+
.alnum_char = pm_encoding_windows_31j_alnum_char,
|
53
|
+
.alpha_char = pm_encoding_windows_31j_alpha_char,
|
54
|
+
.isupper_char = pm_encoding_windows_31j_isupper_char,
|
55
|
+
.multibyte = true
|
56
|
+
};
|