yarp 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +48 -1
  3. data/Makefile +5 -1
  4. data/README.md +4 -3
  5. data/config.yml +461 -150
  6. data/docs/configuration.md +1 -0
  7. data/docs/encoding.md +5 -5
  8. data/docs/ruby_api.md +2 -0
  9. data/docs/serialization.md +3 -3
  10. data/docs/testing.md +2 -2
  11. data/ext/yarp/api_node.c +810 -199
  12. data/ext/yarp/extension.c +94 -31
  13. data/ext/yarp/extension.h +2 -2
  14. data/include/yarp/ast.h +653 -150
  15. data/include/yarp/defines.h +2 -1
  16. data/include/yarp/diagnostic.h +3 -3
  17. data/include/yarp/enc/yp_encoding.h +10 -10
  18. data/include/yarp/node.h +10 -0
  19. data/include/yarp/parser.h +19 -19
  20. data/include/yarp/regexp.h +1 -1
  21. data/include/yarp/unescape.h +7 -5
  22. data/include/yarp/util/yp_buffer.h +3 -0
  23. data/include/yarp/util/yp_char.h +16 -16
  24. data/include/yarp/util/yp_constant_pool.h +2 -2
  25. data/include/yarp/util/yp_newline_list.h +7 -4
  26. data/include/yarp/util/yp_string.h +4 -4
  27. data/include/yarp/util/yp_string_list.h +0 -3
  28. data/include/yarp/util/yp_strpbrk.h +1 -1
  29. data/include/yarp/version.h +2 -2
  30. data/include/yarp.h +14 -3
  31. data/lib/yarp/desugar_visitor.rb +204 -0
  32. data/lib/yarp/ffi.rb +27 -1
  33. data/lib/yarp/lex_compat.rb +93 -25
  34. data/lib/yarp/mutation_visitor.rb +683 -0
  35. data/lib/yarp/node.rb +3121 -597
  36. data/lib/yarp/serialize.rb +198 -126
  37. data/lib/yarp.rb +53 -7
  38. data/src/diagnostic.c +1 -1
  39. data/src/enc/yp_big5.c +15 -42
  40. data/src/enc/yp_euc_jp.c +16 -43
  41. data/src/enc/yp_gbk.c +19 -46
  42. data/src/enc/yp_shift_jis.c +16 -43
  43. data/src/enc/yp_tables.c +36 -38
  44. data/src/enc/yp_unicode.c +20 -25
  45. data/src/enc/yp_windows_31j.c +16 -43
  46. data/src/node.c +1444 -836
  47. data/src/prettyprint.c +324 -103
  48. data/src/regexp.c +21 -21
  49. data/src/serialize.c +429 -276
  50. data/src/token_type.c +2 -2
  51. data/src/unescape.c +184 -136
  52. data/src/util/yp_buffer.c +7 -2
  53. data/src/util/yp_char.c +34 -34
  54. data/src/util/yp_constant_pool.c +4 -4
  55. data/src/util/yp_memchr.c +1 -1
  56. data/src/util/yp_newline_list.c +14 -3
  57. data/src/util/yp_string.c +22 -20
  58. data/src/util/yp_string_list.c +0 -6
  59. data/src/util/yp_strncasecmp.c +3 -6
  60. data/src/util/yp_strpbrk.c +8 -8
  61. data/src/yarp.c +1504 -615
  62. data/yarp.gemspec +3 -1
  63. metadata +4 -2
data/src/enc/yp_unicode.c CHANGED
@@ -10,7 +10,7 @@ typedef uint32_t yp_unicode_codepoint_t;
10
10
  // this table is different from other encodings where we used a lookup table
11
11
  // because the indices of those tables are the byte representations, not the
12
12
  // codepoints themselves.
13
- unsigned char yp_encoding_unicode_table[256] = {
13
+ uint8_t yp_encoding_unicode_table[256] = {
14
14
  // 0 1 2 3 4 5 6 7 8 9 A B C D E F
15
15
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
16
16
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
@@ -2220,7 +2220,7 @@ static const uint8_t yp_utf_8_dfa[] = {
2220
2220
  };
2221
2221
 
2222
2222
  static yp_unicode_codepoint_t
2223
- yp_utf_8_codepoint(const unsigned char *c, ptrdiff_t n, size_t *width) {
2223
+ yp_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
2224
2224
  assert(n >= 1);
2225
2225
  size_t maximum = (size_t) n;
2226
2226
 
@@ -2228,7 +2228,7 @@ yp_utf_8_codepoint(const unsigned char *c, ptrdiff_t n, size_t *width) {
2228
2228
  uint32_t state = 0;
2229
2229
 
2230
2230
  for (size_t index = 0; index < 4 && index < maximum; index++) {
2231
- uint32_t byte = c[index];
2231
+ uint32_t byte = b[index];
2232
2232
  uint32_t type = yp_utf_8_dfa[byte];
2233
2233
 
2234
2234
  codepoint = (state != 0) ?
@@ -2247,60 +2247,55 @@ yp_utf_8_codepoint(const unsigned char *c, ptrdiff_t n, size_t *width) {
2247
2247
  }
2248
2248
 
2249
2249
  static size_t
2250
- yp_encoding_utf_8_char_width(const char *c, ptrdiff_t n) {
2250
+ yp_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
2251
2251
  size_t width;
2252
- const unsigned char *v = (const unsigned char *) c;
2253
-
2254
- yp_utf_8_codepoint(v, n, &width);
2252
+ yp_utf_8_codepoint(b, n, &width);
2255
2253
  return width;
2256
2254
  }
2257
2255
 
2258
2256
  size_t
2259
- yp_encoding_utf_8_alpha_char(const char *c, ptrdiff_t n) {
2260
- const unsigned char *v = (const unsigned char *) c;
2261
- if (*v < 0x80) {
2262
- return (yp_encoding_unicode_table[*v] & YP_ENCODING_ALPHABETIC_BIT) ? 1 : 0;
2257
+ yp_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
2258
+ if (*b < 0x80) {
2259
+ return (yp_encoding_unicode_table[*b] & YP_ENCODING_ALPHABETIC_BIT) ? 1 : 0;
2263
2260
  }
2264
2261
 
2265
2262
  size_t width;
2266
- yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(v, n, &width);
2263
+ yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(b, n, &width);
2267
2264
 
2268
2265
  if (codepoint <= 0xFF) {
2269
- return (yp_encoding_unicode_table[(unsigned char) codepoint] & YP_ENCODING_ALPHABETIC_BIT) ? width : 0;
2266
+ return (yp_encoding_unicode_table[(uint8_t) codepoint] & YP_ENCODING_ALPHABETIC_BIT) ? width : 0;
2270
2267
  } else {
2271
2268
  return yp_unicode_codepoint_match(codepoint, unicode_alpha_codepoints, UNICODE_ALPHA_CODEPOINTS_LENGTH) ? width : 0;
2272
2269
  }
2273
2270
  }
2274
2271
 
2275
2272
  size_t
2276
- yp_encoding_utf_8_alnum_char(const char *c, ptrdiff_t n) {
2277
- const unsigned char *v = (const unsigned char *) c;
2278
- if (*v < 0x80) {
2279
- return (yp_encoding_unicode_table[*v] & (YP_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
2273
+ yp_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
2274
+ if (*b < 0x80) {
2275
+ return (yp_encoding_unicode_table[*b] & (YP_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
2280
2276
  }
2281
2277
 
2282
2278
  size_t width;
2283
- yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(v, n, &width);
2279
+ yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(b, n, &width);
2284
2280
 
2285
2281
  if (codepoint <= 0xFF) {
2286
- return (yp_encoding_unicode_table[(unsigned char) codepoint] & (YP_ENCODING_ALPHANUMERIC_BIT)) ? width : 0;
2282
+ return (yp_encoding_unicode_table[(uint8_t) codepoint] & (YP_ENCODING_ALPHANUMERIC_BIT)) ? width : 0;
2287
2283
  } else {
2288
2284
  return yp_unicode_codepoint_match(codepoint, unicode_alnum_codepoints, UNICODE_ALNUM_CODEPOINTS_LENGTH) ? width : 0;
2289
2285
  }
2290
2286
  }
2291
2287
 
2292
2288
  static bool
2293
- yp_encoding_utf_8_isupper_char(const char *c, ptrdiff_t n) {
2294
- const unsigned char *v = (const unsigned char *) c;
2295
- if (*v < 0x80) {
2296
- return (yp_encoding_unicode_table[*v] & YP_ENCODING_UPPERCASE_BIT) ? true : false;
2289
+ yp_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
2290
+ if (*b < 0x80) {
2291
+ return (yp_encoding_unicode_table[*b] & YP_ENCODING_UPPERCASE_BIT) ? true : false;
2297
2292
  }
2298
2293
 
2299
2294
  size_t width;
2300
- yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(v, n, &width);
2295
+ yp_unicode_codepoint_t codepoint = yp_utf_8_codepoint(b, n, &width);
2301
2296
 
2302
2297
  if (codepoint <= 0xFF) {
2303
- return (yp_encoding_unicode_table[(unsigned char) codepoint] & YP_ENCODING_UPPERCASE_BIT) ? true : false;
2298
+ return (yp_encoding_unicode_table[(uint8_t) codepoint] & YP_ENCODING_UPPERCASE_BIT) ? true : false;
2304
2299
  } else {
2305
2300
  return yp_unicode_codepoint_match(codepoint, unicode_isupper_codepoints, UNICODE_ISUPPER_CODEPOINTS_LENGTH) ? true : false;
2306
2301
  }
data/src/enc/yp_windows_31j.c CHANGED
@@ -1,73 +1,46 @@
1
1
  #include "yarp/enc/yp_encoding.h"
2
2
 
3
- typedef uint16_t yp_windows_31j_codepoint_t;
4
-
5
- static yp_windows_31j_codepoint_t
6
- yp_windows_31j_codepoint(const char *c, ptrdiff_t n, size_t *width) {
7
- const unsigned char *uc = (const unsigned char *) c;
8
-
3
+ static size_t
4
+ yp_encoding_windows_31j_char_width(const uint8_t *b, ptrdiff_t n) {
9
5
  // These are the single byte characters.
10
- if (*uc < 0x80 || (*uc >= 0xA1 && *uc <= 0xDF)) {
11
- *width = 1;
12
- return *uc;
6
+ if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) {
7
+ return 1;
13
8
  }
14
9
 
15
10
  // These are the double byte characters.
16
11
  if (
17
12
  (n > 1) &&
18
- ((uc[0] >= 0x81 && uc[0] <= 0x9F) || (uc[0] >= 0xE0 && uc[0] <= 0xFC)) &&
19
- (uc[1] >= 0x40 && uc[1] <= 0xFC)
13
+ ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) &&
14
+ (b[1] >= 0x40 && b[1] <= 0xFC)
20
15
  ) {
21
- *width = 2;
22
- return (yp_windows_31j_codepoint_t) (uc[0] << 8 | uc[1]);
16
+ return 2;
23
17
  }
24
18
 
25
- *width = 0;
26
19
  return 0;
27
20
  }
28
21
 
29
22
  static size_t
30
- yp_encoding_windows_31j_char_width(const char *c, ptrdiff_t n) {
31
- size_t width;
32
- yp_windows_31j_codepoint(c, n, &width);
33
-
34
- return width;
35
- }
36
-
37
- static size_t
38
- yp_encoding_windows_31j_alpha_char(const char *c, ptrdiff_t n) {
39
- size_t width;
40
- yp_windows_31j_codepoint_t codepoint = yp_windows_31j_codepoint(c, n, &width);
41
-
42
- if (width == 1) {
43
- const char value = (const char) codepoint;
44
- return yp_encoding_ascii_alpha_char(&value, n);
23
+ yp_encoding_windows_31j_alpha_char(const uint8_t *b, ptrdiff_t n) {
24
+ if (yp_encoding_windows_31j_char_width(b, n) == 1) {
25
+ return yp_encoding_ascii_alpha_char(b, n);
45
26
  } else {
46
27
  return 0;
47
28
  }
48
29
  }
49
30
 
50
31
  static size_t
51
- yp_encoding_windows_31j_alnum_char(const char *c, ptrdiff_t n) {
52
- size_t width;
53
- yp_windows_31j_codepoint_t codepoint = yp_windows_31j_codepoint(c, n, &width);
54
-
55
- if (width == 1) {
56
- const char value = (const char) codepoint;
57
- return yp_encoding_ascii_alnum_char(&value, n);
32
+ yp_encoding_windows_31j_alnum_char(const uint8_t *b, ptrdiff_t n) {
33
+ if (yp_encoding_windows_31j_char_width(b, n) == 1) {
34
+ return yp_encoding_ascii_alnum_char(b, n);
58
35
  } else {
59
36
  return 0;
60
37
  }
61
38
  }
62
39
 
63
40
  static bool
64
- yp_encoding_windows_31j_isupper_char(const char *c, ptrdiff_t n) {
65
- size_t width;
66
- yp_windows_31j_codepoint_t codepoint = yp_windows_31j_codepoint(c, n, &width);
67
-
68
- if (width == 1) {
69
- const char value = (const char) codepoint;
70
- return yp_encoding_ascii_isupper_char(&value, n);
41
+ yp_encoding_windows_31j_isupper_char(const uint8_t *b, ptrdiff_t n) {
42
+ if (yp_encoding_windows_31j_char_width(b, n) == 1) {
43
+ return yp_encoding_ascii_isupper_char(b, n);
71
44
  } else {
72
45
  return false;
73
46
  }