prism 0.24.0 → 0.25.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117) hide show
  1. checksums.yaml +4 -4
  2. data/BSDmakefile +58 -0
  3. data/CHANGELOG.md +50 -1
  4. data/Makefile +5 -2
  5. data/README.md +45 -6
  6. data/config.yml +499 -4
  7. data/docs/build_system.md +31 -0
  8. data/docs/configuration.md +2 -0
  9. data/docs/cruby_compilation.md +1 -1
  10. data/docs/parser_translation.md +14 -9
  11. data/docs/releasing.md +2 -2
  12. data/docs/ripper_translation.md +50 -0
  13. data/docs/ruby_api.md +1 -0
  14. data/docs/serialization.md +26 -5
  15. data/ext/prism/api_node.c +911 -815
  16. data/ext/prism/api_pack.c +9 -0
  17. data/ext/prism/extconf.rb +27 -11
  18. data/ext/prism/extension.c +313 -66
  19. data/ext/prism/extension.h +5 -4
  20. data/include/prism/ast.h +213 -64
  21. data/include/prism/defines.h +106 -2
  22. data/include/prism/diagnostic.h +134 -71
  23. data/include/prism/encoding.h +22 -4
  24. data/include/prism/node.h +93 -0
  25. data/include/prism/options.h +82 -7
  26. data/include/prism/pack.h +11 -0
  27. data/include/prism/parser.h +198 -53
  28. data/include/prism/prettyprint.h +8 -0
  29. data/include/prism/static_literals.h +118 -0
  30. data/include/prism/util/pm_buffer.h +65 -2
  31. data/include/prism/util/pm_constant_pool.h +18 -1
  32. data/include/prism/util/pm_integer.h +119 -0
  33. data/include/prism/util/pm_list.h +1 -1
  34. data/include/prism/util/pm_newline_list.h +8 -0
  35. data/include/prism/util/pm_string.h +26 -2
  36. data/include/prism/version.h +2 -2
  37. data/include/prism.h +59 -1
  38. data/lib/prism/compiler.rb +8 -1
  39. data/lib/prism/debug.rb +46 -3
  40. data/lib/prism/desugar_compiler.rb +1 -1
  41. data/lib/prism/dispatcher.rb +29 -0
  42. data/lib/prism/dot_visitor.rb +87 -16
  43. data/lib/prism/dsl.rb +24 -12
  44. data/lib/prism/ffi.rb +67 -12
  45. data/lib/prism/lex_compat.rb +17 -15
  46. data/lib/prism/mutation_compiler.rb +11 -0
  47. data/lib/prism/node.rb +2096 -2499
  48. data/lib/prism/node_ext.rb +77 -29
  49. data/lib/prism/pack.rb +4 -0
  50. data/lib/prism/parse_result/comments.rb +34 -17
  51. data/lib/prism/parse_result/newlines.rb +3 -1
  52. data/lib/prism/parse_result.rb +78 -32
  53. data/lib/prism/pattern.rb +16 -4
  54. data/lib/prism/polyfill/string.rb +12 -0
  55. data/lib/prism/serialize.rb +439 -102
  56. data/lib/prism/translation/parser/compiler.rb +152 -50
  57. data/lib/prism/translation/parser/lexer.rb +103 -22
  58. data/lib/prism/translation/parser/rubocop.rb +41 -13
  59. data/lib/prism/translation/parser.rb +119 -7
  60. data/lib/prism/translation/parser33.rb +1 -1
  61. data/lib/prism/translation/parser34.rb +1 -1
  62. data/lib/prism/translation/ripper/sexp.rb +125 -0
  63. data/lib/prism/translation/ripper/shim.rb +5 -0
  64. data/lib/prism/translation/ripper.rb +3212 -462
  65. data/lib/prism/translation/ruby_parser.rb +35 -18
  66. data/lib/prism/translation.rb +3 -1
  67. data/lib/prism/visitor.rb +10 -0
  68. data/lib/prism.rb +8 -2
  69. data/prism.gemspec +33 -4
  70. data/rbi/prism/compiler.rbi +14 -0
  71. data/rbi/prism/desugar_compiler.rbi +5 -0
  72. data/rbi/prism/mutation_compiler.rbi +5 -0
  73. data/rbi/prism/node.rbi +8221 -0
  74. data/rbi/prism/node_ext.rbi +102 -0
  75. data/rbi/prism/parse_result.rbi +304 -0
  76. data/rbi/prism/translation/parser/compiler.rbi +13 -0
  77. data/rbi/prism/translation/ripper/ripper_compiler.rbi +5 -0
  78. data/rbi/prism/translation/ripper.rbi +25 -0
  79. data/rbi/prism/translation/ruby_parser.rbi +11 -0
  80. data/rbi/prism/visitor.rbi +470 -0
  81. data/rbi/prism.rbi +39 -7749
  82. data/sig/prism/compiler.rbs +9 -0
  83. data/sig/prism/dispatcher.rbs +16 -0
  84. data/sig/prism/dot_visitor.rbs +6 -0
  85. data/sig/prism/dsl.rbs +462 -0
  86. data/sig/prism/mutation_compiler.rbs +158 -0
  87. data/sig/prism/node.rbs +3529 -0
  88. data/sig/prism/node_ext.rbs +78 -0
  89. data/sig/prism/pack.rbs +43 -0
  90. data/sig/prism/parse_result.rbs +127 -0
  91. data/sig/prism/pattern.rbs +13 -0
  92. data/sig/prism/serialize.rbs +7 -0
  93. data/sig/prism/visitor.rbs +168 -0
  94. data/sig/prism.rbs +188 -4767
  95. data/src/diagnostic.c +575 -230
  96. data/src/encoding.c +211 -108
  97. data/src/node.c +7526 -447
  98. data/src/options.c +36 -12
  99. data/src/pack.c +33 -17
  100. data/src/prettyprint.c +1294 -1385
  101. data/src/prism.c +3628 -1099
  102. data/src/regexp.c +17 -2
  103. data/src/serialize.c +47 -28
  104. data/src/static_literals.c +552 -0
  105. data/src/token_type.c +1 -0
  106. data/src/util/pm_buffer.c +147 -20
  107. data/src/util/pm_char.c +4 -4
  108. data/src/util/pm_constant_pool.c +35 -11
  109. data/src/util/pm_integer.c +629 -0
  110. data/src/util/pm_list.c +1 -1
  111. data/src/util/pm_newline_list.c +14 -5
  112. data/src/util/pm_string.c +134 -5
  113. data/src/util/pm_string_list.c +2 -2
  114. metadata +35 -6
  115. data/docs/ripper.md +0 -36
  116. data/rbi/prism_static.rbi +0 -207
  117. data/sig/prism_static.rbs +0 -201
data/src/encoding.c CHANGED
@@ -1499,7 +1499,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
1499
1499
  0x31350, 0x323AF,
1500
1500
  };
1501
1501
 
1502
- #define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1296
1502
+ #define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1302
1503
1503
  static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_CODEPOINTS_LENGTH] = {
1504
1504
  0x100, 0x100,
1505
1505
  0x102, 0x102,
@@ -1582,9 +1582,9 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
1582
1582
  0x1B5, 0x1B5,
1583
1583
  0x1B7, 0x1B8,
1584
1584
  0x1BC, 0x1BC,
1585
- 0x1C4, 0x1C4,
1586
- 0x1C7, 0x1C7,
1587
- 0x1CA, 0x1CA,
1585
+ 0x1C4, 0x1C5,
1586
+ 0x1C7, 0x1C8,
1587
+ 0x1CA, 0x1CB,
1588
1588
  0x1CD, 0x1CD,
1589
1589
  0x1CF, 0x1CF,
1590
1590
  0x1D1, 0x1D1,
@@ -1602,7 +1602,7 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
1602
1602
  0x1EA, 0x1EA,
1603
1603
  0x1EC, 0x1EC,
1604
1604
  0x1EE, 0x1EE,
1605
- 0x1F1, 0x1F1,
1605
+ 0x1F1, 0x1F2,
1606
1606
  0x1F4, 0x1F4,
1607
1607
  0x1F6, 0x1F8,
1608
1608
  0x1FA, 0x1FA,
@@ -1910,11 +1910,14 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
1910
1910
  0x1F5D, 0x1F5D,
1911
1911
  0x1F5F, 0x1F5F,
1912
1912
  0x1F68, 0x1F6F,
1913
- 0x1FB8, 0x1FBB,
1914
- 0x1FC8, 0x1FCB,
1913
+ 0x1F88, 0x1F8F,
1914
+ 0x1F98, 0x1F9F,
1915
+ 0x1FA8, 0x1FAF,
1916
+ 0x1FB8, 0x1FBC,
1917
+ 0x1FC8, 0x1FCC,
1915
1918
  0x1FD8, 0x1FDB,
1916
1919
  0x1FE8, 0x1FEC,
1917
- 0x1FF8, 0x1FFB,
1920
+ 0x1FF8, 0x1FFC,
1918
1921
  0x2102, 0x2102,
1919
1922
  0x2107, 0x2107,
1920
1923
  0x210B, 0x210D,
@@ -2355,6 +2358,8 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
2355
2358
  }
2356
2359
  }
2357
2360
 
2361
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
2362
+
2358
2363
  static pm_unicode_codepoint_t
2359
2364
  pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
2360
2365
  if (b[0] < 0x80) {
@@ -2449,13 +2454,15 @@ pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
2449
2454
  }
2450
2455
  }
2451
2456
 
2457
+ #endif
2458
+
2452
2459
  #undef UNICODE_ALPHA_CODEPOINTS_LENGTH
2453
2460
  #undef UNICODE_ALNUM_CODEPOINTS_LENGTH
2454
2461
  #undef UNICODE_ISUPPER_CODEPOINTS_LENGTH
2455
2462
 
2456
2463
  /**
2457
2464
  * Each element of the following table contains a bitfield that indicates a
2458
- * piece of information about the corresponding ASCII character.
2465
+ * piece of information about the corresponding US-ASCII character.
2459
2466
  */
2460
2467
  static const uint8_t pm_encoding_ascii_table[256] = {
2461
2468
  // 0 1 2 3 4 5 6 7 8 9 A B C D E F
@@ -2477,6 +2484,8 @@ static const uint8_t pm_encoding_ascii_table[256] = {
2477
2484
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
2478
2485
  };
2479
2486
 
2487
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
2488
+
2480
2489
  /**
2481
2490
  * Each element of the following table contains a bitfield that indicates a
2482
2491
  * piece of information about the corresponding CP850 character.
@@ -3624,7 +3633,7 @@ static const uint8_t pm_encoding_windows_1250_table[256] = {
3624
3633
  0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7, // Ax
3625
3634
  0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 7, 0, 3, 3, // Bx
3626
3635
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
3627
- 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
3636
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
3628
3637
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
3629
3638
  3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
3630
3639
  };
@@ -3672,7 +3681,7 @@ static const uint8_t pm_encoding_windows_1252_table[256] = {
3672
3681
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Ax
3673
3682
  0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Bx
3674
3683
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
3675
- 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
3684
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
3676
3685
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
3677
3686
  3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
3678
3687
  };
@@ -3915,6 +3924,7 @@ PRISM_ENCODING_TABLE(windows_1258)
3915
3924
  PRISM_ENCODING_TABLE(windows_874)
3916
3925
 
3917
3926
  #undef PRISM_ENCODING_TABLE
3927
+ #endif
3918
3928
 
3919
3929
  /**
3920
3930
  * Returns the size of the next character in the ASCII encoding. This basically
@@ -3973,22 +3983,129 @@ pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_
3973
3983
  }
3974
3984
 
3975
3985
  /**
3976
- * Certain encodings are equivalent to ASCII below 0x80, so it works for our
3977
- * purposes to have a function here that first checks the bounds and then falls
3978
- * back to checking the ASCII lookup table.
3986
+ * For a lot of encodings the default is that they are a single byte long no
3987
+ * matter what the codepoint, so this function is shared between them.
3988
+ */
3989
+ static size_t
3990
+ pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
3991
+ return 1;
3992
+ }
3993
+
3994
/**
 * Returns the size of the next character in the EUC-JP encoding, or 0 if a
 * character cannot be decoded from the given bytes.
 *
 * @param b Pointer to the first byte of the candidate character. The first
 *     byte is read unconditionally, so it must be readable.
 * @param n The number of bytes available starting at b.
 * @return 1, 2, or 3 when the bytes form a valid sequence, otherwise 0.
 */
static size_t
pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
    // These are the single byte characters.
    if (*b < 0x80) {
        return 1;
    }

    // These are the double byte characters.
    if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
        return 2;
    }

    // These are the triple byte characters. Both trailing bytes must fall in
    // 0xA1..0xFE; the upper bound on the middle byte has to be checked against
    // b[1] itself, otherwise a sequence such as 8F FF A1 is wrongly accepted.
    if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[1] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
        return 3;
    }

    return 0;
}
4017
+
4018
+ /**
4019
+ * Returns true if the next character in the EUC-JP encoding is an uppercase
4020
+ * character, and false otherwise.
3979
4021
  */
3980
4022
  static bool
3981
- pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) {
3982
- return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
4023
+ pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
4024
+ size_t width = pm_encoding_euc_jp_char_width(b, n);
4025
+
4026
+ if (width == 1) {
4027
+ return pm_encoding_ascii_isupper_char(b, n);
4028
+ } else if (width == 2) {
4029
+ return (
4030
+ (b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) ||
4031
+ (b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) ||
4032
+ (b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1)
4033
+ );
4034
+ } else {
4035
+ return false;
4036
+ }
3983
4037
  }
3984
4038
 
3985
4039
  /**
3986
- * For a lot of encodings the default is that they are a single byte long no
3987
- * matter what the codepoint, so this function is shared between them.
4040
+ * Returns the size of the next character in the Shift_JIS encoding, or 0 if a
4041
+ * character cannot be decoded from the given bytes.
3988
4042
  */
3989
4043
  static size_t
3990
- pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
3991
- return 1;
4044
+ pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
4045
+ // These are the single byte characters.
4046
+ if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) {
4047
+ return 1;
4048
+ }
4049
+
4050
+ // These are the double byte characters.
4051
+ if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) {
4052
+ return 2;
4053
+ }
4054
+
4055
+ return 0;
4056
+ }
4057
+
4058
+ /**
4059
+ * Returns the size of the next character in the Shift_JIS encoding if it is an
4060
+ * alphanumeric character.
4061
+ */
4062
+ static size_t
4063
+ pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) {
4064
+ size_t width = pm_encoding_shift_jis_char_width(b, n);
4065
+ return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n)) : width;
4066
+ }
4067
+
4068
+ /**
4069
+ * Returns the size of the next character in the Shift_JIS encoding if it is an
4070
+ * alphabetical character.
4071
+ */
4072
+ static size_t
4073
+ pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) {
4074
+ size_t width = pm_encoding_shift_jis_char_width(b, n);
4075
+ return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n)) : width;
4076
+ }
4077
+
4078
+ /**
4079
+ * Returns true if the next character in the Shift_JIS encoding is an
4080
+ * uppercase character, and false otherwise.
4081
+ */
4082
+ static bool
4083
+ pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
4084
+ size_t width = pm_encoding_shift_jis_char_width(b, n);
4085
+
4086
+ if (width == 1) {
4087
+ return pm_encoding_ascii_isupper_char(b, n);
4088
+ } else if (width == 2) {
4089
+ return (
4090
+ ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) ||
4091
+ ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) ||
4092
+ ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60))
4093
+ );
4094
+ } else {
4095
+ return width;
4096
+ }
4097
+ }
4098
+
4099
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
4100
+
4101
+ /**
4102
+ * Certain encodings are equivalent to ASCII below 0x80, so it works for our
4103
+ * purposes to have a function here that first checks the bounds and then falls
4104
+ * back to checking the ASCII lookup table.
4105
+ */
4106
+ static bool
4107
+ pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) {
4108
+ return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
3992
4109
  }
3993
4110
 
3994
4111
  /**
@@ -4022,7 +4139,7 @@ pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) {
4022
4139
  }
4023
4140
 
4024
4141
  // These are the double byte characters
4025
- if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xfe) && (b[1] >= 0x41 && b[1] <= 0xfe)) {
4142
+ if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xFE) && ((b[1] >= 0x41 && b[1] <= 0x5A) || (b[1] >= 0x61 && b[1] <= 0x7A) || (b[1] >= 0x81 && b[1] <= 0xFE))) {
4026
4143
  return 2;
4027
4144
  }
4028
4145
 
@@ -4072,30 +4189,6 @@ pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) {
4072
4189
  return 0;
4073
4190
  }
4074
4191
 
4075
- /**
4076
- * Returns the size of the next character in the EUC-JP encoding, or 0 if a
4077
- * character cannot be decoded from the given bytes.
4078
- */
4079
- static size_t
4080
- pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
4081
- // These are the single byte characters.
4082
- if (*b < 0x80) {
4083
- return 1;
4084
- }
4085
-
4086
- // These are the double byte characters.
4087
- if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
4088
- return 2;
4089
- }
4090
-
4091
- // These are the triple byte characters.
4092
- if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
4093
- return 3;
4094
- }
4095
-
4096
- return 0;
4097
- }
4098
-
4099
4192
  /**
4100
4193
  * Returns the size of the next character in the EUC-KR encoding, or 0 if a
4101
4194
  * character cannot be decoded from the given bytes.
@@ -4194,24 +4287,7 @@ pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
4194
4287
  return 0;
4195
4288
  }
4196
4289
 
4197
- /**
4198
- * Returns the size of the next character in the Shift_JIS encoding, or 0 if a
4199
- * character cannot be decoded from the given bytes.
4200
- */
4201
- static size_t
4202
- pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
4203
- // These are the single byte characters.
4204
- if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) {
4205
- return 1;
4206
- }
4207
-
4208
- // These are the double byte characters.
4209
- if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC)) {
4210
- return 2;
4211
- }
4212
-
4213
- return 0;
4214
- }
4290
+ #endif
4215
4291
 
4216
4292
  /**
4217
4293
  * This is the table of all of the encodings that prism supports.
@@ -4225,6 +4301,14 @@ const pm_encoding_t pm_encodings[] = {
4225
4301
  .isupper_char = pm_encoding_utf_8_isupper_char,
4226
4302
  .multibyte = true
4227
4303
  },
4304
+ [PM_ENCODING_US_ASCII] = {
4305
+ .name = "US-ASCII",
4306
+ .char_width = pm_encoding_ascii_char_width,
4307
+ .alnum_char = pm_encoding_ascii_alnum_char,
4308
+ .alpha_char = pm_encoding_ascii_alpha_char,
4309
+ .isupper_char = pm_encoding_ascii_isupper_char,
4310
+ .multibyte = false
4311
+ },
4228
4312
  [PM_ENCODING_ASCII_8BIT] = {
4229
4313
  .name = "ASCII-8BIT",
4230
4314
  .char_width = pm_encoding_single_char_width,
@@ -4233,6 +4317,24 @@ const pm_encoding_t pm_encodings[] = {
4233
4317
  .isupper_char = pm_encoding_ascii_isupper_char,
4234
4318
  .multibyte = false
4235
4319
  },
4320
+ [PM_ENCODING_EUC_JP] = {
4321
+ .name = "EUC-JP",
4322
+ .char_width = pm_encoding_euc_jp_char_width,
4323
+ .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4324
+ .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4325
+ .isupper_char = pm_encoding_euc_jp_isupper_char,
4326
+ .multibyte = true
4327
+ },
4328
+ [PM_ENCODING_WINDOWS_31J] = {
4329
+ .name = "Windows-31J",
4330
+ .char_width = pm_encoding_shift_jis_char_width,
4331
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
4332
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
4333
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
4334
+ .multibyte = true
4335
+ },
4336
+
4337
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
4236
4338
  [PM_ENCODING_BIG5] = {
4237
4339
  .name = "Big5",
4238
4340
  .char_width = pm_encoding_big5_char_width,
@@ -4270,7 +4372,7 @@ const pm_encoding_t pm_encodings[] = {
4270
4372
  .char_width = pm_encoding_euc_jp_char_width,
4271
4373
  .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4272
4374
  .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4273
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4375
+ .isupper_char = pm_encoding_euc_jp_isupper_char,
4274
4376
  .multibyte = true
4275
4377
  },
4276
4378
  [PM_ENCODING_CP850] = {
@@ -4329,20 +4431,12 @@ const pm_encoding_t pm_encodings[] = {
4329
4431
  .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4330
4432
  .multibyte = true
4331
4433
  },
4332
- [PM_ENCODING_EUC_JP] = {
4333
- .name = "EUC-JP",
4334
- .char_width = pm_encoding_euc_jp_char_width,
4335
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4336
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4337
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4338
- .multibyte = true
4339
- },
4340
4434
  [PM_ENCODING_EUC_JP_MS] = {
4341
4435
  .name = "eucJP-ms",
4342
4436
  .char_width = pm_encoding_euc_jp_char_width,
4343
4437
  .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4344
4438
  .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4345
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4439
+ .isupper_char = pm_encoding_euc_jp_isupper_char,
4346
4440
  .multibyte = true
4347
4441
  },
4348
4442
  [PM_ENCODING_EUC_JIS_2004] = {
@@ -4350,7 +4444,7 @@ const pm_encoding_t pm_encodings[] = {
4350
4444
  .char_width = pm_encoding_euc_jp_char_width,
4351
4445
  .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4352
4446
  .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4353
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4447
+ .isupper_char = pm_encoding_euc_jp_isupper_char,
4354
4448
  .multibyte = true
4355
4449
  },
4356
4450
  [PM_ENCODING_EUC_KR] = {
@@ -4708,9 +4802,9 @@ const pm_encoding_t pm_encodings[] = {
4708
4802
  [PM_ENCODING_MAC_JAPANESE] = {
4709
4803
  .name = "MacJapanese",
4710
4804
  .char_width = pm_encoding_shift_jis_char_width,
4711
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4712
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4713
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4805
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
4806
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
4807
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
4714
4808
  .multibyte = true
4715
4809
  },
4716
4810
  [PM_ENCODING_MAC_ROMAN] = {
@@ -4756,33 +4850,33 @@ const pm_encoding_t pm_encodings[] = {
4756
4850
  [PM_ENCODING_SHIFT_JIS] = {
4757
4851
  .name = "Shift_JIS",
4758
4852
  .char_width = pm_encoding_shift_jis_char_width,
4759
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4760
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4761
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4853
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
4854
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
4855
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
4762
4856
  .multibyte = true
4763
4857
  },
4764
4858
  [PM_ENCODING_SJIS_DOCOMO] = {
4765
4859
  .name = "SJIS-DoCoMo",
4766
4860
  .char_width = pm_encoding_shift_jis_char_width,
4767
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4768
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4769
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4861
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
4862
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
4863
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
4770
4864
  .multibyte = true
4771
4865
  },
4772
4866
  [PM_ENCODING_SJIS_KDDI] = {
4773
4867
  .name = "SJIS-KDDI",
4774
4868
  .char_width = pm_encoding_shift_jis_char_width,
4775
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4776
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4777
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4869
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
4870
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
4871
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
4778
4872
  .multibyte = true
4779
4873
  },
4780
4874
  [PM_ENCODING_SJIS_SOFTBANK] = {
4781
4875
  .name = "SJIS-SoftBank",
4782
4876
  .char_width = pm_encoding_shift_jis_char_width,
4783
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4784
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4785
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4877
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
4878
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
4879
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
4786
4880
  .multibyte = true
4787
4881
  },
4788
4882
  [PM_ENCODING_STATELESS_ISO_2022_JP] = {
@@ -4809,14 +4903,6 @@ const pm_encoding_t pm_encodings[] = {
4809
4903
  .isupper_char = pm_encoding_tis_620_isupper_char,
4810
4904
  .multibyte = false
4811
4905
  },
4812
- [PM_ENCODING_US_ASCII] = {
4813
- .name = "US-ASCII",
4814
- .char_width = pm_encoding_ascii_char_width,
4815
- .alnum_char = pm_encoding_ascii_alnum_char,
4816
- .alpha_char = pm_encoding_ascii_alpha_char,
4817
- .isupper_char = pm_encoding_ascii_isupper_char,
4818
- .multibyte = false
4819
- },
4820
4906
  [PM_ENCODING_UTF8_MAC] = {
4821
4907
  .name = "UTF8-MAC",
4822
4908
  .char_width = pm_encoding_utf_8_char_width,
@@ -4921,14 +5007,6 @@ const pm_encoding_t pm_encodings[] = {
4921
5007
  .isupper_char = pm_encoding_windows_1258_isupper_char,
4922
5008
  .multibyte = false
4923
5009
  },
4924
- [PM_ENCODING_WINDOWS_31J] = {
4925
- .name = "Windows-31J",
4926
- .char_width = pm_encoding_shift_jis_char_width,
4927
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4928
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4929
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4930
- .multibyte = true
4931
- },
4932
5010
  [PM_ENCODING_WINDOWS_874] = {
4933
5011
  .name = "Windows-874",
4934
5012
  .char_width = pm_encoding_single_char_width,
@@ -4937,6 +5015,7 @@ const pm_encoding_t pm_encodings[] = {
4937
5015
  .isupper_char = pm_encoding_windows_874_isupper_char,
4938
5016
  .multibyte = false
4939
5017
  }
5018
+ #endif
4940
5019
  };
4941
5020
 
4942
5021
  /**
@@ -4951,11 +5030,13 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
4951
5030
  // UTF-8 can contain extra information at the end about the platform it is
4952
5031
  // encoded on, such as UTF-8-MAC or UTF-8-UNIX. We'll ignore those suffixes.
4953
5032
  if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "UTF-8", 5) == 0)) {
5033
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
4954
5034
  // We need to explicitly handle UTF-8-HFS, as that one needs to switch
4955
5035
  // over to being UTF8-MAC.
4956
5036
  if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-HFS", 4) == 0)) {
4957
5037
  return &pm_encodings[PM_ENCODING_UTF8_MAC];
4958
5038
  }
5039
+ #endif
4959
5040
 
4960
5041
  // Otherwise we'll return the default UTF-8 encoding.
4961
5042
  return PM_ENCODING_UTF_8_ENTRY;
@@ -4975,11 +5056,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
4975
5056
  break;
4976
5057
  case 'B': case 'b':
4977
5058
  ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT);
5059
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
4978
5060
  ENCODING1("Big5", PM_ENCODING_BIG5);
4979
5061
  ENCODING2("Big5-HKSCS", "Big5-HKSCS:2008", PM_ENCODING_BIG5_HKSCS);
4980
5062
  ENCODING1("Big5-UAO", PM_ENCODING_BIG5_UAO);
5063
+ #endif
4981
5064
  break;
4982
5065
  case 'C': case 'c':
5066
+ ENCODING1("CP65001", PM_ENCODING_UTF_8);
5067
+ ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
5068
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
4983
5069
  ENCODING1("CESU-8", PM_ENCODING_CESU_8);
4984
5070
  ENCODING1("CP437", PM_ENCODING_IBM437);
4985
5071
  ENCODING1("CP720", PM_ENCODING_IBM720);
@@ -4999,7 +5085,6 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
4999
5085
  ENCODING1("CP874", PM_ENCODING_WINDOWS_874);
5000
5086
  ENCODING1("CP878", PM_ENCODING_KOI8_R);
5001
5087
  ENCODING1("CP863", PM_ENCODING_IBM863);
5002
- ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
5003
5088
  ENCODING1("CP936", PM_ENCODING_GBK);
5004
5089
  ENCODING1("CP949", PM_ENCODING_CP949);
5005
5090
  ENCODING1("CP950", PM_ENCODING_CP950);
@@ -5014,25 +5099,30 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
5014
5099
  ENCODING1("CP1257", PM_ENCODING_WINDOWS_1257);
5015
5100
  ENCODING1("CP1258", PM_ENCODING_WINDOWS_1258);
5016
5101
  ENCODING1("CP51932", PM_ENCODING_CP51932);
5017
- ENCODING1("CP65001", PM_ENCODING_UTF_8);
5102
+ #endif
5018
5103
  break;
5019
5104
  case 'E': case 'e':
5020
5105
  ENCODING2("EUC-JP", "eucJP", PM_ENCODING_EUC_JP);
5106
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5021
5107
  ENCODING2("eucJP-ms", "euc-jp-ms", PM_ENCODING_EUC_JP_MS);
5022
5108
  ENCODING2("EUC-JIS-2004", "EUC-JISX0213", PM_ENCODING_EUC_JIS_2004);
5023
5109
  ENCODING2("EUC-KR", "eucKR", PM_ENCODING_EUC_KR);
5024
5110
  ENCODING2("EUC-CN", "eucCN", PM_ENCODING_GB2312);
5025
5111
  ENCODING2("EUC-TW", "eucTW", PM_ENCODING_EUC_TW);
5026
5112
  ENCODING1("Emacs-Mule", PM_ENCODING_EMACS_MULE);
5113
+ #endif
5027
5114
  break;
5028
5115
  case 'G': case 'g':
5116
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5029
5117
  ENCODING1("GBK", PM_ENCODING_GBK);
5030
5118
  ENCODING1("GB12345", PM_ENCODING_GB12345);
5031
5119
  ENCODING1("GB18030", PM_ENCODING_GB18030);
5032
5120
  ENCODING1("GB1988", PM_ENCODING_GB1988);
5033
5121
  ENCODING1("GB2312", PM_ENCODING_GB2312);
5122
+ #endif
5034
5123
  break;
5035
5124
  case 'I': case 'i':
5125
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5036
5126
  ENCODING1("IBM437", PM_ENCODING_IBM437);
5037
5127
  ENCODING1("IBM720", PM_ENCODING_IBM720);
5038
5128
  ENCODING1("IBM737", PM_ENCODING_IBM737);
@@ -5064,12 +5154,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
5064
5154
  ENCODING2("ISO-8859-14", "ISO8859-14", PM_ENCODING_ISO_8859_14);
5065
5155
  ENCODING2("ISO-8859-15", "ISO8859-15", PM_ENCODING_ISO_8859_15);
5066
5156
  ENCODING2("ISO-8859-16", "ISO8859-16", PM_ENCODING_ISO_8859_16);
5157
+ #endif
5067
5158
  break;
5068
5159
  case 'K': case 'k':
5160
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5069
5161
  ENCODING1("KOI8-R", PM_ENCODING_KOI8_R);
5070
5162
  ENCODING1("KOI8-U", PM_ENCODING_KOI8_U);
5163
+ #endif
5071
5164
  break;
5072
5165
  case 'M': case 'm':
5166
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5073
5167
  ENCODING1("macCentEuro", PM_ENCODING_MAC_CENT_EURO);
5074
5168
  ENCODING1("macCroatian", PM_ENCODING_MAC_CROATIAN);
5075
5169
  ENCODING1("macCyrillic", PM_ENCODING_MAC_CYRILLIC);
@@ -5082,31 +5176,39 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
5082
5176
  ENCODING1("macThai", PM_ENCODING_MAC_THAI);
5083
5177
  ENCODING1("macTurkish", PM_ENCODING_MAC_TURKISH);
5084
5178
  ENCODING1("macUkraine", PM_ENCODING_MAC_UKRAINE);
5179
+ #endif
5085
5180
  break;
5086
5181
  case 'P': case 'p':
5087
5182
  ENCODING1("PCK", PM_ENCODING_WINDOWS_31J);
5088
5183
  break;
5089
5184
  case 'S': case 's':
5090
- ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
5091
5185
  ENCODING1("SJIS", PM_ENCODING_WINDOWS_31J);
5186
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5187
+ ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
5092
5188
  ENCODING1("SJIS-DoCoMo", PM_ENCODING_SJIS_DOCOMO);
5093
5189
  ENCODING1("SJIS-KDDI", PM_ENCODING_SJIS_KDDI);
5094
5190
  ENCODING1("SJIS-SoftBank", PM_ENCODING_SJIS_SOFTBANK);
5095
5191
  ENCODING1("stateless-ISO-2022-JP", PM_ENCODING_STATELESS_ISO_2022_JP);
5096
5192
  ENCODING1("stateless-ISO-2022-JP-KDDI", PM_ENCODING_STATELESS_ISO_2022_JP_KDDI);
5193
+ #endif
5097
5194
  break;
5098
5195
  case 'T': case 't':
5196
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5099
5197
  ENCODING1("TIS-620", PM_ENCODING_TIS_620);
5198
+ #endif
5100
5199
  break;
5101
5200
  case 'U': case 'u':
5102
5201
  ENCODING1("US-ASCII", PM_ENCODING_US_ASCII);
5202
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5103
5203
  ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC);
5104
5204
  ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO);
5105
5205
  ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI);
5106
5206
  ENCODING1("UTF8-SoftBank", PM_ENCODING_UTF8_SOFTBANK);
5207
+ #endif
5107
5208
  break;
5108
5209
  case 'W': case 'w':
5109
5210
  ENCODING1("Windows-31J", PM_ENCODING_WINDOWS_31J);
5211
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5110
5212
  ENCODING1("Windows-874", PM_ENCODING_WINDOWS_874);
5111
5213
  ENCODING1("Windows-1250", PM_ENCODING_WINDOWS_1250);
5112
5214
  ENCODING1("Windows-1251", PM_ENCODING_WINDOWS_1251);
@@ -5117,6 +5219,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
5117
5219
  ENCODING1("Windows-1256", PM_ENCODING_WINDOWS_1256);
5118
5220
  ENCODING1("Windows-1257", PM_ENCODING_WINDOWS_1257);
5119
5221
  ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258);
5222
+ #endif
5120
5223
  break;
5121
5224
  case '6':
5122
5225
  ENCODING1("646", PM_ENCODING_US_ASCII);