RubyGems - prism - Versions diffs - 0.24.0 → 0.29.0 - Mend

prism 0.24.0 → 0.29.0

Files changed (126) hide show

checksums.yaml +4 -4
data/BSDmakefile +58 -0
data/CHANGELOG.md +132 -1
data/Makefile +25 -18
data/README.md +45 -6
data/config.yml +828 -25
data/docs/build_system.md +31 -0
data/docs/configuration.md +4 -0
data/docs/cruby_compilation.md +1 -1
data/docs/parser_translation.md +14 -9
data/docs/releasing.md +7 -9
data/docs/ripper_translation.md +50 -0
data/docs/ruby_api.md +1 -0
data/docs/serialization.md +26 -5
data/ext/prism/api_node.c +1037 -936
data/ext/prism/api_pack.c +9 -0
data/ext/prism/extconf.rb +62 -18
data/ext/prism/extension.c +351 -71
data/ext/prism/extension.h +5 -4
data/include/prism/ast.h +539 -101
data/include/prism/defines.h +106 -2
data/include/prism/diagnostic.h +168 -74
data/include/prism/encoding.h +22 -4
data/include/prism/node.h +93 -0
data/include/prism/options.h +84 -9
data/include/prism/pack.h +11 -0
data/include/prism/parser.h +213 -54
data/include/prism/prettyprint.h +8 -0
data/include/prism/static_literals.h +120 -0
data/include/prism/util/pm_buffer.h +65 -2
data/include/prism/util/pm_constant_pool.h +18 -1
data/include/prism/util/pm_integer.h +119 -0
data/include/prism/util/pm_list.h +1 -1
data/include/prism/util/pm_newline_list.h +8 -0
data/include/prism/util/pm_string.h +26 -2
data/include/prism/version.h +2 -2
data/include/prism.h +59 -1
data/lib/prism/compiler.rb +8 -1
data/lib/prism/debug.rb +46 -3
data/lib/prism/desugar_compiler.rb +5 -3
data/lib/prism/dispatcher.rb +29 -0
data/lib/prism/dot_visitor.rb +141 -54
data/lib/prism/dsl.rb +48 -36
data/lib/prism/ffi.rb +82 -17
data/lib/prism/inspect_visitor.rb +2156 -0
data/lib/prism/lex_compat.rb +34 -15
data/lib/prism/mutation_compiler.rb +13 -2
data/lib/prism/node.rb +4453 -4459
data/lib/prism/node_ext.rb +249 -30
data/lib/prism/pack.rb +4 -0
data/lib/prism/parse_result/comments.rb +35 -18
data/lib/prism/parse_result/newlines.rb +2 -2
data/lib/prism/parse_result.rb +218 -43
data/lib/prism/pattern.rb +28 -10
data/lib/prism/polyfill/byteindex.rb +13 -0
data/lib/prism/polyfill/unpack1.rb +14 -0
data/lib/prism/reflection.rb +411 -0
data/lib/prism/serialize.rb +480 -112
data/lib/prism/translation/parser/compiler.rb +376 -88
data/lib/prism/translation/parser/lexer.rb +103 -22
data/lib/prism/translation/parser/rubocop.rb +41 -13
data/lib/prism/translation/parser.rb +123 -11
data/lib/prism/translation/parser33.rb +1 -1
data/lib/prism/translation/parser34.rb +1 -1
data/lib/prism/translation/ripper/sexp.rb +125 -0
data/lib/prism/translation/ripper/shim.rb +5 -0
data/lib/prism/translation/ripper.rb +3216 -462
data/lib/prism/translation/ruby_parser.rb +111 -56
data/lib/prism/translation.rb +3 -1
data/lib/prism/visitor.rb +10 -0
data/lib/prism.rb +12 -20
data/prism.gemspec +46 -14
data/rbi/prism/compiler.rbi +12 -0
data/rbi/prism/inspect_visitor.rbi +12 -0
data/rbi/prism/node.rbi +8712 -0
data/rbi/prism/node_ext.rbi +107 -0
data/rbi/prism/parse_result.rbi +358 -0
data/rbi/prism/reflection.rbi +58 -0
data/rbi/prism/translation/parser.rbi +11 -0
data/rbi/prism/translation/parser33.rbi +6 -0
data/rbi/prism/translation/parser34.rbi +6 -0
data/rbi/prism/translation/ripper.rbi +15 -0
data/rbi/prism/visitor.rbi +470 -0
data/rbi/prism.rbi +38 -7748
data/sig/prism/compiler.rbs +9 -0
data/sig/prism/dispatcher.rbs +16 -0
data/sig/prism/dot_visitor.rbs +6 -0
data/sig/prism/dsl.rbs +462 -0
data/sig/prism/inspect_visitor.rbs +22 -0
data/sig/prism/lex_compat.rbs +10 -0
data/sig/prism/mutation_compiler.rbs +158 -0
data/sig/prism/node.rbs +3558 -0
data/sig/prism/node_ext.rbs +82 -0
data/sig/prism/pack.rbs +43 -0
data/sig/prism/parse_result.rbs +160 -0
data/sig/prism/pattern.rbs +13 -0
data/sig/prism/reflection.rbs +50 -0
data/sig/prism/serialize.rbs +6 -0
data/sig/prism/visitor.rbs +168 -0
data/sig/prism.rbs +188 -4767
data/src/diagnostic.c +636 -230
data/src/encoding.c +211 -108
data/src/node.c +7555 -451
data/src/options.c +66 -31
data/src/pack.c +33 -17
data/src/prettyprint.c +1383 -1431
data/src/prism.c +4734 -1310
data/src/regexp.c +17 -2
data/src/serialize.c +68 -46
data/src/static_literals.c +638 -0
data/src/token_type.c +10 -9
data/src/util/pm_buffer.c +147 -20
data/src/util/pm_char.c +4 -4
data/src/util/pm_constant_pool.c +35 -11
data/src/util/pm_integer.c +642 -0
data/src/util/pm_list.c +1 -1
data/src/util/pm_newline_list.c +14 -5
data/src/util/pm_string.c +134 -5
data/src/util/pm_string_list.c +2 -2
metadata +41 -9
data/docs/ripper.md +0 -36
data/include/prism/util/pm_state_stack.h +0 -42
data/lib/prism/node_inspector.rb +0 -68
data/rbi/prism_static.rbi +0 -207
data/sig/prism_static.rbs +0 -201
data/src/util/pm_state_stack.c +0 -25

data/src/encoding.c CHANGED Viewed

@@ -1499,7 +1499,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x31350, 0x323AF,
 };
-#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1296
+#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1302
 static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_CODEPOINTS_LENGTH] = {
     0x100, 0x100,
     0x102, 0x102,
@@ -1582,9 +1582,9 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
     0x1B5, 0x1B5,
     0x1B7, 0x1B8,
     0x1BC, 0x1BC,
-    0x1C4, 0x1C4,
-    0x1C7, 0x1C7,
-    0x1CA, 0x1CA,
+    0x1C4, 0x1C5,
+    0x1C7, 0x1C8,
+    0x1CA, 0x1CB,
     0x1CD, 0x1CD,
     0x1CF, 0x1CF,
     0x1D1, 0x1D1,
@@ -1602,7 +1602,7 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
     0x1EA, 0x1EA,
     0x1EC, 0x1EC,
     0x1EE, 0x1EE,
-    0x1F1, 0x1F1,
+    0x1F1, 0x1F2,
     0x1F4, 0x1F4,
     0x1F6, 0x1F8,
     0x1FA, 0x1FA,
@@ -1910,11 +1910,14 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
     0x1F5D, 0x1F5D,
     0x1F5F, 0x1F5F,
     0x1F68, 0x1F6F,
-    0x1FB8, 0x1FBB,
-    0x1FC8, 0x1FCB,
+    0x1F88, 0x1F8F,
+    0x1F98, 0x1F9F,
+    0x1FA8, 0x1FAF,
+    0x1FB8, 0x1FBC,
+    0x1FC8, 0x1FCC,
     0x1FD8, 0x1FDB,
     0x1FE8, 0x1FEC,
-    0x1FF8, 0x1FFB,
+    0x1FF8, 0x1FFC,
     0x2102, 0x2102,
     0x2107, 0x2107,
     0x210B, 0x210D,
@@ -2355,6 +2358,8 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
 static pm_unicode_codepoint_t
 pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
     if (b[0] < 0x80) {
@@ -2449,13 +2454,15 @@ pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
+#endif
 #undef UNICODE_ALPHA_CODEPOINTS_LENGTH
 #undef UNICODE_ALNUM_CODEPOINTS_LENGTH
 #undef UNICODE_ISUPPER_CODEPOINTS_LENGTH
 /**
  * Each element of the following table contains a bitfield that indicates a
- * piece of information about the corresponding ASCII character.
+ * piece of information about the corresponding US-ASCII character.
  */
 static const uint8_t pm_encoding_ascii_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
@@ -2477,6 +2484,8 @@ static const uint8_t pm_encoding_ascii_table[256] = {
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
 };
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
 /**
  * Each element of the following table contains a bitfield that indicates a
  * piece of information about the corresponding CP850 character.
@@ -3624,7 +3633,7 @@ static const uint8_t pm_encoding_windows_1250_table[256] = {
     0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7, // Ax
     0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 7, 0, 3, 3, // Bx
     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
-    7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
     3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
 };
@@ -3672,7 +3681,7 @@ static const uint8_t pm_encoding_windows_1252_table[256] = {
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Ax
     0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Bx
     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
-    7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
     3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
 };
@@ -3915,6 +3924,7 @@ PRISM_ENCODING_TABLE(windows_1258)
 PRISM_ENCODING_TABLE(windows_874)
 #undef PRISM_ENCODING_TABLE
+#endif
 /**
  * Returns the size of the next character in the ASCII encoding. This basically
@@ -3973,22 +3983,129 @@ pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_
 }
 /**
- * Certain encodings are equivalent to ASCII below 0x80, so it works for our
- * purposes to have a function here that first checks the bounds and then falls
- * back to checking the ASCII lookup table.
+ * For a lot of encodings the default is that they are a single byte long no
+ * matter what the codepoint, so this function is shared between them.
+ */
+static size_t
+pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
+    return 1;
+}
+/**
+ * Returns the size of the next character in the EUC-JP encoding, or 0 if a
+ * character cannot be decoded from the given bytes.
+ */
+static size_t
+pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
+    // These are the single byte characters.
+    if (*b < 0x80) {
+        return 1;
+    }
+    // These are the double byte characters.
+    if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
+        return 2;
+    }
+    // These are the triple byte characters.
+    if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
+        return 3;
+    }
+    return 0;
+}
+/**
+ * Returns the size of the next character in the EUC-JP encoding if it is an
+ * uppercase character.
  */
 static bool
-pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) {
-    return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
+pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
+    size_t width = pm_encoding_euc_jp_char_width(b, n);
+    if (width == 1) {
+        return pm_encoding_ascii_isupper_char(b, n);
+    } else if (width == 2) {
+        return (
+            (b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) ||
+            (b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) ||
+            (b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1)
+        );
+    } else {
+        return false;
+    }
 }
 /**
- * For a lot of encodings the default is that they are a single byte long no
- * matter what the codepoint, so this function is shared between them.
+ * Returns the size of the next character in the Shift_JIS encoding, or 0 if a
+ * character cannot be decoded from the given bytes.
  */
 static size_t
-pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
-    return 1;
+pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
+    // These are the single byte characters.
+    if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) {
+        return 1;
+    }
+    // These are the double byte characters.
+    if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) {
+        return 2;
+    }
+    return 0;
+}
+/**
+ * Returns the size of the next character in the Shift_JIS encoding if it is an
+ * alphanumeric character.
+ */
+static size_t
+pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) {
+    size_t width = pm_encoding_shift_jis_char_width(b, n);
+    return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n)) : width;
+}
+/**
+ * Returns the size of the next character in the Shift_JIS encoding if it is an
+ * alphabetical character.
+ */
+static size_t
+pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) {
+    size_t width = pm_encoding_shift_jis_char_width(b, n);
+    return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n)) : width;
+}
+/**
+ * Returns the size of the next character in the Shift_JIS encoding if it is an
+ * uppercase character.
+ */
+static bool
+pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
+    size_t width = pm_encoding_shift_jis_char_width(b, n);
+    if (width == 1) {
+        return pm_encoding_ascii_isupper_char(b, n);
+    } else if (width == 2) {
+        return (
+            ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) ||
+            ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) ||
+            ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60))
+        );
+    } else {
+        return width;
+    }
+}
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+/**
+ * Certain encodings are equivalent to ASCII below 0x80, so it works for our
+ * purposes to have a function here that first checks the bounds and then falls
+ * back to checking the ASCII lookup table.
+ */
+static bool
+pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) {
+    return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
 }
 /**
@@ -4022,7 +4139,7 @@ pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) {
     }
     // These are the double byte characters
-    if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xfe) && (b[1] >= 0x41 && b[1] <= 0xfe)) {
+    if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xFE) && ((b[1] >= 0x41 && b[1] <= 0x5A) || (b[1] >= 0x61 && b[1] <= 0x7A) || (b[1] >= 0x81 && b[1] <= 0xFE))) {
         return 2;
     }
@@ -4072,30 +4189,6 @@ pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) {
     return 0;
 }
-/**
- * Returns the size of the next character in the EUC-JP encoding, or 0 if a
- * character cannot be decoded from the given bytes.
- */
-static size_t
-pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
-    // These are the single byte characters.
-    if (*b < 0x80) {
-        return 1;
-    }
-    // These are the double byte characters.
-    if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
-        return 2;
-    }
-    // These are the triple byte characters.
-    if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
-        return 3;
-    }
-    return 0;
-}
 /**
  * Returns the size of the next character in the EUC-KR encoding, or 0 if a
  * character cannot be decoded from the given bytes.
@@ -4194,24 +4287,7 @@ pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
     return 0;
 }
-/**
- * Returns the size of the next character in the Shift_JIS encoding, or 0 if a
- * character cannot be decoded from the given bytes.
- */
-static size_t
-pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
-    // These are the single byte characters.
-    if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) {
-        return 1;
-    }
-    // These are the double byte characters.
-    if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC)) {
-        return 2;
-    }
-    return 0;
-}
+#endif
 /**
  * This is the table of all of the encodings that prism supports.
@@ -4225,6 +4301,14 @@ const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_utf_8_isupper_char,
         .multibyte = true
     },
+    [PM_ENCODING_US_ASCII] = {
+        .name = "US-ASCII",
+        .char_width = pm_encoding_ascii_char_width,
+        .alnum_char = pm_encoding_ascii_alnum_char,
+        .alpha_char = pm_encoding_ascii_alpha_char,
+        .isupper_char = pm_encoding_ascii_isupper_char,
+        .multibyte = false
+    },
     [PM_ENCODING_ASCII_8BIT] = {
         .name = "ASCII-8BIT",
         .char_width = pm_encoding_single_char_width,
@@ -4233,6 +4317,24 @@ const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_ascii_isupper_char,
         .multibyte = false
     },
+    [PM_ENCODING_EUC_JP] = {
+        .name = "EUC-JP",
+        .char_width = pm_encoding_euc_jp_char_width,
+        .alnum_char = pm_encoding_ascii_alnum_char_7bit,
+        .alpha_char = pm_encoding_ascii_alpha_char_7bit,
+        .isupper_char = pm_encoding_euc_jp_isupper_char,
+        .multibyte = true
+    },
+    [PM_ENCODING_WINDOWS_31J] = {
+        .name = "Windows-31J",
+        .char_width = pm_encoding_shift_jis_char_width,
+        .alnum_char = pm_encoding_shift_jis_alnum_char,
+        .alpha_char = pm_encoding_shift_jis_alpha_char,
+        .isupper_char = pm_encoding_shift_jis_isupper_char,
+        .multibyte = true
+    },
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
     [PM_ENCODING_BIG5] = {
         .name = "Big5",
         .char_width = pm_encoding_big5_char_width,
@@ -4270,7 +4372,7 @@ const pm_encoding_t pm_encodings[] = {
         .char_width = pm_encoding_euc_jp_char_width,
         .alnum_char = pm_encoding_ascii_alnum_char_7bit,
         .alpha_char = pm_encoding_ascii_alpha_char_7bit,
-        .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+        .isupper_char = pm_encoding_euc_jp_isupper_char,
         .multibyte = true
     },
     [PM_ENCODING_CP850] = {
@@ -4329,20 +4431,12 @@ const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_ascii_isupper_char_7bit,
         .multibyte = true
     },
-    [PM_ENCODING_EUC_JP] = {
-        .name = "EUC-JP",
-        .char_width = pm_encoding_euc_jp_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char_7bit,
-        .alpha_char = pm_encoding_ascii_alpha_char_7bit,
-        .isupper_char = pm_encoding_ascii_isupper_char_7bit,
-        .multibyte = true
-    },
     [PM_ENCODING_EUC_JP_MS] = {
         .name = "eucJP-ms",
         .char_width = pm_encoding_euc_jp_char_width,
         .alnum_char = pm_encoding_ascii_alnum_char_7bit,
         .alpha_char = pm_encoding_ascii_alpha_char_7bit,
-        .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+        .isupper_char = pm_encoding_euc_jp_isupper_char,
         .multibyte = true
     },
     [PM_ENCODING_EUC_JIS_2004] = {
@@ -4350,7 +4444,7 @@ const pm_encoding_t pm_encodings[] = {
         .char_width = pm_encoding_euc_jp_char_width,
         .alnum_char = pm_encoding_ascii_alnum_char_7bit,
         .alpha_char = pm_encoding_ascii_alpha_char_7bit,
-        .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+        .isupper_char = pm_encoding_euc_jp_isupper_char,
         .multibyte = true
     },
     [PM_ENCODING_EUC_KR] = {
@@ -4708,9 +4802,9 @@ const pm_encoding_t pm_encodings[] = {
     [PM_ENCODING_MAC_JAPANESE] = {
         .name = "MacJapanese",
         .char_width = pm_encoding_shift_jis_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char_7bit,
-        .alpha_char = pm_encoding_ascii_alpha_char_7bit,
-        .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+        .alnum_char = pm_encoding_shift_jis_alnum_char,
+        .alpha_char = pm_encoding_shift_jis_alpha_char,
+        .isupper_char = pm_encoding_shift_jis_isupper_char,
         .multibyte = true
     },
     [PM_ENCODING_MAC_ROMAN] = {
@@ -4756,33 +4850,33 @@ const pm_encoding_t pm_encodings[] = {
     [PM_ENCODING_SHIFT_JIS] = {
         .name = "Shift_JIS",
         .char_width = pm_encoding_shift_jis_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char_7bit,
-        .alpha_char = pm_encoding_ascii_alpha_char_7bit,
-        .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+        .alnum_char = pm_encoding_shift_jis_alnum_char,
+        .alpha_char = pm_encoding_shift_jis_alpha_char,
+        .isupper_char = pm_encoding_shift_jis_isupper_char,
         .multibyte = true
     },
     [PM_ENCODING_SJIS_DOCOMO] = {
         .name = "SJIS-DoCoMo",
         .char_width = pm_encoding_shift_jis_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char_7bit,
-        .alpha_char = pm_encoding_ascii_alpha_char_7bit,
-        .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+        .alnum_char = pm_encoding_shift_jis_alnum_char,
+        .alpha_char = pm_encoding_shift_jis_alpha_char,
+        .isupper_char = pm_encoding_shift_jis_isupper_char,
         .multibyte = true
     },
     [PM_ENCODING_SJIS_KDDI] = {
         .name = "SJIS-KDDI",
         .char_width = pm_encoding_shift_jis_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char_7bit,
-        .alpha_char = pm_encoding_ascii_alpha_char_7bit,
-        .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+        .alnum_char = pm_encoding_shift_jis_alnum_char,
+        .alpha_char = pm_encoding_shift_jis_alpha_char,
+        .isupper_char = pm_encoding_shift_jis_isupper_char,
         .multibyte = true
     },
     [PM_ENCODING_SJIS_SOFTBANK] = {
         .name = "SJIS-SoftBank",
         .char_width = pm_encoding_shift_jis_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char_7bit,
-        .alpha_char = pm_encoding_ascii_alpha_char_7bit,
-        .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+        .alnum_char = pm_encoding_shift_jis_alnum_char,
+        .alpha_char = pm_encoding_shift_jis_alpha_char,
+        .isupper_char = pm_encoding_shift_jis_isupper_char,
         .multibyte = true
     },
     [PM_ENCODING_STATELESS_ISO_2022_JP] = {
@@ -4809,14 +4903,6 @@ const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_tis_620_isupper_char,
         .multibyte = false
     },
-    [PM_ENCODING_US_ASCII] = {
-        .name = "US-ASCII",
-        .char_width = pm_encoding_ascii_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char,
-        .alpha_char = pm_encoding_ascii_alpha_char,
-        .isupper_char = pm_encoding_ascii_isupper_char,
-        .multibyte = false
-    },
     [PM_ENCODING_UTF8_MAC] = {
         .name = "UTF8-MAC",
         .char_width = pm_encoding_utf_8_char_width,
@@ -4921,14 +5007,6 @@ const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_windows_1258_isupper_char,
         .multibyte = false
     },
-    [PM_ENCODING_WINDOWS_31J] = {
-        .name = "Windows-31J",
-        .char_width = pm_encoding_shift_jis_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char_7bit,
-        .alpha_char = pm_encoding_ascii_alpha_char_7bit,
-        .isupper_char = pm_encoding_ascii_isupper_char_7bit,
-        .multibyte = true
-    },
     [PM_ENCODING_WINDOWS_874] = {
         .name = "Windows-874",
         .char_width = pm_encoding_single_char_width,
@@ -4937,6 +5015,7 @@ const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_windows_874_isupper_char,
         .multibyte = false
     }
+#endif
 };
 /**
@@ -4951,11 +5030,13 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
     // UTF-8 can contain extra information at the end about the platform it is
     // encoded on, such as UTF-8-MAC or UTF-8-UNIX. We'll ignore those suffixes.
     if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "UTF-8", 5) == 0)) {
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
         // We need to explicitly handle UTF-8-HFS, as that one needs to switch
         // over to being UTF8-MAC.
         if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-HFS", 4) == 0)) {
             return &pm_encodings[PM_ENCODING_UTF8_MAC];
         }
+#endif
         // Otherwise we'll return the default UTF-8 encoding.
         return PM_ENCODING_UTF_8_ENTRY;
@@ -4975,11 +5056,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 break;
             case 'B': case 'b':
                 ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("Big5", PM_ENCODING_BIG5);
                 ENCODING2("Big5-HKSCS", "Big5-HKSCS:2008", PM_ENCODING_BIG5_HKSCS);
                 ENCODING1("Big5-UAO", PM_ENCODING_BIG5_UAO);
+#endif
                 break;
             case 'C': case 'c':
+                ENCODING1("CP65001", PM_ENCODING_UTF_8);
+                ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("CESU-8", PM_ENCODING_CESU_8);
                 ENCODING1("CP437", PM_ENCODING_IBM437);
                 ENCODING1("CP720", PM_ENCODING_IBM720);
@@ -4999,7 +5085,6 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING1("CP874", PM_ENCODING_WINDOWS_874);
                 ENCODING1("CP878", PM_ENCODING_KOI8_R);
                 ENCODING1("CP863", PM_ENCODING_IBM863);
-                ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
                 ENCODING1("CP936", PM_ENCODING_GBK);
                 ENCODING1("CP949", PM_ENCODING_CP949);
                 ENCODING1("CP950", PM_ENCODING_CP950);
@@ -5014,25 +5099,30 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING1("CP1257", PM_ENCODING_WINDOWS_1257);
                 ENCODING1("CP1258", PM_ENCODING_WINDOWS_1258);
                 ENCODING1("CP51932", PM_ENCODING_CP51932);
-                ENCODING1("CP65001", PM_ENCODING_UTF_8);
+#endif
                 break;
             case 'E': case 'e':
                 ENCODING2("EUC-JP", "eucJP", PM_ENCODING_EUC_JP);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING2("eucJP-ms", "euc-jp-ms", PM_ENCODING_EUC_JP_MS);
                 ENCODING2("EUC-JIS-2004", "EUC-JISX0213", PM_ENCODING_EUC_JIS_2004);
                 ENCODING2("EUC-KR", "eucKR", PM_ENCODING_EUC_KR);
                 ENCODING2("EUC-CN", "eucCN", PM_ENCODING_GB2312);
                 ENCODING2("EUC-TW", "eucTW", PM_ENCODING_EUC_TW);
                 ENCODING1("Emacs-Mule", PM_ENCODING_EMACS_MULE);
+#endif
                 break;
             case 'G': case 'g':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("GBK", PM_ENCODING_GBK);
                 ENCODING1("GB12345", PM_ENCODING_GB12345);
                 ENCODING1("GB18030", PM_ENCODING_GB18030);
                 ENCODING1("GB1988", PM_ENCODING_GB1988);
                 ENCODING1("GB2312", PM_ENCODING_GB2312);
+#endif
                 break;
             case 'I': case 'i':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("IBM437", PM_ENCODING_IBM437);
                 ENCODING1("IBM720", PM_ENCODING_IBM720);
                 ENCODING1("IBM737", PM_ENCODING_IBM737);
@@ -5064,12 +5154,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING2("ISO-8859-14", "ISO8859-14", PM_ENCODING_ISO_8859_14);
                 ENCODING2("ISO-8859-15", "ISO8859-15", PM_ENCODING_ISO_8859_15);
                 ENCODING2("ISO-8859-16", "ISO8859-16", PM_ENCODING_ISO_8859_16);
+#endif
                 break;
             case 'K': case 'k':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("KOI8-R", PM_ENCODING_KOI8_R);
                 ENCODING1("KOI8-U", PM_ENCODING_KOI8_U);
+#endif
                 break;
             case 'M': case 'm':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("macCentEuro", PM_ENCODING_MAC_CENT_EURO);
                 ENCODING1("macCroatian", PM_ENCODING_MAC_CROATIAN);
                 ENCODING1("macCyrillic", PM_ENCODING_MAC_CYRILLIC);
@@ -5082,31 +5176,39 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING1("macThai", PM_ENCODING_MAC_THAI);
                 ENCODING1("macTurkish", PM_ENCODING_MAC_TURKISH);
                 ENCODING1("macUkraine", PM_ENCODING_MAC_UKRAINE);
+#endif
                 break;
             case 'P': case 'p':
                 ENCODING1("PCK", PM_ENCODING_WINDOWS_31J);
                 break;
             case 'S': case 's':
-                ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
                 ENCODING1("SJIS", PM_ENCODING_WINDOWS_31J);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+                ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
                 ENCODING1("SJIS-DoCoMo", PM_ENCODING_SJIS_DOCOMO);
                 ENCODING1("SJIS-KDDI", PM_ENCODING_SJIS_KDDI);
                 ENCODING1("SJIS-SoftBank", PM_ENCODING_SJIS_SOFTBANK);
                 ENCODING1("stateless-ISO-2022-JP", PM_ENCODING_STATELESS_ISO_2022_JP);
                 ENCODING1("stateless-ISO-2022-JP-KDDI", PM_ENCODING_STATELESS_ISO_2022_JP_KDDI);
+#endif
                 break;
             case 'T': case 't':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("TIS-620", PM_ENCODING_TIS_620);
+#endif
                 break;
             case 'U': case 'u':
                 ENCODING1("US-ASCII", PM_ENCODING_US_ASCII);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC);
                 ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO);
                 ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI);
                 ENCODING1("UTF8-SoftBank", PM_ENCODING_UTF8_SOFTBANK);
+#endif
                 break;
             case 'W': case 'w':
                 ENCODING1("Windows-31J", PM_ENCODING_WINDOWS_31J);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("Windows-874", PM_ENCODING_WINDOWS_874);
                 ENCODING1("Windows-1250", PM_ENCODING_WINDOWS_1250);
                 ENCODING1("Windows-1251", PM_ENCODING_WINDOWS_1251);
@@ -5117,6 +5219,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING1("Windows-1256", PM_ENCODING_WINDOWS_1256);
                 ENCODING1("Windows-1257", PM_ENCODING_WINDOWS_1257);
                 ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258);
+#endif
                 break;
             case '6':
                 ENCODING1("646", PM_ENCODING_US_ASCII);