prism 0.24.0 → 0.29.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/BSDmakefile +58 -0
- data/CHANGELOG.md +132 -1
- data/Makefile +25 -18
- data/README.md +45 -6
- data/config.yml +828 -25
- data/docs/build_system.md +31 -0
- data/docs/configuration.md +4 -0
- data/docs/cruby_compilation.md +1 -1
- data/docs/parser_translation.md +14 -9
- data/docs/releasing.md +7 -9
- data/docs/ripper_translation.md +50 -0
- data/docs/ruby_api.md +1 -0
- data/docs/serialization.md +26 -5
- data/ext/prism/api_node.c +1037 -936
- data/ext/prism/api_pack.c +9 -0
- data/ext/prism/extconf.rb +62 -18
- data/ext/prism/extension.c +351 -71
- data/ext/prism/extension.h +5 -4
- data/include/prism/ast.h +539 -101
- data/include/prism/defines.h +106 -2
- data/include/prism/diagnostic.h +168 -74
- data/include/prism/encoding.h +22 -4
- data/include/prism/node.h +93 -0
- data/include/prism/options.h +84 -9
- data/include/prism/pack.h +11 -0
- data/include/prism/parser.h +213 -54
- data/include/prism/prettyprint.h +8 -0
- data/include/prism/static_literals.h +120 -0
- data/include/prism/util/pm_buffer.h +65 -2
- data/include/prism/util/pm_constant_pool.h +18 -1
- data/include/prism/util/pm_integer.h +119 -0
- data/include/prism/util/pm_list.h +1 -1
- data/include/prism/util/pm_newline_list.h +8 -0
- data/include/prism/util/pm_string.h +26 -2
- data/include/prism/version.h +2 -2
- data/include/prism.h +59 -1
- data/lib/prism/compiler.rb +8 -1
- data/lib/prism/debug.rb +46 -3
- data/lib/prism/desugar_compiler.rb +5 -3
- data/lib/prism/dispatcher.rb +29 -0
- data/lib/prism/dot_visitor.rb +141 -54
- data/lib/prism/dsl.rb +48 -36
- data/lib/prism/ffi.rb +82 -17
- data/lib/prism/inspect_visitor.rb +2156 -0
- data/lib/prism/lex_compat.rb +34 -15
- data/lib/prism/mutation_compiler.rb +13 -2
- data/lib/prism/node.rb +4453 -4459
- data/lib/prism/node_ext.rb +249 -30
- data/lib/prism/pack.rb +4 -0
- data/lib/prism/parse_result/comments.rb +35 -18
- data/lib/prism/parse_result/newlines.rb +2 -2
- data/lib/prism/parse_result.rb +218 -43
- data/lib/prism/pattern.rb +28 -10
- data/lib/prism/polyfill/byteindex.rb +13 -0
- data/lib/prism/polyfill/unpack1.rb +14 -0
- data/lib/prism/reflection.rb +411 -0
- data/lib/prism/serialize.rb +480 -112
- data/lib/prism/translation/parser/compiler.rb +376 -88
- data/lib/prism/translation/parser/lexer.rb +103 -22
- data/lib/prism/translation/parser/rubocop.rb +41 -13
- data/lib/prism/translation/parser.rb +123 -11
- data/lib/prism/translation/parser33.rb +1 -1
- data/lib/prism/translation/parser34.rb +1 -1
- data/lib/prism/translation/ripper/sexp.rb +125 -0
- data/lib/prism/translation/ripper/shim.rb +5 -0
- data/lib/prism/translation/ripper.rb +3216 -462
- data/lib/prism/translation/ruby_parser.rb +111 -56
- data/lib/prism/translation.rb +3 -1
- data/lib/prism/visitor.rb +10 -0
- data/lib/prism.rb +12 -20
- data/prism.gemspec +46 -14
- data/rbi/prism/compiler.rbi +12 -0
- data/rbi/prism/inspect_visitor.rbi +12 -0
- data/rbi/prism/node.rbi +8712 -0
- data/rbi/prism/node_ext.rbi +107 -0
- data/rbi/prism/parse_result.rbi +358 -0
- data/rbi/prism/reflection.rbi +58 -0
- data/rbi/prism/translation/parser.rbi +11 -0
- data/rbi/prism/translation/parser33.rbi +6 -0
- data/rbi/prism/translation/parser34.rbi +6 -0
- data/rbi/prism/translation/ripper.rbi +15 -0
- data/rbi/prism/visitor.rbi +470 -0
- data/rbi/prism.rbi +38 -7748
- data/sig/prism/compiler.rbs +9 -0
- data/sig/prism/dispatcher.rbs +16 -0
- data/sig/prism/dot_visitor.rbs +6 -0
- data/sig/prism/dsl.rbs +462 -0
- data/sig/prism/inspect_visitor.rbs +22 -0
- data/sig/prism/lex_compat.rbs +10 -0
- data/sig/prism/mutation_compiler.rbs +158 -0
- data/sig/prism/node.rbs +3558 -0
- data/sig/prism/node_ext.rbs +82 -0
- data/sig/prism/pack.rbs +43 -0
- data/sig/prism/parse_result.rbs +160 -0
- data/sig/prism/pattern.rbs +13 -0
- data/sig/prism/reflection.rbs +50 -0
- data/sig/prism/serialize.rbs +6 -0
- data/sig/prism/visitor.rbs +168 -0
- data/sig/prism.rbs +188 -4767
- data/src/diagnostic.c +636 -230
- data/src/encoding.c +211 -108
- data/src/node.c +7555 -451
- data/src/options.c +66 -31
- data/src/pack.c +33 -17
- data/src/prettyprint.c +1383 -1431
- data/src/prism.c +4734 -1310
- data/src/regexp.c +17 -2
- data/src/serialize.c +68 -46
- data/src/static_literals.c +638 -0
- data/src/token_type.c +10 -9
- data/src/util/pm_buffer.c +147 -20
- data/src/util/pm_char.c +4 -4
- data/src/util/pm_constant_pool.c +35 -11
- data/src/util/pm_integer.c +642 -0
- data/src/util/pm_list.c +1 -1
- data/src/util/pm_newline_list.c +14 -5
- data/src/util/pm_string.c +134 -5
- data/src/util/pm_string_list.c +2 -2
- metadata +41 -9
- data/docs/ripper.md +0 -36
- data/include/prism/util/pm_state_stack.h +0 -42
- data/lib/prism/node_inspector.rb +0 -68
- data/rbi/prism_static.rbi +0 -207
- data/sig/prism_static.rbs +0 -201
- data/src/util/pm_state_stack.c +0 -25
data/src/encoding.c
CHANGED
@@ -1499,7 +1499,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
|
|
1499
1499
|
0x31350, 0x323AF,
|
1500
1500
|
};
|
1501
1501
|
|
1502
|
-
#define UNICODE_ISUPPER_CODEPOINTS_LENGTH
|
1502
|
+
#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1302
|
1503
1503
|
static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_CODEPOINTS_LENGTH] = {
|
1504
1504
|
0x100, 0x100,
|
1505
1505
|
0x102, 0x102,
|
@@ -1582,9 +1582,9 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
|
|
1582
1582
|
0x1B5, 0x1B5,
|
1583
1583
|
0x1B7, 0x1B8,
|
1584
1584
|
0x1BC, 0x1BC,
|
1585
|
-
0x1C4,
|
1586
|
-
0x1C7,
|
1587
|
-
0x1CA,
|
1585
|
+
0x1C4, 0x1C5,
|
1586
|
+
0x1C7, 0x1C8,
|
1587
|
+
0x1CA, 0x1CB,
|
1588
1588
|
0x1CD, 0x1CD,
|
1589
1589
|
0x1CF, 0x1CF,
|
1590
1590
|
0x1D1, 0x1D1,
|
@@ -1602,7 +1602,7 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
|
|
1602
1602
|
0x1EA, 0x1EA,
|
1603
1603
|
0x1EC, 0x1EC,
|
1604
1604
|
0x1EE, 0x1EE,
|
1605
|
-
0x1F1,
|
1605
|
+
0x1F1, 0x1F2,
|
1606
1606
|
0x1F4, 0x1F4,
|
1607
1607
|
0x1F6, 0x1F8,
|
1608
1608
|
0x1FA, 0x1FA,
|
@@ -1910,11 +1910,14 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
|
|
1910
1910
|
0x1F5D, 0x1F5D,
|
1911
1911
|
0x1F5F, 0x1F5F,
|
1912
1912
|
0x1F68, 0x1F6F,
|
1913
|
-
|
1914
|
-
|
1913
|
+
0x1F88, 0x1F8F,
|
1914
|
+
0x1F98, 0x1F9F,
|
1915
|
+
0x1FA8, 0x1FAF,
|
1916
|
+
0x1FB8, 0x1FBC,
|
1917
|
+
0x1FC8, 0x1FCC,
|
1915
1918
|
0x1FD8, 0x1FDB,
|
1916
1919
|
0x1FE8, 0x1FEC,
|
1917
|
-
0x1FF8,
|
1920
|
+
0x1FF8, 0x1FFC,
|
1918
1921
|
0x2102, 0x2102,
|
1919
1922
|
0x2107, 0x2107,
|
1920
1923
|
0x210B, 0x210D,
|
@@ -2355,6 +2358,8 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
|
2355
2358
|
}
|
2356
2359
|
}
|
2357
2360
|
|
2361
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
2362
|
+
|
2358
2363
|
static pm_unicode_codepoint_t
|
2359
2364
|
pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
|
2360
2365
|
if (b[0] < 0x80) {
|
@@ -2449,13 +2454,15 @@ pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
|
2449
2454
|
}
|
2450
2455
|
}
|
2451
2456
|
|
2457
|
+
#endif
|
2458
|
+
|
2452
2459
|
#undef UNICODE_ALPHA_CODEPOINTS_LENGTH
|
2453
2460
|
#undef UNICODE_ALNUM_CODEPOINTS_LENGTH
|
2454
2461
|
#undef UNICODE_ISUPPER_CODEPOINTS_LENGTH
|
2455
2462
|
|
2456
2463
|
/**
|
2457
2464
|
* Each element of the following table contains a bitfield that indicates a
|
2458
|
-
* piece of information about the corresponding ASCII character.
|
2465
|
+
* piece of information about the corresponding US-ASCII character.
|
2459
2466
|
*/
|
2460
2467
|
static const uint8_t pm_encoding_ascii_table[256] = {
|
2461
2468
|
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
@@ -2477,6 +2484,8 @@ static const uint8_t pm_encoding_ascii_table[256] = {
|
|
2477
2484
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
|
2478
2485
|
};
|
2479
2486
|
|
2487
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
2488
|
+
|
2480
2489
|
/**
|
2481
2490
|
* Each element of the following table contains a bitfield that indicates a
|
2482
2491
|
* piece of information about the corresponding CP850 character.
|
@@ -3624,7 +3633,7 @@ static const uint8_t pm_encoding_windows_1250_table[256] = {
|
|
3624
3633
|
0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7, // Ax
|
3625
3634
|
0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 7, 0, 3, 3, // Bx
|
3626
3635
|
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
|
3627
|
-
7, 7, 7, 7, 7, 7, 7,
|
3636
|
+
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
|
3628
3637
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
|
3629
3638
|
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
|
3630
3639
|
};
|
@@ -3672,7 +3681,7 @@ static const uint8_t pm_encoding_windows_1252_table[256] = {
|
|
3672
3681
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Ax
|
3673
3682
|
0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Bx
|
3674
3683
|
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
|
3675
|
-
7, 7, 7, 7, 7, 7, 7,
|
3684
|
+
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
|
3676
3685
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
|
3677
3686
|
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
|
3678
3687
|
};
|
@@ -3915,6 +3924,7 @@ PRISM_ENCODING_TABLE(windows_1258)
|
|
3915
3924
|
PRISM_ENCODING_TABLE(windows_874)
|
3916
3925
|
|
3917
3926
|
#undef PRISM_ENCODING_TABLE
|
3927
|
+
#endif
|
3918
3928
|
|
3919
3929
|
/**
|
3920
3930
|
* Returns the size of the next character in the ASCII encoding. This basically
|
@@ -3973,22 +3983,129 @@ pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_
|
|
3973
3983
|
}
|
3974
3984
|
|
3975
3985
|
/**
|
3976
|
-
*
|
3977
|
-
*
|
3978
|
-
|
3986
|
+
* For a lot of encodings the default is that they are a single byte long no
|
3987
|
+
* matter what the codepoint, so this function is shared between them.
|
3988
|
+
*/
|
3989
|
+
static size_t
|
3990
|
+
pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
3991
|
+
return 1;
|
3992
|
+
}
|
3993
|
+
|
3994
|
+
/**
|
3995
|
+
* Returns the size of the next character in the EUC-JP encoding, or 0 if a
|
3996
|
+
* character cannot be decoded from the given bytes.
|
3997
|
+
*/
|
3998
|
+
static size_t
|
3999
|
+
pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
|
4000
|
+
// These are the single byte characters.
|
4001
|
+
if (*b < 0x80) {
|
4002
|
+
return 1;
|
4003
|
+
}
|
4004
|
+
|
4005
|
+
// These are the double byte characters.
|
4006
|
+
if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
|
4007
|
+
return 2;
|
4008
|
+
}
|
4009
|
+
|
4010
|
+
// These are the triple byte characters.
|
4011
|
+
if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
|
4012
|
+
return 3;
|
4013
|
+
}
|
4014
|
+
|
4015
|
+
return 0;
|
4016
|
+
}
|
4017
|
+
|
4018
|
+
/**
|
4019
|
+
* Returns the size of the next character in the EUC-JP encoding if it is an
|
4020
|
+
* uppercase character.
|
3979
4021
|
*/
|
3980
4022
|
static bool
|
3981
|
-
|
3982
|
-
|
4023
|
+
pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
4024
|
+
size_t width = pm_encoding_euc_jp_char_width(b, n);
|
4025
|
+
|
4026
|
+
if (width == 1) {
|
4027
|
+
return pm_encoding_ascii_isupper_char(b, n);
|
4028
|
+
} else if (width == 2) {
|
4029
|
+
return (
|
4030
|
+
(b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) ||
|
4031
|
+
(b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) ||
|
4032
|
+
(b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1)
|
4033
|
+
);
|
4034
|
+
} else {
|
4035
|
+
return false;
|
4036
|
+
}
|
3983
4037
|
}
|
3984
4038
|
|
3985
4039
|
/**
|
3986
|
-
*
|
3987
|
-
*
|
4040
|
+
* Returns the size of the next character in the Shift_JIS encoding, or 0 if a
|
4041
|
+
* character cannot be decoded from the given bytes.
|
3988
4042
|
*/
|
3989
4043
|
static size_t
|
3990
|
-
|
3991
|
-
|
4044
|
+
pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
|
4045
|
+
// These are the single byte characters.
|
4046
|
+
if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) {
|
4047
|
+
return 1;
|
4048
|
+
}
|
4049
|
+
|
4050
|
+
// These are the double byte characters.
|
4051
|
+
if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) {
|
4052
|
+
return 2;
|
4053
|
+
}
|
4054
|
+
|
4055
|
+
return 0;
|
4056
|
+
}
|
4057
|
+
|
4058
|
+
/**
|
4059
|
+
* Returns the size of the next character in the Shift_JIS encoding if it is an
|
4060
|
+
* alphanumeric character.
|
4061
|
+
*/
|
4062
|
+
static size_t
|
4063
|
+
pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
4064
|
+
size_t width = pm_encoding_shift_jis_char_width(b, n);
|
4065
|
+
return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n)) : width;
|
4066
|
+
}
|
4067
|
+
|
4068
|
+
/**
|
4069
|
+
* Returns the size of the next character in the Shift_JIS encoding if it is an
|
4070
|
+
* alphabetical character.
|
4071
|
+
*/
|
4072
|
+
static size_t
|
4073
|
+
pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
4074
|
+
size_t width = pm_encoding_shift_jis_char_width(b, n);
|
4075
|
+
return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n)) : width;
|
4076
|
+
}
|
4077
|
+
|
4078
|
+
/**
|
4079
|
+
* Returns the size of the next character in the Shift_JIS encoding if it is an
|
4080
|
+
* uppercase character.
|
4081
|
+
*/
|
4082
|
+
static bool
|
4083
|
+
pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
4084
|
+
size_t width = pm_encoding_shift_jis_char_width(b, n);
|
4085
|
+
|
4086
|
+
if (width == 1) {
|
4087
|
+
return pm_encoding_ascii_isupper_char(b, n);
|
4088
|
+
} else if (width == 2) {
|
4089
|
+
return (
|
4090
|
+
((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) ||
|
4091
|
+
((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) ||
|
4092
|
+
((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60))
|
4093
|
+
);
|
4094
|
+
} else {
|
4095
|
+
return width;
|
4096
|
+
}
|
4097
|
+
}
|
4098
|
+
|
4099
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
4100
|
+
|
4101
|
+
/**
|
4102
|
+
* Certain encodings are equivalent to ASCII below 0x80, so it works for our
|
4103
|
+
* purposes to have a function here that first checks the bounds and then falls
|
4104
|
+
* back to checking the ASCII lookup table.
|
4105
|
+
*/
|
4106
|
+
static bool
|
4107
|
+
pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) {
|
4108
|
+
return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
|
3992
4109
|
}
|
3993
4110
|
|
3994
4111
|
/**
|
@@ -4022,7 +4139,7 @@ pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) {
|
|
4022
4139
|
}
|
4023
4140
|
|
4024
4141
|
// These are the double byte characters
|
4025
|
-
if ((n > 1) && (b[0] >= 0x81 && b[0] <=
|
4142
|
+
if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xFE) && ((b[1] >= 0x41 && b[1] <= 0x5A) || (b[1] >= 0x61 && b[1] <= 0x7A) || (b[1] >= 0x81 && b[1] <= 0xFE))) {
|
4026
4143
|
return 2;
|
4027
4144
|
}
|
4028
4145
|
|
@@ -4072,30 +4189,6 @@ pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) {
|
|
4072
4189
|
return 0;
|
4073
4190
|
}
|
4074
4191
|
|
4075
|
-
/**
|
4076
|
-
* Returns the size of the next character in the EUC-JP encoding, or 0 if a
|
4077
|
-
* character cannot be decoded from the given bytes.
|
4078
|
-
*/
|
4079
|
-
static size_t
|
4080
|
-
pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
|
4081
|
-
// These are the single byte characters.
|
4082
|
-
if (*b < 0x80) {
|
4083
|
-
return 1;
|
4084
|
-
}
|
4085
|
-
|
4086
|
-
// These are the double byte characters.
|
4087
|
-
if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
|
4088
|
-
return 2;
|
4089
|
-
}
|
4090
|
-
|
4091
|
-
// These are the triple byte characters.
|
4092
|
-
if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
|
4093
|
-
return 3;
|
4094
|
-
}
|
4095
|
-
|
4096
|
-
return 0;
|
4097
|
-
}
|
4098
|
-
|
4099
4192
|
/**
|
4100
4193
|
* Returns the size of the next character in the EUC-KR encoding, or 0 if a
|
4101
4194
|
* character cannot be decoded from the given bytes.
|
@@ -4194,24 +4287,7 @@ pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
|
|
4194
4287
|
return 0;
|
4195
4288
|
}
|
4196
4289
|
|
4197
|
-
|
4198
|
-
* Returns the size of the next character in the Shift_JIS encoding, or 0 if a
|
4199
|
-
* character cannot be decoded from the given bytes.
|
4200
|
-
*/
|
4201
|
-
static size_t
|
4202
|
-
pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
|
4203
|
-
// These are the single byte characters.
|
4204
|
-
if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) {
|
4205
|
-
return 1;
|
4206
|
-
}
|
4207
|
-
|
4208
|
-
// These are the double byte characters.
|
4209
|
-
if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC)) {
|
4210
|
-
return 2;
|
4211
|
-
}
|
4212
|
-
|
4213
|
-
return 0;
|
4214
|
-
}
|
4290
|
+
#endif
|
4215
4291
|
|
4216
4292
|
/**
|
4217
4293
|
* This is the table of all of the encodings that prism supports.
|
@@ -4225,6 +4301,14 @@ const pm_encoding_t pm_encodings[] = {
|
|
4225
4301
|
.isupper_char = pm_encoding_utf_8_isupper_char,
|
4226
4302
|
.multibyte = true
|
4227
4303
|
},
|
4304
|
+
[PM_ENCODING_US_ASCII] = {
|
4305
|
+
.name = "US-ASCII",
|
4306
|
+
.char_width = pm_encoding_ascii_char_width,
|
4307
|
+
.alnum_char = pm_encoding_ascii_alnum_char,
|
4308
|
+
.alpha_char = pm_encoding_ascii_alpha_char,
|
4309
|
+
.isupper_char = pm_encoding_ascii_isupper_char,
|
4310
|
+
.multibyte = false
|
4311
|
+
},
|
4228
4312
|
[PM_ENCODING_ASCII_8BIT] = {
|
4229
4313
|
.name = "ASCII-8BIT",
|
4230
4314
|
.char_width = pm_encoding_single_char_width,
|
@@ -4233,6 +4317,24 @@ const pm_encoding_t pm_encodings[] = {
|
|
4233
4317
|
.isupper_char = pm_encoding_ascii_isupper_char,
|
4234
4318
|
.multibyte = false
|
4235
4319
|
},
|
4320
|
+
[PM_ENCODING_EUC_JP] = {
|
4321
|
+
.name = "EUC-JP",
|
4322
|
+
.char_width = pm_encoding_euc_jp_char_width,
|
4323
|
+
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
|
4324
|
+
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
|
4325
|
+
.isupper_char = pm_encoding_euc_jp_isupper_char,
|
4326
|
+
.multibyte = true
|
4327
|
+
},
|
4328
|
+
[PM_ENCODING_WINDOWS_31J] = {
|
4329
|
+
.name = "Windows-31J",
|
4330
|
+
.char_width = pm_encoding_shift_jis_char_width,
|
4331
|
+
.alnum_char = pm_encoding_shift_jis_alnum_char,
|
4332
|
+
.alpha_char = pm_encoding_shift_jis_alpha_char,
|
4333
|
+
.isupper_char = pm_encoding_shift_jis_isupper_char,
|
4334
|
+
.multibyte = true
|
4335
|
+
},
|
4336
|
+
|
4337
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
4236
4338
|
[PM_ENCODING_BIG5] = {
|
4237
4339
|
.name = "Big5",
|
4238
4340
|
.char_width = pm_encoding_big5_char_width,
|
@@ -4270,7 +4372,7 @@ const pm_encoding_t pm_encodings[] = {
|
|
4270
4372
|
.char_width = pm_encoding_euc_jp_char_width,
|
4271
4373
|
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
|
4272
4374
|
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
|
4273
|
-
.isupper_char =
|
4375
|
+
.isupper_char = pm_encoding_euc_jp_isupper_char,
|
4274
4376
|
.multibyte = true
|
4275
4377
|
},
|
4276
4378
|
[PM_ENCODING_CP850] = {
|
@@ -4329,20 +4431,12 @@ const pm_encoding_t pm_encodings[] = {
|
|
4329
4431
|
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
|
4330
4432
|
.multibyte = true
|
4331
4433
|
},
|
4332
|
-
[PM_ENCODING_EUC_JP] = {
|
4333
|
-
.name = "EUC-JP",
|
4334
|
-
.char_width = pm_encoding_euc_jp_char_width,
|
4335
|
-
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
|
4336
|
-
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
|
4337
|
-
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
|
4338
|
-
.multibyte = true
|
4339
|
-
},
|
4340
4434
|
[PM_ENCODING_EUC_JP_MS] = {
|
4341
4435
|
.name = "eucJP-ms",
|
4342
4436
|
.char_width = pm_encoding_euc_jp_char_width,
|
4343
4437
|
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
|
4344
4438
|
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
|
4345
|
-
.isupper_char =
|
4439
|
+
.isupper_char = pm_encoding_euc_jp_isupper_char,
|
4346
4440
|
.multibyte = true
|
4347
4441
|
},
|
4348
4442
|
[PM_ENCODING_EUC_JIS_2004] = {
|
@@ -4350,7 +4444,7 @@ const pm_encoding_t pm_encodings[] = {
|
|
4350
4444
|
.char_width = pm_encoding_euc_jp_char_width,
|
4351
4445
|
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
|
4352
4446
|
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
|
4353
|
-
.isupper_char =
|
4447
|
+
.isupper_char = pm_encoding_euc_jp_isupper_char,
|
4354
4448
|
.multibyte = true
|
4355
4449
|
},
|
4356
4450
|
[PM_ENCODING_EUC_KR] = {
|
@@ -4708,9 +4802,9 @@ const pm_encoding_t pm_encodings[] = {
|
|
4708
4802
|
[PM_ENCODING_MAC_JAPANESE] = {
|
4709
4803
|
.name = "MacJapanese",
|
4710
4804
|
.char_width = pm_encoding_shift_jis_char_width,
|
4711
|
-
.alnum_char =
|
4712
|
-
.alpha_char =
|
4713
|
-
.isupper_char =
|
4805
|
+
.alnum_char = pm_encoding_shift_jis_alnum_char,
|
4806
|
+
.alpha_char = pm_encoding_shift_jis_alpha_char,
|
4807
|
+
.isupper_char = pm_encoding_shift_jis_isupper_char,
|
4714
4808
|
.multibyte = true
|
4715
4809
|
},
|
4716
4810
|
[PM_ENCODING_MAC_ROMAN] = {
|
@@ -4756,33 +4850,33 @@ const pm_encoding_t pm_encodings[] = {
|
|
4756
4850
|
[PM_ENCODING_SHIFT_JIS] = {
|
4757
4851
|
.name = "Shift_JIS",
|
4758
4852
|
.char_width = pm_encoding_shift_jis_char_width,
|
4759
|
-
.alnum_char =
|
4760
|
-
.alpha_char =
|
4761
|
-
.isupper_char =
|
4853
|
+
.alnum_char = pm_encoding_shift_jis_alnum_char,
|
4854
|
+
.alpha_char = pm_encoding_shift_jis_alpha_char,
|
4855
|
+
.isupper_char = pm_encoding_shift_jis_isupper_char,
|
4762
4856
|
.multibyte = true
|
4763
4857
|
},
|
4764
4858
|
[PM_ENCODING_SJIS_DOCOMO] = {
|
4765
4859
|
.name = "SJIS-DoCoMo",
|
4766
4860
|
.char_width = pm_encoding_shift_jis_char_width,
|
4767
|
-
.alnum_char =
|
4768
|
-
.alpha_char =
|
4769
|
-
.isupper_char =
|
4861
|
+
.alnum_char = pm_encoding_shift_jis_alnum_char,
|
4862
|
+
.alpha_char = pm_encoding_shift_jis_alpha_char,
|
4863
|
+
.isupper_char = pm_encoding_shift_jis_isupper_char,
|
4770
4864
|
.multibyte = true
|
4771
4865
|
},
|
4772
4866
|
[PM_ENCODING_SJIS_KDDI] = {
|
4773
4867
|
.name = "SJIS-KDDI",
|
4774
4868
|
.char_width = pm_encoding_shift_jis_char_width,
|
4775
|
-
.alnum_char =
|
4776
|
-
.alpha_char =
|
4777
|
-
.isupper_char =
|
4869
|
+
.alnum_char = pm_encoding_shift_jis_alnum_char,
|
4870
|
+
.alpha_char = pm_encoding_shift_jis_alpha_char,
|
4871
|
+
.isupper_char = pm_encoding_shift_jis_isupper_char,
|
4778
4872
|
.multibyte = true
|
4779
4873
|
},
|
4780
4874
|
[PM_ENCODING_SJIS_SOFTBANK] = {
|
4781
4875
|
.name = "SJIS-SoftBank",
|
4782
4876
|
.char_width = pm_encoding_shift_jis_char_width,
|
4783
|
-
.alnum_char =
|
4784
|
-
.alpha_char =
|
4785
|
-
.isupper_char =
|
4877
|
+
.alnum_char = pm_encoding_shift_jis_alnum_char,
|
4878
|
+
.alpha_char = pm_encoding_shift_jis_alpha_char,
|
4879
|
+
.isupper_char = pm_encoding_shift_jis_isupper_char,
|
4786
4880
|
.multibyte = true
|
4787
4881
|
},
|
4788
4882
|
[PM_ENCODING_STATELESS_ISO_2022_JP] = {
|
@@ -4809,14 +4903,6 @@ const pm_encoding_t pm_encodings[] = {
|
|
4809
4903
|
.isupper_char = pm_encoding_tis_620_isupper_char,
|
4810
4904
|
.multibyte = false
|
4811
4905
|
},
|
4812
|
-
[PM_ENCODING_US_ASCII] = {
|
4813
|
-
.name = "US-ASCII",
|
4814
|
-
.char_width = pm_encoding_ascii_char_width,
|
4815
|
-
.alnum_char = pm_encoding_ascii_alnum_char,
|
4816
|
-
.alpha_char = pm_encoding_ascii_alpha_char,
|
4817
|
-
.isupper_char = pm_encoding_ascii_isupper_char,
|
4818
|
-
.multibyte = false
|
4819
|
-
},
|
4820
4906
|
[PM_ENCODING_UTF8_MAC] = {
|
4821
4907
|
.name = "UTF8-MAC",
|
4822
4908
|
.char_width = pm_encoding_utf_8_char_width,
|
@@ -4921,14 +5007,6 @@ const pm_encoding_t pm_encodings[] = {
|
|
4921
5007
|
.isupper_char = pm_encoding_windows_1258_isupper_char,
|
4922
5008
|
.multibyte = false
|
4923
5009
|
},
|
4924
|
-
[PM_ENCODING_WINDOWS_31J] = {
|
4925
|
-
.name = "Windows-31J",
|
4926
|
-
.char_width = pm_encoding_shift_jis_char_width,
|
4927
|
-
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
|
4928
|
-
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
|
4929
|
-
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
|
4930
|
-
.multibyte = true
|
4931
|
-
},
|
4932
5010
|
[PM_ENCODING_WINDOWS_874] = {
|
4933
5011
|
.name = "Windows-874",
|
4934
5012
|
.char_width = pm_encoding_single_char_width,
|
@@ -4937,6 +5015,7 @@ const pm_encoding_t pm_encodings[] = {
|
|
4937
5015
|
.isupper_char = pm_encoding_windows_874_isupper_char,
|
4938
5016
|
.multibyte = false
|
4939
5017
|
}
|
5018
|
+
#endif
|
4940
5019
|
};
|
4941
5020
|
|
4942
5021
|
/**
|
@@ -4951,11 +5030,13 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
|
|
4951
5030
|
// UTF-8 can contain extra information at the end about the platform it is
|
4952
5031
|
// encoded on, such as UTF-8-MAC or UTF-8-UNIX. We'll ignore those suffixes.
|
4953
5032
|
if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "UTF-8", 5) == 0)) {
|
5033
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
4954
5034
|
// We need to explicitly handle UTF-8-HFS, as that one needs to switch
|
4955
5035
|
// over to being UTF8-MAC.
|
4956
5036
|
if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-HFS", 4) == 0)) {
|
4957
5037
|
return &pm_encodings[PM_ENCODING_UTF8_MAC];
|
4958
5038
|
}
|
5039
|
+
#endif
|
4959
5040
|
|
4960
5041
|
// Otherwise we'll return the default UTF-8 encoding.
|
4961
5042
|
return PM_ENCODING_UTF_8_ENTRY;
|
@@ -4975,11 +5056,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
|
|
4975
5056
|
break;
|
4976
5057
|
case 'B': case 'b':
|
4977
5058
|
ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT);
|
5059
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
4978
5060
|
ENCODING1("Big5", PM_ENCODING_BIG5);
|
4979
5061
|
ENCODING2("Big5-HKSCS", "Big5-HKSCS:2008", PM_ENCODING_BIG5_HKSCS);
|
4980
5062
|
ENCODING1("Big5-UAO", PM_ENCODING_BIG5_UAO);
|
5063
|
+
#endif
|
4981
5064
|
break;
|
4982
5065
|
case 'C': case 'c':
|
5066
|
+
ENCODING1("CP65001", PM_ENCODING_UTF_8);
|
5067
|
+
ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
|
5068
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
4983
5069
|
ENCODING1("CESU-8", PM_ENCODING_CESU_8);
|
4984
5070
|
ENCODING1("CP437", PM_ENCODING_IBM437);
|
4985
5071
|
ENCODING1("CP720", PM_ENCODING_IBM720);
|
@@ -4999,7 +5085,6 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
|
|
4999
5085
|
ENCODING1("CP874", PM_ENCODING_WINDOWS_874);
|
5000
5086
|
ENCODING1("CP878", PM_ENCODING_KOI8_R);
|
5001
5087
|
ENCODING1("CP863", PM_ENCODING_IBM863);
|
5002
|
-
ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
|
5003
5088
|
ENCODING1("CP936", PM_ENCODING_GBK);
|
5004
5089
|
ENCODING1("CP949", PM_ENCODING_CP949);
|
5005
5090
|
ENCODING1("CP950", PM_ENCODING_CP950);
|
@@ -5014,25 +5099,30 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
|
|
5014
5099
|
ENCODING1("CP1257", PM_ENCODING_WINDOWS_1257);
|
5015
5100
|
ENCODING1("CP1258", PM_ENCODING_WINDOWS_1258);
|
5016
5101
|
ENCODING1("CP51932", PM_ENCODING_CP51932);
|
5017
|
-
|
5102
|
+
#endif
|
5018
5103
|
break;
|
5019
5104
|
case 'E': case 'e':
|
5020
5105
|
ENCODING2("EUC-JP", "eucJP", PM_ENCODING_EUC_JP);
|
5106
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
5021
5107
|
ENCODING2("eucJP-ms", "euc-jp-ms", PM_ENCODING_EUC_JP_MS);
|
5022
5108
|
ENCODING2("EUC-JIS-2004", "EUC-JISX0213", PM_ENCODING_EUC_JIS_2004);
|
5023
5109
|
ENCODING2("EUC-KR", "eucKR", PM_ENCODING_EUC_KR);
|
5024
5110
|
ENCODING2("EUC-CN", "eucCN", PM_ENCODING_GB2312);
|
5025
5111
|
ENCODING2("EUC-TW", "eucTW", PM_ENCODING_EUC_TW);
|
5026
5112
|
ENCODING1("Emacs-Mule", PM_ENCODING_EMACS_MULE);
|
5113
|
+
#endif
|
5027
5114
|
break;
|
5028
5115
|
case 'G': case 'g':
|
5116
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
5029
5117
|
ENCODING1("GBK", PM_ENCODING_GBK);
|
5030
5118
|
ENCODING1("GB12345", PM_ENCODING_GB12345);
|
5031
5119
|
ENCODING1("GB18030", PM_ENCODING_GB18030);
|
5032
5120
|
ENCODING1("GB1988", PM_ENCODING_GB1988);
|
5033
5121
|
ENCODING1("GB2312", PM_ENCODING_GB2312);
|
5122
|
+
#endif
|
5034
5123
|
break;
|
5035
5124
|
case 'I': case 'i':
|
5125
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
5036
5126
|
ENCODING1("IBM437", PM_ENCODING_IBM437);
|
5037
5127
|
ENCODING1("IBM720", PM_ENCODING_IBM720);
|
5038
5128
|
ENCODING1("IBM737", PM_ENCODING_IBM737);
|
@@ -5064,12 +5154,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
|
|
5064
5154
|
ENCODING2("ISO-8859-14", "ISO8859-14", PM_ENCODING_ISO_8859_14);
|
5065
5155
|
ENCODING2("ISO-8859-15", "ISO8859-15", PM_ENCODING_ISO_8859_15);
|
5066
5156
|
ENCODING2("ISO-8859-16", "ISO8859-16", PM_ENCODING_ISO_8859_16);
|
5157
|
+
#endif
|
5067
5158
|
break;
|
5068
5159
|
case 'K': case 'k':
|
5160
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
5069
5161
|
ENCODING1("KOI8-R", PM_ENCODING_KOI8_R);
|
5070
5162
|
ENCODING1("KOI8-U", PM_ENCODING_KOI8_U);
|
5163
|
+
#endif
|
5071
5164
|
break;
|
5072
5165
|
case 'M': case 'm':
|
5166
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
5073
5167
|
ENCODING1("macCentEuro", PM_ENCODING_MAC_CENT_EURO);
|
5074
5168
|
ENCODING1("macCroatian", PM_ENCODING_MAC_CROATIAN);
|
5075
5169
|
ENCODING1("macCyrillic", PM_ENCODING_MAC_CYRILLIC);
|
@@ -5082,31 +5176,39 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
|
|
5082
5176
|
ENCODING1("macThai", PM_ENCODING_MAC_THAI);
|
5083
5177
|
ENCODING1("macTurkish", PM_ENCODING_MAC_TURKISH);
|
5084
5178
|
ENCODING1("macUkraine", PM_ENCODING_MAC_UKRAINE);
|
5179
|
+
#endif
|
5085
5180
|
break;
|
5086
5181
|
case 'P': case 'p':
|
5087
5182
|
ENCODING1("PCK", PM_ENCODING_WINDOWS_31J);
|
5088
5183
|
break;
|
5089
5184
|
case 'S': case 's':
|
5090
|
-
ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
|
5091
5185
|
ENCODING1("SJIS", PM_ENCODING_WINDOWS_31J);
|
5186
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
5187
|
+
ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
|
5092
5188
|
ENCODING1("SJIS-DoCoMo", PM_ENCODING_SJIS_DOCOMO);
|
5093
5189
|
ENCODING1("SJIS-KDDI", PM_ENCODING_SJIS_KDDI);
|
5094
5190
|
ENCODING1("SJIS-SoftBank", PM_ENCODING_SJIS_SOFTBANK);
|
5095
5191
|
ENCODING1("stateless-ISO-2022-JP", PM_ENCODING_STATELESS_ISO_2022_JP);
|
5096
5192
|
ENCODING1("stateless-ISO-2022-JP-KDDI", PM_ENCODING_STATELESS_ISO_2022_JP_KDDI);
|
5193
|
+
#endif
|
5097
5194
|
break;
|
5098
5195
|
case 'T': case 't':
|
5196
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
5099
5197
|
ENCODING1("TIS-620", PM_ENCODING_TIS_620);
|
5198
|
+
#endif
|
5100
5199
|
break;
|
5101
5200
|
case 'U': case 'u':
|
5102
5201
|
ENCODING1("US-ASCII", PM_ENCODING_US_ASCII);
|
5202
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
5103
5203
|
ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC);
|
5104
5204
|
ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO);
|
5105
5205
|
ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI);
|
5106
5206
|
ENCODING1("UTF8-SoftBank", PM_ENCODING_UTF8_SOFTBANK);
|
5207
|
+
#endif
|
5107
5208
|
break;
|
5108
5209
|
case 'W': case 'w':
|
5109
5210
|
ENCODING1("Windows-31J", PM_ENCODING_WINDOWS_31J);
|
5211
|
+
#ifndef PRISM_ENCODING_EXCLUDE_FULL
|
5110
5212
|
ENCODING1("Windows-874", PM_ENCODING_WINDOWS_874);
|
5111
5213
|
ENCODING1("Windows-1250", PM_ENCODING_WINDOWS_1250);
|
5112
5214
|
ENCODING1("Windows-1251", PM_ENCODING_WINDOWS_1251);
|
@@ -5117,6 +5219,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
|
|
5117
5219
|
ENCODING1("Windows-1256", PM_ENCODING_WINDOWS_1256);
|
5118
5220
|
ENCODING1("Windows-1257", PM_ENCODING_WINDOWS_1257);
|
5119
5221
|
ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258);
|
5222
|
+
#endif
|
5120
5223
|
break;
|
5121
5224
|
case '6':
|
5122
5225
|
ENCODING1("646", PM_ENCODING_US_ASCII);
|