jruby-prism-parser 0.24.0-java → 1.4.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. checksums.yaml +4 -4
  2. data/BSDmakefile +58 -0
  3. data/CHANGELOG.md +269 -1
  4. data/CONTRIBUTING.md +0 -4
  5. data/Makefile +25 -18
  6. data/README.md +57 -6
  7. data/config.yml +1724 -140
  8. data/docs/build_system.md +39 -11
  9. data/docs/configuration.md +4 -0
  10. data/docs/cruby_compilation.md +1 -1
  11. data/docs/fuzzing.md +1 -1
  12. data/docs/parser_translation.md +14 -9
  13. data/docs/parsing_rules.md +4 -1
  14. data/docs/releasing.md +8 -10
  15. data/docs/relocation.md +34 -0
  16. data/docs/ripper_translation.md +72 -0
  17. data/docs/ruby_api.md +2 -1
  18. data/docs/serialization.md +29 -5
  19. data/ext/prism/api_node.c +3395 -1999
  20. data/ext/prism/api_pack.c +9 -0
  21. data/ext/prism/extconf.rb +55 -34
  22. data/ext/prism/extension.c +597 -346
  23. data/ext/prism/extension.h +6 -5
  24. data/include/prism/ast.h +2612 -455
  25. data/include/prism/defines.h +160 -2
  26. data/include/prism/diagnostic.h +188 -76
  27. data/include/prism/encoding.h +22 -4
  28. data/include/prism/node.h +89 -17
  29. data/include/prism/options.h +224 -12
  30. data/include/prism/pack.h +11 -0
  31. data/include/prism/parser.h +267 -66
  32. data/include/prism/prettyprint.h +8 -0
  33. data/include/prism/regexp.h +18 -8
  34. data/include/prism/static_literals.h +121 -0
  35. data/include/prism/util/pm_buffer.h +75 -2
  36. data/include/prism/util/pm_char.h +1 -2
  37. data/include/prism/util/pm_constant_pool.h +18 -9
  38. data/include/prism/util/pm_integer.h +126 -0
  39. data/include/prism/util/pm_list.h +1 -1
  40. data/include/prism/util/pm_newline_list.h +19 -0
  41. data/include/prism/util/pm_string.h +48 -8
  42. data/include/prism/version.h +3 -3
  43. data/include/prism.h +99 -5
  44. data/jruby-prism.jar +0 -0
  45. data/lib/prism/compiler.rb +11 -1
  46. data/lib/prism/desugar_compiler.rb +113 -74
  47. data/lib/prism/dispatcher.rb +45 -1
  48. data/lib/prism/dot_visitor.rb +201 -77
  49. data/lib/prism/dsl.rb +673 -461
  50. data/lib/prism/ffi.rb +233 -45
  51. data/lib/prism/inspect_visitor.rb +2389 -0
  52. data/lib/prism/lex_compat.rb +35 -16
  53. data/lib/prism/mutation_compiler.rb +24 -8
  54. data/lib/prism/node.rb +7731 -8460
  55. data/lib/prism/node_ext.rb +328 -32
  56. data/lib/prism/pack.rb +4 -0
  57. data/lib/prism/parse_result/comments.rb +34 -24
  58. data/lib/prism/parse_result/errors.rb +65 -0
  59. data/lib/prism/parse_result/newlines.rb +102 -12
  60. data/lib/prism/parse_result.rb +448 -44
  61. data/lib/prism/pattern.rb +28 -10
  62. data/lib/prism/polyfill/append_as_bytes.rb +15 -0
  63. data/lib/prism/polyfill/byteindex.rb +13 -0
  64. data/lib/prism/polyfill/unpack1.rb +14 -0
  65. data/lib/prism/reflection.rb +413 -0
  66. data/lib/prism/relocation.rb +504 -0
  67. data/lib/prism/serialize.rb +1940 -1198
  68. data/lib/prism/string_query.rb +30 -0
  69. data/lib/prism/translation/parser/builder.rb +61 -0
  70. data/lib/prism/translation/parser/compiler.rb +569 -195
  71. data/lib/prism/translation/parser/lexer.rb +516 -39
  72. data/lib/prism/translation/parser.rb +177 -12
  73. data/lib/prism/translation/parser33.rb +1 -1
  74. data/lib/prism/translation/parser34.rb +1 -1
  75. data/lib/prism/translation/parser35.rb +12 -0
  76. data/lib/prism/translation/ripper/sexp.rb +125 -0
  77. data/lib/prism/translation/ripper/shim.rb +5 -0
  78. data/lib/prism/translation/ripper.rb +3224 -462
  79. data/lib/prism/translation/ruby_parser.rb +194 -69
  80. data/lib/prism/translation.rb +4 -1
  81. data/lib/prism/version.rb +1 -1
  82. data/lib/prism/visitor.rb +13 -0
  83. data/lib/prism.rb +17 -27
  84. data/prism.gemspec +57 -17
  85. data/rbi/prism/compiler.rbi +12 -0
  86. data/rbi/prism/dsl.rbi +524 -0
  87. data/rbi/prism/inspect_visitor.rbi +12 -0
  88. data/rbi/prism/node.rbi +8722 -0
  89. data/rbi/prism/node_ext.rbi +107 -0
  90. data/rbi/prism/parse_result.rbi +404 -0
  91. data/rbi/prism/reflection.rbi +58 -0
  92. data/rbi/prism/string_query.rbi +12 -0
  93. data/rbi/prism/translation/parser.rbi +11 -0
  94. data/rbi/prism/translation/parser33.rbi +6 -0
  95. data/rbi/prism/translation/parser34.rbi +6 -0
  96. data/rbi/prism/translation/parser35.rbi +6 -0
  97. data/rbi/prism/translation/ripper.rbi +15 -0
  98. data/rbi/prism/visitor.rbi +473 -0
  99. data/rbi/prism.rbi +44 -7745
  100. data/sig/prism/compiler.rbs +9 -0
  101. data/sig/prism/dispatcher.rbs +16 -0
  102. data/sig/prism/dot_visitor.rbs +6 -0
  103. data/sig/prism/dsl.rbs +351 -0
  104. data/sig/prism/inspect_visitor.rbs +22 -0
  105. data/sig/prism/lex_compat.rbs +10 -0
  106. data/sig/prism/mutation_compiler.rbs +159 -0
  107. data/sig/prism/node.rbs +3614 -0
  108. data/sig/prism/node_ext.rbs +82 -0
  109. data/sig/prism/pack.rbs +43 -0
  110. data/sig/prism/parse_result.rbs +192 -0
  111. data/sig/prism/pattern.rbs +13 -0
  112. data/sig/prism/reflection.rbs +50 -0
  113. data/sig/prism/relocation.rbs +185 -0
  114. data/sig/prism/serialize.rbs +8 -0
  115. data/sig/prism/string_query.rbs +11 -0
  116. data/sig/prism/visitor.rbs +169 -0
  117. data/sig/prism.rbs +248 -4767
  118. data/src/diagnostic.c +672 -230
  119. data/src/encoding.c +211 -108
  120. data/src/node.c +7541 -1653
  121. data/src/options.c +135 -20
  122. data/src/pack.c +33 -17
  123. data/src/prettyprint.c +1543 -1485
  124. data/src/prism.c +7813 -3050
  125. data/src/regexp.c +225 -73
  126. data/src/serialize.c +101 -77
  127. data/src/static_literals.c +617 -0
  128. data/src/token_type.c +14 -13
  129. data/src/util/pm_buffer.c +187 -20
  130. data/src/util/pm_char.c +5 -5
  131. data/src/util/pm_constant_pool.c +39 -19
  132. data/src/util/pm_integer.c +670 -0
  133. data/src/util/pm_list.c +1 -1
  134. data/src/util/pm_newline_list.c +43 -5
  135. data/src/util/pm_string.c +213 -33
  136. data/src/util/pm_strncasecmp.c +13 -1
  137. data/src/util/pm_strpbrk.c +32 -6
  138. metadata +55 -19
  139. data/docs/ripper.md +0 -36
  140. data/include/prism/util/pm_state_stack.h +0 -42
  141. data/include/prism/util/pm_string_list.h +0 -44
  142. data/lib/prism/debug.rb +0 -206
  143. data/lib/prism/node_inspector.rb +0 -68
  144. data/lib/prism/translation/parser/rubocop.rb +0 -45
  145. data/rbi/prism_static.rbi +0 -207
  146. data/sig/prism_static.rbs +0 -201
  147. data/src/util/pm_state_stack.c +0 -25
  148. data/src/util/pm_string_list.c +0 -28
data/src/encoding.c CHANGED
@@ -1499,7 +1499,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
1499
1499
  0x31350, 0x323AF,
1500
1500
  };
1501
1501
 
1502
- #define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1296
1502
+ #define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1302
1503
1503
  static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_CODEPOINTS_LENGTH] = {
1504
1504
  0x100, 0x100,
1505
1505
  0x102, 0x102,
@@ -1582,9 +1582,9 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
1582
1582
  0x1B5, 0x1B5,
1583
1583
  0x1B7, 0x1B8,
1584
1584
  0x1BC, 0x1BC,
1585
- 0x1C4, 0x1C4,
1586
- 0x1C7, 0x1C7,
1587
- 0x1CA, 0x1CA,
1585
+ 0x1C4, 0x1C5,
1586
+ 0x1C7, 0x1C8,
1587
+ 0x1CA, 0x1CB,
1588
1588
  0x1CD, 0x1CD,
1589
1589
  0x1CF, 0x1CF,
1590
1590
  0x1D1, 0x1D1,
@@ -1602,7 +1602,7 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
1602
1602
  0x1EA, 0x1EA,
1603
1603
  0x1EC, 0x1EC,
1604
1604
  0x1EE, 0x1EE,
1605
- 0x1F1, 0x1F1,
1605
+ 0x1F1, 0x1F2,
1606
1606
  0x1F4, 0x1F4,
1607
1607
  0x1F6, 0x1F8,
1608
1608
  0x1FA, 0x1FA,
@@ -1910,11 +1910,14 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
1910
1910
  0x1F5D, 0x1F5D,
1911
1911
  0x1F5F, 0x1F5F,
1912
1912
  0x1F68, 0x1F6F,
1913
- 0x1FB8, 0x1FBB,
1914
- 0x1FC8, 0x1FCB,
1913
+ 0x1F88, 0x1F8F,
1914
+ 0x1F98, 0x1F9F,
1915
+ 0x1FA8, 0x1FAF,
1916
+ 0x1FB8, 0x1FBC,
1917
+ 0x1FC8, 0x1FCC,
1915
1918
  0x1FD8, 0x1FDB,
1916
1919
  0x1FE8, 0x1FEC,
1917
- 0x1FF8, 0x1FFB,
1920
+ 0x1FF8, 0x1FFC,
1918
1921
  0x2102, 0x2102,
1919
1922
  0x2107, 0x2107,
1920
1923
  0x210B, 0x210D,
@@ -2355,6 +2358,8 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
2355
2358
  }
2356
2359
  }
2357
2360
 
2361
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
2362
+
2358
2363
  static pm_unicode_codepoint_t
2359
2364
  pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
2360
2365
  if (b[0] < 0x80) {
@@ -2449,13 +2454,15 @@ pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
2449
2454
  }
2450
2455
  }
2451
2456
 
2457
+ #endif
2458
+
2452
2459
  #undef UNICODE_ALPHA_CODEPOINTS_LENGTH
2453
2460
  #undef UNICODE_ALNUM_CODEPOINTS_LENGTH
2454
2461
  #undef UNICODE_ISUPPER_CODEPOINTS_LENGTH
2455
2462
 
2456
2463
  /**
2457
2464
  * Each element of the following table contains a bitfield that indicates a
2458
- * piece of information about the corresponding ASCII character.
2465
+ * piece of information about the corresponding US-ASCII character.
2459
2466
  */
2460
2467
  static const uint8_t pm_encoding_ascii_table[256] = {
2461
2468
  // 0 1 2 3 4 5 6 7 8 9 A B C D E F
@@ -2477,6 +2484,8 @@ static const uint8_t pm_encoding_ascii_table[256] = {
2477
2484
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
2478
2485
  };
2479
2486
 
2487
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
2488
+
2480
2489
  /**
2481
2490
  * Each element of the following table contains a bitfield that indicates a
2482
2491
  * piece of information about the corresponding CP850 character.
@@ -3624,7 +3633,7 @@ static const uint8_t pm_encoding_windows_1250_table[256] = {
3624
3633
  0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7, // Ax
3625
3634
  0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 7, 0, 3, 3, // Bx
3626
3635
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
3627
- 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
3636
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
3628
3637
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
3629
3638
  3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
3630
3639
  };
@@ -3672,7 +3681,7 @@ static const uint8_t pm_encoding_windows_1252_table[256] = {
3672
3681
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Ax
3673
3682
  0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Bx
3674
3683
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
3675
- 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
3684
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
3676
3685
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
3677
3686
  3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
3678
3687
  };
@@ -3915,6 +3924,7 @@ PRISM_ENCODING_TABLE(windows_1258)
3915
3924
  PRISM_ENCODING_TABLE(windows_874)
3916
3925
 
3917
3926
  #undef PRISM_ENCODING_TABLE
3927
+ #endif
3918
3928
 
3919
3929
  /**
3920
3930
  * Returns the size of the next character in the ASCII encoding. This basically
@@ -3973,22 +3983,129 @@ pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_
3973
3983
  }
3974
3984
 
3975
3985
  /**
3976
- * Certain encodings are equivalent to ASCII below 0x80, so it works for our
3977
- * purposes to have a function here that first checks the bounds and then falls
3978
- * back to checking the ASCII lookup table.
3986
+ * For a lot of encodings the default is that they are a single byte long no
3987
+ * matter what the codepoint, so this function is shared between them.
3988
+ */
3989
+ static size_t
3990
+ pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
3991
+ return 1;
3992
+ }
3993
+
3994
+ /**
3995
+ * Returns the size of the next character in the EUC-JP encoding, or 0 if a
3996
+ * character cannot be decoded from the given bytes.
3997
+ */
3998
+ static size_t
3999
+ pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
4000
+ // These are the single byte characters.
4001
+ if (*b < 0x80) {
4002
+ return 1;
4003
+ }
4004
+
4005
+ // These are the double byte characters.
4006
+ if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
4007
+ return 2;
4008
+ }
4009
+
4010
+ // These are the triple byte characters.
4011
+ if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
4012
+ return 3;
4013
+ }
4014
+
4015
+ return 0;
4016
+ }
4017
+
4018
+ /**
4019
+ * Returns the size of the next character in the EUC-JP encoding if it is an
4020
+ * uppercase character.
3979
4021
  */
3980
4022
  static bool
3981
- pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) {
3982
- return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
4023
+ pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
4024
+ size_t width = pm_encoding_euc_jp_char_width(b, n);
4025
+
4026
+ if (width == 1) {
4027
+ return pm_encoding_ascii_isupper_char(b, n);
4028
+ } else if (width == 2) {
4029
+ return (
4030
+ (b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) ||
4031
+ (b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) ||
4032
+ (b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1)
4033
+ );
4034
+ } else {
4035
+ return false;
4036
+ }
3983
4037
  }
3984
4038
 
3985
4039
  /**
3986
- * For a lot of encodings the default is that they are a single byte long no
3987
- * matter what the codepoint, so this function is shared between them.
4040
+ * Returns the size of the next character in the Shift_JIS encoding, or 0 if a
4041
+ * character cannot be decoded from the given bytes.
3988
4042
  */
3989
4043
  static size_t
3990
- pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
3991
- return 1;
4044
+ pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
4045
+ // These are the single byte characters.
4046
+ if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) {
4047
+ return 1;
4048
+ }
4049
+
4050
+ // These are the double byte characters.
4051
+ if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) {
4052
+ return 2;
4053
+ }
4054
+
4055
+ return 0;
4056
+ }
4057
+
4058
+ /**
4059
+ * Returns the size of the next character in the Shift_JIS encoding if it is an
4060
+ * alphanumeric character.
4061
+ */
4062
+ static size_t
4063
+ pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) {
4064
+ size_t width = pm_encoding_shift_jis_char_width(b, n);
4065
+ return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n)) : width;
4066
+ }
4067
+
4068
+ /**
4069
+ * Returns the size of the next character in the Shift_JIS encoding if it is an
4070
+ * alphabetical character.
4071
+ */
4072
+ static size_t
4073
+ pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) {
4074
+ size_t width = pm_encoding_shift_jis_char_width(b, n);
4075
+ return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n)) : width;
4076
+ }
4077
+
4078
+ /**
4079
+ * Returns the size of the next character in the Shift_JIS encoding if it is an
4080
+ * uppercase character.
4081
+ */
4082
+ static bool
4083
+ pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
4084
+ size_t width = pm_encoding_shift_jis_char_width(b, n);
4085
+
4086
+ if (width == 1) {
4087
+ return pm_encoding_ascii_isupper_char(b, n);
4088
+ } else if (width == 2) {
4089
+ return (
4090
+ ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) ||
4091
+ ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) ||
4092
+ ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60))
4093
+ );
4094
+ } else {
4095
+ return width;
4096
+ }
4097
+ }
4098
+
4099
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
4100
+
4101
+ /**
4102
+ * Certain encodings are equivalent to ASCII below 0x80, so it works for our
4103
+ * purposes to have a function here that first checks the bounds and then falls
4104
+ * back to checking the ASCII lookup table.
4105
+ */
4106
+ static bool
4107
+ pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) {
4108
+ return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
3992
4109
  }
3993
4110
 
3994
4111
  /**
@@ -4022,7 +4139,7 @@ pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) {
4022
4139
  }
4023
4140
 
4024
4141
  // These are the double byte characters
4025
- if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xfe) && (b[1] >= 0x41 && b[1] <= 0xfe)) {
4142
+ if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xFE) && ((b[1] >= 0x41 && b[1] <= 0x5A) || (b[1] >= 0x61 && b[1] <= 0x7A) || (b[1] >= 0x81 && b[1] <= 0xFE))) {
4026
4143
  return 2;
4027
4144
  }
4028
4145
 
@@ -4072,30 +4189,6 @@ pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) {
4072
4189
  return 0;
4073
4190
  }
4074
4191
 
4075
- /**
4076
- * Returns the size of the next character in the EUC-JP encoding, or 0 if a
4077
- * character cannot be decoded from the given bytes.
4078
- */
4079
- static size_t
4080
- pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
4081
- // These are the single byte characters.
4082
- if (*b < 0x80) {
4083
- return 1;
4084
- }
4085
-
4086
- // These are the double byte characters.
4087
- if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
4088
- return 2;
4089
- }
4090
-
4091
- // These are the triple byte characters.
4092
- if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
4093
- return 3;
4094
- }
4095
-
4096
- return 0;
4097
- }
4098
-
4099
4192
  /**
4100
4193
  * Returns the size of the next character in the EUC-KR encoding, or 0 if a
4101
4194
  * character cannot be decoded from the given bytes.
@@ -4194,24 +4287,7 @@ pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
4194
4287
  return 0;
4195
4288
  }
4196
4289
 
4197
- /**
4198
- * Returns the size of the next character in the Shift_JIS encoding, or 0 if a
4199
- * character cannot be decoded from the given bytes.
4200
- */
4201
- static size_t
4202
- pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
4203
- // These are the single byte characters.
4204
- if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) {
4205
- return 1;
4206
- }
4207
-
4208
- // These are the double byte characters.
4209
- if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC)) {
4210
- return 2;
4211
- }
4212
-
4213
- return 0;
4214
- }
4290
+ #endif
4215
4291
 
4216
4292
  /**
4217
4293
  * This is the table of all of the encodings that prism supports.
@@ -4225,6 +4301,14 @@ const pm_encoding_t pm_encodings[] = {
4225
4301
  .isupper_char = pm_encoding_utf_8_isupper_char,
4226
4302
  .multibyte = true
4227
4303
  },
4304
+ [PM_ENCODING_US_ASCII] = {
4305
+ .name = "US-ASCII",
4306
+ .char_width = pm_encoding_ascii_char_width,
4307
+ .alnum_char = pm_encoding_ascii_alnum_char,
4308
+ .alpha_char = pm_encoding_ascii_alpha_char,
4309
+ .isupper_char = pm_encoding_ascii_isupper_char,
4310
+ .multibyte = false
4311
+ },
4228
4312
  [PM_ENCODING_ASCII_8BIT] = {
4229
4313
  .name = "ASCII-8BIT",
4230
4314
  .char_width = pm_encoding_single_char_width,
@@ -4233,6 +4317,24 @@ const pm_encoding_t pm_encodings[] = {
4233
4317
  .isupper_char = pm_encoding_ascii_isupper_char,
4234
4318
  .multibyte = false
4235
4319
  },
4320
+ [PM_ENCODING_EUC_JP] = {
4321
+ .name = "EUC-JP",
4322
+ .char_width = pm_encoding_euc_jp_char_width,
4323
+ .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4324
+ .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4325
+ .isupper_char = pm_encoding_euc_jp_isupper_char,
4326
+ .multibyte = true
4327
+ },
4328
+ [PM_ENCODING_WINDOWS_31J] = {
4329
+ .name = "Windows-31J",
4330
+ .char_width = pm_encoding_shift_jis_char_width,
4331
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
4332
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
4333
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
4334
+ .multibyte = true
4335
+ },
4336
+
4337
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
4236
4338
  [PM_ENCODING_BIG5] = {
4237
4339
  .name = "Big5",
4238
4340
  .char_width = pm_encoding_big5_char_width,
@@ -4270,7 +4372,7 @@ const pm_encoding_t pm_encodings[] = {
4270
4372
  .char_width = pm_encoding_euc_jp_char_width,
4271
4373
  .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4272
4374
  .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4273
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4375
+ .isupper_char = pm_encoding_euc_jp_isupper_char,
4274
4376
  .multibyte = true
4275
4377
  },
4276
4378
  [PM_ENCODING_CP850] = {
@@ -4329,20 +4431,12 @@ const pm_encoding_t pm_encodings[] = {
4329
4431
  .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4330
4432
  .multibyte = true
4331
4433
  },
4332
- [PM_ENCODING_EUC_JP] = {
4333
- .name = "EUC-JP",
4334
- .char_width = pm_encoding_euc_jp_char_width,
4335
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4336
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4337
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4338
- .multibyte = true
4339
- },
4340
4434
  [PM_ENCODING_EUC_JP_MS] = {
4341
4435
  .name = "eucJP-ms",
4342
4436
  .char_width = pm_encoding_euc_jp_char_width,
4343
4437
  .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4344
4438
  .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4345
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4439
+ .isupper_char = pm_encoding_euc_jp_isupper_char,
4346
4440
  .multibyte = true
4347
4441
  },
4348
4442
  [PM_ENCODING_EUC_JIS_2004] = {
@@ -4350,7 +4444,7 @@ const pm_encoding_t pm_encodings[] = {
4350
4444
  .char_width = pm_encoding_euc_jp_char_width,
4351
4445
  .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4352
4446
  .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4353
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4447
+ .isupper_char = pm_encoding_euc_jp_isupper_char,
4354
4448
  .multibyte = true
4355
4449
  },
4356
4450
  [PM_ENCODING_EUC_KR] = {
@@ -4708,9 +4802,9 @@ const pm_encoding_t pm_encodings[] = {
4708
4802
  [PM_ENCODING_MAC_JAPANESE] = {
4709
4803
  .name = "MacJapanese",
4710
4804
  .char_width = pm_encoding_shift_jis_char_width,
4711
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4712
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4713
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4805
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
4806
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
4807
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
4714
4808
  .multibyte = true
4715
4809
  },
4716
4810
  [PM_ENCODING_MAC_ROMAN] = {
@@ -4756,33 +4850,33 @@ const pm_encoding_t pm_encodings[] = {
4756
4850
  [PM_ENCODING_SHIFT_JIS] = {
4757
4851
  .name = "Shift_JIS",
4758
4852
  .char_width = pm_encoding_shift_jis_char_width,
4759
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4760
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4761
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4853
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
4854
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
4855
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
4762
4856
  .multibyte = true
4763
4857
  },
4764
4858
  [PM_ENCODING_SJIS_DOCOMO] = {
4765
4859
  .name = "SJIS-DoCoMo",
4766
4860
  .char_width = pm_encoding_shift_jis_char_width,
4767
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4768
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4769
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4861
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
4862
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
4863
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
4770
4864
  .multibyte = true
4771
4865
  },
4772
4866
  [PM_ENCODING_SJIS_KDDI] = {
4773
4867
  .name = "SJIS-KDDI",
4774
4868
  .char_width = pm_encoding_shift_jis_char_width,
4775
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4776
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4777
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4869
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
4870
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
4871
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
4778
4872
  .multibyte = true
4779
4873
  },
4780
4874
  [PM_ENCODING_SJIS_SOFTBANK] = {
4781
4875
  .name = "SJIS-SoftBank",
4782
4876
  .char_width = pm_encoding_shift_jis_char_width,
4783
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4784
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4785
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4877
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
4878
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
4879
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
4786
4880
  .multibyte = true
4787
4881
  },
4788
4882
  [PM_ENCODING_STATELESS_ISO_2022_JP] = {
@@ -4809,14 +4903,6 @@ const pm_encoding_t pm_encodings[] = {
4809
4903
  .isupper_char = pm_encoding_tis_620_isupper_char,
4810
4904
  .multibyte = false
4811
4905
  },
4812
- [PM_ENCODING_US_ASCII] = {
4813
- .name = "US-ASCII",
4814
- .char_width = pm_encoding_ascii_char_width,
4815
- .alnum_char = pm_encoding_ascii_alnum_char,
4816
- .alpha_char = pm_encoding_ascii_alpha_char,
4817
- .isupper_char = pm_encoding_ascii_isupper_char,
4818
- .multibyte = false
4819
- },
4820
4906
  [PM_ENCODING_UTF8_MAC] = {
4821
4907
  .name = "UTF8-MAC",
4822
4908
  .char_width = pm_encoding_utf_8_char_width,
@@ -4921,14 +5007,6 @@ const pm_encoding_t pm_encodings[] = {
4921
5007
  .isupper_char = pm_encoding_windows_1258_isupper_char,
4922
5008
  .multibyte = false
4923
5009
  },
4924
- [PM_ENCODING_WINDOWS_31J] = {
4925
- .name = "Windows-31J",
4926
- .char_width = pm_encoding_shift_jis_char_width,
4927
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
4928
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
4929
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
4930
- .multibyte = true
4931
- },
4932
5010
  [PM_ENCODING_WINDOWS_874] = {
4933
5011
  .name = "Windows-874",
4934
5012
  .char_width = pm_encoding_single_char_width,
@@ -4937,6 +5015,7 @@ const pm_encoding_t pm_encodings[] = {
4937
5015
  .isupper_char = pm_encoding_windows_874_isupper_char,
4938
5016
  .multibyte = false
4939
5017
  }
5018
+ #endif
4940
5019
  };
4941
5020
 
4942
5021
  /**
@@ -4951,11 +5030,13 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
4951
5030
  // UTF-8 can contain extra information at the end about the platform it is
4952
5031
  // encoded on, such as UTF-8-MAC or UTF-8-UNIX. We'll ignore those suffixes.
4953
5032
  if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "UTF-8", 5) == 0)) {
5033
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
4954
5034
  // We need to explicitly handle UTF-8-HFS, as that one needs to switch
4955
5035
  // over to being UTF8-MAC.
4956
5036
  if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-HFS", 4) == 0)) {
4957
5037
  return &pm_encodings[PM_ENCODING_UTF8_MAC];
4958
5038
  }
5039
+ #endif
4959
5040
 
4960
5041
  // Otherwise we'll return the default UTF-8 encoding.
4961
5042
  return PM_ENCODING_UTF_8_ENTRY;
@@ -4975,11 +5056,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
4975
5056
  break;
4976
5057
  case 'B': case 'b':
4977
5058
  ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT);
5059
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
4978
5060
  ENCODING1("Big5", PM_ENCODING_BIG5);
4979
5061
  ENCODING2("Big5-HKSCS", "Big5-HKSCS:2008", PM_ENCODING_BIG5_HKSCS);
4980
5062
  ENCODING1("Big5-UAO", PM_ENCODING_BIG5_UAO);
5063
+ #endif
4981
5064
  break;
4982
5065
  case 'C': case 'c':
5066
+ ENCODING1("CP65001", PM_ENCODING_UTF_8);
5067
+ ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
5068
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
4983
5069
  ENCODING1("CESU-8", PM_ENCODING_CESU_8);
4984
5070
  ENCODING1("CP437", PM_ENCODING_IBM437);
4985
5071
  ENCODING1("CP720", PM_ENCODING_IBM720);
@@ -4999,7 +5085,6 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
4999
5085
  ENCODING1("CP874", PM_ENCODING_WINDOWS_874);
5000
5086
  ENCODING1("CP878", PM_ENCODING_KOI8_R);
5001
5087
  ENCODING1("CP863", PM_ENCODING_IBM863);
5002
- ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
5003
5088
  ENCODING1("CP936", PM_ENCODING_GBK);
5004
5089
  ENCODING1("CP949", PM_ENCODING_CP949);
5005
5090
  ENCODING1("CP950", PM_ENCODING_CP950);
@@ -5014,25 +5099,30 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
5014
5099
  ENCODING1("CP1257", PM_ENCODING_WINDOWS_1257);
5015
5100
  ENCODING1("CP1258", PM_ENCODING_WINDOWS_1258);
5016
5101
  ENCODING1("CP51932", PM_ENCODING_CP51932);
5017
- ENCODING1("CP65001", PM_ENCODING_UTF_8);
5102
+ #endif
5018
5103
  break;
5019
5104
  case 'E': case 'e':
5020
5105
  ENCODING2("EUC-JP", "eucJP", PM_ENCODING_EUC_JP);
5106
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5021
5107
  ENCODING2("eucJP-ms", "euc-jp-ms", PM_ENCODING_EUC_JP_MS);
5022
5108
  ENCODING2("EUC-JIS-2004", "EUC-JISX0213", PM_ENCODING_EUC_JIS_2004);
5023
5109
  ENCODING2("EUC-KR", "eucKR", PM_ENCODING_EUC_KR);
5024
5110
  ENCODING2("EUC-CN", "eucCN", PM_ENCODING_GB2312);
5025
5111
  ENCODING2("EUC-TW", "eucTW", PM_ENCODING_EUC_TW);
5026
5112
  ENCODING1("Emacs-Mule", PM_ENCODING_EMACS_MULE);
5113
+ #endif
5027
5114
  break;
5028
5115
  case 'G': case 'g':
5116
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5029
5117
  ENCODING1("GBK", PM_ENCODING_GBK);
5030
5118
  ENCODING1("GB12345", PM_ENCODING_GB12345);
5031
5119
  ENCODING1("GB18030", PM_ENCODING_GB18030);
5032
5120
  ENCODING1("GB1988", PM_ENCODING_GB1988);
5033
5121
  ENCODING1("GB2312", PM_ENCODING_GB2312);
5122
+ #endif
5034
5123
  break;
5035
5124
  case 'I': case 'i':
5125
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5036
5126
  ENCODING1("IBM437", PM_ENCODING_IBM437);
5037
5127
  ENCODING1("IBM720", PM_ENCODING_IBM720);
5038
5128
  ENCODING1("IBM737", PM_ENCODING_IBM737);
@@ -5064,12 +5154,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
5064
5154
  ENCODING2("ISO-8859-14", "ISO8859-14", PM_ENCODING_ISO_8859_14);
5065
5155
  ENCODING2("ISO-8859-15", "ISO8859-15", PM_ENCODING_ISO_8859_15);
5066
5156
  ENCODING2("ISO-8859-16", "ISO8859-16", PM_ENCODING_ISO_8859_16);
5157
+ #endif
5067
5158
  break;
5068
5159
  case 'K': case 'k':
5160
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5069
5161
  ENCODING1("KOI8-R", PM_ENCODING_KOI8_R);
5070
5162
  ENCODING1("KOI8-U", PM_ENCODING_KOI8_U);
5163
+ #endif
5071
5164
  break;
5072
5165
  case 'M': case 'm':
5166
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5073
5167
  ENCODING1("macCentEuro", PM_ENCODING_MAC_CENT_EURO);
5074
5168
  ENCODING1("macCroatian", PM_ENCODING_MAC_CROATIAN);
5075
5169
  ENCODING1("macCyrillic", PM_ENCODING_MAC_CYRILLIC);
@@ -5082,31 +5176,39 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
5082
5176
  ENCODING1("macThai", PM_ENCODING_MAC_THAI);
5083
5177
  ENCODING1("macTurkish", PM_ENCODING_MAC_TURKISH);
5084
5178
  ENCODING1("macUkraine", PM_ENCODING_MAC_UKRAINE);
5179
+ #endif
5085
5180
  break;
5086
5181
  case 'P': case 'p':
5087
5182
  ENCODING1("PCK", PM_ENCODING_WINDOWS_31J);
5088
5183
  break;
5089
5184
  case 'S': case 's':
5090
- ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
5091
5185
  ENCODING1("SJIS", PM_ENCODING_WINDOWS_31J);
5186
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5187
+ ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
5092
5188
  ENCODING1("SJIS-DoCoMo", PM_ENCODING_SJIS_DOCOMO);
5093
5189
  ENCODING1("SJIS-KDDI", PM_ENCODING_SJIS_KDDI);
5094
5190
  ENCODING1("SJIS-SoftBank", PM_ENCODING_SJIS_SOFTBANK);
5095
5191
  ENCODING1("stateless-ISO-2022-JP", PM_ENCODING_STATELESS_ISO_2022_JP);
5096
5192
  ENCODING1("stateless-ISO-2022-JP-KDDI", PM_ENCODING_STATELESS_ISO_2022_JP_KDDI);
5193
+ #endif
5097
5194
  break;
5098
5195
  case 'T': case 't':
5196
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5099
5197
  ENCODING1("TIS-620", PM_ENCODING_TIS_620);
5198
+ #endif
5100
5199
  break;
5101
5200
  case 'U': case 'u':
5102
5201
  ENCODING1("US-ASCII", PM_ENCODING_US_ASCII);
5202
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5103
5203
  ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC);
5104
5204
  ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO);
5105
5205
  ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI);
5106
5206
  ENCODING1("UTF8-SoftBank", PM_ENCODING_UTF8_SOFTBANK);
5207
+ #endif
5107
5208
  break;
5108
5209
  case 'W': case 'w':
5109
5210
  ENCODING1("Windows-31J", PM_ENCODING_WINDOWS_31J);
5211
+ #ifndef PRISM_ENCODING_EXCLUDE_FULL
5110
5212
  ENCODING1("Windows-874", PM_ENCODING_WINDOWS_874);
5111
5213
  ENCODING1("Windows-1250", PM_ENCODING_WINDOWS_1250);
5112
5214
  ENCODING1("Windows-1251", PM_ENCODING_WINDOWS_1251);
@@ -5117,6 +5219,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
5117
5219
  ENCODING1("Windows-1256", PM_ENCODING_WINDOWS_1256);
5118
5220
  ENCODING1("Windows-1257", PM_ENCODING_WINDOWS_1257);
5119
5221
  ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258);
5222
+ #endif
5120
5223
  break;
5121
5224
  case '6':
5122
5225
  ENCODING1("646", PM_ENCODING_US_ASCII);