prism 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +31 -1
  3. data/README.md +2 -1
  4. data/config.yml +188 -55
  5. data/docs/building.md +9 -2
  6. data/docs/configuration.md +10 -9
  7. data/docs/encoding.md +24 -56
  8. data/docs/local_variable_depth.md +229 -0
  9. data/docs/ruby_api.md +2 -0
  10. data/docs/serialization.md +18 -13
  11. data/ext/prism/api_node.c +337 -195
  12. data/ext/prism/extconf.rb +13 -7
  13. data/ext/prism/extension.c +96 -32
  14. data/ext/prism/extension.h +1 -1
  15. data/include/prism/ast.h +340 -137
  16. data/include/prism/defines.h +17 -0
  17. data/include/prism/diagnostic.h +11 -5
  18. data/include/prism/encoding.h +248 -0
  19. data/include/prism/options.h +2 -2
  20. data/include/prism/parser.h +62 -42
  21. data/include/prism/regexp.h +2 -2
  22. data/include/prism/util/pm_buffer.h +9 -1
  23. data/include/prism/util/pm_memchr.h +2 -2
  24. data/include/prism/util/pm_strpbrk.h +3 -3
  25. data/include/prism/version.h +2 -2
  26. data/include/prism.h +13 -15
  27. data/lib/prism/compiler.rb +12 -0
  28. data/lib/prism/debug.rb +9 -4
  29. data/lib/prism/desugar_compiler.rb +3 -3
  30. data/lib/prism/dispatcher.rb +56 -0
  31. data/lib/prism/dot_visitor.rb +476 -198
  32. data/lib/prism/dsl.rb +66 -46
  33. data/lib/prism/ffi.rb +16 -3
  34. data/lib/prism/lex_compat.rb +19 -9
  35. data/lib/prism/mutation_compiler.rb +20 -0
  36. data/lib/prism/node.rb +1173 -450
  37. data/lib/prism/node_ext.rb +41 -16
  38. data/lib/prism/parse_result.rb +12 -15
  39. data/lib/prism/ripper_compat.rb +49 -34
  40. data/lib/prism/serialize.rb +242 -212
  41. data/lib/prism/visitor.rb +12 -0
  42. data/lib/prism.rb +20 -4
  43. data/prism.gemspec +4 -10
  44. data/rbi/prism.rbi +605 -230
  45. data/rbi/prism_static.rbi +3 -0
  46. data/sig/prism.rbs +379 -124
  47. data/sig/prism_static.rbs +1 -0
  48. data/src/diagnostic.c +228 -222
  49. data/src/encoding.c +5137 -0
  50. data/src/node.c +66 -0
  51. data/src/options.c +21 -2
  52. data/src/prettyprint.c +806 -406
  53. data/src/prism.c +1092 -700
  54. data/src/regexp.c +3 -3
  55. data/src/serialize.c +227 -157
  56. data/src/util/pm_buffer.c +10 -1
  57. data/src/util/pm_memchr.c +1 -1
  58. data/src/util/pm_strpbrk.c +4 -4
  59. metadata +5 -11
  60. data/include/prism/enc/pm_encoding.h +0 -227
  61. data/src/enc/pm_big5.c +0 -116
  62. data/src/enc/pm_cp51932.c +0 -57
  63. data/src/enc/pm_euc_jp.c +0 -69
  64. data/src/enc/pm_gbk.c +0 -65
  65. data/src/enc/pm_shift_jis.c +0 -57
  66. data/src/enc/pm_tables.c +0 -2073
  67. data/src/enc/pm_unicode.c +0 -2369
  68. data/src/enc/pm_windows_31j.c +0 -57
data/src/util/pm_buffer.c CHANGED
@@ -138,7 +138,7 @@ pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value) {
138
138
  * Append a 32-bit unsigned integer to the buffer as a variable-length integer.
139
139
  */
140
140
  void
141
- pm_buffer_append_varint(pm_buffer_t *buffer, uint32_t value) {
141
+ pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value) {
142
142
  if (value < 128) {
143
143
  pm_buffer_append_byte(buffer, (uint8_t) value);
144
144
  } else {
@@ -151,6 +151,15 @@ pm_buffer_append_varint(pm_buffer_t *buffer, uint32_t value) {
151
151
  }
152
152
  }
153
153
 
154
+ /**
155
+ * Append a 32-bit signed integer to the buffer as a variable-length integer.
156
+ */
157
+ void
158
+ pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value) {
159
+ uint32_t unsigned_int = ((uint32_t)(value) << 1) ^ ((uint32_t)(value >> 31));
160
+ pm_buffer_append_varuint(buffer, unsigned_int);
161
+ }
162
+
154
163
  /**
155
164
  * Concatenate one buffer onto another.
156
165
  */
data/src/util/pm_memchr.c CHANGED
@@ -8,7 +8,7 @@
8
8
  * of a multibyte character.
9
9
  */
10
10
  void *
11
- pm_memchr(const void *memory, int character, size_t number, bool encoding_changed, pm_encoding_t *encoding) {
11
+ pm_memchr(const void *memory, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding) {
12
12
  if (encoding_changed && encoding->multibyte && character >= PRISM_MEMCHR_TRAILING_BYTE_MINIMUM) {
13
13
  const uint8_t *source = (const uint8_t *) memory;
14
14
  size_t index = 0;
@@ -4,7 +4,7 @@
4
4
  * This is the slow path that does care about the encoding.
5
5
  */
6
6
  static inline const uint8_t *
7
- pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
7
+ pm_strpbrk_multi_byte(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
8
8
  size_t index = 0;
9
9
 
10
10
  while (index < maximum) {
@@ -12,7 +12,7 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
12
12
  return source + index;
13
13
  }
14
14
 
15
- size_t width = parser->encoding.char_width(source + index, (ptrdiff_t) (maximum - index));
15
+ size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
16
16
  if (width == 0) {
17
17
  return NULL;
18
18
  }
@@ -61,10 +61,10 @@ pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t max
61
61
  * need to take a slower path and iterate one multi-byte character at a time.
62
62
  */
63
63
  const uint8_t *
64
- pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
64
+ pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
65
65
  if (length <= 0) {
66
66
  return NULL;
67
- } else if (parser->encoding_changed && parser->encoding.multibyte) {
67
+ } else if (parser->encoding_changed && parser->encoding->multibyte) {
68
68
  return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length);
69
69
  } else {
70
70
  return pm_strpbrk_single_byte(source, charset, (size_t) length);
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: prism
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.18.0
4
+ version: 0.19.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shopify
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-11-21 00:00:00.000000000 Z
11
+ date: 2023-12-14 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email:
@@ -33,6 +33,7 @@ files:
33
33
  - docs/fuzzing.md
34
34
  - docs/heredocs.md
35
35
  - docs/javascript.md
36
+ - docs/local_variable_depth.md
36
37
  - docs/mapping.md
37
38
  - docs/releasing.md
38
39
  - docs/ripper.md
@@ -48,7 +49,7 @@ files:
48
49
  - include/prism/ast.h
49
50
  - include/prism/defines.h
50
51
  - include/prism/diagnostic.h
51
- - include/prism/enc/pm_encoding.h
52
+ - include/prism/encoding.h
52
53
  - include/prism/node.h
53
54
  - include/prism/options.h
54
55
  - include/prism/pack.h
@@ -94,14 +95,7 @@ files:
94
95
  - sig/prism.rbs
95
96
  - sig/prism_static.rbs
96
97
  - src/diagnostic.c
97
- - src/enc/pm_big5.c
98
- - src/enc/pm_cp51932.c
99
- - src/enc/pm_euc_jp.c
100
- - src/enc/pm_gbk.c
101
- - src/enc/pm_shift_jis.c
102
- - src/enc/pm_tables.c
103
- - src/enc/pm_unicode.c
104
- - src/enc/pm_windows_31j.c
98
+ - src/encoding.c
105
99
  - src/node.c
106
100
  - src/options.c
107
101
  - src/pack.c
@@ -1,227 +0,0 @@
1
- /**
2
- * @file pm_encoding.h
3
- *
4
- * The encoding interface and implementations used by the parser.
5
- */
6
- #ifndef PRISM_ENCODING_H
7
- #define PRISM_ENCODING_H
8
-
9
- #include "prism/defines.h"
10
-
11
- #include <assert.h>
12
- #include <stdbool.h>
13
- #include <stddef.h>
14
- #include <stdint.h>
15
-
16
- /**
17
- * This struct defines the functions necessary to implement the encoding
18
- * interface so we can determine how many bytes the subsequent character takes.
19
- * Each callback should return the number of bytes, or 0 if the next bytes are
20
- * invalid for the encoding and type.
21
- */
22
- typedef struct {
23
- /**
24
- * Return the number of bytes that the next character takes if it is valid
25
- * in the encoding. Does not read more than n bytes. It is assumed that n is
26
- * at least 1.
27
- */
28
- size_t (*char_width)(const uint8_t *b, ptrdiff_t n);
29
-
30
- /**
31
- * Return the number of bytes that the next character takes if it is valid
32
- * in the encoding and is alphabetical. Does not read more than n bytes. It
33
- * is assumed that n is at least 1.
34
- */
35
- size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);
36
-
37
- /**
38
- * Return the number of bytes that the next character takes if it is valid
39
- * in the encoding and is alphanumeric. Does not read more than n bytes. It
40
- * is assumed that n is at least 1.
41
- */
42
- size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);
43
-
44
- /**
45
- * Return true if the next character is valid in the encoding and is an
46
- * uppercase character. Does not read more than n bytes. It is assumed that
47
- * n is at least 1.
48
- */
49
- bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);
50
-
51
- /**
52
- * The name of the encoding. This should correspond to a value that can be
53
- * passed to Encoding.find in Ruby.
54
- */
55
- const char *name;
56
-
57
- /**
58
- * Return true if the encoding is a multibyte encoding.
59
- */
60
- bool multibyte;
61
- } pm_encoding_t;
62
-
63
- /**
64
- * All of the lookup tables use the first bit of each embedded byte to indicate
65
- * whether the codepoint is alphabetical.
66
- */
67
- #define PRISM_ENCODING_ALPHABETIC_BIT 1 << 0
68
-
69
- /**
70
- * All of the lookup tables use the second bit of each embedded byte to indicate
71
- * whether the codepoint is alphanumeric.
72
- */
73
- #define PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1
74
-
75
- /**
76
- * All of the lookup tables use the third bit of each embedded byte to indicate
77
- * whether the codepoint is uppercase.
78
- */
79
- #define PRISM_ENCODING_UPPERCASE_BIT 1 << 2
80
-
81
- /**
82
- * Return the size of the next character in the ASCII encoding if it is an
83
- * alphabetical character.
84
- *
85
- * @param b The bytes to read.
86
- * @param n The number of bytes that can be read.
87
- * @returns The number of bytes that the next character takes if it is valid in
88
- * the encoding, or 0 if it is not.
89
- */
90
- size_t pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
91
-
92
- /**
93
- * Return the size of the next character in the ASCII encoding if it is an
94
- * alphanumeric character.
95
- *
96
- * @param b The bytes to read.
97
- * @param n The number of bytes that can be read.
98
- * @returns The number of bytes that the next character takes if it is valid in
99
- * the encoding, or 0 if it is not.
100
- */
101
- size_t pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
102
-
103
- /**
104
- * Return true if the next character in the ASCII encoding if it is an uppercase
105
- * character.
106
- *
107
- * @param b The bytes to read.
108
- * @param n The number of bytes that can be read.
109
- * @returns True if the next character is valid in the encoding and is an
110
- * uppercase character, or false if it is not.
111
- */
112
- bool pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
113
-
114
- /**
115
- * Return the size of the next character in the UTF-8 encoding if it is an
116
- * alphabetical character.
117
- *
118
- * @param b The bytes to read.
119
- * @param n The number of bytes that can be read.
120
- * @returns The number of bytes that the next character takes if it is valid in
121
- * the encoding, or 0 if it is not.
122
- */
123
- size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
124
-
125
- /**
126
- * Return the size of the next character in the UTF-8 encoding if it is an
127
- * alphanumeric character.
128
- *
129
- * @param b The bytes to read.
130
- * @param n The number of bytes that can be read.
131
- * @returns The number of bytes that the next character takes if it is valid in
132
- * the encoding, or 0 if it is not.
133
- */
134
- size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
135
-
136
- /**
137
- * Return true if the next character in the UTF-8 encoding if it is an uppercase
138
- * character.
139
- *
140
- * @param b The bytes to read.
141
- * @param n The number of bytes that can be read.
142
- * @returns True if the next character is valid in the encoding and is an
143
- * uppercase character, or false if it is not.
144
- */
145
- bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
146
-
147
- /**
148
- * This lookup table is referenced in both the UTF-8 encoding file and the
149
- * parser directly in order to speed up the default encoding processing. It is
150
- * used to indicate whether a character is alphabetical, alphanumeric, or
151
- * uppercase in unicode mappings.
152
- */
153
- extern const uint8_t pm_encoding_unicode_table[256];
154
-
155
- // Below are the encodings that are supported by the parser. They are defined in
156
- // their own files in the src/enc directory.
157
-
158
- extern pm_encoding_t pm_encoding_ascii;
159
- extern pm_encoding_t pm_encoding_ascii_8bit;
160
- extern pm_encoding_t pm_encoding_big5;
161
- extern pm_encoding_t pm_encoding_big5_hkscs;
162
- extern pm_encoding_t pm_encoding_big5_uao;
163
- extern pm_encoding_t pm_encoding_cp51932;
164
- extern pm_encoding_t pm_encoding_cp850;
165
- extern pm_encoding_t pm_encoding_cp852;
166
- extern pm_encoding_t pm_encoding_cp855;
167
- extern pm_encoding_t pm_encoding_euc_jp;
168
- extern pm_encoding_t pm_encoding_gb1988;
169
- extern pm_encoding_t pm_encoding_gbk;
170
- extern pm_encoding_t pm_encoding_ibm437;
171
- extern pm_encoding_t pm_encoding_ibm720;
172
- extern pm_encoding_t pm_encoding_ibm737;
173
- extern pm_encoding_t pm_encoding_ibm775;
174
- extern pm_encoding_t pm_encoding_ibm852;
175
- extern pm_encoding_t pm_encoding_ibm855;
176
- extern pm_encoding_t pm_encoding_ibm857;
177
- extern pm_encoding_t pm_encoding_ibm860;
178
- extern pm_encoding_t pm_encoding_ibm861;
179
- extern pm_encoding_t pm_encoding_ibm862;
180
- extern pm_encoding_t pm_encoding_ibm863;
181
- extern pm_encoding_t pm_encoding_ibm864;
182
- extern pm_encoding_t pm_encoding_ibm865;
183
- extern pm_encoding_t pm_encoding_ibm866;
184
- extern pm_encoding_t pm_encoding_ibm869;
185
- extern pm_encoding_t pm_encoding_iso_8859_1;
186
- extern pm_encoding_t pm_encoding_iso_8859_2;
187
- extern pm_encoding_t pm_encoding_iso_8859_3;
188
- extern pm_encoding_t pm_encoding_iso_8859_4;
189
- extern pm_encoding_t pm_encoding_iso_8859_5;
190
- extern pm_encoding_t pm_encoding_iso_8859_6;
191
- extern pm_encoding_t pm_encoding_iso_8859_7;
192
- extern pm_encoding_t pm_encoding_iso_8859_8;
193
- extern pm_encoding_t pm_encoding_iso_8859_9;
194
- extern pm_encoding_t pm_encoding_iso_8859_10;
195
- extern pm_encoding_t pm_encoding_iso_8859_11;
196
- extern pm_encoding_t pm_encoding_iso_8859_13;
197
- extern pm_encoding_t pm_encoding_iso_8859_14;
198
- extern pm_encoding_t pm_encoding_iso_8859_15;
199
- extern pm_encoding_t pm_encoding_iso_8859_16;
200
- extern pm_encoding_t pm_encoding_koi8_r;
201
- extern pm_encoding_t pm_encoding_mac_cent_euro;
202
- extern pm_encoding_t pm_encoding_mac_croatian;
203
- extern pm_encoding_t pm_encoding_mac_cyrillic;
204
- extern pm_encoding_t pm_encoding_mac_greek;
205
- extern pm_encoding_t pm_encoding_mac_iceland;
206
- extern pm_encoding_t pm_encoding_mac_roman;
207
- extern pm_encoding_t pm_encoding_mac_romania;
208
- extern pm_encoding_t pm_encoding_mac_thai;
209
- extern pm_encoding_t pm_encoding_mac_turkish;
210
- extern pm_encoding_t pm_encoding_mac_ukraine;
211
- extern pm_encoding_t pm_encoding_shift_jis;
212
- extern pm_encoding_t pm_encoding_tis_620;
213
- extern pm_encoding_t pm_encoding_utf_8;
214
- extern pm_encoding_t pm_encoding_utf8_mac;
215
- extern pm_encoding_t pm_encoding_windows_1250;
216
- extern pm_encoding_t pm_encoding_windows_1251;
217
- extern pm_encoding_t pm_encoding_windows_1252;
218
- extern pm_encoding_t pm_encoding_windows_1253;
219
- extern pm_encoding_t pm_encoding_windows_1254;
220
- extern pm_encoding_t pm_encoding_windows_1255;
221
- extern pm_encoding_t pm_encoding_windows_1256;
222
- extern pm_encoding_t pm_encoding_windows_1257;
223
- extern pm_encoding_t pm_encoding_windows_1258;
224
- extern pm_encoding_t pm_encoding_windows_31j;
225
- extern pm_encoding_t pm_encoding_windows_874;
226
-
227
- #endif
data/src/enc/pm_big5.c DELETED
@@ -1,116 +0,0 @@
1
- #include "prism/enc/pm_encoding.h"
2
-
3
- static size_t
4
- pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) {
5
- // These are the single byte characters.
6
- if (*b < 0x80) {
7
- return 1;
8
- }
9
-
10
- // These are the double byte characters.
11
- if ((n > 1) && (b[0] >= 0xA1 && b[0] <= 0xFE) && (b[1] >= 0x40 && b[1] <= 0xFE)) {
12
- return 2;
13
- }
14
-
15
- return 0;
16
- }
17
-
18
- static size_t
19
- pm_encoding_big5_star_char_width(const uint8_t *b, ptrdiff_t n) {
20
- // These are the single byte characters.
21
- if (*b < 0x80) {
22
- return 1;
23
- }
24
-
25
- // These are the double byte characters.
26
- if ((n > 1) && (b[0] >= 0x87 && b[0] <= 0xFE) &&
27
- ((b[1] >= 0x40 && b[1] <= 0x7E) || (b[1] >= 0xA1 && b[1] <= 0xFE))) {
28
- return 2;
29
- }
30
-
31
- return 0;
32
- }
33
-
34
- static size_t
35
- pm_encoding_big5_alpha_char(const uint8_t *b, ptrdiff_t n) {
36
- if (pm_encoding_big5_char_width(b, n) == 1) {
37
- return pm_encoding_ascii_alpha_char(b, n);
38
- } else {
39
- return 0;
40
- }
41
- }
42
-
43
- static size_t
44
- pm_encoding_big5_star_alpha_char(const uint8_t *b, ptrdiff_t n) {
45
- if (pm_encoding_big5_star_char_width(b, n) == 1) {
46
- return pm_encoding_ascii_alpha_char(b, n);
47
- } else {
48
- return 0;
49
- }
50
- }
51
-
52
- static size_t
53
- pm_encoding_big5_alnum_char(const uint8_t *b, ptrdiff_t n) {
54
- if (pm_encoding_big5_char_width(b, n) == 1) {
55
- return pm_encoding_ascii_alnum_char(b, n);
56
- } else {
57
- return 0;
58
- }
59
- }
60
-
61
- static size_t
62
- pm_encoding_big5_star_alnum_char(const uint8_t *b, ptrdiff_t n) {
63
- if (pm_encoding_big5_star_char_width(b, n) == 1) {
64
- return pm_encoding_ascii_alnum_char(b, n);
65
- } else {
66
- return 0;
67
- }
68
- }
69
-
70
- static bool
71
- pm_encoding_big5_isupper_char(const uint8_t *b, ptrdiff_t n) {
72
- if (pm_encoding_big5_char_width(b, n) == 1) {
73
- return pm_encoding_ascii_isupper_char(b, n);
74
- } else {
75
- return false;
76
- }
77
- }
78
-
79
- static bool
80
- pm_encoding_big5_star_isupper_char(const uint8_t *b, ptrdiff_t n) {
81
- if (pm_encoding_big5_star_char_width(b, n) == 1) {
82
- return pm_encoding_ascii_isupper_char(b, n);
83
- } else {
84
- return false;
85
- }
86
- }
87
-
88
- /** Big5 encoding */
89
- pm_encoding_t pm_encoding_big5 = {
90
- .name = "big5",
91
- .char_width = pm_encoding_big5_char_width,
92
- .alnum_char = pm_encoding_big5_alnum_char,
93
- .alpha_char = pm_encoding_big5_alpha_char,
94
- .isupper_char = pm_encoding_big5_isupper_char,
95
- .multibyte = true
96
- };
97
-
98
- /** Big5-HKSCS encoding */
99
- pm_encoding_t pm_encoding_big5_hkscs = {
100
- .name = "big5-hkscs",
101
- .char_width = pm_encoding_big5_star_char_width,
102
- .alnum_char = pm_encoding_big5_star_alnum_char,
103
- .alpha_char = pm_encoding_big5_star_alpha_char,
104
- .isupper_char = pm_encoding_big5_star_isupper_char,
105
- .multibyte = true
106
- };
107
-
108
- /** Big5-UAO encoding */
109
- pm_encoding_t pm_encoding_big5_uao = {
110
- .name = "big5-uao",
111
- .char_width = pm_encoding_big5_star_char_width,
112
- .alnum_char = pm_encoding_big5_star_alnum_char,
113
- .alpha_char = pm_encoding_big5_star_alpha_char,
114
- .isupper_char = pm_encoding_big5_star_isupper_char,
115
- .multibyte = true
116
- };
data/src/enc/pm_cp51932.c DELETED
@@ -1,57 +0,0 @@
1
- #include "prism/enc/pm_encoding.h"
2
-
3
- static size_t
4
- pm_encoding_cp51932_char_width(const uint8_t *b, ptrdiff_t n) {
5
- // These are the single byte characters.
6
- if (*b < 0x80) {
7
- return 1;
8
- }
9
-
10
- // These are the double byte characters.
11
- if (
12
- (n > 1) &&
13
- ((b[0] >= 0xa1 && b[0] <= 0xfe) || (b[0] == 0x8e)) &&
14
- (b[1] >= 0xa1 && b[1] <= 0xfe)
15
- ) {
16
- return 2;
17
- }
18
-
19
- return 0;
20
- }
21
-
22
- static size_t
23
- pm_encoding_cp51932_alpha_char(const uint8_t *b, ptrdiff_t n) {
24
- if (pm_encoding_cp51932_char_width(b, n) == 1) {
25
- return pm_encoding_ascii_alpha_char(b, n);
26
- } else {
27
- return 0;
28
- }
29
- }
30
-
31
- static size_t
32
- pm_encoding_cp51932_alnum_char(const uint8_t *b, ptrdiff_t n) {
33
- if (pm_encoding_cp51932_char_width(b, n) == 1) {
34
- return pm_encoding_ascii_alnum_char(b, n);
35
- } else {
36
- return 0;
37
- }
38
- }
39
-
40
- static bool
41
- pm_encoding_cp51932_isupper_char(const uint8_t *b, ptrdiff_t n) {
42
- if (pm_encoding_cp51932_char_width(b, n) == 1) {
43
- return pm_encoding_ascii_isupper_char(b, n);
44
- } else {
45
- return 0;
46
- }
47
- }
48
-
49
- /** cp51932 encoding */
50
- pm_encoding_t pm_encoding_cp51932 = {
51
- .name = "cp51932",
52
- .char_width = pm_encoding_cp51932_char_width,
53
- .alnum_char = pm_encoding_cp51932_alnum_char,
54
- .alpha_char = pm_encoding_cp51932_alpha_char,
55
- .isupper_char = pm_encoding_cp51932_isupper_char,
56
- .multibyte = true
57
- };
data/src/enc/pm_euc_jp.c DELETED
@@ -1,69 +0,0 @@
1
- #include "prism/enc/pm_encoding.h"
2
-
3
- static size_t
4
- pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
5
- // These are the single byte characters.
6
- if (*b < 0x80) {
7
- return 1;
8
- }
9
-
10
- // These are the double byte characters.
11
- if (
12
- (n > 1) &&
13
- (
14
- ((b[0] == 0x8E) && (b[1] >= 0xA1 && b[1] <= 0xFE)) ||
15
- ((b[0] >= 0xA1 && b[0] <= 0xFE) && (b[1] >= 0xA1 && b[1] <= 0xFE))
16
- )
17
- ) {
18
- return 2;
19
- }
20
-
21
- // These are the triple byte characters.
22
- if (
23
- (n > 2) &&
24
- (b[0] == 0x8F) &&
25
- (b[1] >= 0xA1 && b[2] <= 0xFE) &&
26
- (b[2] >= 0xA1 && b[2] <= 0xFE)
27
- ) {
28
- return 3;
29
- }
30
-
31
- return 0;
32
- }
33
-
34
- static size_t
35
- pm_encoding_euc_jp_alpha_char(const uint8_t *b, ptrdiff_t n) {
36
- if (pm_encoding_euc_jp_char_width(b, n) == 1) {
37
- return pm_encoding_ascii_alpha_char(b, n);
38
- } else {
39
- return 0;
40
- }
41
- }
42
-
43
- static size_t
44
- pm_encoding_euc_jp_alnum_char(const uint8_t *b, ptrdiff_t n) {
45
- if (pm_encoding_euc_jp_char_width(b, n) == 1) {
46
- return pm_encoding_ascii_alnum_char(b, n);
47
- } else {
48
- return 0;
49
- }
50
- }
51
-
52
- static bool
53
- pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
54
- if (pm_encoding_euc_jp_char_width(b, n) == 1) {
55
- return pm_encoding_ascii_isupper_char(b, n);
56
- } else {
57
- return 0;
58
- }
59
- }
60
-
61
- /** EUC-JP encoding */
62
- pm_encoding_t pm_encoding_euc_jp = {
63
- .name = "euc-jp",
64
- .char_width = pm_encoding_euc_jp_char_width,
65
- .alnum_char = pm_encoding_euc_jp_alnum_char,
66
- .alpha_char = pm_encoding_euc_jp_alpha_char,
67
- .isupper_char = pm_encoding_euc_jp_isupper_char,
68
- .multibyte = true
69
- };
data/src/enc/pm_gbk.c DELETED
@@ -1,65 +0,0 @@
1
- #include "prism/enc/pm_encoding.h"
2
-
3
- static size_t
4
- pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
5
- // These are the single byte characters.
6
- if (*b <= 0x80) {
7
- return 1;
8
- }
9
-
10
- // These are the double byte characters.
11
- if (
12
- (n > 1) &&
13
- (
14
- ((b[0] >= 0xA1 && b[0] <= 0xA9) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // GBK/1
15
- ((b[0] >= 0xB0 && b[0] <= 0xF7) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // GBK/2
16
- ((b[0] >= 0x81 && b[0] <= 0xA0) && (b[1] >= 0x40 && b[1] <= 0xFE) && (b[1] != 0x7F)) || // GBK/3
17
- ((b[0] >= 0xAA && b[0] <= 0xFE) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) || // GBK/4
18
- ((b[0] >= 0xA8 && b[0] <= 0xA9) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) || // GBK/5
19
- ((b[0] >= 0xAA && b[0] <= 0xAF) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // user-defined 1
20
- ((b[0] >= 0xF8 && b[0] <= 0xFE) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // user-defined 2
21
- ((b[0] >= 0xA1 && b[0] <= 0xA7) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) // user-defined 3
22
- )
23
- ) {
24
- return 2;
25
- }
26
-
27
- return 0;
28
- }
29
-
30
- static size_t
31
- pm_encoding_gbk_alpha_char(const uint8_t *b, ptrdiff_t n) {
32
- if (pm_encoding_gbk_char_width(b, n) == 1) {
33
- return pm_encoding_ascii_alpha_char(b, n);
34
- } else {
35
- return 0;
36
- }
37
- }
38
-
39
- static size_t
40
- pm_encoding_gbk_alnum_char(const uint8_t *b, ptrdiff_t n) {
41
- if (pm_encoding_gbk_char_width(b, n) == 1) {
42
- return pm_encoding_ascii_alnum_char(b, n);
43
- } else {
44
- return 0;
45
- }
46
- }
47
-
48
- static bool
49
- pm_encoding_gbk_isupper_char(const uint8_t *b, ptrdiff_t n) {
50
- if (pm_encoding_gbk_char_width(b, n) == 1) {
51
- return pm_encoding_ascii_isupper_char(b, n);
52
- } else {
53
- return false;
54
- }
55
- }
56
-
57
- /** GBK encoding */
58
- pm_encoding_t pm_encoding_gbk = {
59
- .name = "gbk",
60
- .char_width = pm_encoding_gbk_char_width,
61
- .alnum_char = pm_encoding_gbk_alnum_char,
62
- .alpha_char = pm_encoding_gbk_alpha_char,
63
- .isupper_char = pm_encoding_gbk_isupper_char,
64
- .multibyte = true
65
- };