prism 0.18.0 → 0.19.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +31 -1
  3. data/README.md +2 -1
  4. data/config.yml +188 -55
  5. data/docs/building.md +9 -2
  6. data/docs/configuration.md +10 -9
  7. data/docs/encoding.md +24 -56
  8. data/docs/local_variable_depth.md +229 -0
  9. data/docs/ruby_api.md +2 -0
  10. data/docs/serialization.md +18 -13
  11. data/ext/prism/api_node.c +337 -195
  12. data/ext/prism/extconf.rb +13 -7
  13. data/ext/prism/extension.c +96 -32
  14. data/ext/prism/extension.h +1 -1
  15. data/include/prism/ast.h +340 -137
  16. data/include/prism/defines.h +17 -0
  17. data/include/prism/diagnostic.h +11 -5
  18. data/include/prism/encoding.h +248 -0
  19. data/include/prism/options.h +2 -2
  20. data/include/prism/parser.h +62 -42
  21. data/include/prism/regexp.h +2 -2
  22. data/include/prism/util/pm_buffer.h +9 -1
  23. data/include/prism/util/pm_memchr.h +2 -2
  24. data/include/prism/util/pm_strpbrk.h +3 -3
  25. data/include/prism/version.h +2 -2
  26. data/include/prism.h +13 -15
  27. data/lib/prism/compiler.rb +12 -0
  28. data/lib/prism/debug.rb +9 -4
  29. data/lib/prism/desugar_compiler.rb +3 -3
  30. data/lib/prism/dispatcher.rb +56 -0
  31. data/lib/prism/dot_visitor.rb +476 -198
  32. data/lib/prism/dsl.rb +66 -46
  33. data/lib/prism/ffi.rb +16 -3
  34. data/lib/prism/lex_compat.rb +19 -9
  35. data/lib/prism/mutation_compiler.rb +20 -0
  36. data/lib/prism/node.rb +1173 -450
  37. data/lib/prism/node_ext.rb +41 -16
  38. data/lib/prism/parse_result.rb +12 -15
  39. data/lib/prism/ripper_compat.rb +49 -34
  40. data/lib/prism/serialize.rb +242 -212
  41. data/lib/prism/visitor.rb +12 -0
  42. data/lib/prism.rb +20 -4
  43. data/prism.gemspec +4 -10
  44. data/rbi/prism.rbi +605 -230
  45. data/rbi/prism_static.rbi +3 -0
  46. data/sig/prism.rbs +379 -124
  47. data/sig/prism_static.rbs +1 -0
  48. data/src/diagnostic.c +228 -222
  49. data/src/encoding.c +5137 -0
  50. data/src/node.c +66 -0
  51. data/src/options.c +21 -2
  52. data/src/prettyprint.c +806 -406
  53. data/src/prism.c +1092 -700
  54. data/src/regexp.c +3 -3
  55. data/src/serialize.c +227 -157
  56. data/src/util/pm_buffer.c +10 -1
  57. data/src/util/pm_memchr.c +1 -1
  58. data/src/util/pm_strpbrk.c +4 -4
  59. metadata +5 -11
  60. data/include/prism/enc/pm_encoding.h +0 -227
  61. data/src/enc/pm_big5.c +0 -116
  62. data/src/enc/pm_cp51932.c +0 -57
  63. data/src/enc/pm_euc_jp.c +0 -69
  64. data/src/enc/pm_gbk.c +0 -65
  65. data/src/enc/pm_shift_jis.c +0 -57
  66. data/src/enc/pm_tables.c +0 -2073
  67. data/src/enc/pm_unicode.c +0 -2369
  68. data/src/enc/pm_windows_31j.c +0 -57
data/src/util/pm_buffer.c CHANGED
@@ -138,7 +138,7 @@ pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value) {
138
138
  * Append a 32-bit unsigned integer to the buffer as a variable-length integer.
139
139
  */
140
140
  void
141
- pm_buffer_append_varint(pm_buffer_t *buffer, uint32_t value) {
141
+ pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value) {
142
142
  if (value < 128) {
143
143
  pm_buffer_append_byte(buffer, (uint8_t) value);
144
144
  } else {
@@ -151,6 +151,15 @@ pm_buffer_append_varint(pm_buffer_t *buffer, uint32_t value) {
151
151
  }
152
152
  }
153
153
 
154
+ /**
155
+ * Append a 32-bit signed integer to the buffer as a variable-length integer.
156
+ */
157
+ void
158
+ pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value) {
159
+ uint32_t unsigned_int = ((uint32_t)(value) << 1) ^ ((uint32_t)(value >> 31));
160
+ pm_buffer_append_varuint(buffer, unsigned_int);
161
+ }
162
+
154
163
  /**
155
164
  * Concatenate one buffer onto another.
156
165
  */
data/src/util/pm_memchr.c CHANGED
@@ -8,7 +8,7 @@
8
8
  * of a multibyte character.
9
9
  */
10
10
  void *
11
- pm_memchr(const void *memory, int character, size_t number, bool encoding_changed, pm_encoding_t *encoding) {
11
+ pm_memchr(const void *memory, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding) {
12
12
  if (encoding_changed && encoding->multibyte && character >= PRISM_MEMCHR_TRAILING_BYTE_MINIMUM) {
13
13
  const uint8_t *source = (const uint8_t *) memory;
14
14
  size_t index = 0;
data/src/util/pm_strpbrk.c CHANGED
@@ -4,7 +4,7 @@
4
4
  * This is the slow path that does care about the encoding.
5
5
  */
6
6
  static inline const uint8_t *
7
- pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
7
+ pm_strpbrk_multi_byte(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
8
8
  size_t index = 0;
9
9
 
10
10
  while (index < maximum) {
@@ -12,7 +12,7 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
12
12
  return source + index;
13
13
  }
14
14
 
15
- size_t width = parser->encoding.char_width(source + index, (ptrdiff_t) (maximum - index));
15
+ size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
16
16
  if (width == 0) {
17
17
  return NULL;
18
18
  }
@@ -61,10 +61,10 @@ pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t max
61
61
  * need to take a slower path and iterate one multi-byte character at a time.
62
62
  */
63
63
  const uint8_t *
64
- pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
64
+ pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
65
65
  if (length <= 0) {
66
66
  return NULL;
67
- } else if (parser->encoding_changed && parser->encoding.multibyte) {
67
+ } else if (parser->encoding_changed && parser->encoding->multibyte) {
68
68
  return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length);
69
69
  } else {
70
70
  return pm_strpbrk_single_byte(source, charset, (size_t) length);
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: prism
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.18.0
4
+ version: 0.19.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shopify
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-11-21 00:00:00.000000000 Z
11
+ date: 2023-12-14 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email:
@@ -33,6 +33,7 @@ files:
33
33
  - docs/fuzzing.md
34
34
  - docs/heredocs.md
35
35
  - docs/javascript.md
36
+ - docs/local_variable_depth.md
36
37
  - docs/mapping.md
37
38
  - docs/releasing.md
38
39
  - docs/ripper.md
@@ -48,7 +49,7 @@ files:
48
49
  - include/prism/ast.h
49
50
  - include/prism/defines.h
50
51
  - include/prism/diagnostic.h
51
- - include/prism/enc/pm_encoding.h
52
+ - include/prism/encoding.h
52
53
  - include/prism/node.h
53
54
  - include/prism/options.h
54
55
  - include/prism/pack.h
@@ -94,14 +95,7 @@ files:
94
95
  - sig/prism.rbs
95
96
  - sig/prism_static.rbs
96
97
  - src/diagnostic.c
97
- - src/enc/pm_big5.c
98
- - src/enc/pm_cp51932.c
99
- - src/enc/pm_euc_jp.c
100
- - src/enc/pm_gbk.c
101
- - src/enc/pm_shift_jis.c
102
- - src/enc/pm_tables.c
103
- - src/enc/pm_unicode.c
104
- - src/enc/pm_windows_31j.c
98
+ - src/encoding.c
105
99
  - src/node.c
106
100
  - src/options.c
107
101
  - src/pack.c
data/include/prism/enc/pm_encoding.h DELETED
@@ -1,227 +0,0 @@
1
- /**
2
- * @file pm_encoding.h
3
- *
4
- * The encoding interface and implementations used by the parser.
5
- */
6
- #ifndef PRISM_ENCODING_H
7
- #define PRISM_ENCODING_H
8
-
9
- #include "prism/defines.h"
10
-
11
- #include <assert.h>
12
- #include <stdbool.h>
13
- #include <stddef.h>
14
- #include <stdint.h>
15
-
16
- /**
17
- * This struct defines the functions necessary to implement the encoding
18
- * interface so we can determine how many bytes the subsequent character takes.
19
- * Each callback should return the number of bytes, or 0 if the next bytes are
20
- * invalid for the encoding and type.
21
- */
22
- typedef struct {
23
- /**
24
- * Return the number of bytes that the next character takes if it is valid
25
- * in the encoding. Does not read more than n bytes. It is assumed that n is
26
- * at least 1.
27
- */
28
- size_t (*char_width)(const uint8_t *b, ptrdiff_t n);
29
-
30
- /**
31
- * Return the number of bytes that the next character takes if it is valid
32
- * in the encoding and is alphabetical. Does not read more than n bytes. It
33
- * is assumed that n is at least 1.
34
- */
35
- size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);
36
-
37
- /**
38
- * Return the number of bytes that the next character takes if it is valid
39
- * in the encoding and is alphanumeric. Does not read more than n bytes. It
40
- * is assumed that n is at least 1.
41
- */
42
- size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);
43
-
44
- /**
45
- * Return true if the next character is valid in the encoding and is an
46
- * uppercase character. Does not read more than n bytes. It is assumed that
47
- * n is at least 1.
48
- */
49
- bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);
50
-
51
- /**
52
- * The name of the encoding. This should correspond to a value that can be
53
- * passed to Encoding.find in Ruby.
54
- */
55
- const char *name;
56
-
57
- /**
58
- * Return true if the encoding is a multibyte encoding.
59
- */
60
- bool multibyte;
61
- } pm_encoding_t;
62
-
63
- /**
64
- * All of the lookup tables use the first bit of each embedded byte to indicate
65
- * whether the codepoint is alphabetical.
66
- */
67
- #define PRISM_ENCODING_ALPHABETIC_BIT 1 << 0
68
-
69
- /**
70
- * All of the lookup tables use the second bit of each embedded byte to indicate
71
- * whether the codepoint is alphanumeric.
72
- */
73
- #define PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1
74
-
75
- /**
76
- * All of the lookup tables use the third bit of each embedded byte to indicate
77
- * whether the codepoint is uppercase.
78
- */
79
- #define PRISM_ENCODING_UPPERCASE_BIT 1 << 2
80
-
81
- /**
82
- * Return the size of the next character in the ASCII encoding if it is an
83
- * alphabetical character.
84
- *
85
- * @param b The bytes to read.
86
- * @param n The number of bytes that can be read.
87
- * @returns The number of bytes that the next character takes if it is valid in
88
- * the encoding, or 0 if it is not.
89
- */
90
- size_t pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
91
-
92
- /**
93
- * Return the size of the next character in the ASCII encoding if it is an
94
- * alphanumeric character.
95
- *
96
- * @param b The bytes to read.
97
- * @param n The number of bytes that can be read.
98
- * @returns The number of bytes that the next character takes if it is valid in
99
- * the encoding, or 0 if it is not.
100
- */
101
- size_t pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
102
-
103
- /**
104
- * Return true if the next character in the ASCII encoding if it is an uppercase
105
- * character.
106
- *
107
- * @param b The bytes to read.
108
- * @param n The number of bytes that can be read.
109
- * @returns True if the next character is valid in the encoding and is an
110
- * uppercase character, or false if it is not.
111
- */
112
- bool pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
113
-
114
- /**
115
- * Return the size of the next character in the UTF-8 encoding if it is an
116
- * alphabetical character.
117
- *
118
- * @param b The bytes to read.
119
- * @param n The number of bytes that can be read.
120
- * @returns The number of bytes that the next character takes if it is valid in
121
- * the encoding, or 0 if it is not.
122
- */
123
- size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
124
-
125
- /**
126
- * Return the size of the next character in the UTF-8 encoding if it is an
127
- * alphanumeric character.
128
- *
129
- * @param b The bytes to read.
130
- * @param n The number of bytes that can be read.
131
- * @returns The number of bytes that the next character takes if it is valid in
132
- * the encoding, or 0 if it is not.
133
- */
134
- size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
135
-
136
- /**
137
- * Return true if the next character in the UTF-8 encoding if it is an uppercase
138
- * character.
139
- *
140
- * @param b The bytes to read.
141
- * @param n The number of bytes that can be read.
142
- * @returns True if the next character is valid in the encoding and is an
143
- * uppercase character, or false if it is not.
144
- */
145
- bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
146
-
147
- /**
148
- * This lookup table is referenced in both the UTF-8 encoding file and the
149
- * parser directly in order to speed up the default encoding processing. It is
150
- * used to indicate whether a character is alphabetical, alphanumeric, or
151
- * uppercase in unicode mappings.
152
- */
153
- extern const uint8_t pm_encoding_unicode_table[256];
154
-
155
- // Below are the encodings that are supported by the parser. They are defined in
156
- // their own files in the src/enc directory.
157
-
158
- extern pm_encoding_t pm_encoding_ascii;
159
- extern pm_encoding_t pm_encoding_ascii_8bit;
160
- extern pm_encoding_t pm_encoding_big5;
161
- extern pm_encoding_t pm_encoding_big5_hkscs;
162
- extern pm_encoding_t pm_encoding_big5_uao;
163
- extern pm_encoding_t pm_encoding_cp51932;
164
- extern pm_encoding_t pm_encoding_cp850;
165
- extern pm_encoding_t pm_encoding_cp852;
166
- extern pm_encoding_t pm_encoding_cp855;
167
- extern pm_encoding_t pm_encoding_euc_jp;
168
- extern pm_encoding_t pm_encoding_gb1988;
169
- extern pm_encoding_t pm_encoding_gbk;
170
- extern pm_encoding_t pm_encoding_ibm437;
171
- extern pm_encoding_t pm_encoding_ibm720;
172
- extern pm_encoding_t pm_encoding_ibm737;
173
- extern pm_encoding_t pm_encoding_ibm775;
174
- extern pm_encoding_t pm_encoding_ibm852;
175
- extern pm_encoding_t pm_encoding_ibm855;
176
- extern pm_encoding_t pm_encoding_ibm857;
177
- extern pm_encoding_t pm_encoding_ibm860;
178
- extern pm_encoding_t pm_encoding_ibm861;
179
- extern pm_encoding_t pm_encoding_ibm862;
180
- extern pm_encoding_t pm_encoding_ibm863;
181
- extern pm_encoding_t pm_encoding_ibm864;
182
- extern pm_encoding_t pm_encoding_ibm865;
183
- extern pm_encoding_t pm_encoding_ibm866;
184
- extern pm_encoding_t pm_encoding_ibm869;
185
- extern pm_encoding_t pm_encoding_iso_8859_1;
186
- extern pm_encoding_t pm_encoding_iso_8859_2;
187
- extern pm_encoding_t pm_encoding_iso_8859_3;
188
- extern pm_encoding_t pm_encoding_iso_8859_4;
189
- extern pm_encoding_t pm_encoding_iso_8859_5;
190
- extern pm_encoding_t pm_encoding_iso_8859_6;
191
- extern pm_encoding_t pm_encoding_iso_8859_7;
192
- extern pm_encoding_t pm_encoding_iso_8859_8;
193
- extern pm_encoding_t pm_encoding_iso_8859_9;
194
- extern pm_encoding_t pm_encoding_iso_8859_10;
195
- extern pm_encoding_t pm_encoding_iso_8859_11;
196
- extern pm_encoding_t pm_encoding_iso_8859_13;
197
- extern pm_encoding_t pm_encoding_iso_8859_14;
198
- extern pm_encoding_t pm_encoding_iso_8859_15;
199
- extern pm_encoding_t pm_encoding_iso_8859_16;
200
- extern pm_encoding_t pm_encoding_koi8_r;
201
- extern pm_encoding_t pm_encoding_mac_cent_euro;
202
- extern pm_encoding_t pm_encoding_mac_croatian;
203
- extern pm_encoding_t pm_encoding_mac_cyrillic;
204
- extern pm_encoding_t pm_encoding_mac_greek;
205
- extern pm_encoding_t pm_encoding_mac_iceland;
206
- extern pm_encoding_t pm_encoding_mac_roman;
207
- extern pm_encoding_t pm_encoding_mac_romania;
208
- extern pm_encoding_t pm_encoding_mac_thai;
209
- extern pm_encoding_t pm_encoding_mac_turkish;
210
- extern pm_encoding_t pm_encoding_mac_ukraine;
211
- extern pm_encoding_t pm_encoding_shift_jis;
212
- extern pm_encoding_t pm_encoding_tis_620;
213
- extern pm_encoding_t pm_encoding_utf_8;
214
- extern pm_encoding_t pm_encoding_utf8_mac;
215
- extern pm_encoding_t pm_encoding_windows_1250;
216
- extern pm_encoding_t pm_encoding_windows_1251;
217
- extern pm_encoding_t pm_encoding_windows_1252;
218
- extern pm_encoding_t pm_encoding_windows_1253;
219
- extern pm_encoding_t pm_encoding_windows_1254;
220
- extern pm_encoding_t pm_encoding_windows_1255;
221
- extern pm_encoding_t pm_encoding_windows_1256;
222
- extern pm_encoding_t pm_encoding_windows_1257;
223
- extern pm_encoding_t pm_encoding_windows_1258;
224
- extern pm_encoding_t pm_encoding_windows_31j;
225
- extern pm_encoding_t pm_encoding_windows_874;
226
-
227
- #endif
data/src/enc/pm_big5.c DELETED
@@ -1,116 +0,0 @@
1
- #include "prism/enc/pm_encoding.h"
2
-
3
- static size_t
4
- pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) {
5
- // These are the single byte characters.
6
- if (*b < 0x80) {
7
- return 1;
8
- }
9
-
10
- // These are the double byte characters.
11
- if ((n > 1) && (b[0] >= 0xA1 && b[0] <= 0xFE) && (b[1] >= 0x40 && b[1] <= 0xFE)) {
12
- return 2;
13
- }
14
-
15
- return 0;
16
- }
17
-
18
- static size_t
19
- pm_encoding_big5_star_char_width(const uint8_t *b, ptrdiff_t n) {
20
- // These are the single byte characters.
21
- if (*b < 0x80) {
22
- return 1;
23
- }
24
-
25
- // These are the double byte characters.
26
- if ((n > 1) && (b[0] >= 0x87 && b[0] <= 0xFE) &&
27
- ((b[1] >= 0x40 && b[1] <= 0x7E) || (b[1] >= 0xA1 && b[1] <= 0xFE))) {
28
- return 2;
29
- }
30
-
31
- return 0;
32
- }
33
-
34
- static size_t
35
- pm_encoding_big5_alpha_char(const uint8_t *b, ptrdiff_t n) {
36
- if (pm_encoding_big5_char_width(b, n) == 1) {
37
- return pm_encoding_ascii_alpha_char(b, n);
38
- } else {
39
- return 0;
40
- }
41
- }
42
-
43
- static size_t
44
- pm_encoding_big5_star_alpha_char(const uint8_t *b, ptrdiff_t n) {
45
- if (pm_encoding_big5_star_char_width(b, n) == 1) {
46
- return pm_encoding_ascii_alpha_char(b, n);
47
- } else {
48
- return 0;
49
- }
50
- }
51
-
52
- static size_t
53
- pm_encoding_big5_alnum_char(const uint8_t *b, ptrdiff_t n) {
54
- if (pm_encoding_big5_char_width(b, n) == 1) {
55
- return pm_encoding_ascii_alnum_char(b, n);
56
- } else {
57
- return 0;
58
- }
59
- }
60
-
61
- static size_t
62
- pm_encoding_big5_star_alnum_char(const uint8_t *b, ptrdiff_t n) {
63
- if (pm_encoding_big5_star_char_width(b, n) == 1) {
64
- return pm_encoding_ascii_alnum_char(b, n);
65
- } else {
66
- return 0;
67
- }
68
- }
69
-
70
- static bool
71
- pm_encoding_big5_isupper_char(const uint8_t *b, ptrdiff_t n) {
72
- if (pm_encoding_big5_char_width(b, n) == 1) {
73
- return pm_encoding_ascii_isupper_char(b, n);
74
- } else {
75
- return false;
76
- }
77
- }
78
-
79
- static bool
80
- pm_encoding_big5_star_isupper_char(const uint8_t *b, ptrdiff_t n) {
81
- if (pm_encoding_big5_star_char_width(b, n) == 1) {
82
- return pm_encoding_ascii_isupper_char(b, n);
83
- } else {
84
- return false;
85
- }
86
- }
87
-
88
- /** Big5 encoding */
89
- pm_encoding_t pm_encoding_big5 = {
90
- .name = "big5",
91
- .char_width = pm_encoding_big5_char_width,
92
- .alnum_char = pm_encoding_big5_alnum_char,
93
- .alpha_char = pm_encoding_big5_alpha_char,
94
- .isupper_char = pm_encoding_big5_isupper_char,
95
- .multibyte = true
96
- };
97
-
98
- /** Big5-HKSCS encoding */
99
- pm_encoding_t pm_encoding_big5_hkscs = {
100
- .name = "big5-hkscs",
101
- .char_width = pm_encoding_big5_star_char_width,
102
- .alnum_char = pm_encoding_big5_star_alnum_char,
103
- .alpha_char = pm_encoding_big5_star_alpha_char,
104
- .isupper_char = pm_encoding_big5_star_isupper_char,
105
- .multibyte = true
106
- };
107
-
108
- /** Big5-UAO encoding */
109
- pm_encoding_t pm_encoding_big5_uao = {
110
- .name = "big5-uao",
111
- .char_width = pm_encoding_big5_star_char_width,
112
- .alnum_char = pm_encoding_big5_star_alnum_char,
113
- .alpha_char = pm_encoding_big5_star_alpha_char,
114
- .isupper_char = pm_encoding_big5_star_isupper_char,
115
- .multibyte = true
116
- };
data/src/enc/pm_cp51932.c DELETED
@@ -1,57 +0,0 @@
1
- #include "prism/enc/pm_encoding.h"
2
-
3
- static size_t
4
- pm_encoding_cp51932_char_width(const uint8_t *b, ptrdiff_t n) {
5
- // These are the single byte characters.
6
- if (*b < 0x80) {
7
- return 1;
8
- }
9
-
10
- // These are the double byte characters.
11
- if (
12
- (n > 1) &&
13
- ((b[0] >= 0xa1 && b[0] <= 0xfe) || (b[0] == 0x8e)) &&
14
- (b[1] >= 0xa1 && b[1] <= 0xfe)
15
- ) {
16
- return 2;
17
- }
18
-
19
- return 0;
20
- }
21
-
22
- static size_t
23
- pm_encoding_cp51932_alpha_char(const uint8_t *b, ptrdiff_t n) {
24
- if (pm_encoding_cp51932_char_width(b, n) == 1) {
25
- return pm_encoding_ascii_alpha_char(b, n);
26
- } else {
27
- return 0;
28
- }
29
- }
30
-
31
- static size_t
32
- pm_encoding_cp51932_alnum_char(const uint8_t *b, ptrdiff_t n) {
33
- if (pm_encoding_cp51932_char_width(b, n) == 1) {
34
- return pm_encoding_ascii_alnum_char(b, n);
35
- } else {
36
- return 0;
37
- }
38
- }
39
-
40
- static bool
41
- pm_encoding_cp51932_isupper_char(const uint8_t *b, ptrdiff_t n) {
42
- if (pm_encoding_cp51932_char_width(b, n) == 1) {
43
- return pm_encoding_ascii_isupper_char(b, n);
44
- } else {
45
- return 0;
46
- }
47
- }
48
-
49
- /** cp51932 encoding */
50
- pm_encoding_t pm_encoding_cp51932 = {
51
- .name = "cp51932",
52
- .char_width = pm_encoding_cp51932_char_width,
53
- .alnum_char = pm_encoding_cp51932_alnum_char,
54
- .alpha_char = pm_encoding_cp51932_alpha_char,
55
- .isupper_char = pm_encoding_cp51932_isupper_char,
56
- .multibyte = true
57
- };
data/src/enc/pm_euc_jp.c DELETED
@@ -1,69 +0,0 @@
1
- #include "prism/enc/pm_encoding.h"
2
-
3
- static size_t
4
- pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
5
- // These are the single byte characters.
6
- if (*b < 0x80) {
7
- return 1;
8
- }
9
-
10
- // These are the double byte characters.
11
- if (
12
- (n > 1) &&
13
- (
14
- ((b[0] == 0x8E) && (b[1] >= 0xA1 && b[1] <= 0xFE)) ||
15
- ((b[0] >= 0xA1 && b[0] <= 0xFE) && (b[1] >= 0xA1 && b[1] <= 0xFE))
16
- )
17
- ) {
18
- return 2;
19
- }
20
-
21
- // These are the triple byte characters.
22
- if (
23
- (n > 2) &&
24
- (b[0] == 0x8F) &&
25
- (b[1] >= 0xA1 && b[2] <= 0xFE) &&
26
- (b[2] >= 0xA1 && b[2] <= 0xFE)
27
- ) {
28
- return 3;
29
- }
30
-
31
- return 0;
32
- }
33
-
34
- static size_t
35
- pm_encoding_euc_jp_alpha_char(const uint8_t *b, ptrdiff_t n) {
36
- if (pm_encoding_euc_jp_char_width(b, n) == 1) {
37
- return pm_encoding_ascii_alpha_char(b, n);
38
- } else {
39
- return 0;
40
- }
41
- }
42
-
43
- static size_t
44
- pm_encoding_euc_jp_alnum_char(const uint8_t *b, ptrdiff_t n) {
45
- if (pm_encoding_euc_jp_char_width(b, n) == 1) {
46
- return pm_encoding_ascii_alnum_char(b, n);
47
- } else {
48
- return 0;
49
- }
50
- }
51
-
52
- static bool
53
- pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
54
- if (pm_encoding_euc_jp_char_width(b, n) == 1) {
55
- return pm_encoding_ascii_isupper_char(b, n);
56
- } else {
57
- return 0;
58
- }
59
- }
60
-
61
- /** EUC-JP encoding */
62
- pm_encoding_t pm_encoding_euc_jp = {
63
- .name = "euc-jp",
64
- .char_width = pm_encoding_euc_jp_char_width,
65
- .alnum_char = pm_encoding_euc_jp_alnum_char,
66
- .alpha_char = pm_encoding_euc_jp_alpha_char,
67
- .isupper_char = pm_encoding_euc_jp_isupper_char,
68
- .multibyte = true
69
- };
data/src/enc/pm_gbk.c DELETED
@@ -1,65 +0,0 @@
1
- #include "prism/enc/pm_encoding.h"
2
-
3
- static size_t
4
- pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
5
- // These are the single byte characters.
6
- if (*b <= 0x80) {
7
- return 1;
8
- }
9
-
10
- // These are the double byte characters.
11
- if (
12
- (n > 1) &&
13
- (
14
- ((b[0] >= 0xA1 && b[0] <= 0xA9) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // GBK/1
15
- ((b[0] >= 0xB0 && b[0] <= 0xF7) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // GBK/2
16
- ((b[0] >= 0x81 && b[0] <= 0xA0) && (b[1] >= 0x40 && b[1] <= 0xFE) && (b[1] != 0x7F)) || // GBK/3
17
- ((b[0] >= 0xAA && b[0] <= 0xFE) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) || // GBK/4
18
- ((b[0] >= 0xA8 && b[0] <= 0xA9) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) || // GBK/5
19
- ((b[0] >= 0xAA && b[0] <= 0xAF) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // user-defined 1
20
- ((b[0] >= 0xF8 && b[0] <= 0xFE) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // user-defined 2
21
- ((b[0] >= 0xA1 && b[0] <= 0xA7) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) // user-defined 3
22
- )
23
- ) {
24
- return 2;
25
- }
26
-
27
- return 0;
28
- }
29
-
30
- static size_t
31
- pm_encoding_gbk_alpha_char(const uint8_t *b, ptrdiff_t n) {
32
- if (pm_encoding_gbk_char_width(b, n) == 1) {
33
- return pm_encoding_ascii_alpha_char(b, n);
34
- } else {
35
- return 0;
36
- }
37
- }
38
-
39
- static size_t
40
- pm_encoding_gbk_alnum_char(const uint8_t *b, ptrdiff_t n) {
41
- if (pm_encoding_gbk_char_width(b, n) == 1) {
42
- return pm_encoding_ascii_alnum_char(b, n);
43
- } else {
44
- return 0;
45
- }
46
- }
47
-
48
- static bool
49
- pm_encoding_gbk_isupper_char(const uint8_t *b, ptrdiff_t n) {
50
- if (pm_encoding_gbk_char_width(b, n) == 1) {
51
- return pm_encoding_ascii_isupper_char(b, n);
52
- } else {
53
- return false;
54
- }
55
- }
56
-
57
- /** GBK encoding */
58
- pm_encoding_t pm_encoding_gbk = {
59
- .name = "gbk",
60
- .char_width = pm_encoding_gbk_char_width,
61
- .alnum_char = pm_encoding_gbk_alnum_char,
62
- .alpha_char = pm_encoding_gbk_alpha_char,
63
- .isupper_char = pm_encoding_gbk_isupper_char,
64
- .multibyte = true
65
- };