prism 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -1
- data/README.md +2 -1
- data/config.yml +188 -55
- data/docs/building.md +9 -2
- data/docs/configuration.md +10 -9
- data/docs/encoding.md +24 -56
- data/docs/local_variable_depth.md +229 -0
- data/docs/ruby_api.md +2 -0
- data/docs/serialization.md +18 -13
- data/ext/prism/api_node.c +337 -195
- data/ext/prism/extconf.rb +13 -7
- data/ext/prism/extension.c +96 -32
- data/ext/prism/extension.h +1 -1
- data/include/prism/ast.h +340 -137
- data/include/prism/defines.h +17 -0
- data/include/prism/diagnostic.h +11 -5
- data/include/prism/encoding.h +248 -0
- data/include/prism/options.h +2 -2
- data/include/prism/parser.h +62 -42
- data/include/prism/regexp.h +2 -2
- data/include/prism/util/pm_buffer.h +9 -1
- data/include/prism/util/pm_memchr.h +2 -2
- data/include/prism/util/pm_strpbrk.h +3 -3
- data/include/prism/version.h +2 -2
- data/include/prism.h +13 -15
- data/lib/prism/compiler.rb +12 -0
- data/lib/prism/debug.rb +9 -4
- data/lib/prism/desugar_compiler.rb +3 -3
- data/lib/prism/dispatcher.rb +56 -0
- data/lib/prism/dot_visitor.rb +476 -198
- data/lib/prism/dsl.rb +66 -46
- data/lib/prism/ffi.rb +16 -3
- data/lib/prism/lex_compat.rb +19 -9
- data/lib/prism/mutation_compiler.rb +20 -0
- data/lib/prism/node.rb +1173 -450
- data/lib/prism/node_ext.rb +41 -16
- data/lib/prism/parse_result.rb +12 -15
- data/lib/prism/ripper_compat.rb +49 -34
- data/lib/prism/serialize.rb +242 -212
- data/lib/prism/visitor.rb +12 -0
- data/lib/prism.rb +20 -4
- data/prism.gemspec +4 -10
- data/rbi/prism.rbi +605 -230
- data/rbi/prism_static.rbi +3 -0
- data/sig/prism.rbs +379 -124
- data/sig/prism_static.rbs +1 -0
- data/src/diagnostic.c +228 -222
- data/src/encoding.c +5137 -0
- data/src/node.c +66 -0
- data/src/options.c +21 -2
- data/src/prettyprint.c +806 -406
- data/src/prism.c +1092 -700
- data/src/regexp.c +3 -3
- data/src/serialize.c +227 -157
- data/src/util/pm_buffer.c +10 -1
- data/src/util/pm_memchr.c +1 -1
- data/src/util/pm_strpbrk.c +4 -4
- metadata +5 -11
- data/include/prism/enc/pm_encoding.h +0 -227
- data/src/enc/pm_big5.c +0 -116
- data/src/enc/pm_cp51932.c +0 -57
- data/src/enc/pm_euc_jp.c +0 -69
- data/src/enc/pm_gbk.c +0 -65
- data/src/enc/pm_shift_jis.c +0 -57
- data/src/enc/pm_tables.c +0 -2073
- data/src/enc/pm_unicode.c +0 -2369
- data/src/enc/pm_windows_31j.c +0 -57
data/src/util/pm_buffer.c
CHANGED
@@ -138,7 +138,7 @@ pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value) {
|
|
138
138
|
* Append a 32-bit unsigned integer to the buffer as a variable-length integer.
|
139
139
|
*/
|
140
140
|
void
|
141
|
-
pm_buffer_append_varint(pm_buffer_t *buffer, uint32_t value) {
|
141
|
+
pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value) {
|
142
142
|
if (value < 128) {
|
143
143
|
pm_buffer_append_byte(buffer, (uint8_t) value);
|
144
144
|
} else {
|
@@ -151,6 +151,15 @@ pm_buffer_append_varint(pm_buffer_t *buffer, uint32_t value) {
|
|
151
151
|
}
|
152
152
|
}
|
153
153
|
|
154
|
+
/**
|
155
|
+
* Append a 32-bit signed integer to the buffer as a variable-length integer.
|
156
|
+
*/
|
157
|
+
void
|
158
|
+
pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value) {
|
159
|
+
uint32_t unsigned_int = ((uint32_t)(value) << 1) ^ ((uint32_t)(value >> 31));
|
160
|
+
pm_buffer_append_varuint(buffer, unsigned_int);
|
161
|
+
}
|
162
|
+
|
154
163
|
/**
|
155
164
|
* Concatenate one buffer onto another.
|
156
165
|
*/
|
data/src/util/pm_memchr.c
CHANGED
@@ -8,7 +8,7 @@
|
|
8
8
|
* of a multibyte character.
|
9
9
|
*/
|
10
10
|
void *
|
11
|
-
pm_memchr(const void *memory, int character, size_t number, bool encoding_changed, pm_encoding_t *encoding) {
|
11
|
+
pm_memchr(const void *memory, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding) {
|
12
12
|
if (encoding_changed && encoding->multibyte && character >= PRISM_MEMCHR_TRAILING_BYTE_MINIMUM) {
|
13
13
|
const uint8_t *source = (const uint8_t *) memory;
|
14
14
|
size_t index = 0;
|
data/src/util/pm_strpbrk.c
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
* This is the slow path that does care about the encoding.
|
5
5
|
*/
|
6
6
|
static inline const uint8_t *
|
7
|
-
pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
|
7
|
+
pm_strpbrk_multi_byte(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
|
8
8
|
size_t index = 0;
|
9
9
|
|
10
10
|
while (index < maximum) {
|
@@ -12,7 +12,7 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
|
|
12
12
|
return source + index;
|
13
13
|
}
|
14
14
|
|
15
|
-
size_t width = parser->encoding.char_width(source + index, (ptrdiff_t) (maximum - index));
|
15
|
+
size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
|
16
16
|
if (width == 0) {
|
17
17
|
return NULL;
|
18
18
|
}
|
@@ -61,10 +61,10 @@ pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t max
|
|
61
61
|
* need to take a slower path and iterate one multi-byte character at a time.
|
62
62
|
*/
|
63
63
|
const uint8_t *
|
64
|
-
pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
|
64
|
+
pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
|
65
65
|
if (length <= 0) {
|
66
66
|
return NULL;
|
67
|
-
} else if (parser->encoding_changed && parser->encoding.multibyte) {
|
67
|
+
} else if (parser->encoding_changed && parser->encoding->multibyte) {
|
68
68
|
return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length);
|
69
69
|
} else {
|
70
70
|
return pm_strpbrk_single_byte(source, charset, (size_t) length);
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: prism
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.18.0
|
4
|
+
version: 0.19.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shopify
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-12-14 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email:
|
@@ -33,6 +33,7 @@ files:
|
|
33
33
|
- docs/fuzzing.md
|
34
34
|
- docs/heredocs.md
|
35
35
|
- docs/javascript.md
|
36
|
+
- docs/local_variable_depth.md
|
36
37
|
- docs/mapping.md
|
37
38
|
- docs/releasing.md
|
38
39
|
- docs/ripper.md
|
@@ -48,7 +49,7 @@ files:
|
|
48
49
|
- include/prism/ast.h
|
49
50
|
- include/prism/defines.h
|
50
51
|
- include/prism/diagnostic.h
|
51
|
-
- include/prism/enc/pm_encoding.h
|
52
|
+
- include/prism/encoding.h
|
52
53
|
- include/prism/node.h
|
53
54
|
- include/prism/options.h
|
54
55
|
- include/prism/pack.h
|
@@ -94,14 +95,7 @@ files:
|
|
94
95
|
- sig/prism.rbs
|
95
96
|
- sig/prism_static.rbs
|
96
97
|
- src/diagnostic.c
|
97
|
-
- src/enc/pm_big5.c
|
98
|
-
- src/enc/pm_cp51932.c
|
99
|
-
- src/enc/pm_euc_jp.c
|
100
|
-
- src/enc/pm_gbk.c
|
101
|
-
- src/enc/pm_shift_jis.c
|
102
|
-
- src/enc/pm_tables.c
|
103
|
-
- src/enc/pm_unicode.c
|
104
|
-
- src/enc/pm_windows_31j.c
|
98
|
+
- src/encoding.c
|
105
99
|
- src/node.c
|
106
100
|
- src/options.c
|
107
101
|
- src/pack.c
|
data/include/prism/enc/pm_encoding.h
DELETED
@@ -1,227 +0,0 @@
|
|
1
|
-
/**
|
2
|
-
* @file pm_encoding.h
|
3
|
-
*
|
4
|
-
* The encoding interface and implementations used by the parser.
|
5
|
-
*/
|
6
|
-
#ifndef PRISM_ENCODING_H
|
7
|
-
#define PRISM_ENCODING_H
|
8
|
-
|
9
|
-
#include "prism/defines.h"
|
10
|
-
|
11
|
-
#include <assert.h>
|
12
|
-
#include <stdbool.h>
|
13
|
-
#include <stddef.h>
|
14
|
-
#include <stdint.h>
|
15
|
-
|
16
|
-
/**
|
17
|
-
* This struct defines the functions necessary to implement the encoding
|
18
|
-
* interface so we can determine how many bytes the subsequent character takes.
|
19
|
-
* Each callback should return the number of bytes, or 0 if the next bytes are
|
20
|
-
* invalid for the encoding and type.
|
21
|
-
*/
|
22
|
-
typedef struct {
|
23
|
-
/**
|
24
|
-
* Return the number of bytes that the next character takes if it is valid
|
25
|
-
* in the encoding. Does not read more than n bytes. It is assumed that n is
|
26
|
-
* at least 1.
|
27
|
-
*/
|
28
|
-
size_t (*char_width)(const uint8_t *b, ptrdiff_t n);
|
29
|
-
|
30
|
-
/**
|
31
|
-
* Return the number of bytes that the next character takes if it is valid
|
32
|
-
* in the encoding and is alphabetical. Does not read more than n bytes. It
|
33
|
-
* is assumed that n is at least 1.
|
34
|
-
*/
|
35
|
-
size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);
|
36
|
-
|
37
|
-
/**
|
38
|
-
* Return the number of bytes that the next character takes if it is valid
|
39
|
-
* in the encoding and is alphanumeric. Does not read more than n bytes. It
|
40
|
-
* is assumed that n is at least 1.
|
41
|
-
*/
|
42
|
-
size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);
|
43
|
-
|
44
|
-
/**
|
45
|
-
* Return true if the next character is valid in the encoding and is an
|
46
|
-
* uppercase character. Does not read more than n bytes. It is assumed that
|
47
|
-
* n is at least 1.
|
48
|
-
*/
|
49
|
-
bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);
|
50
|
-
|
51
|
-
/**
|
52
|
-
* The name of the encoding. This should correspond to a value that can be
|
53
|
-
* passed to Encoding.find in Ruby.
|
54
|
-
*/
|
55
|
-
const char *name;
|
56
|
-
|
57
|
-
/**
|
58
|
-
* Return true if the encoding is a multibyte encoding.
|
59
|
-
*/
|
60
|
-
bool multibyte;
|
61
|
-
} pm_encoding_t;
|
62
|
-
|
63
|
-
/**
|
64
|
-
* All of the lookup tables use the first bit of each embedded byte to indicate
|
65
|
-
* whether the codepoint is alphabetical.
|
66
|
-
*/
|
67
|
-
#define PRISM_ENCODING_ALPHABETIC_BIT 1 << 0
|
68
|
-
|
69
|
-
/**
|
70
|
-
* All of the lookup tables use the second bit of each embedded byte to indicate
|
71
|
-
* whether the codepoint is alphanumeric.
|
72
|
-
*/
|
73
|
-
#define PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1
|
74
|
-
|
75
|
-
/**
|
76
|
-
* All of the lookup tables use the third bit of each embedded byte to indicate
|
77
|
-
* whether the codepoint is uppercase.
|
78
|
-
*/
|
79
|
-
#define PRISM_ENCODING_UPPERCASE_BIT 1 << 2
|
80
|
-
|
81
|
-
/**
|
82
|
-
* Return the size of the next character in the ASCII encoding if it is an
|
83
|
-
* alphabetical character.
|
84
|
-
*
|
85
|
-
* @param b The bytes to read.
|
86
|
-
* @param n The number of bytes that can be read.
|
87
|
-
* @returns The number of bytes that the next character takes if it is valid in
|
88
|
-
* the encoding, or 0 if it is not.
|
89
|
-
*/
|
90
|
-
size_t pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
|
91
|
-
|
92
|
-
/**
|
93
|
-
* Return the size of the next character in the ASCII encoding if it is an
|
94
|
-
* alphanumeric character.
|
95
|
-
*
|
96
|
-
* @param b The bytes to read.
|
97
|
-
* @param n The number of bytes that can be read.
|
98
|
-
* @returns The number of bytes that the next character takes if it is valid in
|
99
|
-
* the encoding, or 0 if it is not.
|
100
|
-
*/
|
101
|
-
size_t pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
|
102
|
-
|
103
|
-
/**
|
104
|
-
* Return true if the next character in the ASCII encoding if it is an uppercase
|
105
|
-
* character.
|
106
|
-
*
|
107
|
-
* @param b The bytes to read.
|
108
|
-
* @param n The number of bytes that can be read.
|
109
|
-
* @returns True if the next character is valid in the encoding and is an
|
110
|
-
* uppercase character, or false if it is not.
|
111
|
-
*/
|
112
|
-
bool pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
|
113
|
-
|
114
|
-
/**
|
115
|
-
* Return the size of the next character in the UTF-8 encoding if it is an
|
116
|
-
* alphabetical character.
|
117
|
-
*
|
118
|
-
* @param b The bytes to read.
|
119
|
-
* @param n The number of bytes that can be read.
|
120
|
-
* @returns The number of bytes that the next character takes if it is valid in
|
121
|
-
* the encoding, or 0 if it is not.
|
122
|
-
*/
|
123
|
-
size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
|
124
|
-
|
125
|
-
/**
|
126
|
-
* Return the size of the next character in the UTF-8 encoding if it is an
|
127
|
-
* alphanumeric character.
|
128
|
-
*
|
129
|
-
* @param b The bytes to read.
|
130
|
-
* @param n The number of bytes that can be read.
|
131
|
-
* @returns The number of bytes that the next character takes if it is valid in
|
132
|
-
* the encoding, or 0 if it is not.
|
133
|
-
*/
|
134
|
-
size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
|
135
|
-
|
136
|
-
/**
|
137
|
-
* Return true if the next character in the UTF-8 encoding if it is an uppercase
|
138
|
-
* character.
|
139
|
-
*
|
140
|
-
* @param b The bytes to read.
|
141
|
-
* @param n The number of bytes that can be read.
|
142
|
-
* @returns True if the next character is valid in the encoding and is an
|
143
|
-
* uppercase character, or false if it is not.
|
144
|
-
*/
|
145
|
-
bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
|
146
|
-
|
147
|
-
/**
|
148
|
-
* This lookup table is referenced in both the UTF-8 encoding file and the
|
149
|
-
* parser directly in order to speed up the default encoding processing. It is
|
150
|
-
* used to indicate whether a character is alphabetical, alphanumeric, or
|
151
|
-
* uppercase in unicode mappings.
|
152
|
-
*/
|
153
|
-
extern const uint8_t pm_encoding_unicode_table[256];
|
154
|
-
|
155
|
-
// Below are the encodings that are supported by the parser. They are defined in
|
156
|
-
// their own files in the src/enc directory.
|
157
|
-
|
158
|
-
extern pm_encoding_t pm_encoding_ascii;
|
159
|
-
extern pm_encoding_t pm_encoding_ascii_8bit;
|
160
|
-
extern pm_encoding_t pm_encoding_big5;
|
161
|
-
extern pm_encoding_t pm_encoding_big5_hkscs;
|
162
|
-
extern pm_encoding_t pm_encoding_big5_uao;
|
163
|
-
extern pm_encoding_t pm_encoding_cp51932;
|
164
|
-
extern pm_encoding_t pm_encoding_cp850;
|
165
|
-
extern pm_encoding_t pm_encoding_cp852;
|
166
|
-
extern pm_encoding_t pm_encoding_cp855;
|
167
|
-
extern pm_encoding_t pm_encoding_euc_jp;
|
168
|
-
extern pm_encoding_t pm_encoding_gb1988;
|
169
|
-
extern pm_encoding_t pm_encoding_gbk;
|
170
|
-
extern pm_encoding_t pm_encoding_ibm437;
|
171
|
-
extern pm_encoding_t pm_encoding_ibm720;
|
172
|
-
extern pm_encoding_t pm_encoding_ibm737;
|
173
|
-
extern pm_encoding_t pm_encoding_ibm775;
|
174
|
-
extern pm_encoding_t pm_encoding_ibm852;
|
175
|
-
extern pm_encoding_t pm_encoding_ibm855;
|
176
|
-
extern pm_encoding_t pm_encoding_ibm857;
|
177
|
-
extern pm_encoding_t pm_encoding_ibm860;
|
178
|
-
extern pm_encoding_t pm_encoding_ibm861;
|
179
|
-
extern pm_encoding_t pm_encoding_ibm862;
|
180
|
-
extern pm_encoding_t pm_encoding_ibm863;
|
181
|
-
extern pm_encoding_t pm_encoding_ibm864;
|
182
|
-
extern pm_encoding_t pm_encoding_ibm865;
|
183
|
-
extern pm_encoding_t pm_encoding_ibm866;
|
184
|
-
extern pm_encoding_t pm_encoding_ibm869;
|
185
|
-
extern pm_encoding_t pm_encoding_iso_8859_1;
|
186
|
-
extern pm_encoding_t pm_encoding_iso_8859_2;
|
187
|
-
extern pm_encoding_t pm_encoding_iso_8859_3;
|
188
|
-
extern pm_encoding_t pm_encoding_iso_8859_4;
|
189
|
-
extern pm_encoding_t pm_encoding_iso_8859_5;
|
190
|
-
extern pm_encoding_t pm_encoding_iso_8859_6;
|
191
|
-
extern pm_encoding_t pm_encoding_iso_8859_7;
|
192
|
-
extern pm_encoding_t pm_encoding_iso_8859_8;
|
193
|
-
extern pm_encoding_t pm_encoding_iso_8859_9;
|
194
|
-
extern pm_encoding_t pm_encoding_iso_8859_10;
|
195
|
-
extern pm_encoding_t pm_encoding_iso_8859_11;
|
196
|
-
extern pm_encoding_t pm_encoding_iso_8859_13;
|
197
|
-
extern pm_encoding_t pm_encoding_iso_8859_14;
|
198
|
-
extern pm_encoding_t pm_encoding_iso_8859_15;
|
199
|
-
extern pm_encoding_t pm_encoding_iso_8859_16;
|
200
|
-
extern pm_encoding_t pm_encoding_koi8_r;
|
201
|
-
extern pm_encoding_t pm_encoding_mac_cent_euro;
|
202
|
-
extern pm_encoding_t pm_encoding_mac_croatian;
|
203
|
-
extern pm_encoding_t pm_encoding_mac_cyrillic;
|
204
|
-
extern pm_encoding_t pm_encoding_mac_greek;
|
205
|
-
extern pm_encoding_t pm_encoding_mac_iceland;
|
206
|
-
extern pm_encoding_t pm_encoding_mac_roman;
|
207
|
-
extern pm_encoding_t pm_encoding_mac_romania;
|
208
|
-
extern pm_encoding_t pm_encoding_mac_thai;
|
209
|
-
extern pm_encoding_t pm_encoding_mac_turkish;
|
210
|
-
extern pm_encoding_t pm_encoding_mac_ukraine;
|
211
|
-
extern pm_encoding_t pm_encoding_shift_jis;
|
212
|
-
extern pm_encoding_t pm_encoding_tis_620;
|
213
|
-
extern pm_encoding_t pm_encoding_utf_8;
|
214
|
-
extern pm_encoding_t pm_encoding_utf8_mac;
|
215
|
-
extern pm_encoding_t pm_encoding_windows_1250;
|
216
|
-
extern pm_encoding_t pm_encoding_windows_1251;
|
217
|
-
extern pm_encoding_t pm_encoding_windows_1252;
|
218
|
-
extern pm_encoding_t pm_encoding_windows_1253;
|
219
|
-
extern pm_encoding_t pm_encoding_windows_1254;
|
220
|
-
extern pm_encoding_t pm_encoding_windows_1255;
|
221
|
-
extern pm_encoding_t pm_encoding_windows_1256;
|
222
|
-
extern pm_encoding_t pm_encoding_windows_1257;
|
223
|
-
extern pm_encoding_t pm_encoding_windows_1258;
|
224
|
-
extern pm_encoding_t pm_encoding_windows_31j;
|
225
|
-
extern pm_encoding_t pm_encoding_windows_874;
|
226
|
-
|
227
|
-
#endif
|
data/src/enc/pm_big5.c
DELETED
@@ -1,116 +0,0 @@
|
|
1
|
-
#include "prism/enc/pm_encoding.h"
|
2
|
-
|
3
|
-
static size_t
|
4
|
-
pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) {
|
5
|
-
// These are the single byte characters.
|
6
|
-
if (*b < 0x80) {
|
7
|
-
return 1;
|
8
|
-
}
|
9
|
-
|
10
|
-
// These are the double byte characters.
|
11
|
-
if ((n > 1) && (b[0] >= 0xA1 && b[0] <= 0xFE) && (b[1] >= 0x40 && b[1] <= 0xFE)) {
|
12
|
-
return 2;
|
13
|
-
}
|
14
|
-
|
15
|
-
return 0;
|
16
|
-
}
|
17
|
-
|
18
|
-
static size_t
|
19
|
-
pm_encoding_big5_star_char_width(const uint8_t *b, ptrdiff_t n) {
|
20
|
-
// These are the single byte characters.
|
21
|
-
if (*b < 0x80) {
|
22
|
-
return 1;
|
23
|
-
}
|
24
|
-
|
25
|
-
// These are the double byte characters.
|
26
|
-
if ((n > 1) && (b[0] >= 0x87 && b[0] <= 0xFE) &&
|
27
|
-
((b[1] >= 0x40 && b[1] <= 0x7E) || (b[1] >= 0xA1 && b[1] <= 0xFE))) {
|
28
|
-
return 2;
|
29
|
-
}
|
30
|
-
|
31
|
-
return 0;
|
32
|
-
}
|
33
|
-
|
34
|
-
static size_t
|
35
|
-
pm_encoding_big5_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
36
|
-
if (pm_encoding_big5_char_width(b, n) == 1) {
|
37
|
-
return pm_encoding_ascii_alpha_char(b, n);
|
38
|
-
} else {
|
39
|
-
return 0;
|
40
|
-
}
|
41
|
-
}
|
42
|
-
|
43
|
-
static size_t
|
44
|
-
pm_encoding_big5_star_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
45
|
-
if (pm_encoding_big5_star_char_width(b, n) == 1) {
|
46
|
-
return pm_encoding_ascii_alpha_char(b, n);
|
47
|
-
} else {
|
48
|
-
return 0;
|
49
|
-
}
|
50
|
-
}
|
51
|
-
|
52
|
-
static size_t
|
53
|
-
pm_encoding_big5_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
54
|
-
if (pm_encoding_big5_char_width(b, n) == 1) {
|
55
|
-
return pm_encoding_ascii_alnum_char(b, n);
|
56
|
-
} else {
|
57
|
-
return 0;
|
58
|
-
}
|
59
|
-
}
|
60
|
-
|
61
|
-
static size_t
|
62
|
-
pm_encoding_big5_star_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
63
|
-
if (pm_encoding_big5_star_char_width(b, n) == 1) {
|
64
|
-
return pm_encoding_ascii_alnum_char(b, n);
|
65
|
-
} else {
|
66
|
-
return 0;
|
67
|
-
}
|
68
|
-
}
|
69
|
-
|
70
|
-
static bool
|
71
|
-
pm_encoding_big5_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
72
|
-
if (pm_encoding_big5_char_width(b, n) == 1) {
|
73
|
-
return pm_encoding_ascii_isupper_char(b, n);
|
74
|
-
} else {
|
75
|
-
return false;
|
76
|
-
}
|
77
|
-
}
|
78
|
-
|
79
|
-
static bool
|
80
|
-
pm_encoding_big5_star_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
81
|
-
if (pm_encoding_big5_star_char_width(b, n) == 1) {
|
82
|
-
return pm_encoding_ascii_isupper_char(b, n);
|
83
|
-
} else {
|
84
|
-
return false;
|
85
|
-
}
|
86
|
-
}
|
87
|
-
|
88
|
-
/** Big5 encoding */
|
89
|
-
pm_encoding_t pm_encoding_big5 = {
|
90
|
-
.name = "big5",
|
91
|
-
.char_width = pm_encoding_big5_char_width,
|
92
|
-
.alnum_char = pm_encoding_big5_alnum_char,
|
93
|
-
.alpha_char = pm_encoding_big5_alpha_char,
|
94
|
-
.isupper_char = pm_encoding_big5_isupper_char,
|
95
|
-
.multibyte = true
|
96
|
-
};
|
97
|
-
|
98
|
-
/** Big5-HKSCS encoding */
|
99
|
-
pm_encoding_t pm_encoding_big5_hkscs = {
|
100
|
-
.name = "big5-hkscs",
|
101
|
-
.char_width = pm_encoding_big5_star_char_width,
|
102
|
-
.alnum_char = pm_encoding_big5_star_alnum_char,
|
103
|
-
.alpha_char = pm_encoding_big5_star_alpha_char,
|
104
|
-
.isupper_char = pm_encoding_big5_star_isupper_char,
|
105
|
-
.multibyte = true
|
106
|
-
};
|
107
|
-
|
108
|
-
/** Big5-UAO encoding */
|
109
|
-
pm_encoding_t pm_encoding_big5_uao = {
|
110
|
-
.name = "big5-uao",
|
111
|
-
.char_width = pm_encoding_big5_star_char_width,
|
112
|
-
.alnum_char = pm_encoding_big5_star_alnum_char,
|
113
|
-
.alpha_char = pm_encoding_big5_star_alpha_char,
|
114
|
-
.isupper_char = pm_encoding_big5_star_isupper_char,
|
115
|
-
.multibyte = true
|
116
|
-
};
|
data/src/enc/pm_cp51932.c
DELETED
@@ -1,57 +0,0 @@
|
|
1
|
-
#include "prism/enc/pm_encoding.h"
|
2
|
-
|
3
|
-
static size_t
|
4
|
-
pm_encoding_cp51932_char_width(const uint8_t *b, ptrdiff_t n) {
|
5
|
-
// These are the single byte characters.
|
6
|
-
if (*b < 0x80) {
|
7
|
-
return 1;
|
8
|
-
}
|
9
|
-
|
10
|
-
// These are the double byte characters.
|
11
|
-
if (
|
12
|
-
(n > 1) &&
|
13
|
-
((b[0] >= 0xa1 && b[0] <= 0xfe) || (b[0] == 0x8e)) &&
|
14
|
-
(b[1] >= 0xa1 && b[1] <= 0xfe)
|
15
|
-
) {
|
16
|
-
return 2;
|
17
|
-
}
|
18
|
-
|
19
|
-
return 0;
|
20
|
-
}
|
21
|
-
|
22
|
-
static size_t
|
23
|
-
pm_encoding_cp51932_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
24
|
-
if (pm_encoding_cp51932_char_width(b, n) == 1) {
|
25
|
-
return pm_encoding_ascii_alpha_char(b, n);
|
26
|
-
} else {
|
27
|
-
return 0;
|
28
|
-
}
|
29
|
-
}
|
30
|
-
|
31
|
-
static size_t
|
32
|
-
pm_encoding_cp51932_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
33
|
-
if (pm_encoding_cp51932_char_width(b, n) == 1) {
|
34
|
-
return pm_encoding_ascii_alnum_char(b, n);
|
35
|
-
} else {
|
36
|
-
return 0;
|
37
|
-
}
|
38
|
-
}
|
39
|
-
|
40
|
-
static bool
|
41
|
-
pm_encoding_cp51932_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
42
|
-
if (pm_encoding_cp51932_char_width(b, n) == 1) {
|
43
|
-
return pm_encoding_ascii_isupper_char(b, n);
|
44
|
-
} else {
|
45
|
-
return 0;
|
46
|
-
}
|
47
|
-
}
|
48
|
-
|
49
|
-
/** cp51932 encoding */
|
50
|
-
pm_encoding_t pm_encoding_cp51932 = {
|
51
|
-
.name = "cp51932",
|
52
|
-
.char_width = pm_encoding_cp51932_char_width,
|
53
|
-
.alnum_char = pm_encoding_cp51932_alnum_char,
|
54
|
-
.alpha_char = pm_encoding_cp51932_alpha_char,
|
55
|
-
.isupper_char = pm_encoding_cp51932_isupper_char,
|
56
|
-
.multibyte = true
|
57
|
-
};
|
data/src/enc/pm_euc_jp.c
DELETED
@@ -1,69 +0,0 @@
|
|
1
|
-
#include "prism/enc/pm_encoding.h"
|
2
|
-
|
3
|
-
static size_t
|
4
|
-
pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
|
5
|
-
// These are the single byte characters.
|
6
|
-
if (*b < 0x80) {
|
7
|
-
return 1;
|
8
|
-
}
|
9
|
-
|
10
|
-
// These are the double byte characters.
|
11
|
-
if (
|
12
|
-
(n > 1) &&
|
13
|
-
(
|
14
|
-
((b[0] == 0x8E) && (b[1] >= 0xA1 && b[1] <= 0xFE)) ||
|
15
|
-
((b[0] >= 0xA1 && b[0] <= 0xFE) && (b[1] >= 0xA1 && b[1] <= 0xFE))
|
16
|
-
)
|
17
|
-
) {
|
18
|
-
return 2;
|
19
|
-
}
|
20
|
-
|
21
|
-
// These are the triple byte characters.
|
22
|
-
if (
|
23
|
-
(n > 2) &&
|
24
|
-
(b[0] == 0x8F) &&
|
25
|
-
(b[1] >= 0xA1 && b[2] <= 0xFE) &&
|
26
|
-
(b[2] >= 0xA1 && b[2] <= 0xFE)
|
27
|
-
) {
|
28
|
-
return 3;
|
29
|
-
}
|
30
|
-
|
31
|
-
return 0;
|
32
|
-
}
|
33
|
-
|
34
|
-
static size_t
|
35
|
-
pm_encoding_euc_jp_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
36
|
-
if (pm_encoding_euc_jp_char_width(b, n) == 1) {
|
37
|
-
return pm_encoding_ascii_alpha_char(b, n);
|
38
|
-
} else {
|
39
|
-
return 0;
|
40
|
-
}
|
41
|
-
}
|
42
|
-
|
43
|
-
static size_t
|
44
|
-
pm_encoding_euc_jp_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
45
|
-
if (pm_encoding_euc_jp_char_width(b, n) == 1) {
|
46
|
-
return pm_encoding_ascii_alnum_char(b, n);
|
47
|
-
} else {
|
48
|
-
return 0;
|
49
|
-
}
|
50
|
-
}
|
51
|
-
|
52
|
-
static bool
|
53
|
-
pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
54
|
-
if (pm_encoding_euc_jp_char_width(b, n) == 1) {
|
55
|
-
return pm_encoding_ascii_isupper_char(b, n);
|
56
|
-
} else {
|
57
|
-
return 0;
|
58
|
-
}
|
59
|
-
}
|
60
|
-
|
61
|
-
/** EUC-JP encoding */
|
62
|
-
pm_encoding_t pm_encoding_euc_jp = {
|
63
|
-
.name = "euc-jp",
|
64
|
-
.char_width = pm_encoding_euc_jp_char_width,
|
65
|
-
.alnum_char = pm_encoding_euc_jp_alnum_char,
|
66
|
-
.alpha_char = pm_encoding_euc_jp_alpha_char,
|
67
|
-
.isupper_char = pm_encoding_euc_jp_isupper_char,
|
68
|
-
.multibyte = true
|
69
|
-
};
|
data/src/enc/pm_gbk.c
DELETED
@@ -1,65 +0,0 @@
|
|
1
|
-
#include "prism/enc/pm_encoding.h"
|
2
|
-
|
3
|
-
static size_t
|
4
|
-
pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
|
5
|
-
// These are the single byte characters.
|
6
|
-
if (*b <= 0x80) {
|
7
|
-
return 1;
|
8
|
-
}
|
9
|
-
|
10
|
-
// These are the double byte characters.
|
11
|
-
if (
|
12
|
-
(n > 1) &&
|
13
|
-
(
|
14
|
-
((b[0] >= 0xA1 && b[0] <= 0xA9) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // GBK/1
|
15
|
-
((b[0] >= 0xB0 && b[0] <= 0xF7) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // GBK/2
|
16
|
-
((b[0] >= 0x81 && b[0] <= 0xA0) && (b[1] >= 0x40 && b[1] <= 0xFE) && (b[1] != 0x7F)) || // GBK/3
|
17
|
-
((b[0] >= 0xAA && b[0] <= 0xFE) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) || // GBK/4
|
18
|
-
((b[0] >= 0xA8 && b[0] <= 0xA9) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) || // GBK/5
|
19
|
-
((b[0] >= 0xAA && b[0] <= 0xAF) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // user-defined 1
|
20
|
-
((b[0] >= 0xF8 && b[0] <= 0xFE) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // user-defined 2
|
21
|
-
((b[0] >= 0xA1 && b[0] <= 0xA7) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) // user-defined 3
|
22
|
-
)
|
23
|
-
) {
|
24
|
-
return 2;
|
25
|
-
}
|
26
|
-
|
27
|
-
return 0;
|
28
|
-
}
|
29
|
-
|
30
|
-
static size_t
|
31
|
-
pm_encoding_gbk_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
32
|
-
if (pm_encoding_gbk_char_width(b, n) == 1) {
|
33
|
-
return pm_encoding_ascii_alpha_char(b, n);
|
34
|
-
} else {
|
35
|
-
return 0;
|
36
|
-
}
|
37
|
-
}
|
38
|
-
|
39
|
-
static size_t
|
40
|
-
pm_encoding_gbk_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
41
|
-
if (pm_encoding_gbk_char_width(b, n) == 1) {
|
42
|
-
return pm_encoding_ascii_alnum_char(b, n);
|
43
|
-
} else {
|
44
|
-
return 0;
|
45
|
-
}
|
46
|
-
}
|
47
|
-
|
48
|
-
static bool
|
49
|
-
pm_encoding_gbk_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
50
|
-
if (pm_encoding_gbk_char_width(b, n) == 1) {
|
51
|
-
return pm_encoding_ascii_isupper_char(b, n);
|
52
|
-
} else {
|
53
|
-
return false;
|
54
|
-
}
|
55
|
-
}
|
56
|
-
|
57
|
-
/** GBK encoding */
|
58
|
-
pm_encoding_t pm_encoding_gbk = {
|
59
|
-
.name = "gbk",
|
60
|
-
.char_width = pm_encoding_gbk_char_width,
|
61
|
-
.alnum_char = pm_encoding_gbk_alnum_char,
|
62
|
-
.alpha_char = pm_encoding_gbk_alpha_char,
|
63
|
-
.isupper_char = pm_encoding_gbk_isupper_char,
|
64
|
-
.multibyte = true
|
65
|
-
};
|