yarp 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/CODE_OF_CONDUCT.md +76 -0
  3. data/CONTRIBUTING.md +51 -0
  4. data/LICENSE.md +7 -0
  5. data/Makefile.in +79 -0
  6. data/README.md +86 -0
  7. data/config.h.in +25 -0
  8. data/config.yml +2147 -0
  9. data/configure +4487 -0
  10. data/docs/build_system.md +85 -0
  11. data/docs/building.md +26 -0
  12. data/docs/configuration.md +56 -0
  13. data/docs/design.md +53 -0
  14. data/docs/encoding.md +116 -0
  15. data/docs/extension.md +20 -0
  16. data/docs/fuzzing.md +93 -0
  17. data/docs/heredocs.md +36 -0
  18. data/docs/mapping.md +117 -0
  19. data/docs/ripper.md +36 -0
  20. data/docs/serialization.md +130 -0
  21. data/docs/testing.md +55 -0
  22. data/ext/yarp/api_node.c +3680 -0
  23. data/ext/yarp/api_pack.c +256 -0
  24. data/ext/yarp/extconf.rb +131 -0
  25. data/ext/yarp/extension.c +547 -0
  26. data/ext/yarp/extension.h +18 -0
  27. data/include/yarp/ast.h +1412 -0
  28. data/include/yarp/defines.h +54 -0
  29. data/include/yarp/diagnostic.h +24 -0
  30. data/include/yarp/enc/yp_encoding.h +94 -0
  31. data/include/yarp/node.h +36 -0
  32. data/include/yarp/pack.h +141 -0
  33. data/include/yarp/parser.h +389 -0
  34. data/include/yarp/regexp.h +19 -0
  35. data/include/yarp/unescape.h +42 -0
  36. data/include/yarp/util/yp_buffer.h +39 -0
  37. data/include/yarp/util/yp_char.h +75 -0
  38. data/include/yarp/util/yp_constant_pool.h +64 -0
  39. data/include/yarp/util/yp_list.h +67 -0
  40. data/include/yarp/util/yp_memchr.h +14 -0
  41. data/include/yarp/util/yp_newline_list.h +54 -0
  42. data/include/yarp/util/yp_state_stack.h +24 -0
  43. data/include/yarp/util/yp_string.h +57 -0
  44. data/include/yarp/util/yp_string_list.h +28 -0
  45. data/include/yarp/util/yp_strpbrk.h +29 -0
  46. data/include/yarp/version.h +5 -0
  47. data/include/yarp.h +69 -0
  48. data/lib/yarp/lex_compat.rb +759 -0
  49. data/lib/yarp/node.rb +7428 -0
  50. data/lib/yarp/pack.rb +185 -0
  51. data/lib/yarp/ripper_compat.rb +174 -0
  52. data/lib/yarp/serialize.rb +389 -0
  53. data/lib/yarp.rb +330 -0
  54. data/src/diagnostic.c +25 -0
  55. data/src/enc/yp_big5.c +79 -0
  56. data/src/enc/yp_euc_jp.c +85 -0
  57. data/src/enc/yp_gbk.c +88 -0
  58. data/src/enc/yp_shift_jis.c +83 -0
  59. data/src/enc/yp_tables.c +509 -0
  60. data/src/enc/yp_unicode.c +2320 -0
  61. data/src/enc/yp_windows_31j.c +83 -0
  62. data/src/node.c +2011 -0
  63. data/src/pack.c +493 -0
  64. data/src/prettyprint.c +1782 -0
  65. data/src/regexp.c +580 -0
  66. data/src/serialize.c +1576 -0
  67. data/src/token_type.c +347 -0
  68. data/src/unescape.c +576 -0
  69. data/src/util/yp_buffer.c +78 -0
  70. data/src/util/yp_char.c +229 -0
  71. data/src/util/yp_constant_pool.c +147 -0
  72. data/src/util/yp_list.c +50 -0
  73. data/src/util/yp_memchr.c +31 -0
  74. data/src/util/yp_newline_list.c +119 -0
  75. data/src/util/yp_state_stack.c +25 -0
  76. data/src/util/yp_string.c +207 -0
  77. data/src/util/yp_string_list.c +32 -0
  78. data/src/util/yp_strncasecmp.c +20 -0
  79. data/src/util/yp_strpbrk.c +66 -0
  80. data/src/yarp.c +13211 -0
  81. data/yarp.gemspec +100 -0
  82. metadata +125 -0
data/lib/yarp.rb ADDED
@@ -0,0 +1,330 @@
1
+ # frozen_string_literal: true
2
+
3
+ module YARP
4
# Pairs a string of parsed Ruby code with a list of byte offsets. Locations
# use it to resolve line numbers, column numbers, and source slices.
# NOTE(review): offsets is presumably the sorted list of byte offsets at which
# each line starts -- confirm against the parser that builds it.
class Source
  attr_reader :source, :offsets

  def initialize(source, offsets)
    @source, @offsets = source, offsets
  end

  # Returns the part of the source that begins at the given byte offset and
  # spans the given number of bytes.
  def slice(offset, length)
    source.byteslice(offset, length)
  end

  # Returns the index of the first entry in offsets that is greater than the
  # given value, or the number of entries when no such entry exists. Callers
  # use the result as a line number.
  def line(value)
    index = offsets.bsearch_index { |candidate| candidate > value }
    index.nil? ? offsets.length : index
  end

  # Returns the distance in bytes between the given value and the offsets
  # entry that precedes the one selected by #line.
  def column(value)
    line_start = offsets[line(value) - 1]
    value - line_start
  end
end
27
+
28
# This represents a location in the source: a byte offset and a byte length,
# together with the Source object used to resolve them into lines, columns,
# and text.
class Location
  # A Source object that is used to determine more information from the given
  # offset and length.
  private attr_reader :source

  # The byte offset from the beginning of the source where this location
  # starts.
  attr_reader :start_offset

  # The length of this location in bytes.
  attr_reader :length

  def initialize(source, start_offset, length)
    @source = source
    @start_offset = start_offset
    @length = length
  end

  # A short description that shows only the offset and length, not the source.
  def inspect
    "#<YARP::Location @start_offset=#{@start_offset} @length=#{@length}>"
  end

  # The source code that this location represents.
  def slice
    source.slice(start_offset, length)
  end

  # The byte offset from the beginning of the source where this location ends.
  def end_offset
    start_offset + length
  end

  # The line number where this location starts.
  def start_line
    source.line(start_offset)
  end

  # The line number where this location ends. This is resolved from the last
  # byte covered by the location (end_offset - 1).
  def end_line
    source.line(end_offset - 1)
  end

  # The column number in bytes where this location starts from the start of
  # the line.
  def start_column
    source.column(start_offset)
  end

  # The column number in bytes where this location ends from the start of the
  # line. This is resolved from the last byte covered by the location
  # (end_offset - 1).
  def end_column
    source.column(end_offset - 1)
  end

  # Supports pattern matching. The requested keys are ignored and both offsets
  # are always returned.
  def deconstruct_keys(keys)
    { start_offset: start_offset, end_offset: end_offset }
  end

  # Prints the location as "(start_offset...end_offset)".
  def pretty_print(q)
    q.text("(#{start_offset}...#{end_offset})")
  end

  # Two locations are equal when they cover the same byte range. The source
  # object is not compared.
  def ==(other)
    other.is_a?(Location) &&
      other.start_offset == start_offset &&
      other.end_offset == end_offset
  end

  # Returns an empty location at offset 0 that has no source attached. The
  # constructor takes three arguments, so the source must be passed explicitly
  # as nil; calling new(0, 0) raises ArgumentError.
  def self.null
    new(nil, 0, 0)
  end
end
101
+
102
# A comment found while parsing, stored as its type and its location.
class Comment
  attr_reader :type, :location

  def initialize(type, location)
    @type, @location = type, location
  end

  # Supports pattern matching. The requested keys are ignored and every field
  # is always returned.
  def deconstruct_keys(keys)
    fields = {}
    fields[:type] = type
    fields[:location] = location
    fields
  end
end
115
+
116
# An error found while parsing, stored as its message and its location.
class ParseError
  attr_reader :message, :location

  def initialize(message, location)
    @message, @location = message, location
  end

  # Supports pattern matching. The requested keys are ignored and every field
  # is always returned.
  def deconstruct_keys(keys)
    fields = {}
    fields[:message] = message
    fields[:location] = location
    fields
  end
end
129
+
130
# A warning found while parsing, stored as its message and its location.
class ParseWarning
  attr_reader :message, :location

  def initialize(message, location)
    @message, @location = message, location
  end

  # Supports pattern matching. The requested keys are ignored and every field
  # is always returned.
  def deconstruct_keys(keys)
    fields = {}
    fields[:message] = message
    fields[:location] = location
    fields
  end
end
143
+
144
# A visitor that knows how to walk down the tree but does not define any of
# the per-node visit methods, so consumers must implement each one they need.
# For a default implementation that continues walking the tree, see the
# Visitor class.
class BasicVisitor
  # Dispatches to the node's accept method. A nil node is skipped and nil is
  # returned.
  def visit(node)
    node.accept(self) unless node.nil?
  end

  # Visits each of the given nodes and returns the results as an array.
  def visit_all(nodes)
    nodes.map { |child| visit(child) }
  end

  # Visits every child of the given node.
  def visit_child_nodes(node)
    children = node.child_nodes
    visit_all(children)
  end
end

# The visitor that consumers are pointed to for default tree-walking
# behavior. No methods are defined on it here.
# NOTE(review): the per-node visit methods are presumably added when the
# class is reopened elsewhere in the library -- confirm.
class Visitor < BasicVisitor
end
164
+
165
# This represents the result of a call to ::parse or ::parse_file. It contains
# the AST, any comments that were encountered, any errors and warnings that
# were encountered, and the source that was parsed.
class ParseResult
  attr_reader :value, :comments, :errors, :warnings, :source

  def initialize(value, comments, errors, warnings, source)
    @value, @comments = value, comments
    @errors, @warnings = errors, warnings
    @source = source
  end

  # Supports pattern matching. The requested keys are ignored and the source
  # is not part of the returned hash.
  def deconstruct_keys(keys)
    { value: value, comments: comments, errors: errors, warnings: warnings }
  end

  # True when no errors were recorded.
  def success?
    errors.empty?
  end

  # True when at least one error was recorded.
  def failure?
    !success?
  end

  # Walks the tree and sets the newline flag on nodes, using an array indexed
  # by line number to remember which lines are already flagged.
  # Keep in sync with Java MarkNewlinesVisitor
  class MarkNewlinesVisitor < YARP::Visitor
    def initialize(newline_marked)
      @newline_marked = newline_marked
    end

    # Blocks get a fresh, all-false array of the same size for their body; the
    # outer array is restored afterwards, even if visiting raises.
    def visit_block_node(node)
      outer_marks = @newline_marked
      @newline_marked = Array.new(outer_marks.size, false)
      super(node)
    ensure
      @newline_marked = outer_marks
    end
    alias_method :visit_lambda_node, :visit_block_node

    # Conditionals are flagged themselves before their children are visited.
    def visit_if_node(node)
      node.set_newline_flag(@newline_marked)
      super(node)
    end
    alias_method :visit_unless_node, :visit_if_node

    # Every statement in the body is flagged before the children are visited.
    def visit_statements_node(node)
      node.body.each { |statement| statement.set_newline_flag(@newline_marked) }
      super(node)
    end
  end
  private_constant :MarkNewlinesVisitor

  # Runs the newline-marking visitor over the AST and returns the AST. The
  # tracking array has one more slot than the source has offsets.
  def mark_newlines
    marks = Array.new(1 + @source.offsets.size, false)
    value.accept(MarkNewlinesVisitor.new(marks))
    value
  end
end
230
+
231
# A token from the Ruby source, stored as its type, its value, and its
# location.
class Token
  attr_reader :type, :value, :location

  def initialize(type, value, location)
    @type, @value, @location = type, value, location
  end

  # Supports pattern matching. The requested keys are ignored and every field
  # is always returned.
  def deconstruct_keys(keys)
    { type: type, value: value, location: location }
  end

  # Prints the type, then the location, then the value wrapped in
  # parentheses.
  def pretty_print(q)
    q.group do
      q.text(type.to_s)
      location.pretty_print(q)
      q.text("(")
      q.nest(2) do
        q.breakable("")
        q.pp(value)
      end
      q.breakable("")
      q.text(")")
    end
  end

  # Tokens are equal when their type and value match. The location is not
  # compared.
  def ==(other)
    return false unless other.is_a?(Token)

    other.type == type && other.value == value
  end
end
265
+
266
# A node in the tree. It provides the shared location reader, the newline
# flag, and pretty-printing.
class Node
  attr_reader :location

  # Whether this node has been given the newline flag.
  def newline?
    !!@newline
  end

  # Flags this node unless its starting line is already flagged in the given
  # array. When the node is flagged, the line's slot is set to true as well.
  def set_newline_flag(newline_marked)
    line = location.start_line
    return if newline_marked[line]

    newline_marked[line] = true
    @newline = true
  end

  # Prints the unqualified class name, the location, a line marker when the
  # newline flag is set, and then every deconstructed field except the
  # location.
  def pretty_print(q)
    q.group do
      q.text(self.class.name.split("::").last)
      location.pretty_print(q)
      q.text("[Li:#{location.start_line}]") if newline?
      q.text("(")
      q.nest(2) do
        fields = deconstruct_keys([])
        fields.delete(:location)

        q.breakable("")
        q.seplist(fields, -> { q.comma_breakable }, :each_value) { |field| q.pp(field) }
      end
      q.breakable("")
      q.text(")")
    end
  end
end
300
+
301
# Turn a serialized AST back into a tree, using the source as a reference.
# All of the work is handed to Serialize.load.
def self.load(source, serialized)
  Serialize.load(source, serialized)
end
305
+
306
# Helpers for testing and debugging. This module is not meant to be used by
# consumers of this library.
module Debug
  # Parses the given source and returns the offsets recorded on the resulting
  # Source object.
  def self.newlines(source)
    result = YARP.parse(source)
    result.source.offsets
  end

  # Calls parse_serialize_file_metadata for the given file. The metadata
  # string packs the byte size of the path, the path as binary, and a
  # trailing 0.
  def self.parse_serialize_file(filepath)
    metadata = [filepath.bytesize, filepath.b, 0].pack("LA*L")
    parse_serialize_file_metadata(filepath, metadata)
  end
end
317
+
318
+ # Marking this as private so that consumers don't see it. It makes it a little
319
+ # annoying for testing since you have to const_get it to access the methods,
320
+ # but at least this way it's clear it's not meant for consumers.
321
+ private_constant :Debug
322
+ end
323
+
324
+ require_relative "yarp/lex_compat"
325
+ require_relative "yarp/node"
326
+ require_relative "yarp/ripper_compat"
327
+ require_relative "yarp/serialize"
328
+ require_relative "yarp/pack"
329
+
330
+ require "yarp/yarp"
data/src/diagnostic.c ADDED
@@ -0,0 +1,25 @@
1
+ #include "yarp/diagnostic.h"
2
+
3
+ // Append an error to the given list of diagnostic.
4
+ bool
5
+ yp_diagnostic_list_append(yp_list_t *list, const char *start, const char *end, const char *message) {
6
+ yp_diagnostic_t *diagnostic = (yp_diagnostic_t *) malloc(sizeof(yp_diagnostic_t));
7
+ if (diagnostic == NULL) return false;
8
+
9
+ *diagnostic = (yp_diagnostic_t) { .start = start, .end = end, .message = message };
10
+ yp_list_append(list, (yp_list_node_t *) diagnostic);
11
+ return true;
12
+ }
13
+
14
+ // Deallocate the internal state of the given diagnostic list.
15
+ void
16
+ yp_diagnostic_list_free(yp_list_t *list) {
17
+ yp_list_node_t *node, *next;
18
+
19
+ for (node = list->head; node != NULL; node = next) {
20
+ next = node->next;
21
+
22
+ yp_diagnostic_t *diagnostic = (yp_diagnostic_t *) node;
23
+ free(diagnostic);
24
+ }
25
+ }
data/src/enc/yp_big5.c ADDED
@@ -0,0 +1,79 @@
1
+ #include "yarp/enc/yp_encoding.h"
2
+
3
typedef uint16_t yp_big5_codepoint_t;

// Decode the character at the start of c, looking at no more than n bytes.
// The number of bytes it occupies is stored in *width (0 if the bytes are not
// accepted) and its value is returned.
static yp_big5_codepoint_t
yp_big5_codepoint(const char *c, ptrdiff_t n, size_t *width) {
    const unsigned char *bytes = (const unsigned char *) c;
    const unsigned char lead = bytes[0];

    // Bytes below 0x80 stand for themselves.
    if (lead < 0x80) {
        *width = 1;
        return lead;
    }

    // A lead byte in 0xA1-0xFE followed by a trail byte in 0x40-0xFE forms a
    // double byte character. The trail byte is only read when n allows it.
    if (n > 1 && lead >= 0xA1 && lead <= 0xFE) {
        const unsigned char trail = bytes[1];

        if (trail >= 0x40 && trail <= 0xFE) {
            *width = 2;
            return (yp_big5_codepoint_t) (lead << 8 | trail);
        }
    }

    *width = 0;
    return 0;
}
24
+
25
// Return the number of bytes occupied by the character at c, or 0 if the
// bytes are not accepted by the decoder.
static size_t
yp_encoding_big5_char_width(const char *c, ptrdiff_t n) {
    size_t byte_count;

    // Only the width matters here; the decoded value is discarded.
    (void) yp_big5_codepoint(c, n, &byte_count);
    return byte_count;
}
32
+
33
+ static size_t
34
+ yp_encoding_big5_alpha_char(const char *c, ptrdiff_t n) {
35
+ size_t width;
36
+ yp_big5_codepoint_t codepoint = yp_big5_codepoint(c, n, &width);
37
+
38
+ if (width == 1) {
39
+ const char value = (const char) codepoint;
40
+ return yp_encoding_ascii_alpha_char(&value, n);
41
+ } else {
42
+ return 0;
43
+ }
44
+ }
45
+
46
+ static size_t
47
+ yp_encoding_big5_alnum_char(const char *c, ptrdiff_t n) {
48
+ size_t width;
49
+ yp_big5_codepoint_t codepoint = yp_big5_codepoint(c, n, &width);
50
+
51
+ if (width == 1) {
52
+ const char value = (const char) codepoint;
53
+ return yp_encoding_ascii_alnum_char(&value, n);
54
+ } else {
55
+ return 0;
56
+ }
57
+ }
58
+
59
+ static bool
60
+ yp_encoding_big5_isupper_char(const char *c, ptrdiff_t n) {
61
+ size_t width;
62
+ yp_big5_codepoint_t codepoint = yp_big5_codepoint(c, n, &width);
63
+
64
+ if (width == 1) {
65
+ const char value = (const char) codepoint;
66
+ return yp_encoding_ascii_isupper_char(&value, n);
67
+ } else {
68
+ return false;
69
+ }
70
+ }
71
+
72
+ yp_encoding_t yp_encoding_big5 = {
73
+ .name = "big5",
74
+ .char_width = yp_encoding_big5_char_width,
75
+ .alnum_char = yp_encoding_big5_alnum_char,
76
+ .alpha_char = yp_encoding_big5_alpha_char,
77
+ .isupper_char = yp_encoding_big5_isupper_char,
78
+ .multibyte = true
79
+ };
data/src/enc/yp_euc_jp.c ADDED
@@ -0,0 +1,85 @@
1
+ #include "yarp/enc/yp_encoding.h"
2
+
3
typedef uint16_t yp_euc_jp_codepoint_t;

// Decode the character at the start of c, looking at no more than n bytes.
// The number of bytes it occupies is stored in *width (0 if the bytes are not
// accepted) and its value is returned.
static yp_euc_jp_codepoint_t
yp_euc_jp_codepoint(const char *c, ptrdiff_t n, size_t *width) {
    const unsigned char *bytes = (const unsigned char *) c;
    const unsigned char lead = bytes[0];

    // Bytes below 0x80 stand for themselves.
    if (lead < 0x80) {
        *width = 1;
        return lead;
    }

    // A lead byte of 0x8E, or one in 0xA1-0xFE, followed by a trail byte in
    // 0xA1-0xFE forms a double byte character. The trail byte is only read
    // when n allows it.
    if (n > 1 && (lead == 0x8E || (lead >= 0xA1 && lead <= 0xFE))) {
        const unsigned char trail = bytes[1];

        if (trail >= 0xA1 && trail <= 0xFE) {
            *width = 2;
            return (yp_euc_jp_codepoint_t) (lead << 8 | trail);
        }
    }

    *width = 0;
    return 0;
}
30
+
31
// Return the number of bytes occupied by the character at c, or 0 if the
// bytes are not accepted by the decoder.
static size_t
yp_encoding_euc_jp_char_width(const char *c, ptrdiff_t n) {
    size_t byte_count;

    // Only the width matters here; the decoded value is discarded.
    (void) yp_euc_jp_codepoint(c, n, &byte_count);
    return byte_count;
}
38
+
39
+ static size_t
40
+ yp_encoding_euc_jp_alpha_char(const char *c, ptrdiff_t n) {
41
+ size_t width;
42
+ yp_euc_jp_codepoint_t codepoint = yp_euc_jp_codepoint(c, n, &width);
43
+
44
+ if (width == 1) {
45
+ const char value = (const char) codepoint;
46
+ return yp_encoding_ascii_alpha_char(&value, n);
47
+ } else {
48
+ return 0;
49
+ }
50
+ }
51
+
52
+ static size_t
53
+ yp_encoding_euc_jp_alnum_char(const char *c, ptrdiff_t n) {
54
+ size_t width;
55
+ yp_euc_jp_codepoint_t codepoint = yp_euc_jp_codepoint(c, n, &width);
56
+
57
+ if (width == 1) {
58
+ const char value = (const char) codepoint;
59
+ return yp_encoding_ascii_alnum_char(&value, n);
60
+ } else {
61
+ return 0;
62
+ }
63
+ }
64
+
65
+ static bool
66
+ yp_encoding_euc_jp_isupper_char(const char *c, ptrdiff_t n) {
67
+ size_t width;
68
+ yp_euc_jp_codepoint_t codepoint = yp_euc_jp_codepoint(c, n, &width);
69
+
70
+ if (width == 1) {
71
+ const char value = (const char) codepoint;
72
+ return yp_encoding_ascii_isupper_char(&value, n);
73
+ } else {
74
+ return 0;
75
+ }
76
+ }
77
+
78
+ yp_encoding_t yp_encoding_euc_jp = {
79
+ .name = "euc-jp",
80
+ .char_width = yp_encoding_euc_jp_char_width,
81
+ .alnum_char = yp_encoding_euc_jp_alnum_char,
82
+ .alpha_char = yp_encoding_euc_jp_alpha_char,
83
+ .isupper_char = yp_encoding_euc_jp_isupper_char,
84
+ .multibyte = true
85
+ };
data/src/enc/yp_gbk.c ADDED
@@ -0,0 +1,88 @@
1
+ #include "yarp/enc/yp_encoding.h"
2
+
3
typedef uint16_t yp_gbk_codepoint_t;

// Decode the character at the start of c, looking at no more than n bytes.
// The number of bytes it occupies is stored in *width (0 if the bytes are not
// accepted) and its value is returned.
static yp_gbk_codepoint_t
yp_gbk_codepoint(const char *c, ptrdiff_t n, size_t *width) {
    const unsigned char *bytes = (const unsigned char *) c;
    const unsigned char lead = bytes[0];

    // Bytes below 0x80 stand for themselves.
    if (lead < 0x80) {
        *width = 1;
        return lead;
    }

    // Double byte characters need a second byte to be available.
    if (n > 1) {
        const unsigned char trail = bytes[1];

        // The three trail byte ranges shared by the GBK areas below.
        const bool trail_upper = (trail >= 0xA1 && trail <= 0xFE);
        const bool trail_lower = (trail >= 0x40 && trail <= 0xA0) && (trail != 0x7F);
        const bool trail_full = (trail >= 0x40 && trail <= 0xFE) && (trail != 0x7F);

        const bool accepted =
            ((lead >= 0xA1 && lead <= 0xA9) && trail_upper) || // GBK/1
            ((lead >= 0xB0 && lead <= 0xF7) && trail_upper) || // GBK/2
            ((lead >= 0x81 && lead <= 0xA0) && trail_full) ||  // GBK/3
            ((lead >= 0xAA && lead <= 0xFE) && trail_lower) || // GBK/4
            ((lead >= 0xA8 && lead <= 0xA9) && trail_lower);   // GBK/5

        if (accepted) {
            *width = 2;
            return (yp_gbk_codepoint_t) (lead << 8 | trail);
        }
    }

    *width = 0;
    return 0;
}
33
+
34
// Return the number of bytes occupied by the character at c, or 0 if the
// bytes are not accepted by the decoder.
static size_t
yp_encoding_gbk_char_width(const char *c, ptrdiff_t n) {
    size_t byte_count;

    // Only the width matters here; the decoded value is discarded.
    (void) yp_gbk_codepoint(c, n, &byte_count);
    return byte_count;
}
41
+
42
+ static size_t
43
+ yp_encoding_gbk_alpha_char(const char *c, ptrdiff_t n) {
44
+ size_t width;
45
+ yp_gbk_codepoint_t codepoint = yp_gbk_codepoint(c, n, &width);
46
+
47
+ if (width == 1) {
48
+ const char value = (const char) codepoint;
49
+ return yp_encoding_ascii_alpha_char(&value, n);
50
+ } else {
51
+ return 0;
52
+ }
53
+ }
54
+
55
+ static size_t
56
+ yp_encoding_gbk_alnum_char(const char *c, ptrdiff_t n) {
57
+ size_t width;
58
+ yp_gbk_codepoint_t codepoint = yp_gbk_codepoint(c, n, &width);
59
+
60
+ if (width == 1) {
61
+ const char value = (const char) codepoint;
62
+ return yp_encoding_ascii_alnum_char(&value, n);
63
+ } else {
64
+ return 0;
65
+ }
66
+ }
67
+
68
+ static bool
69
+ yp_encoding_gbk_isupper_char(const char *c, ptrdiff_t n) {
70
+ size_t width;
71
+ yp_gbk_codepoint_t codepoint = yp_gbk_codepoint(c, n, &width);
72
+
73
+ if (width == 1) {
74
+ const char value = (const char) codepoint;
75
+ return yp_encoding_ascii_isupper_char(&value, n);
76
+ } else {
77
+ return false;
78
+ }
79
+ }
80
+
81
+ yp_encoding_t yp_encoding_gbk = {
82
+ .name = "gbk",
83
+ .char_width = yp_encoding_gbk_char_width,
84
+ .alnum_char = yp_encoding_gbk_alnum_char,
85
+ .alpha_char = yp_encoding_gbk_alpha_char,
86
+ .isupper_char = yp_encoding_gbk_isupper_char,
87
+ .multibyte = true
88
+ };
data/src/enc/yp_shift_jis.c ADDED
@@ -0,0 +1,83 @@
1
+ #include "yarp/enc/yp_encoding.h"
2
+
3
typedef uint16_t yp_shift_jis_codepoint_t;

// Decode the character at the start of c, looking at no more than n bytes.
// The number of bytes it occupies is stored in *width (0 if the bytes do not
// form a valid character) and its value is returned.
static yp_shift_jis_codepoint_t
yp_shift_jis_codepoint(const char *c, ptrdiff_t n, size_t *width) {
    const unsigned char *uc = (const unsigned char *) c;

    // These are the single byte characters: bytes below 0x80 and the bytes
    // 0xA1-0xDF.
    if (*uc < 0x80 || (*uc >= 0xA1 && *uc <= 0xDF)) {
        *width = 1;
        return *uc;
    }

    // These are the double byte characters. The trail byte lies in 0x40-0xFC,
    // but 0x7F is never a valid trail byte in Shift_JIS, so it is rejected
    // explicitly.
    if (
        (n > 1) &&
        ((uc[0] >= 0x81 && uc[0] <= 0x9F) || (uc[0] >= 0xE0 && uc[0] <= 0xFC)) &&
        (uc[1] >= 0x40 && uc[1] <= 0xFC) && (uc[1] != 0x7F)
    ) {
        *width = 2;
        return (yp_shift_jis_codepoint_t) (uc[0] << 8 | uc[1]);
    }

    *width = 0;
    return 0;
}
28
+
29
// Return the number of bytes occupied by the character at c, or 0 if the
// bytes are not accepted by the decoder.
static size_t
yp_encoding_shift_jis_char_width(const char *c, ptrdiff_t n) {
    size_t byte_count;

    // Only the width matters here; the decoded value is discarded.
    (void) yp_shift_jis_codepoint(c, n, &byte_count);
    return byte_count;
}
36
+
37
+ static size_t
38
+ yp_encoding_shift_jis_alpha_char(const char *c, ptrdiff_t n) {
39
+ size_t width;
40
+ yp_shift_jis_codepoint_t codepoint = yp_shift_jis_codepoint(c, n, &width);
41
+
42
+ if (width == 1) {
43
+ const char value = (const char) codepoint;
44
+ return yp_encoding_ascii_alpha_char(&value, n);
45
+ } else {
46
+ return 0;
47
+ }
48
+ }
49
+
50
+ static size_t
51
+ yp_encoding_shift_jis_alnum_char(const char *c, ptrdiff_t n) {
52
+ size_t width;
53
+ yp_shift_jis_codepoint_t codepoint = yp_shift_jis_codepoint(c, n, &width);
54
+
55
+ if (width == 1) {
56
+ const char value = (const char) codepoint;
57
+ return yp_encoding_ascii_alnum_char(&value, n);
58
+ } else {
59
+ return 0;
60
+ }
61
+ }
62
+
63
+ static bool
64
+ yp_encoding_shift_jis_isupper_char(const char *c, ptrdiff_t n) {
65
+ size_t width;
66
+ yp_shift_jis_codepoint_t codepoint = yp_shift_jis_codepoint(c, n, &width);
67
+
68
+ if (width == 1) {
69
+ const char value = (const char) codepoint;
70
+ return yp_encoding_ascii_isupper_char(&value, n);
71
+ } else {
72
+ return 0;
73
+ }
74
+ }
75
+
76
+ yp_encoding_t yp_encoding_shift_jis = {
77
+ .name = "shift_jis",
78
+ .char_width = yp_encoding_shift_jis_char_width,
79
+ .alnum_char = yp_encoding_shift_jis_alnum_char,
80
+ .alpha_char = yp_encoding_shift_jis_alpha_char,
81
+ .isupper_char = yp_encoding_shift_jis_isupper_char,
82
+ .multibyte = true
83
+ };