yarp 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +51 -0
- data/LICENSE.md +7 -0
- data/Makefile.in +79 -0
- data/README.md +86 -0
- data/config.h.in +25 -0
- data/config.yml +2147 -0
- data/configure +4487 -0
- data/docs/build_system.md +85 -0
- data/docs/building.md +26 -0
- data/docs/configuration.md +56 -0
- data/docs/design.md +53 -0
- data/docs/encoding.md +116 -0
- data/docs/extension.md +20 -0
- data/docs/fuzzing.md +93 -0
- data/docs/heredocs.md +36 -0
- data/docs/mapping.md +117 -0
- data/docs/ripper.md +36 -0
- data/docs/serialization.md +130 -0
- data/docs/testing.md +55 -0
- data/ext/yarp/api_node.c +3680 -0
- data/ext/yarp/api_pack.c +256 -0
- data/ext/yarp/extconf.rb +131 -0
- data/ext/yarp/extension.c +547 -0
- data/ext/yarp/extension.h +18 -0
- data/include/yarp/ast.h +1412 -0
- data/include/yarp/defines.h +54 -0
- data/include/yarp/diagnostic.h +24 -0
- data/include/yarp/enc/yp_encoding.h +94 -0
- data/include/yarp/node.h +36 -0
- data/include/yarp/pack.h +141 -0
- data/include/yarp/parser.h +389 -0
- data/include/yarp/regexp.h +19 -0
- data/include/yarp/unescape.h +42 -0
- data/include/yarp/util/yp_buffer.h +39 -0
- data/include/yarp/util/yp_char.h +75 -0
- data/include/yarp/util/yp_constant_pool.h +64 -0
- data/include/yarp/util/yp_list.h +67 -0
- data/include/yarp/util/yp_memchr.h +14 -0
- data/include/yarp/util/yp_newline_list.h +54 -0
- data/include/yarp/util/yp_state_stack.h +24 -0
- data/include/yarp/util/yp_string.h +57 -0
- data/include/yarp/util/yp_string_list.h +28 -0
- data/include/yarp/util/yp_strpbrk.h +29 -0
- data/include/yarp/version.h +5 -0
- data/include/yarp.h +69 -0
- data/lib/yarp/lex_compat.rb +759 -0
- data/lib/yarp/node.rb +7428 -0
- data/lib/yarp/pack.rb +185 -0
- data/lib/yarp/ripper_compat.rb +174 -0
- data/lib/yarp/serialize.rb +389 -0
- data/lib/yarp.rb +330 -0
- data/src/diagnostic.c +25 -0
- data/src/enc/yp_big5.c +79 -0
- data/src/enc/yp_euc_jp.c +85 -0
- data/src/enc/yp_gbk.c +88 -0
- data/src/enc/yp_shift_jis.c +83 -0
- data/src/enc/yp_tables.c +509 -0
- data/src/enc/yp_unicode.c +2320 -0
- data/src/enc/yp_windows_31j.c +83 -0
- data/src/node.c +2011 -0
- data/src/pack.c +493 -0
- data/src/prettyprint.c +1782 -0
- data/src/regexp.c +580 -0
- data/src/serialize.c +1576 -0
- data/src/token_type.c +347 -0
- data/src/unescape.c +576 -0
- data/src/util/yp_buffer.c +78 -0
- data/src/util/yp_char.c +229 -0
- data/src/util/yp_constant_pool.c +147 -0
- data/src/util/yp_list.c +50 -0
- data/src/util/yp_memchr.c +31 -0
- data/src/util/yp_newline_list.c +119 -0
- data/src/util/yp_state_stack.c +25 -0
- data/src/util/yp_string.c +207 -0
- data/src/util/yp_string_list.c +32 -0
- data/src/util/yp_strncasecmp.c +20 -0
- data/src/util/yp_strpbrk.c +66 -0
- data/src/yarp.c +13211 -0
- data/yarp.gemspec +100 -0
- metadata +125 -0
data/lib/yarp.rb
ADDED
@@ -0,0 +1,330 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module YARP
|
4
|
+
# This represents a source of Ruby code that has been parsed. It is used in
|
5
|
+
# conjunction with locations to allow them to resolve line numbers and source
|
6
|
+
# ranges.
|
7
|
+
class Source
  # source:  the string that was parsed.
  # offsets: ascending byte offsets; offsets[i] is subtracted from a byte
  #          offset on line i + 1 to obtain its column (see #column).
  attr_reader :source, :offsets

  def initialize(source, offsets)
    @source, @offsets = source, offsets
  end

  # Returns +length+ bytes of the source beginning at byte +offset+.
  def slice(offset, length)
    source.byteslice(offset, length)
  end

  # Returns the 1-based line number containing the byte offset +value+: the
  # index of the first recorded offset strictly greater than +value+, or the
  # number of recorded offsets when there is none.
  def line(value)
    first_past = offsets.bsearch_index { |line_start| line_start > value }
    first_past || offsets.length
  end

  # Returns the column of the byte offset +value+, i.e. its distance in bytes
  # from the recorded offset of the line it falls on.
  def column(value)
    line_start = offsets[line(value) - 1]
    value - line_start
  end
end
|
27
|
+
|
28
|
+
# This represents a location in the source.
|
29
|
+
class Location
  # A Source object that is used to determine more information from the given
  # offset and length.
  private attr_reader :source

  # The byte offset from the beginning of the source where this location
  # starts.
  attr_reader :start_offset

  # The length of this location in bytes.
  attr_reader :length

  # Builds a location covering +length+ bytes of +source+, beginning at the
  # byte offset +start_offset+.
  def initialize(source, start_offset, length)
    @source = source
    @start_offset = start_offset
    @length = length
  end

  def inspect
    "#<YARP::Location @start_offset=#{@start_offset} @length=#{@length}>"
  end

  # The source code that this location represents.
  def slice
    source.slice(start_offset, length)
  end

  # The byte offset from the beginning of the source where this location ends.
  def end_offset
    start_offset + length
  end

  # The line number where this location starts.
  def start_line
    source.line(start_offset)
  end

  # The line number where this location ends.
  def end_line
    source.line(end_offset - 1)
  end

  # The column number in bytes where this location starts from the start of
  # the line.
  def start_column
    source.column(start_offset)
  end

  # The column number in bytes where this location ends from the start of the
  # line.
  def end_column
    source.column(end_offset - 1)
  end

  # Pattern-matching support. The +keys+ argument is ignored; the start and
  # end offsets are always returned.
  def deconstruct_keys(keys)
    { start_offset: start_offset, end_offset: end_offset }
  end

  # Prints the location as "(start...end)" using byte offsets.
  def pretty_print(q)
    q.text("(#{start_offset}...#{end_offset})")
  end

  # Two locations are equal when they span the same byte offsets; the backing
  # source is not compared.
  def ==(other)
    other.is_a?(Location) &&
      other.start_offset == start_offset &&
      other.end_offset == end_offset
  end

  # A placeholder location: offset 0, length 0 and no backing source. The
  # constructor takes three arguments, so the source has to be passed
  # explicitly (previously this called new(0, 0) and always raised
  # ArgumentError). Methods that consult the source (slice, lines, columns)
  # cannot be used on the result because its source is nil.
  def self.null
    new(nil, 0, 0)
  end
end
|
101
|
+
|
102
|
+
# This represents a comment that was encountered during parsing.
|
103
|
+
class Comment
  # type:     the kind of comment, exactly as supplied by the creator.
  # location: where the comment sits in the source.
  attr_reader :type, :location

  def initialize(type, location)
    @type, @location = type, location
  end

  # Pattern-matching support; +keys+ is ignored and every field is returned.
  def deconstruct_keys(keys)
    fields = {}
    fields[:type] = type
    fields[:location] = location
    fields
  end
end
|
115
|
+
|
116
|
+
# This represents an error that was encountered during parsing.
|
117
|
+
class ParseError
  # message:  the text describing the error.
  # location: where in the source the error was reported.
  attr_reader :message, :location

  def initialize(message, location)
    @message, @location = message, location
  end

  # Pattern-matching support; +keys+ is ignored and every field is returned.
  def deconstruct_keys(keys)
    fields = {}
    fields[:message] = message
    fields[:location] = location
    fields
  end
end
|
129
|
+
|
130
|
+
# This represents a warning that was encountered during parsing.
|
131
|
+
class ParseWarning
  # message:  the text describing the warning.
  # location: where in the source the warning was reported.
  attr_reader :message, :location

  def initialize(message, location)
    @message, @location = message, location
  end

  # Pattern-matching support; +keys+ is ignored and every field is returned.
  def deconstruct_keys(keys)
    fields = {}
    fields[:message] = message
    fields[:location] = location
    fields
  end
end
|
143
|
+
|
144
|
+
# A class that knows how to walk down the tree. None of the individual visit
|
145
|
+
# methods are implemented on this visitor, so it forces the consumer to
|
146
|
+
# implement each one that they need. For a default implementation that
|
147
|
+
# continues walking the tree, see the Visitor class.
|
148
|
+
class BasicVisitor
  # Dispatches to node.accept(self). A nil node is skipped and nil is
  # returned, so optional children can be passed straight through.
  def visit(node)
    node.accept(self) unless node.nil?
  end

  # Visits every entry of +nodes+ in order and returns the collected results.
  def visit_all(nodes)
    nodes.map { |child| visit(child) }
  end

  # Visits each of the node's child_nodes and returns the collected results.
  def visit_child_nodes(node)
    children = node.child_nodes
    visit_all(children)
  end
end
|
161
|
+
|
162
|
+
class Visitor < BasicVisitor
  # Nothing is added in this file; the class only inherits from BasicVisitor.
end
|
164
|
+
|
165
|
+
# This represents the result of a call to ::parse or ::parse_file. It contains
|
166
|
+
# the AST, any comments that were encountered, and any errors that were
|
167
|
+
# encountered.
|
168
|
+
class ParseResult
  attr_reader :value, :comments, :errors, :warnings, :source

  def initialize(value, comments, errors, warnings, source)
    @value, @comments = value, comments
    @errors, @warnings = errors, warnings
    @source = source
  end

  # Pattern-matching support; +keys+ is ignored. The source is not part of
  # the returned hash.
  def deconstruct_keys(keys)
    { value: value, comments: comments, errors: errors, warnings: warnings }
  end

  # True when no errors were recorded.
  def success?
    errors.empty?
  end

  # The negation of #success?.
  def failure?
    !success?
  end

  # Keep in sync with Java MarkNewlinesVisitor
  #
  # Walks the tree calling Node#set_newline_flag on if/unless nodes and on
  # every direct child of a statements node, sharing one per-line marker
  # array between them.
  class MarkNewlinesVisitor < YARP::Visitor
    def initialize(newline_marked)
      @newline_marked = newline_marked
    end

    # Blocks (and lambdas, via the alias below) are visited with a fresh,
    # all-false marker array of the same size; the previous array is put back
    # afterwards even if visiting raises.
    def visit_block_node(node)
      saved_marks = @newline_marked
      @newline_marked = Array.new(saved_marks.size, false)
      begin
        super(node)
      ensure
        @newline_marked = saved_marks
      end
    end
    alias_method :visit_lambda_node, :visit_block_node

    # Flags the node itself before descending; also used for unless nodes.
    def visit_if_node(node)
      node.set_newline_flag(@newline_marked)
      super(node)
    end
    alias_method :visit_unless_node, :visit_if_node

    # Flags each statement in the body before descending.
    def visit_statements_node(node)
      node.body.each { |statement| statement.set_newline_flag(@newline_marked) }
      super(node)
    end
  end
  private_constant :MarkNewlinesVisitor

  # Runs MarkNewlinesVisitor over the parsed value and returns that value.
  # The marker array has one more slot than the source has recorded offsets.
  def mark_newlines
    marks = Array.new(1 + @source.offsets.size, false)
    value.accept(MarkNewlinesVisitor.new(marks))
    value
  end
end
|
230
|
+
|
231
|
+
# This represents a token from the Ruby source.
|
232
|
+
class Token
  attr_reader :type, :value, :location

  def initialize(type, value, location)
    @type, @value, @location = type, value, location
  end

  # Pattern-matching support; +keys+ is ignored and every field is returned.
  def deconstruct_keys(keys)
    { type: type, value: value, location: location }
  end

  # Prints the type, then the location, then the value wrapped in
  # parentheses (indented by two when the group has to break).
  def pretty_print(q)
    q.group do
      q.text(type.to_s)
      location.pretty_print(q)
      q.text("(")

      q.nest(2) do
        q.breakable("")
        q.pp(value)
      end

      q.breakable("")
      q.text(")")
    end
  end

  # Tokens compare by type and value only; the location is not considered.
  def ==(other)
    return false unless other.is_a?(Token)

    other.type == type && other.value == value
  end
end
|
265
|
+
|
266
|
+
# This represents a node in the tree.
|
267
|
+
class Node
  attr_reader :location

  # Whether #set_newline_flag has flagged this node.
  def newline?
    !!@newline
  end

  # Flags this node unless its starting line is already taken in
  # +newline_marked+ (an array indexed by line number); in that case nothing
  # changes.
  def set_newline_flag(newline_marked)
    line = location.start_line
    return if newline_marked[line]

    newline_marked[line] = true
    @newline = true
  end

  # Prints the unqualified class name, the location, an optional "[Li:N]"
  # marker for flagged nodes, and then every deconstructed field except the
  # location, comma separated inside parentheses.
  def pretty_print(q)
    q.group do
      q.text(self.class.name.split("::").last)
      location.pretty_print(q)
      q.text("[Li:#{location.start_line}]") if newline?
      q.text("(")

      q.nest(2) do
        fields = deconstruct_keys([])
        fields.delete(:location)

        q.breakable("")
        q.seplist(fields, lambda { q.comma_breakable }, :each_value) do |field|
          q.pp(field)
        end
      end

      q.breakable("")
      q.text(")")
    end
  end
end
|
300
|
+
|
301
|
+
# Load the serialized AST using the source as a reference into a tree.
|
302
|
+
def self.load(source, serialized)
  # Pure delegation: both arguments are handed to Serialize.load unchanged
  # and its result is returned.
  Serialize.load(source, serialized)
end
|
305
|
+
|
306
|
+
# This module is used for testing and debugging and is not meant to be used by
|
307
|
+
# consumers of this library.
|
308
|
+
module Debug
  class << self
    # Parses +source+ and returns the offsets recorded on the resulting
    # Source object.
    def newlines(source)
      result = YARP.parse(source)
      result.source.offsets
    end

    # Calls parse_serialize_file_metadata with the filepath and a packed
    # metadata string made of the filepath's byte size, its bytes, and a
    # trailing zero ("LA*L").
    def parse_serialize_file(filepath)
      metadata = [filepath.bytesize, filepath.b, 0].pack("LA*L")
      parse_serialize_file_metadata(filepath, metadata)
    end
  end
end
|
317
|
+
|
318
|
+
# Marking this as private so that consumers don't see it. It makes it a little
|
319
|
+
# annoying for testing since you have to const_get it to access the methods,
|
320
|
+
# but at least this way it's clear it's not meant for consumers.
|
321
|
+
private_constant :Debug
|
322
|
+
end
|
323
|
+
|
324
|
+
require_relative "yarp/lex_compat"
|
325
|
+
require_relative "yarp/node"
|
326
|
+
require_relative "yarp/ripper_compat"
|
327
|
+
require_relative "yarp/serialize"
|
328
|
+
require_relative "yarp/pack"
|
329
|
+
|
330
|
+
require "yarp/yarp"
|
data/src/diagnostic.c
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
#include "yarp/diagnostic.h"
|
2
|
+
|
3
|
+
// Append an error to the given list of diagnostics.
|
4
|
+
bool
|
5
|
+
yp_diagnostic_list_append(yp_list_t *list, const char *start, const char *end, const char *message) {
|
6
|
+
yp_diagnostic_t *diagnostic = (yp_diagnostic_t *) malloc(sizeof(yp_diagnostic_t));
|
7
|
+
if (diagnostic == NULL) return false;
|
8
|
+
|
9
|
+
*diagnostic = (yp_diagnostic_t) { .start = start, .end = end, .message = message };
|
10
|
+
yp_list_append(list, (yp_list_node_t *) diagnostic);
|
11
|
+
return true;
|
12
|
+
}
|
13
|
+
|
14
|
+
// Deallocate the internal state of the given diagnostic list.
|
15
|
+
void
|
16
|
+
yp_diagnostic_list_free(yp_list_t *list) {
|
17
|
+
yp_list_node_t *node, *next;
|
18
|
+
|
19
|
+
for (node = list->head; node != NULL; node = next) {
|
20
|
+
next = node->next;
|
21
|
+
|
22
|
+
yp_diagnostic_t *diagnostic = (yp_diagnostic_t *) node;
|
23
|
+
free(diagnostic);
|
24
|
+
}
|
25
|
+
}
|
data/src/enc/yp_big5.c
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#include "yarp/enc/yp_encoding.h"
|
2
|
+
|
3
|
+
typedef uint16_t yp_big5_codepoint_t;
|
4
|
+
|
5
|
+
static yp_big5_codepoint_t
|
6
|
+
yp_big5_codepoint(const char *c, ptrdiff_t n, size_t *width) {
|
7
|
+
const unsigned char *uc = (const unsigned char *) c;
|
8
|
+
|
9
|
+
// These are the single byte characters.
|
10
|
+
if (*uc < 0x80) {
|
11
|
+
*width = 1;
|
12
|
+
return *uc;
|
13
|
+
}
|
14
|
+
|
15
|
+
// These are the double byte characters.
|
16
|
+
if ((n > 1) && (uc[0] >= 0xA1 && uc[0] <= 0xFE) && (uc[1] >= 0x40 && uc[1] <= 0xFE)) {
|
17
|
+
*width = 2;
|
18
|
+
return (yp_big5_codepoint_t) (uc[0] << 8 | uc[1]);
|
19
|
+
}
|
20
|
+
|
21
|
+
*width = 0;
|
22
|
+
return 0;
|
23
|
+
}
|
24
|
+
|
25
|
+
static size_t
yp_encoding_big5_char_width(const char *c, ptrdiff_t n) {
    // Only the width reported by the decoder is of interest here.
    size_t result;
    (void) yp_big5_codepoint(c, n, &result);
    return result;
}
|
32
|
+
|
33
|
+
static size_t
|
34
|
+
yp_encoding_big5_alpha_char(const char *c, ptrdiff_t n) {
|
35
|
+
size_t width;
|
36
|
+
yp_big5_codepoint_t codepoint = yp_big5_codepoint(c, n, &width);
|
37
|
+
|
38
|
+
if (width == 1) {
|
39
|
+
const char value = (const char) codepoint;
|
40
|
+
return yp_encoding_ascii_alpha_char(&value, n);
|
41
|
+
} else {
|
42
|
+
return 0;
|
43
|
+
}
|
44
|
+
}
|
45
|
+
|
46
|
+
static size_t
|
47
|
+
yp_encoding_big5_alnum_char(const char *c, ptrdiff_t n) {
|
48
|
+
size_t width;
|
49
|
+
yp_big5_codepoint_t codepoint = yp_big5_codepoint(c, n, &width);
|
50
|
+
|
51
|
+
if (width == 1) {
|
52
|
+
const char value = (const char) codepoint;
|
53
|
+
return yp_encoding_ascii_alnum_char(&value, n);
|
54
|
+
} else {
|
55
|
+
return 0;
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
static bool
|
60
|
+
yp_encoding_big5_isupper_char(const char *c, ptrdiff_t n) {
|
61
|
+
size_t width;
|
62
|
+
yp_big5_codepoint_t codepoint = yp_big5_codepoint(c, n, &width);
|
63
|
+
|
64
|
+
if (width == 1) {
|
65
|
+
const char value = (const char) codepoint;
|
66
|
+
return yp_encoding_ascii_isupper_char(&value, n);
|
67
|
+
} else {
|
68
|
+
return false;
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
yp_encoding_t yp_encoding_big5 = {
|
73
|
+
.name = "big5",
|
74
|
+
.char_width = yp_encoding_big5_char_width,
|
75
|
+
.alnum_char = yp_encoding_big5_alnum_char,
|
76
|
+
.alpha_char = yp_encoding_big5_alpha_char,
|
77
|
+
.isupper_char = yp_encoding_big5_isupper_char,
|
78
|
+
.multibyte = true
|
79
|
+
};
|
data/src/enc/yp_euc_jp.c
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
#include "yarp/enc/yp_encoding.h"
|
2
|
+
|
3
|
+
typedef uint16_t yp_euc_jp_codepoint_t;
|
4
|
+
|
5
|
+
static yp_euc_jp_codepoint_t
|
6
|
+
yp_euc_jp_codepoint(const char *c, ptrdiff_t n, size_t *width) {
|
7
|
+
const unsigned char *uc = (const unsigned char *) c;
|
8
|
+
|
9
|
+
// These are the single byte characters.
|
10
|
+
if (*uc < 0x80) {
|
11
|
+
*width = 1;
|
12
|
+
return *uc;
|
13
|
+
}
|
14
|
+
|
15
|
+
// These are the double byte characters.
|
16
|
+
if (
|
17
|
+
(n > 1) &&
|
18
|
+
(
|
19
|
+
((uc[0] == 0x8E) && (uc[1] >= 0xA1 && uc[1] <= 0xFE)) ||
|
20
|
+
((uc[0] >= 0xA1 && uc[0] <= 0xFE) && (uc[1] >= 0xA1 && uc[1] <= 0xFE))
|
21
|
+
)
|
22
|
+
) {
|
23
|
+
*width = 2;
|
24
|
+
return (yp_euc_jp_codepoint_t) (uc[0] << 8 | uc[1]);
|
25
|
+
}
|
26
|
+
|
27
|
+
*width = 0;
|
28
|
+
return 0;
|
29
|
+
}
|
30
|
+
|
31
|
+
static size_t
yp_encoding_euc_jp_char_width(const char *c, ptrdiff_t n) {
    // Only the width reported by the decoder is of interest here.
    size_t result;
    (void) yp_euc_jp_codepoint(c, n, &result);
    return result;
}
|
38
|
+
|
39
|
+
static size_t
|
40
|
+
yp_encoding_euc_jp_alpha_char(const char *c, ptrdiff_t n) {
|
41
|
+
size_t width;
|
42
|
+
yp_euc_jp_codepoint_t codepoint = yp_euc_jp_codepoint(c, n, &width);
|
43
|
+
|
44
|
+
if (width == 1) {
|
45
|
+
const char value = (const char) codepoint;
|
46
|
+
return yp_encoding_ascii_alpha_char(&value, n);
|
47
|
+
} else {
|
48
|
+
return 0;
|
49
|
+
}
|
50
|
+
}
|
51
|
+
|
52
|
+
static size_t
|
53
|
+
yp_encoding_euc_jp_alnum_char(const char *c, ptrdiff_t n) {
|
54
|
+
size_t width;
|
55
|
+
yp_euc_jp_codepoint_t codepoint = yp_euc_jp_codepoint(c, n, &width);
|
56
|
+
|
57
|
+
if (width == 1) {
|
58
|
+
const char value = (const char) codepoint;
|
59
|
+
return yp_encoding_ascii_alnum_char(&value, n);
|
60
|
+
} else {
|
61
|
+
return 0;
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
65
|
+
static bool
|
66
|
+
yp_encoding_euc_jp_isupper_char(const char *c, ptrdiff_t n) {
|
67
|
+
size_t width;
|
68
|
+
yp_euc_jp_codepoint_t codepoint = yp_euc_jp_codepoint(c, n, &width);
|
69
|
+
|
70
|
+
if (width == 1) {
|
71
|
+
const char value = (const char) codepoint;
|
72
|
+
return yp_encoding_ascii_isupper_char(&value, n);
|
73
|
+
} else {
|
74
|
+
return 0;
|
75
|
+
}
|
76
|
+
}
|
77
|
+
|
78
|
+
yp_encoding_t yp_encoding_euc_jp = {
|
79
|
+
.name = "euc-jp",
|
80
|
+
.char_width = yp_encoding_euc_jp_char_width,
|
81
|
+
.alnum_char = yp_encoding_euc_jp_alnum_char,
|
82
|
+
.alpha_char = yp_encoding_euc_jp_alpha_char,
|
83
|
+
.isupper_char = yp_encoding_euc_jp_isupper_char,
|
84
|
+
.multibyte = true
|
85
|
+
};
|
data/src/enc/yp_gbk.c
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
#include "yarp/enc/yp_encoding.h"
|
2
|
+
|
3
|
+
typedef uint16_t yp_gbk_codepoint_t;
|
4
|
+
|
5
|
+
static yp_gbk_codepoint_t
|
6
|
+
yp_gbk_codepoint(const char *c, ptrdiff_t n, size_t *width) {
|
7
|
+
const unsigned char *uc = (const unsigned char *) c;
|
8
|
+
|
9
|
+
// These are the single byte characters.
|
10
|
+
if (*uc < 0x80) {
|
11
|
+
*width = 1;
|
12
|
+
return *uc;
|
13
|
+
}
|
14
|
+
|
15
|
+
// These are the double byte characters.
|
16
|
+
if (
|
17
|
+
(n > 1) &&
|
18
|
+
(
|
19
|
+
((uc[0] >= 0xA1 && uc[0] <= 0xA9) && (uc[1] >= 0xA1 && uc[1] <= 0xFE)) || // GBK/1
|
20
|
+
((uc[0] >= 0xB0 && uc[0] <= 0xF7) && (uc[1] >= 0xA1 && uc[1] <= 0xFE)) || // GBK/2
|
21
|
+
((uc[0] >= 0x81 && uc[0] <= 0xA0) && (uc[1] >= 0x40 && uc[1] <= 0xFE) && (uc[1] != 0x7F)) || // GBK/3
|
22
|
+
((uc[0] >= 0xAA && uc[0] <= 0xFE) && (uc[1] >= 0x40 && uc[1] <= 0xA0) && (uc[1] != 0x7F)) || // GBK/4
|
23
|
+
((uc[0] >= 0xA8 && uc[0] <= 0xA9) && (uc[1] >= 0x40 && uc[1] <= 0xA0) && (uc[1] != 0x7F)) // GBK/5
|
24
|
+
)
|
25
|
+
) {
|
26
|
+
*width = 2;
|
27
|
+
return (yp_gbk_codepoint_t) (uc[0] << 8 | uc[1]);
|
28
|
+
}
|
29
|
+
|
30
|
+
*width = 0;
|
31
|
+
return 0;
|
32
|
+
}
|
33
|
+
|
34
|
+
static size_t
yp_encoding_gbk_char_width(const char *c, ptrdiff_t n) {
    // Only the width reported by the decoder is of interest here.
    size_t result;
    (void) yp_gbk_codepoint(c, n, &result);
    return result;
}
|
41
|
+
|
42
|
+
static size_t
|
43
|
+
yp_encoding_gbk_alpha_char(const char *c, ptrdiff_t n) {
|
44
|
+
size_t width;
|
45
|
+
yp_gbk_codepoint_t codepoint = yp_gbk_codepoint(c, n, &width);
|
46
|
+
|
47
|
+
if (width == 1) {
|
48
|
+
const char value = (const char) codepoint;
|
49
|
+
return yp_encoding_ascii_alpha_char(&value, n);
|
50
|
+
} else {
|
51
|
+
return 0;
|
52
|
+
}
|
53
|
+
}
|
54
|
+
|
55
|
+
static size_t
|
56
|
+
yp_encoding_gbk_alnum_char(const char *c, ptrdiff_t n) {
|
57
|
+
size_t width;
|
58
|
+
yp_gbk_codepoint_t codepoint = yp_gbk_codepoint(c, n, &width);
|
59
|
+
|
60
|
+
if (width == 1) {
|
61
|
+
const char value = (const char) codepoint;
|
62
|
+
return yp_encoding_ascii_alnum_char(&value, n);
|
63
|
+
} else {
|
64
|
+
return 0;
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
static bool
|
69
|
+
yp_encoding_gbk_isupper_char(const char *c, ptrdiff_t n) {
|
70
|
+
size_t width;
|
71
|
+
yp_gbk_codepoint_t codepoint = yp_gbk_codepoint(c, n, &width);
|
72
|
+
|
73
|
+
if (width == 1) {
|
74
|
+
const char value = (const char) codepoint;
|
75
|
+
return yp_encoding_ascii_isupper_char(&value, n);
|
76
|
+
} else {
|
77
|
+
return false;
|
78
|
+
}
|
79
|
+
}
|
80
|
+
|
81
|
+
yp_encoding_t yp_encoding_gbk = {
|
82
|
+
.name = "gbk",
|
83
|
+
.char_width = yp_encoding_gbk_char_width,
|
84
|
+
.alnum_char = yp_encoding_gbk_alnum_char,
|
85
|
+
.alpha_char = yp_encoding_gbk_alpha_char,
|
86
|
+
.isupper_char = yp_encoding_gbk_isupper_char,
|
87
|
+
.multibyte = true
|
88
|
+
};
|
@@ -0,0 +1,83 @@
|
|
1
|
+
#include "yarp/enc/yp_encoding.h"
|
2
|
+
|
3
|
+
typedef uint16_t yp_shift_jis_codepoint_t;
|
4
|
+
|
5
|
+
static yp_shift_jis_codepoint_t
|
6
|
+
yp_shift_jis_codepoint(const char *c, ptrdiff_t n, size_t *width) {
|
7
|
+
const unsigned char *uc = (const unsigned char *) c;
|
8
|
+
|
9
|
+
// These are the single byte characters.
|
10
|
+
if (*uc < 0x80 || (*uc >= 0xA1 && *uc <= 0xDF)) {
|
11
|
+
*width = 1;
|
12
|
+
return *uc;
|
13
|
+
}
|
14
|
+
|
15
|
+
// These are the double byte characters.
|
16
|
+
if (
|
17
|
+
(n > 1) &&
|
18
|
+
((uc[0] >= 0x81 && uc[0] <= 0x9F) || (uc[0] >= 0xE0 && uc[0] <= 0xFC)) &&
|
19
|
+
(uc[1] >= 0x40 && uc[1] <= 0xFC)
|
20
|
+
) {
|
21
|
+
*width = 2;
|
22
|
+
return (yp_shift_jis_codepoint_t) (uc[0] << 8 | uc[1]);
|
23
|
+
}
|
24
|
+
|
25
|
+
*width = 0;
|
26
|
+
return 0;
|
27
|
+
}
|
28
|
+
|
29
|
+
static size_t
yp_encoding_shift_jis_char_width(const char *c, ptrdiff_t n) {
    // Only the width reported by the decoder is of interest here.
    size_t result;
    (void) yp_shift_jis_codepoint(c, n, &result);
    return result;
}
|
36
|
+
|
37
|
+
static size_t
|
38
|
+
yp_encoding_shift_jis_alpha_char(const char *c, ptrdiff_t n) {
|
39
|
+
size_t width;
|
40
|
+
yp_shift_jis_codepoint_t codepoint = yp_shift_jis_codepoint(c, n, &width);
|
41
|
+
|
42
|
+
if (width == 1) {
|
43
|
+
const char value = (const char) codepoint;
|
44
|
+
return yp_encoding_ascii_alpha_char(&value, n);
|
45
|
+
} else {
|
46
|
+
return 0;
|
47
|
+
}
|
48
|
+
}
|
49
|
+
|
50
|
+
static size_t
|
51
|
+
yp_encoding_shift_jis_alnum_char(const char *c, ptrdiff_t n) {
|
52
|
+
size_t width;
|
53
|
+
yp_shift_jis_codepoint_t codepoint = yp_shift_jis_codepoint(c, n, &width);
|
54
|
+
|
55
|
+
if (width == 1) {
|
56
|
+
const char value = (const char) codepoint;
|
57
|
+
return yp_encoding_ascii_alnum_char(&value, n);
|
58
|
+
} else {
|
59
|
+
return 0;
|
60
|
+
}
|
61
|
+
}
|
62
|
+
|
63
|
+
static bool
|
64
|
+
yp_encoding_shift_jis_isupper_char(const char *c, ptrdiff_t n) {
|
65
|
+
size_t width;
|
66
|
+
yp_shift_jis_codepoint_t codepoint = yp_shift_jis_codepoint(c, n, &width);
|
67
|
+
|
68
|
+
if (width == 1) {
|
69
|
+
const char value = (const char) codepoint;
|
70
|
+
return yp_encoding_ascii_isupper_char(&value, n);
|
71
|
+
} else {
|
72
|
+
return 0;
|
73
|
+
}
|
74
|
+
}
|
75
|
+
|
76
|
+
yp_encoding_t yp_encoding_shift_jis = {
|
77
|
+
.name = "shift_jis",
|
78
|
+
.char_width = yp_encoding_shift_jis_char_width,
|
79
|
+
.alnum_char = yp_encoding_shift_jis_alnum_char,
|
80
|
+
.alpha_char = yp_encoding_shift_jis_alpha_char,
|
81
|
+
.isupper_char = yp_encoding_shift_jis_isupper_char,
|
82
|
+
.multibyte = true
|
83
|
+
};
|