yarp 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +51 -0
- data/LICENSE.md +7 -0
- data/Makefile.in +79 -0
- data/README.md +86 -0
- data/config.h.in +25 -0
- data/config.yml +2147 -0
- data/configure +4487 -0
- data/docs/build_system.md +85 -0
- data/docs/building.md +26 -0
- data/docs/configuration.md +56 -0
- data/docs/design.md +53 -0
- data/docs/encoding.md +116 -0
- data/docs/extension.md +20 -0
- data/docs/fuzzing.md +93 -0
- data/docs/heredocs.md +36 -0
- data/docs/mapping.md +117 -0
- data/docs/ripper.md +36 -0
- data/docs/serialization.md +130 -0
- data/docs/testing.md +55 -0
- data/ext/yarp/api_node.c +3680 -0
- data/ext/yarp/api_pack.c +256 -0
- data/ext/yarp/extconf.rb +131 -0
- data/ext/yarp/extension.c +547 -0
- data/ext/yarp/extension.h +18 -0
- data/include/yarp/ast.h +1412 -0
- data/include/yarp/defines.h +54 -0
- data/include/yarp/diagnostic.h +24 -0
- data/include/yarp/enc/yp_encoding.h +94 -0
- data/include/yarp/node.h +36 -0
- data/include/yarp/pack.h +141 -0
- data/include/yarp/parser.h +389 -0
- data/include/yarp/regexp.h +19 -0
- data/include/yarp/unescape.h +42 -0
- data/include/yarp/util/yp_buffer.h +39 -0
- data/include/yarp/util/yp_char.h +75 -0
- data/include/yarp/util/yp_constant_pool.h +64 -0
- data/include/yarp/util/yp_list.h +67 -0
- data/include/yarp/util/yp_memchr.h +14 -0
- data/include/yarp/util/yp_newline_list.h +54 -0
- data/include/yarp/util/yp_state_stack.h +24 -0
- data/include/yarp/util/yp_string.h +57 -0
- data/include/yarp/util/yp_string_list.h +28 -0
- data/include/yarp/util/yp_strpbrk.h +29 -0
- data/include/yarp/version.h +5 -0
- data/include/yarp.h +69 -0
- data/lib/yarp/lex_compat.rb +759 -0
- data/lib/yarp/node.rb +7428 -0
- data/lib/yarp/pack.rb +185 -0
- data/lib/yarp/ripper_compat.rb +174 -0
- data/lib/yarp/serialize.rb +389 -0
- data/lib/yarp.rb +330 -0
- data/src/diagnostic.c +25 -0
- data/src/enc/yp_big5.c +79 -0
- data/src/enc/yp_euc_jp.c +85 -0
- data/src/enc/yp_gbk.c +88 -0
- data/src/enc/yp_shift_jis.c +83 -0
- data/src/enc/yp_tables.c +509 -0
- data/src/enc/yp_unicode.c +2320 -0
- data/src/enc/yp_windows_31j.c +83 -0
- data/src/node.c +2011 -0
- data/src/pack.c +493 -0
- data/src/prettyprint.c +1782 -0
- data/src/regexp.c +580 -0
- data/src/serialize.c +1576 -0
- data/src/token_type.c +347 -0
- data/src/unescape.c +576 -0
- data/src/util/yp_buffer.c +78 -0
- data/src/util/yp_char.c +229 -0
- data/src/util/yp_constant_pool.c +147 -0
- data/src/util/yp_list.c +50 -0
- data/src/util/yp_memchr.c +31 -0
- data/src/util/yp_newline_list.c +119 -0
- data/src/util/yp_state_stack.c +25 -0
- data/src/util/yp_string.c +207 -0
- data/src/util/yp_string_list.c +32 -0
- data/src/util/yp_strncasecmp.c +20 -0
- data/src/util/yp_strpbrk.c +66 -0
- data/src/yarp.c +13211 -0
- data/yarp.gemspec +100 -0
- metadata +125 -0
data/lib/yarp.rb
ADDED
@@ -0,0 +1,330 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module YARP
|
4
|
+
# Wraps a string of Ruby source code that has been parsed, together with a list
# of byte offsets. Location objects delegate to it to resolve line numbers,
# columns and source slices.
class Source
  attr_reader :source, :offsets

  # source  - the source code string.
  # offsets - an array of byte offsets. `line` binary-searches it, so it is
  #           expected to be sorted ascending (presumably line-start offsets;
  #           confirm against whatever builds this object).
  def initialize(source, offsets)
    @source, @offsets = source, offsets
  end

  # Returns the `length` bytes of the source starting at byte `offset`.
  def slice(offset, length)
    source.byteslice(offset, length)
  end

  # Returns the index of the first offset strictly greater than `value`, or
  # the total number of offsets when there is none.
  def line(value)
    index = offsets.bsearch_index { |candidate| candidate > value }
    index.nil? ? offsets.length : index
  end

  # Returns the distance in bytes between `value` and the offset recorded for
  # the line that `value` falls on.
  def column(value)
    line_start = offsets[line(value) - 1]
    value - line_start
  end
end
|
27
|
+
|
28
|
+
# This represents a location in the source: a byte range described by a start
# offset and a length, plus the Source object needed to resolve lines, columns
# and the covered text.
class Location
  # A Source object that is used to determine more information from the given
  # offset and length.
  private attr_reader :source

  # The byte offset from the beginning of the source where this location
  # starts.
  attr_reader :start_offset

  # The length of this location in bytes.
  attr_reader :length

  # source       - the Source used to resolve lines, columns and slices (may
  #                be nil for a location that is never resolved, see ::null).
  # start_offset - byte offset where the location starts.
  # length       - length of the location in bytes.
  def initialize(source, start_offset, length)
    @source = source
    @start_offset = start_offset
    @length = length
  end

  def inspect
    "#<YARP::Location @start_offset=#{@start_offset} @length=#{@length}>"
  end

  # The source code that this location represents.
  def slice
    source.slice(start_offset, length)
  end

  # The byte offset from the beginning of the source where this location ends.
  def end_offset
    start_offset + length
  end

  # The line number where this location starts.
  def start_line
    source.line(start_offset)
  end

  # The line number where this location ends. This is resolved from the last
  # byte covered by the location (end_offset - 1).
  def end_line
    source.line(end_offset - 1)
  end

  # The column number in bytes where this location starts from the start of
  # the line.
  def start_column
    source.column(start_offset)
  end

  # The column number in bytes where this location ends from the start of the
  # line. This is resolved from the last byte covered by the location
  # (end_offset - 1).
  def end_column
    source.column(end_offset - 1)
  end

  # Only the offsets take part in pattern matching; the `keys` argument is
  # ignored and the full hash is always returned.
  def deconstruct_keys(keys)
    { start_offset: start_offset, end_offset: end_offset }
  end

  def pretty_print(q)
    q.text("(#{start_offset}...#{end_offset})")
  end

  # Two locations are equal when they cover the same byte range. The source
  # object is not compared.
  def ==(other)
    other.is_a?(Location) &&
      other.start_offset == start_offset &&
      other.end_offset == end_offset
  end

  # Returns an empty location at offset 0 that is not attached to any source.
  # The constructor takes (source, start_offset, length), so the source has to
  # be passed explicitly; calling new with only two arguments raised
  # ArgumentError. Methods that need the source (slice, start_line, ...)
  # cannot be used on the returned object.
  def self.null
    new(nil, 0, 0)
  end
end
|
101
|
+
|
102
|
+
# A comment that was encountered during parsing, described by its type and the
# location it occupies.
class Comment
  attr_reader :type, :location

  # type     - the kind of comment (value is stored as given).
  # location - where the comment was found.
  def initialize(type, location)
    @type, @location = type, location
  end

  # Supports hash pattern matching. The `keys` argument is ignored and both
  # attributes are always returned.
  def deconstruct_keys(keys)
    { type: type, location: location }
  end
end
|
115
|
+
|
116
|
+
# An error that was encountered during parsing: a message and the location it
# refers to.
class ParseError
  attr_reader :message, :location

  # message  - the error message.
  # location - where the error was found.
  def initialize(message, location)
    @message, @location = message, location
  end

  # Supports hash pattern matching. The `keys` argument is ignored and both
  # attributes are always returned.
  def deconstruct_keys(keys)
    { message: message, location: location }
  end
end
|
129
|
+
|
130
|
+
# A warning that was encountered during parsing: a message and the location it
# refers to.
class ParseWarning
  attr_reader :message, :location

  # message  - the warning message.
  # location - where the warning was found.
  def initialize(message, location)
    @message, @location = message, location
  end

  # Supports hash pattern matching. The `keys` argument is ignored and both
  # attributes are always returned.
  def deconstruct_keys(keys)
    { message: message, location: location }
  end
end
|
143
|
+
|
144
|
+
# A class that knows how to walk down the tree. It deliberately defines none of
# the individual visit methods, so consumers must implement each one they
# need. For a default implementation that continues walking the tree, see the
# Visitor class.
class BasicVisitor
  # Dispatches to node.accept(self). A nil node is skipped and nil is
  # returned.
  def visit(node)
    node&.accept(self)
  end

  # Visits every entry of `nodes` in order and returns the collected results.
  def visit_all(nodes)
    nodes.map { |child| visit(child) }
  end

  # Visits everything returned by node.child_nodes.
  def visit_child_nodes(node)
    children = node.child_nodes
    visit_all(children)
  end
end
|
161
|
+
|
162
|
+
# Subclass of BasicVisitor. Its body is empty here; presumably the per-node
# visit methods are added elsewhere (not visible in this file).
class Visitor < BasicVisitor; end
|
164
|
+
|
165
|
+
# This represents the result of a call to ::parse or ::parse_file. It contains
# the AST, any comments that were encountered, and any errors and warnings
# that were encountered, along with the source that was parsed.
class ParseResult
  attr_reader :value, :comments, :errors, :warnings, :source

  def initialize(value, comments, errors, warnings, source)
    @value = value
    @comments = comments
    @errors = errors
    @warnings = warnings
    @source = source
  end

  # Supports hash pattern matching. The `keys` argument is ignored; note that
  # :source is not part of the returned hash.
  def deconstruct_keys(keys)
    { value: value, comments: comments, errors: errors, warnings: warnings }
  end

  # True when no errors were recorded.
  def success?
    errors.empty?
  end

  # True when at least one error was recorded.
  def failure?
    !success?
  end

  # Keep in sync with Java MarkNewlinesVisitor
  #
  # Walks the tree and asks nodes to set their newline flag. The flags array
  # is indexed by line number and remembers which lines already have a
  # flagged node (see Node#set_newline_flag).
  class MarkNewlinesVisitor < YARP::Visitor
    def initialize(newline_marked)
      @newline_marked = newline_marked
    end

    # Blocks (and lambdas, via the alias below) get a fresh, all-false flags
    # array of the same size while their children are visited; the outer
    # array is restored afterwards even if visiting raises.
    def visit_block_node(node)
      outer_flags = @newline_marked
      @newline_marked = Array.new(outer_flags.size, false)
      begin
        super(node)
      ensure
        @newline_marked = outer_flags
      end
    end
    alias_method :visit_lambda_node, :visit_block_node

    # The node itself is offered the flag before its children are visited.
    # `unless` nodes share this behaviour via the alias below.
    def visit_if_node(node)
      node.set_newline_flag(@newline_marked)
      super(node)
    end
    alias_method :visit_unless_node, :visit_if_node

    # Every statement in the body is offered the flag before the walk
    # continues.
    def visit_statements_node(node)
      node.body.each { |statement| statement.set_newline_flag(@newline_marked) }
      super(node)
    end
  end
  private_constant :MarkNewlinesVisitor

  # Runs MarkNewlinesVisitor over the parsed value, starting from an
  # all-false flags array with one more slot than there are source offsets,
  # and returns the value.
  def mark_newlines
    flags = Array.new(1 + @source.offsets.size, false)
    value.accept(MarkNewlinesVisitor.new(flags))
    value
  end
end
|
230
|
+
|
231
|
+
# A token from the Ruby source: its type, its value, and the location it
# occupies.
class Token
  attr_reader :type, :value, :location

  def initialize(type, value, location)
    @type, @value, @location = type, value, location
  end

  # Supports hash pattern matching. The `keys` argument is ignored and all
  # three attributes are always returned.
  def deconstruct_keys(keys)
    { type: type, value: value, location: location }
  end

  # Prints the token as TYPE, then the location, then the value in
  # parentheses.
  def pretty_print(q)
    q.group do
      q.text(type.to_s)
      location.pretty_print(q)
      q.text("(")
      q.nest(2) do
        q.breakable("")
        q.pp(value)
      end
      q.breakable("")
      q.text(")")
    end
  end

  # Tokens are equal when both type and value match. The location is not
  # compared.
  def ==(other)
    return false unless other.is_a?(Token)

    other.type == type && other.value == value
  end
end
|
265
|
+
|
266
|
+
# This represents a node in the tree. It provides the location reader, the
# newline flag handling and pretty printing; deconstruct_keys, which
# pretty_print relies on, is not defined in this class.
class Node
  attr_reader :location

  # Whether this node has been flagged by set_newline_flag.
  def newline?
    !!@newline
  end

  # Flags this node unless another node already claimed the line it starts
  # on. `newline_marked` is indexed by line number and is updated in place.
  def set_newline_flag(newline_marked)
    line = location.start_line
    return if newline_marked[line]

    newline_marked[line] = true
    @newline = true
  end

  # Prints the unqualified class name, the location, an optional line marker
  # for flagged nodes, and then every deconstructed value except :location.
  def pretty_print(q)
    q.group do
      q.text(self.class.name.split("::").last)
      location.pretty_print(q)
      q.text("[Li:#{location.start_line}]") if newline?
      q.text("(")
      q.nest(2) do
        fields = deconstruct_keys([])
        fields.delete(:location)

        q.breakable("")
        q.seplist(fields, -> { q.comma_breakable }, :each_value) { |field| q.pp(field) }
      end
      q.breakable("")
      q.text(")")
    end
  end
end
|
300
|
+
|
301
|
+
# Load the serialized AST into a tree, using the source as a reference. This
# simply forwards both arguments to Serialize.load.
def self.load(source, serialized)
  Serialize.load(source, serialized)
end
|
305
|
+
|
306
|
+
# This module is used for testing and debugging and is not meant to be used by
# consumers of this library.
module Debug
  # Parses `source` and returns the offsets recorded on the resulting Source.
  def self.newlines(source)
    result = YARP.parse(source)
    result.source.offsets
  end

  # Calls parse_serialize_file_metadata with a packed metadata string made of
  # the filepath's byte size ("L"), its bytes ("A*") and a trailing 0 ("L").
  # NOTE(review): the meaning of the trailing 0 is not visible here; confirm
  # against the native extension.
  def self.parse_serialize_file(filepath)
    metadata = [filepath.bytesize, filepath.b, 0].pack("LA*L")
    parse_serialize_file_metadata(filepath, metadata)
  end
end

# Marking this as private so that consumers don't see it. It makes it a little
# annoying for testing since you have to const_get it to access the methods,
# but at least this way it's clear it's not meant for consumers.
private_constant :Debug
|
322
|
+
end
|
323
|
+
|
324
|
+
require_relative "yarp/lex_compat"
|
325
|
+
require_relative "yarp/node"
|
326
|
+
require_relative "yarp/ripper_compat"
|
327
|
+
require_relative "yarp/serialize"
|
328
|
+
require_relative "yarp/pack"
|
329
|
+
|
330
|
+
require "yarp/yarp"
|
data/src/diagnostic.c
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
#include "yarp/diagnostic.h"
|
2
|
+
|
3
|
+
// Append an error to the given list of diagnostic.
|
4
|
+
bool
|
5
|
+
yp_diagnostic_list_append(yp_list_t *list, const char *start, const char *end, const char *message) {
|
6
|
+
yp_diagnostic_t *diagnostic = (yp_diagnostic_t *) malloc(sizeof(yp_diagnostic_t));
|
7
|
+
if (diagnostic == NULL) return false;
|
8
|
+
|
9
|
+
*diagnostic = (yp_diagnostic_t) { .start = start, .end = end, .message = message };
|
10
|
+
yp_list_append(list, (yp_list_node_t *) diagnostic);
|
11
|
+
return true;
|
12
|
+
}
|
13
|
+
|
14
|
+
// Deallocate the internal state of the given diagnostic list.
|
15
|
+
void
|
16
|
+
yp_diagnostic_list_free(yp_list_t *list) {
|
17
|
+
yp_list_node_t *node, *next;
|
18
|
+
|
19
|
+
for (node = list->head; node != NULL; node = next) {
|
20
|
+
next = node->next;
|
21
|
+
|
22
|
+
yp_diagnostic_t *diagnostic = (yp_diagnostic_t *) node;
|
23
|
+
free(diagnostic);
|
24
|
+
}
|
25
|
+
}
|
data/src/enc/yp_big5.c
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#include "yarp/enc/yp_encoding.h"
|
2
|
+
|
3
|
+
typedef uint16_t yp_big5_codepoint_t;

// Decode the Big5 character that starts at c, looking at no more than n bytes
// for the second byte. On success *width receives the number of bytes used
// (1 or 2) and the return value is the byte itself, or the two bytes packed
// as (lead << 8) | trail. When the bytes match neither form, *width is set to
// 0 and 0 is returned.
// NOTE(review): the first byte is read without consulting n, so callers
// presumably guarantee at least one readable byte -- confirm.
static yp_big5_codepoint_t
yp_big5_codepoint(const char *c, ptrdiff_t n, size_t *width) {
    const unsigned char *bytes = (const unsigned char *) c;
    const unsigned char lead = bytes[0];

    // Single byte characters.
    if (lead < 0x80) {
        *width = 1;
        return lead;
    }

    // Double byte characters: the trail byte is only read when it exists.
    if (n > 1 && lead >= 0xA1 && lead <= 0xFE) {
        const unsigned char trail = bytes[1];

        if (trail >= 0x40 && trail <= 0xFE) {
            *width = 2;
            return (yp_big5_codepoint_t) ((lead << 8) | trail);
        }
    }

    *width = 0;
    return 0;
}
|
24
|
+
|
25
|
+
// Report the width in bytes that yp_big5_codepoint determines for the
// character at c (0 when it does not recognise the bytes).
static size_t
yp_encoding_big5_char_width(const char *c, ptrdiff_t n) {
    size_t byte_count;

    // Only the width is of interest; the decoded value is discarded.
    (void) yp_big5_codepoint(c, n, &byte_count);
    return byte_count;
}
|
32
|
+
|
33
|
+
static size_t
|
34
|
+
yp_encoding_big5_alpha_char(const char *c, ptrdiff_t n) {
|
35
|
+
size_t width;
|
36
|
+
yp_big5_codepoint_t codepoint = yp_big5_codepoint(c, n, &width);
|
37
|
+
|
38
|
+
if (width == 1) {
|
39
|
+
const char value = (const char) codepoint;
|
40
|
+
return yp_encoding_ascii_alpha_char(&value, n);
|
41
|
+
} else {
|
42
|
+
return 0;
|
43
|
+
}
|
44
|
+
}
|
45
|
+
|
46
|
+
static size_t
|
47
|
+
yp_encoding_big5_alnum_char(const char *c, ptrdiff_t n) {
|
48
|
+
size_t width;
|
49
|
+
yp_big5_codepoint_t codepoint = yp_big5_codepoint(c, n, &width);
|
50
|
+
|
51
|
+
if (width == 1) {
|
52
|
+
const char value = (const char) codepoint;
|
53
|
+
return yp_encoding_ascii_alnum_char(&value, n);
|
54
|
+
} else {
|
55
|
+
return 0;
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
static bool
|
60
|
+
yp_encoding_big5_isupper_char(const char *c, ptrdiff_t n) {
|
61
|
+
size_t width;
|
62
|
+
yp_big5_codepoint_t codepoint = yp_big5_codepoint(c, n, &width);
|
63
|
+
|
64
|
+
if (width == 1) {
|
65
|
+
const char value = (const char) codepoint;
|
66
|
+
return yp_encoding_ascii_isupper_char(&value, n);
|
67
|
+
} else {
|
68
|
+
return false;
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
yp_encoding_t yp_encoding_big5 = {
|
73
|
+
.name = "big5",
|
74
|
+
.char_width = yp_encoding_big5_char_width,
|
75
|
+
.alnum_char = yp_encoding_big5_alnum_char,
|
76
|
+
.alpha_char = yp_encoding_big5_alpha_char,
|
77
|
+
.isupper_char = yp_encoding_big5_isupper_char,
|
78
|
+
.multibyte = true
|
79
|
+
};
|
data/src/enc/yp_euc_jp.c
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
#include "yarp/enc/yp_encoding.h"
|
2
|
+
|
3
|
+
typedef uint16_t yp_euc_jp_codepoint_t;

// Decode the EUC-JP character that starts at c, looking at no more than n
// bytes for the second byte. On success *width receives the number of bytes
// used (1 or 2) and the return value is the byte itself, or the two bytes
// packed as (lead << 8) | trail. When the bytes match neither form, *width is
// set to 0 and 0 is returned.
// NOTE(review): the first byte is read without consulting n, so callers
// presumably guarantee at least one readable byte -- confirm.
static yp_euc_jp_codepoint_t
yp_euc_jp_codepoint(const char *c, ptrdiff_t n, size_t *width) {
    const unsigned char *bytes = (const unsigned char *) c;
    const unsigned char lead = bytes[0];

    // Single byte characters.
    if (lead < 0x80) {
        *width = 1;
        return lead;
    }

    // Double byte characters: the lead byte is either 0x8E or within
    // 0xA1..0xFE, and in both cases the trail byte must be within 0xA1..0xFE.
    if (n > 1) {
        const unsigned char trail = bytes[1];
        const bool lead_ok = (lead == 0x8E) || (lead >= 0xA1 && lead <= 0xFE);
        const bool trail_ok = (trail >= 0xA1 && trail <= 0xFE);

        if (lead_ok && trail_ok) {
            *width = 2;
            return (yp_euc_jp_codepoint_t) ((lead << 8) | trail);
        }
    }

    *width = 0;
    return 0;
}
|
30
|
+
|
31
|
+
// Report the width in bytes that yp_euc_jp_codepoint determines for the
// character at c (0 when it does not recognise the bytes).
static size_t
yp_encoding_euc_jp_char_width(const char *c, ptrdiff_t n) {
    size_t byte_count;

    // Only the width is of interest; the decoded value is discarded.
    (void) yp_euc_jp_codepoint(c, n, &byte_count);
    return byte_count;
}
|
38
|
+
|
39
|
+
static size_t
|
40
|
+
yp_encoding_euc_jp_alpha_char(const char *c, ptrdiff_t n) {
|
41
|
+
size_t width;
|
42
|
+
yp_euc_jp_codepoint_t codepoint = yp_euc_jp_codepoint(c, n, &width);
|
43
|
+
|
44
|
+
if (width == 1) {
|
45
|
+
const char value = (const char) codepoint;
|
46
|
+
return yp_encoding_ascii_alpha_char(&value, n);
|
47
|
+
} else {
|
48
|
+
return 0;
|
49
|
+
}
|
50
|
+
}
|
51
|
+
|
52
|
+
static size_t
|
53
|
+
yp_encoding_euc_jp_alnum_char(const char *c, ptrdiff_t n) {
|
54
|
+
size_t width;
|
55
|
+
yp_euc_jp_codepoint_t codepoint = yp_euc_jp_codepoint(c, n, &width);
|
56
|
+
|
57
|
+
if (width == 1) {
|
58
|
+
const char value = (const char) codepoint;
|
59
|
+
return yp_encoding_ascii_alnum_char(&value, n);
|
60
|
+
} else {
|
61
|
+
return 0;
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
65
|
+
static bool
|
66
|
+
yp_encoding_euc_jp_isupper_char(const char *c, ptrdiff_t n) {
|
67
|
+
size_t width;
|
68
|
+
yp_euc_jp_codepoint_t codepoint = yp_euc_jp_codepoint(c, n, &width);
|
69
|
+
|
70
|
+
if (width == 1) {
|
71
|
+
const char value = (const char) codepoint;
|
72
|
+
return yp_encoding_ascii_isupper_char(&value, n);
|
73
|
+
} else {
|
74
|
+
return 0;
|
75
|
+
}
|
76
|
+
}
|
77
|
+
|
78
|
+
yp_encoding_t yp_encoding_euc_jp = {
|
79
|
+
.name = "euc-jp",
|
80
|
+
.char_width = yp_encoding_euc_jp_char_width,
|
81
|
+
.alnum_char = yp_encoding_euc_jp_alnum_char,
|
82
|
+
.alpha_char = yp_encoding_euc_jp_alpha_char,
|
83
|
+
.isupper_char = yp_encoding_euc_jp_isupper_char,
|
84
|
+
.multibyte = true
|
85
|
+
};
|
data/src/enc/yp_gbk.c
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
#include "yarp/enc/yp_encoding.h"
|
2
|
+
|
3
|
+
typedef uint16_t yp_gbk_codepoint_t;

// Decode the GBK character that starts at c, looking at no more than n bytes
// for the second byte. On success *width receives the number of bytes used
// (1 or 2) and the return value is the byte itself, or the two bytes packed
// as (lead << 8) | trail. When the bytes match neither form, *width is set to
// 0 and 0 is returned.
// NOTE(review): the first byte is read without consulting n, so callers
// presumably guarantee at least one readable byte -- confirm.
static yp_gbk_codepoint_t
yp_gbk_codepoint(const char *c, ptrdiff_t n, size_t *width) {
    const unsigned char *bytes = (const unsigned char *) c;
    const unsigned char lead = bytes[0];

    // Single byte characters.
    if (lead < 0x80) {
        *width = 1;
        return lead;
    }

    // Double byte characters. The trail byte is only read when it exists.
    if (n > 1) {
        const unsigned char trail = bytes[1];

        // The three trail byte ranges shared by the regions below.
        const bool trail_upper = (trail >= 0xA1 && trail <= 0xFE);
        const bool trail_lower = (trail >= 0x40 && trail <= 0xA0) && (trail != 0x7F);
        const bool trail_full = (trail >= 0x40 && trail <= 0xFE) && (trail != 0x7F);

        const bool valid =
            ((lead >= 0xA1 && lead <= 0xA9) && trail_upper) || // GBK/1
            ((lead >= 0xB0 && lead <= 0xF7) && trail_upper) || // GBK/2
            ((lead >= 0x81 && lead <= 0xA0) && trail_full) ||  // GBK/3
            ((lead >= 0xAA && lead <= 0xFE) && trail_lower) || // GBK/4
            ((lead >= 0xA8 && lead <= 0xA9) && trail_lower);   // GBK/5

        if (valid) {
            *width = 2;
            return (yp_gbk_codepoint_t) ((lead << 8) | trail);
        }
    }

    *width = 0;
    return 0;
}
|
33
|
+
|
34
|
+
// Report the width in bytes that yp_gbk_codepoint determines for the
// character at c (0 when it does not recognise the bytes).
static size_t
yp_encoding_gbk_char_width(const char *c, ptrdiff_t n) {
    size_t byte_count;

    // Only the width is of interest; the decoded value is discarded.
    (void) yp_gbk_codepoint(c, n, &byte_count);
    return byte_count;
}
|
41
|
+
|
42
|
+
static size_t
|
43
|
+
yp_encoding_gbk_alpha_char(const char *c, ptrdiff_t n) {
|
44
|
+
size_t width;
|
45
|
+
yp_gbk_codepoint_t codepoint = yp_gbk_codepoint(c, n, &width);
|
46
|
+
|
47
|
+
if (width == 1) {
|
48
|
+
const char value = (const char) codepoint;
|
49
|
+
return yp_encoding_ascii_alpha_char(&value, n);
|
50
|
+
} else {
|
51
|
+
return 0;
|
52
|
+
}
|
53
|
+
}
|
54
|
+
|
55
|
+
static size_t
|
56
|
+
yp_encoding_gbk_alnum_char(const char *c, ptrdiff_t n) {
|
57
|
+
size_t width;
|
58
|
+
yp_gbk_codepoint_t codepoint = yp_gbk_codepoint(c, n, &width);
|
59
|
+
|
60
|
+
if (width == 1) {
|
61
|
+
const char value = (const char) codepoint;
|
62
|
+
return yp_encoding_ascii_alnum_char(&value, n);
|
63
|
+
} else {
|
64
|
+
return 0;
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
static bool
|
69
|
+
yp_encoding_gbk_isupper_char(const char *c, ptrdiff_t n) {
|
70
|
+
size_t width;
|
71
|
+
yp_gbk_codepoint_t codepoint = yp_gbk_codepoint(c, n, &width);
|
72
|
+
|
73
|
+
if (width == 1) {
|
74
|
+
const char value = (const char) codepoint;
|
75
|
+
return yp_encoding_ascii_isupper_char(&value, n);
|
76
|
+
} else {
|
77
|
+
return false;
|
78
|
+
}
|
79
|
+
}
|
80
|
+
|
81
|
+
yp_encoding_t yp_encoding_gbk = {
|
82
|
+
.name = "gbk",
|
83
|
+
.char_width = yp_encoding_gbk_char_width,
|
84
|
+
.alnum_char = yp_encoding_gbk_alnum_char,
|
85
|
+
.alpha_char = yp_encoding_gbk_alpha_char,
|
86
|
+
.isupper_char = yp_encoding_gbk_isupper_char,
|
87
|
+
.multibyte = true
|
88
|
+
};
|
@@ -0,0 +1,83 @@
|
|
1
|
+
#include "yarp/enc/yp_encoding.h"
|
2
|
+
|
3
|
+
typedef uint16_t yp_shift_jis_codepoint_t;

// Decode the Shift_JIS character that starts at c, looking at no more than n
// bytes for the second byte. On success *width receives the number of bytes
// used (1 or 2) and the return value is the byte itself, or the two bytes
// packed as (lead << 8) | trail. When the bytes match neither form, *width is
// set to 0 and 0 is returned.
// NOTE(review): the first byte is read without consulting n, so callers
// presumably guarantee at least one readable byte -- confirm.
static yp_shift_jis_codepoint_t
yp_shift_jis_codepoint(const char *c, ptrdiff_t n, size_t *width) {
    const unsigned char *bytes = (const unsigned char *) c;
    const unsigned char lead = bytes[0];

    // Single byte characters: below 0x80, plus the 0xA1..0xDF range.
    if (lead < 0x80 || (lead >= 0xA1 && lead <= 0xDF)) {
        *width = 1;
        return lead;
    }

    // Double byte characters: the trail byte is only read when it exists and
    // the lead byte is in one of the two lead ranges.
    const bool lead_ok = (lead >= 0x81 && lead <= 0x9F) || (lead >= 0xE0 && lead <= 0xFC);
    if (n > 1 && lead_ok) {
        const unsigned char trail = bytes[1];

        if (trail >= 0x40 && trail <= 0xFC) {
            *width = 2;
            return (yp_shift_jis_codepoint_t) ((lead << 8) | trail);
        }
    }

    *width = 0;
    return 0;
}
|
28
|
+
|
29
|
+
// Report the width in bytes that yp_shift_jis_codepoint determines for the
// character at c (0 when it does not recognise the bytes).
static size_t
yp_encoding_shift_jis_char_width(const char *c, ptrdiff_t n) {
    size_t byte_count;

    // Only the width is of interest; the decoded value is discarded.
    (void) yp_shift_jis_codepoint(c, n, &byte_count);
    return byte_count;
}
|
36
|
+
|
37
|
+
static size_t
|
38
|
+
yp_encoding_shift_jis_alpha_char(const char *c, ptrdiff_t n) {
|
39
|
+
size_t width;
|
40
|
+
yp_shift_jis_codepoint_t codepoint = yp_shift_jis_codepoint(c, n, &width);
|
41
|
+
|
42
|
+
if (width == 1) {
|
43
|
+
const char value = (const char) codepoint;
|
44
|
+
return yp_encoding_ascii_alpha_char(&value, n);
|
45
|
+
} else {
|
46
|
+
return 0;
|
47
|
+
}
|
48
|
+
}
|
49
|
+
|
50
|
+
static size_t
|
51
|
+
yp_encoding_shift_jis_alnum_char(const char *c, ptrdiff_t n) {
|
52
|
+
size_t width;
|
53
|
+
yp_shift_jis_codepoint_t codepoint = yp_shift_jis_codepoint(c, n, &width);
|
54
|
+
|
55
|
+
if (width == 1) {
|
56
|
+
const char value = (const char) codepoint;
|
57
|
+
return yp_encoding_ascii_alnum_char(&value, n);
|
58
|
+
} else {
|
59
|
+
return 0;
|
60
|
+
}
|
61
|
+
}
|
62
|
+
|
63
|
+
static bool
|
64
|
+
yp_encoding_shift_jis_isupper_char(const char *c, ptrdiff_t n) {
|
65
|
+
size_t width;
|
66
|
+
yp_shift_jis_codepoint_t codepoint = yp_shift_jis_codepoint(c, n, &width);
|
67
|
+
|
68
|
+
if (width == 1) {
|
69
|
+
const char value = (const char) codepoint;
|
70
|
+
return yp_encoding_ascii_isupper_char(&value, n);
|
71
|
+
} else {
|
72
|
+
return 0;
|
73
|
+
}
|
74
|
+
}
|
75
|
+
|
76
|
+
yp_encoding_t yp_encoding_shift_jis = {
|
77
|
+
.name = "shift_jis",
|
78
|
+
.char_width = yp_encoding_shift_jis_char_width,
|
79
|
+
.alnum_char = yp_encoding_shift_jis_alnum_char,
|
80
|
+
.alpha_char = yp_encoding_shift_jis_alpha_char,
|
81
|
+
.isupper_char = yp_encoding_shift_jis_isupper_char,
|
82
|
+
.multibyte = true
|
83
|
+
};
|