u 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +38 -0
- data/Rakefile +64 -0
- data/ext/encoding/character/utf-8/break.c +25 -0
- data/ext/encoding/character/utf-8/data/break.h +22931 -0
- data/ext/encoding/character/utf-8/data/character-tables.h +14358 -0
- data/ext/encoding/character/utf-8/data/compose.h +1607 -0
- data/ext/encoding/character/utf-8/data/decompose.h +10926 -0
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1070 -0
- data/ext/encoding/character/utf-8/decompose.c +444 -0
- data/ext/encoding/character/utf-8/depend +65 -0
- data/ext/encoding/character/utf-8/extconf.rb +67 -0
- data/ext/encoding/character/utf-8/private.c +62 -0
- data/ext/encoding/character/utf-8/private.h +51 -0
- data/ext/encoding/character/utf-8/properties.c +1056 -0
- data/ext/encoding/character/utf-8/rb_includes.h +19 -0
- data/ext/encoding/character/utf-8/rb_methods.h +49 -0
- data/ext/encoding/character/utf-8/rb_private.h +52 -0
- data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
- data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
- data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
- data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
- data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
- data/ext/encoding/character/utf-8/rb_utf_insert.c +48 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +332 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
- data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
- data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
- data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
- data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
- data/ext/encoding/character/utf-8/tables.h +38 -0
- data/ext/encoding/character/utf-8/unicode.c +319 -0
- data/ext/encoding/character/utf-8/unicode.h +216 -0
- data/ext/encoding/character/utf-8/utf.c +1334 -0
- data/lib/encoding/character/utf-8.rb +201 -0
- data/lib/u.rb +16 -0
- data/lib/u/string.rb +185 -0
- data/lib/u/version.rb +5 -0
- data/test/unit/u.rb +5 -0
- data/test/unit/u/string.rb +91 -0
- metadata +174 -0
@@ -0,0 +1,319 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Unicode class.
|
3
|
+
*
|
4
|
+
* Copyright © 2005 Nikolai Weibull <work@rawuncut.elitemail.org>
|
5
|
+
*/
|
6
|
+
|
7
|
+
|
8
|
+
#include <ruby.h>
|
9
|
+
#include <re.h>
|
10
|
+
#include <stdbool.h>
|
11
|
+
#include <stddef.h>
|
12
|
+
#include <stdint.h>
|
13
|
+
#include <limits.h>
|
14
|
+
#include "unicode.h"
|
15
|
+
#include "private.h"
|
16
|
+
#include "rb_private.h"
|
17
|
+
#include "rb_methods.h"
|
18
|
+
|
19
|
+
static VALUE mUTF8Methods;
|
20
|
+
|
21
|
+
void
|
22
|
+
need_at_least_n_arguments(int argc, int n)
|
23
|
+
{
|
24
|
+
static const char *const words[] = {
|
25
|
+
NULL, NULL, "two", "three", "four",
|
26
|
+
"five", "six", "seven", "eight", "nine"
|
27
|
+
};
|
28
|
+
|
29
|
+
if (argc >= n)
|
30
|
+
return;
|
31
|
+
|
32
|
+
if (n == 1)
|
33
|
+
rb_raise(rb_eArgError, "need at least one argument");
|
34
|
+
else if (1 < n && n < 10)
|
35
|
+
rb_raise(rb_eArgError, "need at least %s arguments", words[n]);
|
36
|
+
else
|
37
|
+
rb_raise(rb_eArgError, "need at least %d arguments", n);
|
38
|
+
}
|
39
|
+
|
40
|
+
unichar
|
41
|
+
_utf_char_validated(char const *const str, char const *const str_end)
|
42
|
+
{
|
43
|
+
unichar c = utf_char_validated_n(str, str_end - str);
|
44
|
+
switch (c) {
|
45
|
+
case UTF_BAD_INPUT_UNICHAR:
|
46
|
+
rb_raise(rb_eArgError, "input isn’t valid UTF-8");
|
47
|
+
case UTF_INCOMPLETE_INPUT_UNICHAR:
|
48
|
+
rb_raise(rb_eArgError,
|
49
|
+
"input contains an incomplete UTF-8-encoded character");
|
50
|
+
default:
|
51
|
+
return c;
|
52
|
+
}
|
53
|
+
}
|
54
|
+
|
55
|
+
/* TODO: instead of ‘end’, perhaps use a len/max-type parameter? */
|
56
|
+
char *
|
57
|
+
_utf_offset_to_pointer_validated_impl(const char *str, long offset,
|
58
|
+
const char *limit, bool noisy)
|
59
|
+
{
|
60
|
+
const char *p = str;
|
61
|
+
long saved_offset = offset;
|
62
|
+
|
63
|
+
if (offset > 0) {
|
64
|
+
while (p < limit && offset-- > 0)
|
65
|
+
p = utf_next(p);
|
66
|
+
|
67
|
+
if (offset > 0) {
|
68
|
+
if (noisy)
|
69
|
+
rb_raise(rb_eIndexError,
|
70
|
+
"index %ld lays beyond end of string",
|
71
|
+
saved_offset);
|
72
|
+
else
|
73
|
+
return NULL;
|
74
|
+
}
|
75
|
+
} else {
|
76
|
+
while (offset != 0) {
|
77
|
+
const char *base = p;
|
78
|
+
p += offset;
|
79
|
+
while (p >= limit && (*p & 0xc0) == 0x80)
|
80
|
+
p--;
|
81
|
+
|
82
|
+
if (p < limit) {
|
83
|
+
if (noisy)
|
84
|
+
rb_raise(rb_eIndexError,
|
85
|
+
"index %ld lays before beginning of string",
|
86
|
+
saved_offset);
|
87
|
+
else
|
88
|
+
return NULL;
|
89
|
+
}
|
90
|
+
|
91
|
+
offset += utf_pointer_to_offset(p, base);
|
92
|
+
}
|
93
|
+
}
|
94
|
+
|
95
|
+
return (char *)p;
|
96
|
+
}
|
97
|
+
|
98
|
+
/*
 * Raising variant of the offset-to-pointer conversion: forwards to the
 * shared implementation with error reporting switched on, so an
 * out-of-range `offset' raises IndexError.
 */
char *
_utf_offset_to_pointer_validated(const char *str, long offset, const char *end)
{
        const bool raise_on_failure = true;

        return _utf_offset_to_pointer_validated_impl(str, offset, end,
                                                     raise_on_failure);
}
|
103
|
+
|
104
|
+
/*
 * Quiet variant of the offset-to-pointer conversion: forwards to the
 * shared implementation with error reporting switched off, so an
 * out-of-range `offset' yields NULL.
 */
char *
_utf_offset_to_pointer_failable(const char *str, long offset, const char *end)
{
        const bool raise_on_failure = false;

        return _utf_offset_to_pointer_validated_impl(str, offset, end,
                                                     raise_on_failure);
}
|
109
|
+
|
110
|
+
static char *
|
111
|
+
rb_utf_begin_setup(VALUE str, long offset, char **base_limit, char **limit)
|
112
|
+
{
|
113
|
+
char *base = RSTRING(str)->ptr;
|
114
|
+
|
115
|
+
*base_limit = RSTRING(str)->ptr + RSTRING(str)->len;
|
116
|
+
*limit = *base_limit;
|
117
|
+
|
118
|
+
if (offset < 0) {
|
119
|
+
char *tmp = base;
|
120
|
+
base = *base_limit;
|
121
|
+
*base_limit = tmp;
|
122
|
+
}
|
123
|
+
|
124
|
+
return base;
|
125
|
+
}
|
126
|
+
|
127
|
+
bool
|
128
|
+
rb_utf_begin_from_offset(VALUE str, long offset, char **begin, char **limit)
|
129
|
+
{
|
130
|
+
char *base_limit;
|
131
|
+
char *base = rb_utf_begin_setup(str, offset, &base_limit, limit);
|
132
|
+
|
133
|
+
*begin = _utf_offset_to_pointer_failable(base, offset, base_limit);
|
134
|
+
|
135
|
+
return (*begin != NULL);
|
136
|
+
}
|
137
|
+
|
138
|
+
void
|
139
|
+
rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
|
140
|
+
char **limit)
|
141
|
+
{
|
142
|
+
char *base_limit;
|
143
|
+
char *base = rb_utf_begin_setup(str, offset, &base_limit, limit);
|
144
|
+
|
145
|
+
*begin = _utf_offset_to_pointer_validated(base, offset, base_limit);
|
146
|
+
}
|
147
|
+
|
148
|
+
char *
|
149
|
+
rb_utf_prev_validated(const char *begin, const char *p)
|
150
|
+
{
|
151
|
+
char *prev = utf_find_prev(begin, p);
|
152
|
+
if (prev == NULL)
|
153
|
+
rb_raise(rb_eArgError, "input isn’t valid UTF-8");
|
154
|
+
return prev;
|
155
|
+
}
|
156
|
+
|
157
|
+
char *
|
158
|
+
rb_utf_next_validated(const char *p, const char *end)
|
159
|
+
{
|
160
|
+
char *next = (char *)utf_next(p);
|
161
|
+
if (next > end)
|
162
|
+
rb_raise(rb_eArgError, "input isn’t valid UTF-8");
|
163
|
+
return next;
|
164
|
+
}
|
165
|
+
|
166
|
+
VALUE
|
167
|
+
rb_utf_update(VALUE str, long offset, long len, VALUE replacement)
|
168
|
+
{
|
169
|
+
if (len < 0)
|
170
|
+
rb_raise(rb_eIndexError, "negative length %ld", len);
|
171
|
+
|
172
|
+
char *begin, *limit;
|
173
|
+
rb_utf_begin_from_offset_validated(str, offset, &begin, &limit);
|
174
|
+
char *end = _utf_offset_to_pointer_failable(begin, len, limit);
|
175
|
+
if (end == NULL)
|
176
|
+
end = limit;
|
177
|
+
|
178
|
+
rb_str_update(str, begin - RSTRING(str)->ptr, end - begin, replacement);
|
179
|
+
|
180
|
+
return replacement;
|
181
|
+
}
|
182
|
+
|
183
|
+
VALUE
|
184
|
+
rb_utf_new(const char *str, long len)
|
185
|
+
{
|
186
|
+
VALUE rbstr = rb_str_new(str, len);
|
187
|
+
rb_extend_object(rbstr, mUTF8Methods);
|
188
|
+
return rbstr;
|
189
|
+
}
|
190
|
+
|
191
|
+
VALUE
|
192
|
+
rb_utf_new2(const char *str)
|
193
|
+
{
|
194
|
+
VALUE rbstr = rb_str_new2(str);
|
195
|
+
rb_extend_object(rbstr, mUTF8Methods);
|
196
|
+
return rbstr;
|
197
|
+
}
|
198
|
+
|
199
|
+
VALUE
|
200
|
+
rb_utf_new5(VALUE obj, const char *str, long len)
|
201
|
+
{
|
202
|
+
VALUE rbstr = rb_str_new5(obj, str, len);
|
203
|
+
rb_extend_object(rbstr, mUTF8Methods);
|
204
|
+
return rbstr;
|
205
|
+
}
|
206
|
+
|
207
|
+
VALUE
|
208
|
+
rb_utf_alloc_using(char *str)
|
209
|
+
{
|
210
|
+
VALUE rbstr = rb_utf_new(NULL, 0);
|
211
|
+
long len = strlen(str);
|
212
|
+
|
213
|
+
RSTRING(rbstr)->ptr = str;
|
214
|
+
RSTRING(rbstr)->aux.capa = len;
|
215
|
+
RSTRING(rbstr)->len = len;
|
216
|
+
RSTRING(rbstr)->ptr[len] = '\0';
|
217
|
+
|
218
|
+
return rbstr;
|
219
|
+
}
|
220
|
+
|
221
|
+
VALUE
|
222
|
+
rb_utf_dup(VALUE str)
|
223
|
+
{
|
224
|
+
str = rb_str_dup(str);
|
225
|
+
rb_extend_object(str, mUTF8Methods);
|
226
|
+
return str;
|
227
|
+
}
|
228
|
+
|
229
|
+
/* TODO: rewrite this using the new offset-calculating functions. */
|
230
|
+
long
|
231
|
+
rb_utf_index(VALUE str, VALUE sub, long offset)
|
232
|
+
{
|
233
|
+
long n_chars = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
|
234
|
+
|
235
|
+
if (offset < 0) {
|
236
|
+
offset += n_chars;
|
237
|
+
|
238
|
+
if (offset < 0)
|
239
|
+
return -1;
|
240
|
+
}
|
241
|
+
|
242
|
+
if (n_chars - offset < utf_length(RSTRING(sub)->ptr))
|
243
|
+
return -1;
|
244
|
+
|
245
|
+
if (RSTRING(sub)->len == 0)
|
246
|
+
return offset;
|
247
|
+
|
248
|
+
char *begin = utf_offset_to_pointer(RSTRING(str)->ptr, offset);
|
249
|
+
long pos = rb_memsearch(RSTRING(sub)->ptr, RSTRING(sub)->len,
|
250
|
+
begin, RSTRING(str)->len - (begin - RSTRING(str)->ptr));
|
251
|
+
|
252
|
+
if (pos < 0)
|
253
|
+
return -1;
|
254
|
+
|
255
|
+
return offset + utf_pointer_to_offset(begin, begin + pos);
|
256
|
+
}
|
257
|
+
|
258
|
+
long
|
259
|
+
rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
|
260
|
+
long offset, bool reverse)
|
261
|
+
{
|
262
|
+
long byte_offset = _utf_offset_to_pointer_validated(s, offset, end) - s;
|
263
|
+
long byte_startpos = rb_reg_adjust_startpos(sub, str, byte_offset, reverse);
|
264
|
+
long byte_index = rb_reg_search(sub, str, byte_startpos, reverse);
|
265
|
+
if (byte_index == -1)
|
266
|
+
return -1;
|
267
|
+
return utf_pointer_to_offset(s, s + byte_index);
|
268
|
+
}
|
269
|
+
|
270
|
+
void Init_utf8(void);
|
271
|
+
void
|
272
|
+
Init_utf8(void)
|
273
|
+
{
|
274
|
+
VALUE mEncoding = rb_define_module("Encoding");
|
275
|
+
VALUE mCharacter = rb_define_module_under(mEncoding, "Character");
|
276
|
+
VALUE mUTF8 = rb_define_module_under(mCharacter, "UTF8");
|
277
|
+
|
278
|
+
mUTF8Methods = rb_define_module_under(mUTF8, "Methods");
|
279
|
+
|
280
|
+
rb_define_module_function(mUTF8, "collate", rb_utf_collate, 2);
|
281
|
+
rb_define_module_function(mUTF8, "aref", rb_utf_aref_m, -1);
|
282
|
+
rb_define_module_function(mUTF8, "aset", rb_utf_aset_m, -1);
|
283
|
+
rb_define_module_function(mUTF8, "casecmp", rb_utf_casecmp, 2);
|
284
|
+
rb_define_module_function(mUTF8, "center", rb_utf_center, -1);
|
285
|
+
rb_define_module_function(mUTF8, "chomp", rb_utf_chomp, -1);
|
286
|
+
rb_define_module_function(mUTF8, "chomp!", rb_utf_chomp_bang, -1);
|
287
|
+
rb_define_module_function(mUTF8, "chop", rb_utf_chop, 1);
|
288
|
+
rb_define_module_function(mUTF8, "chop!", rb_utf_chop_bang, 1);
|
289
|
+
rb_define_module_function(mUTF8, "count", rb_utf_count, -1);
|
290
|
+
rb_define_module_function(mUTF8, "delete", rb_utf_delete, -1);
|
291
|
+
rb_define_module_function(mUTF8, "delete!", rb_utf_delete_bang, -1);
|
292
|
+
rb_define_module_function(mUTF8, "each_char", rb_utf_each_char, 1);
|
293
|
+
rb_define_module_function(mUTF8, "index", rb_utf_index_m, -1);
|
294
|
+
rb_define_module_function(mUTF8, "insert", rb_utf_insert, 3);
|
295
|
+
rb_define_module_function(mUTF8, "lstrip", rb_utf_lstrip, 1);
|
296
|
+
rb_define_module_function(mUTF8, "lstrip!", rb_utf_lstrip_bang, 1);
|
297
|
+
rb_define_module_function(mUTF8, "rindex", rb_utf_rindex_m, -1);
|
298
|
+
rb_define_module_function(mUTF8, "rstrip", rb_utf_rstrip, 1);
|
299
|
+
rb_define_module_function(mUTF8, "rstrip!", rb_utf_rstrip_bang, 1);
|
300
|
+
rb_define_module_function(mUTF8, "squeeze", rb_utf_squeeze, -1);
|
301
|
+
rb_define_module_function(mUTF8, "squeeze!", rb_utf_squeeze_bang, -1);
|
302
|
+
rb_define_module_function(mUTF8, "strip", rb_utf_strip, 1);
|
303
|
+
rb_define_module_function(mUTF8, "strip!", rb_utf_strip_bang, 1);
|
304
|
+
rb_define_module_function(mUTF8, "to_i", rb_utf_to_i, -1);
|
305
|
+
rb_define_module_function(mUTF8, "hex", rb_utf_hex, 1);
|
306
|
+
rb_define_module_function(mUTF8, "oct", rb_utf_oct, 1);
|
307
|
+
rb_define_module_function(mUTF8, "tr", rb_utf_tr, 3);
|
308
|
+
rb_define_module_function(mUTF8, "tr_s", rb_utf_tr_s, 3);
|
309
|
+
|
310
|
+
rb_define_module_function(mUTF8, "downcase", rb_utf_downcase, 1);
|
311
|
+
rb_define_module_function(mUTF8, "ljust", rb_utf_ljust, -1);
|
312
|
+
rb_define_module_function(mUTF8, "length", rb_utf_length, 1);
|
313
|
+
rb_define_module_function(mUTF8, "reverse", rb_utf_reverse, 1);
|
314
|
+
rb_define_module_function(mUTF8, "rjust", rb_utf_rjust, -1);
|
315
|
+
rb_define_module_function(mUTF8, "upcase", rb_utf_upcase, 1);
|
316
|
+
|
317
|
+
rb_define_module_function(mUTF8, "foldcase", rb_utf_foldcase, 1);
|
318
|
+
rb_define_module_function(mUTF8, "normalize", rb_utf_normalize, -1);
|
319
|
+
}
|
@@ -0,0 +1,216 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Unicode handling.
|
3
|
+
*
|
4
|
+
* Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
|
5
|
+
*/
|
6
|
+
|
7
|
+
|
8
|
+
#ifndef UNICODE_H
|
9
|
+
#define UNICODE_H
|
10
|
+
|
11
|
+
#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)
|
12
|
+
#define CONST_FUNC \
|
13
|
+
__attribute__((__const__))
|
14
|
+
#else
|
15
|
+
#define CONST_FUNC
|
16
|
+
#endif
|
17
|
+
|
18
|
+
typedef uint32_t unichar;
|
19
|
+
|
20
|
+
#define MAXUNICHAR UINT32_MAX
|
21
|
+
|
22
|
+
#define MAX_UNICHAR_BYTE_LENGTH 6
|
23
|
+
|
24
|
+
#define UNICODE_N_CODEPOINTS (0x10ffff + 1)
|
25
|
+
|
26
|
+
/* unichar return used for representing bad input to a function. */
|
27
|
+
#define UTF_BAD_INPUT_UNICHAR ((unichar)-1)
|
28
|
+
|
29
|
+
|
30
|
+
/* unichar return used for representing an incomplete input to a function. */
|
31
|
+
#define UTF_INCOMPLETE_INPUT_UNICHAR ((unichar)-2)
|
32
|
+
|
33
|
+
|
34
|
+
/*
 * Character classification returned by unichar_type().
 *
 * NOTE(review): the names look like the Unicode general categories
 * (others, letters, marks, numbers, punctuation, symbols, separators) --
 * confirm against the generated character tables.  The enumerators take
 * consecutive values starting at zero, so their order must not change.
 */
typedef enum {
        /* Others. */
        UNICODE_CONTROL,
        UNICODE_FORMAT,
        UNICODE_UNASSIGNED,
        UNICODE_PRIVATE_USE,
        UNICODE_SURROGATE,

        /* Letters. */
        UNICODE_LOWERCASE_LETTER,
        UNICODE_MODIFIER_LETTER,
        UNICODE_OTHER_LETTER,
        UNICODE_TITLECASE_LETTER,
        UNICODE_UPPERCASE_LETTER,

        /* Marks. */
        UNICODE_COMBINING_MARK,
        UNICODE_ENCLOSING_MARK,
        UNICODE_NON_SPACING_MARK,

        /* Numbers. */
        UNICODE_DECIMAL_NUMBER,
        UNICODE_LETTER_NUMBER,
        UNICODE_OTHER_NUMBER,

        /* Punctuation. */
        UNICODE_CONNECT_PUNCTUATION,
        UNICODE_DASH_PUNCTUATION,
        UNICODE_CLOSE_PUNCTUATION,
        UNICODE_FINAL_PUNCTUATION,
        UNICODE_INITIAL_PUNCTUATION,
        UNICODE_OTHER_PUNCTUATION,
        UNICODE_OPEN_PUNCTUATION,

        /* Symbols. */
        UNICODE_CURRENCY_SYMBOL,
        UNICODE_MODIFIER_SYMBOL,
        UNICODE_MATH_SYMBOL,
        UNICODE_OTHER_SYMBOL,

        /* Separators. */
        UNICODE_LINE_SEPARATOR,
        UNICODE_PARAGRAPH_SEPARATOR,
        UNICODE_SPACE_SEPARATOR
} UnicodeType;
|
66
|
+
|
67
|
+
bool unichar_isalnum(unichar c);
|
68
|
+
bool unichar_isalpha(unichar c);
|
69
|
+
bool unichar_iscntrl(unichar c);
|
70
|
+
bool unichar_isdigit(unichar c);
|
71
|
+
bool unichar_isgraph(unichar c);
|
72
|
+
bool unichar_islower(unichar c);
|
73
|
+
bool unichar_isprint(unichar c);
|
74
|
+
bool unichar_ispunct(unichar c);
|
75
|
+
bool unichar_isspace(unichar c);
|
76
|
+
bool unichar_isupper(unichar c);
|
77
|
+
bool unichar_istitle(unichar c);
|
78
|
+
bool unichar_isnewline(unichar c);
|
79
|
+
bool unichar_isxdigit(unichar c);
|
80
|
+
bool unichar_isassigned(unichar c);
|
81
|
+
bool unichar_iswide(unichar c);
|
82
|
+
bool unichar_isvalid(unichar c);
|
83
|
+
|
84
|
+
unichar unichar_toupper(unichar c);
|
85
|
+
unichar unichar_tolower(unichar c);
|
86
|
+
unichar unichar_totitle(unichar c);
|
87
|
+
|
88
|
+
int unichar_digit_value(unichar c);
|
89
|
+
int unichar_xdigit_value(unichar c);
|
90
|
+
|
91
|
+
UnicodeType unichar_type(unichar c);
|
92
|
+
|
93
|
+
int unichar_combining_class(unichar c) CONST_FUNC;
|
94
|
+
|
95
|
+
bool unichar_mirror(unichar c, unichar *mirrored);
|
96
|
+
|
97
|
+
|
98
|
+
/*
 * Break classification returned by unichar_break_type().
 *
 * NOTE(review): the names look like the Unicode line-breaking classes --
 * confirm against the generated break tables.  The enumerators take
 * consecutive values starting at zero, so their order must not change.
 */
typedef enum {
        UNICODE_BREAK_MANDATORY,
        UNICODE_BREAK_CARRIAGE_RETURN,
        UNICODE_BREAK_LINE_FEED,
        UNICODE_BREAK_COMBINING_MARK,
        UNICODE_BREAK_SURROGATE,
        UNICODE_BREAK_ZERO_WIDTH_SPACE,
        UNICODE_BREAK_INSEPARABLE,
        UNICODE_BREAK_NON_BREAKING_GLUE,
        UNICODE_BREAK_CONTINGENT,
        UNICODE_BREAK_SPACE,

        UNICODE_BREAK_AFTER,
        UNICODE_BREAK_BEFORE,
        UNICODE_BREAK_BEFORE_AND_AFTER,
        UNICODE_BREAK_HYPHEN,
        UNICODE_BREAK_NON_STARTER,
        UNICODE_BREAK_OPEN_PUNCTUATION,
        UNICODE_BREAK_CLOSE_PUNCTUATION,
        UNICODE_BREAK_QUOTATION,
        UNICODE_BREAK_EXCLAMATION,
        UNICODE_BREAK_IDEOGRAPHIC,

        UNICODE_BREAK_NUMERIC,
        UNICODE_BREAK_INFIX_SEPARATOR,
        UNICODE_BREAK_SYMBOL,
        UNICODE_BREAK_ALPHABETIC,
        UNICODE_BREAK_PREFIX,
        UNICODE_BREAK_POSTFIX,
        UNICODE_BREAK_COMPLEX_CONTEXT,
        UNICODE_BREAK_AMBIGUOUS,
        UNICODE_BREAK_UNKNOWN,
        UNICODE_BREAK_NEXT_LINE,
        UNICODE_BREAK_WORD_JOINER,

        /* Hangul jamo and syllable classes. */
        UNICODE_BREAK_HANGUL_L_JAMO,
        UNICODE_BREAK_HANGUL_V_JAMO,
        UNICODE_BREAK_HANGUL_T_JAMO,
        UNICODE_BREAK_HANGUL_LV_SYLLABLE,
        UNICODE_BREAK_HANGUL_LVT_SYLLABLE
} UnicodeBreakType;
|
136
|
+
|
137
|
+
UnicodeBreakType unichar_break_type(unichar c);
|
138
|
+
|
139
|
+
|
140
|
+
/*
 * Normalization mode accepted by utf_normalize() and utf_normalize_n().
 *
 * There are four distinct modes; each one is available under a
 * descriptive name and under an NF* alias with the same value.
 */
typedef enum {
        NORMALIZE_DEFAULT = 0,
        NORMALIZE_NFD = NORMALIZE_DEFAULT,

        NORMALIZE_DEFAULT_COMPOSE = 1,
        NORMALIZE_NFC = NORMALIZE_DEFAULT_COMPOSE,

        NORMALIZE_ALL = 2,
        NORMALIZE_NFKD = NORMALIZE_ALL,

        NORMALIZE_ALL_COMPOSE = 3,
        NORMALIZE_NFKC = NORMALIZE_ALL_COMPOSE
} NormalizeMode;
|
150
|
+
|
151
|
+
void unicode_canonical_ordering(unichar *str, size_t len);
|
152
|
+
unichar *unicode_canonical_decomposition(unichar c, size_t *result_len);
|
153
|
+
|
154
|
+
char *utf_normalize(const char *str, NormalizeMode mode);
|
155
|
+
char *utf_normalize_n(const char *str, NormalizeMode mode, size_t len);
|
156
|
+
|
157
|
+
|
158
|
+
|
159
|
+
|
160
|
+
char *utf_upcase(const char *str);
|
161
|
+
char *utf_upcase_n(const char *str, size_t len);
|
162
|
+
char *utf_downcase(const char *str);
|
163
|
+
char *utf_downcase_n(const char *str, size_t len);
|
164
|
+
char *utf_foldcase(const char *str);
|
165
|
+
char *utf_foldcase_n(const char *str, size_t len);
|
166
|
+
|
167
|
+
unichar utf_char(const char *str);
|
168
|
+
unichar utf_char_n(const char *str, size_t max);
|
169
|
+
unichar utf_char_validated(const char *str);
|
170
|
+
unichar utf_char_validated_n(const char *str, size_t max);
|
171
|
+
|
172
|
+
extern const char * const s_utf_skip_lengths;
|
173
|
+
#define utf_next(str) ((str) + s_utf_skip_lengths[*(const unsigned char *)(str)])
|
174
|
+
char *utf_find_next(const char *p, const char *end);
|
175
|
+
char *utf_prev(const char *p);
|
176
|
+
char *utf_find_prev(const char *begin, const char *p);
|
177
|
+
char *utf_offset_to_pointer(const char *str, long offset);
|
178
|
+
long utf_pointer_to_offset(const char *str, const char *pos);
|
179
|
+
|
180
|
+
void utf_copy(char *dest, const char *src);
|
181
|
+
void utf_copy_n(char *dest, const char *src, size_t n);
|
182
|
+
void utf_append(char *dest, const char *src);
|
183
|
+
void utf_append_n(char *dest, const char *src, size_t n);
|
184
|
+
int utf_collate(const char *a, const char *b);
|
185
|
+
char *utf_collate_key(const char *str);
|
186
|
+
char *utf_collate_key_n(const char *str, size_t len);
|
187
|
+
int utf_char_index(const char *str, unichar c);
|
188
|
+
int utf_char_index_n(const char *str, unichar c, size_t len);
|
189
|
+
int utf_char_rindex(const char *str, unichar c);
|
190
|
+
int utf_char_rindex_n(const char *str, unichar c, size_t len);
|
191
|
+
int utf_index(const char *haystack, const char *needle);
|
192
|
+
int utf_index_n(const char *haystack, const char *needle, size_t len);
|
193
|
+
int utf_rindex(const char *haystack, const char *needle);
|
194
|
+
int utf_rindex_n(const char *haystack, const char *needle, size_t len);
|
195
|
+
bool utf_has_prefix(const char *str, const char *prefix);
|
196
|
+
long utf_length(const char *str);
|
197
|
+
long utf_length_n(const char *str, long len);
|
198
|
+
size_t utf_width(const char *str);
|
199
|
+
size_t utf_width_n(const char *str, size_t len);
|
200
|
+
size_t utf_byte_length(const char *str);
|
201
|
+
char *utf_reverse(const char *str);
|
202
|
+
char *utf_reverse_n(const char *str, size_t len);
|
203
|
+
|
204
|
+
bool utf_isvalid(const char *str);
|
205
|
+
bool utf_isvalid_n(const char *str, size_t max, const char **end);
|
206
|
+
|
207
|
+
/* XXX: should probably name stuff utf32 instead of ucs4 */
|
208
|
+
int unichar_to_utf(unichar c, char *result);
|
209
|
+
char *ucs4_to_utf8(unichar *str, size_t *items_read, size_t *items_written);
|
210
|
+
char *ucs4_to_utf8_n(unichar *str, size_t len, size_t *items_read, size_t *items_written);
|
211
|
+
unichar *utf8_to_ucs4_fast(const char *str, size_t *items_written);
|
212
|
+
unichar *utf8_to_ucs4_fast_n(const char *str, size_t len, size_t *items_written);
|
213
|
+
unichar *utf8_to_ucs4(const char *str, size_t *items_read, size_t *items_written);
|
214
|
+
unichar *utf8_to_ucs4_n(const char *str, int len, size_t *items_read, size_t *items_written);
|
215
|
+
|
216
|
+
#endif /* UNICODE_H */
|