u 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/README +38 -0
  2. data/Rakefile +64 -0
  3. data/ext/encoding/character/utf-8/break.c +25 -0
  4. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  5. data/ext/encoding/character/utf-8/data/character-tables.h +14358 -0
  6. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  7. data/ext/encoding/character/utf-8/data/decompose.h +10926 -0
  8. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1070 -0
  9. data/ext/encoding/character/utf-8/decompose.c +444 -0
  10. data/ext/encoding/character/utf-8/depend +65 -0
  11. data/ext/encoding/character/utf-8/extconf.rb +67 -0
  12. data/ext/encoding/character/utf-8/private.c +62 -0
  13. data/ext/encoding/character/utf-8/private.h +51 -0
  14. data/ext/encoding/character/utf-8/properties.c +1056 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +19 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_private.h +52 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  19. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  20. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  22. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  23. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  24. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  25. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  26. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  27. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  28. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  29. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  30. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  31. data/ext/encoding/character/utf-8/rb_utf_insert.c +48 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +332 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  35. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  36. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  37. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  38. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  39. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  40. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  41. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  43. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  44. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  45. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  46. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  47. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  48. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  49. data/ext/encoding/character/utf-8/tables.h +38 -0
  50. data/ext/encoding/character/utf-8/unicode.c +319 -0
  51. data/ext/encoding/character/utf-8/unicode.h +216 -0
  52. data/ext/encoding/character/utf-8/utf.c +1334 -0
  53. data/lib/encoding/character/utf-8.rb +201 -0
  54. data/lib/u.rb +16 -0
  55. data/lib/u/string.rb +185 -0
  56. data/lib/u/version.rb +5 -0
  57. data/test/unit/u.rb +5 -0
  58. data/test/unit/u/string.rb +91 -0
  59. metadata +174 -0
@@ -0,0 +1,319 @@
1
+ /*
2
+ * contents: Unicode class.
3
+ *
4
+ * Copyright © 2005 Nikolai Weibull <work@rawuncut.elitemail.org>
5
+ */
6
+
7
+
8
+ #include <ruby.h>
9
+ #include <re.h>
10
+ #include <stdbool.h>
11
+ #include <stddef.h>
12
+ #include <stdint.h>
13
+ #include <limits.h>
14
+ #include "unicode.h"
15
+ #include "private.h"
16
+ #include "rb_private.h"
17
+ #include "rb_methods.h"
18
+
19
+ static VALUE mUTF8Methods;
20
+
21
+ void
22
+ need_at_least_n_arguments(int argc, int n)
23
+ {
24
+ static const char *const words[] = {
25
+ NULL, NULL, "two", "three", "four",
26
+ "five", "six", "seven", "eight", "nine"
27
+ };
28
+
29
+ if (argc >= n)
30
+ return;
31
+
32
+ if (n == 1)
33
+ rb_raise(rb_eArgError, "need at least one argument");
34
+ else if (1 < n && n < 10)
35
+ rb_raise(rb_eArgError, "need at least %s arguments", words[n]);
36
+ else
37
+ rb_raise(rb_eArgError, "need at least %d arguments", n);
38
+ }
39
+
40
+ unichar
41
+ _utf_char_validated(char const *const str, char const *const str_end)
42
+ {
43
+ unichar c = utf_char_validated_n(str, str_end - str);
44
+ switch (c) {
45
+ case UTF_BAD_INPUT_UNICHAR:
46
+ rb_raise(rb_eArgError, "input isn’t valid UTF-8");
47
+ case UTF_INCOMPLETE_INPUT_UNICHAR:
48
+ rb_raise(rb_eArgError,
49
+ "input contains an incomplete UTF-8-encoded character");
50
+ default:
51
+ return c;
52
+ }
53
+ }
54
+
55
+ /* TODO: instead of ‘end’, perhaps use a len/max-type parameter? */
56
+ char *
57
+ _utf_offset_to_pointer_validated_impl(const char *str, long offset,
58
+ const char *limit, bool noisy)
59
+ {
60
+ const char *p = str;
61
+ long saved_offset = offset;
62
+
63
+ if (offset > 0) {
64
+ while (p < limit && offset-- > 0)
65
+ p = utf_next(p);
66
+
67
+ if (offset > 0) {
68
+ if (noisy)
69
+ rb_raise(rb_eIndexError,
70
+ "index %ld lays beyond end of string",
71
+ saved_offset);
72
+ else
73
+ return NULL;
74
+ }
75
+ } else {
76
+ while (offset != 0) {
77
+ const char *base = p;
78
+ p += offset;
79
+ while (p >= limit && (*p & 0xc0) == 0x80)
80
+ p--;
81
+
82
+ if (p < limit) {
83
+ if (noisy)
84
+ rb_raise(rb_eIndexError,
85
+ "index %ld lays before beginning of string",
86
+ saved_offset);
87
+ else
88
+ return NULL;
89
+ }
90
+
91
+ offset += utf_pointer_to_offset(p, base);
92
+ }
93
+ }
94
+
95
+ return (char *)p;
96
+ }
97
+
98
/*
 * Raising variant of _utf_offset_to_pointer_validated_impl(): an OFFSET
 * that cannot be reached within END results in an IndexError.
 */
char *
_utf_offset_to_pointer_validated(const char *str, long offset, const char *end)
{
        bool const raise_on_failure = true;

        return _utf_offset_to_pointer_validated_impl(str, offset, end,
                                                     raise_on_failure);
}
103
+
104
/*
 * Quiet variant of _utf_offset_to_pointer_validated_impl(): an OFFSET that
 * cannot be reached within END yields NULL instead of an exception.
 */
char *
_utf_offset_to_pointer_failable(const char *str, long offset, const char *end)
{
        bool const raise_on_failure = false;

        return _utf_offset_to_pointer_validated_impl(str, offset, end,
                                                     raise_on_failure);
}
109
+
110
+ static char *
111
+ rb_utf_begin_setup(VALUE str, long offset, char **base_limit, char **limit)
112
+ {
113
+ char *base = RSTRING(str)->ptr;
114
+
115
+ *base_limit = RSTRING(str)->ptr + RSTRING(str)->len;
116
+ *limit = *base_limit;
117
+
118
+ if (offset < 0) {
119
+ char *tmp = base;
120
+ base = *base_limit;
121
+ *base_limit = tmp;
122
+ }
123
+
124
+ return base;
125
+ }
126
+
127
+ bool
128
+ rb_utf_begin_from_offset(VALUE str, long offset, char **begin, char **limit)
129
+ {
130
+ char *base_limit;
131
+ char *base = rb_utf_begin_setup(str, offset, &base_limit, limit);
132
+
133
+ *begin = _utf_offset_to_pointer_failable(base, offset, base_limit);
134
+
135
+ return (*begin != NULL);
136
+ }
137
+
138
+ void
139
+ rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
140
+ char **limit)
141
+ {
142
+ char *base_limit;
143
+ char *base = rb_utf_begin_setup(str, offset, &base_limit, limit);
144
+
145
+ *begin = _utf_offset_to_pointer_validated(base, offset, base_limit);
146
+ }
147
+
148
+ char *
149
+ rb_utf_prev_validated(const char *begin, const char *p)
150
+ {
151
+ char *prev = utf_find_prev(begin, p);
152
+ if (prev == NULL)
153
+ rb_raise(rb_eArgError, "input isn’t valid UTF-8");
154
+ return prev;
155
+ }
156
+
157
+ char *
158
+ rb_utf_next_validated(const char *p, const char *end)
159
+ {
160
+ char *next = (char *)utf_next(p);
161
+ if (next > end)
162
+ rb_raise(rb_eArgError, "input isn’t valid UTF-8");
163
+ return next;
164
+ }
165
+
166
+ VALUE
167
+ rb_utf_update(VALUE str, long offset, long len, VALUE replacement)
168
+ {
169
+ if (len < 0)
170
+ rb_raise(rb_eIndexError, "negative length %ld", len);
171
+
172
+ char *begin, *limit;
173
+ rb_utf_begin_from_offset_validated(str, offset, &begin, &limit);
174
+ char *end = _utf_offset_to_pointer_failable(begin, len, limit);
175
+ if (end == NULL)
176
+ end = limit;
177
+
178
+ rb_str_update(str, begin - RSTRING(str)->ptr, end - begin, replacement);
179
+
180
+ return replacement;
181
+ }
182
+
183
+ VALUE
184
+ rb_utf_new(const char *str, long len)
185
+ {
186
+ VALUE rbstr = rb_str_new(str, len);
187
+ rb_extend_object(rbstr, mUTF8Methods);
188
+ return rbstr;
189
+ }
190
+
191
+ VALUE
192
+ rb_utf_new2(const char *str)
193
+ {
194
+ VALUE rbstr = rb_str_new2(str);
195
+ rb_extend_object(rbstr, mUTF8Methods);
196
+ return rbstr;
197
+ }
198
+
199
+ VALUE
200
+ rb_utf_new5(VALUE obj, const char *str, long len)
201
+ {
202
+ VALUE rbstr = rb_str_new5(obj, str, len);
203
+ rb_extend_object(rbstr, mUTF8Methods);
204
+ return rbstr;
205
+ }
206
+
207
+ VALUE
208
+ rb_utf_alloc_using(char *str)
209
+ {
210
+ VALUE rbstr = rb_utf_new(NULL, 0);
211
+ long len = strlen(str);
212
+
213
+ RSTRING(rbstr)->ptr = str;
214
+ RSTRING(rbstr)->aux.capa = len;
215
+ RSTRING(rbstr)->len = len;
216
+ RSTRING(rbstr)->ptr[len] = '\0';
217
+
218
+ return rbstr;
219
+ }
220
+
221
+ VALUE
222
+ rb_utf_dup(VALUE str)
223
+ {
224
+ str = rb_str_dup(str);
225
+ rb_extend_object(str, mUTF8Methods);
226
+ return str;
227
+ }
228
+
229
+ /* TODO: rewrite this using the new offset-calculating functions. */
230
+ long
231
+ rb_utf_index(VALUE str, VALUE sub, long offset)
232
+ {
233
+ long n_chars = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
234
+
235
+ if (offset < 0) {
236
+ offset += n_chars;
237
+
238
+ if (offset < 0)
239
+ return -1;
240
+ }
241
+
242
+ if (n_chars - offset < utf_length(RSTRING(sub)->ptr))
243
+ return -1;
244
+
245
+ if (RSTRING(sub)->len == 0)
246
+ return offset;
247
+
248
+ char *begin = utf_offset_to_pointer(RSTRING(str)->ptr, offset);
249
+ long pos = rb_memsearch(RSTRING(sub)->ptr, RSTRING(sub)->len,
250
+ begin, RSTRING(str)->len - (begin - RSTRING(str)->ptr));
251
+
252
+ if (pos < 0)
253
+ return -1;
254
+
255
+ return offset + utf_pointer_to_offset(begin, begin + pos);
256
+ }
257
+
258
+ long
259
+ rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
260
+ long offset, bool reverse)
261
+ {
262
+ long byte_offset = _utf_offset_to_pointer_validated(s, offset, end) - s;
263
+ long byte_startpos = rb_reg_adjust_startpos(sub, str, byte_offset, reverse);
264
+ long byte_index = rb_reg_search(sub, str, byte_startpos, reverse);
265
+ if (byte_index == -1)
266
+ return -1;
267
+ return utf_pointer_to_offset(s, s + byte_index);
268
+ }
269
+
270
+ void Init_utf8(void);
271
+ void
272
+ Init_utf8(void)
273
+ {
274
+ VALUE mEncoding = rb_define_module("Encoding");
275
+ VALUE mCharacter = rb_define_module_under(mEncoding, "Character");
276
+ VALUE mUTF8 = rb_define_module_under(mCharacter, "UTF8");
277
+
278
+ mUTF8Methods = rb_define_module_under(mUTF8, "Methods");
279
+
280
+ rb_define_module_function(mUTF8, "collate", rb_utf_collate, 2);
281
+ rb_define_module_function(mUTF8, "aref", rb_utf_aref_m, -1);
282
+ rb_define_module_function(mUTF8, "aset", rb_utf_aset_m, -1);
283
+ rb_define_module_function(mUTF8, "casecmp", rb_utf_casecmp, 2);
284
+ rb_define_module_function(mUTF8, "center", rb_utf_center, -1);
285
+ rb_define_module_function(mUTF8, "chomp", rb_utf_chomp, -1);
286
+ rb_define_module_function(mUTF8, "chomp!", rb_utf_chomp_bang, -1);
287
+ rb_define_module_function(mUTF8, "chop", rb_utf_chop, 1);
288
+ rb_define_module_function(mUTF8, "chop!", rb_utf_chop_bang, 1);
289
+ rb_define_module_function(mUTF8, "count", rb_utf_count, -1);
290
+ rb_define_module_function(mUTF8, "delete", rb_utf_delete, -1);
291
+ rb_define_module_function(mUTF8, "delete!", rb_utf_delete_bang, -1);
292
+ rb_define_module_function(mUTF8, "each_char", rb_utf_each_char, 1);
293
+ rb_define_module_function(mUTF8, "index", rb_utf_index_m, -1);
294
+ rb_define_module_function(mUTF8, "insert", rb_utf_insert, 3);
295
+ rb_define_module_function(mUTF8, "lstrip", rb_utf_lstrip, 1);
296
+ rb_define_module_function(mUTF8, "lstrip!", rb_utf_lstrip_bang, 1);
297
+ rb_define_module_function(mUTF8, "rindex", rb_utf_rindex_m, -1);
298
+ rb_define_module_function(mUTF8, "rstrip", rb_utf_rstrip, 1);
299
+ rb_define_module_function(mUTF8, "rstrip!", rb_utf_rstrip_bang, 1);
300
+ rb_define_module_function(mUTF8, "squeeze", rb_utf_squeeze, -1);
301
+ rb_define_module_function(mUTF8, "squeeze!", rb_utf_squeeze_bang, -1);
302
+ rb_define_module_function(mUTF8, "strip", rb_utf_strip, 1);
303
+ rb_define_module_function(mUTF8, "strip!", rb_utf_strip_bang, 1);
304
+ rb_define_module_function(mUTF8, "to_i", rb_utf_to_i, -1);
305
+ rb_define_module_function(mUTF8, "hex", rb_utf_hex, 1);
306
+ rb_define_module_function(mUTF8, "oct", rb_utf_oct, 1);
307
+ rb_define_module_function(mUTF8, "tr", rb_utf_tr, 3);
308
+ rb_define_module_function(mUTF8, "tr_s", rb_utf_tr_s, 3);
309
+
310
+ rb_define_module_function(mUTF8, "downcase", rb_utf_downcase, 1);
311
+ rb_define_module_function(mUTF8, "ljust", rb_utf_ljust, -1);
312
+ rb_define_module_function(mUTF8, "length", rb_utf_length, 1);
313
+ rb_define_module_function(mUTF8, "reverse", rb_utf_reverse, 1);
314
+ rb_define_module_function(mUTF8, "rjust", rb_utf_rjust, -1);
315
+ rb_define_module_function(mUTF8, "upcase", rb_utf_upcase, 1);
316
+
317
+ rb_define_module_function(mUTF8, "foldcase", rb_utf_foldcase, 1);
318
+ rb_define_module_function(mUTF8, "normalize", rb_utf_normalize, -1);
319
+ }
@@ -0,0 +1,216 @@
1
+ /*
2
+ * contents: Unicode handling.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+
8
+ #ifndef UNICODE_H
9
+ #define UNICODE_H
10
+
11
/*
 * CONST_FUNC marks a function declaration with GCC's ‘const’ attribute on
 * compilers newer than GCC 2.4 and expands to nothing everywhere else.
 */
#if defined(__GNUC__) && \
        (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4))
#  define CONST_FUNC __attribute__((__const__))
#else
#  define CONST_FUNC
#endif
17
+
18
/* A single character value, stored in 32 unsigned bits. */
typedef uint32_t unichar;

/* Largest value a unichar can hold. */
#define MAXUNICHAR UINT32_MAX

#define MAX_UNICHAR_BYTE_LENGTH 6

/* Number of code points in the range 0 through 0x10ffff inclusive. */
#define UNICODE_N_CODEPOINTS (0x10ffff + 1)

/* Sentinel unichar returned by a function that was given bad input. */
#define UTF_BAD_INPUT_UNICHAR ((unichar)-1)

/* Sentinel unichar returned by a function that was given incomplete input. */
#define UTF_INCOMPLETE_INPUT_UNICHAR ((unichar)-2)
32
+
33
+
34
/*
 * Classification of a character as reported by unichar_type().  The
 * enumerators are grouped by kind; their numeric values follow
 * declaration order starting from zero.
 */
typedef enum {
        /* Other */
        UNICODE_CONTROL = 0,
        UNICODE_FORMAT,
        UNICODE_UNASSIGNED,
        UNICODE_PRIVATE_USE,
        UNICODE_SURROGATE,

        /* Letters */
        UNICODE_LOWERCASE_LETTER,
        UNICODE_MODIFIER_LETTER,
        UNICODE_OTHER_LETTER,
        UNICODE_TITLECASE_LETTER,
        UNICODE_UPPERCASE_LETTER,

        /* Marks */
        UNICODE_COMBINING_MARK,
        UNICODE_ENCLOSING_MARK,
        UNICODE_NON_SPACING_MARK,

        /* Numbers */
        UNICODE_DECIMAL_NUMBER,
        UNICODE_LETTER_NUMBER,
        UNICODE_OTHER_NUMBER,

        /* Punctuation */
        UNICODE_CONNECT_PUNCTUATION,
        UNICODE_DASH_PUNCTUATION,
        UNICODE_CLOSE_PUNCTUATION,
        UNICODE_FINAL_PUNCTUATION,
        UNICODE_INITIAL_PUNCTUATION,
        UNICODE_OTHER_PUNCTUATION,
        UNICODE_OPEN_PUNCTUATION,

        /* Symbols */
        UNICODE_CURRENCY_SYMBOL,
        UNICODE_MODIFIER_SYMBOL,
        UNICODE_MATH_SYMBOL,
        UNICODE_OTHER_SYMBOL,

        /* Separators */
        UNICODE_LINE_SEPARATOR,
        UNICODE_PARAGRAPH_SEPARATOR,
        UNICODE_SPACE_SEPARATOR
} UnicodeType;
66
+
67
+ bool unichar_isalnum(unichar c);
68
+ bool unichar_isalpha(unichar c);
69
+ bool unichar_iscntrl(unichar c);
70
+ bool unichar_isdigit(unichar c);
71
+ bool unichar_isgraph(unichar c);
72
+ bool unichar_islower(unichar c);
73
+ bool unichar_isprint(unichar c);
74
+ bool unichar_ispunct(unichar c);
75
+ bool unichar_isspace(unichar c);
76
+ bool unichar_isupper(unichar c);
77
+ bool unichar_istitle(unichar c);
78
+ bool unichar_isnewline(unichar c);
79
+ bool unichar_isxdigit(unichar c);
80
+ bool unichar_isassigned(unichar c);
81
+ bool unichar_iswide(unichar c);
82
+ bool unichar_isvalid(unichar c);
83
+
84
+ unichar unichar_toupper(unichar c);
85
+ unichar unichar_tolower(unichar c);
86
+ unichar unichar_totitle(unichar c);
87
+
88
+ int unichar_digit_value(unichar c);
89
+ int unichar_xdigit_value(unichar c);
90
+
91
+ UnicodeType unichar_type(unichar c);
92
+
93
+ int unichar_combining_class(unichar c) CONST_FUNC;
94
+
95
+ bool unichar_mirror(unichar c, unichar *mirrored);
96
+
97
+
98
/*
 * Break classification of a character as reported by
 * unichar_break_type().  Numeric values follow declaration order starting
 * from zero.
 */
typedef enum {
        UNICODE_BREAK_MANDATORY = 0,
        UNICODE_BREAK_CARRIAGE_RETURN,
        UNICODE_BREAK_LINE_FEED,
        UNICODE_BREAK_COMBINING_MARK,
        UNICODE_BREAK_SURROGATE,
        UNICODE_BREAK_ZERO_WIDTH_SPACE,
        UNICODE_BREAK_INSEPARABLE,
        UNICODE_BREAK_NON_BREAKING_GLUE,
        UNICODE_BREAK_CONTINGENT,
        UNICODE_BREAK_SPACE,

        UNICODE_BREAK_AFTER,
        UNICODE_BREAK_BEFORE,
        UNICODE_BREAK_BEFORE_AND_AFTER,
        UNICODE_BREAK_HYPHEN,
        UNICODE_BREAK_NON_STARTER,
        UNICODE_BREAK_OPEN_PUNCTUATION,
        UNICODE_BREAK_CLOSE_PUNCTUATION,
        UNICODE_BREAK_QUOTATION,
        UNICODE_BREAK_EXCLAMATION,
        UNICODE_BREAK_IDEOGRAPHIC,

        UNICODE_BREAK_NUMERIC,
        UNICODE_BREAK_INFIX_SEPARATOR,
        UNICODE_BREAK_SYMBOL,
        UNICODE_BREAK_ALPHABETIC,
        UNICODE_BREAK_PREFIX,
        UNICODE_BREAK_POSTFIX,
        UNICODE_BREAK_COMPLEX_CONTEXT,
        UNICODE_BREAK_AMBIGUOUS,
        UNICODE_BREAK_UNKNOWN,
        UNICODE_BREAK_NEXT_LINE,

        UNICODE_BREAK_WORD_JOINER,
        UNICODE_BREAK_HANGUL_L_JAMO,
        UNICODE_BREAK_HANGUL_V_JAMO,
        UNICODE_BREAK_HANGUL_T_JAMO,
        UNICODE_BREAK_HANGUL_LV_SYLLABLE,
        UNICODE_BREAK_HANGUL_LVT_SYLLABLE
} UnicodeBreakType;
136
+
137
+ UnicodeBreakType unichar_break_type(unichar c);
138
+
139
+
140
/*
 * Normalisation mode accepted by utf_normalize() and utf_normalize_n().
 * There are four distinct values; each descriptive name has an NF*
 * alias sharing its value.
 */
typedef enum {
        NORMALIZE_DEFAULT = 0,
        NORMALIZE_DEFAULT_COMPOSE,
        NORMALIZE_ALL,
        NORMALIZE_ALL_COMPOSE,

        /* Aliases */
        NORMALIZE_NFD = NORMALIZE_DEFAULT,
        NORMALIZE_NFC = NORMALIZE_DEFAULT_COMPOSE,
        NORMALIZE_NFKD = NORMALIZE_ALL,
        NORMALIZE_NFKC = NORMALIZE_ALL_COMPOSE
} NormalizeMode;
150
+
151
+ void unicode_canonical_ordering(unichar *str, size_t len);
152
+ unichar *unicode_canonical_decomposition(unichar c, size_t *result_len);
153
+
154
+ char *utf_normalize(const char *str, NormalizeMode mode);
155
+ char *utf_normalize_n(const char *str, NormalizeMode mode, size_t len);
156
+
157
+
158
+
159
+
160
+ char *utf_upcase(const char *str);
161
+ char *utf_upcase_n(const char *str, size_t len);
162
+ char *utf_downcase(const char *str);
163
+ char *utf_downcase_n(const char *str, size_t len);
164
+ char *utf_foldcase(const char *str);
165
+ char *utf_foldcase_n(const char *str, size_t len);
166
+
167
+ unichar utf_char(const char *str);
168
+ unichar utf_char_n(const char *str, size_t max);
169
+ unichar utf_char_validated(const char *str);
170
+ unichar utf_char_validated_n(const char *str, size_t max);
171
+
172
/*
 * Table indexed by the first byte of an encoded character.  utf_next()
 * adds the looked-up entry to its argument, so each entry is the number
 * of bytes to advance.  Defined in another translation unit.
 */
extern const char * const s_utf_skip_lengths;

/*
 * Pointer to the character following the one at STR.  STR is evaluated
 * twice, so do not pass an expression with side effects.
 */
#define utf_next(str) ((str) + s_utf_skip_lengths[*(const unsigned char *)(str)])

/* Moving between characters and converting offsets to/from pointers. */
char *utf_find_next(const char *p, const char *end);
char *utf_prev(const char *p);
char *utf_find_prev(const char *begin, const char *p);
char *utf_offset_to_pointer(const char *str, long offset);
long utf_pointer_to_offset(const char *str, const char *pos);
179
+
180
+ void utf_copy(char *dest, const char *src);
181
+ void utf_copy_n(char *dest, const char *src, size_t n);
182
+ void utf_append(char *dest, const char *src);
183
+ void utf_append_n(char *dest, const char *src, size_t n);
184
+ int utf_collate(const char *a, const char *b);
185
+ char *utf_collate_key(const char *str);
186
+ char *utf_collate_key_n(const char *str, size_t len);
187
+ int utf_char_index(const char *str, unichar c);
188
+ int utf_char_index_n(const char *str, unichar c, size_t len);
189
+ int utf_char_rindex(const char *str, unichar c);
190
+ int utf_char_rindex_n(const char *str, unichar c, size_t len);
191
+ int utf_index(const char *haystack, const char *needle);
192
+ int utf_index_n(const char *haystack, const char *needle, size_t len);
193
+ int utf_rindex(const char *haystack, const char *needle);
194
+ int utf_rindex_n(const char *haystack, const char *needle, size_t len);
195
+ bool utf_has_prefix(const char *str, const char *prefix);
196
+ long utf_length(const char *str);
197
+ long utf_length_n(const char *str, long len);
198
+ size_t utf_width(const char *str);
199
+ size_t utf_width_n(const char *str, size_t len);
200
+ size_t utf_byte_length(const char *str);
201
+ char *utf_reverse(const char *str);
202
+ char *utf_reverse_n(const char *str, size_t len);
203
+
204
+ bool utf_isvalid(const char *str);
205
+ bool utf_isvalid_n(const char *str, size_t max, const char **end);
206
+
207
+ /* XXX: should probably name stuff utf32 instead of ucs4 */
208
+ int unichar_to_utf(unichar c, char *result);
209
+ char *ucs4_to_utf8(unichar *str, size_t *items_read, size_t *items_written);
210
+ char *ucs4_to_utf8_n(unichar *str, size_t len, size_t *items_read, size_t *items_written);
211
+ unichar *utf8_to_ucs4_fast(const char *str, size_t *items_written);
212
+ unichar *utf8_to_ucs4_fast_n(const char *str, size_t len, size_t *items_written);
213
+ unichar *utf8_to_ucs4(const char *str, size_t *items_read, size_t *items_written);
214
+ unichar *utf8_to_ucs4_n(const char *str, int len, size_t *items_read, size_t *items_written);
215
+
216
+ #endif /* UNICODE_H */