character-encodings 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,64 @@
1
+ break.o: break.c unicode.h data/break.h
2
+ decompose.o: decompose.c unicode.h private.h data/decompose.h \
3
+ data/compose.h
4
+ properties.o: properties.c unicode.h private.h data/character-tables.h
5
+ rb_utf_aref.o: rb_utf_aref.c rb_includes.h unicode.h private.h \
6
+ rb_methods.h
7
+ rb_utf_aset.o: rb_utf_aset.c rb_includes.h unicode.h private.h \
8
+ rb_methods.h
9
+ rb_utf_casecmp.o: rb_utf_casecmp.c rb_includes.h unicode.h private.h \
10
+ rb_methods.h
11
+ rb_utf_chomp.o: rb_utf_chomp.c rb_includes.h unicode.h private.h \
12
+ rb_methods.h
13
+ rb_utf_chop.o: rb_utf_chop.c rb_includes.h unicode.h private.h \
14
+ rb_methods.h
15
+ rb_utf_collate.o: rb_utf_collate.c rb_includes.h unicode.h private.h \
16
+ rb_methods.h
17
+ rb_utf_count.o: rb_utf_count.c rb_includes.h unicode.h private.h \
18
+ rb_methods.h rb_utf_internal_tr.h
19
+ rb_utf_delete.o: rb_utf_delete.c rb_includes.h unicode.h private.h \
20
+ rb_methods.h rb_utf_internal_tr.h
21
+ rb_utf_downcase.o: rb_utf_downcase.c rb_includes.h unicode.h private.h \
22
+ rb_methods.h
23
+ rb_utf_each_char.o: rb_utf_each_char.c rb_includes.h unicode.h private.h \
24
+ rb_methods.h
25
+ rb_utf_foldcase.o: rb_utf_foldcase.c rb_includes.h unicode.h private.h \
26
+ rb_methods.h
27
+ rb_utf_hex.o: rb_utf_hex.c rb_includes.h unicode.h private.h rb_methods.h \
28
+ rb_utf_internal_bignum.h
29
+ rb_utf_index.o: rb_utf_index.c rb_includes.h unicode.h private.h \
30
+ rb_methods.h
31
+ rb_utf_insert.o: rb_utf_insert.c rb_includes.h unicode.h private.h \
32
+ rb_methods.h
33
+ rb_utf_internal_bignum.o: rb_utf_internal_bignum.c rb_includes.h \
34
+ unicode.h private.h rb_methods.h rb_utf_internal_bignum.h
35
+ rb_utf_internal_tr.o: rb_utf_internal_tr.c rb_includes.h unicode.h \
36
+ private.h rb_methods.h rb_utf_internal_tr.h
37
+ rb_utf_justify.o: rb_utf_justify.c rb_includes.h unicode.h private.h \
38
+ rb_methods.h
39
+ rb_utf_length.o: rb_utf_length.c rb_includes.h unicode.h private.h \
40
+ rb_methods.h
41
+ rb_utf_lstrip.o: rb_utf_lstrip.c rb_includes.h unicode.h private.h \
42
+ rb_methods.h
43
+ rb_utf_normalize.o: rb_utf_normalize.c rb_includes.h unicode.h private.h \
44
+ rb_methods.h
45
+ rb_utf_oct.o: rb_utf_oct.c rb_includes.h unicode.h private.h rb_methods.h \
46
+ rb_utf_internal_bignum.h
47
+ rb_utf_reverse.o: rb_utf_reverse.c rb_includes.h unicode.h private.h \
48
+ rb_methods.h
49
+ rb_utf_rindex.o: rb_utf_rindex.c rb_includes.h unicode.h private.h \
50
+ rb_methods.h
51
+ rb_utf_rstrip.o: rb_utf_rstrip.c rb_includes.h unicode.h private.h \
52
+ rb_methods.h
53
+ rb_utf_squeeze.o: rb_utf_squeeze.c rb_includes.h unicode.h private.h \
54
+ rb_methods.h rb_utf_internal_tr.h
55
+ rb_utf_strip.o: rb_utf_strip.c rb_includes.h unicode.h private.h \
56
+ rb_methods.h
57
+ rb_utf_to_i.o: rb_utf_to_i.c rb_includes.h unicode.h private.h \
58
+ rb_methods.h rb_utf_internal_bignum.h
59
+ rb_utf_tr.o: rb_utf_tr.c rb_includes.h unicode.h private.h rb_methods.h \
60
+ rb_utf_internal_tr.h
61
+ rb_utf_upcase.o: rb_utf_upcase.c rb_includes.h unicode.h private.h \
62
+ rb_methods.h
63
+ unicode.o: unicode.c unicode.h private.h rb_methods.h
64
+ utf.o: utf.c unicode.h private.h
@@ -0,0 +1,47 @@
1
+ require 'mkmf'
2
+
3
# Probe whether the compiler accepts the option +opt+.  When the probe
# compiles, the option is appended to $CFLAGS.  Yields true or false to
# checking_for so that mkmf reports the outcome.
def try_compiler_option(opt, &b)
  checking_for "‘#{opt}’ option to compiler" do
    accepted = try_compile('', opt, &b) ? true : false
    $CFLAGS += " #{opt}" if accepted
    accepted
  end
end
13
+
14
+ try_compiler_option('-std=c99')
15
+ try_compiler_option('-Wall')
16
+ try_compiler_option('-Wextra')
17
+ try_compiler_option('-Wwrite-strings')
18
+ try_compiler_option('-Waggregate-return')
19
+ try_compiler_option('-Wmissing-prototypes')
20
+ try_compiler_option('-Wmissing-declarations')
21
+ try_compiler_option('-Wnested-externs')
22
+ try_compiler_option('-Wundef')
23
+ try_compiler_option('-Wpointer-arith')
24
+ try_compiler_option('-Wcast-align')
25
+ try_compiler_option('-Werror')
26
+ # XXX: sadly, -Wshadow is a bit too strict. It will, for example, whine about
27
+ # local variables called “index” on FreeBSD.
28
+ # try_compiler_option('-Wshadow')
29
+ # XXX: This is also too strict.
30
+ # try_compiler_option('-Wconversion')
31
+
32
+ have_header('assert.h')
33
+ have_header('limits.h')
34
+ have_header('locale.h')
35
+ have_header('stdbool.h')
36
+ have_header('stddef.h')
37
+ have_header('stdint.h')
38
+ have_header('stdio.h')
39
+ have_header('stdlib.h')
40
+ have_header('string.h')
41
+ have_header('sys/types.h')
42
+ have_header('wchar.h')
43
+
44
+ $INSTALLFILES ||= []
45
+ $INSTALLFILES << ['unicode.h', '$(RUBYARCHDIR)', 'lib']
46
+
47
+ create_makefile('encoding/character/utf-8/utf8')
@@ -0,0 +1,68 @@
1
+ /*
2
+ * contents: Private Unicode related information.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+ #ifndef PRIVATE_H
8
+ #define PRIVATE_H
9
+
10
+ #define NUL '\0'
11
+ #define lengthof(ary) (sizeof(ary) / sizeof((ary)[0]))
12
+
13
/*
 * Compiler-attribute helpers.
 *
 * UNUSED(u) marks the declarator ‘u’ as intentionally unused.
 * HIDDEN is appended to a prototype to give the symbol hidden visibility.
 *
 * Both expand to (almost) nothing on compilers without GNU attributes.
 * HIDDEN must be an object-like macro in both branches, since every use in
 * this header is the bare token ‘HIDDEN’; a function-like definition would
 * not be expanded there and the prototypes would fail to compile.
 */
#if defined(__GNUC__)
#  define UNUSED(u) \
        u __attribute__((__unused__))
#  define HIDDEN \
        __attribute__((visibility("hidden")))
#else
#  define UNUSED(u) \
        u
#  define HIDDEN
#endif
23
+
24
+ unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
25
+ NormalizeMode mode) HIDDEN;
26
+ inline int _unichar_combining_class(unichar c) HIDDEN;
27
+
28
+ void need_at_least_n_arguments(int argc, int n) HIDDEN;
29
+
30
+ unichar _utf_char_validated(char const *const str,
31
+ char const *const str_end) HIDDEN;
32
+ char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
33
+ const char *limit, bool noisy) HIDDEN;
34
+
35
+ char *_utf_offset_to_pointer_validated(const char *str, long offset,
36
+ const char *end) HIDDEN;
37
+
38
+ char *_utf_offset_to_pointer_failable(const char *str, long offset,
39
+ const char *end) HIDDEN;
40
+
41
+ VALUE rb_utf_new(const char *str, long len) HIDDEN;
42
+
43
+ VALUE rb_utf_new2(const char *str) HIDDEN;
44
+
45
+ VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
46
+
47
+ VALUE rb_utf_alloc_using(char *str) HIDDEN;
48
+
49
+ VALUE rb_utf_dup(VALUE str) HIDDEN;
50
+
51
+ long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
52
+
53
+ bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
54
+ char **limit) HIDDEN;
55
+
56
+ void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
57
+ char **limit) HIDDEN;
58
+
59
+ char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
60
+
61
+ VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
62
+
63
+ char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
64
+
65
+ long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
66
+ long offset, bool reverse) HIDDEN;
67
+
68
+ #endif /* PRIVATE_H */
@@ -0,0 +1,1061 @@
1
+ /*
2
+ * contents: Unicode character properties.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+ #include <ruby.h>
8
+ #include <assert.h>
9
+ #include <locale.h>
10
+ #include <stdbool.h>
11
+ #include <stddef.h>
12
+ #include <stdint.h>
13
+ #include <string.h>
14
+ #include "unicode.h"
15
+ #include "private.h"
16
+ #include "data/character-tables.h"
17
+
18
+
19
+ #define COMBINING_DOT_ABOVE ((unichar)0x0307)
20
+ #define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE ((unichar)0x0130)
21
+ #define COMBINING_GREEK_YPOGEGRAMMENI ((unichar)0x0345)
22
+ #define GREEK_CAPITAL_LETTER_IOTA ((unichar)0x0399)
23
+ #define LATIN_SMALL_LETTER_I ((unichar)0x0069)
24
+ #define LATIN_SMALL_LETTER_DOTLESS_I ((unichar)0x0131)
25
+ #define LATIN_CAPITAL_LETTER_I_WITH_GRAVE ((unichar)0x00cc)
26
+ #define LATIN_CAPITAL_LETTER_I_WITH_ACUTE ((unichar)0x00cd)
27
+ #define LATIN_CAPITAL_LETTER_I_WITH_TILDE ((unichar)0x0128)
28
+ #define LATIN_CAPITAL_LETTER_I_WITH_OGONEK ((unichar)0x012e)
29
+ #define COMBINING_GRAVE_ACCENT ((unichar)0x0300)
30
+ #define COMBINING_ACUTE_ACCENT ((unichar)0x0301)
31
+ #define COMBINING_TILDE ((unichar)0x0303)
32
+ #define GREEK_CAPITAL_LETTER_SIGMA ((unichar)0x03a3)
33
+ #define GREEK_SMALL_LETTER_SIGMA ((unichar)0x03c3)
34
+ #define GREEK_SMALL_LETTER_FINAL_SIGMA ((unichar)0x03c2)
35
+
36
+ #define OFFSET_IF(buf, len) (((buf) != NULL) ? (buf) + (len) : NULL)
37
+
38
+ /* {{{1
39
+ * Macros for accessing the Unicode character attribute table.
40
+ *
41
+ * TODO: Turn these macros into full-fledged functions, as this is rather silly
42
+ * when we have ‹inline› in C99.
43
+ */
44
+ #define ATTR_TABLE(page) \
45
+ (((page) <= UNICODE_LAST_PAGE_PART1) \
46
+ ? attr_table_part1[page] \
47
+ : attr_table_part2[(page) - 0xe00])
48
+
49
+ #define ATTTABLE(page, char) \
50
+ ((ATTR_TABLE(page) == UNICODE_MAX_TABLE_INDEX) \
51
+ ? 0 : (attr_data[ATTR_TABLE(page)][char]))
52
+
53
+
54
+ /* {{{1
55
+ * Internal function used for figuring out the type of a given character.
56
+ */
57
+ static inline int
58
+ s_type(unichar c)
59
+ {
60
+ const int16_t *table;
61
+ unsigned int page;
62
+
63
+ if (c <= UNICODE_LAST_CHAR_PART1) {
64
+ page = c >> 8;
65
+ table = type_table_part1;
66
+ } else if (c >= 0xe0000 && c <= UNICODE_LAST_CHAR) {
67
+ page = (c - 0xe0000) >> 8;
68
+ table = type_table_part2;
69
+ } else {
70
+ return UNICODE_UNASSIGNED;
71
+ }
72
+
73
+ if (table[page] >= UNICODE_MAX_TABLE_INDEX)
74
+ return table[page] - UNICODE_MAX_TABLE_INDEX;
75
+ else
76
+ return type_data[table[page]][c & 0xff];
77
+ }
78
+
79
+
80
+ /* {{{1
81
+ * Bit-fiddling macros for testing the class of a type.
82
+ */
83
+ #define IS(type, class) (((unsigned int)1 << (type)) & (class))
84
+ #define OR(type, rest) (((unsigned int)1 << (type)) | (rest))
85
+
86
+
87
+ /* {{{1
88
+ * Internal function used to check if the given type represents a digit type.
89
+ */
90
+ static inline bool
91
+ s_isdigit(int type)
92
+ {
93
+ return IS(type,
94
+ OR(UNICODE_DECIMAL_NUMBER,
95
+ OR(UNICODE_LETTER_NUMBER,
96
+ OR(UNICODE_OTHER_NUMBER, 0))));
97
+ }
98
+
99
+
100
+ /* {{{1
101
+ * Internal function used to check if the given type represents an alphabetic
102
+ * type.
103
+ */
104
+ static inline bool
105
+ s_isalpha(int type)
106
+ {
107
+ return IS(type,
108
+ OR(UNICODE_LOWERCASE_LETTER,
109
+ OR(UNICODE_UPPERCASE_LETTER,
110
+ OR(UNICODE_TITLECASE_LETTER,
111
+ OR(UNICODE_MODIFIER_LETTER,
112
+ OR(UNICODE_OTHER_LETTER, 0))))));
113
+ }
114
+
115
+
116
+ /* {{{1
117
+ * Internal function used to check if the given type represents a mark type.
118
+ */
119
+ static inline bool
120
+ s_ismark(int type)
121
+ {
122
+ return IS(type,
123
+ OR(UNICODE_NON_SPACING_MARK,
124
+ OR(UNICODE_COMBINING_MARK,
125
+ OR(UNICODE_ENCLOSING_MARK, 0))));
126
+ }
127
+
128
+
129
+ /* {{{1
130
+ * Determine whether ‘c’ is an alphanumeric, such as A, B, C, 0, 1, or 2.
131
+ */
132
+ bool
133
+ unichar_isalnum(unichar c)
134
+ {
135
+ int type = s_type(c);
136
+
137
+ return s_isdigit(type) || s_isalpha(type);
138
+ }
139
+
140
+
141
+ /* {{{1
142
+ * Determine whether ‘c’ is an alphabetic (i.e. a letter), such as A, B, or C.
143
+ */
144
+ bool
145
+ unichar_isalpha(unichar c)
146
+ {
147
+ return s_isalpha(s_type(c));
148
+ }
149
+
150
+
151
+ /* {{{1
152
+ * Determine whether ‘c’ is a control character, such as ‹NUL›.
153
+ */
154
+ bool
155
+ unichar_iscntrl(unichar c)
156
+ {
157
+ return s_type(c) == UNICODE_CONTROL;
158
+ }
159
+
160
+
161
+ /* {{{1
162
+ * Determine whether ‘c’ is a digit, such as 0, 1, or 2.
163
+ */
164
+ bool
165
+ unichar_isdigit(unichar c)
166
+ {
167
+ return s_type(c) == UNICODE_DECIMAL_NUMBER;
168
+ }
169
+
170
+
171
+ /* {{{1
172
+ * Determine whether ‘c’ is printable and not a space or control character such
173
+ * as tab or <NUL›, such as A, B, or C.
174
+ */
175
+ bool
176
+ unichar_isgraph(unichar c)
177
+ {
178
+ return !IS(s_type(c),
179
+ OR(UNICODE_CONTROL,
180
+ OR(UNICODE_FORMAT,
181
+ OR(UNICODE_UNASSIGNED,
182
+ OR(UNICODE_PRIVATE_USE,
183
+ OR(UNICODE_SURROGATE,
184
+ OR(UNICODE_SPACE_SEPARATOR, 0)))))));
185
+ }
186
+
187
+
188
+ /* {{{1
189
+ * Determine whether ‘c’ is a lowercase letter, such as a, b, or c.
190
+ */
191
+ bool
192
+ unichar_islower(unichar c)
193
+ {
194
+ return s_type(c) == UNICODE_LOWERCASE_LETTER;
195
+ }
196
+
197
+
198
+ /* {{{1
199
+ * Determine whether ‘c’ is printable, which works the same as
200
+ * unichar_isgraph(), except that space characters are also printable.
201
+ */
202
+ bool
203
+ unichar_isprint(unichar c)
204
+ {
205
+ return !IS(s_type(c),
206
+ OR(UNICODE_CONTROL,
207
+ OR(UNICODE_FORMAT,
208
+ OR(UNICODE_UNASSIGNED,
209
+ OR(UNICODE_PRIVATE_USE,
210
+ OR(UNICODE_SURROGATE, 0))))));
211
+ }
212
+
213
+
214
+ /* {{{1
215
+ * Determine whether ‘c’ is some form of punctuation or other symbol.
216
+ */
217
+ bool
218
+ unichar_ispunct(unichar c)
219
+ {
220
+ return IS(s_type(c),
221
+ OR(UNICODE_CONNECT_PUNCTUATION,
222
+ OR(UNICODE_DASH_PUNCTUATION,
223
+ OR(UNICODE_OPEN_PUNCTUATION,
224
+ OR(UNICODE_CLOSE_PUNCTUATION,
225
+ OR(UNICODE_INITIAL_PUNCTUATION,
226
+ OR(UNICODE_FINAL_PUNCTUATION,
227
+ OR(UNICODE_OTHER_PUNCTUATION,
228
+ OR(UNICODE_MODIFIER_SYMBOL,
229
+ OR(UNICODE_MATH_SYMBOL,
230
+ OR(UNICODE_CURRENCY_SYMBOL,
231
+ OR(UNICODE_OTHER_SYMBOL, 0)))))))))))) ? true : false;
232
+ }
233
+
234
+
235
+ /* {{{1
236
+ * Determine whether ‘c’ is some form of whitespace, such as space, tab or a
237
+ * line separator (newline, carriage return, etc.).
238
+ */
239
+ bool
240
+ unichar_isspace(unichar c)
241
+ {
242
+ switch (c) {
243
+ case '\t':
244
+ case '\n':
245
+ case '\r':
246
+ case '\f':
247
+ return true;
248
+ default:
249
+ return IS(s_type(c),
250
+ OR(UNICODE_SPACE_SEPARATOR,
251
+ OR(UNICODE_LINE_SEPARATOR,
252
+ OR(UNICODE_PARAGRAPH_SEPARATOR, 0)))) ? true : false;
253
+ }
254
+ }
255
+
256
+
257
+ /* {{{1
258
+ * Determine whether ‘c’ is an uppeercase letter, such as A, B, or C
259
+ */
260
+ bool
261
+ unichar_isupper(unichar c)
262
+ {
263
+ return s_type(c) == UNICODE_UPPERCASE_LETTER;
264
+ }
265
+
266
+
267
+ /* {{{1
268
+ * Determine whether ‘c’ is a titlecase letter, such as the slavic digraph DZ,
269
+ * which at the beginning of a word is written as Dz, where only the initial D
270
+ * is capitalized. (Complicated huh?)
271
+ */
272
+ bool
273
+ unichar_istitle(unichar c)
274
+ {
275
+ /* TODO: binary search helpful? */
276
+ for (size_t i = 0; i < lengthof(title_table); i++)
277
+ if (title_table[i][0] == c)
278
+ return true;
279
+
280
+ return false;
281
+ }
282
+
283
+
284
+ /* {{{1
285
+ * Determine whether ‘c’ is a new-line.
286
+ */
287
+ #define UNICHAR_NEXT_LINE ((unichar)0x0085)
288
+ #define UNICHAR_LINE_SEPARATOR ((unichar)0x2028)
289
+ #define UNICHAR_PARAGRAPH_SEPARATOR ((unichar)0x2029)
290
+
291
+ bool
292
+ unichar_isnewline(unichar c)
293
+ {
294
+ switch (c) {
295
+ case '\n': case '\f': case '\r': case UNICHAR_NEXT_LINE:
296
+ case UNICHAR_LINE_SEPARATOR: case UNICHAR_PARAGRAPH_SEPARATOR:
297
+ return true;
298
+ default:
299
+ return false;
300
+ }
301
+ }
302
+
303
+ /* {{{1
304
+ * Determine whether ‘c’ is a hexadecimal digit, such as 0, 1, ..., 9, a, b,
305
+ * ..., f, or A, B, ..., F.
306
+ */
307
+ #define UNICHAR_FULLWIDTH_A 0xff21
308
+ #define UNICHAR_FULLWIDTH_F 0xff26
309
+ #define UNICHAR_FULLWIDTH_a 0xff41
310
+ #define UNICHAR_FULLWIDTH_f 0xff46
311
+ bool
312
+ unichar_isxdigit(unichar c)
313
+ {
314
+ return ((c >= 'a' && c <= 'f') ||
315
+ (c >= 'A' && c <= 'F') ||
316
+ (c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f) ||
317
+ (c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F) ||
318
+ (s_type(c) == UNICODE_DECIMAL_NUMBER));
319
+ // s_isdigit(s_type(c)));
320
+ }
321
+
322
+
323
+ /* {{{1
324
+ * Determine whether code point ‘c’ has been assigned a code value.
325
+ */
326
+ bool
327
+ unichar_isassigned(unichar c)
328
+ {
329
+ return s_type(c) != UNICODE_UNASSIGNED;
330
+ }
331
+
332
+
333
+ /* {{{1
334
+ * Determine whether ‘c’ is a wide character, thus is typically rendered in a
335
+ * double-width cell on a terminal.
336
+ */
337
+ bool
338
+ unichar_iswide(unichar c)
339
+ {
340
+ if (c < 0x1100)
341
+ return false;
342
+
343
+ return (c <= 0x115f || /* Hangul Jamo init. consonants */
344
+ c == 0x2329 || c == 0x232a || /* angle brackets */
345
+ (c >= 0x2e80 && c <= 0xa4cf && /* CJK ... Yi */
346
+ (c < 0x302a || c > 0x302f) &&
347
+ c != 0x303f && c != 0x3099 && c != 0x309a) ||
348
+ (c >= 0xac00 && c <= 0xd7a3) || /* Hangul syllables */
349
+ (c >= 0xf900 && c <= 0xfaff) || /* CJK comp. graphs */
350
+ (c >= 0xfe30 && c <= 0xfe6f) || /* CJK comp. forms */
351
+ (c >= 0xff00 && c <= 0xff60) || /* fullwidth forms */
352
+ (c >= 0xffe0 && c <= 0xffe6) || /* -"- */
353
+ (c >= 0x20000 && c <= 0x2fffd) || /* CJK extra stuff */
354
+ (c >= 0x30000 && c <= 0x3fffd)); /* -"- */
355
+ }
356
+
357
+
358
+ /* {{{1
359
+ * Convert ‘c’ to its uppercase representation (if any).
360
+ */
361
+ static unichar
362
+ special_case_table_lookup(unichar c)
363
+ {
364
+ unichar tv = ATTTABLE(c >> 8, c & 0xff);
365
+
366
+ if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
367
+ return utf_char(special_case_table +
368
+ tv - UNICODE_SPECIAL_CASE_TABLE_START);
369
+
370
+ if (tv == '\0')
371
+ return c;
372
+
373
+ return tv;
374
+ }
375
+
376
+ static unichar
377
+ titlecase_table_lookup(unichar c, bool want_upper)
378
+ {
379
+ for (size_t i = 0; i < lengthof(title_table); i++)
380
+ if (title_table[i][0] == c)
381
+ return title_table[i][want_upper ? 1 : 2];
382
+
383
+ return c;
384
+ }
385
+
386
+ unichar
387
+ unichar_toupper(unichar c)
388
+ {
389
+ int type = s_type(c);
390
+
391
+ if (type == UNICODE_LOWERCASE_LETTER)
392
+ return special_case_table_lookup(c);
393
+
394
+ if (type == UNICODE_TITLECASE_LETTER)
395
+ return titlecase_table_lookup(c, true);
396
+
397
+ return c;
398
+ }
399
+
400
+
401
+ /* {{{1
402
+ * Convert ‘c’ to its lowercase representation (if any).
403
+ */
404
+ unichar
405
+ unichar_tolower(unichar c)
406
+ {
407
+ int type = s_type(c);
408
+
409
+ if (type == UNICODE_UPPERCASE_LETTER)
410
+ return special_case_table_lookup(c);
411
+
412
+ if (type == UNICODE_TITLECASE_LETTER)
413
+ return titlecase_table_lookup(c, false);
414
+
415
+ return c;
416
+ }
417
+
418
+
419
+ /* {{{1
420
+ * Convert ‘c’ to its titlecase representation (if any).
421
+ */
422
+ unichar
423
+ unichar_totitle(unichar c)
424
+ {
425
+ for (size_t i = 0; i < lengthof(title_table); i++)
426
+ if (title_table[i][0] == c ||
427
+ title_table[i][1] == c ||
428
+ title_table[i][2] == c)
429
+ return title_table[i][0];
430
+
431
+ if (s_type(c) == UNICODE_LOWERCASE_LETTER)
432
+ return ATTTABLE(c >> 8, c & 0xff);
433
+
434
+ return c;
435
+ }
436
+
437
+
438
+ /* {{{1
439
+ * Return the numeric value of ‘c’ if it's a decimal digit, or -1 if not.
440
+ */
441
+ int
442
+ unichar_digit_value(unichar c)
443
+ {
444
+ if (s_type(c) == UNICODE_DECIMAL_NUMBER)
445
+ return ATTTABLE(c >> 8, c & 0xff);
446
+
447
+ return -1;
448
+ }
449
+
450
+
451
+ /* {{{1
452
+ * Return the numeric value of ‘c’ if it's a hexadecimal digit, or -1 if not.
453
+ */
454
+ int
455
+ unichar_xdigit_value(unichar c)
456
+ {
457
+ if (c >= 'a' && c <= 'f')
458
+ return c - 'a' + 10;
459
+ else if (c >= 'A' && c <= 'F')
460
+ return c - 'A' + 10;
461
+ else if (c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f)
462
+ return c - UNICHAR_FULLWIDTH_a + 10;
463
+ else if (c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F)
464
+ return c - UNICHAR_FULLWIDTH_A + 10;
465
+ else
466
+ return unichar_digit_value(c);
467
+ }
468
+
469
+
470
+ /* {{{1
471
+ * Determine the Unicode character type of ‘c’.
472
+ */
473
+ UnicodeType
474
+ unichar_type(unichar c)
475
+ {
476
+ return s_type(c);
477
+ }
478
+
479
+
480
/* {{{1
 * LocaleType: This ‹enum› is used for dealing with different locales for
 * turning strings into uppercase or lowercase.
 */
typedef enum {
        LOCALE_NORMAL,
        LOCALE_TURKIC,
        LOCALE_LITHUANIAN
} LocaleType;


/* {{{1
 * Retrieve the locale type from the current LC_CTYPE locale.
 *
 * Locale names starting with “az” or “tr” select LOCALE_TURKIC, names
 * starting with “lt” select LOCALE_LITHUANIAN, and everything else (including
 * the case where the locale cannot be queried) selects LOCALE_NORMAL.
 */
static LocaleType
get_locale_type(void)
{
        const char *locale = setlocale(LC_CTYPE, NULL);

        /* setlocale() may return NULL on failure; don't dereference it. */
        if (locale == NULL)
                return LOCALE_NORMAL;

        /* Reading locale[1] is safe: when locale[0] matches a letter, the
         * string has at least one more byte (possibly the terminator). */
        if ((locale[0] == 'a' && locale[1] == 'z') ||
            (locale[0] == 't' && locale[1] == 'r'))
                return LOCALE_TURKIC;

        if (locale[0] == 'l' && locale[1] == 't')
                return LOCALE_LITHUANIAN;

        return LOCALE_NORMAL;
}
508
+
509
+
510
+ /* {{{1
511
+ * Put character marks found in ‘p_inout’ into itself. If ‘remove_dot’ is
512
+ * true, remove the dot over an uppercase I for a turkish locale.
513
+ */
514
+ static size_t
515
+ output_marks(const char **p_inout, char *buf, bool remove_dot)
516
+ {
517
+ size_t len = 0;
518
+ const char *p = *p_inout;
519
+
520
+ for ( ; *p != '\0'; p = utf_next(p)) {
521
+ unichar c = utf_char(p);
522
+
523
+ if (!s_ismark(s_type(c)))
524
+ break;
525
+
526
+ if (!remove_dot || c != COMBINING_DOT_ABOVE)
527
+ len += unichar_to_utf(c, (buf != NULL) ? buf + len : NULL);
528
+ }
529
+
530
+ *p_inout = p;
531
+
532
+ return len;
533
+ }
534
+
535
+ /* {{{1
536
+ * Output titlecases where appropriate.
537
+ */
538
+ static size_t
539
+ output_special_case(char *buf, int offset, int type, bool upper)
540
+ {
541
+ const char *p = special_case_table + offset;
542
+
543
+ if (type != UNICODE_TITLECASE_LETTER)
544
+ p = utf_next(p);
545
+
546
+ if (upper)
547
+ p += utf_byte_length(p) + 1;
548
+
549
+ size_t len = utf_byte_length(p);
550
+
551
+ if (buf != NULL)
552
+ memcpy(buf, p, len);
553
+
554
+ return len;
555
+ }
556
+
557
+ /* {{{1
558
+ * Do uppercasing of ‘p’ for Lithuanian locales.
559
+ */
560
+ static size_t
561
+ remove_all_combining_dot_above(unichar c, char *buf)
562
+ {
563
+ size_t decomp_len;
564
+ unichar *decomp = unicode_canonical_decomposition(c, &decomp_len);
565
+
566
+ size_t len = 0;
567
+ for (size_t i = 0; i < decomp_len; i++)
568
+ if (decomp[i] != COMBINING_DOT_ABOVE)
569
+ len += unichar_to_utf(unichar_toupper(decomp[i]),
570
+ OFFSET_IF(buf, len));
571
+
572
+ free(decomp);
573
+
574
+ return len;
575
+ }
576
+
577
+ static size_t
578
+ real_toupper_lithuanian(const char **p, unichar c, int type, char *buf,
579
+ bool *was_i)
580
+ {
581
+ if (c == 'i') {
582
+ *was_i = true;
583
+ return 0;
584
+ }
585
+
586
+ if (*was_i) {
587
+ size_t len = remove_all_combining_dot_above(c, buf);
588
+ return len + output_marks(p, (buf != NULL) ? buf + len : NULL,
589
+ true);
590
+ }
591
+
592
+ if (!s_ismark(type))
593
+ *was_i = false;
594
+
595
+ return 0;
596
+ }
597
+
598
+ /* {{{1
599
+ * Do real upcasing. */
600
+ static inline size_t
601
+ real_do_toupper(unichar c, int type, char *buf)
602
+ {
603
+ bool upper = (type != UNICODE_LOWERCASE_LETTER);
604
+ unichar tv = ATTTABLE(c >> 8, c & 0xff);
605
+
606
+ if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
607
+ return output_special_case(buf,
608
+ tv - UNICODE_SPECIAL_CASE_TABLE_START,
609
+ type, upper);
610
+
611
+ /* TODO: this should really use titlecase_table_lookup somehow. */
612
+ if (type == UNICODE_TITLECASE_LETTER)
613
+ for (size_t i = 0; i < lengthof(title_table); i++)
614
+ if (title_table[i][0] == c)
615
+ return unichar_to_utf(title_table[i][1], buf);
616
+
617
+ return unichar_to_utf(tv != '\0' ? tv : c, buf);
618
+ }
619
+
620
+ /* {{{1
621
+ * Do real uppercasing of ‘str’.
622
+ */
623
+ static size_t
624
+ real_toupper_one(const char **p, const char *prev, char *buf,
625
+ LocaleType locale_type, bool *was_i)
626
+ {
627
+ unichar c = utf_char(prev);
628
+ int type = s_type(c);
629
+
630
+ if (locale_type == LOCALE_LITHUANIAN) {
631
+ size_t len = real_toupper_lithuanian(p, c, type, buf, was_i);
632
+ if (len > 0)
633
+ return len;
634
+ }
635
+
636
+ if (locale_type == LOCALE_TURKIC && c == 'i')
637
+ return unichar_to_utf(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
638
+ buf);
639
+
640
+ if (c == COMBINING_GREEK_YPOGEGRAMMENI) {
641
+ /* Nasty, need to move it after other combining marks...this
642
+ * would go away if we normalized first. */
643
+ /* TODO: don’t we need to make sure we don’t go beyond the end
644
+ * of ‘p’? */
645
+ size_t len = output_marks(p, buf, false);
646
+ return len + unichar_to_utf(GREEK_CAPITAL_LETTER_IOTA,
647
+ OFFSET_IF(buf, len));
648
+ }
649
+
650
+ if (IS(type, OR(UNICODE_LOWERCASE_LETTER,
651
+ OR(UNICODE_TITLECASE_LETTER, 0))))
652
+ return real_do_toupper(c, type, buf);
653
+
654
+ size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
655
+
656
+ if (buf != NULL)
657
+ memcpy(buf, prev, len);
658
+
659
+ return len;
660
+ }
661
+
662
+ static size_t
663
+ real_toupper(const char *str, size_t max, bool use_max, char *buf,
664
+ LocaleType locale_type)
665
+ {
666
+ const char *p = str;
667
+ size_t len = 0;
668
+ bool p_was_i = false;
669
+
670
+ while ((!use_max || p < str + max) && *p != '\0') {
671
+ const char *prev = p;
672
+ p = utf_next(p);
673
+
674
+ len += real_toupper_one(&p, prev, OFFSET_IF(buf, len),
675
+ locale_type, &p_was_i);
676
+ }
677
+
678
+ return len;
679
+ }
680
+
681
+ /* {{{1
682
+ * Wrapper around real_toupper() for dealing with memory allocation and such.
683
+ */
684
+ static char *
685
+ utf_upcase_impl(const char *str, size_t max, bool use_max)
686
+ {
687
+ assert(str != NULL);
688
+
689
+ LocaleType locale_type = get_locale_type();
690
+
691
+ size_t len = real_toupper(str, max, use_max, NULL, locale_type);
692
+ char *result = ALLOC_N(char, len + 1);
693
+ real_toupper(str, max, use_max, result, locale_type);
694
+ result[len] = '\0';
695
+
696
+ return result;
697
+ }
698
+
699
+
700
/* {{{1
 * Convert all characters in the NUL-terminated string ‘str’ to their
 * uppercase representation where applicable.  Returns the freshly allocated
 * result.
 */
char *
utf_upcase(const char *str)
{
        char *upcased = utf_upcase_impl(str, 0, false);

        return upcased;
}
709
+
710
+
711
/* {{{1
 * Convert all characters in ‘str’ to their uppercase representation where
 * applicable, looking at no more than ‘len’ bytes from ‘str’.  Returns the
 * freshly allocated result.
 */
char *
utf_upcase_n(const char *str, size_t len)
{
        char *upcased = utf_upcase_impl(str, len, true);

        return upcased;
}
721
+
722
+
723
/* {{{1
 * Walk ‘str’ looking for a character with combining class 230.  Returns true
 * as soon as one is found, and false when a character with combining class 0
 * or the end of the string is reached first.
 */
static bool
has_more_above(const char *str)
{
        const char *cursor = str;

        while (*cursor != '\0') {
                switch (_unichar_combining_class(utf_char(cursor))) {
                case 230:
                        return true;
                case 0:
                        return false;
                default:
                        break;
                }

                cursor = utf_next(cursor);
        }

        return false;
}
742
+
743
+ static inline size_t
744
+ real_do_tolower(unichar c, int type, char *buf)
745
+ {
746
+ unichar tv = ATTTABLE(c >> 8, c & 0xff);
747
+
748
+ if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
749
+ return output_special_case(buf,
750
+ tv - UNICODE_SPECIAL_CASE_TABLE_START,
751
+ type, false);
752
+
753
+ /* TODO: this should really use titlecase_table_lookup somehow. */
754
+ if (type == UNICODE_TITLECASE_LETTER)
755
+ for (size_t i = 0; i < lengthof(title_table); i++)
756
+ if (title_table[i][0] == c)
757
+ return unichar_to_utf(title_table[i][2], buf);
758
+
759
+ return unichar_to_utf(tv != '\0' ? tv : c, buf);
760
+ }
761
+
762
+ /* {{{1
763
+ * The real implementation of downcase.
764
+ *
765
+ * TODO: this needs a cleanup.
766
+ */
767
+ static size_t
768
+ real_tolower_one(const char **p, const char *prev, char *buf,
769
+ LocaleType locale_type, const char *end, bool use_end)
770
+ {
771
+ unichar c = utf_char(prev);
772
+ int type = s_type(c);
773
+
774
+ if (locale_type == LOCALE_TURKIC && c == 'I') {
775
+ if (utf_char(*p) == COMBINING_DOT_ABOVE) {
776
+ /* TODO: don’t we need to make sure we don’t go beyond the end
777
+ * of ‘p’? */
778
+ *p = utf_next(*p);
779
+ return unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
780
+ }
781
+
782
+ return unichar_to_utf(LATIN_SMALL_LETTER_DOTLESS_I, buf);
783
+ }
784
+
785
+ if (locale_type == LOCALE_LITHUANIAN &&
786
+ (c == LATIN_CAPITAL_LETTER_I_WITH_GRAVE ||
787
+ c == LATIN_CAPITAL_LETTER_I_WITH_ACUTE ||
788
+ c == LATIN_CAPITAL_LETTER_I_WITH_TILDE)) {
789
+ /* Introduce an explicit dot above the lowercasing capital I's
790
+ * and J's whenever there are more accents above.
791
+ * [SpecialCasing.txt] */
792
+ size_t len = unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
793
+ len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
794
+ switch (c) {
795
+ case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
796
+ len += unichar_to_utf(COMBINING_GRAVE_ACCENT,
797
+ OFFSET_IF(buf, len));
798
+ break;
799
+ case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
800
+ len += unichar_to_utf(COMBINING_ACUTE_ACCENT,
801
+ OFFSET_IF(buf, len));
802
+ break;
803
+ case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
804
+ len += unichar_to_utf(COMBINING_TILDE,
805
+ OFFSET_IF(buf, len));
806
+ break;
807
+ }
808
+
809
+ return len;
810
+ }
811
+
812
+ if (locale_type == LOCALE_LITHUANIAN &&
813
+ (c == 'I' || c == 'J' || c == LATIN_CAPITAL_LETTER_I_WITH_OGONEK) &&
814
+ has_more_above(*p)) {
815
+ size_t len = unichar_to_utf(unichar_tolower(c), buf);
816
+ return len + unichar_to_utf(COMBINING_DOT_ABOVE,
817
+ OFFSET_IF(buf, len));
818
+ }
819
+
820
+ if (c == GREEK_CAPITAL_LETTER_SIGMA) {
821
+ unichar tv = GREEK_SMALL_LETTER_FINAL_SIGMA;
822
+
823
+ if ((!use_end || *p < end) && **p != '\0') {
824
+ unichar next_c = utf_char(*p);
825
+ int next_type = s_type(next_c);
826
+
827
+ /* SIGMA maps differently depending on whether it is
828
+ * final or not. The following simplified test would
829
+ * fail in the case of combining marks following the
830
+ * sigma, but I don't think that occurs in real text.
831
+ * The test here matches that in ICU. */
832
+ if (s_isalpha(next_type))
833
+ tv = GREEK_SMALL_LETTER_SIGMA;
834
+ }
835
+
836
+ return unichar_to_utf(tv, buf);
837
+ }
838
+
839
+ if (IS(type, OR(UNICODE_UPPERCASE_LETTER,
840
+ OR(UNICODE_TITLECASE_LETTER, 0))))
841
+ return real_do_tolower(c, type, buf);
842
+
843
+ size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
844
+
845
+ if (buf != NULL)
846
+ memcpy(buf, prev, len);
847
+
848
+ return len;
849
+ }
850
+
851
+ static size_t
852
+ real_tolower(const char *str, size_t max, bool use_max, char *buf,
853
+ LocaleType locale_type)
854
+ {
855
+ const char *p = str;
856
+ const char *end = str + max;
857
+ size_t len = 0;
858
+
859
+ while ((!use_max || p < end) && *p != '\0') {
860
+ const char *prev = p;
861
+ p = utf_next(p);
862
+
863
+ len += real_tolower_one(&p, prev, OFFSET_IF(buf, len),
864
+ locale_type, end, use_max);
865
+ }
866
+
867
+ return len;
868
+ }
869
+
870
+
871
+ /* {{{1 */
872
+ static char *
873
+ utf_downcase_impl(const char *str, size_t max, bool use_max)
874
+ {
875
+ assert(str != NULL);
876
+
877
+ LocaleType locale_type = get_locale_type();
878
+
879
+ size_t len = real_tolower(str, max, use_max, NULL, locale_type);
880
+ char *result = ALLOC_N(char, len + 1);
881
+ real_tolower(str, max, use_max, result, locale_type);
882
+ result[len] = NUL;
883
+
884
+ return result;
885
+ }
886
+
887
+
888
/* {{{1
 * Return a freshly allocated copy of ‘str’ in which every character that has
 * a lowercase representation has been replaced by it.
 */
char *
utf_downcase(const char *str)
{
	/* No length limit is applied, so the length argument is a dummy. */
	const bool with_length = false;

	return utf_downcase_impl(str, 0, with_length);
}
897
+
898
+
899
/* {{{1
 * Return a freshly allocated copy of ‘str’ in which every character that has
 * a lowercase representation has been replaced by it.  At most ‘len’ bytes
 * of ‘str’ are converted.
 */
char *
utf_downcase_n(const char *str, size_t len)
{
	const bool with_length = true;

	return utf_downcase_impl(str, len, with_length);
}
909
+
910
+
911
+ /* {{{1
912
+ * The real implementation of case folding below.
913
+ */
914
+
915
+ static bool
916
+ casefold_table_lookup(unichar c, char *folded, size_t *len)
917
+ {
918
+ int begin = 0;
919
+ int end = lengthof(casefold_table);
920
+
921
+ if (c < casefold_table[begin].ch || c > casefold_table[end - 1].ch)
922
+ return false;
923
+
924
+ while (true) {
925
+ int mid = (begin + end) / 2;
926
+
927
+ if (c == casefold_table[mid].ch) {
928
+ if (folded != NULL)
929
+ strcpy(folded, casefold_table[mid].data);
930
+ *len += utf_byte_length(casefold_table[mid].data);
931
+ return true;
932
+ } else if (mid == begin) {
933
+ return false;
934
+ } else if (c > casefold_table[mid].ch) {
935
+ begin = mid;
936
+ } else {
937
+ end = mid;
938
+ }
939
+ }
940
+ }
941
+
942
+ static char *
943
+ utf_foldcase_impl(const char *str, size_t max, bool use_max)
944
+ {
945
+ assert(str != NULL);
946
+
947
+ char *folded = NULL;
948
+ size_t len = 0;
949
+
950
+ again:
951
+ for (const char *p = str; (!use_max || p < str + max) && *p != '\0'; p = utf_next(p)) {
952
+ unichar c = utf_char(p);
953
+
954
+ if (casefold_table_lookup(c, OFFSET_IF(folded, len), &len))
955
+ continue;
956
+
957
+ len += unichar_to_utf(unichar_tolower(c), OFFSET_IF(folded, len));
958
+ }
959
+
960
+ if (folded == NULL) {
961
+ folded = ALLOC_N(char, len + 1);
962
+ folded[0] = NUL;
963
+ len = 0;
964
+ goto again;
965
+ }
966
+
967
+ folded[len] = '\0';
968
+
969
+ return folded;
970
+ }
971
+
972
+
973
/* {{{1
 * Return a freshly allocated, case-independent form of ‘str’.
 */
char *
utf_foldcase(const char *str)
{
	/* No length limit is applied, so the length argument is a dummy. */
	const bool with_length = false;

	return utf_foldcase_impl(str, 0, with_length);
}
982
+
983
+
984
/* {{{1
 * Return a freshly allocated, case-independent form of ‘str’, converting at
 * most ‘len’ bytes of the string.
 */
char *
utf_foldcase_n(const char *str, size_t len)
{
	const bool with_length = true;

	return utf_foldcase_impl(str, len, with_length);
}
994
+
995
+
996
+ /* {{{1
997
+ * The real implementation of utf_width() and utf_width_n() below.
998
+ */
999
+ static size_t
1000
+ utf_width_impl(const char *str, size_t len, bool use_len)
1001
+ {
1002
+ assert(str != NULL);
1003
+
1004
+ size_t width = 0;
1005
+
1006
+ for (const char *p = str; (!use_len || p < str + len) && *p != NUL; p = utf_next(p))
1007
+ width += unichar_iswide(utf_char(p)) ? 2 : 1;
1008
+
1009
+ return width;
1010
+ }
1011
+
1012
+
1013
/* {{{1
 * Return the width of ‘str’, measured in cells.
 */
size_t
utf_width(const char *str)
{
	/* No length limit is applied, so the length argument is a dummy. */
	const bool with_length = false;

	return utf_width_impl(str, 0, with_length);
}
1021
+
1022
+
1023
/* {{{1
 * Return the width, measured in cells, of ‘str’, whose length is ‘len’.
 */
size_t
utf_width_n(const char *str, size_t len)
{
	const bool with_length = true;

	return utf_width_impl(str, len, with_length);
}
1031
+
1032
+
1033
+ /* {{{1
1034
+ * Retrieve the mirrored representation of ‘c’ (if any) and store it in
1035
+ * ‘mirrored’.
1036
+ */
1037
+ bool
1038
+ unichar_mirror(unichar c, unichar *mirrored)
1039
+ {
1040
+ int begin = 0;
1041
+ int end = lengthof(bidi_mirroring_table);
1042
+
1043
+ while (true) {
1044
+ int mid = (begin + end) / 2;
1045
+
1046
+ if (c == bidi_mirroring_table[mid].ch) {
1047
+ if (mirrored != NULL)
1048
+ *mirrored = bidi_mirroring_table[mid].mirrored_ch;
1049
+ return true;
1050
+ } else if (mid == begin) {
1051
+ return false;
1052
+ } else if (c > bidi_mirroring_table[mid].ch) {
1053
+ begin = mid;
1054
+ } else {
1055
+ end = mid;
1056
+ }
1057
+ }
1058
+ }
1059
+
1060
+
1061
+ /* }}}1 */