character-encodings 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,64 @@
1
+ break.o: break.c unicode.h data/break.h
2
+ decompose.o: decompose.c unicode.h private.h data/decompose.h \
3
+ data/compose.h
4
+ properties.o: properties.c unicode.h private.h data/character-tables.h
5
+ rb_utf_aref.o: rb_utf_aref.c rb_includes.h unicode.h private.h \
6
+ rb_methods.h
7
+ rb_utf_aset.o: rb_utf_aset.c rb_includes.h unicode.h private.h \
8
+ rb_methods.h
9
+ rb_utf_casecmp.o: rb_utf_casecmp.c rb_includes.h unicode.h private.h \
10
+ rb_methods.h
11
+ rb_utf_chomp.o: rb_utf_chomp.c rb_includes.h unicode.h private.h \
12
+ rb_methods.h
13
+ rb_utf_chop.o: rb_utf_chop.c rb_includes.h unicode.h private.h \
14
+ rb_methods.h
15
+ rb_utf_collate.o: rb_utf_collate.c rb_includes.h unicode.h private.h \
16
+ rb_methods.h
17
+ rb_utf_count.o: rb_utf_count.c rb_includes.h unicode.h private.h \
18
+ rb_methods.h rb_utf_internal_tr.h
19
+ rb_utf_delete.o: rb_utf_delete.c rb_includes.h unicode.h private.h \
20
+ rb_methods.h rb_utf_internal_tr.h
21
+ rb_utf_downcase.o: rb_utf_downcase.c rb_includes.h unicode.h private.h \
22
+ rb_methods.h
23
+ rb_utf_each_char.o: rb_utf_each_char.c rb_includes.h unicode.h private.h \
24
+ rb_methods.h
25
+ rb_utf_foldcase.o: rb_utf_foldcase.c rb_includes.h unicode.h private.h \
26
+ rb_methods.h
27
+ rb_utf_hex.o: rb_utf_hex.c rb_includes.h unicode.h private.h rb_methods.h \
28
+ rb_utf_internal_bignum.h
29
+ rb_utf_index.o: rb_utf_index.c rb_includes.h unicode.h private.h \
30
+ rb_methods.h
31
+ rb_utf_insert.o: rb_utf_insert.c rb_includes.h unicode.h private.h \
32
+ rb_methods.h
33
+ rb_utf_internal_bignum.o: rb_utf_internal_bignum.c rb_includes.h \
34
+ unicode.h private.h rb_methods.h rb_utf_internal_bignum.h
35
+ rb_utf_internal_tr.o: rb_utf_internal_tr.c rb_includes.h unicode.h \
36
+ private.h rb_methods.h rb_utf_internal_tr.h
37
+ rb_utf_justify.o: rb_utf_justify.c rb_includes.h unicode.h private.h \
38
+ rb_methods.h
39
+ rb_utf_length.o: rb_utf_length.c rb_includes.h unicode.h private.h \
40
+ rb_methods.h
41
+ rb_utf_lstrip.o: rb_utf_lstrip.c rb_includes.h unicode.h private.h \
42
+ rb_methods.h
43
+ rb_utf_normalize.o: rb_utf_normalize.c rb_includes.h unicode.h private.h \
44
+ rb_methods.h
45
+ rb_utf_oct.o: rb_utf_oct.c rb_includes.h unicode.h private.h rb_methods.h \
46
+ rb_utf_internal_bignum.h
47
+ rb_utf_reverse.o: rb_utf_reverse.c rb_includes.h unicode.h private.h \
48
+ rb_methods.h
49
+ rb_utf_rindex.o: rb_utf_rindex.c rb_includes.h unicode.h private.h \
50
+ rb_methods.h
51
+ rb_utf_rstrip.o: rb_utf_rstrip.c rb_includes.h unicode.h private.h \
52
+ rb_methods.h
53
+ rb_utf_squeeze.o: rb_utf_squeeze.c rb_includes.h unicode.h private.h \
54
+ rb_methods.h rb_utf_internal_tr.h
55
+ rb_utf_strip.o: rb_utf_strip.c rb_includes.h unicode.h private.h \
56
+ rb_methods.h
57
+ rb_utf_to_i.o: rb_utf_to_i.c rb_includes.h unicode.h private.h \
58
+ rb_methods.h rb_utf_internal_bignum.h
59
+ rb_utf_tr.o: rb_utf_tr.c rb_includes.h unicode.h private.h rb_methods.h \
60
+ rb_utf_internal_tr.h
61
+ rb_utf_upcase.o: rb_utf_upcase.c rb_includes.h unicode.h private.h \
62
+ rb_methods.h
63
+ unicode.o: unicode.c unicode.h private.h rb_methods.h
64
+ utf.o: utf.c unicode.h private.h
@@ -0,0 +1,47 @@
1
+ require 'mkmf'
2
+
3
# Probe whether the compiler accepts +opt+ by test-compiling an empty source
# with it.  When the probe succeeds the option is appended to $CFLAGS.  The
# result of the probe (true/false) is reported through checking_for.
def try_compiler_option(opt, &b)
  checking_for "‘#{opt}’ option to compiler" do
    accepted = try_compile('', opt, &b) ? true : false
    $CFLAGS += " #{opt}" if accepted
    accepted
  end
end
13
+
14
+ try_compiler_option('-std=c99')
15
+ try_compiler_option('-Wall')
16
+ try_compiler_option('-Wextra')
17
+ try_compiler_option('-Wwrite-strings')
18
+ try_compiler_option('-Waggregate-return')
19
+ try_compiler_option('-Wmissing-prototypes')
20
+ try_compiler_option('-Wmissing-declarations')
21
+ try_compiler_option('-Wnested-externs')
22
+ try_compiler_option('-Wundef')
23
+ try_compiler_option('-Wpointer-arith')
24
+ try_compiler_option('-Wcast-align')
25
+ try_compiler_option('-Werror')
26
+ # XXX: sadly, -Wshadow is a bit too strict. It will, for example, whine about
27
+ # local variables called “index” on FreeBSD.
28
+ # try_compiler_option('-Wshadow')
29
+ # XXX: This is also too strict.
30
+ # try_compiler_option('-Wconversion')
31
+
32
+ have_header('assert.h')
33
+ have_header('limits.h')
34
+ have_header('locale.h')
35
+ have_header('stdbool.h')
36
+ have_header('stddef.h')
37
+ have_header('stdint.h')
38
+ have_header('stdio.h')
39
+ have_header('stdlib.h')
40
+ have_header('string.h')
41
+ have_header('sys/types.h')
42
+ have_header('wchar.h')
43
+
44
+ $INSTALLFILES ||= []
45
+ $INSTALLFILES << ['unicode.h', '$(RUBYARCHDIR)', 'lib']
46
+
47
+ create_makefile('encoding/character/utf-8/utf8')
@@ -0,0 +1,68 @@
1
+ /*
2
+ * contents: Private Unicode related information.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+ #ifndef PRIVATE_H
8
+ #define PRIVATE_H
9
+
10
+ #define NUL '\0'
11
+ #define lengthof(ary) (sizeof(ary) / sizeof((ary)[0]))
12
+
13
/*
 * Compiler-specific declaration helpers.
 *
 * UNUSED(u) wraps a declarator ‘u’ that is intentionally left unused.
 * HIDDEN is appended to a declaration (‘... HIDDEN;’) to give the symbol
 * hidden visibility.  On compilers that do not define __GNUC__ both reduce
 * to plain declarations.
 */
#if defined(__GNUC__)
#  define UNUSED(u) \
        u __attribute__((__unused__))
#  define HIDDEN \
        __attribute__((visibility("hidden")))
#else
#  define UNUSED(u) \
        u
/* HIDDEN has to be object-like in this branch as well: every use in this
 * header is a bare trailing ‘HIDDEN;’, and a function-like macro name that is
 * not followed by ‘(’ is never expanded, which would leave a stray ‘HIDDEN’
 * token in each declaration. */
#  define HIDDEN
#endif
23
+
24
+ unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
25
+ NormalizeMode mode) HIDDEN;
26
+ inline int _unichar_combining_class(unichar c) HIDDEN;
27
+
28
+ void need_at_least_n_arguments(int argc, int n) HIDDEN;
29
+
30
+ unichar _utf_char_validated(char const *const str,
31
+ char const *const str_end) HIDDEN;
32
+ char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
33
+ const char *limit, bool noisy) HIDDEN;
34
+
35
+ char *_utf_offset_to_pointer_validated(const char *str, long offset,
36
+ const char *end) HIDDEN;
37
+
38
+ char *_utf_offset_to_pointer_failable(const char *str, long offset,
39
+ const char *end) HIDDEN;
40
+
41
+ VALUE rb_utf_new(const char *str, long len) HIDDEN;
42
+
43
+ VALUE rb_utf_new2(const char *str) HIDDEN;
44
+
45
+ VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
46
+
47
+ VALUE rb_utf_alloc_using(char *str) HIDDEN;
48
+
49
+ VALUE rb_utf_dup(VALUE str) HIDDEN;
50
+
51
+ long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
52
+
53
+ bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
54
+ char **limit) HIDDEN;
55
+
56
+ void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
57
+ char **limit) HIDDEN;
58
+
59
+ char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
60
+
61
+ VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
62
+
63
+ char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
64
+
65
+ long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
66
+ long offset, bool reverse) HIDDEN;
67
+
68
+ #endif /* PRIVATE_H */
@@ -0,0 +1,1061 @@
1
+ /*
2
+ * contents: Unicode character properties.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+ #include <ruby.h>
8
+ #include <assert.h>
9
+ #include <locale.h>
10
+ #include <stdbool.h>
11
+ #include <stddef.h>
12
+ #include <stdint.h>
13
+ #include <string.h>
14
+ #include "unicode.h"
15
+ #include "private.h"
16
+ #include "data/character-tables.h"
17
+
18
+
19
+ #define COMBINING_DOT_ABOVE ((unichar)0x0307)
20
+ #define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE ((unichar)0x0130)
21
+ #define COMBINING_GREEK_YPOGEGRAMMENI ((unichar)0x0345)
22
+ #define GREEK_CAPITAL_LETTER_IOTA ((unichar)0x0399)
23
+ #define LATIN_SMALL_LETTER_I ((unichar)0x0069)
24
+ #define LATIN_SMALL_LETTER_DOTLESS_I ((unichar)0x0131)
25
+ #define LATIN_CAPITAL_LETTER_I_WITH_GRAVE ((unichar)0x00cc)
26
+ #define LATIN_CAPITAL_LETTER_I_WITH_ACUTE ((unichar)0x00cd)
27
+ #define LATIN_CAPITAL_LETTER_I_WITH_TILDE ((unichar)0x0128)
28
+ #define LATIN_CAPITAL_LETTER_I_WITH_OGONEK ((unichar)0x012e)
29
+ #define COMBINING_GRAVE_ACCENT ((unichar)0x0300)
30
+ #define COMBINING_ACUTE_ACCENT ((unichar)0x0301)
31
+ #define COMBINING_TILDE ((unichar)0x0303)
32
+ #define GREEK_CAPITAL_LETTER_SIGMA ((unichar)0x03a3)
33
+ #define GREEK_SMALL_LETTER_SIGMA ((unichar)0x03c3)
34
+ #define GREEK_SMALL_LETTER_FINAL_SIGMA ((unichar)0x03c2)
35
+
36
+ #define OFFSET_IF(buf, len) (((buf) != NULL) ? (buf) + (len) : NULL)
37
+
38
/* {{{1
 * Macros for accessing the Unicode character attribute table.
 *
 * ATTR_TABLE(page) reads the per-page entry from attr_table_part1, or from
 * attr_table_part2 (index rebased by 0xe00) for pages beyond
 * UNICODE_LAST_PAGE_PART1.  ATTTABLE(page, ch) yields 0 when that entry is
 * UNICODE_MAX_TABLE_INDEX and otherwise the attr_data value stored for
 * offset ‘ch’ of the page.
 *
 * TODO: Turn these macros into full-fledged functions, as this is rather silly
 * when we have ‹inline› in C99.
 */
#define ATTR_TABLE(page) \
        (((page) <= UNICODE_LAST_PAGE_PART1) \
         ? attr_table_part1[page] \
         : attr_table_part2[(page) - 0xe00])

#define ATTTABLE(page, ch) \
        ((ATTR_TABLE(page) == UNICODE_MAX_TABLE_INDEX) \
         ? 0 : (attr_data[ATTR_TABLE(page)][ch]))
52
+
53
+
54
+ /* {{{1
55
+ * Internal function used for figuring out the type of a given character.
56
+ */
57
+ static inline int
58
+ s_type(unichar c)
59
+ {
60
+ const int16_t *table;
61
+ unsigned int page;
62
+
63
+ if (c <= UNICODE_LAST_CHAR_PART1) {
64
+ page = c >> 8;
65
+ table = type_table_part1;
66
+ } else if (c >= 0xe0000 && c <= UNICODE_LAST_CHAR) {
67
+ page = (c - 0xe0000) >> 8;
68
+ table = type_table_part2;
69
+ } else {
70
+ return UNICODE_UNASSIGNED;
71
+ }
72
+
73
+ if (table[page] >= UNICODE_MAX_TABLE_INDEX)
74
+ return table[page] - UNICODE_MAX_TABLE_INDEX;
75
+ else
76
+ return type_data[table[page]][c & 0xff];
77
+ }
78
+
79
+
80
/* {{{1
 * Bit-fiddling macros for testing the class of a type.
 *
 * OR(type, rest) adds the bit numbered ‘type’ to the mask ‘rest’;
 * IS(type, mask) is non-zero exactly when the bit numbered ‘type’ is set in
 * ‘mask’.
 */
#define IS(type, mask)  ((mask) & ((unsigned int)1 << (type)))
#define OR(type, rest)  ((rest) | ((unsigned int)1 << (type)))
85
+
86
+
87
+ /* {{{1
88
+ * Internal function used to check if the given type represents a digit type.
89
+ */
90
+ static inline bool
91
+ s_isdigit(int type)
92
+ {
93
+ return IS(type,
94
+ OR(UNICODE_DECIMAL_NUMBER,
95
+ OR(UNICODE_LETTER_NUMBER,
96
+ OR(UNICODE_OTHER_NUMBER, 0))));
97
+ }
98
+
99
+
100
+ /* {{{1
101
+ * Internal function used to check if the given type represents an alphabetic
102
+ * type.
103
+ */
104
+ static inline bool
105
+ s_isalpha(int type)
106
+ {
107
+ return IS(type,
108
+ OR(UNICODE_LOWERCASE_LETTER,
109
+ OR(UNICODE_UPPERCASE_LETTER,
110
+ OR(UNICODE_TITLECASE_LETTER,
111
+ OR(UNICODE_MODIFIER_LETTER,
112
+ OR(UNICODE_OTHER_LETTER, 0))))));
113
+ }
114
+
115
+
116
+ /* {{{1
117
+ * Internal function used to check if the given type represents a mark type.
118
+ */
119
+ static inline bool
120
+ s_ismark(int type)
121
+ {
122
+ return IS(type,
123
+ OR(UNICODE_NON_SPACING_MARK,
124
+ OR(UNICODE_COMBINING_MARK,
125
+ OR(UNICODE_ENCLOSING_MARK, 0))));
126
+ }
127
+
128
+
129
+ /* {{{1
130
+ * Determine whether ‘c’ is an alphanumeric, such as A, B, C, 0, 1, or 2.
131
+ */
132
+ bool
133
+ unichar_isalnum(unichar c)
134
+ {
135
+ int type = s_type(c);
136
+
137
+ return s_isdigit(type) || s_isalpha(type);
138
+ }
139
+
140
+
141
+ /* {{{1
142
+ * Determine whether ‘c’ is an alphabetic (i.e. a letter), such as A, B, or C.
143
+ */
144
+ bool
145
+ unichar_isalpha(unichar c)
146
+ {
147
+ return s_isalpha(s_type(c));
148
+ }
149
+
150
+
151
+ /* {{{1
152
+ * Determine whether ‘c’ is a control character, such as ‹NUL›.
153
+ */
154
+ bool
155
+ unichar_iscntrl(unichar c)
156
+ {
157
+ return s_type(c) == UNICODE_CONTROL;
158
+ }
159
+
160
+
161
+ /* {{{1
162
+ * Determine whether ‘c’ is a digit, such as 0, 1, or 2.
163
+ */
164
+ bool
165
+ unichar_isdigit(unichar c)
166
+ {
167
+ return s_type(c) == UNICODE_DECIMAL_NUMBER;
168
+ }
169
+
170
+
171
+ /* {{{1
172
+ * Determine whether ‘c’ is printable and not a space or control character such
173
+ * as tab or <NUL›, such as A, B, or C.
174
+ */
175
+ bool
176
+ unichar_isgraph(unichar c)
177
+ {
178
+ return !IS(s_type(c),
179
+ OR(UNICODE_CONTROL,
180
+ OR(UNICODE_FORMAT,
181
+ OR(UNICODE_UNASSIGNED,
182
+ OR(UNICODE_PRIVATE_USE,
183
+ OR(UNICODE_SURROGATE,
184
+ OR(UNICODE_SPACE_SEPARATOR, 0)))))));
185
+ }
186
+
187
+
188
+ /* {{{1
189
+ * Determine whether ‘c’ is a lowercase letter, such as a, b, or c.
190
+ */
191
+ bool
192
+ unichar_islower(unichar c)
193
+ {
194
+ return s_type(c) == UNICODE_LOWERCASE_LETTER;
195
+ }
196
+
197
+
198
+ /* {{{1
199
+ * Determine whether ‘c’ is printable, which works the same as
200
+ * unichar_isgraph(), except that space characters are also printable.
201
+ */
202
+ bool
203
+ unichar_isprint(unichar c)
204
+ {
205
+ return !IS(s_type(c),
206
+ OR(UNICODE_CONTROL,
207
+ OR(UNICODE_FORMAT,
208
+ OR(UNICODE_UNASSIGNED,
209
+ OR(UNICODE_PRIVATE_USE,
210
+ OR(UNICODE_SURROGATE, 0))))));
211
+ }
212
+
213
+
214
+ /* {{{1
215
+ * Determine whether ‘c’ is some form of punctuation or other symbol.
216
+ */
217
+ bool
218
+ unichar_ispunct(unichar c)
219
+ {
220
+ return IS(s_type(c),
221
+ OR(UNICODE_CONNECT_PUNCTUATION,
222
+ OR(UNICODE_DASH_PUNCTUATION,
223
+ OR(UNICODE_OPEN_PUNCTUATION,
224
+ OR(UNICODE_CLOSE_PUNCTUATION,
225
+ OR(UNICODE_INITIAL_PUNCTUATION,
226
+ OR(UNICODE_FINAL_PUNCTUATION,
227
+ OR(UNICODE_OTHER_PUNCTUATION,
228
+ OR(UNICODE_MODIFIER_SYMBOL,
229
+ OR(UNICODE_MATH_SYMBOL,
230
+ OR(UNICODE_CURRENCY_SYMBOL,
231
+ OR(UNICODE_OTHER_SYMBOL, 0)))))))))))) ? true : false;
232
+ }
233
+
234
+
235
+ /* {{{1
236
+ * Determine whether ‘c’ is some form of whitespace, such as space, tab or a
237
+ * line separator (newline, carriage return, etc.).
238
+ */
239
+ bool
240
+ unichar_isspace(unichar c)
241
+ {
242
+ switch (c) {
243
+ case '\t':
244
+ case '\n':
245
+ case '\r':
246
+ case '\f':
247
+ return true;
248
+ default:
249
+ return IS(s_type(c),
250
+ OR(UNICODE_SPACE_SEPARATOR,
251
+ OR(UNICODE_LINE_SEPARATOR,
252
+ OR(UNICODE_PARAGRAPH_SEPARATOR, 0)))) ? true : false;
253
+ }
254
+ }
255
+
256
+
257
+ /* {{{1
258
+ * Determine whether ‘c’ is an uppeercase letter, such as A, B, or C
259
+ */
260
+ bool
261
+ unichar_isupper(unichar c)
262
+ {
263
+ return s_type(c) == UNICODE_UPPERCASE_LETTER;
264
+ }
265
+
266
+
267
+ /* {{{1
268
+ * Determine whether ‘c’ is a titlecase letter, such as the slavic digraph DZ,
269
+ * which at the beginning of a word is written as Dz, where only the initial D
270
+ * is capitalized. (Complicated huh?)
271
+ */
272
+ bool
273
+ unichar_istitle(unichar c)
274
+ {
275
+ /* TODO: binary search helpful? */
276
+ for (size_t i = 0; i < lengthof(title_table); i++)
277
+ if (title_table[i][0] == c)
278
+ return true;
279
+
280
+ return false;
281
+ }
282
+
283
+
284
+ /* {{{1
285
+ * Determine whether ‘c’ is a new-line.
286
+ */
287
+ #define UNICHAR_NEXT_LINE ((unichar)0x0085)
288
+ #define UNICHAR_LINE_SEPARATOR ((unichar)0x2028)
289
+ #define UNICHAR_PARAGRAPH_SEPARATOR ((unichar)0x2029)
290
+
291
+ bool
292
+ unichar_isnewline(unichar c)
293
+ {
294
+ switch (c) {
295
+ case '\n': case '\f': case '\r': case UNICHAR_NEXT_LINE:
296
+ case UNICHAR_LINE_SEPARATOR: case UNICHAR_PARAGRAPH_SEPARATOR:
297
+ return true;
298
+ default:
299
+ return false;
300
+ }
301
+ }
302
+
303
+ /* {{{1
304
+ * Determine whether ‘c’ is a hexadecimal digit, such as 0, 1, ..., 9, a, b,
305
+ * ..., f, or A, B, ..., F.
306
+ */
307
+ #define UNICHAR_FULLWIDTH_A 0xff21
308
+ #define UNICHAR_FULLWIDTH_F 0xff26
309
+ #define UNICHAR_FULLWIDTH_a 0xff41
310
+ #define UNICHAR_FULLWIDTH_f 0xff46
311
+ bool
312
+ unichar_isxdigit(unichar c)
313
+ {
314
+ return ((c >= 'a' && c <= 'f') ||
315
+ (c >= 'A' && c <= 'F') ||
316
+ (c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f) ||
317
+ (c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F) ||
318
+ (s_type(c) == UNICODE_DECIMAL_NUMBER));
319
+ // s_isdigit(s_type(c)));
320
+ }
321
+
322
+
323
+ /* {{{1
324
+ * Determine whether code point ‘c’ has been assigned a code value.
325
+ */
326
+ bool
327
+ unichar_isassigned(unichar c)
328
+ {
329
+ return s_type(c) != UNICODE_UNASSIGNED;
330
+ }
331
+
332
+
333
+ /* {{{1
334
+ * Determine whether ‘c’ is a wide character, thus is typically rendered in a
335
+ * double-width cell on a terminal.
336
+ */
337
+ bool
338
+ unichar_iswide(unichar c)
339
+ {
340
+ if (c < 0x1100)
341
+ return false;
342
+
343
+ return (c <= 0x115f || /* Hangul Jamo init. consonants */
344
+ c == 0x2329 || c == 0x232a || /* angle brackets */
345
+ (c >= 0x2e80 && c <= 0xa4cf && /* CJK ... Yi */
346
+ (c < 0x302a || c > 0x302f) &&
347
+ c != 0x303f && c != 0x3099 && c != 0x309a) ||
348
+ (c >= 0xac00 && c <= 0xd7a3) || /* Hangul syllables */
349
+ (c >= 0xf900 && c <= 0xfaff) || /* CJK comp. graphs */
350
+ (c >= 0xfe30 && c <= 0xfe6f) || /* CJK comp. forms */
351
+ (c >= 0xff00 && c <= 0xff60) || /* fullwidth forms */
352
+ (c >= 0xffe0 && c <= 0xffe6) || /* -"- */
353
+ (c >= 0x20000 && c <= 0x2fffd) || /* CJK extra stuff */
354
+ (c >= 0x30000 && c <= 0x3fffd)); /* -"- */
355
+ }
356
+
357
+
358
+ /* {{{1
359
+ * Convert ‘c’ to its uppercase representation (if any).
360
+ */
361
+ static unichar
362
+ special_case_table_lookup(unichar c)
363
+ {
364
+ unichar tv = ATTTABLE(c >> 8, c & 0xff);
365
+
366
+ if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
367
+ return utf_char(special_case_table +
368
+ tv - UNICODE_SPECIAL_CASE_TABLE_START);
369
+
370
+ if (tv == '\0')
371
+ return c;
372
+
373
+ return tv;
374
+ }
375
+
376
+ static unichar
377
+ titlecase_table_lookup(unichar c, bool want_upper)
378
+ {
379
+ for (size_t i = 0; i < lengthof(title_table); i++)
380
+ if (title_table[i][0] == c)
381
+ return title_table[i][want_upper ? 1 : 2];
382
+
383
+ return c;
384
+ }
385
+
386
+ unichar
387
+ unichar_toupper(unichar c)
388
+ {
389
+ int type = s_type(c);
390
+
391
+ if (type == UNICODE_LOWERCASE_LETTER)
392
+ return special_case_table_lookup(c);
393
+
394
+ if (type == UNICODE_TITLECASE_LETTER)
395
+ return titlecase_table_lookup(c, true);
396
+
397
+ return c;
398
+ }
399
+
400
+
401
+ /* {{{1
402
+ * Convert ‘c’ to its lowercase representation (if any).
403
+ */
404
+ unichar
405
+ unichar_tolower(unichar c)
406
+ {
407
+ int type = s_type(c);
408
+
409
+ if (type == UNICODE_UPPERCASE_LETTER)
410
+ return special_case_table_lookup(c);
411
+
412
+ if (type == UNICODE_TITLECASE_LETTER)
413
+ return titlecase_table_lookup(c, false);
414
+
415
+ return c;
416
+ }
417
+
418
+
419
+ /* {{{1
420
+ * Convert ‘c’ to its titlecase representation (if any).
421
+ */
422
+ unichar
423
+ unichar_totitle(unichar c)
424
+ {
425
+ for (size_t i = 0; i < lengthof(title_table); i++)
426
+ if (title_table[i][0] == c ||
427
+ title_table[i][1] == c ||
428
+ title_table[i][2] == c)
429
+ return title_table[i][0];
430
+
431
+ if (s_type(c) == UNICODE_LOWERCASE_LETTER)
432
+ return ATTTABLE(c >> 8, c & 0xff);
433
+
434
+ return c;
435
+ }
436
+
437
+
438
+ /* {{{1
439
+ * Return the numeric value of ‘c’ if it's a decimal digit, or -1 if not.
440
+ */
441
+ int
442
+ unichar_digit_value(unichar c)
443
+ {
444
+ if (s_type(c) == UNICODE_DECIMAL_NUMBER)
445
+ return ATTTABLE(c >> 8, c & 0xff);
446
+
447
+ return -1;
448
+ }
449
+
450
+
451
+ /* {{{1
452
+ * Return the numeric value of ‘c’ if it's a hexadecimal digit, or -1 if not.
453
+ */
454
+ int
455
+ unichar_xdigit_value(unichar c)
456
+ {
457
+ if (c >= 'a' && c <= 'f')
458
+ return c - 'a' + 10;
459
+ else if (c >= 'A' && c <= 'F')
460
+ return c - 'A' + 10;
461
+ else if (c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f)
462
+ return c - UNICHAR_FULLWIDTH_a + 10;
463
+ else if (c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F)
464
+ return c - UNICHAR_FULLWIDTH_A + 10;
465
+ else
466
+ return unichar_digit_value(c);
467
+ }
468
+
469
+
470
+ /* {{{1
471
+ * Determine the Unicode character type of ‘c’.
472
+ */
473
+ UnicodeType
474
+ unichar_type(unichar c)
475
+ {
476
+ return s_type(c);
477
+ }
478
+
479
+
480
/* {{{1
 * LocaleType: This ‹enum› is used for dealing with different locales for
 * turning strings into uppercase or lowercase.
 */
typedef enum {
        LOCALE_NORMAL,
        LOCALE_TURKIC,
        LOCALE_LITHUANIAN
} LocaleType;


/* {{{1
 * Retrieve the locale type from the current LC_CTYPE locale.
 *
 * The locale name is queried with setlocale(LC_CTYPE, NULL) and classified
 * by its first two characters: ‘az’ and ‘tr’ give LOCALE_TURKIC, ‘lt’ gives
 * LOCALE_LITHUANIAN, anything else LOCALE_NORMAL.
 */
static LocaleType
get_locale_type(void)
{
        const char *locale = setlocale(LC_CTYPE, NULL);

        /* setlocale() is allowed to return NULL; fall back to the default
         * rules rather than dereferencing it. */
        if (locale == NULL)
                return LOCALE_NORMAL;

        if ((locale[0] == 'a' && locale[1] == 'z') ||
            (locale[0] == 't' && locale[1] == 'r'))
                return LOCALE_TURKIC;

        if (locale[0] == 'l' && locale[1] == 't')
                return LOCALE_LITHUANIAN;

        return LOCALE_NORMAL;
}
508
+
509
+
510
+ /* {{{1
511
+ * Put character marks found in ‘p_inout’ into itself. If ‘remove_dot’ is
512
+ * true, remove the dot over an uppercase I for a turkish locale.
513
+ */
514
+ static size_t
515
+ output_marks(const char **p_inout, char *buf, bool remove_dot)
516
+ {
517
+ size_t len = 0;
518
+ const char *p = *p_inout;
519
+
520
+ for ( ; *p != '\0'; p = utf_next(p)) {
521
+ unichar c = utf_char(p);
522
+
523
+ if (!s_ismark(s_type(c)))
524
+ break;
525
+
526
+ if (!remove_dot || c != COMBINING_DOT_ABOVE)
527
+ len += unichar_to_utf(c, (buf != NULL) ? buf + len : NULL);
528
+ }
529
+
530
+ *p_inout = p;
531
+
532
+ return len;
533
+ }
534
+
535
+ /* {{{1
536
+ * Output titlecases where appropriate.
537
+ */
538
+ static size_t
539
+ output_special_case(char *buf, int offset, int type, bool upper)
540
+ {
541
+ const char *p = special_case_table + offset;
542
+
543
+ if (type != UNICODE_TITLECASE_LETTER)
544
+ p = utf_next(p);
545
+
546
+ if (upper)
547
+ p += utf_byte_length(p) + 1;
548
+
549
+ size_t len = utf_byte_length(p);
550
+
551
+ if (buf != NULL)
552
+ memcpy(buf, p, len);
553
+
554
+ return len;
555
+ }
556
+
557
+ /* {{{1
558
+ * Do uppercasing of ‘p’ for Lithuanian locales.
559
+ */
560
+ static size_t
561
+ remove_all_combining_dot_above(unichar c, char *buf)
562
+ {
563
+ size_t decomp_len;
564
+ unichar *decomp = unicode_canonical_decomposition(c, &decomp_len);
565
+
566
+ size_t len = 0;
567
+ for (size_t i = 0; i < decomp_len; i++)
568
+ if (decomp[i] != COMBINING_DOT_ABOVE)
569
+ len += unichar_to_utf(unichar_toupper(decomp[i]),
570
+ OFFSET_IF(buf, len));
571
+
572
+ free(decomp);
573
+
574
+ return len;
575
+ }
576
+
577
+ static size_t
578
+ real_toupper_lithuanian(const char **p, unichar c, int type, char *buf,
579
+ bool *was_i)
580
+ {
581
+ if (c == 'i') {
582
+ *was_i = true;
583
+ return 0;
584
+ }
585
+
586
+ if (*was_i) {
587
+ size_t len = remove_all_combining_dot_above(c, buf);
588
+ return len + output_marks(p, (buf != NULL) ? buf + len : NULL,
589
+ true);
590
+ }
591
+
592
+ if (!s_ismark(type))
593
+ *was_i = false;
594
+
595
+ return 0;
596
+ }
597
+
598
+ /* {{{1
599
+ * Do real upcasing. */
600
+ static inline size_t
601
+ real_do_toupper(unichar c, int type, char *buf)
602
+ {
603
+ bool upper = (type != UNICODE_LOWERCASE_LETTER);
604
+ unichar tv = ATTTABLE(c >> 8, c & 0xff);
605
+
606
+ if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
607
+ return output_special_case(buf,
608
+ tv - UNICODE_SPECIAL_CASE_TABLE_START,
609
+ type, upper);
610
+
611
+ /* TODO: this should really use titlecase_table_lookup somehow. */
612
+ if (type == UNICODE_TITLECASE_LETTER)
613
+ for (size_t i = 0; i < lengthof(title_table); i++)
614
+ if (title_table[i][0] == c)
615
+ return unichar_to_utf(title_table[i][1], buf);
616
+
617
+ return unichar_to_utf(tv != '\0' ? tv : c, buf);
618
+ }
619
+
620
+ /* {{{1
621
+ * Do real uppercasing of ‘str’.
622
+ */
623
+ static size_t
624
+ real_toupper_one(const char **p, const char *prev, char *buf,
625
+ LocaleType locale_type, bool *was_i)
626
+ {
627
+ unichar c = utf_char(prev);
628
+ int type = s_type(c);
629
+
630
+ if (locale_type == LOCALE_LITHUANIAN) {
631
+ size_t len = real_toupper_lithuanian(p, c, type, buf, was_i);
632
+ if (len > 0)
633
+ return len;
634
+ }
635
+
636
+ if (locale_type == LOCALE_TURKIC && c == 'i')
637
+ return unichar_to_utf(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
638
+ buf);
639
+
640
+ if (c == COMBINING_GREEK_YPOGEGRAMMENI) {
641
+ /* Nasty, need to move it after other combining marks...this
642
+ * would go away if we normalized first. */
643
+ /* TODO: don’t we need to make sure we don’t go beyond the end
644
+ * of ‘p’? */
645
+ size_t len = output_marks(p, buf, false);
646
+ return len + unichar_to_utf(GREEK_CAPITAL_LETTER_IOTA,
647
+ OFFSET_IF(buf, len));
648
+ }
649
+
650
+ if (IS(type, OR(UNICODE_LOWERCASE_LETTER,
651
+ OR(UNICODE_TITLECASE_LETTER, 0))))
652
+ return real_do_toupper(c, type, buf);
653
+
654
+ size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
655
+
656
+ if (buf != NULL)
657
+ memcpy(buf, prev, len);
658
+
659
+ return len;
660
+ }
661
+
662
+ static size_t
663
+ real_toupper(const char *str, size_t max, bool use_max, char *buf,
664
+ LocaleType locale_type)
665
+ {
666
+ const char *p = str;
667
+ size_t len = 0;
668
+ bool p_was_i = false;
669
+
670
+ while ((!use_max || p < str + max) && *p != '\0') {
671
+ const char *prev = p;
672
+ p = utf_next(p);
673
+
674
+ len += real_toupper_one(&p, prev, OFFSET_IF(buf, len),
675
+ locale_type, &p_was_i);
676
+ }
677
+
678
+ return len;
679
+ }
680
+
681
+ /* {{{1
682
+ * Wrapper around real_toupper() for dealing with memory allocation and such.
683
+ */
684
+ static char *
685
+ utf_upcase_impl(const char *str, size_t max, bool use_max)
686
+ {
687
+ assert(str != NULL);
688
+
689
+ LocaleType locale_type = get_locale_type();
690
+
691
+ size_t len = real_toupper(str, max, use_max, NULL, locale_type);
692
+ char *result = ALLOC_N(char, len + 1);
693
+ real_toupper(str, max, use_max, result, locale_type);
694
+ result[len] = '\0';
695
+
696
+ return result;
697
+ }
698
+
699
+
700
/* {{{1
 * Convert all characters in ‘str’ to their uppercase representation if
 * applicable.  Returns the freshly allocated representation.  The whole
 * NUL-terminated string is converted.
 */
char *
utf_upcase(const char *str)
{
        const bool bounded = false;

        /* The length argument is not consulted for an unbounded conversion;
         * 0 is only a placeholder. */
        return utf_upcase_impl(str, 0, bounded);
}
709
+
710
+
711
/* {{{1
 * Convert all characters in ‘str’ to their uppercase representation if
 * applicable.  Returns the freshly allocated representation.  Do this for at
 * most ‘len’ bytes from ‘str’.
 */
char *
utf_upcase_n(const char *str, size_t len)
{
        const bool bounded = true;

        return utf_upcase_impl(str, len, bounded);
}
721
+
722
+
723
/* {{{1
 * Traverse the string checking for characters with combining class == 230
 * until a base character is found.
 *
 * Returns true as soon as a character of combining class 230 is seen, false
 * when a character of combining class 0 or the end of the string is reached
 * first.
 */
static bool
has_more_above(const char *str)
{
        const char *p = str;

        while (*p != '\0') {
                switch (_unichar_combining_class(utf_char(p))) {
                case 230:
                        return true;
                case 0:
                        return false;
                default:
                        break;
                }

                p = utf_next(p);
        }

        return false;
}
742
+
743
+ static inline size_t
744
+ real_do_tolower(unichar c, int type, char *buf)
745
+ {
746
+ unichar tv = ATTTABLE(c >> 8, c & 0xff);
747
+
748
+ if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
749
+ return output_special_case(buf,
750
+ tv - UNICODE_SPECIAL_CASE_TABLE_START,
751
+ type, false);
752
+
753
+ /* TODO: this should really use titlecase_table_lookup somehow. */
754
+ if (type == UNICODE_TITLECASE_LETTER)
755
+ for (size_t i = 0; i < lengthof(title_table); i++)
756
+ if (title_table[i][0] == c)
757
+ return unichar_to_utf(title_table[i][2], buf);
758
+
759
+ return unichar_to_utf(tv != '\0' ? tv : c, buf);
760
+ }
761
+
762
+ /* {{{1
763
+ * The real implementation of downcase.
764
+ *
765
+ * TODO: this needs a cleanup.
766
+ */
767
+ static size_t
768
+ real_tolower_one(const char **p, const char *prev, char *buf,
769
+ LocaleType locale_type, const char *end, bool use_end)
770
+ {
771
+ unichar c = utf_char(prev);
772
+ int type = s_type(c);
773
+
774
+ if (locale_type == LOCALE_TURKIC && c == 'I') {
775
+ if (utf_char(*p) == COMBINING_DOT_ABOVE) {
776
+ /* TODO: don’t we need to make sure we don’t go beyond the end
777
+ * of ‘p’? */
778
+ *p = utf_next(*p);
779
+ return unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
780
+ }
781
+
782
+ return unichar_to_utf(LATIN_SMALL_LETTER_DOTLESS_I, buf);
783
+ }
784
+
785
+ if (locale_type == LOCALE_LITHUANIAN &&
786
+ (c == LATIN_CAPITAL_LETTER_I_WITH_GRAVE ||
787
+ c == LATIN_CAPITAL_LETTER_I_WITH_ACUTE ||
788
+ c == LATIN_CAPITAL_LETTER_I_WITH_TILDE)) {
789
+ /* Introduce an explicit dot above the lowercasing capital I's
790
+ * and J's whenever there are more accents above.
791
+ * [SpecialCasing.txt] */
792
+ size_t len = unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
793
+ len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
794
+ switch (c) {
795
+ case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
796
+ len += unichar_to_utf(COMBINING_GRAVE_ACCENT,
797
+ OFFSET_IF(buf, len));
798
+ break;
799
+ case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
800
+ len += unichar_to_utf(COMBINING_ACUTE_ACCENT,
801
+ OFFSET_IF(buf, len));
802
+ break;
803
+ case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
804
+ len += unichar_to_utf(COMBINING_TILDE,
805
+ OFFSET_IF(buf, len));
806
+ break;
807
+ }
808
+
809
+ return len;
810
+ }
811
+
812
+ if (locale_type == LOCALE_LITHUANIAN &&
813
+ (c == 'I' || c == 'J' || c == LATIN_CAPITAL_LETTER_I_WITH_OGONEK) &&
814
+ has_more_above(*p)) {
815
+ size_t len = unichar_to_utf(unichar_tolower(c), buf);
816
+ return len + unichar_to_utf(COMBINING_DOT_ABOVE,
817
+ OFFSET_IF(buf, len));
818
+ }
819
+
820
+ if (c == GREEK_CAPITAL_LETTER_SIGMA) {
821
+ unichar tv = GREEK_SMALL_LETTER_FINAL_SIGMA;
822
+
823
+ if ((!use_end || *p < end) && **p != '\0') {
824
+ unichar next_c = utf_char(*p);
825
+ int next_type = s_type(next_c);
826
+
827
+ /* SIGMA maps differently depending on whether it is
828
+ * final or not. The following simplified test would
829
+ * fail in the case of combining marks following the
830
+ * sigma, but I don't think that occurs in real text.
831
+ * The test here matches that in ICU. */
832
+ if (s_isalpha(next_type))
833
+ tv = GREEK_SMALL_LETTER_SIGMA;
834
+ }
835
+
836
+ return unichar_to_utf(tv, buf);
837
+ }
838
+
839
+ if (IS(type, OR(UNICODE_UPPERCASE_LETTER,
840
+ OR(UNICODE_TITLECASE_LETTER, 0))))
841
+ return real_do_tolower(c, type, buf);
842
+
843
+ size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
844
+
845
+ if (buf != NULL)
846
+ memcpy(buf, prev, len);
847
+
848
+ return len;
849
+ }
850
+
851
+ static size_t
852
+ real_tolower(const char *str, size_t max, bool use_max, char *buf,
853
+ LocaleType locale_type)
854
+ {
855
+ const char *p = str;
856
+ const char *end = str + max;
857
+ size_t len = 0;
858
+
859
+ while ((!use_max || p < end) && *p != '\0') {
860
+ const char *prev = p;
861
+ p = utf_next(p);
862
+
863
+ len += real_tolower_one(&p, prev, OFFSET_IF(buf, len),
864
+ locale_type, end, use_max);
865
+ }
866
+
867
+ return len;
868
+ }
869
+
870
+
871
+ /* {{{1 */
872
+ static char *
873
+ utf_downcase_impl(const char *str, size_t max, bool use_max)
874
+ {
875
+ assert(str != NULL);
876
+
877
+ LocaleType locale_type = get_locale_type();
878
+
879
+ size_t len = real_tolower(str, max, use_max, NULL, locale_type);
880
+ char *result = ALLOC_N(char, len + 1);
881
+ real_tolower(str, max, use_max, result, locale_type);
882
+ result[len] = NUL;
883
+
884
+ return result;
885
+ }
886
+
887
+
888
/* {{{1
 * Convert the characters of ‘str’ to their lowercase representation, where
 * applicable, and return the freshly allocated result.  No length limit is
 * applied.
 */
char *
utf_downcase(const char *str)
{
	char *lowered = utf_downcase_impl(str, 0, false);

	return lowered;
}
897
+
898
+
899
/* {{{1
 * Length-limited variant of utf_downcase(): convert the characters found in
 * at most ‘len’ bytes of ‘str’ to their lowercase representation, where
 * applicable, and return the freshly allocated result.
 */
char *
utf_downcase_n(const char *str, size_t len)
{
	char *lowered = utf_downcase_impl(str, len, true);

	return lowered;
}
909
+
910
+
911
+ /* {{{1
912
+ * The real implementation of case folding below.
913
+ */
914
+
915
+ static bool
916
+ casefold_table_lookup(unichar c, char *folded, size_t *len)
917
+ {
918
+ int begin = 0;
919
+ int end = lengthof(casefold_table);
920
+
921
+ if (c < casefold_table[begin].ch || c > casefold_table[end - 1].ch)
922
+ return false;
923
+
924
+ while (true) {
925
+ int mid = (begin + end) / 2;
926
+
927
+ if (c == casefold_table[mid].ch) {
928
+ if (folded != NULL)
929
+ strcpy(folded, casefold_table[mid].data);
930
+ *len += utf_byte_length(casefold_table[mid].data);
931
+ return true;
932
+ } else if (mid == begin) {
933
+ return false;
934
+ } else if (c > casefold_table[mid].ch) {
935
+ begin = mid;
936
+ } else {
937
+ end = mid;
938
+ }
939
+ }
940
+ }
941
+
942
+ static char *
943
+ utf_foldcase_impl(const char *str, size_t max, bool use_max)
944
+ {
945
+ assert(str != NULL);
946
+
947
+ char *folded = NULL;
948
+ size_t len = 0;
949
+
950
+ again:
951
+ for (const char *p = str; (!use_max || p < str + max) && *p != '\0'; p = utf_next(p)) {
952
+ unichar c = utf_char(p);
953
+
954
+ if (casefold_table_lookup(c, OFFSET_IF(folded, len), &len))
955
+ continue;
956
+
957
+ len += unichar_to_utf(unichar_tolower(c), OFFSET_IF(folded, len));
958
+ }
959
+
960
+ if (folded == NULL) {
961
+ folded = ALLOC_N(char, len + 1);
962
+ folded[0] = NUL;
963
+ len = 0;
964
+ goto again;
965
+ }
966
+
967
+ folded[len] = '\0';
968
+
969
+ return folded;
970
+ }
971
+
972
+
973
/* {{{1
 * Produce a form of ‘str’ that is independent of case and return it as a
 * freshly allocated string.  No length limit is applied.
 */
char *
utf_foldcase(const char *str)
{
	char *folded = utf_foldcase_impl(str, 0, false);

	return folded;
}
982
+
983
+
984
/* {{{1
 * Length-limited variant of utf_foldcase(): produce a case-independent form
 * of at most ‘len’ bytes of ‘str’ and return it as a freshly allocated
 * string.
 */
char *
utf_foldcase_n(const char *str, size_t len)
{
	char *folded = utf_foldcase_impl(str, len, true);

	return folded;
}
994
+
995
+
996
+ /* {{{1
997
+ * The real implementation of utf_width() and utf_width_n() below.
998
+ */
999
+ static size_t
1000
+ utf_width_impl(const char *str, size_t len, bool use_len)
1001
+ {
1002
+ assert(str != NULL);
1003
+
1004
+ size_t width = 0;
1005
+
1006
+ for (const char *p = str; (!use_len || p < str + len) && *p != NUL; p = utf_next(p))
1007
+ width += unichar_iswide(utf_char(p)) ? 2 : 1;
1008
+
1009
+ return width;
1010
+ }
1011
+
1012
+
1013
/* {{{1
 * Calculate how many cells ‘str’ is wide.  No length limit is applied.
 */
size_t
utf_width(const char *str)
{
	size_t cells = utf_width_impl(str, 0, false);

	return cells;
}
1021
+
1022
+
1023
/* {{{1
 * Calculate how many cells ‘str’ is wide, looking at no more than ‘len’
 * bytes of it.
 */
size_t
utf_width_n(const char *str, size_t len)
{
	size_t cells = utf_width_impl(str, len, true);

	return cells;
}
1031
+
1032
+
1033
+ /* {{{1
1034
+ * Retrieve the mirrored representation of ‘c’ (if any) and store it in
1035
+ * ‘mirrored’.
1036
+ */
1037
+ bool
1038
+ unichar_mirror(unichar c, unichar *mirrored)
1039
+ {
1040
+ int begin = 0;
1041
+ int end = lengthof(bidi_mirroring_table);
1042
+
1043
+ while (true) {
1044
+ int mid = (begin + end) / 2;
1045
+
1046
+ if (c == bidi_mirroring_table[mid].ch) {
1047
+ if (mirrored != NULL)
1048
+ *mirrored = bidi_mirroring_table[mid].mirrored_ch;
1049
+ return true;
1050
+ } else if (mid == begin) {
1051
+ return false;
1052
+ } else if (c > bidi_mirroring_table[mid].ch) {
1053
+ begin = mid;
1054
+ } else {
1055
+ end = mid;
1056
+ }
1057
+ }
1058
+ }
1059
+
1060
+
1061
+ /* }}}1 */