u 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/README +38 -0
  2. data/Rakefile +64 -0
  3. data/ext/encoding/character/utf-8/break.c +25 -0
  4. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  5. data/ext/encoding/character/utf-8/data/character-tables.h +14358 -0
  6. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  7. data/ext/encoding/character/utf-8/data/decompose.h +10926 -0
  8. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1070 -0
  9. data/ext/encoding/character/utf-8/decompose.c +444 -0
  10. data/ext/encoding/character/utf-8/depend +65 -0
  11. data/ext/encoding/character/utf-8/extconf.rb +67 -0
  12. data/ext/encoding/character/utf-8/private.c +62 -0
  13. data/ext/encoding/character/utf-8/private.h +51 -0
  14. data/ext/encoding/character/utf-8/properties.c +1056 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +19 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_private.h +52 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  19. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  20. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  22. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  23. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  24. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  25. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  26. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  27. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  28. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  29. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  30. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  31. data/ext/encoding/character/utf-8/rb_utf_insert.c +48 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +332 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  35. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  36. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  37. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  38. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  39. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  40. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  41. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  43. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  44. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  45. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  46. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  47. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  48. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  49. data/ext/encoding/character/utf-8/tables.h +38 -0
  50. data/ext/encoding/character/utf-8/unicode.c +319 -0
  51. data/ext/encoding/character/utf-8/unicode.h +216 -0
  52. data/ext/encoding/character/utf-8/utf.c +1334 -0
  53. data/lib/encoding/character/utf-8.rb +201 -0
  54. data/lib/u.rb +16 -0
  55. data/lib/u/string.rb +185 -0
  56. data/lib/u/version.rb +5 -0
  57. data/test/unit/u.rb +5 -0
  58. data/test/unit/u/string.rb +91 -0
  59. metadata +174 -0
@@ -0,0 +1,67 @@
1
require 'mkmf'

# Run a trial compilation that passes +opt+ to the compiler and, when it
# succeeds, append +opt+ to $CFLAGS.  The optional block is forwarded to
# try_compile.
def try_compiler_option(opt, &block)
  checking_for "#{opt} option to compiler" do
    accepted = try_compile('', opt, &block)
    $CFLAGS += " #{opt}" if accepted
  end
end

# Language-standard, code-generation and warning flags, probed one at a time
# so that flags the compiler rejects are simply left out.
%w[
  -std=c99
  -finline-functions
  -fno-common
  -Wall
  -Waggregate-return
  -Wcast-align
  -Wextra
  -Wformat=2
  -Winit-self
  -Winline
  -Wmissing-declarations
  -Wmissing-format-attribute
  -Wmissing-include-dirs
  -Wmissing-noreturn
  -Wmissing-prototypes
  -Wnested-externs
  -Wold-style-definition
  -Wpacked
  -Wp,-D_FORTIFY_SOURCE=2
  -Wpointer-arith
  -Wsign-compare
  -Wstrict-aliasing=2
  -Wswitch-default
  -Wswitch-enum
  -Wundef
  -Wunsafe-loop-optimizations
  -Wwrite-strings
].each { |opt| try_compiler_option opt }

# Define HAVE_GNUC_VISIBILITY when the compiler accepts the ‹visibility›
# attribute without emitting a warning (the probe is built with -Werror).
checking_for 'GNUC visibility attribute' do
  visibility_probe = <<EOC
void f_hidden(void);
void __attribute__((visibility("hidden")))
f_hidden(void)
{
}
int main(void)
{
f_hidden();
return 0;
}
EOC
  $defs.push('-DHAVE_GNUC_VISIBILITY') if try_compile(visibility_probe, '-Werror')
end

# Probe for the standard headers the extension relies on.
%w[
  assert.h
  limits.h
  locale.h
  stdbool.h
  stddef.h
  stdint.h
  stdio.h
  stdlib.h
  string.h
  sys/types.h
  wchar.h
].each { |header| have_header header }

# Install the public header next to the compiled extension.
$INSTALLFILES ||= []
$INSTALLFILES << ['unicode.h', '$(RUBYARCHDIR)', 'lib']

create_makefile 'encoding/character/utf-8/utf8'
@@ -0,0 +1,62 @@
1
+ /*
2
+ * contents: Private functions used by the UTF-8 character-encoding library.
3
+ *
4
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include <ruby.h>
8
+ #include <stdbool.h>
9
+ #include <stddef.h>
10
+ #include <stdint.h>
11
+ #include <stdlib.h>
12
+
13
+ #include "unicode.h"
14
+
15
+ #include "private.h"
16
+
17
+ /* Lookup C in the sorted TABLE using binary search. TABLE consists of N
18
+ * entries, where each entry is SIZEOF_ENTRY bytes in size and the first
19
+ * component is a unichar of size SIZEOF_CHAR. If C is found in TABLE, its
20
+ * index is stored in INDEX and true is returned. Otherwise, false is returned
21
+ * and INDEX is left untouched. */
22
+ bool
23
+ binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index)
24
+ {
25
+ #define ENTRY(index) ((unichar)(*(unichar *)((const char *)table + ((index) * sizeof_entry))) & char_mask)
26
+
27
+ int begin = 0;
28
+ int end = n - 1;
29
+ int middle;
30
+
31
+ /* This is ugly, but not all tables use unichars as their lookup
32
+ * character. The casefold table, for example, uses uint16_t-sized
33
+ * characters. To only get the interesting part of our table entry
34
+ * we’ll have to mask the retrieved value. */
35
+ int char_mask = (1 << (8 * sizeof_char)) - 1;
36
+
37
+ /* Drop out early if we know for certain that C can’t be in the
38
+ * decomposition table. */
39
+ if (c < ENTRY(0) || c > ENTRY(end))
40
+ return false;
41
+
42
+ while (begin <= end) {
43
+ middle = binary_search_middle_of(begin, end);
44
+
45
+ unichar probe = ENTRY(middle);
46
+ if (c < probe)
47
+ end = middle - 1;
48
+ else if (c > probe)
49
+ begin = middle + 1;
50
+ else
51
+ break;
52
+ }
53
+
54
+ if (begin > end)
55
+ return false;
56
+
57
+ *index = middle;
58
+
59
+ return true;
60
+
61
+ #undef ENTRY
62
+ }
@@ -0,0 +1,51 @@
1
+ /*
2
+ * contents: Private Unicode related information.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+ #ifndef PRIVATE_H
8
+ #define PRIVATE_H
9
+
10
+ #define NUL '\0'
11
+ #define lengthof(ary) (sizeof(ary) / sizeof((ary)[0]))
12
+
13
+ #if defined(HAVE_GNUC_VISIBILITY)
14
+ # define HIDDEN \
15
+ __attribute__((visibility("hidden")))
16
+ #else
17
+ # define HIDDEN
18
+ #endif
19
+
20
+ #if defined(__GNUC__)
21
+ # define UNUSED(u) \
22
+ u __attribute__((__unused__))
23
+ #else
24
+ # define UNUSED(u) \
25
+ u
26
+ #endif
27
+
28
+ #define binary_search_middle_of(begin, end) \
29
+ (((unsigned)((begin) + (end))) >> 1)
30
+
31
+ #define unicode_table_lookup(table, c, index) \
32
+ binary_search_unicode_table(table, lengthof(table), sizeof((table)[0]), sizeof((table)[0].ch), c, index)
33
+
34
+ bool binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index) HIDDEN;
35
+
36
+ #define SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part, page, c) \
37
+ ((part[page] >= UNICODE_MAX_TABLE_INDEX) \
38
+ ? (part[page] - UNICODE_MAX_TABLE_INDEX) \
39
+ : (data[part[page]][(c) & 0xff]))
40
+
41
+ #define SPLIT_UNICODE_TABLE_LOOKUP(data, part1, part2, c, fallback) \
42
+ (((c) <= UNICODE_LAST_CHAR_PART1) \
43
+ ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part1, (c) >> 8, c) \
44
+ : (((c) >= UNICODE_FIRST_CHAR_PART2 && (c) <= UNICODE_LAST_CHAR) \
45
+ ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part2, ((c) - UNICODE_FIRST_CHAR_PART2) >> 8, c) \
46
+ : (fallback)))
47
+
48
+ unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
49
+ NormalizeMode mode) HIDDEN;
50
+
51
+ #endif /* PRIVATE_H */
@@ -0,0 +1,1056 @@
1
+ /*
2
+ * contents: Unicode character properties.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+ #include <ruby.h>
8
+ #include <assert.h>
9
+ #include <locale.h>
10
+ #include <stdbool.h>
11
+ #include <stddef.h>
12
+ #include <stdint.h>
13
+ #include <string.h>
14
+ #include "unicode.h"
15
+ #include "private.h"
16
+ #include "data/character-tables.h"
17
+
18
+
19
+ #define COMBINING_DOT_ABOVE ((unichar)0x0307)
20
+ #define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE ((unichar)0x0130)
21
+ #define COMBINING_GREEK_YPOGEGRAMMENI ((unichar)0x0345)
22
+ #define GREEK_CAPITAL_LETTER_IOTA ((unichar)0x0399)
23
+ #define LATIN_SMALL_LETTER_I ((unichar)0x0069)
24
+ #define LATIN_SMALL_LETTER_DOTLESS_I ((unichar)0x0131)
25
+ #define LATIN_CAPITAL_LETTER_I_WITH_GRAVE ((unichar)0x00cc)
26
+ #define LATIN_CAPITAL_LETTER_I_WITH_ACUTE ((unichar)0x00cd)
27
+ #define LATIN_CAPITAL_LETTER_I_WITH_TILDE ((unichar)0x0128)
28
+ #define LATIN_CAPITAL_LETTER_I_WITH_OGONEK ((unichar)0x012e)
29
+ #define COMBINING_GRAVE_ACCENT ((unichar)0x0300)
30
+ #define COMBINING_ACUTE_ACCENT ((unichar)0x0301)
31
+ #define COMBINING_TILDE ((unichar)0x0303)
32
+ #define GREEK_CAPITAL_LETTER_SIGMA ((unichar)0x03a3)
33
+ #define GREEK_SMALL_LETTER_SIGMA ((unichar)0x03c3)
34
+ #define GREEK_SMALL_LETTER_FINAL_SIGMA ((unichar)0x03c2)
35
+
36
+ #define OFFSET_IF(buf, len) (((buf) != NULL) ? (buf) + (len) : NULL)
37
+
38
/* {{{1
 * Macros for accessing the Unicode character attribute table.
 *
 * ATTR_TABLE(pg) yields the attribute-table entry for page ‘pg’, taken from
 * attr_table_part1 for pages up to UNICODE_LAST_PAGE_PART1 and from
 * attr_table_part2 (rebased by 0xe00) for later pages.
 *
 * ATTTABLE(pg, offset) yields the attribute stored for position ‘offset’ on
 * page ‘pg’, or 0 when the page entry equals UNICODE_MAX_TABLE_INDEX.
 *
 * TODO: Turn these macros into full-fledged functions, as this is rather silly
 * when we have ‹inline› in C99.
 */
#define ATTR_TABLE(pg) \
	(((pg) <= UNICODE_LAST_PAGE_PART1) \
	 ? attr_table_part1[pg] \
	 : attr_table_part2[(pg) - 0xe00])

#define ATTTABLE(pg, offset) \
	((ATTR_TABLE(pg) == UNICODE_MAX_TABLE_INDEX) \
	 ? 0 : (attr_data[ATTR_TABLE(pg)][offset]))
52
+
53
+
54
+ /* {{{1
55
+ * Internal function used for figuring out the type of a given character.
56
+ */
57
+ static inline int
58
+ s_type(unichar c)
59
+ {
60
+ const int16_t *table;
61
+ unsigned int page;
62
+
63
+ if (c <= UNICODE_LAST_CHAR_PART1) {
64
+ page = c >> 8;
65
+ table = type_table_part1;
66
+ } else if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR) {
67
+ page = (c - UNICODE_FIRST_CHAR_PART2) >> 8;
68
+ table = type_table_part2;
69
+ } else {
70
+ return UNICODE_UNASSIGNED;
71
+ }
72
+
73
+ if (table[page] >= UNICODE_MAX_TABLE_INDEX)
74
+ return table[page] - UNICODE_MAX_TABLE_INDEX;
75
+ else
76
+ return type_data[table[page]][c & 0xff];
77
+ }
78
+
79
+
80
/* {{{1
 * Bit-fiddling macros for testing the class of a type.
 *
 * OR(t, rest) adds the bit for type ‘t’ to the bit set ‘rest’; IS(t, set) is
 * non-zero when the bit for type ‘t’ is present in ‘set’.
 */
#define IS(t, set)	((1u << (t)) & (set))
#define OR(t, rest)	((1u << (t)) | (rest))
85
+
86
+
87
+ /* {{{1
88
+ * Internal function used to check if the given type represents a digit type.
89
+ */
90
+ static inline bool
91
+ s_isdigit(int type)
92
+ {
93
+ return IS(type,
94
+ OR(UNICODE_DECIMAL_NUMBER,
95
+ OR(UNICODE_LETTER_NUMBER,
96
+ OR(UNICODE_OTHER_NUMBER, 0))));
97
+ }
98
+
99
+
100
+ /* {{{1
101
+ * Internal function used to check if the given type represents an alphabetic
102
+ * type.
103
+ */
104
+ static inline bool
105
+ s_isalpha(int type)
106
+ {
107
+ return IS(type,
108
+ OR(UNICODE_LOWERCASE_LETTER,
109
+ OR(UNICODE_UPPERCASE_LETTER,
110
+ OR(UNICODE_TITLECASE_LETTER,
111
+ OR(UNICODE_MODIFIER_LETTER,
112
+ OR(UNICODE_OTHER_LETTER, 0))))));
113
+ }
114
+
115
+
116
+ /* {{{1
117
+ * Internal function used to check if the given type represents a mark type.
118
+ */
119
+ static inline bool
120
+ s_ismark(int type)
121
+ {
122
+ return IS(type,
123
+ OR(UNICODE_NON_SPACING_MARK,
124
+ OR(UNICODE_COMBINING_MARK,
125
+ OR(UNICODE_ENCLOSING_MARK, 0))));
126
+ }
127
+
128
+
129
+ /* {{{1
130
+ * Determine whether ‘c’ is an alphanumeric, such as A, B, C, 0, 1, or 2.
131
+ */
132
+ bool
133
+ unichar_isalnum(unichar c)
134
+ {
135
+ int type = s_type(c);
136
+
137
+ return s_isdigit(type) || s_isalpha(type);
138
+ }
139
+
140
+
141
+ /* {{{1
142
+ * Determine whether ‘c’ is an alphabetic (i.e. a letter), such as A, B, or C.
143
+ */
144
+ bool
145
+ unichar_isalpha(unichar c)
146
+ {
147
+ return s_isalpha(s_type(c));
148
+ }
149
+
150
+
151
+ /* {{{1
152
+ * Determine whether ‘c’ is a control character, such as ‹NUL›.
153
+ */
154
+ bool
155
+ unichar_iscntrl(unichar c)
156
+ {
157
+ return s_type(c) == UNICODE_CONTROL;
158
+ }
159
+
160
+
161
+ /* {{{1
162
+ * Determine whether ‘c’ is a digit, such as 0, 1, or 2.
163
+ */
164
+ bool
165
+ unichar_isdigit(unichar c)
166
+ {
167
+ return s_type(c) == UNICODE_DECIMAL_NUMBER;
168
+ }
169
+
170
+
171
+ /* {{{1
172
+ * Determine whether ‘c’ is printable and not a space or control character such
173
+ * as tab or <NUL›, such as A, B, or C.
174
+ */
175
+ bool
176
+ unichar_isgraph(unichar c)
177
+ {
178
+ return !IS(s_type(c),
179
+ OR(UNICODE_CONTROL,
180
+ OR(UNICODE_FORMAT,
181
+ OR(UNICODE_UNASSIGNED,
182
+ OR(UNICODE_PRIVATE_USE,
183
+ OR(UNICODE_SURROGATE,
184
+ OR(UNICODE_SPACE_SEPARATOR, 0)))))));
185
+ }
186
+
187
+
188
+ /* {{{1
189
+ * Determine whether ‘c’ is a lowercase letter, such as a, b, or c.
190
+ */
191
+ bool
192
+ unichar_islower(unichar c)
193
+ {
194
+ return s_type(c) == UNICODE_LOWERCASE_LETTER;
195
+ }
196
+
197
+
198
+ /* {{{1
199
+ * Determine whether ‘c’ is printable, which works the same as
200
+ * unichar_isgraph(), except that space characters are also printable.
201
+ */
202
+ bool
203
+ unichar_isprint(unichar c)
204
+ {
205
+ return !IS(s_type(c),
206
+ OR(UNICODE_CONTROL,
207
+ OR(UNICODE_FORMAT,
208
+ OR(UNICODE_UNASSIGNED,
209
+ OR(UNICODE_PRIVATE_USE,
210
+ OR(UNICODE_SURROGATE, 0))))));
211
+ }
212
+
213
+
214
+ /* {{{1
215
+ * Determine whether ‘c’ is some form of punctuation or other symbol.
216
+ */
217
+ bool
218
+ unichar_ispunct(unichar c)
219
+ {
220
+ return IS(s_type(c),
221
+ OR(UNICODE_CONNECT_PUNCTUATION,
222
+ OR(UNICODE_DASH_PUNCTUATION,
223
+ OR(UNICODE_OPEN_PUNCTUATION,
224
+ OR(UNICODE_CLOSE_PUNCTUATION,
225
+ OR(UNICODE_INITIAL_PUNCTUATION,
226
+ OR(UNICODE_FINAL_PUNCTUATION,
227
+ OR(UNICODE_OTHER_PUNCTUATION,
228
+ OR(UNICODE_MODIFIER_SYMBOL,
229
+ OR(UNICODE_MATH_SYMBOL,
230
+ OR(UNICODE_CURRENCY_SYMBOL,
231
+ OR(UNICODE_OTHER_SYMBOL, 0)))))))))))) ? true : false;
232
+ }
233
+
234
+
235
+ /* {{{1
236
+ * Determine whether ‘c’ is some form of whitespace, such as space, tab or a
237
+ * line separator (newline, carriage return, etc.).
238
+ */
239
+ bool
240
+ unichar_isspace(unichar c)
241
+ {
242
+ switch (c) {
243
+ case '\t':
244
+ case '\n':
245
+ case '\r':
246
+ case '\f':
247
+ return true;
248
+ default:
249
+ return IS(s_type(c),
250
+ OR(UNICODE_SPACE_SEPARATOR,
251
+ OR(UNICODE_LINE_SEPARATOR,
252
+ OR(UNICODE_PARAGRAPH_SEPARATOR, 0)))) ? true : false;
253
+ }
254
+ }
255
+
256
+
257
+ /* {{{1
258
+ * Determine whether ‘c’ is an uppeercase letter, such as A, B, or C
259
+ */
260
+ bool
261
+ unichar_isupper(unichar c)
262
+ {
263
+ return s_type(c) == UNICODE_UPPERCASE_LETTER;
264
+ }
265
+
266
+
267
+ /* {{{1
268
+ * Determine whether ‘c’ is a titlecase letter, such as the slavic digraph DZ,
269
+ * which at the beginning of a word is written as Dz, where only the initial D
270
+ * is capitalized. (Complicated huh?)
271
+ */
272
+ bool
273
+ unichar_istitle(unichar c)
274
+ {
275
+ /* TODO: binary search helpful? */
276
+ for (size_t i = 0; i < lengthof(title_table); i++)
277
+ if (title_table[i][0] == c)
278
+ return true;
279
+
280
+ return false;
281
+ }
282
+
283
+
284
+ /* {{{1
285
+ * Determine whether ‘c’ is a new-line.
286
+ */
287
+ #define UNICHAR_NEXT_LINE ((unichar)0x0085)
288
+ #define UNICHAR_LINE_SEPARATOR ((unichar)0x2028)
289
+ #define UNICHAR_PARAGRAPH_SEPARATOR ((unichar)0x2029)
290
+
291
+ bool
292
+ unichar_isnewline(unichar c)
293
+ {
294
+ switch (c) {
295
+ case '\n': case '\f': case '\r': case UNICHAR_NEXT_LINE:
296
+ case UNICHAR_LINE_SEPARATOR: case UNICHAR_PARAGRAPH_SEPARATOR:
297
+ return true;
298
+ default:
299
+ return false;
300
+ }
301
+ }
302
+
303
+ /* {{{1
304
+ * Determine whether ‘c’ is a hexadecimal digit, such as 0, 1, ..., 9, a, b,
305
+ * ..., f, or A, B, ..., F.
306
+ */
307
+ #define UNICHAR_FULLWIDTH_A 0xff21
308
+ #define UNICHAR_FULLWIDTH_F 0xff26
309
+ #define UNICHAR_FULLWIDTH_a 0xff41
310
+ #define UNICHAR_FULLWIDTH_f 0xff46
311
+ bool
312
+ unichar_isxdigit(unichar c)
313
+ {
314
+ return ((c >= 'a' && c <= 'f') ||
315
+ (c >= 'A' && c <= 'F') ||
316
+ (c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f) ||
317
+ (c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F) ||
318
+ (s_type(c) == UNICODE_DECIMAL_NUMBER));
319
+ // s_isdigit(s_type(c)));
320
+ }
321
+
322
+
323
+ /* {{{1
324
+ * Determine whether code point ‘c’ has been assigned a code value.
325
+ */
326
+ bool
327
+ unichar_isassigned(unichar c)
328
+ {
329
+ return s_type(c) != UNICODE_UNASSIGNED;
330
+ }
331
+
332
+
333
+ /* {{{1
334
+ * Determine whether ‘c’ is a wide character, thus is typically rendered in a
335
+ * double-width cell on a terminal.
336
+ */
337
+ bool
338
+ unichar_iswide(unichar c)
339
+ {
340
+ if (c < 0x1100)
341
+ return false;
342
+
343
+ return (c <= 0x115f || /* Hangul Jamo init. consonants */
344
+ c == 0x2329 || c == 0x232a || /* angle brackets */
345
+ (c >= 0x2e80 && c <= 0xa4cf && /* CJK ... Yi */
346
+ (c < 0x302a || c > 0x302f) &&
347
+ c != 0x303f && c != 0x3099 && c != 0x309a) ||
348
+ (c >= 0xac00 && c <= 0xd7a3) || /* Hangul syllables */
349
+ (c >= 0xf900 && c <= 0xfaff) || /* CJK comp. graphs */
350
+ (c >= 0xfe30 && c <= 0xfe6f) || /* CJK comp. forms */
351
+ (c >= 0xff00 && c <= 0xff60) || /* fullwidth forms */
352
+ (c >= 0xffe0 && c <= 0xffe6) || /* -"- */
353
+ (c >= 0x20000 && c <= 0x2fffd) || /* CJK extra stuff */
354
+ (c >= 0x30000 && c <= 0x3fffd)); /* -"- */
355
+ }
356
+
357
+
358
+ /* {{{1
359
+ * Convert ‘c’ to its uppercase representation (if any).
360
+ */
361
+ static unichar
362
+ special_case_table_lookup(unichar c)
363
+ {
364
+ unichar tv = ATTTABLE(c >> 8, c & 0xff);
365
+
366
+ if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
367
+ tv = utf_char(special_case_table +
368
+ tv - UNICODE_SPECIAL_CASE_TABLE_START);
369
+
370
+ if (tv == '\0')
371
+ return c;
372
+
373
+ return tv;
374
+ }
375
+
376
+ static unichar
377
+ titlecase_table_lookup(unichar c, bool want_upper)
378
+ {
379
+ for (size_t i = 0; i < lengthof(title_table); i++)
380
+ if (title_table[i][0] == c)
381
+ return title_table[i][want_upper ? 1 : 2];
382
+
383
+ return c;
384
+ }
385
+
386
+ unichar
387
+ unichar_toupper(unichar c)
388
+ {
389
+ int type = s_type(c);
390
+
391
+ if (type == UNICODE_LOWERCASE_LETTER)
392
+ return special_case_table_lookup(c);
393
+
394
+ if (type == UNICODE_TITLECASE_LETTER)
395
+ return titlecase_table_lookup(c, true);
396
+
397
+ return c;
398
+ }
399
+
400
+
401
+ /* {{{1
402
+ * Convert ‘c’ to its lowercase representation (if any).
403
+ */
404
+ unichar
405
+ unichar_tolower(unichar c)
406
+ {
407
+ int type = s_type(c);
408
+
409
+ if (type == UNICODE_UPPERCASE_LETTER)
410
+ return special_case_table_lookup(c);
411
+
412
+ if (type == UNICODE_TITLECASE_LETTER)
413
+ return titlecase_table_lookup(c, false);
414
+
415
+ return c;
416
+ }
417
+
418
+
419
+ /* {{{1
420
+ * Convert ‘c’ to its titlecase representation (if any).
421
+ */
422
+ unichar
423
+ unichar_totitle(unichar c)
424
+ {
425
+ for (size_t i = 0; i < lengthof(title_table); i++)
426
+ if (title_table[i][0] == c ||
427
+ title_table[i][1] == c ||
428
+ title_table[i][2] == c)
429
+ return title_table[i][0];
430
+
431
+ if (s_type(c) == UNICODE_LOWERCASE_LETTER)
432
+ return unichar_toupper(c);
433
+
434
+ return c;
435
+ }
436
+
437
+
438
+ /* {{{1
439
+ * Return the numeric value of ‘c’ if it's a decimal digit, or -1 if not.
440
+ */
441
+ int
442
+ unichar_digit_value(unichar c)
443
+ {
444
+ if (s_type(c) == UNICODE_DECIMAL_NUMBER)
445
+ return ATTTABLE(c >> 8, c & 0xff);
446
+
447
+ return -1;
448
+ }
449
+
450
+
451
+ /* {{{1
452
+ * Return the numeric value of ‘c’ if it's a hexadecimal digit, or -1 if not.
453
+ */
454
+ int
455
+ unichar_xdigit_value(unichar c)
456
+ {
457
+ if (c >= 'a' && c <= 'f')
458
+ return c - 'a' + 10;
459
+ else if (c >= 'A' && c <= 'F')
460
+ return c - 'A' + 10;
461
+ else if (c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f)
462
+ return c - UNICHAR_FULLWIDTH_a + 10;
463
+ else if (c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F)
464
+ return c - UNICHAR_FULLWIDTH_A + 10;
465
+ else
466
+ return unichar_digit_value(c);
467
+ }
468
+
469
+
470
+ /* {{{1
471
+ * Determine the Unicode character type of ‘c’.
472
+ */
473
+ UnicodeType
474
+ unichar_type(unichar c)
475
+ {
476
+ return s_type(c);
477
+ }
478
+
479
+
480
/* {{{1
 * LocaleType: This ‹enum› is used for dealing with different locales for
 * turning strings into uppercase or lowercase.
 */
typedef enum {
	LOCALE_NORMAL,
	LOCALE_TURKIC,
	LOCALE_LITHUANIAN
} LocaleType;


/* {{{1
 * Retrieve the locale type from the current LC_CTYPE locale.
 *
 * Locale names beginning with “az” or “tr” are Turkic, names beginning with
 * “lt” are Lithuanian, and everything else is normal.  If the locale cannot be
 * queried at all (setlocale() returns NULL), LOCALE_NORMAL is assumed rather
 * than dereferencing the null pointer.
 */
static LocaleType
get_locale_type(void)
{
	const char *locale = setlocale(LC_CTYPE, NULL);

	if (locale == NULL)
		return LOCALE_NORMAL;

	/* The second character is only inspected once the first one matched,
	 * so an empty name is never read past its terminator. */
	if ((locale[0] == 'a' && locale[1] == 'z') ||
	    (locale[0] == 't' && locale[1] == 'r'))
		return LOCALE_TURKIC;

	if (locale[0] == 'l' && locale[1] == 't')
		return LOCALE_LITHUANIAN;

	return LOCALE_NORMAL;
}
508
+
509
+
510
+ /* {{{1
511
+ * Put character marks found in ‘p_inout’ into itself. If ‘remove_dot’ is
512
+ * true, remove the dot over an uppercase I for a turkish locale.
513
+ */
514
+ static size_t
515
+ output_marks(const char **p_inout, char *buf, bool remove_dot)
516
+ {
517
+ size_t len = 0;
518
+ const char *p = *p_inout;
519
+
520
+ for ( ; *p != '\0'; p = utf_next(p)) {
521
+ unichar c = utf_char(p);
522
+
523
+ if (!s_ismark(s_type(c)))
524
+ break;
525
+
526
+ if (!remove_dot || c != COMBINING_DOT_ABOVE)
527
+ len += unichar_to_utf(c, (buf != NULL) ? buf + len : NULL);
528
+ }
529
+
530
+ *p_inout = p;
531
+
532
+ return len;
533
+ }
534
+
535
+ /* {{{1
536
+ * Output titlecases where appropriate.
537
+ */
538
+ static size_t
539
+ output_special_case(char *buf, int offset, int type, bool upper)
540
+ {
541
+ const char *p = special_case_table + offset;
542
+
543
+ if (type != UNICODE_TITLECASE_LETTER)
544
+ p = utf_next(p);
545
+
546
+ if (upper)
547
+ p += utf_byte_length(p) + 1;
548
+
549
+ size_t len = utf_byte_length(p);
550
+
551
+ if (buf != NULL)
552
+ memcpy(buf, p, len);
553
+
554
+ return len;
555
+ }
556
+
557
+ /* {{{1
558
+ * Do uppercasing of ‘p’ for Lithuanian locales.
559
+ */
560
+ static size_t
561
+ remove_all_combining_dot_above(unichar c, char *buf)
562
+ {
563
+ size_t decomp_len;
564
+ unichar *decomp = unicode_canonical_decomposition(c, &decomp_len);
565
+
566
+ size_t len = 0;
567
+ for (size_t i = 0; i < decomp_len; i++)
568
+ if (decomp[i] != COMBINING_DOT_ABOVE)
569
+ len += unichar_to_utf(unichar_toupper(decomp[i]),
570
+ OFFSET_IF(buf, len));
571
+
572
+ free(decomp);
573
+
574
+ return len;
575
+ }
576
+
577
+ static size_t
578
+ real_toupper_lithuanian(const char **p, unichar c, int type, char *buf,
579
+ bool *was_i)
580
+ {
581
+ if (c == 'i') {
582
+ *was_i = true;
583
+ return 0;
584
+ }
585
+
586
+ if (*was_i) {
587
+ size_t len = remove_all_combining_dot_above(c, buf);
588
+ return len + output_marks(p, OFFSET_IF(buf, len), true);
589
+ }
590
+
591
+ if (!s_ismark(type))
592
+ *was_i = false;
593
+
594
+ return 0;
595
+ }
596
+
597
+ /* {{{1
598
+ * Do real upcasing. */
599
+ static inline size_t
600
+ real_do_toupper(unichar c, int type, char *buf)
601
+ {
602
+ bool upper = (type != UNICODE_LOWERCASE_LETTER);
603
+ unichar tv = ATTTABLE(c >> 8, c & 0xff);
604
+
605
+ if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
606
+ return output_special_case(buf,
607
+ tv - UNICODE_SPECIAL_CASE_TABLE_START,
608
+ type, upper);
609
+
610
+ /* TODO: this should really use titlecase_table_lookup somehow. */
611
+ if (type == UNICODE_TITLECASE_LETTER)
612
+ for (size_t i = 0; i < lengthof(title_table); i++)
613
+ if (title_table[i][0] == c)
614
+ return unichar_to_utf(title_table[i][1], buf);
615
+
616
+ return unichar_to_utf(tv != '\0' ? tv : c, buf);
617
+ }
618
+
619
+ /* {{{1
620
+ * Do real uppercasing of ‘str’.
621
+ */
622
+ static size_t
623
+ real_toupper_one(const char **p, const char *prev, char *buf,
624
+ LocaleType locale_type, bool *was_i)
625
+ {
626
+ unichar c = utf_char(prev);
627
+ int type = s_type(c);
628
+
629
+ if (locale_type == LOCALE_LITHUANIAN) {
630
+ size_t len = real_toupper_lithuanian(p, c, type, buf, was_i);
631
+ if (len > 0)
632
+ return len;
633
+ }
634
+
635
+ if (locale_type == LOCALE_TURKIC && c == 'i')
636
+ return unichar_to_utf(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
637
+ buf);
638
+
639
+ if (c == COMBINING_GREEK_YPOGEGRAMMENI) {
640
+ /* Nasty, need to move it after other combining marks...this
641
+ * would go away if we normalized first. */
642
+ /* TODO: don’t we need to make sure we don’t go beyond the end
643
+ * of ‘p’? */
644
+ size_t len = output_marks(p, buf, false);
645
+ return len + unichar_to_utf(GREEK_CAPITAL_LETTER_IOTA,
646
+ OFFSET_IF(buf, len));
647
+ }
648
+
649
+ if (IS(type, OR(UNICODE_LOWERCASE_LETTER,
650
+ OR(UNICODE_TITLECASE_LETTER, 0))))
651
+ return real_do_toupper(c, type, buf);
652
+
653
+ size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
654
+
655
+ if (buf != NULL)
656
+ memcpy(buf, prev, len);
657
+
658
+ return len;
659
+ }
660
+
661
+ static size_t
662
+ real_toupper(const char *str, size_t max, bool use_max, char *buf,
663
+ LocaleType locale_type)
664
+ {
665
+ const char *p = str;
666
+ size_t len = 0;
667
+ bool p_was_i = false;
668
+
669
+ while ((!use_max || p < str + max) && *p != '\0') {
670
+ const char *prev = p;
671
+ p = utf_next(p);
672
+
673
+ len += real_toupper_one(&p, prev, OFFSET_IF(buf, len),
674
+ locale_type, &p_was_i);
675
+ }
676
+
677
+ return len;
678
+ }
679
+
680
+ /* {{{1
681
+ * Wrapper around real_toupper() for dealing with memory allocation and such.
682
+ */
683
+ static char *
684
+ utf_upcase_impl(const char *str, size_t max, bool use_max)
685
+ {
686
+ assert(str != NULL);
687
+
688
+ LocaleType locale_type = get_locale_type();
689
+
690
+ size_t len = real_toupper(str, max, use_max, NULL, locale_type);
691
+ char *result = ALLOC_N(char, len + 1);
692
+ real_toupper(str, max, use_max, result, locale_type);
693
+ result[len] = '\0';
694
+
695
+ return result;
696
+ }
697
+
698
+
699
/* {{{1
 * Convert all characters in the NUL-terminated ‘str’ to their uppercase
 * representation if applicable.  Returns the freshly allocated
 * representation.
 */
char *
utf_upcase(const char *str)
{
	char *upcased = utf_upcase_impl(str, 0, false);

	return upcased;
}
708
+
709
+
710
/* {{{1
 * Convert all characters in ‘str’ to their uppercase representation if
 * applicable.  Returns the freshly allocated representation.  Do this for at
 * most ‘len’ bytes from ‘str’.
 */
char *
utf_upcase_n(const char *str, size_t len)
{
	char *upcased = utf_upcase_impl(str, len, true);

	return upcased;
}
720
+
721
+
722
/* {{{1
 * Traverse the string checking for characters with combining class == 230
 * until a base character (combining class 0) or the end of the string is
 * found.  Returns true only if a class-230 character turned up first.
 */
static bool
has_more_above(const char *str)
{
	const char *cursor = str;

	while (*cursor != '\0') {
		int combining_class = unichar_combining_class(utf_char(cursor));

		if (combining_class == 230)
			return true;

		if (combining_class == 0)
			return false;

		cursor = utf_next(cursor);
	}

	return false;
}
741
+
742
+ static inline size_t
743
+ real_do_tolower(unichar c, int type, char *buf)
744
+ {
745
+ unichar tv = ATTTABLE(c >> 8, c & 0xff);
746
+
747
+ if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
748
+ return output_special_case(buf,
749
+ tv - UNICODE_SPECIAL_CASE_TABLE_START,
750
+ type, false);
751
+
752
+ /* TODO: this should really use titlecase_table_lookup somehow. */
753
+ if (type == UNICODE_TITLECASE_LETTER)
754
+ for (size_t i = 0; i < lengthof(title_table); i++)
755
+ if (title_table[i][0] == c)
756
+ return unichar_to_utf(title_table[i][2], buf);
757
+
758
+ return unichar_to_utf(tv != '\0' ? tv : c, buf);
759
+ }
760
+
761
+ /* {{{1
762
+ * The real implementation of downcase.
763
+ */
764
+ static size_t
765
+ tolower_turkic_i(const char **p, char *buf)
766
+ {
767
+ unichar i = LATIN_SMALL_LETTER_DOTLESS_I;
768
+
769
+ if (utf_char(*p) == COMBINING_DOT_ABOVE) {
770
+ /* TODO: don’t we need to make sure we don’t go beyond the end
771
+ * of ‘p’? */
772
+ *p = utf_next(*p);
773
+ i = LATIN_SMALL_LETTER_I;
774
+ }
775
+
776
+ return unichar_to_utf(i, buf);
777
+ }
778
+
779
+ static size_t
780
+ tolower_lithuianian_i(char *buf, unichar base, unichar combiner)
781
+ {
782
+ size_t len = unichar_to_utf(base, buf);
783
+ len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
784
+ if (combiner != '\0')
785
+ len += unichar_to_utf(combiner, OFFSET_IF(buf, len));
786
+
787
+ return len;
788
+ }
789
+
790
+ static size_t
791
+ tolower_sigma(const char **p, char *buf, const char *end, bool use_end)
792
+ {
793
+ unichar sigma = GREEK_SMALL_LETTER_FINAL_SIGMA;
794
+
795
+ /* SIGMA maps differently depending on whether it is final or not. The
796
+ * following simplified test would fail in the case of combining marks
797
+ * following the sigma, but I don't think that occurs in real text.
798
+ * The test here matches that in ICU. */
799
+ if ((!use_end || *p < end) && **p != '\0' && s_isalpha(s_type(utf_char(*p))))
800
+ sigma = GREEK_SMALL_LETTER_SIGMA;
801
+
802
+ return unichar_to_utf(sigma, buf);
803
+ }
804
+
805
+ static size_t
806
+ real_tolower_one(const char **p, const char *prev, char *buf,
807
+ LocaleType locale_type, const char *end, bool use_end)
808
+ {
809
+ unichar c = utf_char(prev);
810
+ int type = s_type(c);
811
+
812
+ if (locale_type == LOCALE_TURKIC && c == 'I')
813
+ return tolower_turkic_i(p, buf);
814
+
815
+ /* Introduce an explicit dot above the lowercasing capital I’s
816
+ * and J’s whenever there are more accents above.
817
+ * [SpecialCasing.txt] */
818
+ if (locale_type == LOCALE_LITHUANIAN) {
819
+ unichar base = LATIN_SMALL_LETTER_I;
820
+ unichar combiner = '\0';
821
+
822
+ switch (c) {
823
+ case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
824
+ combiner = COMBINING_GRAVE_ACCENT;
825
+ break;
826
+ case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
827
+ combiner = COMBINING_ACUTE_ACCENT;
828
+ break;
829
+ case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
830
+ combiner = COMBINING_TILDE;
831
+ break;
832
+ case 'I':
833
+ case 'J':
834
+ case LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
835
+ if (!has_more_above(*p))
836
+ goto no_lithuanian_i_casing;
837
+
838
+ base = unichar_tolower(c);
839
+ break;
840
+ default:
841
+ goto no_lithuanian_i_casing;
842
+ }
843
+
844
+ return tolower_lithuianian_i(buf, base, combiner);
845
+ }
846
+
847
+ no_lithuanian_i_casing:
848
+
849
+ if (c == GREEK_CAPITAL_LETTER_SIGMA)
850
+ return tolower_sigma(p, buf, end, use_end);
851
+
852
+ if (IS(type, OR(UNICODE_UPPERCASE_LETTER,
853
+ OR(UNICODE_TITLECASE_LETTER, 0))))
854
+ return real_do_tolower(c, type, buf);
855
+
856
+ size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
857
+
858
+ if (buf != NULL)
859
+ memcpy(buf, prev, len);
860
+
861
+ return len;
862
+ }
863
+
864
+ static size_t
865
+ real_tolower(const char *str, size_t max, bool use_max, char *buf,
866
+ LocaleType locale_type)
867
+ {
868
+ const char *p = str;
869
+ const char *end = str + max;
870
+ size_t len = 0;
871
+
872
+ while ((!use_max || p < end) && *p != '\0') {
873
+ const char *prev = p;
874
+ p = utf_next(p);
875
+
876
+ len += real_tolower_one(&p, prev, OFFSET_IF(buf, len),
877
+ locale_type, end, use_max);
878
+ }
879
+
880
+ return len;
881
+ }
882
+
883
+
884
+ /* {{{1 */
885
+ static char *
886
+ utf_downcase_impl(const char *str, size_t max, bool use_max)
887
+ {
888
+ assert(str != NULL);
889
+
890
+ LocaleType locale_type = get_locale_type();
891
+
892
+ size_t len = real_tolower(str, max, use_max, NULL, locale_type);
893
+ char *result = ALLOC_N(char, len + 1);
894
+ real_tolower(str, max, use_max, result, locale_type);
895
+ result[len] = '\0';
896
+
897
+ return result;
898
+ }
899
+
900
+
901
/* {{{1
 * Lowercase every character of the NUL-terminated string ‘str’ for which a
 * lowercase representation applies.  The result is freshly allocated and
 * returned.
 */
char *
utf_downcase(const char *str)
{
        /* No byte limit: the length argument is ignored. */
        char *lowered = utf_downcase_impl(str, 0, false);

        return lowered;
}
910
+
911
+
912
/* {{{1
 * Lowercase every character of ‘str’ for which a lowercase representation
 * applies, looking at no more than ‘len’ bytes of ‘str’.  The result is
 * freshly allocated and returned.
 */
char *
utf_downcase_n(const char *str, size_t len)
{
        char *lowered = utf_downcase_impl(str, len, true);

        return lowered;
}
922
+
923
+
924
+ /* {{{1
925
+ * The real implementation of case folding below.
926
+ */
927
+
928
+ static bool
929
+ casefold_table_lookup(unichar c, char *folded, size_t *len)
930
+ {
931
+ int index;
932
+
933
+ if (!unicode_table_lookup(casefold_table, c, &index))
934
+ return false;
935
+
936
+ char const *folded_c = casefold_table[index].data;
937
+
938
+ if (folded != NULL)
939
+ strcpy(folded, folded_c);
940
+
941
+ *len += utf_byte_length(folded_c);
942
+
943
+ return true;
944
+ }
945
+
946
+ static char *
947
+ utf_foldcase_impl(const char *str, size_t max, bool use_max)
948
+ {
949
+ assert(str != NULL);
950
+
951
+ char *folded = NULL;
952
+ size_t len = 0;
953
+
954
+ again:
955
+ for (const char *p = str; (!use_max || p < str + max) && *p != '\0'; p = utf_next(p)) {
956
+ unichar c = utf_char(p);
957
+
958
+ if (casefold_table_lookup(c, OFFSET_IF(folded, len), &len))
959
+ continue;
960
+
961
+ len += unichar_to_utf(unichar_tolower(c), OFFSET_IF(folded, len));
962
+ }
963
+
964
+ if (folded == NULL) {
965
+ folded = ALLOC_N(char, len + 1);
966
+ folded[0] = NUL;
967
+ len = 0;
968
+ goto again;
969
+ }
970
+
971
+ folded[len] = '\0';
972
+
973
+ return folded;
974
+ }
975
+
976
+
977
/* {{{1
 * Produce a case-independent form of the NUL-terminated string ‘str’.  The
 * result is freshly allocated and returned.
 */
char *
utf_foldcase(const char *str)
{
        /* No byte limit: the length argument is ignored. */
        char *folded = utf_foldcase_impl(str, 0, false);

        return folded;
}
986
+
987
+
988
/* {{{1
 * Produce a case-independent form of ‘str’, looking at no more than ‘len’
 * bytes of it.  The result is freshly allocated and returned.
 */
char *
utf_foldcase_n(const char *str, size_t len)
{
        char *folded = utf_foldcase_impl(str, len, true);

        return folded;
}
998
+
999
+
1000
+ /* {{{1
1001
+ * The real implementation of utf_width() and utf_width_n() below.
1002
+ */
1003
+ static size_t
1004
+ utf_width_impl(const char *str, size_t len, bool use_len)
1005
+ {
1006
+ assert(str != NULL);
1007
+
1008
+ size_t width = 0;
1009
+
1010
+ for (const char *p = str; (!use_len || p < str + len) && *p != NUL; p = utf_next(p))
1011
+ width += unichar_iswide(utf_char(p)) ? 2 : 1;
1012
+
1013
+ return width;
1014
+ }
1015
+
1016
+
1017
/* {{{1
 * Return the width, in cells, of the NUL-terminated string ‘str’.
 */
size_t
utf_width(const char *str)
{
        /* No byte limit: the length argument is ignored. */
        size_t cells = utf_width_impl(str, 0, false);

        return cells;
}
1025
+
1026
+
1027
/* {{{1
 * Return the width, in cells, of ‘str’, looking at no more than ‘len’ bytes
 * of it.
 */
size_t
utf_width_n(const char *str, size_t len)
{
        size_t cells = utf_width_impl(str, len, true);

        return cells;
}
1035
+
1036
+
1037
+ /* {{{1
1038
+ * Retrieve the mirrored representation of ‘c’ (if any) and store it in
1039
+ * ‘mirrored’.
1040
+ */
1041
+ bool
1042
+ unichar_mirror(unichar c, unichar *mirrored)
1043
+ {
1044
+ int index;
1045
+
1046
+ if (!unicode_table_lookup(bidi_mirroring_table, c, &index))
1047
+ return false;
1048
+
1049
+ if (mirrored != NULL)
1050
+ *mirrored = bidi_mirroring_table[index].mirrored_ch;
1051
+
1052
+ return true;
1053
+ }
1054
+
1055
+
1056
+ /* }}}1 */