u 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/README +38 -0
  2. data/Rakefile +64 -0
  3. data/ext/encoding/character/utf-8/break.c +25 -0
  4. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  5. data/ext/encoding/character/utf-8/data/character-tables.h +14358 -0
  6. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  7. data/ext/encoding/character/utf-8/data/decompose.h +10926 -0
  8. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1070 -0
  9. data/ext/encoding/character/utf-8/decompose.c +444 -0
  10. data/ext/encoding/character/utf-8/depend +65 -0
  11. data/ext/encoding/character/utf-8/extconf.rb +67 -0
  12. data/ext/encoding/character/utf-8/private.c +62 -0
  13. data/ext/encoding/character/utf-8/private.h +51 -0
  14. data/ext/encoding/character/utf-8/properties.c +1056 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +19 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_private.h +52 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  19. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  20. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  22. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  23. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  24. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  25. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  26. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  27. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  28. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  29. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  30. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  31. data/ext/encoding/character/utf-8/rb_utf_insert.c +48 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +332 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  35. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  36. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  37. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  38. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  39. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  40. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  41. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  43. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  44. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  45. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  46. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  47. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  48. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  49. data/ext/encoding/character/utf-8/tables.h +38 -0
  50. data/ext/encoding/character/utf-8/unicode.c +319 -0
  51. data/ext/encoding/character/utf-8/unicode.h +216 -0
  52. data/ext/encoding/character/utf-8/utf.c +1334 -0
  53. data/lib/encoding/character/utf-8.rb +201 -0
  54. data/lib/u.rb +16 -0
  55. data/lib/u/string.rb +185 -0
  56. data/lib/u/version.rb +5 -0
  57. data/test/unit/u.rb +5 -0
  58. data/test/unit/u/string.rb +91 -0
  59. metadata +174 -0
@@ -0,0 +1,67 @@
1
+ require 'mkmf'
2
+
3
# Probe whether the compiler accepts +opt+ by compiling an empty program with
# it; when the probe succeeds, append the option to $CFLAGS.  An optional
# block is forwarded to try_compile.
def try_compiler_option(opt, &block)
  checking_for "#{opt} option to compiler" do
    if try_compile('', opt, &block)
      $CFLAGS += " #{opt}"
    end
  end
end
8
+
9
+ try_compiler_option '-std=c99'
10
+ try_compiler_option '-finline-functions'
11
+ try_compiler_option '-fno-common'
12
+ try_compiler_option '-Wall'
13
+ try_compiler_option '-Waggregate-return'
14
+ try_compiler_option '-Wcast-align'
15
+ try_compiler_option '-Wextra'
16
+ try_compiler_option '-Wformat=2'
17
+ try_compiler_option '-Winit-self'
18
+ try_compiler_option '-Winline'
19
+ try_compiler_option '-Wmissing-declarations'
20
+ try_compiler_option '-Wmissing-format-attribute'
21
+ try_compiler_option '-Wmissing-include-dirs'
22
+ try_compiler_option '-Wmissing-noreturn'
23
+ try_compiler_option '-Wmissing-prototypes'
24
+ try_compiler_option '-Wnested-externs'
25
+ try_compiler_option '-Wold-style-definition'
26
+ try_compiler_option '-Wpacked'
27
+ try_compiler_option '-Wp,-D_FORTIFY_SOURCE=2'
28
+ try_compiler_option '-Wpointer-arith'
29
+ try_compiler_option '-Wsign-compare'
30
+ try_compiler_option '-Wstrict-aliasing=2'
31
+ try_compiler_option '-Wswitch-default'
32
+ try_compiler_option '-Wswitch-enum'
33
+ try_compiler_option '-Wundef'
34
+ try_compiler_option '-Wunsafe-loop-optimizations'
35
+ try_compiler_option '-Wwrite-strings'
36
+
37
+ checking_for 'GNUC visibility attribute' do
38
+ $defs.push('-DHAVE_GNUC_VISIBILITY') if try_compile <<EOC, '-Werror'
39
+ void f_hidden(void);
40
+ void __attribute__((visibility("hidden")))
41
+ f_hidden(void)
42
+ {
43
+ }
44
+ int main(void)
45
+ {
46
+ f_hidden();
47
+ return 0;
48
+ }
49
+ EOC
50
+ end
51
+
52
+ have_header 'assert.h'
53
+ have_header 'limits.h'
54
+ have_header 'locale.h'
55
+ have_header 'stdbool.h'
56
+ have_header 'stddef.h'
57
+ have_header 'stdint.h'
58
+ have_header 'stdio.h'
59
+ have_header 'stdlib.h'
60
+ have_header 'string.h'
61
+ have_header 'sys/types.h'
62
+ have_header 'wchar.h'
63
+
64
+ $INSTALLFILES ||= []
65
+ $INSTALLFILES << ['unicode.h', '$(RUBYARCHDIR)', 'lib']
66
+
67
+ create_makefile 'encoding/character/utf-8/utf8'
@@ -0,0 +1,62 @@
1
+ /*
2
+ * contents: Private functions used by the UTF-8 character-encoding library.
3
+ *
4
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include <ruby.h>
8
+ #include <stdbool.h>
9
+ #include <stddef.h>
10
+ #include <stdint.h>
11
+ #include <stdlib.h>
12
+
13
+ #include "unicode.h"
14
+
15
+ #include "private.h"
16
+
17
+ /* Lookup C in the sorted TABLE using binary search. TABLE consists of N
18
+ * entries, where each entry is SIZEOF_ENTRY bytes in size and the first
19
+ * component is a unichar of size SIZEOF_CHAR. If C is found in TABLE, its
20
+ * index is stored in INDEX and true is returned. Otherwise, false is returned
21
+ * and INDEX is left untouched. */
22
+ bool
23
+ binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index)
24
+ {
25
+ #define ENTRY(index) ((unichar)(*(unichar *)((const char *)table + ((index) * sizeof_entry))) & char_mask)
26
+
27
+ int begin = 0;
28
+ int end = n - 1;
29
+ int middle;
30
+
31
+ /* This is ugly, but not all tables use unichars as their lookup
32
+ * character. The casefold table, for example, uses uint16_t-sized
33
+ * characters. To only get the interesting part of our table entry
34
+ * we’ll have to mask the retrieved value. */
35
+ int char_mask = (1 << (8 * sizeof_char)) - 1;
36
+
37
+ /* Drop out early if we know for certain that C can’t be in the
38
+ * decomposition table. */
39
+ if (c < ENTRY(0) || c > ENTRY(end))
40
+ return false;
41
+
42
+ while (begin <= end) {
43
+ middle = binary_search_middle_of(begin, end);
44
+
45
+ unichar probe = ENTRY(middle);
46
+ if (c < probe)
47
+ end = middle - 1;
48
+ else if (c > probe)
49
+ begin = middle + 1;
50
+ else
51
+ break;
52
+ }
53
+
54
+ if (begin > end)
55
+ return false;
56
+
57
+ *index = middle;
58
+
59
+ return true;
60
+
61
+ #undef ENTRY
62
+ }
@@ -0,0 +1,51 @@
1
+ /*
2
+ * contents: Private Unicode related information.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+ #ifndef PRIVATE_H
8
+ #define PRIVATE_H
9
+
10
+ #define NUL '\0'
11
+ #define lengthof(ary) (sizeof(ary) / sizeof((ary)[0]))
12
+
13
+ #if defined(HAVE_GNUC_VISIBILITY)
14
+ # define HIDDEN \
15
+ __attribute__((visibility("hidden")))
16
+ #else
17
+ # define HIDDEN
18
+ #endif
19
+
20
+ #if defined(__GNUC__)
21
+ # define UNUSED(u) \
22
+ u __attribute__((__unused__))
23
+ #else
24
+ # define UNUSED(u) \
25
+ u
26
+ #endif
27
+
28
+ #define binary_search_middle_of(begin, end) \
29
+ (((unsigned)((begin) + (end))) >> 1)
30
+
31
+ #define unicode_table_lookup(table, c, index) \
32
+ binary_search_unicode_table(table, lengthof(table), sizeof((table)[0]), sizeof((table)[0].ch), c, index)
33
+
34
+ bool binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index) HIDDEN;
35
+
36
+ #define SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part, page, c) \
37
+ ((part[page] >= UNICODE_MAX_TABLE_INDEX) \
38
+ ? (part[page] - UNICODE_MAX_TABLE_INDEX) \
39
+ : (data[part[page]][(c) & 0xff]))
40
+
41
+ #define SPLIT_UNICODE_TABLE_LOOKUP(data, part1, part2, c, fallback) \
42
+ (((c) <= UNICODE_LAST_CHAR_PART1) \
43
+ ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part1, (c) >> 8, c) \
44
+ : (((c) >= UNICODE_FIRST_CHAR_PART2 && (c) <= UNICODE_LAST_CHAR) \
45
+ ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part2, ((c) - UNICODE_FIRST_CHAR_PART2) >> 8, c) \
46
+ : (fallback)))
47
+
48
+ unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
49
+ NormalizeMode mode) HIDDEN;
50
+
51
+ #endif /* PRIVATE_H */
@@ -0,0 +1,1056 @@
1
+ /*
2
+ * contents: Unicode character properties.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+ #include <ruby.h>
8
+ #include <assert.h>
9
+ #include <locale.h>
10
+ #include <stdbool.h>
11
+ #include <stddef.h>
12
+ #include <stdint.h>
13
+ #include <string.h>
14
+ #include "unicode.h"
15
+ #include "private.h"
16
+ #include "data/character-tables.h"
17
+
18
+
19
+ #define COMBINING_DOT_ABOVE ((unichar)0x0307)
20
+ #define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE ((unichar)0x0130)
21
+ #define COMBINING_GREEK_YPOGEGRAMMENI ((unichar)0x0345)
22
+ #define GREEK_CAPITAL_LETTER_IOTA ((unichar)0x0399)
23
+ #define LATIN_SMALL_LETTER_I ((unichar)0x0069)
24
+ #define LATIN_SMALL_LETTER_DOTLESS_I ((unichar)0x0131)
25
+ #define LATIN_CAPITAL_LETTER_I_WITH_GRAVE ((unichar)0x00cc)
26
+ #define LATIN_CAPITAL_LETTER_I_WITH_ACUTE ((unichar)0x00cd)
27
+ #define LATIN_CAPITAL_LETTER_I_WITH_TILDE ((unichar)0x0128)
28
+ #define LATIN_CAPITAL_LETTER_I_WITH_OGONEK ((unichar)0x012e)
29
+ #define COMBINING_GRAVE_ACCENT ((unichar)0x0300)
30
+ #define COMBINING_ACUTE_ACCENT ((unichar)0x0301)
31
+ #define COMBINING_TILDE ((unichar)0x0303)
32
+ #define GREEK_CAPITAL_LETTER_SIGMA ((unichar)0x03a3)
33
+ #define GREEK_SMALL_LETTER_SIGMA ((unichar)0x03c3)
34
+ #define GREEK_SMALL_LETTER_FINAL_SIGMA ((unichar)0x03c2)
35
+
36
+ #define OFFSET_IF(buf, len) (((buf) != NULL) ? (buf) + (len) : NULL)
37
+
38
+ /* {{{1
39
+ * Macros for accessing the Unicode character attribute table.
40
+ *
41
+ * TODO: Turn these macros into full-fledged functions, as this is rather silly
42
+ * when we have ‹inline› in C99.
43
+ */
44
+ #define ATTR_TABLE(page) \
45
+ (((page) <= UNICODE_LAST_PAGE_PART1) \
46
+ ? attr_table_part1[page] \
47
+ : attr_table_part2[(page) - 0xe00])
48
+
49
+ #define ATTTABLE(page, char) \
50
+ ((ATTR_TABLE(page) == UNICODE_MAX_TABLE_INDEX) \
51
+ ? 0 : (attr_data[ATTR_TABLE(page)][char]))
52
+
53
+
54
+ /* {{{1
55
+ * Internal function used for figuring out the type of a given character.
56
+ */
57
+ static inline int
58
+ s_type(unichar c)
59
+ {
60
+ const int16_t *table;
61
+ unsigned int page;
62
+
63
+ if (c <= UNICODE_LAST_CHAR_PART1) {
64
+ page = c >> 8;
65
+ table = type_table_part1;
66
+ } else if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR) {
67
+ page = (c - UNICODE_FIRST_CHAR_PART2) >> 8;
68
+ table = type_table_part2;
69
+ } else {
70
+ return UNICODE_UNASSIGNED;
71
+ }
72
+
73
+ if (table[page] >= UNICODE_MAX_TABLE_INDEX)
74
+ return table[page] - UNICODE_MAX_TABLE_INDEX;
75
+ else
76
+ return type_data[table[page]][c & 0xff];
77
+ }
78
+
79
+
80
+ /* {{{1
81
+ * Bit-fiddling macros for testing the class of a type.
82
+ */
83
+ #define IS(type, class) (((unsigned int)1 << (type)) & (class))
84
+ #define OR(type, rest) (((unsigned int)1 << (type)) | (rest))
85
+
86
+
87
+ /* {{{1
88
+ * Internal function used to check if the given type represents a digit type.
89
+ */
90
+ static inline bool
91
+ s_isdigit(int type)
92
+ {
93
+ return IS(type,
94
+ OR(UNICODE_DECIMAL_NUMBER,
95
+ OR(UNICODE_LETTER_NUMBER,
96
+ OR(UNICODE_OTHER_NUMBER, 0))));
97
+ }
98
+
99
+
100
+ /* {{{1
101
+ * Internal function used to check if the given type represents an alphabetic
102
+ * type.
103
+ */
104
+ static inline bool
105
+ s_isalpha(int type)
106
+ {
107
+ return IS(type,
108
+ OR(UNICODE_LOWERCASE_LETTER,
109
+ OR(UNICODE_UPPERCASE_LETTER,
110
+ OR(UNICODE_TITLECASE_LETTER,
111
+ OR(UNICODE_MODIFIER_LETTER,
112
+ OR(UNICODE_OTHER_LETTER, 0))))));
113
+ }
114
+
115
+
116
+ /* {{{1
117
+ * Internal function used to check if the given type represents a mark type.
118
+ */
119
+ static inline bool
120
+ s_ismark(int type)
121
+ {
122
+ return IS(type,
123
+ OR(UNICODE_NON_SPACING_MARK,
124
+ OR(UNICODE_COMBINING_MARK,
125
+ OR(UNICODE_ENCLOSING_MARK, 0))));
126
+ }
127
+
128
+
129
+ /* {{{1
130
+ * Determine whether ‘c’ is an alphanumeric, such as A, B, C, 0, 1, or 2.
131
+ */
132
+ bool
133
+ unichar_isalnum(unichar c)
134
+ {
135
+ int type = s_type(c);
136
+
137
+ return s_isdigit(type) || s_isalpha(type);
138
+ }
139
+
140
+
141
+ /* {{{1
142
+ * Determine whether ‘c’ is an alphabetic (i.e. a letter), such as A, B, or C.
143
+ */
144
+ bool
145
+ unichar_isalpha(unichar c)
146
+ {
147
+ return s_isalpha(s_type(c));
148
+ }
149
+
150
+
151
+ /* {{{1
152
+ * Determine whether ‘c’ is a control character, such as ‹NUL›.
153
+ */
154
+ bool
155
+ unichar_iscntrl(unichar c)
156
+ {
157
+ return s_type(c) == UNICODE_CONTROL;
158
+ }
159
+
160
+
161
+ /* {{{1
162
+ * Determine whether ‘c’ is a digit, such as 0, 1, or 2.
163
+ */
164
+ bool
165
+ unichar_isdigit(unichar c)
166
+ {
167
+ return s_type(c) == UNICODE_DECIMAL_NUMBER;
168
+ }
169
+
170
+
171
+ /* {{{1
172
+ * Determine whether ‘c’ is printable and not a space or control character such
173
+ * as tab or <NUL›, such as A, B, or C.
174
+ */
175
+ bool
176
+ unichar_isgraph(unichar c)
177
+ {
178
+ return !IS(s_type(c),
179
+ OR(UNICODE_CONTROL,
180
+ OR(UNICODE_FORMAT,
181
+ OR(UNICODE_UNASSIGNED,
182
+ OR(UNICODE_PRIVATE_USE,
183
+ OR(UNICODE_SURROGATE,
184
+ OR(UNICODE_SPACE_SEPARATOR, 0)))))));
185
+ }
186
+
187
+
188
+ /* {{{1
189
+ * Determine whether ‘c’ is a lowercase letter, such as a, b, or c.
190
+ */
191
+ bool
192
+ unichar_islower(unichar c)
193
+ {
194
+ return s_type(c) == UNICODE_LOWERCASE_LETTER;
195
+ }
196
+
197
+
198
+ /* {{{1
199
+ * Determine whether ‘c’ is printable, which works the same as
200
+ * unichar_isgraph(), except that space characters are also printable.
201
+ */
202
+ bool
203
+ unichar_isprint(unichar c)
204
+ {
205
+ return !IS(s_type(c),
206
+ OR(UNICODE_CONTROL,
207
+ OR(UNICODE_FORMAT,
208
+ OR(UNICODE_UNASSIGNED,
209
+ OR(UNICODE_PRIVATE_USE,
210
+ OR(UNICODE_SURROGATE, 0))))));
211
+ }
212
+
213
+
214
+ /* {{{1
215
+ * Determine whether ‘c’ is some form of punctuation or other symbol.
216
+ */
217
+ bool
218
+ unichar_ispunct(unichar c)
219
+ {
220
+ return IS(s_type(c),
221
+ OR(UNICODE_CONNECT_PUNCTUATION,
222
+ OR(UNICODE_DASH_PUNCTUATION,
223
+ OR(UNICODE_OPEN_PUNCTUATION,
224
+ OR(UNICODE_CLOSE_PUNCTUATION,
225
+ OR(UNICODE_INITIAL_PUNCTUATION,
226
+ OR(UNICODE_FINAL_PUNCTUATION,
227
+ OR(UNICODE_OTHER_PUNCTUATION,
228
+ OR(UNICODE_MODIFIER_SYMBOL,
229
+ OR(UNICODE_MATH_SYMBOL,
230
+ OR(UNICODE_CURRENCY_SYMBOL,
231
+ OR(UNICODE_OTHER_SYMBOL, 0)))))))))))) ? true : false;
232
+ }
233
+
234
+
235
+ /* {{{1
236
+ * Determine whether ‘c’ is some form of whitespace, such as space, tab or a
237
+ * line separator (newline, carriage return, etc.).
238
+ */
239
+ bool
240
+ unichar_isspace(unichar c)
241
+ {
242
+ switch (c) {
243
+ case '\t':
244
+ case '\n':
245
+ case '\r':
246
+ case '\f':
247
+ return true;
248
+ default:
249
+ return IS(s_type(c),
250
+ OR(UNICODE_SPACE_SEPARATOR,
251
+ OR(UNICODE_LINE_SEPARATOR,
252
+ OR(UNICODE_PARAGRAPH_SEPARATOR, 0)))) ? true : false;
253
+ }
254
+ }
255
+
256
+
257
+ /* {{{1
258
+ * Determine whether ‘c’ is an uppeercase letter, such as A, B, or C
259
+ */
260
+ bool
261
+ unichar_isupper(unichar c)
262
+ {
263
+ return s_type(c) == UNICODE_UPPERCASE_LETTER;
264
+ }
265
+
266
+
267
+ /* {{{1
268
+ * Determine whether ‘c’ is a titlecase letter, such as the slavic digraph DZ,
269
+ * which at the beginning of a word is written as Dz, where only the initial D
270
+ * is capitalized. (Complicated huh?)
271
+ */
272
+ bool
273
+ unichar_istitle(unichar c)
274
+ {
275
+ /* TODO: binary search helpful? */
276
+ for (size_t i = 0; i < lengthof(title_table); i++)
277
+ if (title_table[i][0] == c)
278
+ return true;
279
+
280
+ return false;
281
+ }
282
+
283
+
284
+ /* {{{1
285
+ * Determine whether ‘c’ is a new-line.
286
+ */
287
+ #define UNICHAR_NEXT_LINE ((unichar)0x0085)
288
+ #define UNICHAR_LINE_SEPARATOR ((unichar)0x2028)
289
+ #define UNICHAR_PARAGRAPH_SEPARATOR ((unichar)0x2029)
290
+
291
+ bool
292
+ unichar_isnewline(unichar c)
293
+ {
294
+ switch (c) {
295
+ case '\n': case '\f': case '\r': case UNICHAR_NEXT_LINE:
296
+ case UNICHAR_LINE_SEPARATOR: case UNICHAR_PARAGRAPH_SEPARATOR:
297
+ return true;
298
+ default:
299
+ return false;
300
+ }
301
+ }
302
+
303
+ /* {{{1
304
+ * Determine whether ‘c’ is a hexadecimal digit, such as 0, 1, ..., 9, a, b,
305
+ * ..., f, or A, B, ..., F.
306
+ */
307
+ #define UNICHAR_FULLWIDTH_A 0xff21
308
+ #define UNICHAR_FULLWIDTH_F 0xff26
309
+ #define UNICHAR_FULLWIDTH_a 0xff41
310
+ #define UNICHAR_FULLWIDTH_f 0xff46
311
+ bool
312
+ unichar_isxdigit(unichar c)
313
+ {
314
+ return ((c >= 'a' && c <= 'f') ||
315
+ (c >= 'A' && c <= 'F') ||
316
+ (c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f) ||
317
+ (c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F) ||
318
+ (s_type(c) == UNICODE_DECIMAL_NUMBER));
319
+ // s_isdigit(s_type(c)));
320
+ }
321
+
322
+
323
+ /* {{{1
324
+ * Determine whether code point ‘c’ has been assigned a code value.
325
+ */
326
+ bool
327
+ unichar_isassigned(unichar c)
328
+ {
329
+ return s_type(c) != UNICODE_UNASSIGNED;
330
+ }
331
+
332
+
333
+ /* {{{1
334
+ * Determine whether ‘c’ is a wide character, thus is typically rendered in a
335
+ * double-width cell on a terminal.
336
+ */
337
+ bool
338
+ unichar_iswide(unichar c)
339
+ {
340
+ if (c < 0x1100)
341
+ return false;
342
+
343
+ return (c <= 0x115f || /* Hangul Jamo init. consonants */
344
+ c == 0x2329 || c == 0x232a || /* angle brackets */
345
+ (c >= 0x2e80 && c <= 0xa4cf && /* CJK ... Yi */
346
+ (c < 0x302a || c > 0x302f) &&
347
+ c != 0x303f && c != 0x3099 && c != 0x309a) ||
348
+ (c >= 0xac00 && c <= 0xd7a3) || /* Hangul syllables */
349
+ (c >= 0xf900 && c <= 0xfaff) || /* CJK comp. graphs */
350
+ (c >= 0xfe30 && c <= 0xfe6f) || /* CJK comp. forms */
351
+ (c >= 0xff00 && c <= 0xff60) || /* fullwidth forms */
352
+ (c >= 0xffe0 && c <= 0xffe6) || /* -"- */
353
+ (c >= 0x20000 && c <= 0x2fffd) || /* CJK extra stuff */
354
+ (c >= 0x30000 && c <= 0x3fffd)); /* -"- */
355
+ }
356
+
357
+
358
+ /* {{{1
359
+ * Convert ‘c’ to its uppercase representation (if any).
360
+ */
361
+ static unichar
362
+ special_case_table_lookup(unichar c)
363
+ {
364
+ unichar tv = ATTTABLE(c >> 8, c & 0xff);
365
+
366
+ if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
367
+ tv = utf_char(special_case_table +
368
+ tv - UNICODE_SPECIAL_CASE_TABLE_START);
369
+
370
+ if (tv == '\0')
371
+ return c;
372
+
373
+ return tv;
374
+ }
375
+
376
+ static unichar
377
+ titlecase_table_lookup(unichar c, bool want_upper)
378
+ {
379
+ for (size_t i = 0; i < lengthof(title_table); i++)
380
+ if (title_table[i][0] == c)
381
+ return title_table[i][want_upper ? 1 : 2];
382
+
383
+ return c;
384
+ }
385
+
386
+ unichar
387
+ unichar_toupper(unichar c)
388
+ {
389
+ int type = s_type(c);
390
+
391
+ if (type == UNICODE_LOWERCASE_LETTER)
392
+ return special_case_table_lookup(c);
393
+
394
+ if (type == UNICODE_TITLECASE_LETTER)
395
+ return titlecase_table_lookup(c, true);
396
+
397
+ return c;
398
+ }
399
+
400
+
401
+ /* {{{1
402
+ * Convert ‘c’ to its lowercase representation (if any).
403
+ */
404
+ unichar
405
+ unichar_tolower(unichar c)
406
+ {
407
+ int type = s_type(c);
408
+
409
+ if (type == UNICODE_UPPERCASE_LETTER)
410
+ return special_case_table_lookup(c);
411
+
412
+ if (type == UNICODE_TITLECASE_LETTER)
413
+ return titlecase_table_lookup(c, false);
414
+
415
+ return c;
416
+ }
417
+
418
+
419
+ /* {{{1
420
+ * Convert ‘c’ to its titlecase representation (if any).
421
+ */
422
+ unichar
423
+ unichar_totitle(unichar c)
424
+ {
425
+ for (size_t i = 0; i < lengthof(title_table); i++)
426
+ if (title_table[i][0] == c ||
427
+ title_table[i][1] == c ||
428
+ title_table[i][2] == c)
429
+ return title_table[i][0];
430
+
431
+ if (s_type(c) == UNICODE_LOWERCASE_LETTER)
432
+ return unichar_toupper(c);
433
+
434
+ return c;
435
+ }
436
+
437
+
438
+ /* {{{1
439
+ * Return the numeric value of ‘c’ if it's a decimal digit, or -1 if not.
440
+ */
441
+ int
442
+ unichar_digit_value(unichar c)
443
+ {
444
+ if (s_type(c) == UNICODE_DECIMAL_NUMBER)
445
+ return ATTTABLE(c >> 8, c & 0xff);
446
+
447
+ return -1;
448
+ }
449
+
450
+
451
+ /* {{{1
452
+ * Return the numeric value of ‘c’ if it's a hexadecimal digit, or -1 if not.
453
+ */
454
+ int
455
+ unichar_xdigit_value(unichar c)
456
+ {
457
+ if (c >= 'a' && c <= 'f')
458
+ return c - 'a' + 10;
459
+ else if (c >= 'A' && c <= 'F')
460
+ return c - 'A' + 10;
461
+ else if (c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f)
462
+ return c - UNICHAR_FULLWIDTH_a + 10;
463
+ else if (c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F)
464
+ return c - UNICHAR_FULLWIDTH_A + 10;
465
+ else
466
+ return unichar_digit_value(c);
467
+ }
468
+
469
+
470
+ /* {{{1
471
+ * Determine the Unicode character type of ‘c’.
472
+ */
473
+ UnicodeType
474
+ unichar_type(unichar c)
475
+ {
476
+ return s_type(c);
477
+ }
478
+
479
+
480
/* {{{1
 * LocaleType: This ‹enum› is used for dealing with different locales for
 * turning strings into uppercase or lowercase.
 */
typedef enum {
        LOCALE_NORMAL,
        LOCALE_TURKIC,
        LOCALE_LITHUANIAN
} LocaleType;


/* {{{1
 * Retrieve the locale type from the program’s current LC_CTYPE locale.
 *
 * Locale names beginning with “az” or “tr” are treated as Turkic and names
 * beginning with “lt” as Lithuanian; everything else is LOCALE_NORMAL.  If
 * the current locale name cannot be queried, LOCALE_NORMAL is returned as
 * well.
 */
static LocaleType
get_locale_type(void)
{
        const char *locale = setlocale(LC_CTYPE, NULL);

        /* Don’t dereference a missing locale name. */
        if (locale == NULL)
                return LOCALE_NORMAL;

        if ((locale[0] == 'a' && locale[1] == 'z') ||
            (locale[0] == 't' && locale[1] == 'r'))
                return LOCALE_TURKIC;

        if (locale[0] == 'l' && locale[1] == 't')
                return LOCALE_LITHUANIAN;

        return LOCALE_NORMAL;
}
508
+
509
+
510
+ /* {{{1
511
+ * Put character marks found in ‘p_inout’ into itself. If ‘remove_dot’ is
512
+ * true, remove the dot over an uppercase I for a turkish locale.
513
+ */
514
+ static size_t
515
+ output_marks(const char **p_inout, char *buf, bool remove_dot)
516
+ {
517
+ size_t len = 0;
518
+ const char *p = *p_inout;
519
+
520
+ for ( ; *p != '\0'; p = utf_next(p)) {
521
+ unichar c = utf_char(p);
522
+
523
+ if (!s_ismark(s_type(c)))
524
+ break;
525
+
526
+ if (!remove_dot || c != COMBINING_DOT_ABOVE)
527
+ len += unichar_to_utf(c, (buf != NULL) ? buf + len : NULL);
528
+ }
529
+
530
+ *p_inout = p;
531
+
532
+ return len;
533
+ }
534
+
535
+ /* {{{1
536
+ * Output titlecases where appropriate.
537
+ */
538
+ static size_t
539
+ output_special_case(char *buf, int offset, int type, bool upper)
540
+ {
541
+ const char *p = special_case_table + offset;
542
+
543
+ if (type != UNICODE_TITLECASE_LETTER)
544
+ p = utf_next(p);
545
+
546
+ if (upper)
547
+ p += utf_byte_length(p) + 1;
548
+
549
+ size_t len = utf_byte_length(p);
550
+
551
+ if (buf != NULL)
552
+ memcpy(buf, p, len);
553
+
554
+ return len;
555
+ }
556
+
557
+ /* {{{1
558
+ * Do uppercasing of ‘p’ for Lithuanian locales.
559
+ */
560
+ static size_t
561
+ remove_all_combining_dot_above(unichar c, char *buf)
562
+ {
563
+ size_t decomp_len;
564
+ unichar *decomp = unicode_canonical_decomposition(c, &decomp_len);
565
+
566
+ size_t len = 0;
567
+ for (size_t i = 0; i < decomp_len; i++)
568
+ if (decomp[i] != COMBINING_DOT_ABOVE)
569
+ len += unichar_to_utf(unichar_toupper(decomp[i]),
570
+ OFFSET_IF(buf, len));
571
+
572
+ free(decomp);
573
+
574
+ return len;
575
+ }
576
+
577
+ static size_t
578
+ real_toupper_lithuanian(const char **p, unichar c, int type, char *buf,
579
+ bool *was_i)
580
+ {
581
+ if (c == 'i') {
582
+ *was_i = true;
583
+ return 0;
584
+ }
585
+
586
+ if (*was_i) {
587
+ size_t len = remove_all_combining_dot_above(c, buf);
588
+ return len + output_marks(p, OFFSET_IF(buf, len), true);
589
+ }
590
+
591
+ if (!s_ismark(type))
592
+ *was_i = false;
593
+
594
+ return 0;
595
+ }
596
+
597
+ /* {{{1
598
+ * Do real upcasing. */
599
+ static inline size_t
600
+ real_do_toupper(unichar c, int type, char *buf)
601
+ {
602
+ bool upper = (type != UNICODE_LOWERCASE_LETTER);
603
+ unichar tv = ATTTABLE(c >> 8, c & 0xff);
604
+
605
+ if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
606
+ return output_special_case(buf,
607
+ tv - UNICODE_SPECIAL_CASE_TABLE_START,
608
+ type, upper);
609
+
610
+ /* TODO: this should really use titlecase_table_lookup somehow. */
611
+ if (type == UNICODE_TITLECASE_LETTER)
612
+ for (size_t i = 0; i < lengthof(title_table); i++)
613
+ if (title_table[i][0] == c)
614
+ return unichar_to_utf(title_table[i][1], buf);
615
+
616
+ return unichar_to_utf(tv != '\0' ? tv : c, buf);
617
+ }
618
+
619
+ /* {{{1
620
+ * Do real uppercasing of ‘str’.
621
+ */
622
+ static size_t
623
+ real_toupper_one(const char **p, const char *prev, char *buf,
624
+ LocaleType locale_type, bool *was_i)
625
+ {
626
+ unichar c = utf_char(prev);
627
+ int type = s_type(c);
628
+
629
+ if (locale_type == LOCALE_LITHUANIAN) {
630
+ size_t len = real_toupper_lithuanian(p, c, type, buf, was_i);
631
+ if (len > 0)
632
+ return len;
633
+ }
634
+
635
+ if (locale_type == LOCALE_TURKIC && c == 'i')
636
+ return unichar_to_utf(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
637
+ buf);
638
+
639
+ if (c == COMBINING_GREEK_YPOGEGRAMMENI) {
640
+ /* Nasty, need to move it after other combining marks...this
641
+ * would go away if we normalized first. */
642
+ /* TODO: don’t we need to make sure we don’t go beyond the end
643
+ * of ‘p’? */
644
+ size_t len = output_marks(p, buf, false);
645
+ return len + unichar_to_utf(GREEK_CAPITAL_LETTER_IOTA,
646
+ OFFSET_IF(buf, len));
647
+ }
648
+
649
+ if (IS(type, OR(UNICODE_LOWERCASE_LETTER,
650
+ OR(UNICODE_TITLECASE_LETTER, 0))))
651
+ return real_do_toupper(c, type, buf);
652
+
653
+ size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
654
+
655
+ if (buf != NULL)
656
+ memcpy(buf, prev, len);
657
+
658
+ return len;
659
+ }
660
+
661
+ static size_t
662
+ real_toupper(const char *str, size_t max, bool use_max, char *buf,
663
+ LocaleType locale_type)
664
+ {
665
+ const char *p = str;
666
+ size_t len = 0;
667
+ bool p_was_i = false;
668
+
669
+ while ((!use_max || p < str + max) && *p != '\0') {
670
+ const char *prev = p;
671
+ p = utf_next(p);
672
+
673
+ len += real_toupper_one(&p, prev, OFFSET_IF(buf, len),
674
+ locale_type, &p_was_i);
675
+ }
676
+
677
+ return len;
678
+ }
679
+
680
+ /* {{{1
681
+ * Wrapper around real_toupper() for dealing with memory allocation and such.
682
+ */
683
+ static char *
684
+ utf_upcase_impl(const char *str, size_t max, bool use_max)
685
+ {
686
+ assert(str != NULL);
687
+
688
+ LocaleType locale_type = get_locale_type();
689
+
690
+ size_t len = real_toupper(str, max, use_max, NULL, locale_type);
691
+ char *result = ALLOC_N(char, len + 1);
692
+ real_toupper(str, max, use_max, result, locale_type);
693
+ result[len] = '\0';
694
+
695
+ return result;
696
+ }
697
+
698
+
699
/* {{{1
 * Convert all characters in ‘str’ to their uppercase representation if
 * applicable.  Returns the freshly allocated representation.  The whole
 * NUL-terminated string is converted.
 */
char *
utf_upcase(const char *str)
{
        const bool limit_length = false;

        return utf_upcase_impl(str, 0, limit_length);
}
708
+
709
+
710
/* {{{1
 * Convert all characters in ‘str’ to their uppercase representation if
 * applicable.  Returns the freshly allocated representation.  Do this for at
 * most ‘len’ bytes from ‘str’.
 */
char *
utf_upcase_n(const char *str, size_t len)
{
        const bool limit_length = true;

        return utf_upcase_impl(str, len, limit_length);
}
720
+
721
+
722
/* {{{1
 * Traverse the string checking for characters with combining class == 230
 * until a base character (combining class 0) is found.  Returns true as
 * soon as a class-230 character is seen and false if a base character or
 * the end of ‘str’ is reached first.
 */
static bool
has_more_above(const char *str)
{
        const char *cursor = str;

        while (*cursor != '\0') {
                int combining_class = unichar_combining_class(utf_char(cursor));

                if (combining_class == 230)
                        return true;

                if (combining_class == 0)
                        return false;

                cursor = utf_next(cursor);
        }

        return false;
}
741
+
742
+ static inline size_t
743
+ real_do_tolower(unichar c, int type, char *buf)
744
+ {
745
+ unichar tv = ATTTABLE(c >> 8, c & 0xff);
746
+
747
+ if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
748
+ return output_special_case(buf,
749
+ tv - UNICODE_SPECIAL_CASE_TABLE_START,
750
+ type, false);
751
+
752
+ /* TODO: this should really use titlecase_table_lookup somehow. */
753
+ if (type == UNICODE_TITLECASE_LETTER)
754
+ for (size_t i = 0; i < lengthof(title_table); i++)
755
+ if (title_table[i][0] == c)
756
+ return unichar_to_utf(title_table[i][2], buf);
757
+
758
+ return unichar_to_utf(tv != '\0' ? tv : c, buf);
759
+ }
760
+
761
+ /* {{{1
762
+ * The real implementation of downcase.
763
+ */
764
+ static size_t
765
+ tolower_turkic_i(const char **p, char *buf)
766
+ {
767
+ unichar i = LATIN_SMALL_LETTER_DOTLESS_I;
768
+
769
+ if (utf_char(*p) == COMBINING_DOT_ABOVE) {
770
+ /* TODO: don’t we need to make sure we don’t go beyond the end
771
+ * of ‘p’? */
772
+ *p = utf_next(*p);
773
+ i = LATIN_SMALL_LETTER_I;
774
+ }
775
+
776
+ return unichar_to_utf(i, buf);
777
+ }
778
+
779
+ static size_t
780
+ tolower_lithuianian_i(char *buf, unichar base, unichar combiner)
781
+ {
782
+ size_t len = unichar_to_utf(base, buf);
783
+ len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
784
+ if (combiner != '\0')
785
+ len += unichar_to_utf(combiner, OFFSET_IF(buf, len));
786
+
787
+ return len;
788
+ }
789
+
790
+ static size_t
791
+ tolower_sigma(const char **p, char *buf, const char *end, bool use_end)
792
+ {
793
+ unichar sigma = GREEK_SMALL_LETTER_FINAL_SIGMA;
794
+
795
+ /* SIGMA maps differently depending on whether it is final or not. The
796
+ * following simplified test would fail in the case of combining marks
797
+ * following the sigma, but I don't think that occurs in real text.
798
+ * The test here matches that in ICU. */
799
+ if ((!use_end || *p < end) && **p != '\0' && s_isalpha(s_type(utf_char(*p))))
800
+ sigma = GREEK_SMALL_LETTER_SIGMA;
801
+
802
+ return unichar_to_utf(sigma, buf);
803
+ }
804
+
805
+ static size_t
806
+ real_tolower_one(const char **p, const char *prev, char *buf,
807
+ LocaleType locale_type, const char *end, bool use_end)
808
+ {
809
+ unichar c = utf_char(prev);
810
+ int type = s_type(c);
811
+
812
+ if (locale_type == LOCALE_TURKIC && c == 'I')
813
+ return tolower_turkic_i(p, buf);
814
+
815
+ /* Introduce an explicit dot above the lowercasing capital I’s
816
+ * and J’s whenever there are more accents above.
817
+ * [SpecialCasing.txt] */
818
+ if (locale_type == LOCALE_LITHUANIAN) {
819
+ unichar base = LATIN_SMALL_LETTER_I;
820
+ unichar combiner = '\0';
821
+
822
+ switch (c) {
823
+ case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
824
+ combiner = COMBINING_GRAVE_ACCENT;
825
+ break;
826
+ case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
827
+ combiner = COMBINING_ACUTE_ACCENT;
828
+ break;
829
+ case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
830
+ combiner = COMBINING_TILDE;
831
+ break;
832
+ case 'I':
833
+ case 'J':
834
+ case LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
835
+ if (!has_more_above(*p))
836
+ goto no_lithuanian_i_casing;
837
+
838
+ base = unichar_tolower(c);
839
+ break;
840
+ default:
841
+ goto no_lithuanian_i_casing;
842
+ }
843
+
844
+ return tolower_lithuianian_i(buf, base, combiner);
845
+ }
846
+
847
+ no_lithuanian_i_casing:
848
+
849
+ if (c == GREEK_CAPITAL_LETTER_SIGMA)
850
+ return tolower_sigma(p, buf, end, use_end);
851
+
852
+ if (IS(type, OR(UNICODE_UPPERCASE_LETTER,
853
+ OR(UNICODE_TITLECASE_LETTER, 0))))
854
+ return real_do_tolower(c, type, buf);
855
+
856
+ size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
857
+
858
+ if (buf != NULL)
859
+ memcpy(buf, prev, len);
860
+
861
+ return len;
862
+ }
863
+
864
+ static size_t
865
+ real_tolower(const char *str, size_t max, bool use_max, char *buf,
866
+ LocaleType locale_type)
867
+ {
868
+ const char *p = str;
869
+ const char *end = str + max;
870
+ size_t len = 0;
871
+
872
+ while ((!use_max || p < end) && *p != '\0') {
873
+ const char *prev = p;
874
+ p = utf_next(p);
875
+
876
+ len += real_tolower_one(&p, prev, OFFSET_IF(buf, len),
877
+ locale_type, end, use_max);
878
+ }
879
+
880
+ return len;
881
+ }
882
+
883
+
884
+ /* {{{1 */
885
+ static char *
886
+ utf_downcase_impl(const char *str, size_t max, bool use_max)
887
+ {
888
+ assert(str != NULL);
889
+
890
+ LocaleType locale_type = get_locale_type();
891
+
892
+ size_t len = real_tolower(str, max, use_max, NULL, locale_type);
893
+ char *result = ALLOC_N(char, len + 1);
894
+ real_tolower(str, max, use_max, result, locale_type);
895
+ result[len] = '\0';
896
+
897
+ return result;
898
+ }
899
+
900
+
901
/* {{{1
 * Convert every character in ‘str’ that has one to its lowercase
 * representation, reading up to the terminating NUL.  Returns the freshly
 * allocated result.
 */
char *
utf_downcase(const char *str)
{
	char *lowered = utf_downcase_impl(str, 0, false);

	return lowered;
}
910
+
911
+
912
/* {{{1
 * Convert every character in ‘str’ that has one to its lowercase
 * representation, considering at most ‘len’ bytes of ‘str’.  Returns the
 * freshly allocated result.
 */
char *
utf_downcase_n(const char *str, size_t len)
{
	char *lowered = utf_downcase_impl(str, len, true);

	return lowered;
}
922
+
923
+
924
+ /* {{{1
925
+ * The real implementation of case folding below.
926
+ */
927
+
928
+ static bool
929
+ casefold_table_lookup(unichar c, char *folded, size_t *len)
930
+ {
931
+ int index;
932
+
933
+ if (!unicode_table_lookup(casefold_table, c, &index))
934
+ return false;
935
+
936
+ char const *folded_c = casefold_table[index].data;
937
+
938
+ if (folded != NULL)
939
+ strcpy(folded, folded_c);
940
+
941
+ *len += utf_byte_length(folded_c);
942
+
943
+ return true;
944
+ }
945
+
946
+ static char *
947
+ utf_foldcase_impl(const char *str, size_t max, bool use_max)
948
+ {
949
+ assert(str != NULL);
950
+
951
+ char *folded = NULL;
952
+ size_t len = 0;
953
+
954
+ again:
955
+ for (const char *p = str; (!use_max || p < str + max) && *p != '\0'; p = utf_next(p)) {
956
+ unichar c = utf_char(p);
957
+
958
+ if (casefold_table_lookup(c, OFFSET_IF(folded, len), &len))
959
+ continue;
960
+
961
+ len += unichar_to_utf(unichar_tolower(c), OFFSET_IF(folded, len));
962
+ }
963
+
964
+ if (folded == NULL) {
965
+ folded = ALLOC_N(char, len + 1);
966
+ folded[0] = NUL;
967
+ len = 0;
968
+ goto again;
969
+ }
970
+
971
+ folded[len] = '\0';
972
+
973
+ return folded;
974
+ }
975
+
976
+
977
/* {{{1
 * Convert ‘str’, read up to its terminating NUL, into a form that is
 * independent of case.  Returns the freshly allocated result.
 */
char *
utf_foldcase(const char *str)
{
	char *folded = utf_foldcase_impl(str, 0, false);

	return folded;
}
986
+
987
+
988
/* {{{1
 * Convert ‘str’ into a form that is independent of case, considering at
 * most ‘len’ bytes of it.  Returns the freshly allocated result.
 */
char *
utf_foldcase_n(const char *str, size_t len)
{
	char *folded = utf_foldcase_impl(str, len, true);

	return folded;
}
998
+
999
+
1000
+ /* {{{1
1001
+ * The real implementation of utf_width() and utf_width_n() below.
1002
+ */
1003
+ static size_t
1004
+ utf_width_impl(const char *str, size_t len, bool use_len)
1005
+ {
1006
+ assert(str != NULL);
1007
+
1008
+ size_t width = 0;
1009
+
1010
+ for (const char *p = str; (!use_len || p < str + len) && *p != NUL; p = utf_next(p))
1011
+ width += unichar_iswide(utf_char(p)) ? 2 : 1;
1012
+
1013
+ return width;
1014
+ }
1015
+
1016
+
1017
/* {{{1
 * Calculate the width in cells of the NUL-terminated string ‘str’.
 */
size_t
utf_width(const char *str)
{
	size_t cells = utf_width_impl(str, 0, false);

	return cells;
}
1025
+
1026
+
1027
/* {{{1
 * Calculate the width in cells of ‘str’, considering at most ‘len’ bytes
 * of it.
 */
size_t
utf_width_n(const char *str, size_t len)
{
	size_t cells = utf_width_impl(str, len, true);

	return cells;
}
1035
+
1036
+
1037
+ /* {{{1
1038
+ * Retrieve the mirrored representation of ‘c’ (if any) and store it in
1039
+ * ‘mirrored’.
1040
+ */
1041
+ bool
1042
+ unichar_mirror(unichar c, unichar *mirrored)
1043
+ {
1044
+ int index;
1045
+
1046
+ if (!unicode_table_lookup(bidi_mirroring_table, c, &index))
1047
+ return false;
1048
+
1049
+ if (mirrored != NULL)
1050
+ *mirrored = bidi_mirroring_table[index].mirrored_ch;
1051
+
1052
+ return true;
1053
+ }
1054
+
1055
+
1056
+ /* }}}1 */