u 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/README +38 -0
  2. data/Rakefile +64 -0
  3. data/ext/encoding/character/utf-8/break.c +25 -0
  4. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  5. data/ext/encoding/character/utf-8/data/character-tables.h +14358 -0
  6. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  7. data/ext/encoding/character/utf-8/data/decompose.h +10926 -0
  8. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1070 -0
  9. data/ext/encoding/character/utf-8/decompose.c +444 -0
  10. data/ext/encoding/character/utf-8/depend +65 -0
  11. data/ext/encoding/character/utf-8/extconf.rb +67 -0
  12. data/ext/encoding/character/utf-8/private.c +62 -0
  13. data/ext/encoding/character/utf-8/private.h +51 -0
  14. data/ext/encoding/character/utf-8/properties.c +1056 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +19 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_private.h +52 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  19. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  20. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  22. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  23. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  24. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  25. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  26. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  27. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  28. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  29. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  30. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  31. data/ext/encoding/character/utf-8/rb_utf_insert.c +48 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +332 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  35. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  36. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  37. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  38. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  39. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  40. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  41. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  43. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  44. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  45. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  46. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  47. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  48. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  49. data/ext/encoding/character/utf-8/tables.h +38 -0
  50. data/ext/encoding/character/utf-8/unicode.c +319 -0
  51. data/ext/encoding/character/utf-8/unicode.h +216 -0
  52. data/ext/encoding/character/utf-8/utf.c +1334 -0
  53. data/lib/encoding/character/utf-8.rb +201 -0
  54. data/lib/u.rb +16 -0
  55. data/lib/u/string.rb +185 -0
  56. data/lib/u/version.rb +5 -0
  57. data/test/unit/u.rb +5 -0
  58. data/test/unit/u/string.rb +91 -0
  59. metadata +174 -0
@@ -0,0 +1,1334 @@
1
+ /*
2
+ * contents: UTF-8 string operations.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+
8
+ #include <ruby.h>
9
+ #include <assert.h>
10
+ #include <locale.h>
11
+ #include <stdbool.h>
12
+ #include <stddef.h>
13
+ #include <stdint.h>
14
+ #include <stdlib.h>
15
+ #include <string.h>
16
+ #include <wchar.h>
17
+
18
+ #include "unicode.h"
19
+ #include "private.h"
20
+
21
+
22
/*
 * Evaluate to non-zero if ‘c’ is a code point this library accepts: below
 * 0x110000, not a surrogate (U+D800..U+DFFF), not in U+FDD0..U+FDEF, and not
 * ending in FFFE/FFFF.
 *
 * NOTE: ‘c’ is evaluated several times; do not pass an expression with side
 * effects.
 */
#define UNICODE_ISVALID(c) \
	((c) < 0x110000 && \
	 (((c) & 0xfffff800) != 0xd800) && \
	 ((c) < 0xfdd0 || (c) > 0xfdef) && \
	 (((c) & 0xfffe) != 0xfffe))
27
+
28
+
29
/* {{{1
 * Constants for the bit-twiddling needed when reading and writing UTF-8
 * character sequences.
 */
enum {
	/* Bit positions used to derive the OCT_* and MASK_X values below. */
	BIT_1 = 7,
	BIT_X = 6,
	BIT_2 = 5,
	BIT_3 = 4,
	BIT_4 = 3,
	BIT_5 = 2,
	BIT_6 = 1,

	/* High-bit patterns: all bits above the corresponding BIT_* set. */
	OCT_1 = 0x00,	/* 0000 0000 */
	OCT_X = 0x80,	/* 1000 0000 */
	OCT_2 = 0xc0,	/* 1100 0000 */
	OCT_3 = 0xe0,	/* 1110 0000 */
	OCT_4 = 0xf0,	/* 1111 0000 */
	OCT_5 = 0xf8,	/* 1111 1000 */
	OCT_6 = 0xfc,	/* 1111 1100 */

	/* First code point that no longer fits in a sequence of N bytes. */
	UNI_LEN1 = 0x80,
	UNI_LEN2 = 0x800,
	UNI_LEN3 = 0x10000,
	UNI_LEN4 = 0x200000,
	UNI_LEN5 = 0x4000000,

	MASK_X = 0x3f,	/* 0011 1111 */
	TEST_X = 0xc0	/* 1100 0000 */
};
59
+
60
/* {{{1
 * Determine whether the byte ‘p’ is a continuation byte (10xxxxxx) of a UTF-8
 * multi-byte sequence.  The argument is parenthesized so that the cast applies
 * to the whole expression passed in, not just its first operand.
 */
#define CONT_X(p) ((((unsigned char)(p)) & TEST_X) == OCT_X)
64
+
65
/* {{{1
 * Add the low six bits of the byte ‘p’ to ‘c’, which is first shifted left to
 * make room for the additional bits.  Evaluates to the combined value; ‘c’
 * itself is not modified.
 */
#define ADD_X(c, p) (((c) << BIT_X) | (((unsigned char)(p)) & MASK_X))
70
+
71
/* {{{1
 * Store the low six bits of ‘c’ in ‘p’ as a continuation byte (10xxxxxx).  The
 * macro evaluates to ‘c’ with those bits shifted off; the caller assigns that
 * value back if it wants ‘c’ updated.
 */
#define PUT_X(c, p) ((p) = (((c) & MASK_X) | OCT_X), (c) >> BIT_X)
75
+
76
+
77
/* {{{1
 * s_utf_skip_lengths: lookup table giving the length in bytes of a UTF-8
 * character sequence, indexed by the value of its first byte.  Continuation
 * bytes (0x80..0xbf) and the bytes 0xfe/0xff map to 1.
 */
static const uint8_t s_utf_skip_length_data[256] = {
	/* 0x00..0x7f: single-byte sequences */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80..0xbf: continuation bytes */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xc0..0xdf: two-byte leads */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xe0..0xef: three-byte leads */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xf0..0xff: four-, five- and six-byte leads, then 0xfe and 0xff */
	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};


/* Exported view of the table above as plain chars. */
const char * const s_utf_skip_lengths = (const char *)s_utf_skip_length_data;
94
+
95
+
96
+
97
/* {{{1
 * Private function used to calculate the length and mask to use when dealing
 * with a UTF-8 character sequence whose first byte is ‘c’.
 *
 * On return ‘*len’ is the sequence length (1..6) and ‘*mask’ selects the
 * payload bits of the first byte.  If ‘c’ cannot start a sequence (a
 * continuation byte, 0xfe or 0xff), ‘*len’ is -1 and ‘*mask’ is 0, so that
 * callers never observe an uninitialized mask.
 */
static inline void
_utf_compute(unsigned char c, int *mask, int *len)
{
	if (c < 0x80) {
		*len = 1;
		*mask = 0x7f;
	} else if ((c & 0xe0) == 0xc0) {
		*len = 2;
		*mask = 0x1f;
	} else if ((c & 0xf0) == 0xe0) {
		*len = 3;
		*mask = 0x0f;
	} else if ((c & 0xf8) == 0xf0) {
		*len = 4;
		*mask = 0x07;
	} else if ((c & 0xfc) == 0xf8) {
		*len = 5;
		*mask = 0x03;
	} else if ((c & 0xfe) == 0xfc) {
		*len = 6;
		*mask = 0x01;
	} else {
		*len = -1;
		*mask = 0;
	}
}
126
+
127
+ /* {{{1
128
+ * Private function used to figure out the length of the UTF-8 representation
129
+ * of a given Unicode character (UTF-32).
130
+ */
131
+ static inline unsigned short
132
+ _utf_length(const unichar c)
133
+ {
134
+ if (c < UNI_LEN1)
135
+ return 1;
136
+ else if (c < UNI_LEN2)
137
+ return 2;
138
+ else if (c < UNI_LEN3)
139
+ return 3;
140
+ else if (c < UNI_LEN4)
141
+ return 4;
142
+ else if (c < UNI_LEN5)
143
+ return 5;
144
+ else
145
+ return 6;
146
+ }
147
+
148
+ /* {{{1
149
+ * Private function used to retrieve a UTF-32 character from an UTF-8 character
150
+ * sequence given a mask and length previously retrieved with _utf_compute().
151
+ */
152
+ static inline unichar
153
+ _utf_get(const char *str, int mask, int len)
154
+ {
155
+ unichar c = (unsigned char)str[0] & mask;
156
+
157
+ for (int i = 1; i < len; i++) {
158
+ unsigned char ch = ((const unsigned char *)str)[i];
159
+
160
+ if (CONT_X(ch)) {
161
+ c = ADD_X(c, ch);
162
+ } else {
163
+ c = UTF_BAD_INPUT_UNICHAR;
164
+ break;
165
+ }
166
+ }
167
+
168
+ return c;
169
+ }
170
+
171
+
172
+ /* {{{1
173
+ * Retrieve a UTF-32 character from a UTF-8 character sequence.
174
+ */
175
+ unichar
176
+ utf_char(const char *str)
177
+ {
178
+ int mask;
179
+ int len;
180
+
181
+ _utf_compute(*str, &mask, &len);
182
+
183
+ return (len > -1) ? _utf_get(str, mask, len) : UTF_BAD_INPUT_UNICHAR;
184
+ }
185
+
186
+
187
+ /* {{{1
188
+ * TODO
189
+ */
190
+ unichar
191
+ utf_char_n(const char *str, size_t max)
192
+ {
193
+ if (max == 0)
194
+ return UTF_INCOMPLETE_INPUT_UNICHAR;
195
+
196
+ size_t len;
197
+ unichar c = (unsigned char)*str;
198
+
199
+ /* TODO: _utf_compute() here */
200
+ if (c < 0x80) {
201
+ return c;
202
+ } else if (c < 0xc0) {
203
+ return UTF_BAD_INPUT_UNICHAR;
204
+ } else if (c < 0xe0) {
205
+ len = 2;
206
+ c &= 0x1f;
207
+ } else if (c < 0xf0) {
208
+ len = 3;
209
+ c &= 0x0f;
210
+ } else if (c < 0xf8) {
211
+ len = 4;
212
+ c &= 0x07;
213
+ } else if (c < 0xfc) {
214
+ len = 5;
215
+ c &= 0x03;
216
+ } else if (c < 0xfe) {
217
+ len = 6;
218
+ c &= 0x01;
219
+ } else {
220
+ return UTF_BAD_INPUT_UNICHAR;
221
+ }
222
+
223
+ if (len > max) {
224
+ for (size_t i = 1; i < max; i++) {
225
+ if (!CONT_X(str[i]))
226
+ return UTF_BAD_INPUT_UNICHAR;
227
+ }
228
+
229
+ return UTF_INCOMPLETE_INPUT_UNICHAR;
230
+ }
231
+
232
+ for (size_t i = 1; i < len; i++) {
233
+ unsigned char ch = ((const unsigned char *)str)[i];
234
+
235
+ if (!CONT_X(ch))
236
+ return (ch != NUL) ? UTF_BAD_INPUT_UNICHAR : UTF_INCOMPLETE_INPUT_UNICHAR;
237
+
238
+ c = ADD_X(c, ch);
239
+ }
240
+
241
+ return (_utf_length(c) == len) ? c : UTF_BAD_INPUT_UNICHAR;
242
+ }
243
+
244
+
245
+ /* {{{1
246
+ * Retrieve a UTF-32 character from a UTF-8 character sequence. This function
247
+ * does additional checking while converitng, such as not overruning a maximum
248
+ * length and checks for incomplete, invalid or out-of-range characters.
249
+ */
250
+ unichar
251
+ utf_char_validated(const char *str)
252
+ {
253
+ unichar result = utf_char(str);
254
+
255
+ if (result & 0x80000000) {
256
+ return result;
257
+ } else if (!unichar_isvalid(result)) {
258
+ return UTF_BAD_INPUT_UNICHAR;
259
+ } else {
260
+ return result;
261
+ }
262
+ }
263
+
264
+
265
+ /* {{{1 */
266
+ unichar
267
+ utf_char_validated_n(const char *str, size_t max)
268
+ {
269
+ unichar result = utf_char_n(str, max);
270
+
271
+ if (result & 0x80000000) {
272
+ return result;
273
+ } else if (!unichar_isvalid(result)) {
274
+ return UTF_BAD_INPUT_UNICHAR;
275
+ } else {
276
+ return result;
277
+ }
278
+ }
279
+
280
+
281
+ /* {{{1
282
+ * Return a pointer to the next UTF-8 character sequence in ‘str’. This
283
+ * requires that it is at the start of the previous one already and no
284
+ * additional error checking is done.
285
+ */
286
+ /*
287
+ inline char *
288
+ utf_next(const char *str)
289
+ {
290
+ return (char *)str + s_utf_skip_lengths[*(const uchar *)str];
291
+ }
292
+ */
293
+
294
+
295
+ /* {{{1
296
+ * Synchronize and go to the next UTF-8 character sequence in ‘p’. This search
297
+ * will not go beyond ‘end’. ‹NULL› is returned if it couldn't be found.
298
+ */
299
+ char *
300
+ utf_find_next(const char *p, const char *end)
301
+ {
302
+ if (*p != NUL) {
303
+ if (end != NULL) {
304
+ for (p++; p < end && CONT_X(*p); p++) {
305
+ /* this loop intentionally left empty */
306
+ }
307
+ } else {
308
+ for (p++; CONT_X(*p); p++) {
309
+ /* this loop intentionally left empty */
310
+ }
311
+ }
312
+ }
313
+ return (p == end) ? NULL : (char *)p;
314
+ }
315
+
316
+
317
/* {{{1
 * Return a pointer to the UTF-8 character sequence preceding ‘p’: step
 * backwards until a byte that is not a continuation byte is found.  No lower
 * bound is checked, so the caller must guarantee that such a byte exists
 * before ‘p’.
 */
char *
utf_prev(const char *p)
{
	do {
		p--;
	} while (CONT_X(*p));

	return (char *)p;
}
330
+
331
+
332
/* {{{1
 * Synchronize and go to the previous UTF-8 character sequence before ‘p’.
 * The search will not go beyond ‘begin’.  ‹NULL› is returned if no byte
 * starting a sequence could be found in [begin, p).
 *
 * The loop tests ‘p > begin’ before decrementing so that no pointer before
 * ‘begin’ is ever formed (that would be undefined behavior).
 */
char *
utf_find_prev(const char *begin, const char *p)
{
	while (p > begin) {
		p--;

		if (!CONT_X(*p))
			return (char *)p;
	}

	return NULL;
}
347
+
348
+
349
/* {{{1
 * Convert a character offset relative to ‘str’ into a pointer.  A positive
 * ‘offset’ advances with utf_next(); a negative one moves backwards, which
 * requires valid memory (and a sequence start) before ‘str’.
 */
char *
utf_offset_to_pointer(const char *str, long offset)
{
	const char *cur = str;

	if (offset > 0) {
		for (; offset > 0; offset--)
			cur = utf_next(cur);

		return (char *)cur;
	}

	while (offset != 0) {
		const char *anchor = cur;

		/* Jump back by bytes, then re-synchronize on a sequence start. */
		cur += offset;
		while ((*cur & 0xc0) == 0x80)
			cur--;

		/* Credit back the characters actually skipped by the jump. */
		offset += utf_pointer_to_offset(cur, anchor);
	}

	return (char *)cur;
}
374
+
375
+
376
/* {{{1
 * Convert the pointer ‘pos’ into a character offset relative to ‘str’.  The
 * result is negative when ‘pos’ lies before ‘str’.
 */
long
utf_pointer_to_offset(const char *str, const char *pos)
{
	if (pos < str)
		return -utf_pointer_to_offset(pos, str);

	long count = 0;
	const char *cur = str;

	while (cur < pos) {
		cur = utf_next(cur);
		count++;
	}

	return count;
}
391
+
392
+
393
/* {{{1
 * Copy the ‹NUL›-terminated UTF-8 string ‘src’, terminator included, to
 * ‘dest’.  ‘dest’ must be large enough; no bounds checking is done.
 */
void
utf_copy(char *dest, const char *src)
{
	const size_t bytes_with_nul = strlen(src) + 1;

	memcpy(dest, src, bytes_with_nul);
}
401
+
402
+
403
+ /* {{{1
404
+ * Copy at most n Unicode characters from an UTF-8 string to another. The
405
+ * destination string will be ‹NUL›-terminated properly.
406
+ */
407
+ void
408
+ utf_copy_n(char *dest, const char *src, size_t n)
409
+ {
410
+ const char *p;
411
+
412
+ for (p = src; n > 0 && *p != NUL; p = utf_next(p), n--) {
413
+ /* this loop intentionally left empty */;
414
+ }
415
+
416
+ strncpy(dest, src, p - src);
417
+ dest[p - src] = NUL;
418
+ }
419
+
420
+
421
/* {{{1
 * Append the UTF-8 string ‘src’ onto the end of ‘dest’.  ‘dest’ must have room
 * for the combined string; no bounds checking is done.
 */
void
utf_append(char *dest, const char *src)
{
	char *tail = dest + strlen(dest);

	strcpy(tail, src);
}
429
+
430
+
431
+ /* {{{1
432
+ * Append at most ‘n’ Unicode character from an UTF-8 string onto another.
433
+ */
434
+ void
435
+ utf_append_n(char *dest, const char *src, size_t n)
436
+ {
437
+ const char *p;
438
+
439
+ for (p = src; n > 0 && *p != NUL; p = utf_next(p), n--) {
440
+ /* this loop intentionally left empty */;
441
+ }
442
+
443
+ strncat(dest, src, p - src);
444
+ dest[p - src] = NUL;
445
+ }
446
+
447
+
448
+ /* {{{1
449
+ * Compare two strings for ordering using the linguistically correct rules of
450
+ * the current locale.
451
+ */
452
+ int
453
+ utf_collate(const char *a, const char *b)
454
+ {
455
+ assert(a != NULL);
456
+ assert(b != NULL);
457
+
458
+ unichar *a_norm = _utf_normalize_wc(a, 0, false, NORMALIZE_ALL_COMPOSE);
459
+ unichar *b_norm = _utf_normalize_wc(b, 0, false, NORMALIZE_ALL_COMPOSE);
460
+
461
+ int result = wcscoll((wchar_t *)a_norm, (wchar_t *)b_norm);
462
+
463
+ free(a_norm);
464
+ free(b_norm);
465
+
466
+ return result;
467
+ }
468
+
469
+
470
/* {{{1
 * We need UTF-8 encoding of numbers to encode the weights if
 * we are using wcsxfrm. However, we aren't encoding Unicode
 * characters, so we can't simply use unichar_to_utf.
 *
 * The following routine is taken (with modification) from GNU
 * libc's strxfrm routine:
 *
 * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
 * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
 *
 * Encodes ‘c’ at ‘buf’ and returns the number of bytes the encoding takes.
 * If ‘buf’ is ‹NULL› nothing is written and only the length is returned.
 */
static inline int
_utf_encode(char *buf, wchar_t c)
{
	if (c < 0x80) {
		if (buf != NULL)
			*buf = (char)c;
		return 1;
	}

	/* Find the shortest length (2..6) whose payload bits can hold ‘c’. */
	int step;
	for (step = 2; step < 6; step++) {
		if ((c & (~(uint32_t)0 << (5 * step + 1))) == 0)
			break;
	}

	const int retval = step;

	if (buf != NULL) {
		/*
		 * Lead byte marker: ‘step’ high bits set.  0xff00u >> step has
		 * the same low byte as ~0xff >> step, without right-shifting a
		 * negative value (which is implementation-defined).
		 */
		*buf = (unsigned char)(0xff00u >> step);
		step--;
		do {
			c = PUT_X(c, buf[step]);
		} while (--step > 0);
		*buf |= c;
	}

	return retval;
}
512
+
513
+
514
+ /* {{{1
515
+ * Generate a collation key from a string which can be compared with other
516
+ * collation keys using str_compare().
517
+ */
518
+ static char *
519
+ utf_collate_key_impl(const char *str, size_t len, bool use_len)
520
+ {
521
+ assert(str != NULL);
522
+
523
+ unichar *str_norm = _utf_normalize_wc(str, len, use_len, NORMALIZE_ALL_COMPOSE);
524
+ size_t xfrm_len = wcsxfrm(NULL, (wchar_t *)str_norm, 0);
525
+ wchar_t result_wc[xfrm_len + 1];
526
+ wcsxfrm(result_wc, (wchar_t *)str_norm, xfrm_len + 1);
527
+
528
+ int result_len = 0;
529
+ for (size_t i = 0; i < xfrm_len; i++)
530
+ result_len += _utf_encode(NULL, result_wc[i]);
531
+
532
+ char *result = ALLOC_N(char, result_len + 1);
533
+ result_len = 0;
534
+ for (size_t i = 0; i < xfrm_len; i++)
535
+ result_len += _utf_encode(result + result_len, result_wc[i]);
536
+ result[result_len] = NUL;
537
+
538
+ free(str_norm);
539
+
540
+ return result;
541
+ }
542
+
543
+
544
/* {{{1
 * Generate a collation key for the ‹NUL›-terminated string ‘str’.  The key is
 * produced by utf_collate_key_impl() with length checking disabled.
 */
char *
utf_collate_key(const char *str)
{
	const bool use_len = false;

	return utf_collate_key_impl(str, 0, use_len);
}
553
+
554
+
555
/* {{{1
 * Generate a collation key for the string ‘str’ of length ‘len’.  The key is
 * produced by utf_collate_key_impl() with length checking enabled.
 */
char *
utf_collate_key_n(const char *str, size_t len)
{
	const bool use_len = true;

	return utf_collate_key_impl(str, len, use_len);
}
564
+
565
+
566
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of ‘needle’ in the
 * first ‘haystack_len’ bytes of ‘haystack’, or -1 if there is none.  The
 * search also stops at a ‹NUL› byte in ‘haystack’.  An empty ‘needle’ is
 * found at offset 0.
 */
static int
str_index_n(const char *haystack, const char *needle, size_t haystack_len)
{
	assert(haystack != NULL);
	assert(needle != NULL);

	const size_t needle_len = strlen(needle);

	if (needle_len == 0)
		return 0;

	if (needle_len > haystack_len)
		return -1;

	/* Last offset at which ‘needle’ still fits inside the window. */
	const size_t last_start = haystack_len - needle_len;

	for (size_t start = 0; start <= last_start && haystack[start] != '\0'; start++) {
		size_t matched = 0;

		while (matched < needle_len && haystack[start + matched] == needle[matched])
			matched++;

		if (matched == needle_len)
			return (int)start;
	}

	return -1;
}
599
+
600
+
601
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of ‘needle’ in the
 * ‹NUL›-terminated ‘haystack’, or -1 if it doesn't exist.  An empty ‘needle’
 * is found at the end of ‘haystack’.
 */
static int
str_rindex(const char *haystack, const char *needle)
{
	assert(haystack != NULL);
	assert(needle != NULL);

	const size_t needle_len = strlen(needle);
	const size_t haystack_len = strlen(haystack);

	if (needle_len == 0)
		return haystack_len;

	if (needle_len > haystack_len)
		return -1;

	/* Try every start offset from the last possible one down to 0. */
	for (size_t start = haystack_len - needle_len + 1; start-- > 0; ) {
		size_t matched = 0;

		while (matched < needle_len && haystack[start + matched] == needle[matched])
			matched++;

		if (matched == needle_len)
			return (int)start;
	}

	return -1;
}
634
+
635
+
636
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of ‘needle’ in the
 * first ‘haystack_len’ bytes of ‘haystack’ (or up to its ‹NUL› terminator,
 * whichever comes first), or -1 if it doesn't exist.
 */
static int
str_rindex_n(const char *haystack, const char *needle, size_t haystack_len)
{
	assert(haystack != NULL);
	assert(needle != NULL);

	const size_t needle_len = strlen(needle);

	/* Bytes actually available: bounded by both the limit and a NUL. */
	size_t usable = 0;
	while (usable < haystack_len && haystack[usable] != '\0')
		usable++;

	if (usable < needle_len)
		return -1;

	/* Try every start offset from the last possible one down to 0. */
	for (size_t start = usable - needle_len + 1; start-- > 0; ) {
		size_t matched = 0;

		while (matched < needle_len && haystack[start + matched] == needle[matched])
			matched++;

		if (matched == needle_len)
			return (int)start;
	}

	return -1;
}
672
+
673
+
674
+ /* {{{1
675
+ * Retrieve the index of the left-most occurence of ‘c’ in ‘str’, or -1 if it
676
+ * doesn't exist.
677
+ */
678
+ int
679
+ utf_char_index(const char *str, unichar c)
680
+ {
681
+ char ch[7];
682
+
683
+ ch[unichar_to_utf(c, ch)] = NUL;
684
+ char *p = strstr(str, ch);
685
+ return (p != NULL) ? p - str : -1;
686
+ }
687
+
688
+
689
+ /* {{{1
690
+ * Retrieve the index of the left-most occurence of ‘c’ in ‘str’, or -1 if it
691
+ * doesn't exist, going over at most ‘len’ bytes in ‘str’.
692
+ */
693
+ int
694
+ utf_char_index_n(const char *str, unichar c, size_t len)
695
+ {
696
+ char ch[7];
697
+
698
+ ch[unichar_to_utf(c, ch)] = NUL;
699
+
700
+ return str_index_n(str, ch, len);
701
+ }
702
+
703
+
704
+ /* {{{1
705
+ * Retrieve the index of the right-most occurence of ‘c’ in ‘str’, or -1 if it
706
+ * doesn't exist.
707
+ */
708
+ int
709
+ utf_char_rindex(const char *str, unichar c)
710
+ {
711
+ char ch[7];
712
+
713
+ ch[unichar_to_utf(c, ch)] = NUL;
714
+
715
+ return str_rindex(str, ch);
716
+ }
717
+
718
+
719
+ /* {{{1
720
+ * Retrieve the index of the right-most occurence of ‘c’ in ‘str’, or -1 if it
721
+ * doesn't exist, going over at most ‘len’ bytes in ‘str’.
722
+ */
723
+ int
724
+ utf_char_rindex_n(const char *str, unichar c, size_t len)
725
+ {
726
+ char ch[7];
727
+
728
+ ch[unichar_to_utf(c, ch)] = NUL;
729
+
730
+ return str_rindex_n(str, ch, len);
731
+ }
732
+
733
+
734
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of ‘needle’ in
 * ‘haystack’, or -1 if it doesn't exist.
 *
 * strstr() returns ‹NULL› when there is no match; that case is handled
 * explicitly instead of subtracting ‘haystack’ from a null pointer.
 */
int
utf_index(const char *haystack, const char *needle)
{
	const char *hit = strstr(haystack, needle);

	if (hit == NULL)
		return -1;

	return (int)(hit - haystack);
}
743
+
744
+
745
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of ‘needle’ in
 * ‘haystack’, or -1 if it doesn't exist, going over at most ‘len’ bytes in
 * ‘haystack’.  Thin wrapper around str_index_n().
 */
int
utf_index_n(const char *haystack, const char *needle, size_t len)
{
	const int offset = str_index_n(haystack, needle, len);

	return offset;
}
754
+
755
+
756
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of ‘needle’ in
 * ‘haystack’, or -1 if it doesn't exist.  Thin wrapper around str_rindex().
 */
int
utf_rindex(const char *haystack, const char *needle)
{
	const int offset = str_rindex(haystack, needle);

	return offset;
}
765
+
766
+
767
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of ‘needle’ in
 * ‘haystack’, or -1 if it doesn't exist, going over at most ‘len’ bytes in
 * ‘haystack’.  Thin wrapper around str_rindex_n().
 */
int
utf_rindex_n(const char *haystack, const char *needle, size_t len)
{
	const int offset = str_rindex_n(haystack, needle, len);

	return offset;
}
776
+
777
+
778
/* {{{1
 * Check if the string ‘str’ begins with ‘prefix’, comparing byte by byte.
 * An empty ‘prefix’ matches every string.
 */
bool
utf_has_prefix(const char *str, const char *prefix)
{
	assert(str != NULL);
	assert(prefix != NULL);

	for (;; str++, prefix++) {
		if (*prefix == '\0')
			return true;	/* whole prefix consumed */
		if (*str == '\0')
			return false;	/* ‘str’ is shorter than ‘prefix’ */
		if (*str != *prefix)
			return false;
	}
}
796
+
797
+
798
/* {{{1
 * Retrieve the number of UTF-8 encoded Unicode characters in the
 * ‹NUL›-terminated string ‘str’, stepping through it with utf_next().
 */
long
utf_length(const char *str)
{
	assert(str != NULL);

	long count = 0;

	for (const char *cur = str; *cur != '\0'; cur = utf_next(cur))
		count++;

	return count;
}
815
+
816
+
817
/* {{{1
 * Retrieve the number of UTF-8 encoded Unicode characters in ‘str’, examining
 * ‘len’ bytes.  A character whose sequence would extend past ‘len’ bytes is
 * not counted.  ‘str’ may be ‹NULL› only if ‘len’ is 0.
 */
long
utf_length_n(const char *str, long len)
{
	assert(str != NULL || len == 0);

	if (len == 0)
		return 0;

	const char *const end = str + len;
	const char *cur = str;
	long count = 0;

	for (; cur < end; cur = utf_next(cur))
		count++;

	/* This makes sure that we don’t count incomplete characters.  It won’t
	 * save us from illegal UTF-8-sequences, however. */
	return (cur > end) ? count - 1 : count;
}
844
+
845
+
846
/* {{{1
 * Retrieve the number of bytes, not counting the ‹NUL› terminator, making up
 * the given UTF-8 string.
 */
size_t
utf_byte_length(const char *str)
{
	const size_t nbytes = strlen(str);

	return nbytes;
}
854
+
855
+
856
+ /* {{{1
857
+ * The real implementation of utf_reverse() and utf_reverse_n() below.
858
+ */
859
+ static char *
860
+ utf_reverse_impl(const char *str, size_t len, bool use_len)
861
+ {
862
+ if (!use_len)
863
+ len = utf_byte_length(str);
864
+
865
+ char *result = ALLOC_N(char, len + 1);
866
+ char *r = result + len;
867
+ const char *p = str;
868
+ while (r > result) {
869
+ uint8_t skip = s_utf_skip_lengths[*(unsigned char *)p];
870
+ r -= skip;
871
+ for (char *m = r; skip > 0; skip--)
872
+ *m++ = *p++;
873
+ }
874
+ result[len] = 0;
875
+
876
+ return result;
877
+ }
878
+
879
+
880
/* {{{1
 * Return a new string which is the ‹NUL›-terminated string ‘str’ reversed.
 * Thin wrapper around utf_reverse_impl() with length checking disabled.
 */
char *
utf_reverse(const char *str)
{
	const bool use_len = false;

	return utf_reverse_impl(str, 0, use_len);
}
888
+
889
+
890
/* {{{1
 * Return a new string which is the first ‘len’ bytes of ‘str’ reversed.
 * Thin wrapper around utf_reverse_impl() with length checking enabled.
 */
char *
utf_reverse_n(const char *str, size_t len)
{
	const bool use_len = true;

	return utf_reverse_impl(str, len, use_len);
}
899
+
900
+
901
+ /* {{{1
902
+ * The real implementation of utf_isvalid() and utf_isvalid_n() below.
903
+ *
904
+ * TODO: this needs optimizing. Look at glib's new optimized implementation
905
+ * (2.6.0) and also separate the ‘use_max’ into two cases.
906
+ */
907
/*
 * Check that the byte at ‘p’ is a continuation byte (10xxxxxx) and fold its
 * six payload bits into ‘val’; otherwise jump to the ‘error’ label.  Expects
 * ‘p’, ‘val’ and an ‘error’ label to exist at the point of use.
 *
 * There is deliberately no semicolon after ‘while (0)’: the user supplies it,
 * which keeps the macro usable as a single statement (e.g. before ‘else’).
 */
#define CONTINUATION_CHAR do { \
	if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */ \
		goto error; \
	val <<= 6; \
	val |= (*(unsigned char *)p) & 0x3f; \
} while (0)
913
+
914
+ static const char *
915
+ fast_validate(const char *str)
916
+ {
917
+ unichar val = 0;
918
+ unichar min = 0;
919
+ const char *p;
920
+
921
+ for (p = str; *p != NUL; p++) {
922
+ if (*(unsigned char *)p < 128)
923
+ continue;
924
+
925
+ const char *last = p;
926
+
927
+ if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
928
+ if ((*(unsigned char *)p & 0x1e) == 0)
929
+ goto error;
930
+ p++;
931
+ if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */
932
+ goto error;
933
+ } else {
934
+ if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
935
+ min = (1 << 11);
936
+ val = *(unsigned char *)p & 0x0f;
937
+ goto two_remaining;
938
+ } else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
939
+ min = (1 << 16);
940
+ val = *(unsigned char *)p & 0x07;
941
+ } else {
942
+ goto error;
943
+ }
944
+
945
+ p++;
946
+ CONTINUATION_CHAR;
947
+ two_remaining:
948
+ p++;
949
+ CONTINUATION_CHAR;
950
+ p++;
951
+ CONTINUATION_CHAR;
952
+
953
+ if (val < min)
954
+ goto error;
955
+
956
+ if (!UNICODE_ISVALID(val))
957
+ goto error;
958
+ }
959
+
960
+ continue;
961
+ error:
962
+ return last;
963
+ }
964
+
965
+ return p;
966
+ }
967
+
968
+ static const char *
969
+ fast_validate_len(const char *str, size_t max_len)
970
+ {
971
+ unichar val = 0;
972
+ unichar min = 0;
973
+ const char *p;
974
+
975
+ for (p = str; (size_t)(p - str) < max_len && *p != NUL; p++) {
976
+ if (*(unsigned char *)p < 128)
977
+ continue;
978
+
979
+ const char *last = p;
980
+
981
+ if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
982
+ if (max_len - (p - str) < 2)
983
+ goto error;
984
+
985
+ if ((*(unsigned char *)p & 0x1e) == 0)
986
+ goto error;
987
+ p++;
988
+ if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */
989
+ goto error;
990
+ } else {
991
+ if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
992
+ if (max_len - (p - str) < 3)
993
+ goto error;
994
+
995
+ min = (1 << 11);
996
+ val = *(unsigned char *)p & 0x0f;
997
+ goto two_remaining;
998
+ } else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
999
+ if (max_len - (p - str) < 4)
1000
+ goto error;
1001
+
1002
+ min = (1 << 16);
1003
+ val = *(unsigned char *)p & 0x07;
1004
+ } else {
1005
+ goto error;
1006
+ }
1007
+
1008
+ p++;
1009
+ CONTINUATION_CHAR;
1010
+ two_remaining:
1011
+ p++;
1012
+ CONTINUATION_CHAR;
1013
+ p++;
1014
+ CONTINUATION_CHAR;
1015
+
1016
+ if (val < min)
1017
+ goto error;
1018
+ if (!UNICODE_ISVALID(val))
1019
+ goto error;
1020
+ }
1021
+
1022
+ continue;
1023
+ error:
1024
+ return last;
1025
+ }
1026
+
1027
+ return p;
1028
+ }
1029
+
1030
+
1031
+ /* {{{1
1032
+ * Check if ‘str’ constitutes a valid UTF-8 character sequence.
1033
+ */
1034
+ bool
1035
+ utf_isvalid(const char *str)
1036
+ {
1037
+ const char *p = fast_validate(str);
1038
+
1039
+ return *p == NUL;
1040
+ }
1041
+
1042
+
1043
/* {{{1
 * Check if ‘str’ constitutes a valid UTF-8 character sequence, examining at
 * most ‘max’ bytes.  If ‘end’ is non-‹NULL›, it is always set to the point
 * where validation stopped, which is the end of the valid range of bytes in
 * ‘str’.  The result is true only if validation reached ‘str + max’, so a
 * ‹NUL› byte within the first ‘max’ bytes makes the result false.
 */
bool
utf_isvalid_n(const char *str, size_t max, const char **end)
{
	const char *stopped_at = fast_validate_len(str, max);

	if (end != NULL)
		*end = stopped_at;

	if (stopped_at != str + max)
		return false;

	return true;
}
1059
+
1060
+
1061
+ /* {{{1
1062
+ * Check whether ‘c’ is a valid Unicode character.
1063
+ */
1064
+ bool
1065
+ unichar_isvalid(unichar c)
1066
+ {
1067
+ return UNICODE_ISVALID(c);
1068
+ }
1069
+
1070
+
1071
+ /* {{{1
1072
+ * Turn an Unicode character (UTF-32) into an UTF-8 character sequence and
1073
+ * store it in ‘result’, returning the length of the stored sequence.
1074
+ */
1075
+ int
1076
+ unichar_to_utf(unichar c, char *result)
1077
+ {
1078
+ int len = 0;
1079
+ int first;
1080
+
1081
+ if (c < UNI_LEN1) {
1082
+ first = 0;
1083
+ len = 1;
1084
+ } else if (c < UNI_LEN2) {
1085
+ first = 0xc0;
1086
+ len = 2;
1087
+ } else if (c < UNI_LEN3) {
1088
+ first = 0xe0;
1089
+ len = 3;
1090
+ } else if (c < UNI_LEN4) {
1091
+ first = 0xf0;
1092
+ len = 4;
1093
+ } else if (c < UNI_LEN5) {
1094
+ first = 0xf8;
1095
+ len = 5;
1096
+ } else {
1097
+ first = 0xfc;
1098
+ len = 6;
1099
+ }
1100
+
1101
+ if (result != NULL) {
1102
+ for (int i = len - 1; i > 0; i--)
1103
+ c = PUT_X(c, result[i]);
1104
+
1105
+ result[0] = c | first;
1106
+ }
1107
+
1108
+ return len;
1109
+ }
1110
+
1111
+
1112
+ /* {{{1
1113
+ * The real implementation of ucs4_to_utf8() and ucs4_to_utf8_n() below.
1114
+ */
1115
+ static char *
1116
+ ucs4_to_utf8_n_impl(unichar *str, size_t len, bool use_len,
1117
+ size_t *items_read, size_t *items_written)
1118
+ {
1119
+ size_t result_len = 0;
1120
+ char *result = NULL, *p;
1121
+
1122
+ for (size_t i = 0; (!use_len || i < len) && str[i] != NUL; i++) {
1123
+ if (str[i] >= 0x80000000) {
1124
+ if (items_read != NULL)
1125
+ *items_read = i;
1126
+
1127
+ rb_raise(rb_eArgError, "UCS-4 input contains character outside of range for UTF-8 (%lc))", str[i]);
1128
+ }
1129
+
1130
+ result_len += _utf_length(str[i]);
1131
+ }
1132
+
1133
+ p = result = ALLOC_N(char, result_len + 1);
1134
+ size_t i;
1135
+ for (i = 0; p < result + result_len; i++)
1136
+ p += unichar_to_utf(str[i], p);
1137
+ *p = NUL;
1138
+
1139
+ if (items_written != NULL)
1140
+ *items_written = p - result;
1141
+ if (items_read != NULL)
1142
+ *items_read = i;
1143
+
1144
+ return result;
1145
+ }
1146
+
1147
+ /* {{{1
1148
+ * Turn an UTF-32 encoded string into an UTF-8 encoded one. If non-‹NULL›,
1149
+ * store the number of characters read and bytes written in ‘items_read’ and
1150
+ * ‘items_written’ respectivelly.
1151
+ */
1152
+ char *
1153
+ ucs4_to_utf8(unichar *str, size_t *items_read, size_t *items_written)
1154
+ {
1155
+ return ucs4_to_utf8_n_impl(str, 0, false, items_read, items_written);
1156
+ }
1157
+
1158
+ /* {{{1
1159
+ * Turn an UTF-32 encoded string into an UTF-8 encoded one. If non-‹NULL›,
1160
+ * store the number of characters read and bytes written in ‘items_read’ and
1161
+ * ‘items_written’ respectivelly. Examine at most ‘len’ characters from ‘str’.
1162
+ */
1163
+ char *
1164
+ ucs4_to_utf8_n(unichar *str, size_t len, size_t *items_read, size_t *items_written)
1165
+ {
1166
+ return ucs4_to_utf8_n_impl(str, len, true, items_read, items_written);
1167
+ }
1168
+
1169
+
1170
+ /* {{{1
1171
+ * The real implementation of utf8_to_ucs4_fast() and utf8_to_ucs4_fast_n()
1172
+ * below.
1173
+ */
1174
+ static unichar *
1175
+ utf8_to_ucs4_fast_impl(const char *str, size_t len, bool use_len, size_t *items_written)
1176
+ {
1177
+ assert(str != NULL);
1178
+
1179
+ const char *p = str;
1180
+ size_t n = 0;
1181
+ if (use_len) {
1182
+ while (p < str + len && *p != NUL) {
1183
+ p = utf_next(p);
1184
+ n++;
1185
+ }
1186
+ } else {
1187
+ while (p != NUL) {
1188
+ p = utf_next(p);
1189
+ n++;
1190
+ }
1191
+ }
1192
+
1193
+ unichar *result = ALLOC_N(unichar, n + 1);
1194
+ p = str;
1195
+ size_t i;
1196
+ for (i = 0; i < n; i++) {
1197
+ unichar c = ((unsigned char *)p)[0];
1198
+ int c_len;
1199
+
1200
+ if (c < 0x80) {
1201
+ result[i] = c;
1202
+ p++;
1203
+ } else {
1204
+ /* TODO: use _utf_compute() here */
1205
+ if (c < 0xe0) {
1206
+ c_len = 2;
1207
+ c &= 0x1f;
1208
+ } else if (c < 0xf0) {
1209
+ c_len = 3;
1210
+ c &= 0x0f;
1211
+ } else if (c < 0xf8) {
1212
+ c_len = 4;
1213
+ c &= 0x07;
1214
+ } else if (c < 0xfc) {
1215
+ c_len = 5;
1216
+ c &= 0x03;
1217
+ } else {
1218
+ c_len = 6;
1219
+ c &= 0x01;
1220
+ }
1221
+
1222
+ for (int j = 1; j < c_len; j++) {
1223
+ c <<= BIT_X;
1224
+ c |= ((unsigned char *)p)[j] & MASK_X;
1225
+ }
1226
+
1227
+ result[i] = c;
1228
+ p += c_len;
1229
+ }
1230
+ }
1231
+ result[i] = NUL;
1232
+
1233
+ if (items_written != NULL)
1234
+ *items_written = i;
1235
+
1236
+ return result;
1237
+ }
1238
+
1239
+
1240
+ /* {{{1
1241
+ * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1242
+ * the number of characters written in ‘items_written’.
1243
+ */
1244
+ unichar *
1245
+ utf8_to_ucs4_fast(const char *str, size_t *items_written)
1246
+ {
1247
+ return utf8_to_ucs4_fast_impl(str, 0, false, items_written);
1248
+ }
1249
+
1250
+
1251
+ /* {{{1
1252
+ * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1253
+ * the number of characters written in ‘items_written’. Examine at most ‘len’
1254
+ * bytes from ‘str’.
1255
+ */
1256
+ unichar *
1257
+ utf8_to_ucs4_fast_n(const char *str, size_t len, size_t *items_written)
1258
+ {
1259
+ return utf8_to_ucs4_fast_impl(str, len, true, items_written);
1260
+ }
1261
+
1262
+
1263
+ /* {{{1
1264
+ * The real implementation of utf8_to_ucs4() and utf8_to_ucs4_n() below.
1265
+ */
1266
+ static unichar *
1267
+ utf8_to_ucs4_impl(const char *str, size_t len, bool use_len, size_t *items_read, size_t *items_written)
1268
+ {
1269
+ size_t n = 0;
1270
+ const char *p = str;
1271
+ for (; (!use_len || str + len - p > 0) && *p != NUL; p = utf_next(p)) {
1272
+ unichar c = utf_char_n(p, str + len - p);
1273
+ if (c & 0x80000000) {
1274
+ if (c == UTF_INCOMPLETE_INPUT_UNICHAR) {
1275
+ if (items_read != NULL)
1276
+ break;
1277
+
1278
+ rb_raise(rb_eArgError, "partial character sequence in UTF-8 input");
1279
+ } else {
1280
+ rb_raise(rb_eArgError, "UTF-8 input contains character outside of range for UTF-8 (%lc))", c);
1281
+ }
1282
+
1283
+ if (items_read != NULL)
1284
+ *items_read = p - str;
1285
+
1286
+ return NULL;
1287
+ } else {
1288
+ n++;
1289
+ }
1290
+ }
1291
+
1292
+ unichar *result = ALLOC_N(unichar, n + 1);
1293
+ size_t i;
1294
+ for (i = 0, p = str; i < n; i++) {
1295
+ result[i] = utf_char(p);
1296
+ p = utf_next(p);
1297
+ }
1298
+ result[i] = NUL;
1299
+
1300
+ if (items_written != NULL)
1301
+ *items_written = n;
1302
+ if (items_read != NULL)
1303
+ *items_read = p - str;
1304
+
1305
+ return result;
1306
+ }
1307
+
1308
+
1309
+ /* {{{1
1310
+ * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1311
+ * the number of characters written in ‘items_written’. This function does
1312
+ * additional error-checking on the input.
1313
+ */
1314
+ unichar *
1315
+ utf8_to_ucs4(const char *str, size_t *items_read, size_t *items_written)
1316
+ {
1317
+ return utf8_to_ucs4_impl(str, 0, false, items_read, items_written);
1318
+ }
1319
+
1320
+
1321
+ /* {{{1
1322
+ * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1323
+ * the number of characters written in ‘items_written’. Examine at most ‘len’
1324
+ * bytes from ‘str’. This function does additional error-checking on the
1325
+ * input.
1326
+ */
1327
+ unichar *
1328
+ utf8_to_ucs4_n(const char *str, int len, size_t *items_read, size_t *items_written)
1329
+ {
1330
+ return utf8_to_ucs4_impl(str, len, true, items_read, items_written);
1331
+ }
1332
+
1333
+
1334
+ /* }}}1 */