u 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/README +38 -0
  2. data/Rakefile +64 -0
  3. data/ext/encoding/character/utf-8/break.c +25 -0
  4. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  5. data/ext/encoding/character/utf-8/data/character-tables.h +14358 -0
  6. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  7. data/ext/encoding/character/utf-8/data/decompose.h +10926 -0
  8. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1070 -0
  9. data/ext/encoding/character/utf-8/decompose.c +444 -0
  10. data/ext/encoding/character/utf-8/depend +65 -0
  11. data/ext/encoding/character/utf-8/extconf.rb +67 -0
  12. data/ext/encoding/character/utf-8/private.c +62 -0
  13. data/ext/encoding/character/utf-8/private.h +51 -0
  14. data/ext/encoding/character/utf-8/properties.c +1056 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +19 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_private.h +52 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  19. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  20. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  22. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  23. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  24. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  25. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  26. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  27. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  28. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  29. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  30. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  31. data/ext/encoding/character/utf-8/rb_utf_insert.c +48 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +332 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  35. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  36. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  37. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  38. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  39. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  40. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  41. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  43. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  44. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  45. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  46. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  47. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  48. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  49. data/ext/encoding/character/utf-8/tables.h +38 -0
  50. data/ext/encoding/character/utf-8/unicode.c +319 -0
  51. data/ext/encoding/character/utf-8/unicode.h +216 -0
  52. data/ext/encoding/character/utf-8/utf.c +1334 -0
  53. data/lib/encoding/character/utf-8.rb +201 -0
  54. data/lib/u.rb +16 -0
  55. data/lib/u/string.rb +185 -0
  56. data/lib/u/version.rb +5 -0
  57. data/test/unit/u.rb +5 -0
  58. data/test/unit/u/string.rb +91 -0
  59. metadata +174 -0
@@ -0,0 +1,1334 @@
1
+ /*
2
+ * contents: UTF-8 string operations.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+
8
+ #include <ruby.h>
9
+ #include <assert.h>
10
+ #include <locale.h>
11
+ #include <stdbool.h>
12
+ #include <stddef.h>
13
+ #include <stdint.h>
14
+ #include <stdlib.h>
15
+ #include <string.h>
16
+ #include <wchar.h>
17
+
18
+ #include "unicode.h"
19
+ #include "private.h"
20
+
21
+
22
/*
 * Evaluate to non-zero when 'c' is below 0x110000, lies outside the
 * surrogate block 0xd800-0xdfff, lies outside 0xfdd0-0xfdef, and does not
 * have 0xfffe or 0xffff as its low 16 bits.
 *
 * 'c' is expanded several times, so it must be free of side effects.
 */
#define UNICODE_ISVALID(c)                      \
        ((c) < 0x110000 &&                      \
         (((c) & 0xfffff800) != 0xd800) &&      \
         ((c) < 0xfdd0 || (c) > 0xfdef) &&      \
         ((c) & 0xfffe) != 0xfffe)
27
+
28
+
29
/* {{{1
 * Constants used for the bit-twiddling needed to read and write UTF-8
 * character sequences.
 *
 * BIT_n  is the index of the highest payload bit in the lead byte of an
 *        n-byte sequence (BIT_X: same for a continuation byte).
 * OCT_n  is the fixed high-bit prefix of such a byte, i.e.
 *        ((1 << (BIT_n + 1)) - 1) ^ 0xff.
 * UNI_LENn is the first code point that no longer fits in n bytes.
 */
enum {
        BIT_1 = 7,
        BIT_X = 6,
        BIT_2 = 5,
        BIT_3 = 4,
        BIT_4 = 3,
        BIT_5 = 2,
        BIT_6 = 1,

        OCT_1 = 0x00,           /* 0000 0000 */
        OCT_X = 0x80,           /* 1000 0000 */
        OCT_2 = 0xc0,           /* 1100 0000 */
        OCT_3 = 0xe0,           /* 1110 0000 */
        OCT_4 = 0xf0,           /* 1111 0000 */
        OCT_5 = 0xf8,           /* 1111 1000 */
        OCT_6 = 0xfc,           /* 1111 1100 */

        UNI_LEN1 = 0x80,
        UNI_LEN2 = 0x800,
        UNI_LEN3 = 0x10000,
        UNI_LEN4 = 0x200000,
        UNI_LEN5 = 0x4000000,

        MASK_X = 0x3f,          /* 0011 1111: payload bits of a continuation byte */
        TEST_X = 0xc0,          /* 1100 0000: bits that identify a continuation byte */
};
59
+
60
/* {{{1
 * Determine whether the byte 'p' is a UTF-8 continuation byte, i.e. has the
 * bit pattern 10xxxxxx.  The whole argument is converted to unsigned char
 * before testing.
 */
#define CONT_X(p) ((((unsigned char)(p)) & TEST_X) == OCT_X)

/* {{{1
 * Add the payload bits of the continuation byte 'p' to 'c', which is first
 * shifted left by BIT_X bits to make room for them.  Evaluates to the new
 * value; 'c' itself is not modified.
 */
#define ADD_X(c, p) (((c) << BIT_X) | (((unsigned char)(p)) & MASK_X))

/* {{{1
 * Store the low BIT_X bits of 'c' in 'p' as a continuation byte and evaluate
 * to 'c' with those bits shifted off.  'c' itself is not modified; callers
 * assign the result back.
 */
#define PUT_X(c, p) ((p) = OCT_X | ((c) & MASK_X), (c) >> BIT_X)
75
+
76
+
77
/* {{{1
 * s_utf_skip_lengths: lookup table giving the length in bytes of a UTF-8
 * character sequence, indexed by the value of its first byte.  Bytes that
 * cannot start a sequence (0x80-0xbf, 0xfe, 0xff) map to 1.
 */
static const uint8_t s_utf_skip_length_data[256] = {
        /* 0x00-0x7f: single-byte sequences */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x80-0xbf: continuation bytes */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0xc0-0xdf: two-byte lead bytes */
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        /* 0xe0-0xef: three-byte lead bytes */
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        /* 0xf0-0xff: four-, five- and six-byte lead bytes, then 0xfe/0xff */
        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};


/* Exported view of the table above, addressed through a char pointer. */
const char * const s_utf_skip_lengths = (const char *)s_utf_skip_length_data;
94
+
95
+
96
+
97
/* {{{1
 * Private function used to calculate the length and mask to use when dealing
 * with a given UTF-8 character sequence.
 *
 * c:    first byte of the sequence.
 * mask: receives the mask selecting the payload bits of 'c'.
 * len:  receives the sequence length in bytes (1-6), or -1 when 'c' cannot
 *       start a sequence.  In that case 'mask' is set to 0 so that it never
 *       holds an indeterminate value.
 */
static inline void
_utf_compute(unsigned char c, int *mask, int *len)
{
        if (c < 0x80) {
                *len = 1;
                *mask = 0x7f;
        } else if ((c & 0xe0) == 0xc0) {
                *len = 2;
                *mask = 0x1f;
        } else if ((c & 0xf0) == 0xe0) {
                *len = 3;
                *mask = 0x0f;
        } else if ((c & 0xf8) == 0xf0) {
                *len = 4;
                *mask = 0x07;
        } else if ((c & 0xfc) == 0xf8) {
                *len = 5;
                *mask = 0x03;
        } else if ((c & 0xfe) == 0xfc) {
                *len = 6;
                *mask = 0x01;
        } else {
                /* continuation byte, 0xfe or 0xff */
                *len = -1;
                *mask = 0;
        }
}
126
+
127
+ /* {{{1
128
+ * Private function used to figure out the length of the UTF-8 representation
129
+ * of a given Unicode character (UTF-32).
130
+ */
131
+ static inline unsigned short
132
+ _utf_length(const unichar c)
133
+ {
134
+ if (c < UNI_LEN1)
135
+ return 1;
136
+ else if (c < UNI_LEN2)
137
+ return 2;
138
+ else if (c < UNI_LEN3)
139
+ return 3;
140
+ else if (c < UNI_LEN4)
141
+ return 4;
142
+ else if (c < UNI_LEN5)
143
+ return 5;
144
+ else
145
+ return 6;
146
+ }
147
+
148
+ /* {{{1
149
+ * Private function used to retrieve a UTF-32 character from an UTF-8 character
150
+ * sequence given a mask and length previously retrieved with _utf_compute().
151
+ */
152
+ static inline unichar
153
+ _utf_get(const char *str, int mask, int len)
154
+ {
155
+ unichar c = (unsigned char)str[0] & mask;
156
+
157
+ for (int i = 1; i < len; i++) {
158
+ unsigned char ch = ((const unsigned char *)str)[i];
159
+
160
+ if (CONT_X(ch)) {
161
+ c = ADD_X(c, ch);
162
+ } else {
163
+ c = UTF_BAD_INPUT_UNICHAR;
164
+ break;
165
+ }
166
+ }
167
+
168
+ return c;
169
+ }
170
+
171
+
172
+ /* {{{1
173
+ * Retrieve a UTF-32 character from a UTF-8 character sequence.
174
+ */
175
+ unichar
176
+ utf_char(const char *str)
177
+ {
178
+ int mask;
179
+ int len;
180
+
181
+ _utf_compute(*str, &mask, &len);
182
+
183
+ return (len > -1) ? _utf_get(str, mask, len) : UTF_BAD_INPUT_UNICHAR;
184
+ }
185
+
186
+
187
+ /* {{{1
188
+ * TODO
189
+ */
190
+ unichar
191
+ utf_char_n(const char *str, size_t max)
192
+ {
193
+ if (max == 0)
194
+ return UTF_INCOMPLETE_INPUT_UNICHAR;
195
+
196
+ size_t len;
197
+ unichar c = (unsigned char)*str;
198
+
199
+ /* TODO: _utf_compute() here */
200
+ if (c < 0x80) {
201
+ return c;
202
+ } else if (c < 0xc0) {
203
+ return UTF_BAD_INPUT_UNICHAR;
204
+ } else if (c < 0xe0) {
205
+ len = 2;
206
+ c &= 0x1f;
207
+ } else if (c < 0xf0) {
208
+ len = 3;
209
+ c &= 0x0f;
210
+ } else if (c < 0xf8) {
211
+ len = 4;
212
+ c &= 0x07;
213
+ } else if (c < 0xfc) {
214
+ len = 5;
215
+ c &= 0x03;
216
+ } else if (c < 0xfe) {
217
+ len = 6;
218
+ c &= 0x01;
219
+ } else {
220
+ return UTF_BAD_INPUT_UNICHAR;
221
+ }
222
+
223
+ if (len > max) {
224
+ for (size_t i = 1; i < max; i++) {
225
+ if (!CONT_X(str[i]))
226
+ return UTF_BAD_INPUT_UNICHAR;
227
+ }
228
+
229
+ return UTF_INCOMPLETE_INPUT_UNICHAR;
230
+ }
231
+
232
+ for (size_t i = 1; i < len; i++) {
233
+ unsigned char ch = ((const unsigned char *)str)[i];
234
+
235
+ if (!CONT_X(ch))
236
+ return (ch != NUL) ? UTF_BAD_INPUT_UNICHAR : UTF_INCOMPLETE_INPUT_UNICHAR;
237
+
238
+ c = ADD_X(c, ch);
239
+ }
240
+
241
+ return (_utf_length(c) == len) ? c : UTF_BAD_INPUT_UNICHAR;
242
+ }
243
+
244
+
245
+ /* {{{1
246
+ * Retrieve a UTF-32 character from a UTF-8 character sequence. This function
247
+ * does additional checking while converitng, such as not overruning a maximum
248
+ * length and checks for incomplete, invalid or out-of-range characters.
249
+ */
250
+ unichar
251
+ utf_char_validated(const char *str)
252
+ {
253
+ unichar result = utf_char(str);
254
+
255
+ if (result & 0x80000000) {
256
+ return result;
257
+ } else if (!unichar_isvalid(result)) {
258
+ return UTF_BAD_INPUT_UNICHAR;
259
+ } else {
260
+ return result;
261
+ }
262
+ }
263
+
264
+
265
+ /* {{{1 */
266
+ unichar
267
+ utf_char_validated_n(const char *str, size_t max)
268
+ {
269
+ unichar result = utf_char_n(str, max);
270
+
271
+ if (result & 0x80000000) {
272
+ return result;
273
+ } else if (!unichar_isvalid(result)) {
274
+ return UTF_BAD_INPUT_UNICHAR;
275
+ } else {
276
+ return result;
277
+ }
278
+ }
279
+
280
+
281
+ /* {{{1
282
+ * Return a pointer to the next UTF-8 character sequence in ‘str’. This
283
+ * requires that it is at the start of the previous one already and no
284
+ * additional error checking is done.
285
+ */
286
+ /*
287
+ inline char *
288
+ utf_next(const char *str)
289
+ {
290
+ return (char *)str + s_utf_skip_lengths[*(const uchar *)str];
291
+ }
292
+ */
293
+
294
+
295
+ /* {{{1
296
+ * Synchronize and go to the next UTF-8 character sequence in ‘p’. This search
297
+ * will not go beyond ‘end’. ‹NULL› is returned if it couldn't be found.
298
+ */
299
+ char *
300
+ utf_find_next(const char *p, const char *end)
301
+ {
302
+ if (*p != NUL) {
303
+ if (end != NULL) {
304
+ for (p++; p < end && CONT_X(*p); p++) {
305
+ /* this loop intentionally left empty */
306
+ }
307
+ } else {
308
+ for (p++; CONT_X(*p); p++) {
309
+ /* this loop intentionally left empty */
310
+ }
311
+ }
312
+ }
313
+ return (p == end) ? NULL : (char *)p;
314
+ }
315
+
316
+
317
/* {{{1
 * Return a pointer to the start of the UTF-8 character sequence preceding
 * 'p'.  No bounds checking is done: the caller must guarantee that a
 * non-continuation byte exists somewhere before 'p'.
 */
char *
utf_prev(const char *p)
{
        do {
                p--;
        } while (CONT_X(*p));

        return (char *)p;
}
330
+
331
+
332
/* {{{1
 * Synchronize and go to the previous UTF-8 character sequence before 'p'.
 * This search will not go beyond 'begin'.  NULL is returned if no
 * non-continuation byte could be found in [begin, p).
 */
char *
utf_find_prev(const char *begin, const char *p)
{
        /* Test before decrementing so that 'begin - 1' is never formed. */
        while (p > begin) {
                p--;

                if (!CONT_X(*p))
                        return (char *)p;
        }

        return NULL;
}
347
+
348
+
349
/* {{{1
 * Convert an offset, counted in characters, to a pointer relative to 'str'.
 * A positive offset steps forward one character at a time.  A negative one
 * jumps back by bytes, re-synchronizes on a non-continuation byte and
 * repeats with the remaining character distance until it reaches zero.
 */
char *
utf_offset_to_pointer(const char *str, long offset)
{
        const char *cur = str;

        if (offset > 0) {
                for (; offset > 0; offset--)
                        cur = utf_next(cur);

                return (char *)cur;
        }

        while (offset != 0) {
                const char *anchor = cur;

                cur += offset;
                while ((*cur & 0xc0) == 0x80)
                        cur--;

                /* Characters actually skipped; adds back any overshoot. */
                offset += utf_pointer_to_offset(cur, anchor);
        }

        return (char *)cur;
}
374
+
375
+
376
/* {{{1
 * Convert a pointer to an offset, counted in characters, relative to 'str'.
 * The result is negative when 'pos' lies before 'str'.
 */
long
utf_pointer_to_offset(const char *str, const char *pos)
{
        if (pos < str)
                return -utf_pointer_to_offset(pos, str);

        long count = 0;
        const char *cur = str;

        while (cur < pos) {
                cur = utf_next(cur);
                count++;
        }

        return count;
}
391
+
392
+
393
/* {{{1
 * Copy the contents of the UTF-8 string 'src', including its terminator, to
 * 'dest'.  'dest' must be large enough; no bounds checking is done.
 */
void
utf_copy(char *dest, const char *src)
{
        size_t bytes = strlen(src) + 1;

        memcpy(dest, src, bytes);
}
401
+
402
+
403
+ /* {{{1
404
+ * Copy at most n Unicode characters from an UTF-8 string to another. The
405
+ * destination string will be ‹NUL›-terminated properly.
406
+ */
407
+ void
408
+ utf_copy_n(char *dest, const char *src, size_t n)
409
+ {
410
+ const char *p;
411
+
412
+ for (p = src; n > 0 && *p != NUL; p = utf_next(p), n--) {
413
+ /* this loop intentionally left empty */;
414
+ }
415
+
416
+ strncpy(dest, src, p - src);
417
+ dest[p - src] = NUL;
418
+ }
419
+
420
+
421
/* {{{1
 * Append the UTF-8 string 'src' onto the end of 'dest'.  'dest' must have
 * room for the result; no bounds checking is done.
 */
void
utf_append(char *dest, const char *src)
{
        char *tail = dest + strlen(dest);

        strcpy(tail, src);
}
429
+
430
+
431
+ /* {{{1
432
+ * Append at most ‘n’ Unicode character from an UTF-8 string onto another.
433
+ */
434
+ void
435
+ utf_append_n(char *dest, const char *src, size_t n)
436
+ {
437
+ const char *p;
438
+
439
+ for (p = src; n > 0 && *p != NUL; p = utf_next(p), n--) {
440
+ /* this loop intentionally left empty */;
441
+ }
442
+
443
+ strncat(dest, src, p - src);
444
+ dest[p - src] = NUL;
445
+ }
446
+
447
+
448
+ /* {{{1
449
+ * Compare two strings for ordering using the linguistically correct rules of
450
+ * the current locale.
451
+ */
452
+ int
453
+ utf_collate(const char *a, const char *b)
454
+ {
455
+ assert(a != NULL);
456
+ assert(b != NULL);
457
+
458
+ unichar *a_norm = _utf_normalize_wc(a, 0, false, NORMALIZE_ALL_COMPOSE);
459
+ unichar *b_norm = _utf_normalize_wc(b, 0, false, NORMALIZE_ALL_COMPOSE);
460
+
461
+ int result = wcscoll((wchar_t *)a_norm, (wchar_t *)b_norm);
462
+
463
+ free(a_norm);
464
+ free(b_norm);
465
+
466
+ return result;
467
+ }
468
+
469
+
470
/* {{{1
 * We need UTF-8 encoding of numbers to encode the weights if
 * we are using wcsxfrm. However, we aren't encoding Unicode
 * characters, so we can't simply use unichar_to_utf.
 *
 * The following routine is taken (with modification) from GNU
 * libc's strxfrm routine:
 *
 * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
 * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
 *
 * Returns the number of bytes the encoding of 'c' takes up.  When 'buf' is
 * NULL nothing is written and only the length is computed.
 */
static inline int
_utf_encode(char *buf, wchar_t c)
{
        if (c < 0x80) {
                if (buf != NULL)
                        *buf = (char)c;

                return 1;
        }

        /* Find the smallest length (2-6) whose payload bits can hold 'c'. */
        int step = 2;
        while (step < 6 && (c & (~(uint32_t)0 << (5 * step + 1))) != 0)
                step++;

        const int length = step;

        if (buf != NULL) {
                /* Lead byte prefix first, then fill from the last byte back. */
                *buf = (unsigned char)(~0xff >> step);
                step--;
                do {
                        c = PUT_X(c, buf[step]);
                } while (--step > 0);
                *buf |= c;
        }

        return length;
}
512
+
513
+
514
+ /* {{{1
515
+ * Generate a collation key from a string which can be compared with other
516
+ * collation keys using str_compare().
517
+ */
518
+ static char *
519
+ utf_collate_key_impl(const char *str, size_t len, bool use_len)
520
+ {
521
+ assert(str != NULL);
522
+
523
+ unichar *str_norm = _utf_normalize_wc(str, len, use_len, NORMALIZE_ALL_COMPOSE);
524
+ size_t xfrm_len = wcsxfrm(NULL, (wchar_t *)str_norm, 0);
525
+ wchar_t result_wc[xfrm_len + 1];
526
+ wcsxfrm(result_wc, (wchar_t *)str_norm, xfrm_len + 1);
527
+
528
+ int result_len = 0;
529
+ for (size_t i = 0; i < xfrm_len; i++)
530
+ result_len += _utf_encode(NULL, result_wc[i]);
531
+
532
+ char *result = ALLOC_N(char, result_len + 1);
533
+ result_len = 0;
534
+ for (size_t i = 0; i < xfrm_len; i++)
535
+ result_len += _utf_encode(result + result_len, result_wc[i]);
536
+ result[result_len] = NUL;
537
+
538
+ free(str_norm);
539
+
540
+ return result;
541
+ }
542
+
543
+
544
/* {{{1
 * Generate a collation key from the terminated string 'str'.  The key is a
 * newly allocated string produced by utf_collate_key_impl().
 */
char *
utf_collate_key(const char *str)
{
        char *key = utf_collate_key_impl(str, 0, false);

        return key;
}
553
+
554
+
555
/* {{{1
 * Generate a collation key from the string 'str' of length 'len'.  The key
 * is a newly allocated string produced by utf_collate_key_impl().
 */
char *
utf_collate_key_n(const char *str, size_t len)
{
        char *key = utf_collate_key_impl(str, len, true);

        return key;
}
564
+
565
+
566
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of 'needle' in
 * 'haystack', which is 'haystack_len' bytes long, or -1 if there is none.
 * The search also stops at the first NUL byte in 'haystack'.  An empty
 * 'needle' is found at offset 0.
 */
static int
str_index_n(const char *haystack, const char *needle, size_t haystack_len)
{
        assert(haystack != NULL);
        assert(needle != NULL);

        size_t needle_len = strlen(needle);

        if (needle_len == 0)
                return 0;

        if (haystack_len < needle_len)
                return -1;

        /* Last position at which 'needle' still fits. */
        const char *end = haystack + haystack_len - needle_len;

        /*
         * Check the bound before dereferencing: for a one-byte needle,
         * 'end + 1' is one past the buffer and must not be read.
         */
        for (const char *p = haystack; p <= end && *p != '\0'; p++) {
                if (memcmp(p, needle, needle_len) == 0)
                        return (int)(p - haystack);
        }

        return -1;
}
599
+
600
+
601
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of 'needle' in
 * 'haystack', or -1 if it doesn't exist.  An empty 'needle' is found at the
 * end of 'haystack'.
 */
static int
str_rindex(const char *haystack, const char *needle)
{
        assert(haystack != NULL);
        assert(needle != NULL);

        size_t needle_len = strlen(needle);
        size_t haystack_len = strlen(haystack);

        if (needle_len == 0)
                return (int)haystack_len;

        if (haystack_len < needle_len)
                return -1;

        /*
         * Walk candidate offsets from the right using an index, so that no
         * pointer below 'haystack' is ever formed.
         */
        for (size_t start = haystack_len - needle_len + 1; start-- > 0; ) {
                if (memcmp(haystack + start, needle, needle_len) == 0)
                        return (int)start;
        }

        return -1;
}
634
+
635
+
636
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of 'needle' in
 * 'haystack', or -1 if it doesn't exist, looking at no more than
 * 'haystack_len' bytes of 'haystack' and stopping at its first NUL byte.  An
 * empty 'needle' is found at the end of the examined range.
 */
static int
str_rindex_n(const char *haystack, const char *needle, size_t haystack_len)
{
        assert(haystack != NULL);
        assert(needle != NULL);

        size_t needle_len = strlen(needle);

        /* The examined range ends at the first NUL, if there is one. */
        const char *nul = memchr(haystack, '\0', haystack_len);
        size_t examined = (nul != NULL) ? (size_t)(nul - haystack) : haystack_len;

        if (examined < needle_len)
                return -1;

        /*
         * Walk candidate offsets from the right using an index, so that no
         * pointer below 'haystack' is ever formed.
         */
        for (size_t start = examined - needle_len + 1; start-- > 0; ) {
                if (memcmp(haystack + start, needle, needle_len) == 0)
                        return (int)start;
        }

        return -1;
}
672
+
673
+
674
+ /* {{{1
675
+ * Retrieve the index of the left-most occurence of ‘c’ in ‘str’, or -1 if it
676
+ * doesn't exist.
677
+ */
678
+ int
679
+ utf_char_index(const char *str, unichar c)
680
+ {
681
+ char ch[7];
682
+
683
+ ch[unichar_to_utf(c, ch)] = NUL;
684
+ char *p = strstr(str, ch);
685
+ return (p != NULL) ? p - str : -1;
686
+ }
687
+
688
+
689
+ /* {{{1
690
+ * Retrieve the index of the left-most occurence of ‘c’ in ‘str’, or -1 if it
691
+ * doesn't exist, going over at most ‘len’ bytes in ‘str’.
692
+ */
693
+ int
694
+ utf_char_index_n(const char *str, unichar c, size_t len)
695
+ {
696
+ char ch[7];
697
+
698
+ ch[unichar_to_utf(c, ch)] = NUL;
699
+
700
+ return str_index_n(str, ch, len);
701
+ }
702
+
703
+
704
+ /* {{{1
705
+ * Retrieve the index of the right-most occurence of ‘c’ in ‘str’, or -1 if it
706
+ * doesn't exist.
707
+ */
708
+ int
709
+ utf_char_rindex(const char *str, unichar c)
710
+ {
711
+ char ch[7];
712
+
713
+ ch[unichar_to_utf(c, ch)] = NUL;
714
+
715
+ return str_rindex(str, ch);
716
+ }
717
+
718
+
719
+ /* {{{1
720
+ * Retrieve the index of the right-most occurence of ‘c’ in ‘str’, or -1 if it
721
+ * doesn't exist, going over at most ‘len’ bytes in ‘str’.
722
+ */
723
+ int
724
+ utf_char_rindex_n(const char *str, unichar c, size_t len)
725
+ {
726
+ char ch[7];
727
+
728
+ ch[unichar_to_utf(c, ch)] = NUL;
729
+
730
+ return str_rindex_n(str, ch, len);
731
+ }
732
+
733
+
734
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of 'needle' in
 * 'haystack', or -1 if it doesn't exist.
 */
int
utf_index(const char *haystack, const char *needle)
{
        const char *hit = strstr(haystack, needle);

        /* strstr() yields NULL on a miss; don't do arithmetic on it. */
        if (hit == NULL)
                return -1;

        return (int)(hit - haystack);
}
743
+
744
+
745
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of 'needle' in
 * 'haystack', or -1 if it doesn't exist, going over at most 'len' bytes in
 * 'haystack'.
 */
int
utf_index_n(const char *haystack, const char *needle, size_t len)
{
        int offset = str_index_n(haystack, needle, len);

        return offset;
}
754
+
755
+
756
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of 'needle' in
 * 'haystack', or -1 if it doesn't exist.
 */
int
utf_rindex(const char *haystack, const char *needle)
{
        int offset = str_rindex(haystack, needle);

        return offset;
}
765
+
766
+
767
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of 'needle' in
 * 'haystack', or -1 if it doesn't exist, going over at most 'len' bytes in
 * 'haystack'.
 */
int
utf_rindex_n(const char *haystack, const char *needle, size_t len)
{
        int offset = str_rindex_n(haystack, needle, len);

        return offset;
}
776
+
777
+
778
/* {{{1
 * Check if the given string begins with 'prefix'.  The comparison is done
 * byte by byte; an empty 'prefix' always matches.
 */
bool
utf_has_prefix(const char *str, const char *prefix)
{
        assert(str != NULL);
        assert(prefix != NULL);

        /* strncmp() stops at the terminator of 'str' if it is the shorter. */
        size_t prefix_len = strlen(prefix);

        return strncmp(str, prefix, prefix_len) == 0;
}
796
+
797
+
798
/* {{{1
 * Retrieve the number of UTF-8 encoded Unicode characters in the terminated
 * string 'str'.
 */
long
utf_length(const char *str)
{
        assert(str != NULL);

        long count = 0;

        for (const char *cur = str; *cur != '\0'; cur = utf_next(cur))
                count++;

        return count;
}
815
+
816
+
817
/* {{{1
 * Retrieve the number of UTF-8 encoded Unicode characters in 'str',
 * examining 'len' bytes.  'str' may be NULL only when 'len' is 0.
 */
long
utf_length_n(const char *str, long len)
{
        assert(str != NULL || len == 0);

        if (len == 0)
                return 0;

        const char *limit = str + len;
        const char *cur = str;
        long count = 0;

        for (; cur < limit; cur = utf_next(cur))
                count++;

        /* Stepping past 'limit' means the last character was incomplete, so
         * it isn't counted.  This won't save us from illegal
         * UTF-8-sequences, however. */
        return (cur > limit) ? count - 1 : count;
}
844
+
845
+
846
/* {{{1
 * Retrieve the number of bytes making up the terminated UTF-8 string 'str',
 * not counting the terminator.
 */
size_t
utf_byte_length(const char *str)
{
        size_t bytes = strlen(str);

        return bytes;
}
854
+
855
+
856
+ /* {{{1
857
+ * The real implementation of utf_reverse() and utf_reverse_n() below.
858
+ */
859
+ static char *
860
+ utf_reverse_impl(const char *str, size_t len, bool use_len)
861
+ {
862
+ if (!use_len)
863
+ len = utf_byte_length(str);
864
+
865
+ char *result = ALLOC_N(char, len + 1);
866
+ char *r = result + len;
867
+ const char *p = str;
868
+ while (r > result) {
869
+ uint8_t skip = s_utf_skip_lengths[*(unsigned char *)p];
870
+ r -= skip;
871
+ for (char *m = r; skip > 0; skip--)
872
+ *m++ = *p++;
873
+ }
874
+ result[len] = 0;
875
+
876
+ return result;
877
+ }
878
+
879
+
880
/* {{{1
 * Return a new string which is the terminated string 'str' reversed
 * character by character.
 */
char *
utf_reverse(const char *str)
{
        char *reversed = utf_reverse_impl(str, 0, false);

        return reversed;
}
888
+
889
+
890
/* {{{1
 * Return a new string which is the first 'len' bytes of 'str' reversed
 * character by character.
 */
char *
utf_reverse_n(const char *str, size_t len)
{
        char *reversed = utf_reverse_impl(str, len, true);

        return reversed;
}
899
+
900
+
901
+ /* {{{1
902
+ * The real implementation of utf_isvalid() and utf_isvalid_n() below.
903
+ *
904
+ * TODO: this needs optimizing. Look at glib's new optimized implementation
905
+ * (2.6.0) and also separate the ‘use_max’ into two cases.
906
+ */
907
/*
 * Consume the continuation byte at 'p': jump to the enclosing function's
 * 'error' label unless it has the form 10xxxxxx, otherwise append its six
 * payload bits to 'val'.  Relies on 'p', 'val' and 'error' being in scope at
 * the point of use.  Expands to a single statement; the caller supplies the
 * terminating semicolon.
 */
#define CONTINUATION_CHAR do {                                          \
        if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */        \
                goto error;                                             \
        val <<= 6;                                                      \
        val |= (*(unsigned char *)p) & 0x3f;                            \
} while (0)
913
+
914
+ static const char *
915
+ fast_validate(const char *str)
916
+ {
917
+ unichar val = 0;
918
+ unichar min = 0;
919
+ const char *p;
920
+
921
+ for (p = str; *p != NUL; p++) {
922
+ if (*(unsigned char *)p < 128)
923
+ continue;
924
+
925
+ const char *last = p;
926
+
927
+ if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
928
+ if ((*(unsigned char *)p & 0x1e) == 0)
929
+ goto error;
930
+ p++;
931
+ if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */
932
+ goto error;
933
+ } else {
934
+ if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
935
+ min = (1 << 11);
936
+ val = *(unsigned char *)p & 0x0f;
937
+ goto two_remaining;
938
+ } else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
939
+ min = (1 << 16);
940
+ val = *(unsigned char *)p & 0x07;
941
+ } else {
942
+ goto error;
943
+ }
944
+
945
+ p++;
946
+ CONTINUATION_CHAR;
947
+ two_remaining:
948
+ p++;
949
+ CONTINUATION_CHAR;
950
+ p++;
951
+ CONTINUATION_CHAR;
952
+
953
+ if (val < min)
954
+ goto error;
955
+
956
+ if (!UNICODE_ISVALID(val))
957
+ goto error;
958
+ }
959
+
960
+ continue;
961
+ error:
962
+ return last;
963
+ }
964
+
965
+ return p;
966
+ }
967
+
968
+ static const char *
969
+ fast_validate_len(const char *str, size_t max_len)
970
+ {
971
+ unichar val = 0;
972
+ unichar min = 0;
973
+ const char *p;
974
+
975
+ for (p = str; (size_t)(p - str) < max_len && *p != NUL; p++) {
976
+ if (*(unsigned char *)p < 128)
977
+ continue;
978
+
979
+ const char *last = p;
980
+
981
+ if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
982
+ if (max_len - (p - str) < 2)
983
+ goto error;
984
+
985
+ if ((*(unsigned char *)p & 0x1e) == 0)
986
+ goto error;
987
+ p++;
988
+ if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */
989
+ goto error;
990
+ } else {
991
+ if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
992
+ if (max_len - (p - str) < 3)
993
+ goto error;
994
+
995
+ min = (1 << 11);
996
+ val = *(unsigned char *)p & 0x0f;
997
+ goto two_remaining;
998
+ } else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
999
+ if (max_len - (p - str) < 4)
1000
+ goto error;
1001
+
1002
+ min = (1 << 16);
1003
+ val = *(unsigned char *)p & 0x07;
1004
+ } else {
1005
+ goto error;
1006
+ }
1007
+
1008
+ p++;
1009
+ CONTINUATION_CHAR;
1010
+ two_remaining:
1011
+ p++;
1012
+ CONTINUATION_CHAR;
1013
+ p++;
1014
+ CONTINUATION_CHAR;
1015
+
1016
+ if (val < min)
1017
+ goto error;
1018
+ if (!UNICODE_ISVALID(val))
1019
+ goto error;
1020
+ }
1021
+
1022
+ continue;
1023
+ error:
1024
+ return last;
1025
+ }
1026
+
1027
+ return p;
1028
+ }
1029
+
1030
+
1031
+ /* {{{1
1032
+ * Check if ‘str’ constitutes a valid UTF-8 character sequence.
1033
+ */
1034
+ bool
1035
+ utf_isvalid(const char *str)
1036
+ {
1037
+ const char *p = fast_validate(str);
1038
+
1039
+ return *p == NUL;
1040
+ }
1041
+
1042
+
1043
/* {{{1
 * Check if 'str' constitutes a valid UTF-8 character sequence, examining at
 * most 'max' bytes.  If 'end' is non-NULL it is set to the position where
 * validation stopped: the first offending byte, an embedded NUL byte, or
 * 'str' + 'max'.  Only the last of these counts as valid.
 */
bool
utf_isvalid_n(const char *str, size_t max, const char **end)
{
        const char *stop = fast_validate_len(str, max);
        bool whole_range_ok = (stop == str + max);

        if (end != NULL)
                *end = stop;

        return whole_range_ok;
}
1059
+
1060
+
1061
+ /* {{{1
1062
+ * Check whether ‘c’ is a valid Unicode character.
1063
+ */
1064
+ bool
1065
+ unichar_isvalid(unichar c)
1066
+ {
1067
+ return UNICODE_ISVALID(c);
1068
+ }
1069
+
1070
+
1071
+ /* {{{1
1072
+ * Turn an Unicode character (UTF-32) into an UTF-8 character sequence and
1073
+ * store it in ‘result’, returning the length of the stored sequence.
1074
+ */
1075
+ int
1076
+ unichar_to_utf(unichar c, char *result)
1077
+ {
1078
+ int len = 0;
1079
+ int first;
1080
+
1081
+ if (c < UNI_LEN1) {
1082
+ first = 0;
1083
+ len = 1;
1084
+ } else if (c < UNI_LEN2) {
1085
+ first = 0xc0;
1086
+ len = 2;
1087
+ } else if (c < UNI_LEN3) {
1088
+ first = 0xe0;
1089
+ len = 3;
1090
+ } else if (c < UNI_LEN4) {
1091
+ first = 0xf0;
1092
+ len = 4;
1093
+ } else if (c < UNI_LEN5) {
1094
+ first = 0xf8;
1095
+ len = 5;
1096
+ } else {
1097
+ first = 0xfc;
1098
+ len = 6;
1099
+ }
1100
+
1101
+ if (result != NULL) {
1102
+ for (int i = len - 1; i > 0; i--)
1103
+ c = PUT_X(c, result[i]);
1104
+
1105
+ result[0] = c | first;
1106
+ }
1107
+
1108
+ return len;
1109
+ }
1110
+
1111
+
1112
+ /* {{{1
1113
+ * The real implementation of ucs4_to_utf8() and ucs4_to_utf8_n() below.
1114
+ */
1115
+ static char *
1116
+ ucs4_to_utf8_n_impl(unichar *str, size_t len, bool use_len,
1117
+ size_t *items_read, size_t *items_written)
1118
+ {
1119
+ size_t result_len = 0;
1120
+ char *result = NULL, *p;
1121
+
1122
+ for (size_t i = 0; (!use_len || i < len) && str[i] != NUL; i++) {
1123
+ if (str[i] >= 0x80000000) {
1124
+ if (items_read != NULL)
1125
+ *items_read = i;
1126
+
1127
+ rb_raise(rb_eArgError, "UCS-4 input contains character outside of range for UTF-8 (%lc))", str[i]);
1128
+ }
1129
+
1130
+ result_len += _utf_length(str[i]);
1131
+ }
1132
+
1133
+ p = result = ALLOC_N(char, result_len + 1);
1134
+ size_t i;
1135
+ for (i = 0; p < result + result_len; i++)
1136
+ p += unichar_to_utf(str[i], p);
1137
+ *p = NUL;
1138
+
1139
+ if (items_written != NULL)
1140
+ *items_written = p - result;
1141
+ if (items_read != NULL)
1142
+ *items_read = i;
1143
+
1144
+ return result;
1145
+ }
1146
+
1147
+ /* {{{1
1148
+ * Turn an UTF-32 encoded string into an UTF-8 encoded one. If non-‹NULL›,
1149
+ * store the number of characters read and bytes written in ‘items_read’ and
1150
+ * ‘items_written’ respectivelly.
1151
+ */
1152
+ char *
1153
+ ucs4_to_utf8(unichar *str, size_t *items_read, size_t *items_written)
1154
+ {
1155
+ return ucs4_to_utf8_n_impl(str, 0, false, items_read, items_written);
1156
+ }
1157
+
1158
+ /* {{{1
1159
+ * Turn an UTF-32 encoded string into an UTF-8 encoded one. If non-‹NULL›,
1160
+ * store the number of characters read and bytes written in ‘items_read’ and
1161
+ * ‘items_written’ respectivelly. Examine at most ‘len’ characters from ‘str’.
1162
+ */
1163
+ char *
1164
+ ucs4_to_utf8_n(unichar *str, size_t len, size_t *items_read, size_t *items_written)
1165
+ {
1166
+ return ucs4_to_utf8_n_impl(str, len, true, items_read, items_written);
1167
+ }
1168
+
1169
+
1170
+ /* {{{1
1171
+ * The real implementation of utf8_to_ucs4_fast() and utf8_to_ucs4_fast_n()
1172
+ * below.
1173
+ */
1174
+ static unichar *
1175
+ utf8_to_ucs4_fast_impl(const char *str, size_t len, bool use_len, size_t *items_written)
1176
+ {
1177
+ assert(str != NULL);
1178
+
1179
+ const char *p = str;
1180
+ size_t n = 0;
1181
+ if (use_len) {
1182
+ while (p < str + len && *p != NUL) {
1183
+ p = utf_next(p);
1184
+ n++;
1185
+ }
1186
+ } else {
1187
+ while (p != NUL) {
1188
+ p = utf_next(p);
1189
+ n++;
1190
+ }
1191
+ }
1192
+
1193
+ unichar *result = ALLOC_N(unichar, n + 1);
1194
+ p = str;
1195
+ size_t i;
1196
+ for (i = 0; i < n; i++) {
1197
+ unichar c = ((unsigned char *)p)[0];
1198
+ int c_len;
1199
+
1200
+ if (c < 0x80) {
1201
+ result[i] = c;
1202
+ p++;
1203
+ } else {
1204
+ /* TODO: use _utf_compute() here */
1205
+ if (c < 0xe0) {
1206
+ c_len = 2;
1207
+ c &= 0x1f;
1208
+ } else if (c < 0xf0) {
1209
+ c_len = 3;
1210
+ c &= 0x0f;
1211
+ } else if (c < 0xf8) {
1212
+ c_len = 4;
1213
+ c &= 0x07;
1214
+ } else if (c < 0xfc) {
1215
+ c_len = 5;
1216
+ c &= 0x03;
1217
+ } else {
1218
+ c_len = 6;
1219
+ c &= 0x01;
1220
+ }
1221
+
1222
+ for (int j = 1; j < c_len; j++) {
1223
+ c <<= BIT_X;
1224
+ c |= ((unsigned char *)p)[j] & MASK_X;
1225
+ }
1226
+
1227
+ result[i] = c;
1228
+ p += c_len;
1229
+ }
1230
+ }
1231
+ result[i] = NUL;
1232
+
1233
+ if (items_written != NULL)
1234
+ *items_written = i;
1235
+
1236
+ return result;
1237
+ }
1238
+
1239
+
1240
+ /* {{{1
1241
+ * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1242
+ * the number of characters written in ‘items_written’.
1243
+ */
1244
+ unichar *
1245
+ utf8_to_ucs4_fast(const char *str, size_t *items_written)
1246
+ {
1247
+ return utf8_to_ucs4_fast_impl(str, 0, false, items_written);
1248
+ }
1249
+
1250
+
1251
+ /* {{{1
1252
+ * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1253
+ * the number of characters written in ‘items_written’. Examine at most ‘len’
1254
+ * bytes from ‘str’.
1255
+ */
1256
+ unichar *
1257
+ utf8_to_ucs4_fast_n(const char *str, size_t len, size_t *items_written)
1258
+ {
1259
+ return utf8_to_ucs4_fast_impl(str, len, true, items_written);
1260
+ }
1261
+
1262
+
1263
+ /* {{{1
1264
+ * The real implementation of utf8_to_ucs4() and utf8_to_ucs4_n() below.
1265
+ */
1266
+ static unichar *
1267
+ utf8_to_ucs4_impl(const char *str, size_t len, bool use_len, size_t *items_read, size_t *items_written)
1268
+ {
1269
+ size_t n = 0;
1270
+ const char *p = str;
1271
+ for (; (!use_len || str + len - p > 0) && *p != NUL; p = utf_next(p)) {
1272
+ unichar c = utf_char_n(p, str + len - p);
1273
+ if (c & 0x80000000) {
1274
+ if (c == UTF_INCOMPLETE_INPUT_UNICHAR) {
1275
+ if (items_read != NULL)
1276
+ break;
1277
+
1278
+ rb_raise(rb_eArgError, "partial character sequence in UTF-8 input");
1279
+ } else {
1280
+ rb_raise(rb_eArgError, "UTF-8 input contains character outside of range for UTF-8 (%lc))", c);
1281
+ }
1282
+
1283
+ if (items_read != NULL)
1284
+ *items_read = p - str;
1285
+
1286
+ return NULL;
1287
+ } else {
1288
+ n++;
1289
+ }
1290
+ }
1291
+
1292
+ unichar *result = ALLOC_N(unichar, n + 1);
1293
+ size_t i;
1294
+ for (i = 0, p = str; i < n; i++) {
1295
+ result[i] = utf_char(p);
1296
+ p = utf_next(p);
1297
+ }
1298
+ result[i] = NUL;
1299
+
1300
+ if (items_written != NULL)
1301
+ *items_written = n;
1302
+ if (items_read != NULL)
1303
+ *items_read = p - str;
1304
+
1305
+ return result;
1306
+ }
1307
+
1308
+
1309
+ /* {{{1
1310
+ * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1311
+ * the number of characters written in ‘items_written’. This function does
1312
+ * additional error-checking on the input.
1313
+ */
1314
+ unichar *
1315
+ utf8_to_ucs4(const char *str, size_t *items_read, size_t *items_written)
1316
+ {
1317
+ return utf8_to_ucs4_impl(str, 0, false, items_read, items_written);
1318
+ }
1319
+
1320
+
1321
+ /* {{{1
1322
+ * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1323
+ * the number of characters written in ‘items_written’. Examine at most ‘len’
1324
+ * bytes from ‘str’. This function does additional error-checking on the
1325
+ * input.
1326
+ */
1327
+ unichar *
1328
+ utf8_to_ucs4_n(const char *str, int len, size_t *items_read, size_t *items_written)
1329
+ {
1330
+ return utf8_to_ucs4_impl(str, len, true, items_read, items_written);
1331
+ }
1332
+
1333
+
1334
+ /* }}}1 */