u 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/README +38 -0
  2. data/Rakefile +64 -0
  3. data/ext/encoding/character/utf-8/break.c +25 -0
  4. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  5. data/ext/encoding/character/utf-8/data/character-tables.h +14358 -0
  6. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  7. data/ext/encoding/character/utf-8/data/decompose.h +10926 -0
  8. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1070 -0
  9. data/ext/encoding/character/utf-8/decompose.c +444 -0
  10. data/ext/encoding/character/utf-8/depend +65 -0
  11. data/ext/encoding/character/utf-8/extconf.rb +67 -0
  12. data/ext/encoding/character/utf-8/private.c +62 -0
  13. data/ext/encoding/character/utf-8/private.h +51 -0
  14. data/ext/encoding/character/utf-8/properties.c +1056 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +19 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_private.h +52 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  19. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  20. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  22. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  23. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  24. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  25. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  26. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  27. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  28. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  29. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  30. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  31. data/ext/encoding/character/utf-8/rb_utf_insert.c +48 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +332 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  35. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  36. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  37. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  38. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  39. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  40. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  41. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  43. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  44. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  45. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  46. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  47. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  48. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  49. data/ext/encoding/character/utf-8/tables.h +38 -0
  50. data/ext/encoding/character/utf-8/unicode.c +319 -0
  51. data/ext/encoding/character/utf-8/unicode.h +216 -0
  52. data/ext/encoding/character/utf-8/utf.c +1334 -0
  53. data/lib/encoding/character/utf-8.rb +201 -0
  54. data/lib/u.rb +16 -0
  55. data/lib/u/string.rb +185 -0
  56. data/lib/u/version.rb +5 -0
  57. data/test/unit/u.rb +5 -0
  58. data/test/unit/u/string.rb +91 -0
  59. metadata +174 -0
@@ -0,0 +1,444 @@
1
+ /*
2
+ * contents: Unicode decomposition handling.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+ #include <ruby.h>
8
+ #include <stdbool.h>
9
+ #include <stddef.h>
10
+ #include <stdint.h>
11
+
12
+ #include "unicode.h"
13
+
14
+ #include "data/decompose.h"
15
+ #include "data/compose.h"
16
+
17
+ #include "private.h"
18
+
19
/* Look up the combining class of ‘c’ through the two-part combining class
 * page tables.  The trailing 0 is presumably the value used for characters
 * that the tables do not cover (see SPLIT_UNICODE_TABLE_LOOKUP). */
#define COMBINING_CLASS(c)                                      \
	SPLIT_UNICODE_TABLE_LOOKUP(cclass_data,                 \
				   combining_class_table_part1, \
				   combining_class_table_part2, \
				   (c),                         \
				   0)
21
+
22
+
23
/* {{{1
 * Hangul syllable [de]composition constants.
 *
 * A precomposed syllable is addressed as an index relative to ‘SBase’; that
 * index is split into a leading (L), vowel (V) and trailing (T) part using
 * the counts below.
 */
enum {
	/* First code point of each range. */
	SBase = 0xac00,		/* precomposed syllables */
	LBase = 0x1100,		/* leading consonants */
	VBase = 0x1161,		/* vowels */
	TBase = 0x11a7,		/* trailing consonants; T == TBase means “none” */

	/* Number of values each part can take. */
	LCount = 19,
	VCount = 21,
	TCount = 28,

	/* Derived sizes: syllables per leading consonant, syllables in
	 * total, and the last precomposed syllable. */
	NCount = VCount * TCount,
	SCount = LCount * NCount,
	SLast = SBase + SCount - 1
};
38
+
39
+
40
+ /* {{{1
41
+ * Return the combinging class of ‘c’.
42
+ */
43
+ int
44
+ unichar_combining_class(unichar c)
45
+ {
46
+ return COMBINING_CLASS(c);
47
+ }
48
+
49
+
50
+ /* {{{1
51
+ * Rearrange ‘str’ so that decomposed characters are arranged according to
52
+ * their combining class. Do this for at most ‘len’ characters of data.
53
+ */
54
+ static void
55
+ unicode_canonical_ordering_swap(unichar *str, size_t offset, int next, bool *swapped)
56
+ {
57
+ size_t initial = offset + 1;
58
+
59
+ size_t j = initial;
60
+ while (j > 0 && COMBINING_CLASS(str[j - 1]) > next) {
61
+ unichar c = str[j];
62
+ str[j] = str[j - 1];
63
+ str[j - 1] = c;
64
+ j--;
65
+ }
66
+
67
+ *swapped = *swapped || (j != initial);
68
+ }
69
+
70
+ static bool
71
+ unicode_canonical_ordering_reorder(unichar *str, size_t len)
72
+ {
73
+ bool swapped = false;
74
+
75
+ int prev = COMBINING_CLASS(str[0]);
76
+ for (size_t i = 0; i < len - 1; i++) {
77
+ int next = COMBINING_CLASS(str[i + 1]);
78
+
79
+ if (next != 0 && prev > next)
80
+ unicode_canonical_ordering_swap(str, i, next, &swapped);
81
+ else
82
+ prev = next;
83
+ }
84
+
85
+ return swapped;
86
+ }
87
+
88
+ void
89
+ unicode_canonical_ordering(unichar *str, size_t len)
90
+ {
91
+ while (unicode_canonical_ordering_reorder(str, len))
92
+ ; /* This loop intentionally left empty. */
93
+ }
94
+
95
+
96
+ /* {{{1
97
+ * Decompose the character ‘s’ according to the rules outlined in
98
+ * http://www.unicode.org/unicode/reports/tr15/#Hangul. ‘r’ should be ‹NULL›
99
+ * or of sufficient length to store the decomposition of ‘s’. Returns the
100
+ * number of characters stored (or would be if it were non-NULL) in R.
101
+ */
102
+ static size_t
103
+ decompose_hangul(unichar s, unichar *r)
104
+ {
105
+ int SIndex = s - SBase;
106
+
107
+ /* If this isn’t a Hangul symbol, then simply return it unchanged. */
108
+ if (SIndex < 0 || SIndex >= SCount) {
109
+ if (r != NULL)
110
+ r[0] = s;
111
+ return 1;
112
+ }
113
+
114
+ unichar L = LBase + SIndex / NCount;
115
+ unichar V = VBase + (SIndex % NCount) / TCount;
116
+ unichar T = TBase + SIndex % TCount;
117
+
118
+ if (r != NULL) {
119
+ r[0] = L;
120
+ r[1] = V;
121
+ }
122
+
123
+ if (T == TBase)
124
+ return 2;
125
+
126
+ if (r != NULL)
127
+ r[2] = T;
128
+
129
+ return 3;
130
+ }
131
+
132
+
133
+ /* {{{1
134
+ * Search the Unicode decomposition table for ‘c’ and depending on the boolean
135
+ * value of ‘compat’, return its compatibility or canonical decomposition if
136
+ * found.
137
+ */
138
+ static const char *
139
+ get_decomposition(int index, bool compat)
140
+ {
141
+ int offset;
142
+
143
+ if (compat) {
144
+ offset = decomp_table[index].compat_offset;
145
+ if (offset == UNICODE_NOT_PRESENT_OFFSET)
146
+ offset = decomp_table[index].canon_offset;
147
+ /* Either .compat_offset or .canon_offset can be missing, but
148
+ * not both. */
149
+ } else {
150
+ offset = decomp_table[index].canon_offset;
151
+ if (offset == UNICODE_NOT_PRESENT_OFFSET)
152
+ return NULL;
153
+ }
154
+
155
+ return &decomp_expansion_string[offset];
156
+ }
157
+
158
+ static const char *
159
+ find_decomposition(unichar c, bool compat)
160
+ {
161
+ int index;
162
+
163
+ if (!unicode_table_lookup(decomp_table, c, &index))
164
+ return NULL;
165
+
166
+ return get_decomposition(index, compat);
167
+ }
168
+
169
+
170
+ /* {{{1
171
+ * Copy over the UTF-8 decomposition in ‘decomposition’ to the unichar buffer
172
+ * ‘chars’. Return the number of unichars in ‘chars’.
173
+ */
174
+ static size_t
175
+ decomposition_to_wc(const char *decomposition, unichar *chars)
176
+ {
177
+ size_t i = 0;
178
+ for (const char *p = decomposition; *p != '\0'; p = utf_next(p))
179
+ chars[i++] = utf_char(p);
180
+
181
+ return i;
182
+ }
183
+
184
+
185
+ /* {{{1
186
+ * Generate the canonical decomposition of ‘c’. The length of the
187
+ * decomposition is stored in ‘len’.
188
+ */
189
+ /* TODO: clean this up. */
190
+ unichar *
191
+ unicode_canonical_decomposition(unichar c, size_t *len)
192
+ {
193
+ const char *decomp;
194
+ unichar *r;
195
+
196
+ /* Hangul syllable */
197
+ if (c >= SBase && c <= SLast) {
198
+ *len = decompose_hangul(c, NULL);
199
+ r = ALLOC_N(unichar, *len);
200
+ decompose_hangul(c, r);
201
+ } else if ((decomp = find_decomposition(c, false)) != NULL) {
202
+ *len = utf_length(decomp);
203
+ r = ALLOC_N(unichar, *len);
204
+ decomposition_to_wc(decomp, r);
205
+ } else {
206
+ r = ALLOC(unichar);
207
+ *r = c;
208
+ *len = 1;
209
+ }
210
+
211
+ /* Supposedly following the Unicode 2.1.9 table means that the
212
+ * decompositions come out in canonical order. I haven't tested this,
213
+ * but we rely on it here. */
214
+ return r;
215
+ }
216
+
217
+
218
+ /* {{{1
219
+ * Combine Hangul characters ‘a’ and ‘b’ if possible, and store the result in
220
+ * ‘result’. The combinations tried are L,V => LV and LV,T => LVT in that
221
+ * order.
222
+ */
223
+ static bool
224
+ combine_hangul(unichar a, unichar b, unichar *result)
225
+ {
226
+ int LIndex = a - LBase;
227
+ int VIndex = b - VBase;
228
+
229
+ if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount) {
230
+ *result = SBase + (LIndex * VCount + VIndex) * TCount;
231
+ return true;
232
+ }
233
+
234
+ int SIndex = a - SBase;
235
+ int TIndex = b - TBase;
236
+
237
+ if (0 <= SIndex && SIndex < SCount &&
238
+ (SIndex % TCount) == 0 &&
239
+ 0 < TIndex && TIndex < TCount) {
240
+ *result = a + TIndex;
241
+ return true;
242
+ }
243
+
244
+ return false;
245
+ }
246
+
247
+
248
+ /* {{{1
249
+ * Try to combine the Unicode characters ‘a’ and ‘b’ storing the result in
250
+ * ‘result’.
251
+ */
252
+ static uint16_t
253
+ compose_index(unichar c)
254
+ {
255
+ unsigned int page = c >> 8;
256
+
257
+ if (page > COMPOSE_TABLE_LAST)
258
+ return 0;
259
+
260
+ return SPLIT_UNICODE_TABLE_LOOKUP_PAGE(compose_data, compose_table, page, c);
261
+ }
262
+
263
+ static bool
264
+ lookup_compose(const uint16_t table[][2], uint16_t index, unichar c,
265
+ unichar *result)
266
+ {
267
+ if (c != table[index][0])
268
+ return false;
269
+
270
+ *result = table[index][1];
271
+
272
+ return true;
273
+ }
274
+
275
+ static bool
276
+ combine(unichar a, unichar b, unichar *result)
277
+ {
278
+ if (combine_hangul(a, b, result))
279
+ return true;
280
+
281
+ uint16_t index_a = compose_index(a);
282
+ if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
283
+ return lookup_compose(compose_first_single,
284
+ index_a - COMPOSE_FIRST_SINGLE_START,
285
+ b,
286
+ result);
287
+
288
+ uint16_t index_b = compose_index(b);
289
+ if (index_b >= COMPOSE_SECOND_SINGLE_START)
290
+ return lookup_compose(compose_second_single,
291
+ index_b - COMPOSE_SECOND_SINGLE_START,
292
+ a,
293
+ result);
294
+
295
+ if (index_a >= COMPOSE_FIRST_START &&
296
+ index_a < COMPOSE_FIRST_SINGLE_START &&
297
+ index_b >= COMPOSE_SECOND_START &&
298
+ index_b < COMPOSE_SECOND_SINGLE_START) {
299
+ unichar r = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
300
+
301
+ if (r != 0) {
302
+ *result = r;
303
+ return true;
304
+ }
305
+ }
306
+
307
+ return false;
308
+ }
309
+
310
+
311
+ /* {{{1
312
+ * Normalize (compose/decompose) characters in ‘str’ so that strings that
313
+ * actually contain the same characters will be recognized as equal for
314
+ * comparison for example.
315
+ */
316
+ static size_t
317
+ normalize_wc_decompose_one(unichar c, NormalizeMode mode, unichar *buf)
318
+ {
319
+ bool do_compat = (mode == NORMALIZE_NFKC || mode == NORMALIZE_NFKD);
320
+ const char *decomp = find_decomposition(c, do_compat);
321
+
322
+ if (decomp == NULL) {
323
+ if (buf != NULL)
324
+ buf[0] = c;
325
+ return 1;
326
+ }
327
+
328
+ if (buf != NULL)
329
+ return decomposition_to_wc(decomp, buf);
330
+
331
+ return utf_length(decomp);
332
+ }
333
+
334
+ static void
335
+ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
336
+ NormalizeMode mode, unichar *buf, size_t *buf_len)
337
+ {
338
+ size_t n = 0;
339
+ size_t prev_start = 0;
340
+ const char *end = str + max_len;
341
+ for (const char *p = str; (!use_len || p < end) && *p != NUL; p = utf_next(p)) {
342
+ unichar c = utf_char(p);
343
+ size_t prev_n = n;
344
+
345
+ unichar *base = (buf != NULL) ? buf + n : NULL;
346
+ if (c >= SBase && c <= SLast)
347
+ n += decompose_hangul(c, base);
348
+ else
349
+ n += normalize_wc_decompose_one(c, mode, base);
350
+
351
+ if (buf != NULL && n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
352
+ unicode_canonical_ordering(buf + prev_start,
353
+ n - prev_start);
354
+ prev_start = prev_n;
355
+ }
356
+ }
357
+
358
+ if (buf != NULL && n > 0)
359
+ unicode_canonical_ordering(buf + prev_start, n - prev_start);
360
+
361
+ if (buf != NULL)
362
+ buf[n] = NUL;
363
+
364
+ *buf_len = n;
365
+ }
366
+
367
+ static unichar *
368
+ normalize_wc_compose(unichar *buf, size_t len)
369
+ {
370
+ int new_len = len;
371
+ size_t prev_start = 0;
372
+ int prev_cc = 0;
373
+
374
+ for (size_t i = 0; i < len; i++) {
375
+ int cc = COMBINING_CLASS(buf[i]);
376
+ size_t j = i - (len - new_len);
377
+
378
+ if (j > 0 && (prev_cc == 0 || prev_cc < cc) &&
379
+ combine(buf[prev_start], buf[i], &buf[prev_start])) {
380
+ new_len--;
381
+ prev_cc = (j + 1 == prev_start) ?
382
+ 0 : COMBINING_CLASS(buf[j - 1]);
383
+ } else {
384
+ if (cc == 0)
385
+ prev_start = j;
386
+
387
+ buf[j] = buf[i];
388
+ prev_cc = cc;
389
+ }
390
+ }
391
+
392
+ buf[new_len] = NUL;
393
+
394
+ return buf;
395
+ }
396
+
397
+ unichar *
398
+ _utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
399
+ {
400
+ size_t n;
401
+ normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
402
+ unichar *buf = ALLOC_N(unichar, n + 1);
403
+ normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
404
+
405
+ /* Just return if we don’t want composition. */
406
+ if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
407
+ return buf;
408
+
409
+ return normalize_wc_compose(buf, n);
410
+ }
411
+
412
+
413
+ /* {{{1
414
+ * Normalize (compose/decompose) characters in ‘str˚ so that strings that
415
+ * actually contain the same characters will be recognized as equal for
416
+ * comparison for example.
417
+ */
418
+ char *
419
+ utf_normalize(const char *str, NormalizeMode mode)
420
+ {
421
+ unichar *wcs = _utf_normalize_wc(str, 0, false, mode);
422
+ char *utf = ucs4_to_utf8(wcs, NULL, NULL);
423
+
424
+ free(wcs);
425
+ return utf;
426
+ }
427
+
428
+
429
+ /* {{{1
430
+ * This function is the same as utf_normalize() except that at most ‘len˚
431
+ * bytes are normalized from ‘str’.
432
+ */
433
+ char *
434
+ utf_normalize_n(const char *str, NormalizeMode mode, size_t len)
435
+ {
436
+ unichar *wcs = _utf_normalize_wc(str, len, true, mode);
437
+ char *utf = ucs4_to_utf8(wcs, NULL, NULL);
438
+
439
+ free(wcs);
440
+ return utf;
441
+ }
442
+
443
+
444
+ /* }}}1 */
@@ -0,0 +1,65 @@
1
+ break.o: break.c unicode.h data/break.h
2
+ decompose.o: decompose.c unicode.h private.h data/decompose.h \
3
+ data/compose.h
4
+ private.o: private.c private.h
5
+ properties.o: properties.c unicode.h private.h data/character-tables.h
6
+ rb_utf_aref.o: rb_utf_aref.c rb_includes.h unicode.h private.h \
7
+ rb_methods.h
8
+ rb_utf_aset.o: rb_utf_aset.c rb_includes.h unicode.h private.h \
9
+ rb_methods.h
10
+ rb_utf_casecmp.o: rb_utf_casecmp.c rb_includes.h unicode.h private.h \
11
+ rb_methods.h
12
+ rb_utf_chomp.o: rb_utf_chomp.c rb_includes.h unicode.h private.h \
13
+ rb_methods.h
14
+ rb_utf_chop.o: rb_utf_chop.c rb_includes.h unicode.h private.h \
15
+ rb_methods.h
16
+ rb_utf_collate.o: rb_utf_collate.c rb_includes.h unicode.h private.h \
17
+ rb_methods.h
18
+ rb_utf_count.o: rb_utf_count.c rb_includes.h unicode.h private.h \
19
+ rb_methods.h rb_utf_internal_tr.h
20
+ rb_utf_delete.o: rb_utf_delete.c rb_includes.h unicode.h private.h \
21
+ rb_methods.h rb_utf_internal_tr.h
22
+ rb_utf_downcase.o: rb_utf_downcase.c rb_includes.h unicode.h private.h \
23
+ rb_methods.h
24
+ rb_utf_each_char.o: rb_utf_each_char.c rb_includes.h unicode.h private.h \
25
+ rb_methods.h
26
+ rb_utf_foldcase.o: rb_utf_foldcase.c rb_includes.h unicode.h private.h \
27
+ rb_methods.h
28
+ rb_utf_hex.o: rb_utf_hex.c rb_includes.h unicode.h private.h rb_methods.h \
29
+ rb_utf_internal_bignum.h
30
+ rb_utf_index.o: rb_utf_index.c rb_includes.h unicode.h private.h \
31
+ rb_methods.h
32
+ rb_utf_insert.o: rb_utf_insert.c rb_includes.h unicode.h private.h \
33
+ rb_methods.h
34
+ rb_utf_internal_bignum.o: rb_utf_internal_bignum.c rb_includes.h \
35
+ unicode.h private.h rb_methods.h rb_utf_internal_bignum.h
36
+ rb_utf_internal_tr.o: rb_utf_internal_tr.c rb_includes.h unicode.h \
37
+ private.h rb_methods.h rb_utf_internal_tr.h
38
+ rb_utf_justify.o: rb_utf_justify.c rb_includes.h unicode.h private.h \
39
+ rb_methods.h
40
+ rb_utf_length.o: rb_utf_length.c rb_includes.h unicode.h private.h \
41
+ rb_methods.h
42
+ rb_utf_lstrip.o: rb_utf_lstrip.c rb_includes.h unicode.h private.h \
43
+ rb_methods.h
44
+ rb_utf_normalize.o: rb_utf_normalize.c rb_includes.h unicode.h private.h \
45
+ rb_methods.h
46
+ rb_utf_oct.o: rb_utf_oct.c rb_includes.h unicode.h private.h rb_methods.h \
47
+ rb_utf_internal_bignum.h
48
+ rb_utf_reverse.o: rb_utf_reverse.c rb_includes.h unicode.h private.h \
49
+ rb_methods.h
50
+ rb_utf_rindex.o: rb_utf_rindex.c rb_includes.h unicode.h private.h \
51
+ rb_methods.h
52
+ rb_utf_rstrip.o: rb_utf_rstrip.c rb_includes.h unicode.h private.h \
53
+ rb_methods.h
54
+ rb_utf_squeeze.o: rb_utf_squeeze.c rb_includes.h unicode.h private.h \
55
+ rb_methods.h rb_utf_internal_tr.h
56
+ rb_utf_strip.o: rb_utf_strip.c rb_includes.h unicode.h private.h \
57
+ rb_methods.h
58
+ rb_utf_to_i.o: rb_utf_to_i.c rb_includes.h unicode.h private.h \
59
+ rb_methods.h rb_utf_internal_bignum.h
60
+ rb_utf_tr.o: rb_utf_tr.c rb_includes.h unicode.h private.h rb_methods.h \
61
+ rb_utf_internal_tr.h
62
+ rb_utf_upcase.o: rb_utf_upcase.c rb_includes.h unicode.h private.h \
63
+ rb_methods.h
64
+ unicode.o: unicode.c unicode.h private.h rb_methods.h
65
+ utf.o: utf.c unicode.h private.h