character-encodings 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,476 @@
1
+ /*
2
+ * contents: Unicode decomposition handling.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+ #include <ruby.h>
8
+ #include <stdbool.h>
9
+ #include <stddef.h>
10
+ #include <stdint.h>
11
+ #include "unicode.h"
12
+ #include "private.h"
13
+ #include "data/decompose.h"
14
+ #include "data/compose.h"
15
+
16
+
17
+ /* {{{1
18
+ * Macros for accessing the combining class property tables for a given
19
+ * character.
20
+ *
21
+ * TODO: Turn these macros into full-fledged functions, as this is rather silly
22
+ * when we have ‹inline› in C99.
23
+ */
24
/* Part-1 lookup.  A page entry below UNICODE_MAX_TABLE_INDEX selects a row of
 * ‘cclass_data’ that is indexed by ‘low’; any other entry stands for the
 * whole page and yields (entry - UNICODE_MAX_TABLE_INDEX). */
#define CC_PART1(page_no, low) \
	((combining_class_table_part1[page_no] < UNICODE_MAX_TABLE_INDEX) \
	 ? (cclass_data[combining_class_table_part1[page_no]][low]) \
	 : (combining_class_table_part1[page_no] - UNICODE_MAX_TABLE_INDEX))

/* Part-2 lookup; same scheme as CC_PART1 but against the second page table. */
#define CC_PART2(page_no, low) \
	((combining_class_table_part2[page_no] < UNICODE_MAX_TABLE_INDEX) \
	 ? (cclass_data[combining_class_table_part2[page_no]][low]) \
	 : (combining_class_table_part2[page_no] - UNICODE_MAX_TABLE_INDEX))

/* Combining class of ‘ch’: part 1 covers everything up to
 * UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000..UNICODE_LAST_CHAR, and
 * anything else is class 0.  Note that ‘ch’ is evaluated more than once. */
#define COMBINING_CLASS(ch) \
	(((ch) <= UNICODE_LAST_CHAR_PART1) \
	 ? CC_PART1((ch) >> 8, (ch) & 0xff) \
	 : (((ch) >= 0xe0000 && (ch) <= UNICODE_LAST_CHAR) \
	    ? CC_PART2(((ch) - 0xe0000) >> 8, (ch) & 0xff) \
	    : 0))
40
+
41
+
42
+ /* {{{1
43
+ * Hangul syllable [de]composition constants. A lot of work I'd say.
44
+ */
45
enum {
	/* First code point of each range used by the Hangul routines. */
	SBase = 0xac00,
	LBase = 0x1100,
	VBase = 0x1161,
	/* One below the first real T value: T == TBase means “no T part”. */
	TBase = 0x11a7,

	/* Number of values in the L, V and T ranges (T includes “none”). */
	LCount = 19,
	VCount = 21,
	TCount = 28,

	/* Derived sizes: syllables per L value, and syllables in total. */
	NCount = VCount * TCount,
	SCount = LCount * NCount,

	/* Last code point of the syllable range. */
	SLast = SBase + SCount - 1
};
57
+
58
+
59
+ /* {{{1
60
+ * Return the combinging class of ‘c’.
61
+ */
62
+ inline int
63
+ _unichar_combining_class(unichar c)
64
+ {
65
+ return COMBINING_CLASS(c);
66
+ }
67
+
68
+
69
+ /* {{{1
70
+ * Rearrange ‘str’ so that decomposed characters are arranged according to
71
+ * their combining class. Do this for at most ‘len’ characters of data.
72
+ */
73
+ static void
74
+ unicode_canonical_ordering_swap(unichar *str, size_t offset, int next, bool *swapped)
75
+ {
76
+ size_t initial = offset + 1;
77
+
78
+ size_t j = initial;
79
+ while (j > 0 && COMBINING_CLASS(str[j - 1]) > next) {
80
+ unichar c = str[j];
81
+ str[j] = str[j - 1];
82
+ str[j - 1] = c;
83
+ j--;
84
+ }
85
+
86
+ *swapped = *swapped || (j != initial);
87
+ }
88
+
89
+ static bool
90
+ unicode_canonical_ordering_reorder(unichar *str, size_t len)
91
+ {
92
+ bool swapped = false;
93
+
94
+ int prev = COMBINING_CLASS(str[0]);
95
+ for (size_t i = 0; i < len - 1; i++) {
96
+ int next = COMBINING_CLASS(str[i + 1]);
97
+
98
+ if (next != 0 && prev > next)
99
+ unicode_canonical_ordering_swap(str, i, next, &swapped);
100
+ else
101
+ prev = next;
102
+ }
103
+
104
+ return swapped;
105
+ }
106
+
107
+ void
108
+ unicode_canonical_ordering(unichar *str, size_t len)
109
+ {
110
+ while (unicode_canonical_ordering_reorder(str, len))
111
+ ; /* This loop intentionally left empty. */
112
+ }
113
+
114
+
115
+ /* {{{1
116
+ * Decompose the character ‘s’ according to the rules outlined in
117
+ * http://www.unicode.org/unicode/reports/tr15/#Hangul. ‘r’ should be ‹NULL›
118
+ * or of sufficient length to store the decomposition of ‘s’. The number of
119
+ * characters stored (or would be if it were non-‹NULL›) in ‘r’ is put in
120
+ * ‘r_len’.
121
+ */
122
+ static void
123
+ decompose_hangul(unichar s, unichar *r, size_t *r_len)
124
+ {
125
+ int SIndex = s - SBase;
126
+
127
+ /* If this isn’t a Hangul symbol, then simply return it unchanged. */
128
+ if (SIndex < 0 || SIndex >= SCount) {
129
+ if (r != NULL)
130
+ r[0] = s;
131
+ *r_len = 1;
132
+ return;
133
+ }
134
+
135
+ unichar L = LBase + SIndex / NCount;
136
+ unichar V = VBase + (SIndex % NCount) / TCount;
137
+ unichar T = TBase + SIndex % TCount;
138
+
139
+ if (r != NULL) {
140
+ r[0] = L;
141
+ r[1] = V;
142
+ }
143
+
144
+ if (T != TBase) {
145
+ if (r != NULL)
146
+ r[2] = T;
147
+ *r_len = 3;
148
+ } else {
149
+ *r_len = 2;
150
+ }
151
+ }
152
+
153
+
154
+ /* {{{1
155
+ * Search the Unicode decomposition table for ‘c’ and depending on the boolean
156
+ * value of ‘compat’, return its compatibility or canonical decomposition if
157
+ * found.
158
+ */
159
+ static const char *
160
+ get_decomposition(int index, bool compat)
161
+ {
162
+ int offset;
163
+
164
+ if (compat) {
165
+ offset = decomp_table[index].compat_offset;
166
+ if (offset == UNICODE_NOT_PRESENT_OFFSET)
167
+ offset = decomp_table[index].canon_offset;
168
+ /* Either .compat_offset or .canon_offset can be missing, but
169
+ * not both. */
170
+ } else {
171
+ offset = decomp_table[index].canon_offset;
172
+ if (offset == UNICODE_NOT_PRESENT_OFFSET)
173
+ return NULL;
174
+ }
175
+
176
+ return &decomp_expansion_string[offset];
177
+ }
178
+
179
+ static const char *
180
+ find_decomposition(unichar c, bool compat)
181
+ {
182
+ int begin = 0;
183
+ int end = lengthof(decomp_table);
184
+
185
+ if (c < decomp_table[begin].ch || c > decomp_table[end - 1].ch)
186
+ return NULL;
187
+
188
+ while (true) {
189
+ int middle = (begin + end) / 2;
190
+
191
+ if (c == decomp_table[middle].ch)
192
+ return get_decomposition(middle, compat);
193
+ else if (middle == begin)
194
+ break;
195
+ else if (c > decomp_table[middle].ch)
196
+ begin = middle;
197
+ else
198
+ end = middle;
199
+ }
200
+
201
+ return NULL;
202
+ }
203
+
204
+
205
+ /* {{{1
206
+ * Generate the canonical decomposition of ‘c’. The length of the
207
+ * decomposition is stored in ‘len’.
208
+ */
209
+ /* TODO: clean this up. */
210
+ unichar *
211
+ unicode_canonical_decomposition(unichar c, size_t *len)
212
+ {
213
+ const char *decomp;
214
+ unichar *r;
215
+
216
+ /* Hangul syllable */
217
+ if (c >= SBase && c <= SLast) {
218
+ decompose_hangul(c, NULL, len);
219
+ r = ALLOC_N(unichar, *len);
220
+ decompose_hangul(c, r, len);
221
+ } else if ((decomp = find_decomposition(c, false)) != NULL) {
222
+ *len = utf_length(decomp);
223
+ r = ALLOC_N(unichar, *len);
224
+
225
+ int i;
226
+ const char *p;
227
+ for (p = decomp, i = 0; *p != NUL; p = utf_next(p), i++)
228
+ r[i] = utf_char(p);
229
+ } else {
230
+ r = ALLOC(unichar);
231
+ *r = c;
232
+ *len = 1;
233
+ }
234
+
235
+ /* Supposedly following the Unicode 2.1.9 table means that the
236
+ * decompositions come out in canonical order. I haven't tested this,
237
+ * but we rely on it here. */
238
+ return r;
239
+ }
240
+
241
+
242
+ /* {{{1
243
+ * Combine Hangul characters ‘a’ and ‘b’ if possible, and store the result in
244
+ * ‘result’. The combinations tried are L,V => LV and LV,T => LVT in that
245
+ * order.
246
+ */
247
+ static bool
248
+ combine_hangul(unichar a, unichar b, unichar *result)
249
+ {
250
+ int LIndex = a - LBase;
251
+ int VIndex = b - VBase;
252
+
253
+ if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount) {
254
+ *result = SBase + (LIndex * VCount + VIndex) * TCount;
255
+ return true;
256
+ }
257
+
258
+ int SIndex = a - SBase;
259
+ int TIndex = b - TBase;
260
+
261
+ if (0 <= SIndex && SIndex < SCount &&
262
+ (SIndex % TCount) == 0 &&
263
+ 0 < TIndex && TIndex < TCount) {
264
+ *result = a + TIndex;
265
+ return true;
266
+ }
267
+
268
+ return false;
269
+ }
270
+
271
+
272
+ /* {{{1
273
+ * Try to combine the Unicode characters ‘a’ and ‘b’ storing the result in
274
+ * ‘result’.
275
+ */
276
+ static uint16_t
277
+ compose_index(unichar c)
278
+ {
279
+ unsigned int page = c >> 8;
280
+
281
+ if (page > COMPOSE_TABLE_LAST)
282
+ return 0;
283
+
284
+ /* TODO: why is this signed, exactly? */
285
+ int16_t compose_offset = compose_table[page];
286
+ return (compose_offset >= UNICODE_MAX_TABLE_INDEX) ?
287
+ compose_offset - UNICODE_MAX_TABLE_INDEX :
288
+ compose_data[compose_offset][c & 0xff];
289
+ }
290
+
291
+ static bool
292
+ lookup_compose(const uint16_t table[][2], uint16_t index, unichar c,
293
+ unichar *result)
294
+ {
295
+ if (c == table[index][0]) {
296
+ *result = table[index][1];
297
+ return true;
298
+ }
299
+
300
+ return false;
301
+ }
302
+
303
+ static bool
304
+ combine(unichar a, unichar b, unichar *result)
305
+ {
306
+ if (combine_hangul(a, b, result))
307
+ return true;
308
+
309
+ uint16_t index_a = compose_index(a);
310
+ if (index_a >= COMPOSE_FIRST_SINGLE_START &&
311
+ index_a < COMPOSE_SECOND_START) {
312
+ return lookup_compose(compose_first_single,
313
+ index_a - COMPOSE_FIRST_SINGLE_START,
314
+ b,
315
+ result);
316
+ }
317
+
318
+ uint16_t index_b = compose_index(b);
319
+ if (index_b >= COMPOSE_SECOND_SINGLE_START) {
320
+ return lookup_compose(compose_second_single,
321
+ index_b - COMPOSE_SECOND_SINGLE_START,
322
+ a,
323
+ result);
324
+ }
325
+
326
+ if (index_a >= COMPOSE_FIRST_START &&
327
+ index_a < COMPOSE_FIRST_SINGLE_START &&
328
+ index_b >= COMPOSE_SECOND_START &&
329
+ index_b < COMPOSE_SECOND_SINGLE_START) {
330
+ unichar r = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
331
+
332
+ if (r != 0) {
333
+ *result = r;
334
+ return true;
335
+ }
336
+ }
337
+
338
+ return false;
339
+ }
340
+
341
+
342
+ /* {{{1
343
+ * Normalize (compose/decompose) characters in ‘str’ so that strings that
344
+ * actually contain the same characters will be recognized as equal for
345
+ * comparison for example.
346
+ */
347
+ static size_t
348
+ normalize_wc_decompose_one(unichar c, NormalizeMode mode, unichar *buf)
349
+ {
350
+ bool do_compat = (mode == NORMALIZE_NFKC || mode == NORMALIZE_NFKD);
351
+ const char *decomp = find_decomposition(c, do_compat);
352
+
353
+ if (decomp == NULL) {
354
+ if (buf != NULL)
355
+ buf[0] = c;
356
+ return 1;
357
+ }
358
+
359
+ if (buf != NULL) {
360
+ int i;
361
+ for (i = 0; *decomp != NUL; decomp = utf_next(decomp), i++)
362
+ buf[i] = utf_char(decomp);
363
+ return i;
364
+ }
365
+
366
+ return utf_length(decomp);
367
+ }
368
+
369
+ static void
370
+ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
371
+ NormalizeMode mode, unichar *buf, size_t *buf_len)
372
+ {
373
+ size_t n = 0;
374
+ size_t prev_start = 0;
375
+ const char *end = str + max_len;
376
+ for (const char *p = str; (!use_len || p < end) && *p != NUL; p = utf_next(p)) {
377
+ unichar c = utf_char(p);
378
+ size_t prev_n = n;
379
+
380
+ unichar *base = (buf != NULL) ? buf + n : NULL;
381
+ if (c >= SBase && c <= SLast) {
382
+ size_t len;
383
+
384
+ decompose_hangul(c, base, &len);
385
+ n += len;
386
+ } else {
387
+ n += normalize_wc_decompose_one(c, mode, base);
388
+ }
389
+
390
+ if (buf != NULL && n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
391
+ unicode_canonical_ordering(buf + prev_start,
392
+ n - prev_start);
393
+ prev_start = prev_n;
394
+ }
395
+ }
396
+
397
+ if (buf != NULL && n > 0)
398
+ unicode_canonical_ordering(buf + prev_start, n - prev_start);
399
+
400
+ if (buf != NULL)
401
+ buf[n] = NUL;
402
+
403
+ *buf_len = n;
404
+ }
405
+
406
+ unichar *
407
+ _utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
408
+ {
409
+ size_t n;
410
+ normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
411
+ unichar *buf = ALLOC_N(unichar, n + 1);
412
+ normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
413
+
414
+ /* Just return if we don’t want composition. */
415
+ if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
416
+ return buf;
417
+
418
+ size_t prev_start = 0;
419
+ int prev_cc = 0;
420
+ for (size_t i = 0; i < n; i++) {
421
+ int cc = COMBINING_CLASS(buf[i]);
422
+
423
+ if (i > 0 && (prev_cc == 0 || prev_cc < cc) &&
424
+ combine(buf[prev_start], buf[i], &buf[prev_start])) {
425
+ for (size_t j = i + 1; j < n; j++)
426
+ buf[j - 1] = buf[j];
427
+
428
+ n--;
429
+ i--;
430
+ prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]);
431
+ } else {
432
+ if (cc == 0)
433
+ prev_start = i;
434
+
435
+ prev_cc = cc;
436
+ }
437
+ }
438
+
439
+ buf[n] = NUL;
440
+
441
+ return buf;
442
+ }
443
+
444
+
445
+ /* {{{1
446
+ * Normalize (compose/decompose) characters in ‘str˚ so that strings that
447
+ * actually contain the same characters will be recognized as equal for
448
+ * comparison for example.
449
+ */
450
+ char *
451
+ utf_normalize(const char *str, NormalizeMode mode)
452
+ {
453
+ unichar *wcs = _utf_normalize_wc(str, 0, false, mode);
454
+ char *utf = ucs4_to_utf8(wcs, NULL, NULL);
455
+
456
+ free(wcs);
457
+ return utf;
458
+ }
459
+
460
+
461
+ /* {{{1
462
+ * This function is the same as utf_normalize() except that at most ‘len˚
463
+ * bytes are normalized from ‘str’.
464
+ */
465
+ char *
466
+ utf_normalize_n(const char *str, NormalizeMode mode, size_t len)
467
+ {
468
+ unichar *wcs = _utf_normalize_wc(str, len, true, mode);
469
+ char *utf = ucs4_to_utf8(wcs, NULL, NULL);
470
+
471
+ free(wcs);
472
+ return utf;
473
+ }
474
+
475
+
476
+ /* }}}1 */