character-encodings 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,476 @@
1
+ /*
2
+ * contents: Unicode decomposition handling.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+ #include <ruby.h>
8
+ #include <stdbool.h>
9
+ #include <stddef.h>
10
+ #include <stdint.h>
11
+ #include "unicode.h"
12
+ #include "private.h"
13
+ #include "data/decompose.h"
14
+ #include "data/compose.h"
15
+
16
+
17
/* {{{1
 * Macros for looking up the combining class of a character.
 *
 * The lookup is a two-level table walk.  The page of the character (its value
 * shifted right by eight bits) selects an entry in
 * combining_class_table_part1 or combining_class_table_part2.  An entry that
 * is at least UNICODE_MAX_TABLE_INDEX stands for the whole page and encodes
 * the class directly as (entry - UNICODE_MAX_TABLE_INDEX); any smaller entry
 * is a row in cclass_data, which is then indexed by the low byte of the
 * character.
 *
 * Characters above UNICODE_LAST_CHAR_PART1 and outside of the range
 * 0xe0000..UNICODE_LAST_CHAR have combining class 0.
 *
 * The arguments are expanded more than once, so they must not have side
 * effects.
 *
 * TODO: Turn these macros into full-fledged functions, as this is rather silly
 * when we have ‹inline› in C99.
 */
#define CC_PART1(page, low) \
	((combining_class_table_part1[page] < UNICODE_MAX_TABLE_INDEX) \
	 ? (cclass_data[combining_class_table_part1[page]][low]) \
	 : (combining_class_table_part1[page] - UNICODE_MAX_TABLE_INDEX))

#define CC_PART2(page, low) \
	((combining_class_table_part2[page] < UNICODE_MAX_TABLE_INDEX) \
	 ? (cclass_data[combining_class_table_part2[page]][low]) \
	 : (combining_class_table_part2[page] - UNICODE_MAX_TABLE_INDEX))

#define COMBINING_CLASS(c) \
	(((c) > UNICODE_LAST_CHAR_PART1) \
	 ? (((c) < 0xe0000 || (c) > UNICODE_LAST_CHAR) \
	    ? 0 \
	    : CC_PART2(((c) - 0xe0000) >> 8, (c) & 0xff)) \
	 : CC_PART1((c) >> 8, (c) & 0xff))
40
+
41
+
42
/* {{{1
 * Constants for the arithmetic [de]composition of Hangul syllables.  The
 * names follow the L (leading), V (vowel), T (trailing) and S (syllable)
 * naming used by the Hangul algorithm referenced further down in this file.
 */
enum {
	/* First code point of each block; TBase is one below the first T. */
	LBase = 0x1100,
	VBase = 0x1161,
	TBase = 0x11a7,
	SBase = 0xac00,

	/* Number of code points of each kind. */
	LCount = 19,
	VCount = 21,
	TCount = 28,

	/* Derived sizes: syllables per leading consonant, and in total. */
	NCount = VCount * TCount,
	SCount = LCount * NCount,

	/* Last precomposed syllable. */
	SLast = SBase + SCount - 1
};
57
+
58
+
59
+ /* {{{1
60
+ * Return the combinging class of ‘c’.
61
+ */
62
+ inline int
63
+ _unichar_combining_class(unichar c)
64
+ {
65
+ return COMBINING_CLASS(c);
66
+ }
67
+
68
+
69
+ /* {{{1
70
+ * Rearrange ‘str’ so that decomposed characters are arranged according to
71
+ * their combining class. Do this for at most ‘len’ bytes of data.
72
+ */
73
+ static void
74
+ unicode_canonical_ordering_swap(unichar *str, size_t offset, int next, bool *swapped)
75
+ {
76
+ size_t initial = offset + 1;
77
+
78
+ size_t j = initial;
79
+ while (j > 0 && COMBINING_CLASS(str[j - 1]) > next) {
80
+ unichar c = str[j];
81
+ str[j] = str[j - 1];
82
+ str[j - 1] = c;
83
+ j--;
84
+ }
85
+
86
+ *swapped = *swapped || (j != initial);
87
+ }
88
+
89
+ static bool
90
+ unicode_canonical_ordering_reorder(unichar *str, size_t len)
91
+ {
92
+ bool swapped = false;
93
+
94
+ int prev = COMBINING_CLASS(str[0]);
95
+ for (size_t i = 0; i < len - 1; i++) {
96
+ int next = COMBINING_CLASS(str[i + 1]);
97
+
98
+ if (next != 0 && prev > next)
99
+ unicode_canonical_ordering_swap(str, i, next, &swapped);
100
+ else
101
+ prev = next;
102
+ }
103
+
104
+ return swapped;
105
+ }
106
+
107
+ void
108
+ unicode_canonical_ordering(unichar *str, size_t len)
109
+ {
110
+ while (unicode_canonical_ordering_reorder(str, len))
111
+ ; /* This loop intentionally left empty. */
112
+ }
113
+
114
+
115
+ /* {{{1
116
+ * Decompose the character ‘s’ according to the rules outlined in
117
+ * http://www.unicode.org/unicode/reports/tr15/#Hangul. ‘r’ should be ‹NULL›
118
+ * or of sufficient length to store the decomposition of ‘s’. The number of
119
+ * characters stored (or would be if it were non-‹NULL›) in ‘r’ is put in
120
+ * ‘r_len’.
121
+ */
122
+ static void
123
+ decompose_hangul(unichar s, unichar *r, size_t *r_len)
124
+ {
125
+ int SIndex = s - SBase;
126
+
127
+ /* If this isn’t a Hangul symbol, then simply return it unchanged. */
128
+ if (SIndex < 0 || SIndex >= SCount) {
129
+ if (r != NULL)
130
+ r[0] = s;
131
+ *r_len = 1;
132
+ return;
133
+ }
134
+
135
+ unichar L = LBase + SIndex / NCount;
136
+ unichar V = VBase + (SIndex % NCount) / TCount;
137
+ unichar T = TBase + SIndex % TCount;
138
+
139
+ if (r != NULL) {
140
+ r[0] = L;
141
+ r[1] = V;
142
+ }
143
+
144
+ if (T != TBase) {
145
+ if (r != NULL)
146
+ r[2] = T;
147
+ *r_len = 3;
148
+ } else {
149
+ *r_len = 2;
150
+ }
151
+ }
152
+
153
+
154
+ /* {{{1
155
+ * Search the Unicode decomposition table for ‘c’ and depending on the boolean
156
+ * value of ‘compat’, return its compatibility or canonical decomposition if
157
+ * found.
158
+ */
159
+ static const char *
160
+ get_decomposition(int index, bool compat)
161
+ {
162
+ int offset;
163
+
164
+ if (compat) {
165
+ offset = decomp_table[index].compat_offset;
166
+ if (offset == UNICODE_NOT_PRESENT_OFFSET)
167
+ offset = decomp_table[index].canon_offset;
168
+ /* Either .compat_offset or .canon_offset can be missing, but
169
+ * not both. */
170
+ } else {
171
+ offset = decomp_table[index].canon_offset;
172
+ if (offset == UNICODE_NOT_PRESENT_OFFSET)
173
+ return NULL;
174
+ }
175
+
176
+ return &decomp_expansion_string[offset];
177
+ }
178
+
179
+ static const char *
180
+ find_decomposition(unichar c, bool compat)
181
+ {
182
+ int begin = 0;
183
+ int end = lengthof(decomp_table);
184
+
185
+ if (c < decomp_table[begin].ch || c > decomp_table[end - 1].ch)
186
+ return NULL;
187
+
188
+ while (true) {
189
+ int middle = (begin + end) / 2;
190
+
191
+ if (c == decomp_table[middle].ch)
192
+ return get_decomposition(middle, compat);
193
+ else if (middle == begin)
194
+ break;
195
+ else if (c > decomp_table[middle].ch)
196
+ begin = middle;
197
+ else
198
+ end = middle;
199
+ }
200
+
201
+ return NULL;
202
+ }
203
+
204
+
205
+ /* {{{1
206
+ * Generate the canonical decomposition of ‘c’. The length of the
207
+ * decomposition is stored in ‘len’.
208
+ */
209
+ /* TODO: clean this up. */
210
+ unichar *
211
+ unicode_canonical_decomposition(unichar c, size_t *len)
212
+ {
213
+ const char *decomp;
214
+ unichar *r;
215
+
216
+ /* Hangul syllable */
217
+ if (c >= SBase && c <= SLast) {
218
+ decompose_hangul(c, NULL, len);
219
+ r = ALLOC_N(unichar, *len);
220
+ decompose_hangul(c, r, len);
221
+ } else if ((decomp = find_decomposition(c, false)) != NULL) {
222
+ *len = utf_length(decomp);
223
+ r = ALLOC_N(unichar, *len);
224
+
225
+ int i;
226
+ const char *p;
227
+ for (p = decomp, i = 0; *p != NUL; p = utf_next(p), i++)
228
+ r[i] = utf_char(p);
229
+ } else {
230
+ r = ALLOC(unichar);
231
+ *r = c;
232
+ *len = 1;
233
+ }
234
+
235
+ /* Supposedly following the Unicode 2.1.9 table means that the
236
+ * decompositions come out in canonical order. I haven't tested this,
237
+ * but we rely on it here. */
238
+ return r;
239
+ }
240
+
241
+
242
+ /* {{{1
243
+ * Combine Hangul characters ‘a’ and ‘b’ if possible, and store the result in
244
+ * ‘result’. The combinations tried are L,V => LV and LV,T => LVT in that
245
+ * order.
246
+ */
247
+ static bool
248
+ combine_hangul(unichar a, unichar b, unichar *result)
249
+ {
250
+ int LIndex = a - LBase;
251
+ int VIndex = b - VBase;
252
+
253
+ if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount) {
254
+ *result = SBase + (LIndex * VCount + VIndex) * TCount;
255
+ return true;
256
+ }
257
+
258
+ int SIndex = a - SBase;
259
+ int TIndex = b - TBase;
260
+
261
+ if (0 <= SIndex && SIndex < SCount &&
262
+ (SIndex % TCount) == 0 &&
263
+ 0 < TIndex && TIndex < TCount) {
264
+ *result = a + TIndex;
265
+ return true;
266
+ }
267
+
268
+ return false;
269
+ }
270
+
271
+
272
+ /* {{{1
273
+ * Try to combine the Unicode characters ‘a’ and ‘b’ storing the result in
274
+ * ‘result’.
275
+ */
276
+ static uint16_t
277
+ compose_index(unichar c)
278
+ {
279
+ unsigned int page = c >> 8;
280
+
281
+ if (page > COMPOSE_TABLE_LAST)
282
+ return 0;
283
+
284
+ /* TODO: why is this signed, exactly? */
285
+ int16_t compose_offset = compose_table[page];
286
+ return (compose_offset >= UNICODE_MAX_TABLE_INDEX) ?
287
+ compose_offset - UNICODE_MAX_TABLE_INDEX :
288
+ compose_data[compose_offset][c & 0xff];
289
+ }
290
+
291
+ static bool
292
+ lookup_compose(const uint16_t table[][2], uint16_t index, unichar c,
293
+ unichar *result)
294
+ {
295
+ if (c == table[index][0]) {
296
+ *result = table[index][1];
297
+ return true;
298
+ }
299
+
300
+ return false;
301
+ }
302
+
303
+ static bool
304
+ combine(unichar a, unichar b, unichar *result)
305
+ {
306
+ if (combine_hangul(a, b, result))
307
+ return true;
308
+
309
+ uint16_t index_a = compose_index(a);
310
+ if (index_a >= COMPOSE_FIRST_SINGLE_START &&
311
+ index_a < COMPOSE_SECOND_START) {
312
+ return lookup_compose(compose_first_single,
313
+ index_a - COMPOSE_FIRST_SINGLE_START,
314
+ b,
315
+ result);
316
+ }
317
+
318
+ uint16_t index_b = compose_index(b);
319
+ if (index_b >= COMPOSE_SECOND_SINGLE_START) {
320
+ return lookup_compose(compose_second_single,
321
+ index_b - COMPOSE_SECOND_SINGLE_START,
322
+ a,
323
+ result);
324
+ }
325
+
326
+ if (index_a >= COMPOSE_FIRST_START &&
327
+ index_a < COMPOSE_FIRST_SINGLE_START &&
328
+ index_b >= COMPOSE_SECOND_START &&
329
+ index_b < COMPOSE_SECOND_SINGLE_START) {
330
+ unichar r = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
331
+
332
+ if (r != 0) {
333
+ *result = r;
334
+ return true;
335
+ }
336
+ }
337
+
338
+ return false;
339
+ }
340
+
341
+
342
+ /* {{{1
343
+ * Normalize (compose/decompose) characters in ‘str’ so that strings that
344
+ * actually contain the same characters will be recognized as equal for
345
+ * comparison for example.
346
+ */
347
+ static size_t
348
+ normalize_wc_decompose_one(unichar c, NormalizeMode mode, unichar *buf)
349
+ {
350
+ bool do_compat = (mode == NORMALIZE_NFKC || mode == NORMALIZE_NFKD);
351
+ const char *decomp = find_decomposition(c, do_compat);
352
+
353
+ if (decomp == NULL) {
354
+ if (buf != NULL)
355
+ buf[0] = c;
356
+ return 1;
357
+ }
358
+
359
+ if (buf != NULL) {
360
+ int i;
361
+ for (i = 0; *decomp != NUL; decomp = utf_next(decomp), i++)
362
+ buf[i] = utf_char(decomp);
363
+ return i;
364
+ }
365
+
366
+ return utf_length(decomp);
367
+ }
368
+
369
+ static void
370
+ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
371
+ NormalizeMode mode, unichar *buf, size_t *buf_len)
372
+ {
373
+ size_t n = 0;
374
+ size_t prev_start = 0;
375
+ const char *end = str + max_len;
376
+ for (const char *p = str; (!use_len || p < end) && *p != NUL; p = utf_next(p)) {
377
+ unichar c = utf_char(p);
378
+ size_t prev_n = n;
379
+
380
+ unichar *base = (buf != NULL) ? buf + n : NULL;
381
+ if (c >= SBase && c <= SLast) {
382
+ size_t len;
383
+
384
+ decompose_hangul(c, base, &len);
385
+ n += len;
386
+ } else {
387
+ n += normalize_wc_decompose_one(c, mode, base);
388
+ }
389
+
390
+ if (buf != NULL && n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
391
+ unicode_canonical_ordering(buf + prev_start,
392
+ n - prev_start);
393
+ prev_start = prev_n;
394
+ }
395
+ }
396
+
397
+ if (buf != NULL && n > 0)
398
+ unicode_canonical_ordering(buf + prev_start, n - prev_start);
399
+
400
+ if (buf != NULL)
401
+ buf[n] = NUL;
402
+
403
+ *buf_len = n;
404
+ }
405
+
406
+ unichar *
407
+ _utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
408
+ {
409
+ size_t n;
410
+ normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
411
+ unichar *buf = ALLOC_N(unichar, n + 1);
412
+ normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
413
+
414
+ /* Just return if we don’t want composition. */
415
+ if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
416
+ return buf;
417
+
418
+ size_t prev_start = 0;
419
+ int prev_cc = 0;
420
+ for (size_t i = 0; i < n; i++) {
421
+ int cc = COMBINING_CLASS(buf[i]);
422
+
423
+ if (i > 0 && (prev_cc == 0 || prev_cc < cc) &&
424
+ combine(buf[prev_start], buf[i], &buf[prev_start])) {
425
+ for (size_t j = i + 1; j < n; j++)
426
+ buf[j - 1] = buf[j];
427
+
428
+ n--;
429
+ i--;
430
+ prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]);
431
+ } else {
432
+ if (cc == 0)
433
+ prev_start = i;
434
+
435
+ prev_cc = cc;
436
+ }
437
+ }
438
+
439
+ buf[n] = NUL;
440
+
441
+ return buf;
442
+ }
443
+
444
+
445
+ /* {{{1
446
+ * Normalize (compose/decompose) characters in ‘str˚ so that strings that
447
+ * actually contain the same characters will be recognized as equal for
448
+ * comparison for example.
449
+ */
450
+ char *
451
+ utf_normalize(const char *str, NormalizeMode mode)
452
+ {
453
+ unichar *wcs = _utf_normalize_wc(str, 0, false, mode);
454
+ char *utf = ucs4_to_utf8(wcs, NULL, NULL);
455
+
456
+ free(wcs);
457
+ return utf;
458
+ }
459
+
460
+
461
+ /* {{{1
462
+ * This function is the same as utf_normalize() except that at most ‘len˚
463
+ * bytes are normalized from ‘str’.
464
+ */
465
+ char *
466
+ utf_normalize_n(const char *str, NormalizeMode mode, size_t len)
467
+ {
468
+ unichar *wcs = _utf_normalize_wc(str, len, true, mode);
469
+ char *utf = ucs4_to_utf8(wcs, NULL, NULL);
470
+
471
+ free(wcs);
472
+ return utf;
473
+ }
474
+
475
+
476
+ /* }}}1 */