character-encodings 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -10,7 +10,7 @@ require 'rake/testtask'
10
10
  require 'spec/rake/spectask'
11
11
 
12
12
  PackageName = 'character-encodings'
13
- PackageVersion = '0.2.0'
13
+ PackageVersion = '0.3.0'
14
14
 
15
15
  desc 'Default task'
16
16
  task :default => [:extensions]
@@ -92,7 +92,7 @@ Spec::Rake::SpecTask.new do |t|
92
92
  end
93
93
 
94
94
  Tests = [
95
- ['tests/foldcase.rb'],
95
+ ['tests/foldcase.rb', 'tests/case.rb'],
96
96
  ['tests/normalize.rb']
97
97
  ]
98
98
 
@@ -9,30 +9,17 @@
9
9
  #include <stdint.h>
10
10
  #include "unicode.h"
11
11
  #include "data/break.h"
12
-
12
+ #include "private.h"
13
13
 
14
14
  /* Figure out what break type the Unicode character ‘c’ possesses, if any.
15
15
  * This information is used for finding word and line boundaries, which is
16
16
  * useful when displaying Unicode text on screen. */
17
- static UnicodeBreakType
18
- break_type(const int16_t table[], unsigned int page, unichar c)
19
- {
20
- int16_t break_property = table[page];
21
-
22
- return (break_property >= UNICODE_MAX_TABLE_INDEX) ?
23
- break_property - UNICODE_MAX_TABLE_INDEX :
24
- break_property_data[break_property][c & 0xff];
25
- }
26
-
27
17
  UnicodeBreakType
28
18
  unichar_break_type(unichar c)
29
19
  {
30
- if (c <= UNICODE_LAST_CHAR_PART1)
31
- return break_type(break_property_table_part1, c >> 8, c);
32
-
33
- if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR)
34
- return break_type(break_property_table_part2,
35
- (c - 0xe0000) >> 8, c);
36
-
37
- return UNICODE_BREAK_UNKNOWN;
20
+ return SPLIT_UNICODE_TABLE_LOOKUP(break_property_data,
21
+ break_property_table_part1,
22
+ break_property_table_part2,
23
+ c,
24
+ UNICODE_BREAK_UNKNOWN);
38
25
  }
@@ -13,6 +13,8 @@
13
13
 
14
14
  #define UNICODE_LAST_PAGE_PART1 762
15
15
 
16
+ #define UNICODE_FIRST_CHAR_PART2 0xe0000
17
+
16
18
  #define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
17
19
 
18
20
 
@@ -10,6 +10,7 @@
10
10
 
11
11
  #define UNICODE_LAST_CHAR_PART1 0x2faff
12
12
  #define UNICODE_LAST_PAGE_PART1 762
13
+ #define UNICODE_FIRST_CHAR_PART2 0xe0000
13
14
 
14
15
  #define UNICODE_NOT_PRESENT_OFFSET 65535
15
16
 
@@ -534,6 +534,8 @@ private
534
534
 
535
535
  #define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
536
536
 
537
+ #define UNICODE_FIRST_CHAR_PART2 0xe0000
538
+
537
539
  #define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
538
540
  EOF
539
541
  print_table(data, 0, @last_char_part1_i, data.last, 1,
@@ -694,8 +696,11 @@ EOF
694
696
  #define UNICODE_MAX_TABLE_INDEX (0x110000 / 256)
695
697
 
696
698
  #define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}
699
+
697
700
  #define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
698
701
 
702
+ #define UNICODE_FIRST_CHAR_PART2 0xe0000
703
+
699
704
  #define UNICODE_NOT_PRESENT_OFFSET #{NOT_PRESENT_OFFSET}
700
705
  EOF
701
706
  print_table(data, 0, @last_char_part1_i, data.last, 1,
@@ -8,35 +8,16 @@
8
8
  #include <stdbool.h>
9
9
  #include <stddef.h>
10
10
  #include <stdint.h>
11
+
11
12
  #include "unicode.h"
12
- #include "private.h"
13
+
13
14
  #include "data/decompose.h"
14
15
  #include "data/compose.h"
15
16
 
17
+ #include "private.h"
16
18
 
17
- /* {{{1
18
- * Macros for accessing the combining class property tables for a given
19
- * character.
20
- *
21
- * TODO: Turn these macros into full-fledged functions, as this is rather silly
22
- * when we have ‹inline› in C99.
23
- */
24
- #define CC_PART1(page, char) \
25
- ((combining_class_table_part1[page] >= UNICODE_MAX_TABLE_INDEX) \
26
- ? (combining_class_table_part1[page] - UNICODE_MAX_TABLE_INDEX) \
27
- : (cclass_data[combining_class_table_part1[page]][char]))
28
-
29
- #define CC_PART2(page, char) \
30
- ((combining_class_table_part2[page] >= UNICODE_MAX_TABLE_INDEX) \
31
- ? (combining_class_table_part2[page] - UNICODE_MAX_TABLE_INDEX) \
32
- : (cclass_data[combining_class_table_part2[page]][char]))
33
-
34
- #define COMBINING_CLASS(char) \
35
- (((char) <= UNICODE_LAST_CHAR_PART1) \
36
- ? CC_PART1((char) >> 8, (char) & 0xff) \
37
- : (((char) >= 0xe0000 && (char) <= UNICODE_LAST_CHAR) \
38
- ? CC_PART2(((char) - 0xe0000) >> 8, (char) & 0xff) \
39
- : 0))
19
+ #define COMBINING_CLASS(c) \
20
+ SPLIT_UNICODE_TABLE_LOOKUP(cclass_data, combining_class_table_part1, combining_class_table_part2, (c), 0)
40
21
 
41
22
 
42
23
  /* {{{1
@@ -115,12 +96,11 @@ unicode_canonical_ordering(unichar *str, size_t len)
115
96
  /* {{{1
116
97
  * Decompose the character ‘s’ according to the rules outlined in
117
98
  * http://www.unicode.org/unicode/reports/tr15/#Hangul. ‘r’ should be ‹NULL›
118
- * or of sufficient length to store the decomposition of ‘s’. The number of
119
- * characters stored (or would be if it were non-‹NULL) in ‘r’ is put in
120
- * ‘r_len’.
99
+ * or of sufficient length to store the decomposition of ‘s’. Returns the
100
+ * number of characters stored (or that would be stored, were it non-‹NULL›) in ‘r’.
121
101
  */
122
- static void
123
- decompose_hangul(unichar s, unichar *r, size_t *r_len)
102
+ static size_t
103
+ decompose_hangul(unichar s, unichar *r)
124
104
  {
125
105
  int SIndex = s - SBase;
126
106
 
@@ -128,8 +108,7 @@ decompose_hangul(unichar s, unichar *r, size_t *r_len)
128
108
  if (SIndex < 0 || SIndex >= SCount) {
129
109
  if (r != NULL)
130
110
  r[0] = s;
131
- *r_len = 1;
132
- return;
111
+ return 1;
133
112
  }
134
113
 
135
114
  unichar L = LBase + SIndex / NCount;
@@ -141,13 +120,13 @@ decompose_hangul(unichar s, unichar *r, size_t *r_len)
141
120
  r[1] = V;
142
121
  }
143
122
 
144
- if (T != TBase) {
145
- if (r != NULL)
146
- r[2] = T;
147
- *r_len = 3;
148
- } else {
149
- *r_len = 2;
150
- }
123
+ if (T == TBase)
124
+ return 2;
125
+
126
+ if (r != NULL)
127
+ r[2] = T;
128
+
129
+ return 3;
151
130
  }
152
131
 
153
132
 
@@ -179,26 +158,27 @@ get_decomposition(int index, bool compat)
179
158
  static const char *
180
159
  find_decomposition(unichar c, bool compat)
181
160
  {
182
- int begin = 0;
183
- int end = lengthof(decomp_table);
161
+ int index;
184
162
 
185
- if (c < decomp_table[begin].ch || c > decomp_table[end - 1].ch)
163
+ if (!unicode_table_lookup(decomp_table, c, &index))
186
164
  return NULL;
187
165
 
188
- while (true) {
189
- int middle = (begin + end) / 2;
166
+ return get_decomposition(index, compat);
167
+ }
190
168
 
191
- if (c == decomp_table[middle].ch)
192
- return get_decomposition(middle, compat);
193
- else if (middle == begin)
194
- break;
195
- else if (c > decomp_table[middle].ch)
196
- begin = middle;
197
- else
198
- end = middle;
199
- }
200
169
 
201
- return NULL;
170
+ /* {{{1
171
+ * Copy over the UTF-8 decomposition in ‘decomposition’ to the unichar buffer
172
+ * ‘chars’. Return the number of unichars in ‘chars’.
173
+ */
174
+ static size_t
175
+ decomposition_to_wc(const char *decomposition, unichar *chars)
176
+ {
177
+ size_t i = 0;
178
+ for (const char *p = decomposition; *p != '\0'; p = utf_next(p))
179
+ chars[i++] = utf_char(p);
180
+
181
+ return i;
202
182
  }
203
183
 
204
184
 
@@ -215,17 +195,13 @@ unicode_canonical_decomposition(unichar c, size_t *len)
215
195
 
216
196
  /* Hangul syllable */
217
197
  if (c >= SBase && c <= SLast) {
218
- decompose_hangul(c, NULL, len);
198
+ *len = decompose_hangul(c, NULL);
219
199
  r = ALLOC_N(unichar, *len);
220
- decompose_hangul(c, r, len);
200
+ decompose_hangul(c, r);
221
201
  } else if ((decomp = find_decomposition(c, false)) != NULL) {
222
202
  *len = utf_length(decomp);
223
203
  r = ALLOC_N(unichar, *len);
224
-
225
- int i;
226
- const char *p;
227
- for (p = decomp, i = 0; *p != NUL; p = utf_next(p), i++)
228
- r[i] = utf_char(p);
204
+ decomposition_to_wc(decomp, r);
229
205
  } else {
230
206
  r = ALLOC(unichar);
231
207
  *r = c;
@@ -281,23 +257,19 @@ compose_index(unichar c)
281
257
  if (page > COMPOSE_TABLE_LAST)
282
258
  return 0;
283
259
 
284
- /* TODO: why is this signed, exactly? */
285
- int16_t compose_offset = compose_table[page];
286
- return (compose_offset >= UNICODE_MAX_TABLE_INDEX) ?
287
- compose_offset - UNICODE_MAX_TABLE_INDEX :
288
- compose_data[compose_offset][c & 0xff];
260
+ return SPLIT_UNICODE_TABLE_LOOKUP_PAGE(compose_data, compose_table, page, c);
289
261
  }
290
262
 
291
263
  static bool
292
264
  lookup_compose(const uint16_t table[][2], uint16_t index, unichar c,
293
265
  unichar *result)
294
266
  {
295
- if (c == table[index][0]) {
296
- *result = table[index][1];
297
- return true;
298
- }
267
+ if (c != table[index][0])
268
+ return false;
299
269
 
300
- return false;
270
+ *result = table[index][1];
271
+
272
+ return true;
301
273
  }
302
274
 
303
275
  static bool
@@ -307,21 +279,18 @@ combine(unichar a, unichar b, unichar *result)
307
279
  return true;
308
280
 
309
281
  uint16_t index_a = compose_index(a);
310
- if (index_a >= COMPOSE_FIRST_SINGLE_START &&
311
- index_a < COMPOSE_SECOND_START) {
282
+ if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
312
283
  return lookup_compose(compose_first_single,
313
284
  index_a - COMPOSE_FIRST_SINGLE_START,
314
285
  b,
315
286
  result);
316
- }
317
287
 
318
288
  uint16_t index_b = compose_index(b);
319
- if (index_b >= COMPOSE_SECOND_SINGLE_START) {
289
+ if (index_b >= COMPOSE_SECOND_SINGLE_START)
320
290
  return lookup_compose(compose_second_single,
321
291
  index_b - COMPOSE_SECOND_SINGLE_START,
322
292
  a,
323
293
  result);
324
- }
325
294
 
326
295
  if (index_a >= COMPOSE_FIRST_START &&
327
296
  index_a < COMPOSE_FIRST_SINGLE_START &&
@@ -356,12 +325,8 @@ normalize_wc_decompose_one(unichar c, NormalizeMode mode, unichar *buf)
356
325
  return 1;
357
326
  }
358
327
 
359
- if (buf != NULL) {
360
- int i;
361
- for (i = 0; *decomp != NUL; decomp = utf_next(decomp), i++)
362
- buf[i] = utf_char(decomp);
363
- return i;
364
- }
328
+ if (buf != NULL)
329
+ return decomposition_to_wc(decomp, buf);
365
330
 
366
331
  return utf_length(decomp);
367
332
  }
@@ -378,14 +343,10 @@ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
378
343
  size_t prev_n = n;
379
344
 
380
345
  unichar *base = (buf != NULL) ? buf + n : NULL;
381
- if (c >= SBase && c <= SLast) {
382
- size_t len;
383
-
384
- decompose_hangul(c, base, &len);
385
- n += len;
386
- } else {
346
+ if (c >= SBase && c <= SLast)
347
+ n += decompose_hangul(c, base);
348
+ else
387
349
  n += normalize_wc_decompose_one(c, mode, base);
388
- }
389
350
 
390
351
  if (buf != NULL && n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
391
352
  unicode_canonical_ordering(buf + prev_start,
@@ -403,44 +364,51 @@ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
403
364
  *buf_len = n;
404
365
  }
405
366
 
406
- unichar *
407
- _utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
367
+ static unichar *
368
+ normalize_wc_compose(unichar *buf, size_t len)
408
369
  {
409
- size_t n;
410
- normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
411
- unichar *buf = ALLOC_N(unichar, n + 1);
412
- normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
413
-
414
- /* Just return if we don’t want composition. */
415
- if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
416
- return buf;
417
-
370
+ int new_len = len;
418
371
  size_t prev_start = 0;
419
372
  int prev_cc = 0;
420
- for (size_t i = 0; i < n; i++) {
373
+
374
+ for (size_t i = 0; i < len; i++) {
421
375
  int cc = COMBINING_CLASS(buf[i]);
376
+ size_t j = i - (len - new_len);
422
377
 
423
- if (i > 0 && (prev_cc == 0 || prev_cc < cc) &&
378
+ if (j > 0 && (prev_cc == 0 || prev_cc < cc) &&
424
379
  combine(buf[prev_start], buf[i], &buf[prev_start])) {
425
- for (size_t j = i + 1; j < n; j++)
426
- buf[j - 1] = buf[j];
427
-
428
- n--;
429
- i--;
430
- prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]);
380
+ new_len--;
381
+ prev_cc = (j + 1 == prev_start) ?
382
+ 0 : COMBINING_CLASS(buf[j - 1]);
431
383
  } else {
432
384
  if (cc == 0)
433
- prev_start = i;
385
+ prev_start = j;
434
386
 
387
+ buf[j] = buf[i];
435
388
  prev_cc = cc;
436
389
  }
437
390
  }
438
391
 
439
- buf[n] = NUL;
392
+ buf[new_len] = NUL;
440
393
 
441
394
  return buf;
442
395
  }
443
396
 
397
+ unichar *
398
+ _utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
399
+ {
400
+ size_t n;
401
+ normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
402
+ unichar *buf = ALLOC_N(unichar, n + 1);
403
+ normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
404
+
405
+ /* Just return if we don’t want composition. */
406
+ if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
407
+ return buf;
408
+
409
+ return normalize_wc_compose(buf, n);
410
+ }
411
+
444
412
 
445
413
  /* {{{1
446
414
  * Normalize (compose/decompose) characters in ‘str’ so that strings that
@@ -1,6 +1,7 @@
1
1
  break.o: break.c unicode.h data/break.h
2
2
  decompose.o: decompose.c unicode.h private.h data/decompose.h \
3
3
  data/compose.h
4
+ private.o: private.c private.h
4
5
  properties.o: properties.c unicode.h private.h data/character-tables.h
5
6
  rb_utf_aref.o: rb_utf_aref.c rb_includes.h unicode.h private.h \
6
7
  rb_methods.h
@@ -12,6 +12,7 @@ def try_compiler_option(opt, &b)
12
12
  end
13
13
 
14
14
  try_compiler_option('-std=c99')
15
+ try_compiler_option('-finline-functions')
15
16
  try_compiler_option('-Wall')
16
17
  try_compiler_option('-Wextra')
17
18
  try_compiler_option('-Wwrite-strings')
@@ -23,6 +24,7 @@ try_compiler_option('-Wundef')
23
24
  try_compiler_option('-Wpointer-arith')
24
25
  try_compiler_option('-Wcast-align')
25
26
  try_compiler_option('-Werror')
27
+ try_compiler_option('-Winline')
26
28
  # XXX: sadly, -Wshadow is a bit too strict. It will, for example, whine about
27
29
  # local variables called “index” on FreeBSD.
28
30
  # try_compiler_option('-Wshadow')
@@ -0,0 +1,62 @@
1
+ /*
2
+ * contents: Private functions used by the UTF-8 character-encoding library.
3
+ *
4
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include <ruby.h>
8
+ #include <stdbool.h>
9
+ #include <stddef.h>
10
+ #include <stdint.h>
11
+ #include <stdlib.h>
12
+
13
+ #include "unicode.h"
14
+
15
+ #include "private.h"
16
+
17
+ /* Lookup C in the sorted TABLE using binary search. TABLE consists of N
18
+ * entries, where each entry is SIZEOF_ENTRY bytes in size and the first
19
+ * component is a unichar of size SIZEOF_CHAR. If C is found in TABLE, its
20
+ * index is stored in INDEX and true is returned. Otherwise, false is returned
21
+ * and INDEX is left untouched. */
22
+ bool
23
+ binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index)
24
+ {
25
+ #define ENTRY(index) ((unichar)(*(unichar *)((const char *)table + ((index) * sizeof_entry))) & char_mask)
26
+
27
+ int begin = 0;
28
+ int end = n - 1;
29
+ int middle;
30
+
31
+ /* This is ugly, but not all tables use unichars as their lookup
32
+ * character. The casefold table, for example, uses uint16_t-sized
33
+ * characters. To only get the interesting part of our table entry
34
+ * we’ll have to mask the retrieved value. */
35
+ int char_mask = (1 << (8 * sizeof_char)) - 1;
36
+
37
+ /* Drop out early if we know for certain that C can’t be in the
38
+ * decomposition table. */
39
+ if (c < ENTRY(0) || c > ENTRY(end))
40
+ return false;
41
+
42
+ while (begin <= end) {
43
+ middle = binary_search_middle_of(begin, end);
44
+
45
+ unichar probe = ENTRY(middle);
46
+ if (c < probe)
47
+ end = middle - 1;
48
+ else if (c > probe)
49
+ begin = middle + 1;
50
+ else
51
+ break;
52
+ }
53
+
54
+ if (begin > end)
55
+ return false;
56
+
57
+ *index = middle;
58
+
59
+ return true;
60
+
61
+ #undef ENTRY
62
+ }
@@ -21,48 +21,28 @@
21
21
  # define HIDDEN(u)
22
22
  #endif
23
23
 
24
- unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
25
- NormalizeMode mode) HIDDEN;
26
- inline int _unichar_combining_class(unichar c) HIDDEN;
27
-
28
- void need_at_least_n_arguments(int argc, int n) HIDDEN;
29
-
30
- unichar _utf_char_validated(char const *const str,
31
- char const *const str_end) HIDDEN;
32
- char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
33
- const char *limit, bool noisy) HIDDEN;
34
-
35
- char *_utf_offset_to_pointer_validated(const char *str, long offset,
36
- const char *end) HIDDEN;
37
-
38
- char *_utf_offset_to_pointer_failable(const char *str, long offset,
39
- const char *end) HIDDEN;
40
-
41
- VALUE rb_utf_new(const char *str, long len) HIDDEN;
42
-
43
- VALUE rb_utf_new2(const char *str) HIDDEN;
24
+ #define binary_search_middle_of(begin, end) \
25
+ (((unsigned)((begin) + (end))) >> 1)
44
26
 
45
- VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
27
+ #define unicode_table_lookup(table, c, index) \
28
+ binary_search_unicode_table(table, lengthof(table), sizeof((table)[0]), sizeof((table)[0].ch), c, index)
46
29
 
47
- VALUE rb_utf_alloc_using(char *str) HIDDEN;
30
+ bool binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index) HIDDEN;
48
31
 
49
- VALUE rb_utf_dup(VALUE str) HIDDEN;
32
+ #define SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part, page, c) \
33
+ ((part[page] >= UNICODE_MAX_TABLE_INDEX) \
34
+ ? (part[page] - UNICODE_MAX_TABLE_INDEX) \
35
+ : (data[part[page]][(c) & 0xff]))
50
36
 
51
- long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
37
+ #define SPLIT_UNICODE_TABLE_LOOKUP(data, part1, part2, c, fallback) \
38
+ (((c) <= UNICODE_LAST_CHAR_PART1) \
39
+ ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part1, (c) >> 8, c) \
40
+ : (((c) >= UNICODE_FIRST_CHAR_PART2 && (c) <= UNICODE_LAST_CHAR) \
41
+ ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part2, ((c) - UNICODE_FIRST_CHAR_PART2) >> 8, c) \
42
+ : (fallback)))
52
43
 
53
- bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
54
- char **limit) HIDDEN;
55
-
56
- void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
57
- char **limit) HIDDEN;
58
-
59
- char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
60
-
61
- VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
62
-
63
- char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
64
-
65
- long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
66
- long offset, bool reverse) HIDDEN;
44
+ unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
45
+ NormalizeMode mode) HIDDEN;
46
+ int _unichar_combining_class(unichar c) HIDDEN;
67
47
 
68
48
  #endif /* PRIVATE_H */
@@ -63,8 +63,8 @@ s_type(unichar c)
63
63
  if (c <= UNICODE_LAST_CHAR_PART1) {
64
64
  page = c >> 8;
65
65
  table = type_table_part1;
66
- } else if (c >= 0xe0000 && c <= UNICODE_LAST_CHAR) {
67
- page = (c - 0xe0000) >> 8;
66
+ } else if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR) {
67
+ page = (c - UNICODE_FIRST_CHAR_PART2) >> 8;
68
68
  table = type_table_part2;
69
69
  } else {
70
70
  return UNICODE_UNASSIGNED;
@@ -364,8 +364,8 @@ special_case_table_lookup(unichar c)
364
364
  unichar tv = ATTTABLE(c >> 8, c & 0xff);
365
365
 
366
366
  if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
367
- return utf_char(special_case_table +
368
- tv - UNICODE_SPECIAL_CASE_TABLE_START);
367
+ tv = utf_char(special_case_table +
368
+ tv - UNICODE_SPECIAL_CASE_TABLE_START);
369
369
 
370
370
  if (tv == '\0')
371
371
  return c;
@@ -429,7 +429,7 @@ unichar_totitle(unichar c)
429
429
  return title_table[i][0];
430
430
 
431
431
  if (s_type(c) == UNICODE_LOWERCASE_LETTER)
432
- return ATTTABLE(c >> 8, c & 0xff);
432
+ return unichar_toupper(c);
433
433
 
434
434
  return c;
435
435
  }
@@ -585,8 +585,7 @@ real_toupper_lithuanian(const char **p, unichar c, int type, char *buf,
585
585
 
586
586
  if (*was_i) {
587
587
  size_t len = remove_all_combining_dot_above(c, buf);
588
- return len + output_marks(p, (buf != NULL) ? buf + len : NULL,
589
- true);
588
+ return len + output_marks(p, OFFSET_IF(buf, len), true);
590
589
  }
591
590
 
592
591
  if (!s_ismark(type))
@@ -761,9 +760,48 @@ real_do_tolower(unichar c, int type, char *buf)
761
760
 
762
761
  /* {{{1
763
762
  * The real implementation of downcase.
764
- *
765
- * TODO: this needs a cleanup.
766
763
  */
764
+ static size_t
765
+ tolower_turkic_i(const char **p, char *buf)
766
+ {
767
+ unichar i = LATIN_SMALL_LETTER_DOTLESS_I;
768
+
769
+ if (utf_char(*p) == COMBINING_DOT_ABOVE) {
770
+ /* TODO: don’t we need to make sure we don’t go beyond the end
771
+ * of ‘p’? */
772
+ *p = utf_next(*p);
773
+ i = LATIN_SMALL_LETTER_I;
774
+ }
775
+
776
+ return unichar_to_utf(i, buf);
777
+ }
778
+
779
+ static size_t
780
+ tolower_lithuianian_i(char *buf, unichar base, unichar combiner)
781
+ {
782
+ size_t len = unichar_to_utf(base, buf);
783
+ len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
784
+ if (combiner != '\0')
785
+ len += unichar_to_utf(combiner, OFFSET_IF(buf, len));
786
+
787
+ return len;
788
+ }
789
+
790
+ static size_t
791
+ tolower_sigma(const char **p, char *buf, const char *end, bool use_end)
792
+ {
793
+ unichar sigma = GREEK_SMALL_LETTER_FINAL_SIGMA;
794
+
795
+ /* SIGMA maps differently depending on whether it is final or not. The
796
+ * following simplified test would fail in the case of combining marks
797
+ * following the sigma, but I don't think that occurs in real text.
798
+ * The test here matches that in ICU. */
799
+ if ((!use_end || *p < end) && **p != '\0' && s_isalpha(s_type(utf_char(*p))))
800
+ sigma = GREEK_SMALL_LETTER_SIGMA;
801
+
802
+ return unichar_to_utf(sigma, buf);
803
+ }
804
+
767
805
  static size_t
768
806
  real_tolower_one(const char **p, const char *prev, char *buf,
769
807
  LocaleType locale_type, const char *end, bool use_end)
@@ -771,70 +809,45 @@ real_tolower_one(const char **p, const char *prev, char *buf,
771
809
  unichar c = utf_char(prev);
772
810
  int type = s_type(c);
773
811
 
774
- if (locale_type == LOCALE_TURKIC && c == 'I') {
775
- if (utf_char(*p) == COMBINING_DOT_ABOVE) {
776
- /* TODO: don’t we need to make sure we don’t go beyond the end
777
- * of ‘p’? */
778
- *p = utf_next(*p);
779
- return unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
780
- }
812
+ if (locale_type == LOCALE_TURKIC && c == 'I')
813
+ return tolower_turkic_i(p, buf);
781
814
 
782
- return unichar_to_utf(LATIN_SMALL_LETTER_DOTLESS_I, buf);
783
- }
815
+ /* Introduce an explicit dot above the lowercasing capital I’s
816
+ * and J’s whenever there are more accents above.
817
+ * [SpecialCasing.txt] */
818
+ if (locale_type == LOCALE_LITHUANIAN) {
819
+ unichar base = LATIN_SMALL_LETTER_I;
820
+ unichar combiner = '\0';
784
821
 
785
- if (locale_type == LOCALE_LITHUANIAN &&
786
- (c == LATIN_CAPITAL_LETTER_I_WITH_GRAVE ||
787
- c == LATIN_CAPITAL_LETTER_I_WITH_ACUTE ||
788
- c == LATIN_CAPITAL_LETTER_I_WITH_TILDE)) {
789
- /* Introduce an explicit dot above the lowercasing capital I's
790
- * and J's whenever there are more accents above.
791
- * [SpecialCasing.txt] */
792
- size_t len = unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
793
- len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
794
822
  switch (c) {
795
823
  case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
796
- len += unichar_to_utf(COMBINING_GRAVE_ACCENT,
797
- OFFSET_IF(buf, len));
824
+ combiner = COMBINING_GRAVE_ACCENT;
798
825
  break;
799
826
  case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
800
- len += unichar_to_utf(COMBINING_ACUTE_ACCENT,
801
- OFFSET_IF(buf, len));
827
+ combiner = COMBINING_ACUTE_ACCENT;
802
828
  break;
803
829
  case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
804
- len += unichar_to_utf(COMBINING_TILDE,
805
- OFFSET_IF(buf, len));
830
+ combiner = COMBINING_TILDE;
806
831
  break;
807
- }
832
+ case 'I':
833
+ case 'J':
834
+ case LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
835
+ if (!has_more_above(*p))
836
+ goto no_lithuanian_i_casing;
808
837
 
809
- return len;
810
- }
838
+ base = unichar_tolower(c);
839
+ break;
840
+ default:
841
+ goto no_lithuanian_i_casing;
842
+ }
811
843
 
812
- if (locale_type == LOCALE_LITHUANIAN &&
813
- (c == 'I' || c == 'J' || c == LATIN_CAPITAL_LETTER_I_WITH_OGONEK) &&
814
- has_more_above(*p)) {
815
- size_t len = unichar_to_utf(unichar_tolower(c), buf);
816
- return len + unichar_to_utf(COMBINING_DOT_ABOVE,
817
- OFFSET_IF(buf, len));
844
+ return tolower_lithuianian_i(buf, base, combiner);
818
845
  }
819
846
 
820
- if (c == GREEK_CAPITAL_LETTER_SIGMA) {
821
- unichar tv = GREEK_SMALL_LETTER_FINAL_SIGMA;
822
-
823
- if ((!use_end || *p < end) && **p != '\0') {
824
- unichar next_c = utf_char(*p);
825
- int next_type = s_type(next_c);
847
+ no_lithuanian_i_casing:
826
848
 
827
- /* SIGMA maps differently depending on whether it is
828
- * final or not. The following simplified test would
829
- * fail in the case of combining marks following the
830
- * sigma, but I don't think that occurs in real text.
831
- * The test here matches that in ICU. */
832
- if (s_isalpha(next_type))
833
- tv = GREEK_SMALL_LETTER_SIGMA;
834
- }
835
-
836
- return unichar_to_utf(tv, buf);
837
- }
849
+ if (c == GREEK_CAPITAL_LETTER_SIGMA)
850
+ return tolower_sigma(p, buf, end, use_end);
838
851
 
839
852
  if (IS(type, OR(UNICODE_UPPERCASE_LETTER,
840
853
  OR(UNICODE_TITLECASE_LETTER, 0))))
@@ -879,7 +892,7 @@ utf_downcase_impl(const char *str, size_t max, bool use_max)
879
892
  size_t len = real_tolower(str, max, use_max, NULL, locale_type);
880
893
  char *result = ALLOC_N(char, len + 1);
881
894
  real_tolower(str, max, use_max, result, locale_type);
882
- result[len] = NUL;
895
+ result[len] = '\0';
883
896
 
884
897
  return result;
885
898
  }
@@ -915,28 +928,19 @@ utf_downcase_n(const char *str, size_t len)
915
928
  static bool
916
929
  casefold_table_lookup(unichar c, char *folded, size_t *len)
917
930
  {
918
- int begin = 0;
919
- int end = lengthof(casefold_table);
931
+ int index;
920
932
 
921
- if (c < casefold_table[begin].ch || c > casefold_table[end - 1].ch)
933
+ if (!unicode_table_lookup(casefold_table, c, &index))
922
934
  return false;
923
935
 
924
- while (true) {
925
- int mid = (begin + end) / 2;
926
-
927
- if (c == casefold_table[mid].ch) {
928
- if (folded != NULL)
929
- strcpy(folded, casefold_table[mid].data);
930
- *len += utf_byte_length(casefold_table[mid].data);
931
- return true;
932
- } else if (mid == begin) {
933
- return false;
934
- } else if (c > casefold_table[mid].ch) {
935
- begin = mid;
936
- } else {
937
- end = mid;
938
- }
939
- }
936
+ char const *folded_c = casefold_table[index].data;
937
+
938
+ if (folded != NULL)
939
+ strcpy(folded, folded_c);
940
+
941
+ *len += utf_byte_length(folded_c);
942
+
943
+ return true;
940
944
  }
941
945
 
942
946
  static char *
@@ -1037,24 +1041,15 @@ utf_width_n(const char *str, size_t len)
1037
1041
  bool
1038
1042
  unichar_mirror(unichar c, unichar *mirrored)
1039
1043
  {
1040
- int begin = 0;
1041
- int end = lengthof(bidi_mirroring_table);
1044
+ int index;
1042
1045
 
1043
- while (true) {
1044
- int mid = (begin + end) / 2;
1046
+ if (!unicode_table_lookup(bidi_mirroring_table, c, &index))
1047
+ return false;
1045
1048
 
1046
- if (c == bidi_mirroring_table[mid].ch) {
1047
- if (mirrored != NULL)
1048
- *mirrored = bidi_mirroring_table[mid].mirrored_ch;
1049
- return true;
1050
- } else if (mid == begin) {
1051
- return false;
1052
- } else if (c > bidi_mirroring_table[mid].ch) {
1053
- begin = mid;
1054
- } else {
1055
- end = mid;
1056
- }
1057
- }
1049
+ if (mirrored != NULL)
1050
+ *mirrored = bidi_mirroring_table[index].mirrored_ch;
1051
+
1052
+ return true;
1058
1053
  }
1059
1054
 
1060
1055
 
@@ -13,6 +13,7 @@
13
13
  #include <stdint.h>
14
14
  #include "unicode.h"
15
15
  #include "private.h"
16
+ #include "rb_private.h"
16
17
  #include "rb_methods.h"
17
18
 
18
19
  #endif /* RB_INCLUDES_H */
@@ -0,0 +1,52 @@
1
+ /*
2
+ * contents: Private Ruby-related functions.
3
+ *
4
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #ifndef RB_PRIVATE_H
8
+ #define RB_PRIVATE_H
9
+
10
+
11
+ void need_at_least_n_arguments(int argc, int n) HIDDEN;
12
+
13
+ unichar _utf_char_validated(char const *const str,
14
+ char const *const str_end) HIDDEN;
15
+ char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
16
+ const char *limit, bool noisy) HIDDEN;
17
+
18
+ char *_utf_offset_to_pointer_validated(const char *str, long offset,
19
+ const char *end) HIDDEN;
20
+
21
+ char *_utf_offset_to_pointer_failable(const char *str, long offset,
22
+ const char *end) HIDDEN;
23
+
24
+ VALUE rb_utf_new(const char *str, long len) HIDDEN;
25
+
26
+ VALUE rb_utf_new2(const char *str) HIDDEN;
27
+
28
+ VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
29
+
30
+ VALUE rb_utf_alloc_using(char *str) HIDDEN;
31
+
32
+ VALUE rb_utf_dup(VALUE str) HIDDEN;
33
+
34
+ long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
35
+
36
+ bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
37
+ char **limit) HIDDEN;
38
+
39
+ void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
40
+ char **limit) HIDDEN;
41
+
42
+ char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
43
+
44
+ VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
45
+
46
+ char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
47
+
48
+ long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
49
+ long offset, bool reverse) HIDDEN;
50
+
51
+
52
+ #endif /* RB_PRIVATE_H */
@@ -113,7 +113,8 @@ rb_utf_to_inum_num_separator(const char *str, const char *s, bool verify,
113
113
 
114
114
  if (*non_digit != 0)
115
115
  rb_raise(rb_eArgError,
116
- "unexpected ‘%lc’ found at position %ld", c, s - str);
116
+ "unexpected ‘%lc’ found at position %ld",
117
+ c, utf_pointer_to_offset(str, s));
117
118
 
118
119
  *non_digit = c;
119
120
 
@@ -135,7 +136,7 @@ rb_utf_to_inum_digit_value(const char *str, const char *s, unichar c,
135
136
  return false;
136
137
  rb_raise(rb_eArgError,
137
138
  "non-digit character ‘%lc’ found at position %ld",
138
- c, s - str);
139
+ c, utf_pointer_to_offset(str, s));
139
140
  }
140
141
 
141
142
  if (value >= base) {
@@ -144,7 +145,7 @@ rb_utf_to_inum_digit_value(const char *str, const char *s, unichar c,
144
145
 
145
146
  rb_raise(rb_eArgError,
146
147
  "value (%d) greater than base (%d) at position %ld",
147
- value, base, s - str);
148
+ value, base, utf_pointer_to_offset(str, s));
148
149
  }
149
150
 
150
151
  *digit_value = value;
@@ -181,7 +182,7 @@ rb_utf_to_inum_as_fix(const char *str, const char *s, int sign, int base,
181
182
  if (*s != '\0')
182
183
  rb_raise(rb_eArgError,
183
184
  "trailing garbage found at position %ld",
184
- s - str);
185
+ utf_pointer_to_offset(str, s));
185
186
  }
186
187
 
187
188
  if (POSFIXABLE(value)) {
@@ -221,7 +222,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
221
222
  if (verify)
222
223
  rb_raise(rb_eArgError,
223
224
  "extra sign ‘%c’ found at position %ld",
224
- *s, s - str);
225
+ *s, utf_pointer_to_offset(str, s));
225
226
  return INT2FIX(0);
226
227
  }
227
228
 
@@ -245,7 +246,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
245
246
  if (verify && *str == '_')
246
247
  rb_raise(rb_eArgError,
247
248
  "leading digit-separator ‘_’ found at position %ld",
248
- s - str);
249
+ utf_pointer_to_offset(str, s));
249
250
 
250
251
  bit_length = bit_length / BITSPERDIG + 1;
251
252
 
@@ -269,7 +270,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
269
270
 
270
271
  bool more_to_shift = true;
271
272
  while (more_to_shift) {
272
- BDIGIT_DBL num = c;
273
+ BDIGIT_DBL num = digit_value;
273
274
 
274
275
  for (int i = 0; i < big_len; i++) {
275
276
  num += (BDIGIT_DBL)zds[i] * base;
@@ -294,12 +295,12 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
294
295
  if (str + 1 < s && s[-1] == '_')
295
296
  rb_raise(rb_eArgError,
296
297
  "trailing digit-separator ‘_’ found at position %ld",
297
- s - str);
298
+ utf_pointer_to_offset(str, s));
298
299
 
299
300
  if (*s != '\0')
300
301
  rb_raise(rb_eArgError,
301
302
  "trailing garbage found at position %ld",
302
- s - str);
303
+ utf_pointer_to_offset(str, s));
303
304
 
304
305
  return rb_big_norm(z);
305
306
  }
@@ -0,0 +1,38 @@
1
+ /*
2
+ * contents: Functions for dealing with Unicode tables.
3
+ *
4
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #ifndef TABLES_H
8
+ #define TABLES_H
9
+
10
+
11
+ /*
12
+ static inline int
13
+ split_unicode_table_lookup_page(const uint8_t data[][256], int16_t page, unichar c)
14
+ {
15
+ return (page >= UNICODE_MAX_TABLE_INDEX) ?
16
+ page - UNICODE_MAX_TABLE_INDEX :
17
+ data[page][c & 0xff];
18
+ }
19
+
20
+ static inline int
21
+ split_unicode_table_lookup(const uint8_t data[][256], const int16_t part1[], const int16_t part2[], unichar c, int fallback)
22
+ {
23
+ if (c <= UNICODE_LAST_CHAR_PART1)
24
+ return split_unicode_table_lookup_page(data,
25
+ part1[c >> 8],
26
+ c);
27
+
28
+ if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR)
29
+ return split_unicode_table_lookup_page(data,
30
+ part2[(c - UNICODE_FIRST_CHAR_PART2) >> 8],
31
+ c);
32
+
33
+ return fallback;
34
+ }
35
+ */
36
+
37
+
38
+ #endif /* TABLES_H */
@@ -13,6 +13,7 @@
13
13
  #include <limits.h>
14
14
  #include "unicode.h"
15
15
  #include "private.h"
16
+ #include "rb_private.h"
16
17
  #include "rb_methods.h"
17
18
 
18
19
  static VALUE mUTF8Methods;
@@ -85,7 +86,6 @@ _utf_offset_to_pointer_validated_impl(const char *str, long offset,
85
86
  saved_offset);
86
87
  else
87
88
  return NULL;
88
- break;
89
89
  }
90
90
 
91
91
  offset += utf_pointer_to_offset(p, base);
@@ -190,6 +190,9 @@ utf_char(const char *str)
190
190
  unichar
191
191
  utf_char_n(const char *str, size_t max)
192
192
  {
193
+ if (max == 0)
194
+ return UTF_INCOMPLETE_INPUT_UNICHAR;
195
+
193
196
  size_t len;
194
197
  unichar c = (unsigned char)*str;
195
198
 
@@ -454,7 +457,7 @@ utf_collate(const char *a, const char *b)
454
457
 
455
458
  unichar *a_norm = _utf_normalize_wc(a, 0, false, NORMALIZE_ALL_COMPOSE);
456
459
  unichar *b_norm = _utf_normalize_wc(b, 0, false, NORMALIZE_ALL_COMPOSE);
457
- setlocale(LC_COLLATE, "");
460
+
458
461
  int result = wcscoll((wchar_t *)a_norm, (wchar_t *)b_norm);
459
462
 
460
463
  free(a_norm);
@@ -518,7 +521,6 @@ utf_collate_key_impl(const char *str, size_t len, bool use_len)
518
521
  assert(str != NULL);
519
522
 
520
523
  unichar *str_norm = _utf_normalize_wc(str, len, use_len, NORMALIZE_ALL_COMPOSE);
521
- setlocale(LC_COLLATE, "");
522
524
  size_t xfrm_len = wcsxfrm(NULL, (wchar_t *)str_norm, 0);
523
525
  wchar_t result_wc[xfrm_len + 1];
524
526
  wcsxfrm(result_wc, (wchar_t *)str_norm, xfrm_len + 1);
@@ -863,7 +865,7 @@ utf_reverse_impl(const char *str, size_t len, bool use_len)
863
865
  char *result = ALLOC_N(char, len + 1);
864
866
  char *r = result + len;
865
867
  const char *p = str;
866
- while (*p != NUL) {
868
+ while (r > result) {
867
869
  uint8_t skip = s_utf_skip_lengths[*(unsigned char *)p];
868
870
  r -= skip;
869
871
  for (char *m = r; skip > 0; skip--)
data/tests/case.rb ADDED
@@ -0,0 +1,102 @@
1
+ # contents: Tests for String#upcase and String#downcase.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'tests/unicodedatatestbase'
6
+ require 'encoding/character/utf-8'
7
+
8
+ class TC_StringCase < Test::Unit::TestCase
9
+ include UnicodeDataTestBase
10
+
11
+ Code, Name, Category, _, _, _, _, _, _, _, _, _, Upper, Lower, Title = (0..14).to_a
12
+ CasingCode, CasingLower, CasingTitle, CasingUpper, CasingCondition = (0..4).to_a
13
+
14
+ def test_upcase_and_downcase
15
+ # TODO: Do it like this. First read in SpecialCasing.txt and set up lookup
16
+ # tables for all the characters that need special casing. Then, iterate
17
+ # over UnicodeData and simply check that the correct casings are performed,
18
+ # looking up data in the tables for special casing if no simple casing
19
+ # information is available (and skipping when appropriate - such as when
20
+ # there is some condition defined for the special casing).
21
+ special = Struct.new(:conditions, :upper, :lower, :title).new({}, [], [], [])
22
+ open_data_file('SpecialCasing.txt') do |file|
23
+ i = 0
24
+ file.each_line do |line|
25
+ i += 1
26
+ next if line =~ /^(#|\s*$)/
27
+ fields = line.sub(/\s*#.*$/, "").split('; ')
28
+ unless fields.size == 4 or fields.size == 5
29
+ raise "#{line}: Wrong number of fields; #{field.size} instead of 4 or 5."
30
+ end
31
+ code = fields[CasingCode].hex
32
+ special.conditions[code] = fields[CasingCondition] if fields.size == 5
33
+ special.upper[code] = utfify(fields[CasingUpper])
34
+ special.lower[code] = utfify(fields[CasingLower])
35
+ special.title[code] = utfify(fields[CasingTitle])
36
+ end
37
+ end
38
+
39
+ open_data_file('UnicodeData.txt') do |file|
40
+ i = 0
41
+ prev_code = -1
42
+ file.each_line do |line|
43
+ i += 1
44
+ next if line =~ /^(#|\s*$)/
45
+ fields = line.split(';')
46
+ raise "#{line}: Wrong number of fields; #{field.size} instead of 15." unless fields.size == 15
47
+ code = fields[Code].hex
48
+ if code > prev_code + 1 and fields[Name] =~ /Last>$/ and fields[Category] =~ /^L[lut]$/
49
+ prev_code.upto(code - 1){ |c| test_one c, fields, special }
50
+ end
51
+ test_one code, fields, special
52
+ prev_code = code
53
+ end
54
+ end
55
+ puts @i
56
+ end
57
+
58
+ private
59
+
60
+ def utfify(codepoints)
61
+ return codepoints if codepoints == ""
62
+ codepoints.split(' ').map{ |cp| cp.hex }.pack('U*')
63
+ end
64
+
65
+ def utfone(codepoint)
66
+ u([codepoint].pack('U*'))
67
+ end
68
+
69
+ def test_one(code, fields, special)
70
+ @i ||= 0
71
+ @i += 1
72
+ case fields[Category]
73
+ when 'Ll'
74
+ test_upcase(code, fields, special)
75
+ when 'Lu'
76
+ test_downcase(code, fields, special)
77
+ when 'Lt'
78
+ test_upcase(code, fields, special)
79
+ test_downcase(code, fields, special)
80
+ end
81
+ end
82
+
83
+ def test_upcase(code, fields, special)
84
+ if special.upper[code]
85
+ if not special.conditions[code]
86
+ assert_equal(special.upper[code], utfone(code).upcase)
87
+ end
88
+ elsif not fields[Upper].empty?
89
+ assert_equal(utfify(fields[Upper]), utfone(code).upcase)
90
+ end
91
+ end
92
+
93
+ def test_downcase(code, fields, special)
94
+ if special.lower[code]
95
+ if not special.conditions[code]
96
+ assert_equal(special.lower[code], utfone(code).downcase)
97
+ end
98
+ elsif not fields[Lower].empty?
99
+ assert_equal(utfify(fields[Lower]), utfone(code).downcase)
100
+ end
101
+ end
102
+ end
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.0
2
+ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: character-encodings
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.0
7
- date: 2006-07-27 00:00:00 +02:00
6
+ version: 0.3.0
7
+ date: 2007-11-22 00:00:00 +01:00
8
8
  summary: A pluggable character-encoding library
9
9
  require_paths:
10
10
  - lib
@@ -48,6 +48,7 @@ files:
48
48
  - ext/encoding/character/unicode/codepoint.c
49
49
  - ext/encoding/character/utf-8/break.c
50
50
  - ext/encoding/character/utf-8/decompose.c
51
+ - ext/encoding/character/utf-8/private.c
51
52
  - ext/encoding/character/utf-8/properties.c
52
53
  - ext/encoding/character/utf-8/rb_utf_aref.c
53
54
  - ext/encoding/character/utf-8/rb_utf_aset.c
@@ -80,19 +81,22 @@ files:
80
81
  - ext/encoding/character/utf-8/unicode.c
81
82
  - ext/encoding/character/utf-8/utf.c
82
83
  - ext/encoding/character/utf-8/rb_utf_internal_bignum.c
84
+ - ext/encoding/character/utf-8/data/break.h
85
+ - ext/encoding/character/utf-8/data/character-tables.h
86
+ - ext/encoding/character/utf-8/data/compose.h
87
+ - ext/encoding/character/utf-8/data/decompose.h
83
88
  - ext/encoding/character/utf-8/private.h
84
89
  - ext/encoding/character/utf-8/rb_includes.h
85
90
  - ext/encoding/character/utf-8/rb_methods.h
91
+ - ext/encoding/character/utf-8/rb_private.h
86
92
  - ext/encoding/character/utf-8/rb_utf_internal_tr.h
93
+ - ext/encoding/character/utf-8/tables.h
87
94
  - ext/encoding/character/utf-8/unicode.h
88
95
  - ext/encoding/character/utf-8/rb_utf_internal_bignum.h
89
- - ext/encoding/character/utf-8/data/break.h
90
- - ext/encoding/character/utf-8/data/character-tables.h
91
- - ext/encoding/character/utf-8/data/compose.h
92
- - ext/encoding/character/utf-8/data/decompose.h
93
- - ext/encoding/character/utf-8/extconf.rb
94
96
  - ext/encoding/character/utf-8/data/generate-unicode-data.rb
97
+ - ext/encoding/character/utf-8/extconf.rb
95
98
  - ext/encoding/character/utf-8/depend
99
+ - tests/case.rb
96
100
  - tests/foldcase.rb
97
101
  - tests/normalize.rb
98
102
  - tests/unicodedatatestbase.rb