character-encodings 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -10,7 +10,7 @@ require 'rake/testtask'
10
10
  require 'spec/rake/spectask'
11
11
 
12
12
  PackageName = 'character-encodings'
13
- PackageVersion = '0.2.0'
13
+ PackageVersion = '0.3.0'
14
14
 
15
15
  desc 'Default task'
16
16
  task :default => [:extensions]
@@ -92,7 +92,7 @@ Spec::Rake::SpecTask.new do |t|
92
92
  end
93
93
 
94
94
  Tests = [
95
- ['tests/foldcase.rb'],
95
+ ['tests/foldcase.rb', 'tests/case.rb'],
96
96
  ['tests/normalize.rb']
97
97
  ]
98
98
 
@@ -9,30 +9,17 @@
9
9
  #include <stdint.h>
10
10
  #include "unicode.h"
11
11
  #include "data/break.h"
12
-
12
+ #include "private.h"
13
13
 
14
14
  /* Figure out what break type the Unicode character ‘c’ possesses, if any.
15
15
  * This information is used for finding word and line boundaries, which is
16
16
  * useful when displaying Unicode text on screen. */
17
- static UnicodeBreakType
18
- break_type(const int16_t table[], unsigned int page, unichar c)
19
- {
20
- int16_t break_property = table[page];
21
-
22
- return (break_property >= UNICODE_MAX_TABLE_INDEX) ?
23
- break_property - UNICODE_MAX_TABLE_INDEX :
24
- break_property_data[break_property][c & 0xff];
25
- }
26
-
27
17
  UnicodeBreakType
28
18
  unichar_break_type(unichar c)
29
19
  {
30
- if (c <= UNICODE_LAST_CHAR_PART1)
31
- return break_type(break_property_table_part1, c >> 8, c);
32
-
33
- if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR)
34
- return break_type(break_property_table_part2,
35
- (c - 0xe0000) >> 8, c);
36
-
37
- return UNICODE_BREAK_UNKNOWN;
20
+ return SPLIT_UNICODE_TABLE_LOOKUP(break_property_data,
21
+ break_property_table_part1,
22
+ break_property_table_part2,
23
+ c,
24
+ UNICODE_BREAK_UNKNOWN);
38
25
  }
@@ -13,6 +13,8 @@
13
13
 
14
14
  #define UNICODE_LAST_PAGE_PART1 762
15
15
 
16
+ #define UNICODE_FIRST_CHAR_PART2 0xe0000
17
+
16
18
  #define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
17
19
 
18
20
 
@@ -10,6 +10,7 @@
10
10
 
11
11
  #define UNICODE_LAST_CHAR_PART1 0x2faff
12
12
  #define UNICODE_LAST_PAGE_PART1 762
13
+ #define UNICODE_FIRST_CHAR_PART2 0xe0000
13
14
 
14
15
  #define UNICODE_NOT_PRESENT_OFFSET 65535
15
16
 
@@ -534,6 +534,8 @@ private
534
534
 
535
535
  #define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
536
536
 
537
+ #define UNICODE_FIRST_CHAR_PART2 0xe0000
538
+
537
539
  #define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
538
540
  EOF
539
541
  print_table(data, 0, @last_char_part1_i, data.last, 1,
@@ -694,8 +696,11 @@ EOF
694
696
  #define UNICODE_MAX_TABLE_INDEX (0x110000 / 256)
695
697
 
696
698
  #define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}
699
+
697
700
  #define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
698
701
 
702
+ #define UNICODE_FIRST_CHAR_PART2 0xe0000
703
+
699
704
  #define UNICODE_NOT_PRESENT_OFFSET #{NOT_PRESENT_OFFSET}
700
705
  EOF
701
706
  print_table(data, 0, @last_char_part1_i, data.last, 1,
@@ -8,35 +8,16 @@
8
8
  #include <stdbool.h>
9
9
  #include <stddef.h>
10
10
  #include <stdint.h>
11
+
11
12
  #include "unicode.h"
12
- #include "private.h"
13
+
13
14
  #include "data/decompose.h"
14
15
  #include "data/compose.h"
15
16
 
17
+ #include "private.h"
16
18
 
17
- /* {{{1
18
- * Macros for accessing the combining class property tables for a given
19
- * character.
20
- *
21
- * TODO: Turn these macros into full-fledged functions, as this is rather silly
22
- * when we have ‹inline› in C99.
23
- */
24
- #define CC_PART1(page, char) \
25
- ((combining_class_table_part1[page] >= UNICODE_MAX_TABLE_INDEX) \
26
- ? (combining_class_table_part1[page] - UNICODE_MAX_TABLE_INDEX) \
27
- : (cclass_data[combining_class_table_part1[page]][char]))
28
-
29
- #define CC_PART2(page, char) \
30
- ((combining_class_table_part2[page] >= UNICODE_MAX_TABLE_INDEX) \
31
- ? (combining_class_table_part2[page] - UNICODE_MAX_TABLE_INDEX) \
32
- : (cclass_data[combining_class_table_part2[page]][char]))
33
-
34
- #define COMBINING_CLASS(char) \
35
- (((char) <= UNICODE_LAST_CHAR_PART1) \
36
- ? CC_PART1((char) >> 8, (char) & 0xff) \
37
- : (((char) >= 0xe0000 && (char) <= UNICODE_LAST_CHAR) \
38
- ? CC_PART2(((char) - 0xe0000) >> 8, (char) & 0xff) \
39
- : 0))
19
+ #define COMBINING_CLASS(c) \
20
+ SPLIT_UNICODE_TABLE_LOOKUP(cclass_data, combining_class_table_part1, combining_class_table_part2, (c), 0)
40
21
 
41
22
 
42
23
  /* {{{1
@@ -115,12 +96,11 @@ unicode_canonical_ordering(unichar *str, size_t len)
115
96
  /* {{{1
116
97
  * Decompose the character ‘s’ according to the rules outlined in
117
98
  * http://www.unicode.org/unicode/reports/tr15/#Hangul. ‘r’ should be ‹NULL›
118
- * or of sufficient length to store the decomposition of ‘s’. The number of
119
- * characters stored (or would be if it were non-‹NULL) in ‘r’ is put in
120
- * ‘r_len’.
99
+ * or of sufficient length to store the decomposition of ‘s’. Returns the
100
+ * number of characters stored (or that would be stored if it were non-‹NULL›) in ‘r’.
121
101
  */
122
- static void
123
- decompose_hangul(unichar s, unichar *r, size_t *r_len)
102
+ static size_t
103
+ decompose_hangul(unichar s, unichar *r)
124
104
  {
125
105
  int SIndex = s - SBase;
126
106
 
@@ -128,8 +108,7 @@ decompose_hangul(unichar s, unichar *r, size_t *r_len)
128
108
  if (SIndex < 0 || SIndex >= SCount) {
129
109
  if (r != NULL)
130
110
  r[0] = s;
131
- *r_len = 1;
132
- return;
111
+ return 1;
133
112
  }
134
113
 
135
114
  unichar L = LBase + SIndex / NCount;
@@ -141,13 +120,13 @@ decompose_hangul(unichar s, unichar *r, size_t *r_len)
141
120
  r[1] = V;
142
121
  }
143
122
 
144
- if (T != TBase) {
145
- if (r != NULL)
146
- r[2] = T;
147
- *r_len = 3;
148
- } else {
149
- *r_len = 2;
150
- }
123
+ if (T == TBase)
124
+ return 2;
125
+
126
+ if (r != NULL)
127
+ r[2] = T;
128
+
129
+ return 3;
151
130
  }
152
131
 
153
132
 
@@ -179,26 +158,27 @@ get_decomposition(int index, bool compat)
179
158
  static const char *
180
159
  find_decomposition(unichar c, bool compat)
181
160
  {
182
- int begin = 0;
183
- int end = lengthof(decomp_table);
161
+ int index;
184
162
 
185
- if (c < decomp_table[begin].ch || c > decomp_table[end - 1].ch)
163
+ if (!unicode_table_lookup(decomp_table, c, &index))
186
164
  return NULL;
187
165
 
188
- while (true) {
189
- int middle = (begin + end) / 2;
166
+ return get_decomposition(index, compat);
167
+ }
190
168
 
191
- if (c == decomp_table[middle].ch)
192
- return get_decomposition(middle, compat);
193
- else if (middle == begin)
194
- break;
195
- else if (c > decomp_table[middle].ch)
196
- begin = middle;
197
- else
198
- end = middle;
199
- }
200
169
 
201
- return NULL;
170
+ /* {{{1
171
+ * Copy over the UTF-8 decomposition in ‘decomposition’ to the unichar buffer
172
+ * ‘chars’. Return the number of unichars in ‘chars’.
173
+ */
174
+ static size_t
175
+ decomposition_to_wc(const char *decomposition, unichar *chars)
176
+ {
177
+ size_t i = 0;
178
+ for (const char *p = decomposition; *p != '\0'; p = utf_next(p))
179
+ chars[i++] = utf_char(p);
180
+
181
+ return i;
202
182
  }
203
183
 
204
184
 
@@ -215,17 +195,13 @@ unicode_canonical_decomposition(unichar c, size_t *len)
215
195
 
216
196
  /* Hangul syllable */
217
197
  if (c >= SBase && c <= SLast) {
218
- decompose_hangul(c, NULL, len);
198
+ *len = decompose_hangul(c, NULL);
219
199
  r = ALLOC_N(unichar, *len);
220
- decompose_hangul(c, r, len);
200
+ decompose_hangul(c, r);
221
201
  } else if ((decomp = find_decomposition(c, false)) != NULL) {
222
202
  *len = utf_length(decomp);
223
203
  r = ALLOC_N(unichar, *len);
224
-
225
- int i;
226
- const char *p;
227
- for (p = decomp, i = 0; *p != NUL; p = utf_next(p), i++)
228
- r[i] = utf_char(p);
204
+ decomposition_to_wc(decomp, r);
229
205
  } else {
230
206
  r = ALLOC(unichar);
231
207
  *r = c;
@@ -281,23 +257,19 @@ compose_index(unichar c)
281
257
  if (page > COMPOSE_TABLE_LAST)
282
258
  return 0;
283
259
 
284
- /* TODO: why is this signed, exactly? */
285
- int16_t compose_offset = compose_table[page];
286
- return (compose_offset >= UNICODE_MAX_TABLE_INDEX) ?
287
- compose_offset - UNICODE_MAX_TABLE_INDEX :
288
- compose_data[compose_offset][c & 0xff];
260
+ return SPLIT_UNICODE_TABLE_LOOKUP_PAGE(compose_data, compose_table, page, c);
289
261
  }
290
262
 
291
263
  static bool
292
264
  lookup_compose(const uint16_t table[][2], uint16_t index, unichar c,
293
265
  unichar *result)
294
266
  {
295
- if (c == table[index][0]) {
296
- *result = table[index][1];
297
- return true;
298
- }
267
+ if (c != table[index][0])
268
+ return false;
299
269
 
300
- return false;
270
+ *result = table[index][1];
271
+
272
+ return true;
301
273
  }
302
274
 
303
275
  static bool
@@ -307,21 +279,18 @@ combine(unichar a, unichar b, unichar *result)
307
279
  return true;
308
280
 
309
281
  uint16_t index_a = compose_index(a);
310
- if (index_a >= COMPOSE_FIRST_SINGLE_START &&
311
- index_a < COMPOSE_SECOND_START) {
282
+ if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
312
283
  return lookup_compose(compose_first_single,
313
284
  index_a - COMPOSE_FIRST_SINGLE_START,
314
285
  b,
315
286
  result);
316
- }
317
287
 
318
288
  uint16_t index_b = compose_index(b);
319
- if (index_b >= COMPOSE_SECOND_SINGLE_START) {
289
+ if (index_b >= COMPOSE_SECOND_SINGLE_START)
320
290
  return lookup_compose(compose_second_single,
321
291
  index_b - COMPOSE_SECOND_SINGLE_START,
322
292
  a,
323
293
  result);
324
- }
325
294
 
326
295
  if (index_a >= COMPOSE_FIRST_START &&
327
296
  index_a < COMPOSE_FIRST_SINGLE_START &&
@@ -356,12 +325,8 @@ normalize_wc_decompose_one(unichar c, NormalizeMode mode, unichar *buf)
356
325
  return 1;
357
326
  }
358
327
 
359
- if (buf != NULL) {
360
- int i;
361
- for (i = 0; *decomp != NUL; decomp = utf_next(decomp), i++)
362
- buf[i] = utf_char(decomp);
363
- return i;
364
- }
328
+ if (buf != NULL)
329
+ return decomposition_to_wc(decomp, buf);
365
330
 
366
331
  return utf_length(decomp);
367
332
  }
@@ -378,14 +343,10 @@ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
378
343
  size_t prev_n = n;
379
344
 
380
345
  unichar *base = (buf != NULL) ? buf + n : NULL;
381
- if (c >= SBase && c <= SLast) {
382
- size_t len;
383
-
384
- decompose_hangul(c, base, &len);
385
- n += len;
386
- } else {
346
+ if (c >= SBase && c <= SLast)
347
+ n += decompose_hangul(c, base);
348
+ else
387
349
  n += normalize_wc_decompose_one(c, mode, base);
388
- }
389
350
 
390
351
  if (buf != NULL && n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
391
352
  unicode_canonical_ordering(buf + prev_start,
@@ -403,44 +364,51 @@ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
403
364
  *buf_len = n;
404
365
  }
405
366
 
406
- unichar *
407
- _utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
367
+ static unichar *
368
+ normalize_wc_compose(unichar *buf, size_t len)
408
369
  {
409
- size_t n;
410
- normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
411
- unichar *buf = ALLOC_N(unichar, n + 1);
412
- normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
413
-
414
- /* Just return if we don’t want composition. */
415
- if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
416
- return buf;
417
-
370
+ int new_len = len;
418
371
  size_t prev_start = 0;
419
372
  int prev_cc = 0;
420
- for (size_t i = 0; i < n; i++) {
373
+
374
+ for (size_t i = 0; i < len; i++) {
421
375
  int cc = COMBINING_CLASS(buf[i]);
376
+ size_t j = i - (len - new_len);
422
377
 
423
- if (i > 0 && (prev_cc == 0 || prev_cc < cc) &&
378
+ if (j > 0 && (prev_cc == 0 || prev_cc < cc) &&
424
379
  combine(buf[prev_start], buf[i], &buf[prev_start])) {
425
- for (size_t j = i + 1; j < n; j++)
426
- buf[j - 1] = buf[j];
427
-
428
- n--;
429
- i--;
430
- prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]);
380
+ new_len--;
381
+ prev_cc = (j + 1 == prev_start) ?
382
+ 0 : COMBINING_CLASS(buf[j - 1]);
431
383
  } else {
432
384
  if (cc == 0)
433
- prev_start = i;
385
+ prev_start = j;
434
386
 
387
+ buf[j] = buf[i];
435
388
  prev_cc = cc;
436
389
  }
437
390
  }
438
391
 
439
- buf[n] = NUL;
392
+ buf[new_len] = NUL;
440
393
 
441
394
  return buf;
442
395
  }
443
396
 
397
+ unichar *
398
+ _utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
399
+ {
400
+ size_t n;
401
+ normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
402
+ unichar *buf = ALLOC_N(unichar, n + 1);
403
+ normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
404
+
405
+ /* Just return if we don’t want composition. */
406
+ if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
407
+ return buf;
408
+
409
+ return normalize_wc_compose(buf, n);
410
+ }
411
+
444
412
 
445
413
  /* {{{1
446
414
  * Normalize (compose/decompose) characters in ‘str’ so that strings that
@@ -1,6 +1,7 @@
1
1
  break.o: break.c unicode.h data/break.h
2
2
  decompose.o: decompose.c unicode.h private.h data/decompose.h \
3
3
  data/compose.h
4
+ private.o: private.c private.h
4
5
  properties.o: properties.c unicode.h private.h data/character-tables.h
5
6
  rb_utf_aref.o: rb_utf_aref.c rb_includes.h unicode.h private.h \
6
7
  rb_methods.h
@@ -12,6 +12,7 @@ def try_compiler_option(opt, &b)
12
12
  end
13
13
 
14
14
  try_compiler_option('-std=c99')
15
+ try_compiler_option('-finline-functions')
15
16
  try_compiler_option('-Wall')
16
17
  try_compiler_option('-Wextra')
17
18
  try_compiler_option('-Wwrite-strings')
@@ -23,6 +24,7 @@ try_compiler_option('-Wundef')
23
24
  try_compiler_option('-Wpointer-arith')
24
25
  try_compiler_option('-Wcast-align')
25
26
  try_compiler_option('-Werror')
27
+ try_compiler_option('-Winline')
26
28
  # XXX: sadly, -Wshadow is a bit too strict. It will, for example, whine about
27
29
  # local variables called “index” on FreeBSD.
28
30
  # try_compiler_option('-Wshadow')
@@ -0,0 +1,62 @@
1
+ /*
2
+ * contents: Private functions used by the UTF-8 character-encoding library.
3
+ *
4
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include <ruby.h>
8
+ #include <stdbool.h>
9
+ #include <stddef.h>
10
+ #include <stdint.h>
11
+ #include <stdlib.h>
12
+
13
+ #include "unicode.h"
14
+
15
+ #include "private.h"
16
+
17
+ /* Look up C in the sorted TABLE using binary search. TABLE consists of N
18
+ * entries, where each entry is SIZEOF_ENTRY bytes in size and the first
19
+ * component is a unichar of size SIZEOF_CHAR. If C is found in TABLE, its
20
+ * index is stored in INDEX and true is returned. Otherwise, false is returned
21
+ * and INDEX is left untouched. */
22
+ bool
23
+ binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index)
24
+ {
25
+ #define ENTRY(index) ((unichar)(*(unichar *)((const char *)table + ((index) * sizeof_entry))) & char_mask)
26
+
27
+ int begin = 0;
28
+ int end = n - 1;
29
+ int middle;
30
+
31
+ /* This is ugly, but not all tables use unichars as their lookup
32
+ * character. The casefold table, for example, uses uint16_t-sized
33
+ * characters. To only get the interesting part of our table entry
34
+ * we’ll have to mask the retrieved value. */
35
+ int char_mask = (1 << (8 * sizeof_char)) - 1;
36
+
37
+ /* Drop out early if we know for certain that C can’t be in the
38
+ * sorted TABLE. */
39
+ if (c < ENTRY(0) || c > ENTRY(end))
40
+ return false;
41
+
42
+ while (begin <= end) {
43
+ middle = binary_search_middle_of(begin, end);
44
+
45
+ unichar probe = ENTRY(middle);
46
+ if (c < probe)
47
+ end = middle - 1;
48
+ else if (c > probe)
49
+ begin = middle + 1;
50
+ else
51
+ break;
52
+ }
53
+
54
+ if (begin > end)
55
+ return false;
56
+
57
+ *index = middle;
58
+
59
+ return true;
60
+
61
+ #undef ENTRY
62
+ }
@@ -21,48 +21,28 @@
21
21
  # define HIDDEN(u)
22
22
  #endif
23
23
 
24
- unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
25
- NormalizeMode mode) HIDDEN;
26
- inline int _unichar_combining_class(unichar c) HIDDEN;
27
-
28
- void need_at_least_n_arguments(int argc, int n) HIDDEN;
29
-
30
- unichar _utf_char_validated(char const *const str,
31
- char const *const str_end) HIDDEN;
32
- char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
33
- const char *limit, bool noisy) HIDDEN;
34
-
35
- char *_utf_offset_to_pointer_validated(const char *str, long offset,
36
- const char *end) HIDDEN;
37
-
38
- char *_utf_offset_to_pointer_failable(const char *str, long offset,
39
- const char *end) HIDDEN;
40
-
41
- VALUE rb_utf_new(const char *str, long len) HIDDEN;
42
-
43
- VALUE rb_utf_new2(const char *str) HIDDEN;
24
+ #define binary_search_middle_of(begin, end) \
25
+ (((unsigned)((begin) + (end))) >> 1)
44
26
 
45
- VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
27
+ #define unicode_table_lookup(table, c, index) \
28
+ binary_search_unicode_table(table, lengthof(table), sizeof((table)[0]), sizeof((table)[0].ch), c, index)
46
29
 
47
- VALUE rb_utf_alloc_using(char *str) HIDDEN;
30
+ bool binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index) HIDDEN;
48
31
 
49
- VALUE rb_utf_dup(VALUE str) HIDDEN;
32
+ #define SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part, page, c) \
33
+ ((part[page] >= UNICODE_MAX_TABLE_INDEX) \
34
+ ? (part[page] - UNICODE_MAX_TABLE_INDEX) \
35
+ : (data[part[page]][(c) & 0xff]))
50
36
 
51
- long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
37
+ #define SPLIT_UNICODE_TABLE_LOOKUP(data, part1, part2, c, fallback) \
38
+ (((c) <= UNICODE_LAST_CHAR_PART1) \
39
+ ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part1, (c) >> 8, c) \
40
+ : (((c) >= UNICODE_FIRST_CHAR_PART2 && (c) <= UNICODE_LAST_CHAR) \
41
+ ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part2, ((c) - UNICODE_FIRST_CHAR_PART2) >> 8, c) \
42
+ : (fallback)))
52
43
 
53
- bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
54
- char **limit) HIDDEN;
55
-
56
- void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
57
- char **limit) HIDDEN;
58
-
59
- char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
60
-
61
- VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
62
-
63
- char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
64
-
65
- long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
66
- long offset, bool reverse) HIDDEN;
44
+ unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
45
+ NormalizeMode mode) HIDDEN;
46
+ int _unichar_combining_class(unichar c) HIDDEN;
67
47
 
68
48
  #endif /* PRIVATE_H */
@@ -63,8 +63,8 @@ s_type(unichar c)
63
63
  if (c <= UNICODE_LAST_CHAR_PART1) {
64
64
  page = c >> 8;
65
65
  table = type_table_part1;
66
- } else if (c >= 0xe0000 && c <= UNICODE_LAST_CHAR) {
67
- page = (c - 0xe0000) >> 8;
66
+ } else if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR) {
67
+ page = (c - UNICODE_FIRST_CHAR_PART2) >> 8;
68
68
  table = type_table_part2;
69
69
  } else {
70
70
  return UNICODE_UNASSIGNED;
@@ -364,8 +364,8 @@ special_case_table_lookup(unichar c)
364
364
  unichar tv = ATTTABLE(c >> 8, c & 0xff);
365
365
 
366
366
  if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
367
- return utf_char(special_case_table +
368
- tv - UNICODE_SPECIAL_CASE_TABLE_START);
367
+ tv = utf_char(special_case_table +
368
+ tv - UNICODE_SPECIAL_CASE_TABLE_START);
369
369
 
370
370
  if (tv == '\0')
371
371
  return c;
@@ -429,7 +429,7 @@ unichar_totitle(unichar c)
429
429
  return title_table[i][0];
430
430
 
431
431
  if (s_type(c) == UNICODE_LOWERCASE_LETTER)
432
- return ATTTABLE(c >> 8, c & 0xff);
432
+ return unichar_toupper(c);
433
433
 
434
434
  return c;
435
435
  }
@@ -585,8 +585,7 @@ real_toupper_lithuanian(const char **p, unichar c, int type, char *buf,
585
585
 
586
586
  if (*was_i) {
587
587
  size_t len = remove_all_combining_dot_above(c, buf);
588
- return len + output_marks(p, (buf != NULL) ? buf + len : NULL,
589
- true);
588
+ return len + output_marks(p, OFFSET_IF(buf, len), true);
590
589
  }
591
590
 
592
591
  if (!s_ismark(type))
@@ -761,9 +760,48 @@ real_do_tolower(unichar c, int type, char *buf)
761
760
 
762
761
  /* {{{1
763
762
  * The real implementation of downcase.
764
- *
765
- * TODO: this needs a cleanup.
766
763
  */
764
+ static size_t
765
+ tolower_turkic_i(const char **p, char *buf)
766
+ {
767
+ unichar i = LATIN_SMALL_LETTER_DOTLESS_I;
768
+
769
+ if (utf_char(*p) == COMBINING_DOT_ABOVE) {
770
+ /* TODO: don’t we need to make sure we don’t go beyond the end
771
+ * of ‘p’? */
772
+ *p = utf_next(*p);
773
+ i = LATIN_SMALL_LETTER_I;
774
+ }
775
+
776
+ return unichar_to_utf(i, buf);
777
+ }
778
+
779
+ static size_t
780
+ tolower_lithuianian_i(char *buf, unichar base, unichar combiner)
781
+ {
782
+ size_t len = unichar_to_utf(base, buf);
783
+ len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
784
+ if (combiner != '\0')
785
+ len += unichar_to_utf(combiner, OFFSET_IF(buf, len));
786
+
787
+ return len;
788
+ }
789
+
790
+ static size_t
791
+ tolower_sigma(const char **p, char *buf, const char *end, bool use_end)
792
+ {
793
+ unichar sigma = GREEK_SMALL_LETTER_FINAL_SIGMA;
794
+
795
+ /* SIGMA maps differently depending on whether it is final or not. The
796
+ * following simplified test would fail in the case of combining marks
797
+ * following the sigma, but I don't think that occurs in real text.
798
+ * The test here matches that in ICU. */
799
+ if ((!use_end || *p < end) && **p != '\0' && s_isalpha(s_type(utf_char(*p))))
800
+ sigma = GREEK_SMALL_LETTER_SIGMA;
801
+
802
+ return unichar_to_utf(sigma, buf);
803
+ }
804
+
767
805
  static size_t
768
806
  real_tolower_one(const char **p, const char *prev, char *buf,
769
807
  LocaleType locale_type, const char *end, bool use_end)
@@ -771,70 +809,45 @@ real_tolower_one(const char **p, const char *prev, char *buf,
771
809
  unichar c = utf_char(prev);
772
810
  int type = s_type(c);
773
811
 
774
- if (locale_type == LOCALE_TURKIC && c == 'I') {
775
- if (utf_char(*p) == COMBINING_DOT_ABOVE) {
776
- /* TODO: don’t we need to make sure we don’t go beyond the end
777
- * of ‘p’? */
778
- *p = utf_next(*p);
779
- return unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
780
- }
812
+ if (locale_type == LOCALE_TURKIC && c == 'I')
813
+ return tolower_turkic_i(p, buf);
781
814
 
782
- return unichar_to_utf(LATIN_SMALL_LETTER_DOTLESS_I, buf);
783
- }
815
+ /* Introduce an explicit dot above when lowercasing capital I’s
816
+ * and J’s whenever there are more accents above.
817
+ * [SpecialCasing.txt] */
818
+ if (locale_type == LOCALE_LITHUANIAN) {
819
+ unichar base = LATIN_SMALL_LETTER_I;
820
+ unichar combiner = '\0';
784
821
 
785
- if (locale_type == LOCALE_LITHUANIAN &&
786
- (c == LATIN_CAPITAL_LETTER_I_WITH_GRAVE ||
787
- c == LATIN_CAPITAL_LETTER_I_WITH_ACUTE ||
788
- c == LATIN_CAPITAL_LETTER_I_WITH_TILDE)) {
789
- /* Introduce an explicit dot above the lowercasing capital I's
790
- * and J's whenever there are more accents above.
791
- * [SpecialCasing.txt] */
792
- size_t len = unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
793
- len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
794
822
  switch (c) {
795
823
  case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
796
- len += unichar_to_utf(COMBINING_GRAVE_ACCENT,
797
- OFFSET_IF(buf, len));
824
+ combiner = COMBINING_GRAVE_ACCENT;
798
825
  break;
799
826
  case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
800
- len += unichar_to_utf(COMBINING_ACUTE_ACCENT,
801
- OFFSET_IF(buf, len));
827
+ combiner = COMBINING_ACUTE_ACCENT;
802
828
  break;
803
829
  case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
804
- len += unichar_to_utf(COMBINING_TILDE,
805
- OFFSET_IF(buf, len));
830
+ combiner = COMBINING_TILDE;
806
831
  break;
807
- }
832
+ case 'I':
833
+ case 'J':
834
+ case LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
835
+ if (!has_more_above(*p))
836
+ goto no_lithuanian_i_casing;
808
837
 
809
- return len;
810
- }
838
+ base = unichar_tolower(c);
839
+ break;
840
+ default:
841
+ goto no_lithuanian_i_casing;
842
+ }
811
843
 
812
- if (locale_type == LOCALE_LITHUANIAN &&
813
- (c == 'I' || c == 'J' || c == LATIN_CAPITAL_LETTER_I_WITH_OGONEK) &&
814
- has_more_above(*p)) {
815
- size_t len = unichar_to_utf(unichar_tolower(c), buf);
816
- return len + unichar_to_utf(COMBINING_DOT_ABOVE,
817
- OFFSET_IF(buf, len));
844
+ return tolower_lithuianian_i(buf, base, combiner);
818
845
  }
819
846
 
820
- if (c == GREEK_CAPITAL_LETTER_SIGMA) {
821
- unichar tv = GREEK_SMALL_LETTER_FINAL_SIGMA;
822
-
823
- if ((!use_end || *p < end) && **p != '\0') {
824
- unichar next_c = utf_char(*p);
825
- int next_type = s_type(next_c);
847
+ no_lithuanian_i_casing:
826
848
 
827
- /* SIGMA maps differently depending on whether it is
828
- * final or not. The following simplified test would
829
- * fail in the case of combining marks following the
830
- * sigma, but I don't think that occurs in real text.
831
- * The test here matches that in ICU. */
832
- if (s_isalpha(next_type))
833
- tv = GREEK_SMALL_LETTER_SIGMA;
834
- }
835
-
836
- return unichar_to_utf(tv, buf);
837
- }
849
+ if (c == GREEK_CAPITAL_LETTER_SIGMA)
850
+ return tolower_sigma(p, buf, end, use_end);
838
851
 
839
852
  if (IS(type, OR(UNICODE_UPPERCASE_LETTER,
840
853
  OR(UNICODE_TITLECASE_LETTER, 0))))
@@ -879,7 +892,7 @@ utf_downcase_impl(const char *str, size_t max, bool use_max)
879
892
  size_t len = real_tolower(str, max, use_max, NULL, locale_type);
880
893
  char *result = ALLOC_N(char, len + 1);
881
894
  real_tolower(str, max, use_max, result, locale_type);
882
- result[len] = NUL;
895
+ result[len] = '\0';
883
896
 
884
897
  return result;
885
898
  }
@@ -915,28 +928,19 @@ utf_downcase_n(const char *str, size_t len)
915
928
  static bool
916
929
  casefold_table_lookup(unichar c, char *folded, size_t *len)
917
930
  {
918
- int begin = 0;
919
- int end = lengthof(casefold_table);
931
+ int index;
920
932
 
921
- if (c < casefold_table[begin].ch || c > casefold_table[end - 1].ch)
933
+ if (!unicode_table_lookup(casefold_table, c, &index))
922
934
  return false;
923
935
 
924
- while (true) {
925
- int mid = (begin + end) / 2;
926
-
927
- if (c == casefold_table[mid].ch) {
928
- if (folded != NULL)
929
- strcpy(folded, casefold_table[mid].data);
930
- *len += utf_byte_length(casefold_table[mid].data);
931
- return true;
932
- } else if (mid == begin) {
933
- return false;
934
- } else if (c > casefold_table[mid].ch) {
935
- begin = mid;
936
- } else {
937
- end = mid;
938
- }
939
- }
936
+ char const *folded_c = casefold_table[index].data;
937
+
938
+ if (folded != NULL)
939
+ strcpy(folded, folded_c);
940
+
941
+ *len += utf_byte_length(folded_c);
942
+
943
+ return true;
940
944
  }
941
945
 
942
946
  static char *
@@ -1037,24 +1041,15 @@ utf_width_n(const char *str, size_t len)
1037
1041
  bool
1038
1042
  unichar_mirror(unichar c, unichar *mirrored)
1039
1043
  {
1040
- int begin = 0;
1041
- int end = lengthof(bidi_mirroring_table);
1044
+ int index;
1042
1045
 
1043
- while (true) {
1044
- int mid = (begin + end) / 2;
1046
+ if (!unicode_table_lookup(bidi_mirroring_table, c, &index))
1047
+ return false;
1045
1048
 
1046
- if (c == bidi_mirroring_table[mid].ch) {
1047
- if (mirrored != NULL)
1048
- *mirrored = bidi_mirroring_table[mid].mirrored_ch;
1049
- return true;
1050
- } else if (mid == begin) {
1051
- return false;
1052
- } else if (c > bidi_mirroring_table[mid].ch) {
1053
- begin = mid;
1054
- } else {
1055
- end = mid;
1056
- }
1057
- }
1049
+ if (mirrored != NULL)
1050
+ *mirrored = bidi_mirroring_table[index].mirrored_ch;
1051
+
1052
+ return true;
1058
1053
  }
1059
1054
 
1060
1055
 
@@ -13,6 +13,7 @@
13
13
  #include <stdint.h>
14
14
  #include "unicode.h"
15
15
  #include "private.h"
16
+ #include "rb_private.h"
16
17
  #include "rb_methods.h"
17
18
 
18
19
  #endif /* RB_INCLUDES_H */
@@ -0,0 +1,52 @@
1
+ /*
2
+ * contents: Private Ruby-related functions.
3
+ *
4
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #ifndef RB_PRIVATE_H
8
+ #define RB_PRIVATE_H
9
+
10
+
11
+ void need_at_least_n_arguments(int argc, int n) HIDDEN;
12
+
13
+ unichar _utf_char_validated(char const *const str,
14
+ char const *const str_end) HIDDEN;
15
+ char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
16
+ const char *limit, bool noisy) HIDDEN;
17
+
18
+ char *_utf_offset_to_pointer_validated(const char *str, long offset,
19
+ const char *end) HIDDEN;
20
+
21
+ char *_utf_offset_to_pointer_failable(const char *str, long offset,
22
+ const char *end) HIDDEN;
23
+
24
+ VALUE rb_utf_new(const char *str, long len) HIDDEN;
25
+
26
+ VALUE rb_utf_new2(const char *str) HIDDEN;
27
+
28
+ VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
29
+
30
+ VALUE rb_utf_alloc_using(char *str) HIDDEN;
31
+
32
+ VALUE rb_utf_dup(VALUE str) HIDDEN;
33
+
34
+ long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
35
+
36
+ bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
37
+ char **limit) HIDDEN;
38
+
39
+ void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
40
+ char **limit) HIDDEN;
41
+
42
+ char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
43
+
44
+ VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
45
+
46
+ char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
47
+
48
+ long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
49
+ long offset, bool reverse) HIDDEN;
50
+
51
+
52
+ #endif /* RB_PRIVATE_H */
@@ -113,7 +113,8 @@ rb_utf_to_inum_num_separator(const char *str, const char *s, bool verify,
113
113
 
114
114
  if (*non_digit != 0)
115
115
  rb_raise(rb_eArgError,
116
- "unexpected ‘%lc’ found at position %ld", c, s - str);
116
+ "unexpected ‘%lc’ found at position %ld",
117
+ c, utf_pointer_to_offset(str, s));
117
118
 
118
119
  *non_digit = c;
119
120
 
@@ -135,7 +136,7 @@ rb_utf_to_inum_digit_value(const char *str, const char *s, unichar c,
135
136
  return false;
136
137
  rb_raise(rb_eArgError,
137
138
  "non-digit character ‘%lc’ found at position %ld",
138
- c, s - str);
139
+ c, utf_pointer_to_offset(str, s));
139
140
  }
140
141
 
141
142
  if (value >= base) {
@@ -144,7 +145,7 @@ rb_utf_to_inum_digit_value(const char *str, const char *s, unichar c,
144
145
 
145
146
  rb_raise(rb_eArgError,
146
147
  "value (%d) greater than base (%d) at position %ld",
147
- value, base, s - str);
148
+ value, base, utf_pointer_to_offset(str, s));
148
149
  }
149
150
 
150
151
  *digit_value = value;
@@ -181,7 +182,7 @@ rb_utf_to_inum_as_fix(const char *str, const char *s, int sign, int base,
181
182
  if (*s != '\0')
182
183
  rb_raise(rb_eArgError,
183
184
  "trailing garbage found at position %ld",
184
- s - str);
185
+ utf_pointer_to_offset(str, s));
185
186
  }
186
187
 
187
188
  if (POSFIXABLE(value)) {
@@ -221,7 +222,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
221
222
  if (verify)
222
223
  rb_raise(rb_eArgError,
223
224
  "extra sign ‘%c’ found at position %ld",
224
- *s, s - str);
225
+ *s, utf_pointer_to_offset(str, s));
225
226
  return INT2FIX(0);
226
227
  }
227
228
 
@@ -245,7 +246,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
245
246
  if (verify && *str == '_')
246
247
  rb_raise(rb_eArgError,
247
248
  "leading digit-separator ‘_’ found at position %ld",
248
- s - str);
249
+ utf_pointer_to_offset(str, s));
249
250
 
250
251
  bit_length = bit_length / BITSPERDIG + 1;
251
252
 
@@ -269,7 +270,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
269
270
 
270
271
  bool more_to_shift = true;
271
272
  while (more_to_shift) {
272
- BDIGIT_DBL num = c;
273
+ BDIGIT_DBL num = digit_value;
273
274
 
274
275
  for (int i = 0; i < big_len; i++) {
275
276
  num += (BDIGIT_DBL)zds[i] * base;
@@ -294,12 +295,12 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
294
295
  if (str + 1 < s && s[-1] == '_')
295
296
  rb_raise(rb_eArgError,
296
297
  "trailing digit-separator ‘_’ found at position %ld",
297
- s - str);
298
+ utf_pointer_to_offset(str, s));
298
299
 
299
300
  if (*s != '\0')
300
301
  rb_raise(rb_eArgError,
301
302
  "trailing garbage found at position %ld",
302
- s - str);
303
+ utf_pointer_to_offset(str, s));
303
304
 
304
305
  return rb_big_norm(z);
305
306
  }
@@ -0,0 +1,38 @@
1
+ /*
2
+ * contents: Functions for dealing with Unicode tables.
3
+ *
4
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #ifndef TABLES_H
8
+ #define TABLES_H
9
+
10
+
11
+ /*
12
+ static inline int
13
+ split_unicode_table_lookup_page(const uint8_t data[][256], int16_t page, unichar c)
14
+ {
15
+ return (page >= UNICODE_MAX_TABLE_INDEX) ?
16
+ page - UNICODE_MAX_TABLE_INDEX :
17
+ data[page][c & 0xff];
18
+ }
19
+
20
+ static inline int
21
+ split_unicode_table_lookup(const uint8_t data[][256], const int16_t part1[], const int16_t part2[], unichar c, int fallback)
22
+ {
23
+ if (c <= UNICODE_LAST_CHAR_PART1)
24
+ return split_unicode_table_lookup_page(data,
25
+ part1[c >> 8],
26
+ c);
27
+
28
+ if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR)
29
+ return split_unicode_table_lookup_page(data,
30
+ part2[(c - UNICODE_FIRST_CHAR_PART2) >> 8],
31
+ c);
32
+
33
+ return fallback;
34
+ }
35
+ */
36
+
37
+
38
+ #endif /* TABLES_H */
@@ -13,6 +13,7 @@
13
13
  #include <limits.h>
14
14
  #include "unicode.h"
15
15
  #include "private.h"
16
+ #include "rb_private.h"
16
17
  #include "rb_methods.h"
17
18
 
18
19
  static VALUE mUTF8Methods;
@@ -85,7 +86,6 @@ _utf_offset_to_pointer_validated_impl(const char *str, long offset,
85
86
  saved_offset);
86
87
  else
87
88
  return NULL;
88
- break;
89
89
  }
90
90
 
91
91
  offset += utf_pointer_to_offset(p, base);
@@ -190,6 +190,9 @@ utf_char(const char *str)
190
190
  unichar
191
191
  utf_char_n(const char *str, size_t max)
192
192
  {
193
+ if (max == 0)
194
+ return UTF_INCOMPLETE_INPUT_UNICHAR;
195
+
193
196
  size_t len;
194
197
  unichar c = (unsigned char)*str;
195
198
 
@@ -454,7 +457,7 @@ utf_collate(const char *a, const char *b)
454
457
 
455
458
  unichar *a_norm = _utf_normalize_wc(a, 0, false, NORMALIZE_ALL_COMPOSE);
456
459
  unichar *b_norm = _utf_normalize_wc(b, 0, false, NORMALIZE_ALL_COMPOSE);
457
- setlocale(LC_COLLATE, "");
460
+
458
461
  int result = wcscoll((wchar_t *)a_norm, (wchar_t *)b_norm);
459
462
 
460
463
  free(a_norm);
@@ -518,7 +521,6 @@ utf_collate_key_impl(const char *str, size_t len, bool use_len)
518
521
  assert(str != NULL);
519
522
 
520
523
  unichar *str_norm = _utf_normalize_wc(str, len, use_len, NORMALIZE_ALL_COMPOSE);
521
- setlocale(LC_COLLATE, "");
522
524
  size_t xfrm_len = wcsxfrm(NULL, (wchar_t *)str_norm, 0);
523
525
  wchar_t result_wc[xfrm_len + 1];
524
526
  wcsxfrm(result_wc, (wchar_t *)str_norm, xfrm_len + 1);
@@ -863,7 +865,7 @@ utf_reverse_impl(const char *str, size_t len, bool use_len)
863
865
  char *result = ALLOC_N(char, len + 1);
864
866
  char *r = result + len;
865
867
  const char *p = str;
866
- while (*p != NUL) {
868
+ while (r > result) {
867
869
  uint8_t skip = s_utf_skip_lengths[*(unsigned char *)p];
868
870
  r -= skip;
869
871
  for (char *m = r; skip > 0; skip--)
data/tests/case.rb ADDED
@@ -0,0 +1,102 @@
1
+ # contents: Tests for String#upcase and String#downcase.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'tests/unicodedatatestbase'
6
+ require 'encoding/character/utf-8'
7
+
8
+ class TC_StringCase < Test::Unit::TestCase
9
+ include UnicodeDataTestBase
10
+
11
+ Code, Name, Category, _, _, _, _, _, _, _, _, _, Upper, Lower, Title = (0..14).to_a
12
+ CasingCode, CasingLower, CasingTitle, CasingUpper, CasingCondition = (0..4).to_a
13
+
14
+ def test_upcase_and_downcase
15
+ # TODO: Do it like this. First read in SpecialCasing.txt and set up lookup
16
+ # tables for all the characters that need special casing. Then, iterate
17
+ # over UnicodeData and simply check that the correct casings are performed,
18
+ # looking up data in the tables for special casing if no simple casing
19
+ # information is available (and skipping when appropriate - such as when
20
+ # there is some condition defined for the special casing).
21
+ special = Struct.new(:conditions, :upper, :lower, :title).new({}, [], [], [])
22
+ open_data_file('SpecialCasing.txt') do |file|
23
+ i = 0
24
+ file.each_line do |line|
25
+ i += 1
26
+ next if line =~ /^(#|\s*$)/
27
+ fields = line.sub(/\s*#.*$/, "").split('; ')
28
+ unless fields.size == 4 or fields.size == 5
29
+ raise "#{line}: Wrong number of fields; #{fields.size} instead of 4 or 5."
30
+ end
31
+ code = fields[CasingCode].hex
32
+ special.conditions[code] = fields[CasingCondition] if fields.size == 5
33
+ special.upper[code] = utfify(fields[CasingUpper])
34
+ special.lower[code] = utfify(fields[CasingLower])
35
+ special.title[code] = utfify(fields[CasingTitle])
36
+ end
37
+ end
38
+
39
+ open_data_file('UnicodeData.txt') do |file|
40
+ i = 0
41
+ prev_code = -1
42
+ file.each_line do |line|
43
+ i += 1
44
+ next if line =~ /^(#|\s*$)/
45
+ fields = line.split(';')
46
+ raise "#{line}: Wrong number of fields; #{fields.size} instead of 15." unless fields.size == 15
47
+ code = fields[Code].hex
48
+ if code > prev_code + 1 and fields[Name] =~ /Last>$/ and fields[Category] =~ /^L[lut]$/
49
+ prev_code.upto(code - 1){ |c| test_one c, fields, special }
50
+ end
51
+ test_one code, fields, special
52
+ prev_code = code
53
+ end
54
+ end
55
+ puts @i
56
+ end
57
+
58
+ private
59
+
60
+ def utfify(codepoints)
61
+ return codepoints if codepoints == ""
62
+ codepoints.split(' ').map{ |cp| cp.hex }.pack('U*')
63
+ end
64
+
65
+ def utfone(codepoint)
66
+ u([codepoint].pack('U*'))
67
+ end
68
+
69
+ def test_one(code, fields, special)
70
+ @i ||= 0
71
+ @i += 1
72
+ case fields[Category]
73
+ when 'Ll'
74
+ test_upcase(code, fields, special)
75
+ when 'Lu'
76
+ test_downcase(code, fields, special)
77
+ when 'Lt'
78
+ test_upcase(code, fields, special)
79
+ test_downcase(code, fields, special)
80
+ end
81
+ end
82
+
83
+ def test_upcase(code, fields, special)
84
+ if special.upper[code]
85
+ if not special.conditions[code]
86
+ assert_equal(special.upper[code], utfone(code).upcase)
87
+ end
88
+ elsif not fields[Upper].empty?
89
+ assert_equal(utfify(fields[Upper]), utfone(code).upcase)
90
+ end
91
+ end
92
+
93
+ def test_downcase(code, fields, special)
94
+ if special.lower[code]
95
+ if not special.conditions[code]
96
+ assert_equal(special.lower[code], utfone(code).downcase)
97
+ end
98
+ elsif not fields[Lower].empty?
99
+ assert_equal(utfify(fields[Lower]), utfone(code).downcase)
100
+ end
101
+ end
102
+ end
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.0
2
+ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: character-encodings
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.0
7
- date: 2006-07-27 00:00:00 +02:00
6
+ version: 0.3.0
7
+ date: 2007-11-22 00:00:00 +01:00
8
8
  summary: A pluggable character-encoding library
9
9
  require_paths:
10
10
  - lib
@@ -48,6 +48,7 @@ files:
48
48
  - ext/encoding/character/unicode/codepoint.c
49
49
  - ext/encoding/character/utf-8/break.c
50
50
  - ext/encoding/character/utf-8/decompose.c
51
+ - ext/encoding/character/utf-8/private.c
51
52
  - ext/encoding/character/utf-8/properties.c
52
53
  - ext/encoding/character/utf-8/rb_utf_aref.c
53
54
  - ext/encoding/character/utf-8/rb_utf_aset.c
@@ -80,19 +81,22 @@ files:
80
81
  - ext/encoding/character/utf-8/unicode.c
81
82
  - ext/encoding/character/utf-8/utf.c
82
83
  - ext/encoding/character/utf-8/rb_utf_internal_bignum.c
84
+ - ext/encoding/character/utf-8/data/break.h
85
+ - ext/encoding/character/utf-8/data/character-tables.h
86
+ - ext/encoding/character/utf-8/data/compose.h
87
+ - ext/encoding/character/utf-8/data/decompose.h
83
88
  - ext/encoding/character/utf-8/private.h
84
89
  - ext/encoding/character/utf-8/rb_includes.h
85
90
  - ext/encoding/character/utf-8/rb_methods.h
91
+ - ext/encoding/character/utf-8/rb_private.h
86
92
  - ext/encoding/character/utf-8/rb_utf_internal_tr.h
93
+ - ext/encoding/character/utf-8/tables.h
87
94
  - ext/encoding/character/utf-8/unicode.h
88
95
  - ext/encoding/character/utf-8/rb_utf_internal_bignum.h
89
- - ext/encoding/character/utf-8/data/break.h
90
- - ext/encoding/character/utf-8/data/character-tables.h
91
- - ext/encoding/character/utf-8/data/compose.h
92
- - ext/encoding/character/utf-8/data/decompose.h
93
- - ext/encoding/character/utf-8/extconf.rb
94
96
  - ext/encoding/character/utf-8/data/generate-unicode-data.rb
97
+ - ext/encoding/character/utf-8/extconf.rb
95
98
  - ext/encoding/character/utf-8/depend
99
+ - tests/case.rb
96
100
  - tests/foldcase.rb
97
101
  - tests/normalize.rb
98
102
  - tests/unicodedatatestbase.rb