character-encodings 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +2 -2
- data/ext/encoding/character/utf-8/break.c +6 -19
- data/ext/encoding/character/utf-8/data/character-tables.h +2 -0
- data/ext/encoding/character/utf-8/data/decompose.h +1 -0
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +5 -0
- data/ext/encoding/character/utf-8/decompose.c +77 -109
- data/ext/encoding/character/utf-8/depend +1 -0
- data/ext/encoding/character/utf-8/extconf.rb +2 -0
- data/ext/encoding/character/utf-8/private.c +62 -0
- data/ext/encoding/character/utf-8/private.h +18 -38
- data/ext/encoding/character/utf-8/properties.c +90 -95
- data/ext/encoding/character/utf-8/rb_includes.h +1 -0
- data/ext/encoding/character/utf-8/rb_private.h +52 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +10 -9
- data/ext/encoding/character/utf-8/tables.h +38 -0
- data/ext/encoding/character/utf-8/unicode.c +1 -1
- data/ext/encoding/character/utf-8/utf.c +5 -3
- data/tests/case.rb +102 -0
- metadata +12 -8
data/Rakefile
CHANGED
@@ -10,7 +10,7 @@ require 'rake/testtask'
|
|
10
10
|
require 'spec/rake/spectask'
|
11
11
|
|
12
12
|
PackageName = 'character-encodings'
|
13
|
-
PackageVersion = '0.
|
13
|
+
PackageVersion = '0.3.0'
|
14
14
|
|
15
15
|
desc 'Default task'
|
16
16
|
task :default => [:extensions]
|
@@ -92,7 +92,7 @@ Spec::Rake::SpecTask.new do |t|
|
|
92
92
|
end
|
93
93
|
|
94
94
|
Tests = [
|
95
|
-
['tests/foldcase.rb'],
|
95
|
+
['tests/foldcase.rb', 'tests/case.rb'],
|
96
96
|
['tests/normalize.rb']
|
97
97
|
]
|
98
98
|
|
@@ -9,30 +9,17 @@
|
|
9
9
|
#include <stdint.h>
|
10
10
|
#include "unicode.h"
|
11
11
|
#include "data/break.h"
|
12
|
-
|
12
|
+
#include "private.h"
|
13
13
|
|
14
14
|
/* Figure out what break type the Unicode character ‘c’ possesses, if any.
|
15
15
|
* This information is used for finding word and line boundaries, which is
|
16
16
|
* useful when displaying Unicode text on screen. */
|
17
|
-
static UnicodeBreakType
|
18
|
-
break_type(const int16_t table[], unsigned int page, unichar c)
|
19
|
-
{
|
20
|
-
int16_t break_property = table[page];
|
21
|
-
|
22
|
-
return (break_property >= UNICODE_MAX_TABLE_INDEX) ?
|
23
|
-
break_property - UNICODE_MAX_TABLE_INDEX :
|
24
|
-
break_property_data[break_property][c & 0xff];
|
25
|
-
}
|
26
|
-
|
27
17
|
UnicodeBreakType
|
28
18
|
unichar_break_type(unichar c)
|
29
19
|
{
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
(c - 0xe0000) >> 8, c);
|
36
|
-
|
37
|
-
return UNICODE_BREAK_UNKNOWN;
|
20
|
+
return SPLIT_UNICODE_TABLE_LOOKUP(break_property_data,
|
21
|
+
break_property_table_part1,
|
22
|
+
break_property_table_part2,
|
23
|
+
c,
|
24
|
+
UNICODE_BREAK_UNKNOWN);
|
38
25
|
}
|
@@ -534,6 +534,8 @@ private
|
|
534
534
|
|
535
535
|
#define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
|
536
536
|
|
537
|
+
#define UNICODE_FIRST_CHAR_PART2 0xe0000
|
538
|
+
|
537
539
|
#define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
|
538
540
|
EOF
|
539
541
|
print_table(data, 0, @last_char_part1_i, data.last, 1,
|
@@ -694,8 +696,11 @@ EOF
|
|
694
696
|
#define UNICODE_MAX_TABLE_INDEX (0x110000 / 256)
|
695
697
|
|
696
698
|
#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}
|
699
|
+
|
697
700
|
#define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
|
698
701
|
|
702
|
+
#define UNICODE_FIRST_CHAR_PART2 0xe0000
|
703
|
+
|
699
704
|
#define UNICODE_NOT_PRESENT_OFFSET #{NOT_PRESENT_OFFSET}
|
700
705
|
EOF
|
701
706
|
print_table(data, 0, @last_char_part1_i, data.last, 1,
|
@@ -8,35 +8,16 @@
|
|
8
8
|
#include <stdbool.h>
|
9
9
|
#include <stddef.h>
|
10
10
|
#include <stdint.h>
|
11
|
+
|
11
12
|
#include "unicode.h"
|
12
|
-
|
13
|
+
|
13
14
|
#include "data/decompose.h"
|
14
15
|
#include "data/compose.h"
|
15
16
|
|
17
|
+
#include "private.h"
|
16
18
|
|
17
|
-
|
18
|
-
|
19
|
-
* character.
|
20
|
-
*
|
21
|
-
* TODO: Turn these macros into full-fledged functions, as this is rather silly
|
22
|
-
* when we have ‹inline› in C99.
|
23
|
-
*/
|
24
|
-
#define CC_PART1(page, char) \
|
25
|
-
((combining_class_table_part1[page] >= UNICODE_MAX_TABLE_INDEX) \
|
26
|
-
? (combining_class_table_part1[page] - UNICODE_MAX_TABLE_INDEX) \
|
27
|
-
: (cclass_data[combining_class_table_part1[page]][char]))
|
28
|
-
|
29
|
-
#define CC_PART2(page, char) \
|
30
|
-
((combining_class_table_part2[page] >= UNICODE_MAX_TABLE_INDEX) \
|
31
|
-
? (combining_class_table_part2[page] - UNICODE_MAX_TABLE_INDEX) \
|
32
|
-
: (cclass_data[combining_class_table_part2[page]][char]))
|
33
|
-
|
34
|
-
#define COMBINING_CLASS(char) \
|
35
|
-
(((char) <= UNICODE_LAST_CHAR_PART1) \
|
36
|
-
? CC_PART1((char) >> 8, (char) & 0xff) \
|
37
|
-
: (((char) >= 0xe0000 && (char) <= UNICODE_LAST_CHAR) \
|
38
|
-
? CC_PART2(((char) - 0xe0000) >> 8, (char) & 0xff) \
|
39
|
-
: 0))
|
19
|
+
#define COMBINING_CLASS(c) \
|
20
|
+
SPLIT_UNICODE_TABLE_LOOKUP(cclass_data, combining_class_table_part1, combining_class_table_part2, (c), 0)
|
40
21
|
|
41
22
|
|
42
23
|
/* {{{1
|
@@ -115,12 +96,11 @@ unicode_canonical_ordering(unichar *str, size_t len)
|
|
115
96
|
/* {{{1
|
116
97
|
* Decompose the character ‘s’ according to the rules outlined in
|
117
98
|
* http://www.unicode.org/unicode/reports/tr15/#Hangul. ‘r’ should be ‹NULL›
|
118
|
-
* or of sufficient length to store the decomposition of ‘s’.
|
119
|
-
* characters stored (or would be if it were non
|
120
|
-
* ‘r_len’.
|
99
|
+
* or of sufficient length to store the decomposition of ‘s’. Returns the
|
100
|
+
* number of characters stored (or would be if it were non-NULL) in R.
|
121
101
|
*/
|
122
|
-
static
|
123
|
-
decompose_hangul(unichar s, unichar *r
|
102
|
+
static size_t
|
103
|
+
decompose_hangul(unichar s, unichar *r)
|
124
104
|
{
|
125
105
|
int SIndex = s - SBase;
|
126
106
|
|
@@ -128,8 +108,7 @@ decompose_hangul(unichar s, unichar *r, size_t *r_len)
|
|
128
108
|
if (SIndex < 0 || SIndex >= SCount) {
|
129
109
|
if (r != NULL)
|
130
110
|
r[0] = s;
|
131
|
-
|
132
|
-
return;
|
111
|
+
return 1;
|
133
112
|
}
|
134
113
|
|
135
114
|
unichar L = LBase + SIndex / NCount;
|
@@ -141,13 +120,13 @@ decompose_hangul(unichar s, unichar *r, size_t *r_len)
|
|
141
120
|
r[1] = V;
|
142
121
|
}
|
143
122
|
|
144
|
-
if (T
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
123
|
+
if (T == TBase)
|
124
|
+
return 2;
|
125
|
+
|
126
|
+
if (r != NULL)
|
127
|
+
r[2] = T;
|
128
|
+
|
129
|
+
return 3;
|
151
130
|
}
|
152
131
|
|
153
132
|
|
@@ -179,26 +158,27 @@ get_decomposition(int index, bool compat)
|
|
179
158
|
static const char *
|
180
159
|
find_decomposition(unichar c, bool compat)
|
181
160
|
{
|
182
|
-
int
|
183
|
-
int end = lengthof(decomp_table);
|
161
|
+
int index;
|
184
162
|
|
185
|
-
if (
|
163
|
+
if (!unicode_table_lookup(decomp_table, c, &index))
|
186
164
|
return NULL;
|
187
165
|
|
188
|
-
|
189
|
-
|
166
|
+
return get_decomposition(index, compat);
|
167
|
+
}
|
190
168
|
|
191
|
-
if (c == decomp_table[middle].ch)
|
192
|
-
return get_decomposition(middle, compat);
|
193
|
-
else if (middle == begin)
|
194
|
-
break;
|
195
|
-
else if (c > decomp_table[middle].ch)
|
196
|
-
begin = middle;
|
197
|
-
else
|
198
|
-
end = middle;
|
199
|
-
}
|
200
169
|
|
201
|
-
|
170
|
+
/* {{{1
|
171
|
+
* Copy over the UTF-8 decomposition in ‘decomposition’ to the unichar buffer
|
172
|
+
* ‘chars’. Return the number of unichars in ‘chars’.
|
173
|
+
*/
|
174
|
+
static size_t
|
175
|
+
decomposition_to_wc(const char *decomposition, unichar *chars)
|
176
|
+
{
|
177
|
+
size_t i = 0;
|
178
|
+
for (const char *p = decomposition; *p != '\0'; p = utf_next(p))
|
179
|
+
chars[i++] = utf_char(p);
|
180
|
+
|
181
|
+
return i;
|
202
182
|
}
|
203
183
|
|
204
184
|
|
@@ -215,17 +195,13 @@ unicode_canonical_decomposition(unichar c, size_t *len)
|
|
215
195
|
|
216
196
|
/* Hangul syllable */
|
217
197
|
if (c >= SBase && c <= SLast) {
|
218
|
-
decompose_hangul(c, NULL
|
198
|
+
*len = decompose_hangul(c, NULL);
|
219
199
|
r = ALLOC_N(unichar, *len);
|
220
|
-
decompose_hangul(c, r
|
200
|
+
decompose_hangul(c, r);
|
221
201
|
} else if ((decomp = find_decomposition(c, false)) != NULL) {
|
222
202
|
*len = utf_length(decomp);
|
223
203
|
r = ALLOC_N(unichar, *len);
|
224
|
-
|
225
|
-
int i;
|
226
|
-
const char *p;
|
227
|
-
for (p = decomp, i = 0; *p != NUL; p = utf_next(p), i++)
|
228
|
-
r[i] = utf_char(p);
|
204
|
+
decomposition_to_wc(decomp, r);
|
229
205
|
} else {
|
230
206
|
r = ALLOC(unichar);
|
231
207
|
*r = c;
|
@@ -281,23 +257,19 @@ compose_index(unichar c)
|
|
281
257
|
if (page > COMPOSE_TABLE_LAST)
|
282
258
|
return 0;
|
283
259
|
|
284
|
-
|
285
|
-
int16_t compose_offset = compose_table[page];
|
286
|
-
return (compose_offset >= UNICODE_MAX_TABLE_INDEX) ?
|
287
|
-
compose_offset - UNICODE_MAX_TABLE_INDEX :
|
288
|
-
compose_data[compose_offset][c & 0xff];
|
260
|
+
return SPLIT_UNICODE_TABLE_LOOKUP_PAGE(compose_data, compose_table, page, c);
|
289
261
|
}
|
290
262
|
|
291
263
|
static bool
|
292
264
|
lookup_compose(const uint16_t table[][2], uint16_t index, unichar c,
|
293
265
|
unichar *result)
|
294
266
|
{
|
295
|
-
if (c
|
296
|
-
|
297
|
-
return true;
|
298
|
-
}
|
267
|
+
if (c != table[index][0])
|
268
|
+
return false;
|
299
269
|
|
300
|
-
|
270
|
+
*result = table[index][1];
|
271
|
+
|
272
|
+
return true;
|
301
273
|
}
|
302
274
|
|
303
275
|
static bool
|
@@ -307,21 +279,18 @@ combine(unichar a, unichar b, unichar *result)
|
|
307
279
|
return true;
|
308
280
|
|
309
281
|
uint16_t index_a = compose_index(a);
|
310
|
-
if (index_a >= COMPOSE_FIRST_SINGLE_START &&
|
311
|
-
index_a < COMPOSE_SECOND_START) {
|
282
|
+
if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
|
312
283
|
return lookup_compose(compose_first_single,
|
313
284
|
index_a - COMPOSE_FIRST_SINGLE_START,
|
314
285
|
b,
|
315
286
|
result);
|
316
|
-
}
|
317
287
|
|
318
288
|
uint16_t index_b = compose_index(b);
|
319
|
-
if (index_b >= COMPOSE_SECOND_SINGLE_START)
|
289
|
+
if (index_b >= COMPOSE_SECOND_SINGLE_START)
|
320
290
|
return lookup_compose(compose_second_single,
|
321
291
|
index_b - COMPOSE_SECOND_SINGLE_START,
|
322
292
|
a,
|
323
293
|
result);
|
324
|
-
}
|
325
294
|
|
326
295
|
if (index_a >= COMPOSE_FIRST_START &&
|
327
296
|
index_a < COMPOSE_FIRST_SINGLE_START &&
|
@@ -356,12 +325,8 @@ normalize_wc_decompose_one(unichar c, NormalizeMode mode, unichar *buf)
|
|
356
325
|
return 1;
|
357
326
|
}
|
358
327
|
|
359
|
-
if (buf != NULL)
|
360
|
-
|
361
|
-
for (i = 0; *decomp != NUL; decomp = utf_next(decomp), i++)
|
362
|
-
buf[i] = utf_char(decomp);
|
363
|
-
return i;
|
364
|
-
}
|
328
|
+
if (buf != NULL)
|
329
|
+
return decomposition_to_wc(decomp, buf);
|
365
330
|
|
366
331
|
return utf_length(decomp);
|
367
332
|
}
|
@@ -378,14 +343,10 @@ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
|
|
378
343
|
size_t prev_n = n;
|
379
344
|
|
380
345
|
unichar *base = (buf != NULL) ? buf + n : NULL;
|
381
|
-
if (c >= SBase && c <= SLast)
|
382
|
-
|
383
|
-
|
384
|
-
decompose_hangul(c, base, &len);
|
385
|
-
n += len;
|
386
|
-
} else {
|
346
|
+
if (c >= SBase && c <= SLast)
|
347
|
+
n += decompose_hangul(c, base);
|
348
|
+
else
|
387
349
|
n += normalize_wc_decompose_one(c, mode, base);
|
388
|
-
}
|
389
350
|
|
390
351
|
if (buf != NULL && n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
|
391
352
|
unicode_canonical_ordering(buf + prev_start,
|
@@ -403,44 +364,51 @@ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
|
|
403
364
|
*buf_len = n;
|
404
365
|
}
|
405
366
|
|
406
|
-
unichar *
|
407
|
-
|
367
|
+
static unichar *
|
368
|
+
normalize_wc_compose(unichar *buf, size_t len)
|
408
369
|
{
|
409
|
-
|
410
|
-
normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
|
411
|
-
unichar *buf = ALLOC_N(unichar, n + 1);
|
412
|
-
normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
|
413
|
-
|
414
|
-
/* Just return if we don’t want composition. */
|
415
|
-
if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
|
416
|
-
return buf;
|
417
|
-
|
370
|
+
int new_len = len;
|
418
371
|
size_t prev_start = 0;
|
419
372
|
int prev_cc = 0;
|
420
|
-
|
373
|
+
|
374
|
+
for (size_t i = 0; i < len; i++) {
|
421
375
|
int cc = COMBINING_CLASS(buf[i]);
|
376
|
+
size_t j = i - (len - new_len);
|
422
377
|
|
423
|
-
if (
|
378
|
+
if (j > 0 && (prev_cc == 0 || prev_cc < cc) &&
|
424
379
|
combine(buf[prev_start], buf[i], &buf[prev_start])) {
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
n--;
|
429
|
-
i--;
|
430
|
-
prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]);
|
380
|
+
new_len--;
|
381
|
+
prev_cc = (j + 1 == prev_start) ?
|
382
|
+
0 : COMBINING_CLASS(buf[j - 1]);
|
431
383
|
} else {
|
432
384
|
if (cc == 0)
|
433
|
-
prev_start =
|
385
|
+
prev_start = j;
|
434
386
|
|
387
|
+
buf[j] = buf[i];
|
435
388
|
prev_cc = cc;
|
436
389
|
}
|
437
390
|
}
|
438
391
|
|
439
|
-
buf[
|
392
|
+
buf[new_len] = NUL;
|
440
393
|
|
441
394
|
return buf;
|
442
395
|
}
|
443
396
|
|
397
|
+
unichar *
|
398
|
+
_utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
|
399
|
+
{
|
400
|
+
size_t n;
|
401
|
+
normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
|
402
|
+
unichar *buf = ALLOC_N(unichar, n + 1);
|
403
|
+
normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
|
404
|
+
|
405
|
+
/* Just return if we don’t want composition. */
|
406
|
+
if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
|
407
|
+
return buf;
|
408
|
+
|
409
|
+
return normalize_wc_compose(buf, n);
|
410
|
+
}
|
411
|
+
|
444
412
|
|
445
413
|
/* {{{1
|
446
414
|
* Normalize (compose/decompose) characters in ‘str’ so that strings that
|
@@ -1,6 +1,7 @@
|
|
1
1
|
break.o: break.c unicode.h data/break.h
|
2
2
|
decompose.o: decompose.c unicode.h private.h data/decompose.h \
|
3
3
|
data/compose.h
|
4
|
+
private.o: private.c private.h
|
4
5
|
properties.o: properties.c unicode.h private.h data/character-tables.h
|
5
6
|
rb_utf_aref.o: rb_utf_aref.c rb_includes.h unicode.h private.h \
|
6
7
|
rb_methods.h
|
@@ -12,6 +12,7 @@ def try_compiler_option(opt, &b)
|
|
12
12
|
end
|
13
13
|
|
14
14
|
try_compiler_option('-std=c99')
|
15
|
+
try_compiler_option('-finline-functions')
|
15
16
|
try_compiler_option('-Wall')
|
16
17
|
try_compiler_option('-Wextra')
|
17
18
|
try_compiler_option('-Wwrite-strings')
|
@@ -23,6 +24,7 @@ try_compiler_option('-Wundef')
|
|
23
24
|
try_compiler_option('-Wpointer-arith')
|
24
25
|
try_compiler_option('-Wcast-align')
|
25
26
|
try_compiler_option('-Werror')
|
27
|
+
try_compiler_option('-Winline')
|
26
28
|
# XXX: sadly, -Wshadow is a bit too strict. It will, for example, whine about
|
27
29
|
# local variables called “index” on FreeBSD.
|
28
30
|
# try_compiler_option('-Wshadow')
|
@@ -0,0 +1,62 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Private functions used by the UTF-8 character-encoding library.
|
3
|
+
*
|
4
|
+
* Copyright © 2007 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include <ruby.h>
|
8
|
+
#include <stdbool.h>
|
9
|
+
#include <stddef.h>
|
10
|
+
#include <stdint.h>
|
11
|
+
#include <stdlib.h>
|
12
|
+
|
13
|
+
#include "unicode.h"
|
14
|
+
|
15
|
+
#include "private.h"
|
16
|
+
|
17
|
+
/* Lookup C in the sorted TABLE using binary search. TABLE consists of N
|
18
|
+
* entries, where each entry is SIZEOF_ENTRY bytes in size and the first
|
19
|
+
* component is a unichar of size SIZEOF_CHAR. If C is found in TABLE, its
|
20
|
+
* index is stored in INDEX and true is returned. Otherwise, false is returned
|
21
|
+
* and INDEX is left untouched. */
|
22
|
+
bool
|
23
|
+
binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index)
|
24
|
+
{
|
25
|
+
#define ENTRY(index) ((unichar)(*(unichar *)((const char *)table + ((index) * sizeof_entry))) & char_mask)
|
26
|
+
|
27
|
+
int begin = 0;
|
28
|
+
int end = n - 1;
|
29
|
+
int middle;
|
30
|
+
|
31
|
+
/* This is ugly, but not all tables use unichars as their lookup
|
32
|
+
* character. The casefold table, for example, uses uint16_t-sized
|
33
|
+
* characters. To only get the interesting part of our table entry
|
34
|
+
* we’ll have to mask the retrieved value. */
|
35
|
+
int char_mask = (1 << (8 * sizeof_char)) - 1;
|
36
|
+
|
37
|
+
/* Drop out early if we know for certain that C can’t be in the
|
38
|
+
* TABLE. */
|
39
|
+
if (c < ENTRY(0) || c > ENTRY(end))
|
40
|
+
return false;
|
41
|
+
|
42
|
+
while (begin <= end) {
|
43
|
+
middle = binary_search_middle_of(begin, end);
|
44
|
+
|
45
|
+
unichar probe = ENTRY(middle);
|
46
|
+
if (c < probe)
|
47
|
+
end = middle - 1;
|
48
|
+
else if (c > probe)
|
49
|
+
begin = middle + 1;
|
50
|
+
else
|
51
|
+
break;
|
52
|
+
}
|
53
|
+
|
54
|
+
if (begin > end)
|
55
|
+
return false;
|
56
|
+
|
57
|
+
*index = middle;
|
58
|
+
|
59
|
+
return true;
|
60
|
+
|
61
|
+
#undef ENTRY
|
62
|
+
}
|
@@ -21,48 +21,28 @@
|
|
21
21
|
# define HIDDEN(u)
|
22
22
|
#endif
|
23
23
|
|
24
|
-
|
25
|
-
|
26
|
-
inline int _unichar_combining_class(unichar c) HIDDEN;
|
27
|
-
|
28
|
-
void need_at_least_n_arguments(int argc, int n) HIDDEN;
|
29
|
-
|
30
|
-
unichar _utf_char_validated(char const *const str,
|
31
|
-
char const *const str_end) HIDDEN;
|
32
|
-
char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
|
33
|
-
const char *limit, bool noisy) HIDDEN;
|
34
|
-
|
35
|
-
char *_utf_offset_to_pointer_validated(const char *str, long offset,
|
36
|
-
const char *end) HIDDEN;
|
37
|
-
|
38
|
-
char *_utf_offset_to_pointer_failable(const char *str, long offset,
|
39
|
-
const char *end) HIDDEN;
|
40
|
-
|
41
|
-
VALUE rb_utf_new(const char *str, long len) HIDDEN;
|
42
|
-
|
43
|
-
VALUE rb_utf_new2(const char *str) HIDDEN;
|
24
|
+
#define binary_search_middle_of(begin, end) \
|
25
|
+
(((unsigned)((begin) + (end))) >> 1)
|
44
26
|
|
45
|
-
|
27
|
+
#define unicode_table_lookup(table, c, index) \
|
28
|
+
binary_search_unicode_table(table, lengthof(table), sizeof((table)[0]), sizeof((table)[0].ch), c, index)
|
46
29
|
|
47
|
-
|
30
|
+
bool binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index) HIDDEN;
|
48
31
|
|
49
|
-
|
32
|
+
#define SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part, page, c) \
|
33
|
+
((part[page] >= UNICODE_MAX_TABLE_INDEX) \
|
34
|
+
? (part[page] - UNICODE_MAX_TABLE_INDEX) \
|
35
|
+
: (data[part[page]][(c) & 0xff]))
|
50
36
|
|
51
|
-
|
37
|
+
#define SPLIT_UNICODE_TABLE_LOOKUP(data, part1, part2, c, fallback) \
|
38
|
+
(((c) <= UNICODE_LAST_CHAR_PART1) \
|
39
|
+
? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part1, (c) >> 8, c) \
|
40
|
+
: (((c) >= UNICODE_FIRST_CHAR_PART2 && (c) <= UNICODE_LAST_CHAR) \
|
41
|
+
? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part2, ((c) - UNICODE_FIRST_CHAR_PART2) >> 8, c) \
|
42
|
+
: (fallback)))
|
52
43
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
|
57
|
-
char **limit) HIDDEN;
|
58
|
-
|
59
|
-
char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
|
60
|
-
|
61
|
-
VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
|
62
|
-
|
63
|
-
char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
|
64
|
-
|
65
|
-
long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
|
66
|
-
long offset, bool reverse) HIDDEN;
|
44
|
+
unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
|
45
|
+
NormalizeMode mode) HIDDEN;
|
46
|
+
int _unichar_combining_class(unichar c) HIDDEN;
|
67
47
|
|
68
48
|
#endif /* PRIVATE_H */
|
@@ -63,8 +63,8 @@ s_type(unichar c)
|
|
63
63
|
if (c <= UNICODE_LAST_CHAR_PART1) {
|
64
64
|
page = c >> 8;
|
65
65
|
table = type_table_part1;
|
66
|
-
} else if (c >=
|
67
|
-
page = (c -
|
66
|
+
} else if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR) {
|
67
|
+
page = (c - UNICODE_FIRST_CHAR_PART2) >> 8;
|
68
68
|
table = type_table_part2;
|
69
69
|
} else {
|
70
70
|
return UNICODE_UNASSIGNED;
|
@@ -364,8 +364,8 @@ special_case_table_lookup(unichar c)
|
|
364
364
|
unichar tv = ATTTABLE(c >> 8, c & 0xff);
|
365
365
|
|
366
366
|
if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
|
367
|
-
|
368
|
-
|
367
|
+
tv = utf_char(special_case_table +
|
368
|
+
tv - UNICODE_SPECIAL_CASE_TABLE_START);
|
369
369
|
|
370
370
|
if (tv == '\0')
|
371
371
|
return c;
|
@@ -429,7 +429,7 @@ unichar_totitle(unichar c)
|
|
429
429
|
return title_table[i][0];
|
430
430
|
|
431
431
|
if (s_type(c) == UNICODE_LOWERCASE_LETTER)
|
432
|
-
return
|
432
|
+
return unichar_toupper(c);
|
433
433
|
|
434
434
|
return c;
|
435
435
|
}
|
@@ -585,8 +585,7 @@ real_toupper_lithuanian(const char **p, unichar c, int type, char *buf,
|
|
585
585
|
|
586
586
|
if (*was_i) {
|
587
587
|
size_t len = remove_all_combining_dot_above(c, buf);
|
588
|
-
return len + output_marks(p, (buf
|
589
|
-
true);
|
588
|
+
return len + output_marks(p, OFFSET_IF(buf, len), true);
|
590
589
|
}
|
591
590
|
|
592
591
|
if (!s_ismark(type))
|
@@ -761,9 +760,48 @@ real_do_tolower(unichar c, int type, char *buf)
|
|
761
760
|
|
762
761
|
/* {{{1
|
763
762
|
* The real implementation of downcase.
|
764
|
-
*
|
765
|
-
* TODO: this needs a cleanup.
|
766
763
|
*/
|
764
|
+
static size_t
|
765
|
+
tolower_turkic_i(const char **p, char *buf)
|
766
|
+
{
|
767
|
+
unichar i = LATIN_SMALL_LETTER_DOTLESS_I;
|
768
|
+
|
769
|
+
if (utf_char(*p) == COMBINING_DOT_ABOVE) {
|
770
|
+
/* TODO: don’t we need to make sure we don’t go beyond the end
|
771
|
+
* of ‘p’? */
|
772
|
+
*p = utf_next(*p);
|
773
|
+
i = LATIN_SMALL_LETTER_I;
|
774
|
+
}
|
775
|
+
|
776
|
+
return unichar_to_utf(i, buf);
|
777
|
+
}
|
778
|
+
|
779
|
+
static size_t
|
780
|
+
tolower_lithuianian_i(char *buf, unichar base, unichar combiner)
|
781
|
+
{
|
782
|
+
size_t len = unichar_to_utf(base, buf);
|
783
|
+
len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
|
784
|
+
if (combiner != '\0')
|
785
|
+
len += unichar_to_utf(combiner, OFFSET_IF(buf, len));
|
786
|
+
|
787
|
+
return len;
|
788
|
+
}
|
789
|
+
|
790
|
+
static size_t
|
791
|
+
tolower_sigma(const char **p, char *buf, const char *end, bool use_end)
|
792
|
+
{
|
793
|
+
unichar sigma = GREEK_SMALL_LETTER_FINAL_SIGMA;
|
794
|
+
|
795
|
+
/* SIGMA maps differently depending on whether it is final or not. The
|
796
|
+
* following simplified test would fail in the case of combining marks
|
797
|
+
* following the sigma, but I don't think that occurs in real text.
|
798
|
+
* The test here matches that in ICU. */
|
799
|
+
if ((!use_end || *p < end) && **p != '\0' && s_isalpha(s_type(utf_char(*p))))
|
800
|
+
sigma = GREEK_SMALL_LETTER_SIGMA;
|
801
|
+
|
802
|
+
return unichar_to_utf(sigma, buf);
|
803
|
+
}
|
804
|
+
|
767
805
|
static size_t
|
768
806
|
real_tolower_one(const char **p, const char *prev, char *buf,
|
769
807
|
LocaleType locale_type, const char *end, bool use_end)
|
@@ -771,70 +809,45 @@ real_tolower_one(const char **p, const char *prev, char *buf,
|
|
771
809
|
unichar c = utf_char(prev);
|
772
810
|
int type = s_type(c);
|
773
811
|
|
774
|
-
if (locale_type == LOCALE_TURKIC && c == 'I')
|
775
|
-
|
776
|
-
/* TODO: don’t we need to make sure we don’t go beyond the end
|
777
|
-
* of ‘p’? */
|
778
|
-
*p = utf_next(*p);
|
779
|
-
return unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
|
780
|
-
}
|
812
|
+
if (locale_type == LOCALE_TURKIC && c == 'I')
|
813
|
+
return tolower_turkic_i(p, buf);
|
781
814
|
|
782
|
-
|
783
|
-
|
815
|
+
/* Introduce an explicit dot above the lowercasing capital I’s
|
816
|
+
* and J’s whenever there are more accents above.
|
817
|
+
* [SpecialCasing.txt] */
|
818
|
+
if (locale_type == LOCALE_LITHUANIAN) {
|
819
|
+
unichar base = LATIN_SMALL_LETTER_I;
|
820
|
+
unichar combiner = '\0';
|
784
821
|
|
785
|
-
if (locale_type == LOCALE_LITHUANIAN &&
|
786
|
-
(c == LATIN_CAPITAL_LETTER_I_WITH_GRAVE ||
|
787
|
-
c == LATIN_CAPITAL_LETTER_I_WITH_ACUTE ||
|
788
|
-
c == LATIN_CAPITAL_LETTER_I_WITH_TILDE)) {
|
789
|
-
/* Introduce an explicit dot above the lowercasing capital I's
|
790
|
-
* and J's whenever there are more accents above.
|
791
|
-
* [SpecialCasing.txt] */
|
792
|
-
size_t len = unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
|
793
|
-
len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
|
794
822
|
switch (c) {
|
795
823
|
case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
|
796
|
-
|
797
|
-
OFFSET_IF(buf, len));
|
824
|
+
combiner = COMBINING_GRAVE_ACCENT;
|
798
825
|
break;
|
799
826
|
case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
|
800
|
-
|
801
|
-
OFFSET_IF(buf, len));
|
827
|
+
combiner = COMBINING_ACUTE_ACCENT;
|
802
828
|
break;
|
803
829
|
case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
|
804
|
-
|
805
|
-
OFFSET_IF(buf, len));
|
830
|
+
combiner = COMBINING_TILDE;
|
806
831
|
break;
|
807
|
-
|
832
|
+
case 'I':
|
833
|
+
case 'J':
|
834
|
+
case LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
|
835
|
+
if (!has_more_above(*p))
|
836
|
+
goto no_lithuanian_i_casing;
|
808
837
|
|
809
|
-
|
810
|
-
|
838
|
+
base = unichar_tolower(c);
|
839
|
+
break;
|
840
|
+
default:
|
841
|
+
goto no_lithuanian_i_casing;
|
842
|
+
}
|
811
843
|
|
812
|
-
|
813
|
-
(c == 'I' || c == 'J' || c == LATIN_CAPITAL_LETTER_I_WITH_OGONEK) &&
|
814
|
-
has_more_above(*p)) {
|
815
|
-
size_t len = unichar_to_utf(unichar_tolower(c), buf);
|
816
|
-
return len + unichar_to_utf(COMBINING_DOT_ABOVE,
|
817
|
-
OFFSET_IF(buf, len));
|
844
|
+
return tolower_lithuianian_i(buf, base, combiner);
|
818
845
|
}
|
819
846
|
|
820
|
-
|
821
|
-
unichar tv = GREEK_SMALL_LETTER_FINAL_SIGMA;
|
822
|
-
|
823
|
-
if ((!use_end || *p < end) && **p != '\0') {
|
824
|
-
unichar next_c = utf_char(*p);
|
825
|
-
int next_type = s_type(next_c);
|
847
|
+
no_lithuanian_i_casing:
|
826
848
|
|
827
|
-
|
828
|
-
|
829
|
-
* fail in the case of combining marks following the
|
830
|
-
* sigma, but I don't think that occurs in real text.
|
831
|
-
* The test here matches that in ICU. */
|
832
|
-
if (s_isalpha(next_type))
|
833
|
-
tv = GREEK_SMALL_LETTER_SIGMA;
|
834
|
-
}
|
835
|
-
|
836
|
-
return unichar_to_utf(tv, buf);
|
837
|
-
}
|
849
|
+
if (c == GREEK_CAPITAL_LETTER_SIGMA)
|
850
|
+
return tolower_sigma(p, buf, end, use_end);
|
838
851
|
|
839
852
|
if (IS(type, OR(UNICODE_UPPERCASE_LETTER,
|
840
853
|
OR(UNICODE_TITLECASE_LETTER, 0))))
|
@@ -879,7 +892,7 @@ utf_downcase_impl(const char *str, size_t max, bool use_max)
|
|
879
892
|
size_t len = real_tolower(str, max, use_max, NULL, locale_type);
|
880
893
|
char *result = ALLOC_N(char, len + 1);
|
881
894
|
real_tolower(str, max, use_max, result, locale_type);
|
882
|
-
result[len] =
|
895
|
+
result[len] = '\0';
|
883
896
|
|
884
897
|
return result;
|
885
898
|
}
|
@@ -915,28 +928,19 @@ utf_downcase_n(const char *str, size_t len)
|
|
915
928
|
static bool
|
916
929
|
casefold_table_lookup(unichar c, char *folded, size_t *len)
|
917
930
|
{
|
918
|
-
int
|
919
|
-
int end = lengthof(casefold_table);
|
931
|
+
int index;
|
920
932
|
|
921
|
-
if (
|
933
|
+
if (!unicode_table_lookup(casefold_table, c, &index))
|
922
934
|
return false;
|
923
935
|
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
|
928
|
-
|
929
|
-
|
930
|
-
|
931
|
-
|
932
|
-
} else if (mid == begin) {
|
933
|
-
return false;
|
934
|
-
} else if (c > casefold_table[mid].ch) {
|
935
|
-
begin = mid;
|
936
|
-
} else {
|
937
|
-
end = mid;
|
938
|
-
}
|
939
|
-
}
|
936
|
+
char const *folded_c = casefold_table[index].data;
|
937
|
+
|
938
|
+
if (folded != NULL)
|
939
|
+
strcpy(folded, folded_c);
|
940
|
+
|
941
|
+
*len += utf_byte_length(folded_c);
|
942
|
+
|
943
|
+
return true;
|
940
944
|
}
|
941
945
|
|
942
946
|
static char *
|
@@ -1037,24 +1041,15 @@ utf_width_n(const char *str, size_t len)
|
|
1037
1041
|
bool
|
1038
1042
|
unichar_mirror(unichar c, unichar *mirrored)
|
1039
1043
|
{
|
1040
|
-
|
1041
|
-
int end = lengthof(bidi_mirroring_table);
|
1044
|
+
int index;
|
1042
1045
|
|
1043
|
-
|
1044
|
-
|
1046
|
+
if (!unicode_table_lookup(bidi_mirroring_table, c, &index))
|
1047
|
+
return false;
|
1045
1048
|
|
1046
|
-
|
1047
|
-
|
1048
|
-
|
1049
|
-
|
1050
|
-
} else if (mid == begin) {
|
1051
|
-
return false;
|
1052
|
-
} else if (c > bidi_mirroring_table[mid].ch) {
|
1053
|
-
begin = mid;
|
1054
|
-
} else {
|
1055
|
-
end = mid;
|
1056
|
-
}
|
1057
|
-
}
|
1049
|
+
if (mirrored != NULL)
|
1050
|
+
*mirrored = bidi_mirroring_table[index].mirrored_ch;
|
1051
|
+
|
1052
|
+
return true;
|
1058
1053
|
}
|
1059
1054
|
|
1060
1055
|
|
@@ -0,0 +1,52 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Private Ruby-related functions.
|
3
|
+
*
|
4
|
+
* Copyright © 2007 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#ifndef RB_PRIVATE_H
|
8
|
+
#define RB_PRIVATE_H
|
9
|
+
|
10
|
+
|
11
|
+
void need_at_least_n_arguments(int argc, int n) HIDDEN;
|
12
|
+
|
13
|
+
unichar _utf_char_validated(char const *const str,
|
14
|
+
char const *const str_end) HIDDEN;
|
15
|
+
char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
|
16
|
+
const char *limit, bool noisy) HIDDEN;
|
17
|
+
|
18
|
+
char *_utf_offset_to_pointer_validated(const char *str, long offset,
|
19
|
+
const char *end) HIDDEN;
|
20
|
+
|
21
|
+
char *_utf_offset_to_pointer_failable(const char *str, long offset,
|
22
|
+
const char *end) HIDDEN;
|
23
|
+
|
24
|
+
VALUE rb_utf_new(const char *str, long len) HIDDEN;
|
25
|
+
|
26
|
+
VALUE rb_utf_new2(const char *str) HIDDEN;
|
27
|
+
|
28
|
+
VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
|
29
|
+
|
30
|
+
VALUE rb_utf_alloc_using(char *str) HIDDEN;
|
31
|
+
|
32
|
+
VALUE rb_utf_dup(VALUE str) HIDDEN;
|
33
|
+
|
34
|
+
long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
|
35
|
+
|
36
|
+
bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
|
37
|
+
char **limit) HIDDEN;
|
38
|
+
|
39
|
+
void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
|
40
|
+
char **limit) HIDDEN;
|
41
|
+
|
42
|
+
char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
|
43
|
+
|
44
|
+
VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
|
45
|
+
|
46
|
+
char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
|
47
|
+
|
48
|
+
long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
|
49
|
+
long offset, bool reverse) HIDDEN;
|
50
|
+
|
51
|
+
|
52
|
+
#endif /* RB_PRIVATE_H */
|
@@ -113,7 +113,8 @@ rb_utf_to_inum_num_separator(const char *str, const char *s, bool verify,
|
|
113
113
|
|
114
114
|
if (*non_digit != 0)
|
115
115
|
rb_raise(rb_eArgError,
|
116
|
-
"unexpected ‘%lc’ found at position %ld",
|
116
|
+
"unexpected ‘%lc’ found at position %ld",
|
117
|
+
c, utf_pointer_to_offset(str, s));
|
117
118
|
|
118
119
|
*non_digit = c;
|
119
120
|
|
@@ -135,7 +136,7 @@ rb_utf_to_inum_digit_value(const char *str, const char *s, unichar c,
|
|
135
136
|
return false;
|
136
137
|
rb_raise(rb_eArgError,
|
137
138
|
"non-digit character ‘%lc’ found at position %ld",
|
138
|
-
c, s
|
139
|
+
c, utf_pointer_to_offset(str, s));
|
139
140
|
}
|
140
141
|
|
141
142
|
if (value >= base) {
|
@@ -144,7 +145,7 @@ rb_utf_to_inum_digit_value(const char *str, const char *s, unichar c,
|
|
144
145
|
|
145
146
|
rb_raise(rb_eArgError,
|
146
147
|
"value (%d) greater than base (%d) at position %ld",
|
147
|
-
value, base, s
|
148
|
+
value, base, utf_pointer_to_offset(str, s));
|
148
149
|
}
|
149
150
|
|
150
151
|
*digit_value = value;
|
@@ -181,7 +182,7 @@ rb_utf_to_inum_as_fix(const char *str, const char *s, int sign, int base,
|
|
181
182
|
if (*s != '\0')
|
182
183
|
rb_raise(rb_eArgError,
|
183
184
|
"trailing garbage found at position %ld",
|
184
|
-
s
|
185
|
+
utf_pointer_to_offset(str, s));
|
185
186
|
}
|
186
187
|
|
187
188
|
if (POSFIXABLE(value)) {
|
@@ -221,7 +222,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
|
|
221
222
|
if (verify)
|
222
223
|
rb_raise(rb_eArgError,
|
223
224
|
"extra sign ‘%c’ found at position %ld",
|
224
|
-
*s, s
|
225
|
+
*s, utf_pointer_to_offset(str, s));
|
225
226
|
return INT2FIX(0);
|
226
227
|
}
|
227
228
|
|
@@ -245,7 +246,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
|
|
245
246
|
if (verify && *str == '_')
|
246
247
|
rb_raise(rb_eArgError,
|
247
248
|
"leading digit-separator ‘_’ found at position %ld",
|
248
|
-
s
|
249
|
+
utf_pointer_to_offset(str, s));
|
249
250
|
|
250
251
|
bit_length = bit_length / BITSPERDIG + 1;
|
251
252
|
|
@@ -269,7 +270,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
|
|
269
270
|
|
270
271
|
bool more_to_shift = true;
|
271
272
|
while (more_to_shift) {
|
272
|
-
BDIGIT_DBL num =
|
273
|
+
BDIGIT_DBL num = digit_value;
|
273
274
|
|
274
275
|
for (int i = 0; i < big_len; i++) {
|
275
276
|
num += (BDIGIT_DBL)zds[i] * base;
|
@@ -294,12 +295,12 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
|
|
294
295
|
if (str + 1 < s && s[-1] == '_')
|
295
296
|
rb_raise(rb_eArgError,
|
296
297
|
"trailing digit-separator ‘_’ found at position %ld",
|
297
|
-
s
|
298
|
+
utf_pointer_to_offset(str, s));
|
298
299
|
|
299
300
|
if (*s != '\0')
|
300
301
|
rb_raise(rb_eArgError,
|
301
302
|
"trailing garbage found at position %ld",
|
302
|
-
s
|
303
|
+
utf_pointer_to_offset(str, s));
|
303
304
|
|
304
305
|
return rb_big_norm(z);
|
305
306
|
}
|
@@ -0,0 +1,38 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Functions for dealing with Unicode tables.
|
3
|
+
*
|
4
|
+
* Copyright © 2007 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#ifndef TABLES_H
|
8
|
+
#define TABLES_H
|
9
|
+
|
10
|
+
|
11
|
+
/*
|
12
|
+
static inline int
|
13
|
+
split_unicode_table_lookup_page(const uint8_t data[][256], int16_t page, unichar c)
|
14
|
+
{
|
15
|
+
return (page >= UNICODE_MAX_TABLE_INDEX) ?
|
16
|
+
page - UNICODE_MAX_TABLE_INDEX :
|
17
|
+
data[page][c & 0xff];
|
18
|
+
}
|
19
|
+
|
20
|
+
static inline int
|
21
|
+
split_unicode_table_lookup(const uint8_t data[][256], const int16_t part1[], const int16_t part2[], unichar c, int fallback)
|
22
|
+
{
|
23
|
+
if (c <= UNICODE_LAST_CHAR_PART1)
|
24
|
+
return split_unicode_table_lookup_page(data,
|
25
|
+
part1[c >> 8],
|
26
|
+
c);
|
27
|
+
|
28
|
+
if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR)
|
29
|
+
return split_unicode_table_lookup_page(data,
|
30
|
+
part2[(c - UNICODE_FIRST_CHAR_PART2) >> 8],
|
31
|
+
c);
|
32
|
+
|
33
|
+
return fallback;
|
34
|
+
}
|
35
|
+
*/
|
36
|
+
|
37
|
+
|
38
|
+
#endif /* TABLES_H */
|
@@ -13,6 +13,7 @@
|
|
13
13
|
#include <limits.h>
|
14
14
|
#include "unicode.h"
|
15
15
|
#include "private.h"
|
16
|
+
#include "rb_private.h"
|
16
17
|
#include "rb_methods.h"
|
17
18
|
|
18
19
|
static VALUE mUTF8Methods;
|
@@ -85,7 +86,6 @@ _utf_offset_to_pointer_validated_impl(const char *str, long offset,
|
|
85
86
|
saved_offset);
|
86
87
|
else
|
87
88
|
return NULL;
|
88
|
-
break;
|
89
89
|
}
|
90
90
|
|
91
91
|
offset += utf_pointer_to_offset(p, base);
|
@@ -190,6 +190,9 @@ utf_char(const char *str)
|
|
190
190
|
unichar
|
191
191
|
utf_char_n(const char *str, size_t max)
|
192
192
|
{
|
193
|
+
if (max == 0)
|
194
|
+
return UTF_INCOMPLETE_INPUT_UNICHAR;
|
195
|
+
|
193
196
|
size_t len;
|
194
197
|
unichar c = (unsigned char)*str;
|
195
198
|
|
@@ -454,7 +457,7 @@ utf_collate(const char *a, const char *b)
|
|
454
457
|
|
455
458
|
unichar *a_norm = _utf_normalize_wc(a, 0, false, NORMALIZE_ALL_COMPOSE);
|
456
459
|
unichar *b_norm = _utf_normalize_wc(b, 0, false, NORMALIZE_ALL_COMPOSE);
|
457
|
-
|
460
|
+
|
458
461
|
int result = wcscoll((wchar_t *)a_norm, (wchar_t *)b_norm);
|
459
462
|
|
460
463
|
free(a_norm);
|
@@ -518,7 +521,6 @@ utf_collate_key_impl(const char *str, size_t len, bool use_len)
|
|
518
521
|
assert(str != NULL);
|
519
522
|
|
520
523
|
unichar *str_norm = _utf_normalize_wc(str, len, use_len, NORMALIZE_ALL_COMPOSE);
|
521
|
-
setlocale(LC_COLLATE, "");
|
522
524
|
size_t xfrm_len = wcsxfrm(NULL, (wchar_t *)str_norm, 0);
|
523
525
|
wchar_t result_wc[xfrm_len + 1];
|
524
526
|
wcsxfrm(result_wc, (wchar_t *)str_norm, xfrm_len + 1);
|
@@ -863,7 +865,7 @@ utf_reverse_impl(const char *str, size_t len, bool use_len)
|
|
863
865
|
char *result = ALLOC_N(char, len + 1);
|
864
866
|
char *r = result + len;
|
865
867
|
const char *p = str;
|
866
|
-
|
868
|
+
while (r > result) {
|
867
869
|
uint8_t skip = s_utf_skip_lengths[*(unsigned char *)p];
|
868
870
|
r -= skip;
|
869
871
|
for (char *m = r; skip > 0; skip--)
|
data/tests/case.rb
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
# contents: Tests for String#upcase and String#downcase.
|
2
|
+
#
|
3
|
+
# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
4
|
+
|
5
|
+
require 'tests/unicodedatatestbase'
|
6
|
+
require 'encoding/character/utf-8'
|
7
|
+
|
8
|
+
class TC_StringCase < Test::Unit::TestCase
|
9
|
+
include UnicodeDataTestBase
|
10
|
+
|
11
|
+
Code, Name, Category, _, _, _, _, _, _, _, _, _, Upper, Lower, Title = (0..14).to_a
|
12
|
+
CasingCode, CasingLower, CasingTitle, CasingUpper, CasingCondition = (0..4).to_a
|
13
|
+
|
14
|
+
def test_upcase_and_downcase
|
15
|
+
# TODO: Do it like this. First read in SpecialCasing.txt and set up lookup
|
16
|
+
# tables for all the characters that need special casing. Then, iterate
|
17
|
+
# over UnicodeData and simply check that the correct casings are performed,
|
18
|
+
# looking up data in the tables for special casing if no simple casing
|
19
|
+
# information is available (and skipping when appropriate - such as when
|
20
|
+
# there is some condition defined for the special casing).
|
21
|
+
special = Struct.new(:conditions, :upper, :lower, :title).new({}, [], [], [])
|
22
|
+
open_data_file('SpecialCasing.txt') do |file|
|
23
|
+
i = 0
|
24
|
+
file.each_line do |line|
|
25
|
+
i += 1
|
26
|
+
next if line =~ /^(#|\s*$)/
|
27
|
+
fields = line.sub(/\s*#.*$/, "").split('; ')
|
28
|
+
unless fields.size == 4 or fields.size == 5
|
29
|
+
raise "#{line}: Wrong number of fields; #{field.size} instead of 4 or 5."
|
30
|
+
end
|
31
|
+
code = fields[CasingCode].hex
|
32
|
+
special.conditions[code] = fields[CasingCondition] if fields.size == 5
|
33
|
+
special.upper[code] = utfify(fields[CasingUpper])
|
34
|
+
special.lower[code] = utfify(fields[CasingLower])
|
35
|
+
special.title[code] = utfify(fields[CasingTitle])
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
open_data_file('UnicodeData.txt') do |file|
|
40
|
+
i = 0
|
41
|
+
prev_code = -1
|
42
|
+
file.each_line do |line|
|
43
|
+
i += 1
|
44
|
+
next if line =~ /^(#|\s*$)/
|
45
|
+
fields = line.split(';')
|
46
|
+
raise "#{line}: Wrong number of fields; #{field.size} instead of 15." unless fields.size == 15
|
47
|
+
code = fields[Code].hex
|
48
|
+
if code > prev_code + 1 and fields[Name] =~ /Last>$/ and fields[Category] =~ /^L[lut]$/
|
49
|
+
prev_code.upto(code - 1){ |c| test_one c, fields, special }
|
50
|
+
end
|
51
|
+
test_one code, fields, special
|
52
|
+
prev_code = code
|
53
|
+
end
|
54
|
+
end
|
55
|
+
puts @i
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
|
60
|
+
def utfify(codepoints)
|
61
|
+
return codepoints if codepoints == ""
|
62
|
+
codepoints.split(' ').map{ |cp| cp.hex }.pack('U*')
|
63
|
+
end
|
64
|
+
|
65
|
+
def utfone(codepoint)
|
66
|
+
u([codepoint].pack('U*'))
|
67
|
+
end
|
68
|
+
|
69
|
+
def test_one(code, fields, special)
|
70
|
+
@i ||= 0
|
71
|
+
@i += 1
|
72
|
+
case fields[Category]
|
73
|
+
when 'Ll'
|
74
|
+
test_upcase(code, fields, special)
|
75
|
+
when 'Lu'
|
76
|
+
test_downcase(code, fields, special)
|
77
|
+
when 'Lt'
|
78
|
+
test_upcase(code, fields, special)
|
79
|
+
test_downcase(code, fields, special)
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def test_upcase(code, fields, special)
|
84
|
+
if special.upper[code]
|
85
|
+
if not special.conditions[code]
|
86
|
+
assert_equal(special.upper[code], utfone(code).upcase)
|
87
|
+
end
|
88
|
+
elsif not fields[Upper].empty?
|
89
|
+
assert_equal(utfify(fields[Upper]), utfone(code).upcase)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
def test_downcase(code, fields, special)
|
94
|
+
if special.lower[code]
|
95
|
+
if not special.conditions[code]
|
96
|
+
assert_equal(special.lower[code], utfone(code).downcase)
|
97
|
+
end
|
98
|
+
elsif not fields[Lower].empty?
|
99
|
+
assert_equal(utfify(fields[Lower]), utfone(code).downcase)
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.
|
2
|
+
rubygems_version: 0.9.4
|
3
3
|
specification_version: 1
|
4
4
|
name: character-encodings
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.0
|
7
|
-
date:
|
6
|
+
version: 0.3.0
|
7
|
+
date: 2007-11-22 00:00:00 +01:00
|
8
8
|
summary: A pluggable character-encoding library
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -48,6 +48,7 @@ files:
|
|
48
48
|
- ext/encoding/character/unicode/codepoint.c
|
49
49
|
- ext/encoding/character/utf-8/break.c
|
50
50
|
- ext/encoding/character/utf-8/decompose.c
|
51
|
+
- ext/encoding/character/utf-8/private.c
|
51
52
|
- ext/encoding/character/utf-8/properties.c
|
52
53
|
- ext/encoding/character/utf-8/rb_utf_aref.c
|
53
54
|
- ext/encoding/character/utf-8/rb_utf_aset.c
|
@@ -80,19 +81,22 @@ files:
|
|
80
81
|
- ext/encoding/character/utf-8/unicode.c
|
81
82
|
- ext/encoding/character/utf-8/utf.c
|
82
83
|
- ext/encoding/character/utf-8/rb_utf_internal_bignum.c
|
84
|
+
- ext/encoding/character/utf-8/data/break.h
|
85
|
+
- ext/encoding/character/utf-8/data/character-tables.h
|
86
|
+
- ext/encoding/character/utf-8/data/compose.h
|
87
|
+
- ext/encoding/character/utf-8/data/decompose.h
|
83
88
|
- ext/encoding/character/utf-8/private.h
|
84
89
|
- ext/encoding/character/utf-8/rb_includes.h
|
85
90
|
- ext/encoding/character/utf-8/rb_methods.h
|
91
|
+
- ext/encoding/character/utf-8/rb_private.h
|
86
92
|
- ext/encoding/character/utf-8/rb_utf_internal_tr.h
|
93
|
+
- ext/encoding/character/utf-8/tables.h
|
87
94
|
- ext/encoding/character/utf-8/unicode.h
|
88
95
|
- ext/encoding/character/utf-8/rb_utf_internal_bignum.h
|
89
|
-
- ext/encoding/character/utf-8/data/break.h
|
90
|
-
- ext/encoding/character/utf-8/data/character-tables.h
|
91
|
-
- ext/encoding/character/utf-8/data/compose.h
|
92
|
-
- ext/encoding/character/utf-8/data/decompose.h
|
93
|
-
- ext/encoding/character/utf-8/extconf.rb
|
94
96
|
- ext/encoding/character/utf-8/data/generate-unicode-data.rb
|
97
|
+
- ext/encoding/character/utf-8/extconf.rb
|
95
98
|
- ext/encoding/character/utf-8/depend
|
99
|
+
- tests/case.rb
|
96
100
|
- tests/foldcase.rb
|
97
101
|
- tests/normalize.rb
|
98
102
|
- tests/unicodedatatestbase.rb
|