character-encodings 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +2 -2
- data/ext/encoding/character/utf-8/break.c +6 -19
- data/ext/encoding/character/utf-8/data/character-tables.h +2 -0
- data/ext/encoding/character/utf-8/data/decompose.h +1 -0
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +5 -0
- data/ext/encoding/character/utf-8/decompose.c +77 -109
- data/ext/encoding/character/utf-8/depend +1 -0
- data/ext/encoding/character/utf-8/extconf.rb +2 -0
- data/ext/encoding/character/utf-8/private.c +62 -0
- data/ext/encoding/character/utf-8/private.h +18 -38
- data/ext/encoding/character/utf-8/properties.c +90 -95
- data/ext/encoding/character/utf-8/rb_includes.h +1 -0
- data/ext/encoding/character/utf-8/rb_private.h +52 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +10 -9
- data/ext/encoding/character/utf-8/tables.h +38 -0
- data/ext/encoding/character/utf-8/unicode.c +1 -1
- data/ext/encoding/character/utf-8/utf.c +5 -3
- data/tests/case.rb +102 -0
- metadata +12 -8
data/Rakefile
CHANGED
@@ -10,7 +10,7 @@ require 'rake/testtask'
|
|
10
10
|
require 'spec/rake/spectask'
|
11
11
|
|
12
12
|
PackageName = 'character-encodings'
|
13
|
-
PackageVersion = '0.
|
13
|
+
PackageVersion = '0.3.0'
|
14
14
|
|
15
15
|
desc 'Default task'
|
16
16
|
task :default => [:extensions]
|
@@ -92,7 +92,7 @@ Spec::Rake::SpecTask.new do |t|
|
|
92
92
|
end
|
93
93
|
|
94
94
|
Tests = [
|
95
|
-
['tests/foldcase.rb'],
|
95
|
+
['tests/foldcase.rb', 'tests/case.rb'],
|
96
96
|
['tests/normalize.rb']
|
97
97
|
]
|
98
98
|
|
@@ -9,30 +9,17 @@
|
|
9
9
|
#include <stdint.h>
|
10
10
|
#include "unicode.h"
|
11
11
|
#include "data/break.h"
|
12
|
-
|
12
|
+
#include "private.h"
|
13
13
|
|
14
14
|
/* Figure out what break type the Unicode character ‘c’ possesses, if any.
|
15
15
|
* This information is used for finding word and line boundaries, which is
|
16
16
|
* useful when displaying Unicode text on screen. */
|
17
|
-
static UnicodeBreakType
|
18
|
-
break_type(const int16_t table[], unsigned int page, unichar c)
|
19
|
-
{
|
20
|
-
int16_t break_property = table[page];
|
21
|
-
|
22
|
-
return (break_property >= UNICODE_MAX_TABLE_INDEX) ?
|
23
|
-
break_property - UNICODE_MAX_TABLE_INDEX :
|
24
|
-
break_property_data[break_property][c & 0xff];
|
25
|
-
}
|
26
|
-
|
27
17
|
UnicodeBreakType
|
28
18
|
unichar_break_type(unichar c)
|
29
19
|
{
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
(c - 0xe0000) >> 8, c);
|
36
|
-
|
37
|
-
return UNICODE_BREAK_UNKNOWN;
|
20
|
+
return SPLIT_UNICODE_TABLE_LOOKUP(break_property_data,
|
21
|
+
break_property_table_part1,
|
22
|
+
break_property_table_part2,
|
23
|
+
c,
|
24
|
+
UNICODE_BREAK_UNKNOWN);
|
38
25
|
}
|
@@ -534,6 +534,8 @@ private
|
|
534
534
|
|
535
535
|
#define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
|
536
536
|
|
537
|
+
#define UNICODE_FIRST_CHAR_PART2 0xe0000
|
538
|
+
|
537
539
|
#define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
|
538
540
|
EOF
|
539
541
|
print_table(data, 0, @last_char_part1_i, data.last, 1,
|
@@ -694,8 +696,11 @@ EOF
|
|
694
696
|
#define UNICODE_MAX_TABLE_INDEX (0x110000 / 256)
|
695
697
|
|
696
698
|
#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}
|
699
|
+
|
697
700
|
#define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
|
698
701
|
|
702
|
+
#define UNICODE_FIRST_CHAR_PART2 0xe0000
|
703
|
+
|
699
704
|
#define UNICODE_NOT_PRESENT_OFFSET #{NOT_PRESENT_OFFSET}
|
700
705
|
EOF
|
701
706
|
print_table(data, 0, @last_char_part1_i, data.last, 1,
|
@@ -8,35 +8,16 @@
|
|
8
8
|
#include <stdbool.h>
|
9
9
|
#include <stddef.h>
|
10
10
|
#include <stdint.h>
|
11
|
+
|
11
12
|
#include "unicode.h"
|
12
|
-
|
13
|
+
|
13
14
|
#include "data/decompose.h"
|
14
15
|
#include "data/compose.h"
|
15
16
|
|
17
|
+
#include "private.h"
|
16
18
|
|
17
|
-
|
18
|
-
|
19
|
-
* character.
|
20
|
-
*
|
21
|
-
* TODO: Turn these macros into full-fledged functions, as this is rather silly
|
22
|
-
* when we have ‹inline› in C99.
|
23
|
-
*/
|
24
|
-
#define CC_PART1(page, char) \
|
25
|
-
((combining_class_table_part1[page] >= UNICODE_MAX_TABLE_INDEX) \
|
26
|
-
? (combining_class_table_part1[page] - UNICODE_MAX_TABLE_INDEX) \
|
27
|
-
: (cclass_data[combining_class_table_part1[page]][char]))
|
28
|
-
|
29
|
-
#define CC_PART2(page, char) \
|
30
|
-
((combining_class_table_part2[page] >= UNICODE_MAX_TABLE_INDEX) \
|
31
|
-
? (combining_class_table_part2[page] - UNICODE_MAX_TABLE_INDEX) \
|
32
|
-
: (cclass_data[combining_class_table_part2[page]][char]))
|
33
|
-
|
34
|
-
#define COMBINING_CLASS(char) \
|
35
|
-
(((char) <= UNICODE_LAST_CHAR_PART1) \
|
36
|
-
? CC_PART1((char) >> 8, (char) & 0xff) \
|
37
|
-
: (((char) >= 0xe0000 && (char) <= UNICODE_LAST_CHAR) \
|
38
|
-
? CC_PART2(((char) - 0xe0000) >> 8, (char) & 0xff) \
|
39
|
-
: 0))
|
19
|
+
#define COMBINING_CLASS(c) \
|
20
|
+
SPLIT_UNICODE_TABLE_LOOKUP(cclass_data, combining_class_table_part1, combining_class_table_part2, (c), 0)
|
40
21
|
|
41
22
|
|
42
23
|
/* {{{1
|
@@ -115,12 +96,11 @@ unicode_canonical_ordering(unichar *str, size_t len)
|
|
115
96
|
/* {{{1
|
116
97
|
* Decompose the character ‘s’ according to the rules outlined in
|
117
98
|
* http://www.unicode.org/unicode/reports/tr15/#Hangul. ‘r’ should be ‹NULL›
|
118
|
-
* or of sufficient length to store the decomposition of ‘s’.
|
119
|
-
* characters stored (or would be if it were non
|
120
|
-
* ‘r_len’.
|
99
|
+
* or of sufficient length to store the decomposition of ‘s’. Returns the
|
100
|
+
* number of characters stored (or would be if it were non-NULL) in R.
|
121
101
|
*/
|
122
|
-
static
|
123
|
-
decompose_hangul(unichar s, unichar *r
|
102
|
+
static size_t
|
103
|
+
decompose_hangul(unichar s, unichar *r)
|
124
104
|
{
|
125
105
|
int SIndex = s - SBase;
|
126
106
|
|
@@ -128,8 +108,7 @@ decompose_hangul(unichar s, unichar *r, size_t *r_len)
|
|
128
108
|
if (SIndex < 0 || SIndex >= SCount) {
|
129
109
|
if (r != NULL)
|
130
110
|
r[0] = s;
|
131
|
-
|
132
|
-
return;
|
111
|
+
return 1;
|
133
112
|
}
|
134
113
|
|
135
114
|
unichar L = LBase + SIndex / NCount;
|
@@ -141,13 +120,13 @@ decompose_hangul(unichar s, unichar *r, size_t *r_len)
|
|
141
120
|
r[1] = V;
|
142
121
|
}
|
143
122
|
|
144
|
-
if (T
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
123
|
+
if (T == TBase)
|
124
|
+
return 2;
|
125
|
+
|
126
|
+
if (r != NULL)
|
127
|
+
r[2] = T;
|
128
|
+
|
129
|
+
return 3;
|
151
130
|
}
|
152
131
|
|
153
132
|
|
@@ -179,26 +158,27 @@ get_decomposition(int index, bool compat)
|
|
179
158
|
static const char *
|
180
159
|
find_decomposition(unichar c, bool compat)
|
181
160
|
{
|
182
|
-
int
|
183
|
-
int end = lengthof(decomp_table);
|
161
|
+
int index;
|
184
162
|
|
185
|
-
if (
|
163
|
+
if (!unicode_table_lookup(decomp_table, c, &index))
|
186
164
|
return NULL;
|
187
165
|
|
188
|
-
|
189
|
-
|
166
|
+
return get_decomposition(index, compat);
|
167
|
+
}
|
190
168
|
|
191
|
-
if (c == decomp_table[middle].ch)
|
192
|
-
return get_decomposition(middle, compat);
|
193
|
-
else if (middle == begin)
|
194
|
-
break;
|
195
|
-
else if (c > decomp_table[middle].ch)
|
196
|
-
begin = middle;
|
197
|
-
else
|
198
|
-
end = middle;
|
199
|
-
}
|
200
169
|
|
201
|
-
|
170
|
+
/* {{{1
|
171
|
+
* Copy over the UTF-8 decomposition in ‘decomposition’ to the unichar buffer
|
172
|
+
* ‘chars’. Return the number of unichars in ‘chars’.
|
173
|
+
*/
|
174
|
+
static size_t
|
175
|
+
decomposition_to_wc(const char *decomposition, unichar *chars)
|
176
|
+
{
|
177
|
+
size_t i = 0;
|
178
|
+
for (const char *p = decomposition; *p != '\0'; p = utf_next(p))
|
179
|
+
chars[i++] = utf_char(p);
|
180
|
+
|
181
|
+
return i;
|
202
182
|
}
|
203
183
|
|
204
184
|
|
@@ -215,17 +195,13 @@ unicode_canonical_decomposition(unichar c, size_t *len)
|
|
215
195
|
|
216
196
|
/* Hangul syllable */
|
217
197
|
if (c >= SBase && c <= SLast) {
|
218
|
-
decompose_hangul(c, NULL
|
198
|
+
*len = decompose_hangul(c, NULL);
|
219
199
|
r = ALLOC_N(unichar, *len);
|
220
|
-
decompose_hangul(c, r
|
200
|
+
decompose_hangul(c, r);
|
221
201
|
} else if ((decomp = find_decomposition(c, false)) != NULL) {
|
222
202
|
*len = utf_length(decomp);
|
223
203
|
r = ALLOC_N(unichar, *len);
|
224
|
-
|
225
|
-
int i;
|
226
|
-
const char *p;
|
227
|
-
for (p = decomp, i = 0; *p != NUL; p = utf_next(p), i++)
|
228
|
-
r[i] = utf_char(p);
|
204
|
+
decomposition_to_wc(decomp, r);
|
229
205
|
} else {
|
230
206
|
r = ALLOC(unichar);
|
231
207
|
*r = c;
|
@@ -281,23 +257,19 @@ compose_index(unichar c)
|
|
281
257
|
if (page > COMPOSE_TABLE_LAST)
|
282
258
|
return 0;
|
283
259
|
|
284
|
-
|
285
|
-
int16_t compose_offset = compose_table[page];
|
286
|
-
return (compose_offset >= UNICODE_MAX_TABLE_INDEX) ?
|
287
|
-
compose_offset - UNICODE_MAX_TABLE_INDEX :
|
288
|
-
compose_data[compose_offset][c & 0xff];
|
260
|
+
return SPLIT_UNICODE_TABLE_LOOKUP_PAGE(compose_data, compose_table, page, c);
|
289
261
|
}
|
290
262
|
|
291
263
|
static bool
|
292
264
|
lookup_compose(const uint16_t table[][2], uint16_t index, unichar c,
|
293
265
|
unichar *result)
|
294
266
|
{
|
295
|
-
if (c
|
296
|
-
|
297
|
-
return true;
|
298
|
-
}
|
267
|
+
if (c != table[index][0])
|
268
|
+
return false;
|
299
269
|
|
300
|
-
|
270
|
+
*result = table[index][1];
|
271
|
+
|
272
|
+
return true;
|
301
273
|
}
|
302
274
|
|
303
275
|
static bool
|
@@ -307,21 +279,18 @@ combine(unichar a, unichar b, unichar *result)
|
|
307
279
|
return true;
|
308
280
|
|
309
281
|
uint16_t index_a = compose_index(a);
|
310
|
-
if (index_a >= COMPOSE_FIRST_SINGLE_START &&
|
311
|
-
index_a < COMPOSE_SECOND_START) {
|
282
|
+
if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
|
312
283
|
return lookup_compose(compose_first_single,
|
313
284
|
index_a - COMPOSE_FIRST_SINGLE_START,
|
314
285
|
b,
|
315
286
|
result);
|
316
|
-
}
|
317
287
|
|
318
288
|
uint16_t index_b = compose_index(b);
|
319
|
-
if (index_b >= COMPOSE_SECOND_SINGLE_START)
|
289
|
+
if (index_b >= COMPOSE_SECOND_SINGLE_START)
|
320
290
|
return lookup_compose(compose_second_single,
|
321
291
|
index_b - COMPOSE_SECOND_SINGLE_START,
|
322
292
|
a,
|
323
293
|
result);
|
324
|
-
}
|
325
294
|
|
326
295
|
if (index_a >= COMPOSE_FIRST_START &&
|
327
296
|
index_a < COMPOSE_FIRST_SINGLE_START &&
|
@@ -356,12 +325,8 @@ normalize_wc_decompose_one(unichar c, NormalizeMode mode, unichar *buf)
|
|
356
325
|
return 1;
|
357
326
|
}
|
358
327
|
|
359
|
-
if (buf != NULL)
|
360
|
-
|
361
|
-
for (i = 0; *decomp != NUL; decomp = utf_next(decomp), i++)
|
362
|
-
buf[i] = utf_char(decomp);
|
363
|
-
return i;
|
364
|
-
}
|
328
|
+
if (buf != NULL)
|
329
|
+
return decomposition_to_wc(decomp, buf);
|
365
330
|
|
366
331
|
return utf_length(decomp);
|
367
332
|
}
|
@@ -378,14 +343,10 @@ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
|
|
378
343
|
size_t prev_n = n;
|
379
344
|
|
380
345
|
unichar *base = (buf != NULL) ? buf + n : NULL;
|
381
|
-
if (c >= SBase && c <= SLast)
|
382
|
-
|
383
|
-
|
384
|
-
decompose_hangul(c, base, &len);
|
385
|
-
n += len;
|
386
|
-
} else {
|
346
|
+
if (c >= SBase && c <= SLast)
|
347
|
+
n += decompose_hangul(c, base);
|
348
|
+
else
|
387
349
|
n += normalize_wc_decompose_one(c, mode, base);
|
388
|
-
}
|
389
350
|
|
390
351
|
if (buf != NULL && n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
|
391
352
|
unicode_canonical_ordering(buf + prev_start,
|
@@ -403,44 +364,51 @@ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
|
|
403
364
|
*buf_len = n;
|
404
365
|
}
|
405
366
|
|
406
|
-
unichar *
|
407
|
-
|
367
|
+
static unichar *
|
368
|
+
normalize_wc_compose(unichar *buf, size_t len)
|
408
369
|
{
|
409
|
-
|
410
|
-
normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
|
411
|
-
unichar *buf = ALLOC_N(unichar, n + 1);
|
412
|
-
normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
|
413
|
-
|
414
|
-
/* Just return if we don’t want composition. */
|
415
|
-
if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
|
416
|
-
return buf;
|
417
|
-
|
370
|
+
int new_len = len;
|
418
371
|
size_t prev_start = 0;
|
419
372
|
int prev_cc = 0;
|
420
|
-
|
373
|
+
|
374
|
+
for (size_t i = 0; i < len; i++) {
|
421
375
|
int cc = COMBINING_CLASS(buf[i]);
|
376
|
+
size_t j = i - (len - new_len);
|
422
377
|
|
423
|
-
if (
|
378
|
+
if (j > 0 && (prev_cc == 0 || prev_cc < cc) &&
|
424
379
|
combine(buf[prev_start], buf[i], &buf[prev_start])) {
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
n--;
|
429
|
-
i--;
|
430
|
-
prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]);
|
380
|
+
new_len--;
|
381
|
+
prev_cc = (j + 1 == prev_start) ?
|
382
|
+
0 : COMBINING_CLASS(buf[j - 1]);
|
431
383
|
} else {
|
432
384
|
if (cc == 0)
|
433
|
-
prev_start =
|
385
|
+
prev_start = j;
|
434
386
|
|
387
|
+
buf[j] = buf[i];
|
435
388
|
prev_cc = cc;
|
436
389
|
}
|
437
390
|
}
|
438
391
|
|
439
|
-
buf[
|
392
|
+
buf[new_len] = NUL;
|
440
393
|
|
441
394
|
return buf;
|
442
395
|
}
|
443
396
|
|
397
|
+
unichar *
|
398
|
+
_utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
|
399
|
+
{
|
400
|
+
size_t n;
|
401
|
+
normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
|
402
|
+
unichar *buf = ALLOC_N(unichar, n + 1);
|
403
|
+
normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
|
404
|
+
|
405
|
+
/* Just return if we don’t want composition. */
|
406
|
+
if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
|
407
|
+
return buf;
|
408
|
+
|
409
|
+
return normalize_wc_compose(buf, n);
|
410
|
+
}
|
411
|
+
|
444
412
|
|
445
413
|
/* {{{1
|
446
414
|
* Normalize (compose/decompose) characters in ‘str˚ so that strings that
|
@@ -1,6 +1,7 @@
|
|
1
1
|
break.o: break.c unicode.h data/break.h
|
2
2
|
decompose.o: decompose.c unicode.h private.h data/decompose.h \
|
3
3
|
data/compose.h
|
4
|
+
private.o: private.c private.h
|
4
5
|
properties.o: properties.c unicode.h private.h data/character-tables.h
|
5
6
|
rb_utf_aref.o: rb_utf_aref.c rb_includes.h unicode.h private.h \
|
6
7
|
rb_methods.h
|
@@ -12,6 +12,7 @@ def try_compiler_option(opt, &b)
|
|
12
12
|
end
|
13
13
|
|
14
14
|
try_compiler_option('-std=c99')
|
15
|
+
try_compiler_option('-finline-functions')
|
15
16
|
try_compiler_option('-Wall')
|
16
17
|
try_compiler_option('-Wextra')
|
17
18
|
try_compiler_option('-Wwrite-strings')
|
@@ -23,6 +24,7 @@ try_compiler_option('-Wundef')
|
|
23
24
|
try_compiler_option('-Wpointer-arith')
|
24
25
|
try_compiler_option('-Wcast-align')
|
25
26
|
try_compiler_option('-Werror')
|
27
|
+
try_compiler_option('-Winline')
|
26
28
|
# XXX: sadly, -Wshadow is a bit too strict. It will, for example, whine about
|
27
29
|
# local variables called “index” on FreeBSD.
|
28
30
|
# try_compiler_option('-Wshadow')
|
@@ -0,0 +1,62 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Private functions used by the UTF-8 character-encoding library.
|
3
|
+
*
|
4
|
+
* Copyright © 2007 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include <ruby.h>
|
8
|
+
#include <stdbool.h>
|
9
|
+
#include <stddef.h>
|
10
|
+
#include <stdint.h>
|
11
|
+
#include <stdlib.h>
|
12
|
+
|
13
|
+
#include "unicode.h"
|
14
|
+
|
15
|
+
#include "private.h"
|
16
|
+
|
17
|
+
/* Lookup C in the sorted TABLE using binary search. TABLE consists of N
|
18
|
+
* entries, where each entry is SIZEOF_ENTRY bytes in size and the first
|
19
|
+
* component is a unichar of size SIZEOF_CHAR. If C is found in TABLE, its
|
20
|
+
* index is stored in INDEX and true is returned. Otherwise, false is returned
|
21
|
+
* and INDEX is left untouched. */
|
22
|
+
bool
|
23
|
+
binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index)
|
24
|
+
{
|
25
|
+
#define ENTRY(index) ((unichar)(*(unichar *)((const char *)table + ((index) * sizeof_entry))) & char_mask)
|
26
|
+
|
27
|
+
int begin = 0;
|
28
|
+
int end = n - 1;
|
29
|
+
int middle;
|
30
|
+
|
31
|
+
/* This is ugly, but not all tables use unichars as their lookup
|
32
|
+
* character. The casefold table, for example, uses uint16_t-sized
|
33
|
+
* characters. To only get the interesting part of our table entry
|
34
|
+
* we’ll have to mask the retrieved value. */
|
35
|
+
int char_mask = (1 << (8 * sizeof_char)) - 1;
|
36
|
+
|
37
|
+
/* Drop out early if we know for certain that C can’t be in the
|
38
|
+
* decomposition table. */
|
39
|
+
if (c < ENTRY(0) || c > ENTRY(end))
|
40
|
+
return false;
|
41
|
+
|
42
|
+
while (begin <= end) {
|
43
|
+
middle = binary_search_middle_of(begin, end);
|
44
|
+
|
45
|
+
unichar probe = ENTRY(middle);
|
46
|
+
if (c < probe)
|
47
|
+
end = middle - 1;
|
48
|
+
else if (c > probe)
|
49
|
+
begin = middle + 1;
|
50
|
+
else
|
51
|
+
break;
|
52
|
+
}
|
53
|
+
|
54
|
+
if (begin > end)
|
55
|
+
return false;
|
56
|
+
|
57
|
+
*index = middle;
|
58
|
+
|
59
|
+
return true;
|
60
|
+
|
61
|
+
#undef ENTRY
|
62
|
+
}
|
@@ -21,48 +21,28 @@
|
|
21
21
|
# define HIDDEN(u)
|
22
22
|
#endif
|
23
23
|
|
24
|
-
|
25
|
-
|
26
|
-
inline int _unichar_combining_class(unichar c) HIDDEN;
|
27
|
-
|
28
|
-
void need_at_least_n_arguments(int argc, int n) HIDDEN;
|
29
|
-
|
30
|
-
unichar _utf_char_validated(char const *const str,
|
31
|
-
char const *const str_end) HIDDEN;
|
32
|
-
char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
|
33
|
-
const char *limit, bool noisy) HIDDEN;
|
34
|
-
|
35
|
-
char *_utf_offset_to_pointer_validated(const char *str, long offset,
|
36
|
-
const char *end) HIDDEN;
|
37
|
-
|
38
|
-
char *_utf_offset_to_pointer_failable(const char *str, long offset,
|
39
|
-
const char *end) HIDDEN;
|
40
|
-
|
41
|
-
VALUE rb_utf_new(const char *str, long len) HIDDEN;
|
42
|
-
|
43
|
-
VALUE rb_utf_new2(const char *str) HIDDEN;
|
24
|
+
#define binary_search_middle_of(begin, end) \
|
25
|
+
(((unsigned)((begin) + (end))) >> 1)
|
44
26
|
|
45
|
-
|
27
|
+
#define unicode_table_lookup(table, c, index) \
|
28
|
+
binary_search_unicode_table(table, lengthof(table), sizeof((table)[0]), sizeof((table)[0].ch), c, index)
|
46
29
|
|
47
|
-
|
30
|
+
bool binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index) HIDDEN;
|
48
31
|
|
49
|
-
|
32
|
+
#define SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part, page, c) \
|
33
|
+
((part[page] >= UNICODE_MAX_TABLE_INDEX) \
|
34
|
+
? (part[page] - UNICODE_MAX_TABLE_INDEX) \
|
35
|
+
: (data[part[page]][(c) & 0xff]))
|
50
36
|
|
51
|
-
|
37
|
+
#define SPLIT_UNICODE_TABLE_LOOKUP(data, part1, part2, c, fallback) \
|
38
|
+
(((c) <= UNICODE_LAST_CHAR_PART1) \
|
39
|
+
? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part1, (c) >> 8, c) \
|
40
|
+
: (((c) >= UNICODE_FIRST_CHAR_PART2 && (c) <= UNICODE_LAST_CHAR) \
|
41
|
+
? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part2, ((c) - UNICODE_FIRST_CHAR_PART2) >> 8, c) \
|
42
|
+
: (fallback)))
|
52
43
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
|
57
|
-
char **limit) HIDDEN;
|
58
|
-
|
59
|
-
char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
|
60
|
-
|
61
|
-
VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
|
62
|
-
|
63
|
-
char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
|
64
|
-
|
65
|
-
long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
|
66
|
-
long offset, bool reverse) HIDDEN;
|
44
|
+
unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
|
45
|
+
NormalizeMode mode) HIDDEN;
|
46
|
+
int _unichar_combining_class(unichar c) HIDDEN;
|
67
47
|
|
68
48
|
#endif /* PRIVATE_H */
|
@@ -63,8 +63,8 @@ s_type(unichar c)
|
|
63
63
|
if (c <= UNICODE_LAST_CHAR_PART1) {
|
64
64
|
page = c >> 8;
|
65
65
|
table = type_table_part1;
|
66
|
-
} else if (c >=
|
67
|
-
page = (c -
|
66
|
+
} else if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR) {
|
67
|
+
page = (c - UNICODE_FIRST_CHAR_PART2) >> 8;
|
68
68
|
table = type_table_part2;
|
69
69
|
} else {
|
70
70
|
return UNICODE_UNASSIGNED;
|
@@ -364,8 +364,8 @@ special_case_table_lookup(unichar c)
|
|
364
364
|
unichar tv = ATTTABLE(c >> 8, c & 0xff);
|
365
365
|
|
366
366
|
if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
|
367
|
-
|
368
|
-
|
367
|
+
tv = utf_char(special_case_table +
|
368
|
+
tv - UNICODE_SPECIAL_CASE_TABLE_START);
|
369
369
|
|
370
370
|
if (tv == '\0')
|
371
371
|
return c;
|
@@ -429,7 +429,7 @@ unichar_totitle(unichar c)
|
|
429
429
|
return title_table[i][0];
|
430
430
|
|
431
431
|
if (s_type(c) == UNICODE_LOWERCASE_LETTER)
|
432
|
-
return
|
432
|
+
return unichar_toupper(c);
|
433
433
|
|
434
434
|
return c;
|
435
435
|
}
|
@@ -585,8 +585,7 @@ real_toupper_lithuanian(const char **p, unichar c, int type, char *buf,
|
|
585
585
|
|
586
586
|
if (*was_i) {
|
587
587
|
size_t len = remove_all_combining_dot_above(c, buf);
|
588
|
-
return len + output_marks(p, (buf
|
589
|
-
true);
|
588
|
+
return len + output_marks(p, OFFSET_IF(buf, len), true);
|
590
589
|
}
|
591
590
|
|
592
591
|
if (!s_ismark(type))
|
@@ -761,9 +760,48 @@ real_do_tolower(unichar c, int type, char *buf)
|
|
761
760
|
|
762
761
|
/* {{{1
|
763
762
|
* The real implementation of downcase.
|
764
|
-
*
|
765
|
-
* TODO: this needs a cleanup.
|
766
763
|
*/
|
764
|
+
static size_t
|
765
|
+
tolower_turkic_i(const char **p, char *buf)
|
766
|
+
{
|
767
|
+
unichar i = LATIN_SMALL_LETTER_DOTLESS_I;
|
768
|
+
|
769
|
+
if (utf_char(*p) == COMBINING_DOT_ABOVE) {
|
770
|
+
/* TODO: don’t we need to make sure we don’t go beyond the end
|
771
|
+
* of ‘p’? */
|
772
|
+
*p = utf_next(*p);
|
773
|
+
i = LATIN_SMALL_LETTER_I;
|
774
|
+
}
|
775
|
+
|
776
|
+
return unichar_to_utf(i, buf);
|
777
|
+
}
|
778
|
+
|
779
|
+
static size_t
|
780
|
+
tolower_lithuianian_i(char *buf, unichar base, unichar combiner)
|
781
|
+
{
|
782
|
+
size_t len = unichar_to_utf(base, buf);
|
783
|
+
len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
|
784
|
+
if (combiner != '\0')
|
785
|
+
len += unichar_to_utf(combiner, OFFSET_IF(buf, len));
|
786
|
+
|
787
|
+
return len;
|
788
|
+
}
|
789
|
+
|
790
|
+
static size_t
|
791
|
+
tolower_sigma(const char **p, char *buf, const char *end, bool use_end)
|
792
|
+
{
|
793
|
+
unichar sigma = GREEK_SMALL_LETTER_FINAL_SIGMA;
|
794
|
+
|
795
|
+
/* SIGMA maps differently depending on whether it is final or not. The
|
796
|
+
* following simplified test would fail in the case of combining marks
|
797
|
+
* following the sigma, but I don't think that occurs in real text.
|
798
|
+
* The test here matches that in ICU. */
|
799
|
+
if ((!use_end || *p < end) && **p != '\0' && s_isalpha(s_type(utf_char(*p))))
|
800
|
+
sigma = GREEK_SMALL_LETTER_SIGMA;
|
801
|
+
|
802
|
+
return unichar_to_utf(sigma, buf);
|
803
|
+
}
|
804
|
+
|
767
805
|
static size_t
|
768
806
|
real_tolower_one(const char **p, const char *prev, char *buf,
|
769
807
|
LocaleType locale_type, const char *end, bool use_end)
|
@@ -771,70 +809,45 @@ real_tolower_one(const char **p, const char *prev, char *buf,
|
|
771
809
|
unichar c = utf_char(prev);
|
772
810
|
int type = s_type(c);
|
773
811
|
|
774
|
-
if (locale_type == LOCALE_TURKIC && c == 'I')
|
775
|
-
|
776
|
-
/* TODO: don’t we need to make sure we don’t go beyond the end
|
777
|
-
* of ‘p’? */
|
778
|
-
*p = utf_next(*p);
|
779
|
-
return unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
|
780
|
-
}
|
812
|
+
if (locale_type == LOCALE_TURKIC && c == 'I')
|
813
|
+
return tolower_turkic_i(p, buf);
|
781
814
|
|
782
|
-
|
783
|
-
|
815
|
+
/* Introduce an explicit dot above the lowercasing capital I’s
|
816
|
+
* and J’s whenever there are more accents above.
|
817
|
+
* [SpecialCasing.txt] */
|
818
|
+
if (locale_type == LOCALE_LITHUANIAN) {
|
819
|
+
unichar base = LATIN_SMALL_LETTER_I;
|
820
|
+
unichar combiner = '\0';
|
784
821
|
|
785
|
-
if (locale_type == LOCALE_LITHUANIAN &&
|
786
|
-
(c == LATIN_CAPITAL_LETTER_I_WITH_GRAVE ||
|
787
|
-
c == LATIN_CAPITAL_LETTER_I_WITH_ACUTE ||
|
788
|
-
c == LATIN_CAPITAL_LETTER_I_WITH_TILDE)) {
|
789
|
-
/* Introduce an explicit dot above the lowercasing capital I's
|
790
|
-
* and J's whenever there are more accents above.
|
791
|
-
* [SpecialCasing.txt] */
|
792
|
-
size_t len = unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
|
793
|
-
len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
|
794
822
|
switch (c) {
|
795
823
|
case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
|
796
|
-
|
797
|
-
OFFSET_IF(buf, len));
|
824
|
+
combiner = COMBINING_GRAVE_ACCENT;
|
798
825
|
break;
|
799
826
|
case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
|
800
|
-
|
801
|
-
OFFSET_IF(buf, len));
|
827
|
+
combiner = COMBINING_ACUTE_ACCENT;
|
802
828
|
break;
|
803
829
|
case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
|
804
|
-
|
805
|
-
OFFSET_IF(buf, len));
|
830
|
+
combiner = COMBINING_TILDE;
|
806
831
|
break;
|
807
|
-
|
832
|
+
case 'I':
|
833
|
+
case 'J':
|
834
|
+
case LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
|
835
|
+
if (!has_more_above(*p))
|
836
|
+
goto no_lithuanian_i_casing;
|
808
837
|
|
809
|
-
|
810
|
-
|
838
|
+
base = unichar_tolower(c);
|
839
|
+
break;
|
840
|
+
default:
|
841
|
+
goto no_lithuanian_i_casing;
|
842
|
+
}
|
811
843
|
|
812
|
-
|
813
|
-
(c == 'I' || c == 'J' || c == LATIN_CAPITAL_LETTER_I_WITH_OGONEK) &&
|
814
|
-
has_more_above(*p)) {
|
815
|
-
size_t len = unichar_to_utf(unichar_tolower(c), buf);
|
816
|
-
return len + unichar_to_utf(COMBINING_DOT_ABOVE,
|
817
|
-
OFFSET_IF(buf, len));
|
844
|
+
return tolower_lithuianian_i(buf, base, combiner);
|
818
845
|
}
|
819
846
|
|
820
|
-
|
821
|
-
unichar tv = GREEK_SMALL_LETTER_FINAL_SIGMA;
|
822
|
-
|
823
|
-
if ((!use_end || *p < end) && **p != '\0') {
|
824
|
-
unichar next_c = utf_char(*p);
|
825
|
-
int next_type = s_type(next_c);
|
847
|
+
no_lithuanian_i_casing:
|
826
848
|
|
827
|
-
|
828
|
-
|
829
|
-
* fail in the case of combining marks following the
|
830
|
-
* sigma, but I don't think that occurs in real text.
|
831
|
-
* The test here matches that in ICU. */
|
832
|
-
if (s_isalpha(next_type))
|
833
|
-
tv = GREEK_SMALL_LETTER_SIGMA;
|
834
|
-
}
|
835
|
-
|
836
|
-
return unichar_to_utf(tv, buf);
|
837
|
-
}
|
849
|
+
if (c == GREEK_CAPITAL_LETTER_SIGMA)
|
850
|
+
return tolower_sigma(p, buf, end, use_end);
|
838
851
|
|
839
852
|
if (IS(type, OR(UNICODE_UPPERCASE_LETTER,
|
840
853
|
OR(UNICODE_TITLECASE_LETTER, 0))))
|
@@ -879,7 +892,7 @@ utf_downcase_impl(const char *str, size_t max, bool use_max)
|
|
879
892
|
size_t len = real_tolower(str, max, use_max, NULL, locale_type);
|
880
893
|
char *result = ALLOC_N(char, len + 1);
|
881
894
|
real_tolower(str, max, use_max, result, locale_type);
|
882
|
-
result[len] =
|
895
|
+
result[len] = '\0';
|
883
896
|
|
884
897
|
return result;
|
885
898
|
}
|
@@ -915,28 +928,19 @@ utf_downcase_n(const char *str, size_t len)
|
|
915
928
|
static bool
|
916
929
|
casefold_table_lookup(unichar c, char *folded, size_t *len)
|
917
930
|
{
|
918
|
-
int
|
919
|
-
int end = lengthof(casefold_table);
|
931
|
+
int index;
|
920
932
|
|
921
|
-
if (
|
933
|
+
if (!unicode_table_lookup(casefold_table, c, &index))
|
922
934
|
return false;
|
923
935
|
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
|
928
|
-
|
929
|
-
|
930
|
-
|
931
|
-
|
932
|
-
} else if (mid == begin) {
|
933
|
-
return false;
|
934
|
-
} else if (c > casefold_table[mid].ch) {
|
935
|
-
begin = mid;
|
936
|
-
} else {
|
937
|
-
end = mid;
|
938
|
-
}
|
939
|
-
}
|
936
|
+
char const *folded_c = casefold_table[index].data;
|
937
|
+
|
938
|
+
if (folded != NULL)
|
939
|
+
strcpy(folded, folded_c);
|
940
|
+
|
941
|
+
*len += utf_byte_length(folded_c);
|
942
|
+
|
943
|
+
return true;
|
940
944
|
}
|
941
945
|
|
942
946
|
static char *
|
@@ -1037,24 +1041,15 @@ utf_width_n(const char *str, size_t len)
|
|
1037
1041
|
bool
|
1038
1042
|
unichar_mirror(unichar c, unichar *mirrored)
|
1039
1043
|
{
|
1040
|
-
|
1041
|
-
int end = lengthof(bidi_mirroring_table);
|
1044
|
+
int index;
|
1042
1045
|
|
1043
|
-
|
1044
|
-
|
1046
|
+
if (!unicode_table_lookup(bidi_mirroring_table, c, &index))
|
1047
|
+
return false;
|
1045
1048
|
|
1046
|
-
|
1047
|
-
|
1048
|
-
|
1049
|
-
|
1050
|
-
} else if (mid == begin) {
|
1051
|
-
return false;
|
1052
|
-
} else if (c > bidi_mirroring_table[mid].ch) {
|
1053
|
-
begin = mid;
|
1054
|
-
} else {
|
1055
|
-
end = mid;
|
1056
|
-
}
|
1057
|
-
}
|
1049
|
+
if (mirrored != NULL)
|
1050
|
+
*mirrored = bidi_mirroring_table[index].mirrored_ch;
|
1051
|
+
|
1052
|
+
return true;
|
1058
1053
|
}
|
1059
1054
|
|
1060
1055
|
|
@@ -0,0 +1,52 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Private Ruby-related functions.
|
3
|
+
*
|
4
|
+
* Copyright © 2007 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#ifndef RB_PRIVATE_H
|
8
|
+
#define RB_PRIVATE_H
|
9
|
+
|
10
|
+
|
11
|
+
void need_at_least_n_arguments(int argc, int n) HIDDEN;
|
12
|
+
|
13
|
+
unichar _utf_char_validated(char const *const str,
|
14
|
+
char const *const str_end) HIDDEN;
|
15
|
+
char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
|
16
|
+
const char *limit, bool noisy) HIDDEN;
|
17
|
+
|
18
|
+
char *_utf_offset_to_pointer_validated(const char *str, long offset,
|
19
|
+
const char *end) HIDDEN;
|
20
|
+
|
21
|
+
char *_utf_offset_to_pointer_failable(const char *str, long offset,
|
22
|
+
const char *end) HIDDEN;
|
23
|
+
|
24
|
+
VALUE rb_utf_new(const char *str, long len) HIDDEN;
|
25
|
+
|
26
|
+
VALUE rb_utf_new2(const char *str) HIDDEN;
|
27
|
+
|
28
|
+
VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
|
29
|
+
|
30
|
+
VALUE rb_utf_alloc_using(char *str) HIDDEN;
|
31
|
+
|
32
|
+
VALUE rb_utf_dup(VALUE str) HIDDEN;
|
33
|
+
|
34
|
+
long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
|
35
|
+
|
36
|
+
bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
|
37
|
+
char **limit) HIDDEN;
|
38
|
+
|
39
|
+
void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
|
40
|
+
char **limit) HIDDEN;
|
41
|
+
|
42
|
+
char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
|
43
|
+
|
44
|
+
VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
|
45
|
+
|
46
|
+
char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
|
47
|
+
|
48
|
+
long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
|
49
|
+
long offset, bool reverse) HIDDEN;
|
50
|
+
|
51
|
+
|
52
|
+
#endif /* RB_PRIVATE_H */
|
@@ -113,7 +113,8 @@ rb_utf_to_inum_num_separator(const char *str, const char *s, bool verify,
|
|
113
113
|
|
114
114
|
if (*non_digit != 0)
|
115
115
|
rb_raise(rb_eArgError,
|
116
|
-
"unexpected ‘%lc’ found at position %ld",
|
116
|
+
"unexpected ‘%lc’ found at position %ld",
|
117
|
+
c, utf_pointer_to_offset(str, s));
|
117
118
|
|
118
119
|
*non_digit = c;
|
119
120
|
|
@@ -135,7 +136,7 @@ rb_utf_to_inum_digit_value(const char *str, const char *s, unichar c,
|
|
135
136
|
return false;
|
136
137
|
rb_raise(rb_eArgError,
|
137
138
|
"non-digit character ‘%lc’ found at position %ld",
|
138
|
-
c, s
|
139
|
+
c, utf_pointer_to_offset(str, s));
|
139
140
|
}
|
140
141
|
|
141
142
|
if (value >= base) {
|
@@ -144,7 +145,7 @@ rb_utf_to_inum_digit_value(const char *str, const char *s, unichar c,
|
|
144
145
|
|
145
146
|
rb_raise(rb_eArgError,
|
146
147
|
"value (%d) greater than base (%d) at position %ld",
|
147
|
-
value, base, s
|
148
|
+
value, base, utf_pointer_to_offset(str, s));
|
148
149
|
}
|
149
150
|
|
150
151
|
*digit_value = value;
|
@@ -181,7 +182,7 @@ rb_utf_to_inum_as_fix(const char *str, const char *s, int sign, int base,
|
|
181
182
|
if (*s != '\0')
|
182
183
|
rb_raise(rb_eArgError,
|
183
184
|
"trailing garbage found at position %ld",
|
184
|
-
s
|
185
|
+
utf_pointer_to_offset(str, s));
|
185
186
|
}
|
186
187
|
|
187
188
|
if (POSFIXABLE(value)) {
|
@@ -221,7 +222,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
|
|
221
222
|
if (verify)
|
222
223
|
rb_raise(rb_eArgError,
|
223
224
|
"extra sign ‘%c’ found at position %ld",
|
224
|
-
*s, s
|
225
|
+
*s, utf_pointer_to_offset(str, s));
|
225
226
|
return INT2FIX(0);
|
226
227
|
}
|
227
228
|
|
@@ -245,7 +246,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
|
|
245
246
|
if (verify && *str == '_')
|
246
247
|
rb_raise(rb_eArgError,
|
247
248
|
"leading digit-separator ‘_’ found at position %ld",
|
248
|
-
s
|
249
|
+
utf_pointer_to_offset(str, s));
|
249
250
|
|
250
251
|
bit_length = bit_length / BITSPERDIG + 1;
|
251
252
|
|
@@ -269,7 +270,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
|
|
269
270
|
|
270
271
|
bool more_to_shift = true;
|
271
272
|
while (more_to_shift) {
|
272
|
-
BDIGIT_DBL num =
|
273
|
+
BDIGIT_DBL num = digit_value;
|
273
274
|
|
274
275
|
for (int i = 0; i < big_len; i++) {
|
275
276
|
num += (BDIGIT_DBL)zds[i] * base;
|
@@ -294,12 +295,12 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
|
|
294
295
|
if (str + 1 < s && s[-1] == '_')
|
295
296
|
rb_raise(rb_eArgError,
|
296
297
|
"trailing digit-separator ‘_’ found at position %ld",
|
297
|
-
s
|
298
|
+
utf_pointer_to_offset(str, s));
|
298
299
|
|
299
300
|
if (*s != '\0')
|
300
301
|
rb_raise(rb_eArgError,
|
301
302
|
"trailing garbage found at position %ld",
|
302
|
-
s
|
303
|
+
utf_pointer_to_offset(str, s));
|
303
304
|
|
304
305
|
return rb_big_norm(z);
|
305
306
|
}
|
@@ -0,0 +1,38 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Functions for dealing with Unicode tables.
|
3
|
+
*
|
4
|
+
* Copyright © 2007 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#ifndef TABLES_H
|
8
|
+
#define TABLES_H
|
9
|
+
|
10
|
+
|
11
|
+
/*
|
12
|
+
static inline int
|
13
|
+
split_unicode_table_lookup_page(const uint8_t data[][256], int16_t page, unichar c)
|
14
|
+
{
|
15
|
+
return (page >= UNICODE_MAX_TABLE_INDEX) ?
|
16
|
+
page - UNICODE_MAX_TABLE_INDEX :
|
17
|
+
data[page][c & 0xff];
|
18
|
+
}
|
19
|
+
|
20
|
+
static inline int
|
21
|
+
split_unicode_table_lookup(const uint8_t data[][256], const int16_t part1[], const int16_t part2[], unichar c, int fallback)
|
22
|
+
{
|
23
|
+
if (c <= UNICODE_LAST_CHAR_PART1)
|
24
|
+
return split_unicode_table_lookup_page(data,
|
25
|
+
part1[c >> 8],
|
26
|
+
c);
|
27
|
+
|
28
|
+
if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR)
|
29
|
+
return split_unicode_table_lookup_page(data,
|
30
|
+
part2[(c - UNICODE_FIRST_CHAR_PART2) >> 8],
|
31
|
+
c);
|
32
|
+
|
33
|
+
return fallback;
|
34
|
+
}
|
35
|
+
*/
|
36
|
+
|
37
|
+
|
38
|
+
#endif /* TABLES_H */
|
@@ -13,6 +13,7 @@
|
|
13
13
|
#include <limits.h>
|
14
14
|
#include "unicode.h"
|
15
15
|
#include "private.h"
|
16
|
+
#include "rb_private.h"
|
16
17
|
#include "rb_methods.h"
|
17
18
|
|
18
19
|
static VALUE mUTF8Methods;
|
@@ -85,7 +86,6 @@ _utf_offset_to_pointer_validated_impl(const char *str, long offset,
|
|
85
86
|
saved_offset);
|
86
87
|
else
|
87
88
|
return NULL;
|
88
|
-
break;
|
89
89
|
}
|
90
90
|
|
91
91
|
offset += utf_pointer_to_offset(p, base);
|
@@ -190,6 +190,9 @@ utf_char(const char *str)
|
|
190
190
|
unichar
|
191
191
|
utf_char_n(const char *str, size_t max)
|
192
192
|
{
|
193
|
+
if (max == 0)
|
194
|
+
return UTF_INCOMPLETE_INPUT_UNICHAR;
|
195
|
+
|
193
196
|
size_t len;
|
194
197
|
unichar c = (unsigned char)*str;
|
195
198
|
|
@@ -454,7 +457,7 @@ utf_collate(const char *a, const char *b)
|
|
454
457
|
|
455
458
|
unichar *a_norm = _utf_normalize_wc(a, 0, false, NORMALIZE_ALL_COMPOSE);
|
456
459
|
unichar *b_norm = _utf_normalize_wc(b, 0, false, NORMALIZE_ALL_COMPOSE);
|
457
|
-
|
460
|
+
|
458
461
|
int result = wcscoll((wchar_t *)a_norm, (wchar_t *)b_norm);
|
459
462
|
|
460
463
|
free(a_norm);
|
@@ -518,7 +521,6 @@ utf_collate_key_impl(const char *str, size_t len, bool use_len)
|
|
518
521
|
assert(str != NULL);
|
519
522
|
|
520
523
|
unichar *str_norm = _utf_normalize_wc(str, len, use_len, NORMALIZE_ALL_COMPOSE);
|
521
|
-
setlocale(LC_COLLATE, "");
|
522
524
|
size_t xfrm_len = wcsxfrm(NULL, (wchar_t *)str_norm, 0);
|
523
525
|
wchar_t result_wc[xfrm_len + 1];
|
524
526
|
wcsxfrm(result_wc, (wchar_t *)str_norm, xfrm_len + 1);
|
@@ -863,7 +865,7 @@ utf_reverse_impl(const char *str, size_t len, bool use_len)
|
|
863
865
|
char *result = ALLOC_N(char, len + 1);
|
864
866
|
char *r = result + len;
|
865
867
|
const char *p = str;
|
866
|
-
|
868
|
+
while (r > result) {
|
867
869
|
uint8_t skip = s_utf_skip_lengths[*(unsigned char *)p];
|
868
870
|
r -= skip;
|
869
871
|
for (char *m = r; skip > 0; skip--)
|
data/tests/case.rb
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
# contents: Tests for String#upcase and String#downcase.
|
2
|
+
#
|
3
|
+
# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
4
|
+
|
5
|
+
require 'tests/unicodedatatestbase'
|
6
|
+
require 'encoding/character/utf-8'
|
7
|
+
|
8
|
+
class TC_StringCase < Test::Unit::TestCase
|
9
|
+
include UnicodeDataTestBase
|
10
|
+
|
11
|
+
Code, Name, Category, _, _, _, _, _, _, _, _, _, Upper, Lower, Title = (0..14).to_a
|
12
|
+
CasingCode, CasingLower, CasingTitle, CasingUpper, CasingCondition = (0..4).to_a
|
13
|
+
|
14
|
+
def test_upcase_and_downcase
|
15
|
+
# TODO: Do it like this. First read in SpecialCasing.txt and set up lookup
|
16
|
+
# tables for all the characters that need special casing. Then, iterate
|
17
|
+
# over UnicodeData and simply check that the correct casings are performed,
|
18
|
+
# looking up data in the tables for special casing if no simple casing
|
19
|
+
# information is available (and skipping when appropriate - such as when
|
20
|
+
# there is some condition defined for the special casing).
|
21
|
+
special = Struct.new(:conditions, :upper, :lower, :title).new({}, [], [], [])
|
22
|
+
open_data_file('SpecialCasing.txt') do |file|
|
23
|
+
i = 0
|
24
|
+
file.each_line do |line|
|
25
|
+
i += 1
|
26
|
+
next if line =~ /^(#|\s*$)/
|
27
|
+
fields = line.sub(/\s*#.*$/, "").split('; ')
|
28
|
+
unless fields.size == 4 or fields.size == 5
|
29
|
+
raise "#{line}: Wrong number of fields; #{fields.size} instead of 4 or 5."
|
30
|
+
end
|
31
|
+
code = fields[CasingCode].hex
|
32
|
+
special.conditions[code] = fields[CasingCondition] if fields.size == 5
|
33
|
+
special.upper[code] = utfify(fields[CasingUpper])
|
34
|
+
special.lower[code] = utfify(fields[CasingLower])
|
35
|
+
special.title[code] = utfify(fields[CasingTitle])
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
open_data_file('UnicodeData.txt') do |file|
|
40
|
+
i = 0
|
41
|
+
prev_code = -1
|
42
|
+
file.each_line do |line|
|
43
|
+
i += 1
|
44
|
+
next if line =~ /^(#|\s*$)/
|
45
|
+
fields = line.split(';')
|
46
|
+
raise "#{line}: Wrong number of fields; #{fields.size} instead of 15." unless fields.size == 15
|
47
|
+
code = fields[Code].hex
|
48
|
+
if code > prev_code + 1 and fields[Name] =~ /Last>$/ and fields[Category] =~ /^L[lut]$/
|
49
|
+
prev_code.upto(code - 1){ |c| test_one c, fields, special }
|
50
|
+
end
|
51
|
+
test_one code, fields, special
|
52
|
+
prev_code = code
|
53
|
+
end
|
54
|
+
end
|
55
|
+
puts @i
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
|
60
|
+
def utfify(codepoints)
|
61
|
+
return codepoints if codepoints == ""
|
62
|
+
codepoints.split(' ').map{ |cp| cp.hex }.pack('U*')
|
63
|
+
end
|
64
|
+
|
65
|
+
def utfone(codepoint)
|
66
|
+
u([codepoint].pack('U*'))
|
67
|
+
end
|
68
|
+
|
69
|
+
def test_one(code, fields, special)
|
70
|
+
@i ||= 0
|
71
|
+
@i += 1
|
72
|
+
case fields[Category]
|
73
|
+
when 'Ll'
|
74
|
+
test_upcase(code, fields, special)
|
75
|
+
when 'Lu'
|
76
|
+
test_downcase(code, fields, special)
|
77
|
+
when 'Lt'
|
78
|
+
test_upcase(code, fields, special)
|
79
|
+
test_downcase(code, fields, special)
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def test_upcase(code, fields, special)
|
84
|
+
if special.upper[code]
|
85
|
+
if not special.conditions[code]
|
86
|
+
assert_equal(special.upper[code], utfone(code).upcase)
|
87
|
+
end
|
88
|
+
elsif not fields[Upper].empty?
|
89
|
+
assert_equal(utfify(fields[Upper]), utfone(code).upcase)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
def test_downcase(code, fields, special)
|
94
|
+
if special.lower[code]
|
95
|
+
if not special.conditions[code]
|
96
|
+
assert_equal(special.lower[code], utfone(code).downcase)
|
97
|
+
end
|
98
|
+
elsif not fields[Lower].empty?
|
99
|
+
assert_equal(utfify(fields[Lower]), utfone(code).downcase)
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.
|
2
|
+
rubygems_version: 0.9.4
|
3
3
|
specification_version: 1
|
4
4
|
name: character-encodings
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.0
|
7
|
-
date:
|
6
|
+
version: 0.3.0
|
7
|
+
date: 2007-11-22 00:00:00 +01:00
|
8
8
|
summary: A pluggable character-encoding library
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -48,6 +48,7 @@ files:
|
|
48
48
|
- ext/encoding/character/unicode/codepoint.c
|
49
49
|
- ext/encoding/character/utf-8/break.c
|
50
50
|
- ext/encoding/character/utf-8/decompose.c
|
51
|
+
- ext/encoding/character/utf-8/private.c
|
51
52
|
- ext/encoding/character/utf-8/properties.c
|
52
53
|
- ext/encoding/character/utf-8/rb_utf_aref.c
|
53
54
|
- ext/encoding/character/utf-8/rb_utf_aset.c
|
@@ -80,19 +81,22 @@ files:
|
|
80
81
|
- ext/encoding/character/utf-8/unicode.c
|
81
82
|
- ext/encoding/character/utf-8/utf.c
|
82
83
|
- ext/encoding/character/utf-8/rb_utf_internal_bignum.c
|
84
|
+
- ext/encoding/character/utf-8/data/break.h
|
85
|
+
- ext/encoding/character/utf-8/data/character-tables.h
|
86
|
+
- ext/encoding/character/utf-8/data/compose.h
|
87
|
+
- ext/encoding/character/utf-8/data/decompose.h
|
83
88
|
- ext/encoding/character/utf-8/private.h
|
84
89
|
- ext/encoding/character/utf-8/rb_includes.h
|
85
90
|
- ext/encoding/character/utf-8/rb_methods.h
|
91
|
+
- ext/encoding/character/utf-8/rb_private.h
|
86
92
|
- ext/encoding/character/utf-8/rb_utf_internal_tr.h
|
93
|
+
- ext/encoding/character/utf-8/tables.h
|
87
94
|
- ext/encoding/character/utf-8/unicode.h
|
88
95
|
- ext/encoding/character/utf-8/rb_utf_internal_bignum.h
|
89
|
-
- ext/encoding/character/utf-8/data/break.h
|
90
|
-
- ext/encoding/character/utf-8/data/character-tables.h
|
91
|
-
- ext/encoding/character/utf-8/data/compose.h
|
92
|
-
- ext/encoding/character/utf-8/data/decompose.h
|
93
|
-
- ext/encoding/character/utf-8/extconf.rb
|
94
96
|
- ext/encoding/character/utf-8/data/generate-unicode-data.rb
|
97
|
+
- ext/encoding/character/utf-8/extconf.rb
|
95
98
|
- ext/encoding/character/utf-8/depend
|
99
|
+
- tests/case.rb
|
96
100
|
- tests/foldcase.rb
|
97
101
|
- tests/normalize.rb
|
98
102
|
- tests/unicodedatatestbase.rb
|