u 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +38 -0
- data/Rakefile +64 -0
- data/ext/encoding/character/utf-8/break.c +25 -0
- data/ext/encoding/character/utf-8/data/break.h +22931 -0
- data/ext/encoding/character/utf-8/data/character-tables.h +14358 -0
- data/ext/encoding/character/utf-8/data/compose.h +1607 -0
- data/ext/encoding/character/utf-8/data/decompose.h +10926 -0
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1070 -0
- data/ext/encoding/character/utf-8/decompose.c +444 -0
- data/ext/encoding/character/utf-8/depend +65 -0
- data/ext/encoding/character/utf-8/extconf.rb +67 -0
- data/ext/encoding/character/utf-8/private.c +62 -0
- data/ext/encoding/character/utf-8/private.h +51 -0
- data/ext/encoding/character/utf-8/properties.c +1056 -0
- data/ext/encoding/character/utf-8/rb_includes.h +19 -0
- data/ext/encoding/character/utf-8/rb_methods.h +49 -0
- data/ext/encoding/character/utf-8/rb_private.h +52 -0
- data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
- data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
- data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
- data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
- data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
- data/ext/encoding/character/utf-8/rb_utf_insert.c +48 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +332 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
- data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
- data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
- data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
- data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
- data/ext/encoding/character/utf-8/tables.h +38 -0
- data/ext/encoding/character/utf-8/unicode.c +319 -0
- data/ext/encoding/character/utf-8/unicode.h +216 -0
- data/ext/encoding/character/utf-8/utf.c +1334 -0
- data/lib/encoding/character/utf-8.rb +201 -0
- data/lib/u.rb +16 -0
- data/lib/u/string.rb +185 -0
- data/lib/u/version.rb +5 -0
- data/test/unit/u.rb +5 -0
- data/test/unit/u/string.rb +91 -0
- metadata +174 -0
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'mkmf'
|
2
|
+
|
3
|
+
# Probe whether the compiler accepts +opt+ by test-compiling an empty source
# with that option, reporting the outcome through mkmf's checking_for.
# When the probe succeeds the option is appended to $CFLAGS.  An optional
# block is handed through to try_compile unchanged.
def try_compiler_option(opt, &block)
  checking_for("#{opt} option to compiler") do
    # A failed probe yields nil, which checking_for reports as a negative.
    next unless try_compile('', opt, &block)

    $CFLAGS += " #{opt}"
  end
end
|
8
|
+
|
9
|
+
# Optional compiler flags, probed one at a time in this order.  Each flag the
# compiler accepts is added to $CFLAGS by try_compiler_option; rejected flags
# are simply left out.
[
  '-std=c99',
  '-finline-functions',
  '-fno-common',
  '-Wall',
  '-Waggregate-return',
  '-Wcast-align',
  '-Wextra',
  '-Wformat=2',
  '-Winit-self',
  '-Winline',
  '-Wmissing-declarations',
  '-Wmissing-format-attribute',
  '-Wmissing-include-dirs',
  '-Wmissing-noreturn',
  '-Wmissing-prototypes',
  '-Wnested-externs',
  '-Wold-style-definition',
  '-Wpacked',
  '-Wp,-D_FORTIFY_SOURCE=2',
  '-Wpointer-arith',
  '-Wsign-compare',
  '-Wstrict-aliasing=2',
  '-Wswitch-default',
  '-Wswitch-enum',
  '-Wundef',
  '-Wunsafe-loop-optimizations',
  '-Wwrite-strings'
].each do |option|
  try_compiler_option option
end
|
36
|
+
|
37
|
+
checking_for 'GNUC visibility attribute' do
|
38
|
+
$defs.push('-DHAVE_GNUC_VISIBILITY') if try_compile <<EOC, '-Werror'
|
39
|
+
void f_hidden(void);
|
40
|
+
void __attribute__((visibility("hidden")))
|
41
|
+
f_hidden(void)
|
42
|
+
{
|
43
|
+
}
|
44
|
+
int main(void)
|
45
|
+
{
|
46
|
+
f_hidden();
|
47
|
+
return 0;
|
48
|
+
}
|
49
|
+
EOC
|
50
|
+
end
|
51
|
+
|
52
|
+
# Probe for each header in turn, in this order, via mkmf's have_header.
[
  'assert.h',
  'limits.h',
  'locale.h',
  'stdbool.h',
  'stddef.h',
  'stdint.h',
  'stdio.h',
  'stdlib.h',
  'string.h',
  'sys/types.h',
  'wchar.h'
].each do |header|
  have_header header
end
|
63
|
+
|
64
|
+
$INSTALLFILES ||= []
|
65
|
+
$INSTALLFILES << ['unicode.h', '$(RUBYARCHDIR)', 'lib']
|
66
|
+
|
67
|
+
create_makefile 'encoding/character/utf-8/utf8'
|
@@ -0,0 +1,62 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Private functions used by the UTF-8 character-encoding library.
|
3
|
+
*
|
4
|
+
* Copyright © 2007 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include <ruby.h>
|
8
|
+
#include <stdbool.h>
|
9
|
+
#include <stddef.h>
|
10
|
+
#include <stdint.h>
|
11
|
+
#include <stdlib.h>
|
12
|
+
|
13
|
+
#include "unicode.h"
|
14
|
+
|
15
|
+
#include "private.h"
|
16
|
+
|
17
|
+
/* Lookup C in the sorted TABLE using binary search. TABLE consists of N
|
18
|
+
* entries, where each entry is SIZEOF_ENTRY bytes in size and the first
|
19
|
+
* component is a unichar of size SIZEOF_CHAR. If C is found in TABLE, its
|
20
|
+
* index is stored in INDEX and true is returned. Otherwise, false is returned
|
21
|
+
* and INDEX is left untouched. */
|
22
|
+
bool
|
23
|
+
binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index)
|
24
|
+
{
|
25
|
+
#define ENTRY(index) ((unichar)(*(unichar *)((const char *)table + ((index) * sizeof_entry))) & char_mask)
|
26
|
+
|
27
|
+
int begin = 0;
|
28
|
+
int end = n - 1;
|
29
|
+
int middle;
|
30
|
+
|
31
|
+
/* This is ugly, but not all tables use unichars as their lookup
|
32
|
+
* character. The casefold table, for example, uses uint16_t-sized
|
33
|
+
* characters. To only get the interesting part of our table entry
|
34
|
+
* we’ll have to mask the retrieved value. */
|
35
|
+
int char_mask = (1 << (8 * sizeof_char)) - 1;
|
36
|
+
|
37
|
+
/* Drop out early if we know for certain that C can’t be in the
|
38
|
+
* decomposition table. */
|
39
|
+
if (c < ENTRY(0) || c > ENTRY(end))
|
40
|
+
return false;
|
41
|
+
|
42
|
+
while (begin <= end) {
|
43
|
+
middle = binary_search_middle_of(begin, end);
|
44
|
+
|
45
|
+
unichar probe = ENTRY(middle);
|
46
|
+
if (c < probe)
|
47
|
+
end = middle - 1;
|
48
|
+
else if (c > probe)
|
49
|
+
begin = middle + 1;
|
50
|
+
else
|
51
|
+
break;
|
52
|
+
}
|
53
|
+
|
54
|
+
if (begin > end)
|
55
|
+
return false;
|
56
|
+
|
57
|
+
*index = middle;
|
58
|
+
|
59
|
+
return true;
|
60
|
+
|
61
|
+
#undef ENTRY
|
62
|
+
}
|
@@ -0,0 +1,51 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Private Unicode related information.
|
3
|
+
*
|
4
|
+
* Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#ifndef PRIVATE_H
|
8
|
+
#define PRIVATE_H
|
9
|
+
|
10
|
+
#define NUL '\0'
|
11
|
+
#define lengthof(ary) (sizeof(ary) / sizeof((ary)[0]))
|
12
|
+
|
13
|
+
#if defined(HAVE_GNUC_VISIBILITY)
|
14
|
+
# define HIDDEN \
|
15
|
+
__attribute__((visibility("hidden")))
|
16
|
+
#else
|
17
|
+
# define HIDDEN
|
18
|
+
#endif
|
19
|
+
|
20
|
+
#if defined(__GNUC__)
|
21
|
+
# define UNUSED(u) \
|
22
|
+
u __attribute__((__unused__))
|
23
|
+
#else
|
24
|
+
# define UNUSED(u) \
|
25
|
+
u
|
26
|
+
#endif
|
27
|
+
|
28
|
+
#define binary_search_middle_of(begin, end) \
|
29
|
+
(((unsigned)((begin) + (end))) >> 1)
|
30
|
+
|
31
|
+
#define unicode_table_lookup(table, c, index) \
|
32
|
+
binary_search_unicode_table(table, lengthof(table), sizeof((table)[0]), sizeof((table)[0].ch), c, index)
|
33
|
+
|
34
|
+
bool binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index) HIDDEN;
|
35
|
+
|
36
|
+
#define SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part, page, c) \
|
37
|
+
((part[page] >= UNICODE_MAX_TABLE_INDEX) \
|
38
|
+
? (part[page] - UNICODE_MAX_TABLE_INDEX) \
|
39
|
+
: (data[part[page]][(c) & 0xff]))
|
40
|
+
|
41
|
+
#define SPLIT_UNICODE_TABLE_LOOKUP(data, part1, part2, c, fallback) \
|
42
|
+
(((c) <= UNICODE_LAST_CHAR_PART1) \
|
43
|
+
? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part1, (c) >> 8, c) \
|
44
|
+
: (((c) >= UNICODE_FIRST_CHAR_PART2 && (c) <= UNICODE_LAST_CHAR) \
|
45
|
+
? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part2, ((c) - UNICODE_FIRST_CHAR_PART2) >> 8, c) \
|
46
|
+
: (fallback)))
|
47
|
+
|
48
|
+
unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
|
49
|
+
NormalizeMode mode) HIDDEN;
|
50
|
+
|
51
|
+
#endif /* PRIVATE_H */
|
@@ -0,0 +1,1056 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Unicode character properties.
|
3
|
+
*
|
4
|
+
* Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include <ruby.h>
|
8
|
+
#include <assert.h>
|
9
|
+
#include <locale.h>
|
10
|
+
#include <stdbool.h>
|
11
|
+
#include <stddef.h>
|
12
|
+
#include <stdint.h>
|
13
|
+
#include <string.h>
|
14
|
+
#include "unicode.h"
|
15
|
+
#include "private.h"
|
16
|
+
#include "data/character-tables.h"
|
17
|
+
|
18
|
+
|
19
|
+
#define COMBINING_DOT_ABOVE ((unichar)0x0307)
|
20
|
+
#define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE ((unichar)0x0130)
|
21
|
+
#define COMBINING_GREEK_YPOGEGRAMMENI ((unichar)0x0345)
|
22
|
+
#define GREEK_CAPITAL_LETTER_IOTA ((unichar)0x0399)
|
23
|
+
#define LATIN_SMALL_LETTER_I ((unichar)0x0069)
|
24
|
+
#define LATIN_SMALL_LETTER_DOTLESS_I ((unichar)0x0131)
|
25
|
+
#define LATIN_CAPITAL_LETTER_I_WITH_GRAVE ((unichar)0x00cc)
|
26
|
+
#define LATIN_CAPITAL_LETTER_I_WITH_ACUTE ((unichar)0x00cd)
|
27
|
+
#define LATIN_CAPITAL_LETTER_I_WITH_TILDE ((unichar)0x0128)
|
28
|
+
#define LATIN_CAPITAL_LETTER_I_WITH_OGONEK ((unichar)0x012e)
|
29
|
+
#define COMBINING_GRAVE_ACCENT ((unichar)0x0300)
|
30
|
+
#define COMBINING_ACUTE_ACCENT ((unichar)0x0301)
|
31
|
+
#define COMBINING_TILDE ((unichar)0x0303)
|
32
|
+
#define GREEK_CAPITAL_LETTER_SIGMA ((unichar)0x03a3)
|
33
|
+
#define GREEK_SMALL_LETTER_SIGMA ((unichar)0x03c3)
|
34
|
+
#define GREEK_SMALL_LETTER_FINAL_SIGMA ((unichar)0x03c2)
|
35
|
+
|
36
|
+
#define OFFSET_IF(buf, len) (((buf) != NULL) ? (buf) + (len) : NULL)
|
37
|
+
|
38
|
+
/* {{{1
|
39
|
+
* Macros for accessing the Unicode character attribute table.
|
40
|
+
*
|
41
|
+
* TODO: Turn these macros into full-fledged functions, as this is rather silly
|
42
|
+
* when we have ‹inline› in C99.
|
43
|
+
*/
|
44
|
+
#define ATTR_TABLE(page) \
|
45
|
+
(((page) <= UNICODE_LAST_PAGE_PART1) \
|
46
|
+
? attr_table_part1[page] \
|
47
|
+
: attr_table_part2[(page) - 0xe00])
|
48
|
+
|
49
|
+
#define ATTTABLE(page, char) \
|
50
|
+
((ATTR_TABLE(page) == UNICODE_MAX_TABLE_INDEX) \
|
51
|
+
? 0 : (attr_data[ATTR_TABLE(page)][char]))
|
52
|
+
|
53
|
+
|
54
|
+
/* {{{1
|
55
|
+
* Internal function used for figuring out the type of a given character.
|
56
|
+
*/
|
57
|
+
static inline int
|
58
|
+
s_type(unichar c)
|
59
|
+
{
|
60
|
+
const int16_t *table;
|
61
|
+
unsigned int page;
|
62
|
+
|
63
|
+
if (c <= UNICODE_LAST_CHAR_PART1) {
|
64
|
+
page = c >> 8;
|
65
|
+
table = type_table_part1;
|
66
|
+
} else if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR) {
|
67
|
+
page = (c - UNICODE_FIRST_CHAR_PART2) >> 8;
|
68
|
+
table = type_table_part2;
|
69
|
+
} else {
|
70
|
+
return UNICODE_UNASSIGNED;
|
71
|
+
}
|
72
|
+
|
73
|
+
if (table[page] >= UNICODE_MAX_TABLE_INDEX)
|
74
|
+
return table[page] - UNICODE_MAX_TABLE_INDEX;
|
75
|
+
else
|
76
|
+
return type_data[table[page]][c & 0xff];
|
77
|
+
}
|
78
|
+
|
79
|
+
|
80
|
+
/* {{{1
|
81
|
+
* Bit-fiddling macros for testing the class of a type.
|
82
|
+
*/
|
83
|
+
#define IS(type, class) (((unsigned int)1 << (type)) & (class))
|
84
|
+
#define OR(type, rest) (((unsigned int)1 << (type)) | (rest))
|
85
|
+
|
86
|
+
|
87
|
+
/* {{{1
|
88
|
+
* Internal function used to check if the given type represents a digit type.
|
89
|
+
*/
|
90
|
+
static inline bool
|
91
|
+
s_isdigit(int type)
|
92
|
+
{
|
93
|
+
return IS(type,
|
94
|
+
OR(UNICODE_DECIMAL_NUMBER,
|
95
|
+
OR(UNICODE_LETTER_NUMBER,
|
96
|
+
OR(UNICODE_OTHER_NUMBER, 0))));
|
97
|
+
}
|
98
|
+
|
99
|
+
|
100
|
+
/* {{{1
|
101
|
+
* Internal function used to check if the given type represents an alphabetic
|
102
|
+
* type.
|
103
|
+
*/
|
104
|
+
static inline bool
|
105
|
+
s_isalpha(int type)
|
106
|
+
{
|
107
|
+
return IS(type,
|
108
|
+
OR(UNICODE_LOWERCASE_LETTER,
|
109
|
+
OR(UNICODE_UPPERCASE_LETTER,
|
110
|
+
OR(UNICODE_TITLECASE_LETTER,
|
111
|
+
OR(UNICODE_MODIFIER_LETTER,
|
112
|
+
OR(UNICODE_OTHER_LETTER, 0))))));
|
113
|
+
}
|
114
|
+
|
115
|
+
|
116
|
+
/* {{{1
|
117
|
+
* Internal function used to check if the given type represents a mark type.
|
118
|
+
*/
|
119
|
+
static inline bool
|
120
|
+
s_ismark(int type)
|
121
|
+
{
|
122
|
+
return IS(type,
|
123
|
+
OR(UNICODE_NON_SPACING_MARK,
|
124
|
+
OR(UNICODE_COMBINING_MARK,
|
125
|
+
OR(UNICODE_ENCLOSING_MARK, 0))));
|
126
|
+
}
|
127
|
+
|
128
|
+
|
129
|
+
/* {{{1
|
130
|
+
* Determine whether ‘c’ is an alphanumeric, such as A, B, C, 0, 1, or 2.
|
131
|
+
*/
|
132
|
+
bool
|
133
|
+
unichar_isalnum(unichar c)
|
134
|
+
{
|
135
|
+
int type = s_type(c);
|
136
|
+
|
137
|
+
return s_isdigit(type) || s_isalpha(type);
|
138
|
+
}
|
139
|
+
|
140
|
+
|
141
|
+
/* {{{1
|
142
|
+
* Determine whether ‘c’ is an alphabetic (i.e. a letter), such as A, B, or C.
|
143
|
+
*/
|
144
|
+
bool
|
145
|
+
unichar_isalpha(unichar c)
|
146
|
+
{
|
147
|
+
return s_isalpha(s_type(c));
|
148
|
+
}
|
149
|
+
|
150
|
+
|
151
|
+
/* {{{1
|
152
|
+
* Determine whether ‘c’ is a control character, such as ‹NUL›.
|
153
|
+
*/
|
154
|
+
bool
|
155
|
+
unichar_iscntrl(unichar c)
|
156
|
+
{
|
157
|
+
return s_type(c) == UNICODE_CONTROL;
|
158
|
+
}
|
159
|
+
|
160
|
+
|
161
|
+
/* {{{1
|
162
|
+
* Determine whether ‘c’ is a digit, such as 0, 1, or 2.
|
163
|
+
*/
|
164
|
+
bool
|
165
|
+
unichar_isdigit(unichar c)
|
166
|
+
{
|
167
|
+
return s_type(c) == UNICODE_DECIMAL_NUMBER;
|
168
|
+
}
|
169
|
+
|
170
|
+
|
171
|
+
/* {{{1
|
172
|
+
* Determine whether ‘c’ is printable and not a space or control character such
|
173
|
+
* as tab or <NUL›, such as A, B, or C.
|
174
|
+
*/
|
175
|
+
bool
|
176
|
+
unichar_isgraph(unichar c)
|
177
|
+
{
|
178
|
+
return !IS(s_type(c),
|
179
|
+
OR(UNICODE_CONTROL,
|
180
|
+
OR(UNICODE_FORMAT,
|
181
|
+
OR(UNICODE_UNASSIGNED,
|
182
|
+
OR(UNICODE_PRIVATE_USE,
|
183
|
+
OR(UNICODE_SURROGATE,
|
184
|
+
OR(UNICODE_SPACE_SEPARATOR, 0)))))));
|
185
|
+
}
|
186
|
+
|
187
|
+
|
188
|
+
/* {{{1
|
189
|
+
* Determine whether ‘c’ is a lowercase letter, such as a, b, or c.
|
190
|
+
*/
|
191
|
+
bool
|
192
|
+
unichar_islower(unichar c)
|
193
|
+
{
|
194
|
+
return s_type(c) == UNICODE_LOWERCASE_LETTER;
|
195
|
+
}
|
196
|
+
|
197
|
+
|
198
|
+
/* {{{1
|
199
|
+
* Determine whether ‘c’ is printable, which works the same as
|
200
|
+
* unichar_isgraph(), except that space characters are also printable.
|
201
|
+
*/
|
202
|
+
bool
|
203
|
+
unichar_isprint(unichar c)
|
204
|
+
{
|
205
|
+
return !IS(s_type(c),
|
206
|
+
OR(UNICODE_CONTROL,
|
207
|
+
OR(UNICODE_FORMAT,
|
208
|
+
OR(UNICODE_UNASSIGNED,
|
209
|
+
OR(UNICODE_PRIVATE_USE,
|
210
|
+
OR(UNICODE_SURROGATE, 0))))));
|
211
|
+
}
|
212
|
+
|
213
|
+
|
214
|
+
/* {{{1
|
215
|
+
* Determine whether ‘c’ is some form of punctuation or other symbol.
|
216
|
+
*/
|
217
|
+
bool
|
218
|
+
unichar_ispunct(unichar c)
|
219
|
+
{
|
220
|
+
return IS(s_type(c),
|
221
|
+
OR(UNICODE_CONNECT_PUNCTUATION,
|
222
|
+
OR(UNICODE_DASH_PUNCTUATION,
|
223
|
+
OR(UNICODE_OPEN_PUNCTUATION,
|
224
|
+
OR(UNICODE_CLOSE_PUNCTUATION,
|
225
|
+
OR(UNICODE_INITIAL_PUNCTUATION,
|
226
|
+
OR(UNICODE_FINAL_PUNCTUATION,
|
227
|
+
OR(UNICODE_OTHER_PUNCTUATION,
|
228
|
+
OR(UNICODE_MODIFIER_SYMBOL,
|
229
|
+
OR(UNICODE_MATH_SYMBOL,
|
230
|
+
OR(UNICODE_CURRENCY_SYMBOL,
|
231
|
+
OR(UNICODE_OTHER_SYMBOL, 0)))))))))))) ? true : false;
|
232
|
+
}
|
233
|
+
|
234
|
+
|
235
|
+
/* {{{1
|
236
|
+
* Determine whether ‘c’ is some form of whitespace, such as space, tab or a
|
237
|
+
* line separator (newline, carriage return, etc.).
|
238
|
+
*/
|
239
|
+
bool
|
240
|
+
unichar_isspace(unichar c)
|
241
|
+
{
|
242
|
+
switch (c) {
|
243
|
+
case '\t':
|
244
|
+
case '\n':
|
245
|
+
case '\r':
|
246
|
+
case '\f':
|
247
|
+
return true;
|
248
|
+
default:
|
249
|
+
return IS(s_type(c),
|
250
|
+
OR(UNICODE_SPACE_SEPARATOR,
|
251
|
+
OR(UNICODE_LINE_SEPARATOR,
|
252
|
+
OR(UNICODE_PARAGRAPH_SEPARATOR, 0)))) ? true : false;
|
253
|
+
}
|
254
|
+
}
|
255
|
+
|
256
|
+
|
257
|
+
/* {{{1
|
258
|
+
* Determine whether ‘c’ is an uppeercase letter, such as A, B, or C
|
259
|
+
*/
|
260
|
+
bool
|
261
|
+
unichar_isupper(unichar c)
|
262
|
+
{
|
263
|
+
return s_type(c) == UNICODE_UPPERCASE_LETTER;
|
264
|
+
}
|
265
|
+
|
266
|
+
|
267
|
+
/* {{{1
|
268
|
+
* Determine whether ‘c’ is a titlecase letter, such as the slavic digraph DZ,
|
269
|
+
* which at the beginning of a word is written as Dz, where only the initial D
|
270
|
+
* is capitalized. (Complicated huh?)
|
271
|
+
*/
|
272
|
+
bool
|
273
|
+
unichar_istitle(unichar c)
|
274
|
+
{
|
275
|
+
/* TODO: binary search helpful? */
|
276
|
+
for (size_t i = 0; i < lengthof(title_table); i++)
|
277
|
+
if (title_table[i][0] == c)
|
278
|
+
return true;
|
279
|
+
|
280
|
+
return false;
|
281
|
+
}
|
282
|
+
|
283
|
+
|
284
|
+
/* {{{1
|
285
|
+
* Determine whether ‘c’ is a new-line.
|
286
|
+
*/
|
287
|
+
#define UNICHAR_NEXT_LINE ((unichar)0x0085)
|
288
|
+
#define UNICHAR_LINE_SEPARATOR ((unichar)0x2028)
|
289
|
+
#define UNICHAR_PARAGRAPH_SEPARATOR ((unichar)0x2029)
|
290
|
+
|
291
|
+
bool
|
292
|
+
unichar_isnewline(unichar c)
|
293
|
+
{
|
294
|
+
switch (c) {
|
295
|
+
case '\n': case '\f': case '\r': case UNICHAR_NEXT_LINE:
|
296
|
+
case UNICHAR_LINE_SEPARATOR: case UNICHAR_PARAGRAPH_SEPARATOR:
|
297
|
+
return true;
|
298
|
+
default:
|
299
|
+
return false;
|
300
|
+
}
|
301
|
+
}
|
302
|
+
|
303
|
+
/* {{{1
|
304
|
+
* Determine whether ‘c’ is a hexadecimal digit, such as 0, 1, ..., 9, a, b,
|
305
|
+
* ..., f, or A, B, ..., F.
|
306
|
+
*/
|
307
|
+
#define UNICHAR_FULLWIDTH_A 0xff21
|
308
|
+
#define UNICHAR_FULLWIDTH_F 0xff26
|
309
|
+
#define UNICHAR_FULLWIDTH_a 0xff41
|
310
|
+
#define UNICHAR_FULLWIDTH_f 0xff46
|
311
|
+
bool
|
312
|
+
unichar_isxdigit(unichar c)
|
313
|
+
{
|
314
|
+
return ((c >= 'a' && c <= 'f') ||
|
315
|
+
(c >= 'A' && c <= 'F') ||
|
316
|
+
(c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f) ||
|
317
|
+
(c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F) ||
|
318
|
+
(s_type(c) == UNICODE_DECIMAL_NUMBER));
|
319
|
+
// s_isdigit(s_type(c)));
|
320
|
+
}
|
321
|
+
|
322
|
+
|
323
|
+
/* {{{1
|
324
|
+
* Determine whether code point ‘c’ has been assigned a code value.
|
325
|
+
*/
|
326
|
+
bool
|
327
|
+
unichar_isassigned(unichar c)
|
328
|
+
{
|
329
|
+
return s_type(c) != UNICODE_UNASSIGNED;
|
330
|
+
}
|
331
|
+
|
332
|
+
|
333
|
+
/* {{{1
|
334
|
+
* Determine whether ‘c’ is a wide character, thus is typically rendered in a
|
335
|
+
* double-width cell on a terminal.
|
336
|
+
*/
|
337
|
+
bool
|
338
|
+
unichar_iswide(unichar c)
|
339
|
+
{
|
340
|
+
if (c < 0x1100)
|
341
|
+
return false;
|
342
|
+
|
343
|
+
return (c <= 0x115f || /* Hangul Jamo init. consonants */
|
344
|
+
c == 0x2329 || c == 0x232a || /* angle brackets */
|
345
|
+
(c >= 0x2e80 && c <= 0xa4cf && /* CJK ... Yi */
|
346
|
+
(c < 0x302a || c > 0x302f) &&
|
347
|
+
c != 0x303f && c != 0x3099 && c != 0x309a) ||
|
348
|
+
(c >= 0xac00 && c <= 0xd7a3) || /* Hangul syllables */
|
349
|
+
(c >= 0xf900 && c <= 0xfaff) || /* CJK comp. graphs */
|
350
|
+
(c >= 0xfe30 && c <= 0xfe6f) || /* CJK comp. forms */
|
351
|
+
(c >= 0xff00 && c <= 0xff60) || /* fullwidth forms */
|
352
|
+
(c >= 0xffe0 && c <= 0xffe6) || /* -"- */
|
353
|
+
(c >= 0x20000 && c <= 0x2fffd) || /* CJK extra stuff */
|
354
|
+
(c >= 0x30000 && c <= 0x3fffd)); /* -"- */
|
355
|
+
}
|
356
|
+
|
357
|
+
|
358
|
+
/* {{{1
|
359
|
+
* Convert ‘c’ to its uppercase representation (if any).
|
360
|
+
*/
|
361
|
+
static unichar
|
362
|
+
special_case_table_lookup(unichar c)
|
363
|
+
{
|
364
|
+
unichar tv = ATTTABLE(c >> 8, c & 0xff);
|
365
|
+
|
366
|
+
if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
|
367
|
+
tv = utf_char(special_case_table +
|
368
|
+
tv - UNICODE_SPECIAL_CASE_TABLE_START);
|
369
|
+
|
370
|
+
if (tv == '\0')
|
371
|
+
return c;
|
372
|
+
|
373
|
+
return tv;
|
374
|
+
}
|
375
|
+
|
376
|
+
static unichar
|
377
|
+
titlecase_table_lookup(unichar c, bool want_upper)
|
378
|
+
{
|
379
|
+
for (size_t i = 0; i < lengthof(title_table); i++)
|
380
|
+
if (title_table[i][0] == c)
|
381
|
+
return title_table[i][want_upper ? 1 : 2];
|
382
|
+
|
383
|
+
return c;
|
384
|
+
}
|
385
|
+
|
386
|
+
unichar
|
387
|
+
unichar_toupper(unichar c)
|
388
|
+
{
|
389
|
+
int type = s_type(c);
|
390
|
+
|
391
|
+
if (type == UNICODE_LOWERCASE_LETTER)
|
392
|
+
return special_case_table_lookup(c);
|
393
|
+
|
394
|
+
if (type == UNICODE_TITLECASE_LETTER)
|
395
|
+
return titlecase_table_lookup(c, true);
|
396
|
+
|
397
|
+
return c;
|
398
|
+
}
|
399
|
+
|
400
|
+
|
401
|
+
/* {{{1
|
402
|
+
* Convert ‘c’ to its lowercase representation (if any).
|
403
|
+
*/
|
404
|
+
unichar
|
405
|
+
unichar_tolower(unichar c)
|
406
|
+
{
|
407
|
+
int type = s_type(c);
|
408
|
+
|
409
|
+
if (type == UNICODE_UPPERCASE_LETTER)
|
410
|
+
return special_case_table_lookup(c);
|
411
|
+
|
412
|
+
if (type == UNICODE_TITLECASE_LETTER)
|
413
|
+
return titlecase_table_lookup(c, false);
|
414
|
+
|
415
|
+
return c;
|
416
|
+
}
|
417
|
+
|
418
|
+
|
419
|
+
/* {{{1
|
420
|
+
* Convert ‘c’ to its titlecase representation (if any).
|
421
|
+
*/
|
422
|
+
unichar
|
423
|
+
unichar_totitle(unichar c)
|
424
|
+
{
|
425
|
+
for (size_t i = 0; i < lengthof(title_table); i++)
|
426
|
+
if (title_table[i][0] == c ||
|
427
|
+
title_table[i][1] == c ||
|
428
|
+
title_table[i][2] == c)
|
429
|
+
return title_table[i][0];
|
430
|
+
|
431
|
+
if (s_type(c) == UNICODE_LOWERCASE_LETTER)
|
432
|
+
return unichar_toupper(c);
|
433
|
+
|
434
|
+
return c;
|
435
|
+
}
|
436
|
+
|
437
|
+
|
438
|
+
/* {{{1
|
439
|
+
* Return the numeric value of ‘c’ if it's a decimal digit, or -1 if not.
|
440
|
+
*/
|
441
|
+
int
|
442
|
+
unichar_digit_value(unichar c)
|
443
|
+
{
|
444
|
+
if (s_type(c) == UNICODE_DECIMAL_NUMBER)
|
445
|
+
return ATTTABLE(c >> 8, c & 0xff);
|
446
|
+
|
447
|
+
return -1;
|
448
|
+
}
|
449
|
+
|
450
|
+
|
451
|
+
/* {{{1
|
452
|
+
* Return the numeric value of ‘c’ if it's a hexadecimal digit, or -1 if not.
|
453
|
+
*/
|
454
|
+
int
|
455
|
+
unichar_xdigit_value(unichar c)
|
456
|
+
{
|
457
|
+
if (c >= 'a' && c <= 'f')
|
458
|
+
return c - 'a' + 10;
|
459
|
+
else if (c >= 'A' && c <= 'F')
|
460
|
+
return c - 'A' + 10;
|
461
|
+
else if (c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f)
|
462
|
+
return c - UNICHAR_FULLWIDTH_a + 10;
|
463
|
+
else if (c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F)
|
464
|
+
return c - UNICHAR_FULLWIDTH_A + 10;
|
465
|
+
else
|
466
|
+
return unichar_digit_value(c);
|
467
|
+
}
|
468
|
+
|
469
|
+
|
470
|
+
/* {{{1
|
471
|
+
* Determine the Unicode character type of ‘c’.
|
472
|
+
*/
|
473
|
+
UnicodeType
|
474
|
+
unichar_type(unichar c)
|
475
|
+
{
|
476
|
+
return s_type(c);
|
477
|
+
}
|
478
|
+
|
479
|
+
|
480
|
+
/* {{{1
 * LocaleType: This ‹enum› is used for dealing with different locales for
 * turning strings into uppercase or lowercase.
 */
typedef enum {
        LOCALE_NORMAL,
        LOCALE_TURKIC,
        LOCALE_LITHUANIAN
} LocaleType;


/* {{{1
 * Retrieve the locale type from the environment (LC_CTYPE).
 *
 * The current LC_CTYPE locale name is queried with setlocale() and classified
 * by its first two characters: “az” and “tr” are LOCALE_TURKIC, “lt” is
 * LOCALE_LITHUANIAN, and anything else is LOCALE_NORMAL.  If the locale name
 * cannot be determined, LOCALE_NORMAL is returned.
 */
static LocaleType
get_locale_type(void)
{
        const char *locale = setlocale(LC_CTYPE, NULL);

        /* setlocale() reports failure with a null pointer; don’t dereference
         * it, just fall back to the locale-independent behavior. */
        if (locale == NULL)
                return LOCALE_NORMAL;

        /* The second character is only inspected once the first has
         * matched, so an empty name is never read past its terminator. */
        if ((locale[0] == 'a' && locale[1] == 'z') ||
            (locale[0] == 't' && locale[1] == 'r'))
                return LOCALE_TURKIC;

        if (locale[0] == 'l' && locale[1] == 't')
                return LOCALE_LITHUANIAN;

        return LOCALE_NORMAL;
}
|
508
|
+
|
509
|
+
|
510
|
+
/* {{{1
|
511
|
+
* Put character marks found in ‘p_inout’ into itself. If ‘remove_dot’ is
|
512
|
+
* true, remove the dot over an uppercase I for a turkish locale.
|
513
|
+
*/
|
514
|
+
static size_t
|
515
|
+
output_marks(const char **p_inout, char *buf, bool remove_dot)
|
516
|
+
{
|
517
|
+
size_t len = 0;
|
518
|
+
const char *p = *p_inout;
|
519
|
+
|
520
|
+
for ( ; *p != '\0'; p = utf_next(p)) {
|
521
|
+
unichar c = utf_char(p);
|
522
|
+
|
523
|
+
if (!s_ismark(s_type(c)))
|
524
|
+
break;
|
525
|
+
|
526
|
+
if (!remove_dot || c != COMBINING_DOT_ABOVE)
|
527
|
+
len += unichar_to_utf(c, (buf != NULL) ? buf + len : NULL);
|
528
|
+
}
|
529
|
+
|
530
|
+
*p_inout = p;
|
531
|
+
|
532
|
+
return len;
|
533
|
+
}
|
534
|
+
|
535
|
+
/* {{{1
|
536
|
+
* Output titlecases where appropriate.
|
537
|
+
*/
|
538
|
+
static size_t
|
539
|
+
output_special_case(char *buf, int offset, int type, bool upper)
|
540
|
+
{
|
541
|
+
const char *p = special_case_table + offset;
|
542
|
+
|
543
|
+
if (type != UNICODE_TITLECASE_LETTER)
|
544
|
+
p = utf_next(p);
|
545
|
+
|
546
|
+
if (upper)
|
547
|
+
p += utf_byte_length(p) + 1;
|
548
|
+
|
549
|
+
size_t len = utf_byte_length(p);
|
550
|
+
|
551
|
+
if (buf != NULL)
|
552
|
+
memcpy(buf, p, len);
|
553
|
+
|
554
|
+
return len;
|
555
|
+
}
|
556
|
+
|
557
|
+
/* {{{1
|
558
|
+
* Do uppercasing of ‘p’ for Lithuanian locales.
|
559
|
+
*/
|
560
|
+
static size_t
|
561
|
+
remove_all_combining_dot_above(unichar c, char *buf)
|
562
|
+
{
|
563
|
+
size_t decomp_len;
|
564
|
+
unichar *decomp = unicode_canonical_decomposition(c, &decomp_len);
|
565
|
+
|
566
|
+
size_t len = 0;
|
567
|
+
for (size_t i = 0; i < decomp_len; i++)
|
568
|
+
if (decomp[i] != COMBINING_DOT_ABOVE)
|
569
|
+
len += unichar_to_utf(unichar_toupper(decomp[i]),
|
570
|
+
OFFSET_IF(buf, len));
|
571
|
+
|
572
|
+
free(decomp);
|
573
|
+
|
574
|
+
return len;
|
575
|
+
}
|
576
|
+
|
577
|
+
static size_t
|
578
|
+
real_toupper_lithuanian(const char **p, unichar c, int type, char *buf,
|
579
|
+
bool *was_i)
|
580
|
+
{
|
581
|
+
if (c == 'i') {
|
582
|
+
*was_i = true;
|
583
|
+
return 0;
|
584
|
+
}
|
585
|
+
|
586
|
+
if (*was_i) {
|
587
|
+
size_t len = remove_all_combining_dot_above(c, buf);
|
588
|
+
return len + output_marks(p, OFFSET_IF(buf, len), true);
|
589
|
+
}
|
590
|
+
|
591
|
+
if (!s_ismark(type))
|
592
|
+
*was_i = false;
|
593
|
+
|
594
|
+
return 0;
|
595
|
+
}
|
596
|
+
|
597
|
+
/* {{{1
|
598
|
+
* Do real upcasing. */
|
599
|
+
static inline size_t
|
600
|
+
real_do_toupper(unichar c, int type, char *buf)
|
601
|
+
{
|
602
|
+
bool upper = (type != UNICODE_LOWERCASE_LETTER);
|
603
|
+
unichar tv = ATTTABLE(c >> 8, c & 0xff);
|
604
|
+
|
605
|
+
if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
|
606
|
+
return output_special_case(buf,
|
607
|
+
tv - UNICODE_SPECIAL_CASE_TABLE_START,
|
608
|
+
type, upper);
|
609
|
+
|
610
|
+
/* TODO: this should really use titlecase_table_lookup somehow. */
|
611
|
+
if (type == UNICODE_TITLECASE_LETTER)
|
612
|
+
for (size_t i = 0; i < lengthof(title_table); i++)
|
613
|
+
if (title_table[i][0] == c)
|
614
|
+
return unichar_to_utf(title_table[i][1], buf);
|
615
|
+
|
616
|
+
return unichar_to_utf(tv != '\0' ? tv : c, buf);
|
617
|
+
}
|
618
|
+
|
619
|
+
/* {{{1
|
620
|
+
* Do real uppercasing of ‘str’.
|
621
|
+
*/
|
622
|
+
static size_t
|
623
|
+
real_toupper_one(const char **p, const char *prev, char *buf,
|
624
|
+
LocaleType locale_type, bool *was_i)
|
625
|
+
{
|
626
|
+
unichar c = utf_char(prev);
|
627
|
+
int type = s_type(c);
|
628
|
+
|
629
|
+
if (locale_type == LOCALE_LITHUANIAN) {
|
630
|
+
size_t len = real_toupper_lithuanian(p, c, type, buf, was_i);
|
631
|
+
if (len > 0)
|
632
|
+
return len;
|
633
|
+
}
|
634
|
+
|
635
|
+
if (locale_type == LOCALE_TURKIC && c == 'i')
|
636
|
+
return unichar_to_utf(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
|
637
|
+
buf);
|
638
|
+
|
639
|
+
if (c == COMBINING_GREEK_YPOGEGRAMMENI) {
|
640
|
+
/* Nasty, need to move it after other combining marks...this
|
641
|
+
* would go away if we normalized first. */
|
642
|
+
/* TODO: don’t we need to make sure we don’t go beyond the end
|
643
|
+
* of ‘p’? */
|
644
|
+
size_t len = output_marks(p, buf, false);
|
645
|
+
return len + unichar_to_utf(GREEK_CAPITAL_LETTER_IOTA,
|
646
|
+
OFFSET_IF(buf, len));
|
647
|
+
}
|
648
|
+
|
649
|
+
if (IS(type, OR(UNICODE_LOWERCASE_LETTER,
|
650
|
+
OR(UNICODE_TITLECASE_LETTER, 0))))
|
651
|
+
return real_do_toupper(c, type, buf);
|
652
|
+
|
653
|
+
size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
|
654
|
+
|
655
|
+
if (buf != NULL)
|
656
|
+
memcpy(buf, prev, len);
|
657
|
+
|
658
|
+
return len;
|
659
|
+
}
|
660
|
+
|
661
|
+
static size_t
|
662
|
+
real_toupper(const char *str, size_t max, bool use_max, char *buf,
|
663
|
+
LocaleType locale_type)
|
664
|
+
{
|
665
|
+
const char *p = str;
|
666
|
+
size_t len = 0;
|
667
|
+
bool p_was_i = false;
|
668
|
+
|
669
|
+
while ((!use_max || p < str + max) && *p != '\0') {
|
670
|
+
const char *prev = p;
|
671
|
+
p = utf_next(p);
|
672
|
+
|
673
|
+
len += real_toupper_one(&p, prev, OFFSET_IF(buf, len),
|
674
|
+
locale_type, &p_was_i);
|
675
|
+
}
|
676
|
+
|
677
|
+
return len;
|
678
|
+
}
|
679
|
+
|
680
|
+
/* {{{1
|
681
|
+
* Wrapper around real_toupper() for dealing with memory allocation and such.
|
682
|
+
*/
|
683
|
+
static char *
|
684
|
+
utf_upcase_impl(const char *str, size_t max, bool use_max)
|
685
|
+
{
|
686
|
+
assert(str != NULL);
|
687
|
+
|
688
|
+
LocaleType locale_type = get_locale_type();
|
689
|
+
|
690
|
+
size_t len = real_toupper(str, max, use_max, NULL, locale_type);
|
691
|
+
char *result = ALLOC_N(char, len + 1);
|
692
|
+
real_toupper(str, max, use_max, result, locale_type);
|
693
|
+
result[len] = '\0';
|
694
|
+
|
695
|
+
return result;
|
696
|
+
}
|
697
|
+
|
698
|
+
|
699
|
+
/* {{{1
 * Convert all characters in ‘str’ to their uppercase representation if
 * applicable.  Returns the freshly allocated representation.  The whole
 * NUL-terminated string is converted.
 */
char *
utf_upcase(const char *str)
{
        const bool limit_length = false;

        return utf_upcase_impl(str, 0, limit_length);
}
|
708
|
+
|
709
|
+
|
710
|
+
/* {{{1
 * Convert all characters in ‘str’ to their uppercase representation if
 * applicable.  Returns the freshly allocated representation.  Do this for at
 * most ‘len’ bytes from ‘str’.
 */
char *
utf_upcase_n(const char *str, size_t len)
{
        const bool limit_length = true;

        return utf_upcase_impl(str, len, limit_length);
}
|
720
|
+
|
721
|
+
|
722
|
+
/* {{{1
 * Scan ‘str’ from its start and report whether a character of combining
 * class 230 is met before a character of combining class 0 (a base
 * character) or the end of the string.
 */
static bool
has_more_above(const char *str)
{
        const char *p = str;

        while (*p != '\0') {
                switch (unichar_combining_class(utf_char(p))) {
                case 230:
                        return true;
                case 0:
                        /* Base character reached: nothing above was found. */
                        return false;
                default:
                        break;
                }

                p = utf_next(p);
        }

        return false;
}
|
741
|
+
|
742
|
+
static inline size_t
|
743
|
+
real_do_tolower(unichar c, int type, char *buf)
|
744
|
+
{
|
745
|
+
unichar tv = ATTTABLE(c >> 8, c & 0xff);
|
746
|
+
|
747
|
+
if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
|
748
|
+
return output_special_case(buf,
|
749
|
+
tv - UNICODE_SPECIAL_CASE_TABLE_START,
|
750
|
+
type, false);
|
751
|
+
|
752
|
+
/* TODO: this should really use titlecase_table_lookup somehow. */
|
753
|
+
if (type == UNICODE_TITLECASE_LETTER)
|
754
|
+
for (size_t i = 0; i < lengthof(title_table); i++)
|
755
|
+
if (title_table[i][0] == c)
|
756
|
+
return unichar_to_utf(title_table[i][2], buf);
|
757
|
+
|
758
|
+
return unichar_to_utf(tv != '\0' ? tv : c, buf);
|
759
|
+
}
|
760
|
+
|
761
|
+
/* {{{1
|
762
|
+
* The real implementation of downcase.
|
763
|
+
*/
|
764
|
+
static size_t
|
765
|
+
tolower_turkic_i(const char **p, char *buf)
|
766
|
+
{
|
767
|
+
unichar i = LATIN_SMALL_LETTER_DOTLESS_I;
|
768
|
+
|
769
|
+
if (utf_char(*p) == COMBINING_DOT_ABOVE) {
|
770
|
+
/* TODO: don’t we need to make sure we don’t go beyond the end
|
771
|
+
* of ‘p’? */
|
772
|
+
*p = utf_next(*p);
|
773
|
+
i = LATIN_SMALL_LETTER_I;
|
774
|
+
}
|
775
|
+
|
776
|
+
return unichar_to_utf(i, buf);
|
777
|
+
}
|
778
|
+
|
779
|
+
static size_t
|
780
|
+
tolower_lithuianian_i(char *buf, unichar base, unichar combiner)
|
781
|
+
{
|
782
|
+
size_t len = unichar_to_utf(base, buf);
|
783
|
+
len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
|
784
|
+
if (combiner != '\0')
|
785
|
+
len += unichar_to_utf(combiner, OFFSET_IF(buf, len));
|
786
|
+
|
787
|
+
return len;
|
788
|
+
}
|
789
|
+
|
790
|
+
static size_t
|
791
|
+
tolower_sigma(const char **p, char *buf, const char *end, bool use_end)
|
792
|
+
{
|
793
|
+
unichar sigma = GREEK_SMALL_LETTER_FINAL_SIGMA;
|
794
|
+
|
795
|
+
/* SIGMA maps differently depending on whether it is final or not. The
|
796
|
+
* following simplified test would fail in the case of combining marks
|
797
|
+
* following the sigma, but I don't think that occurs in real text.
|
798
|
+
* The test here matches that in ICU. */
|
799
|
+
if ((!use_end || *p < end) && **p != '\0' && s_isalpha(s_type(utf_char(*p))))
|
800
|
+
sigma = GREEK_SMALL_LETTER_SIGMA;
|
801
|
+
|
802
|
+
return unichar_to_utf(sigma, buf);
|
803
|
+
}
|
804
|
+
|
805
|
+
static size_t
|
806
|
+
real_tolower_one(const char **p, const char *prev, char *buf,
|
807
|
+
LocaleType locale_type, const char *end, bool use_end)
|
808
|
+
{
|
809
|
+
unichar c = utf_char(prev);
|
810
|
+
int type = s_type(c);
|
811
|
+
|
812
|
+
if (locale_type == LOCALE_TURKIC && c == 'I')
|
813
|
+
return tolower_turkic_i(p, buf);
|
814
|
+
|
815
|
+
/* Introduce an explicit dot above the lowercasing capital I’s
|
816
|
+
* and J’s whenever there are more accents above.
|
817
|
+
* [SpecialCasing.txt] */
|
818
|
+
if (locale_type == LOCALE_LITHUANIAN) {
|
819
|
+
unichar base = LATIN_SMALL_LETTER_I;
|
820
|
+
unichar combiner = '\0';
|
821
|
+
|
822
|
+
switch (c) {
|
823
|
+
case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
|
824
|
+
combiner = COMBINING_GRAVE_ACCENT;
|
825
|
+
break;
|
826
|
+
case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
|
827
|
+
combiner = COMBINING_ACUTE_ACCENT;
|
828
|
+
break;
|
829
|
+
case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
|
830
|
+
combiner = COMBINING_TILDE;
|
831
|
+
break;
|
832
|
+
case 'I':
|
833
|
+
case 'J':
|
834
|
+
case LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
|
835
|
+
if (!has_more_above(*p))
|
836
|
+
goto no_lithuanian_i_casing;
|
837
|
+
|
838
|
+
base = unichar_tolower(c);
|
839
|
+
break;
|
840
|
+
default:
|
841
|
+
goto no_lithuanian_i_casing;
|
842
|
+
}
|
843
|
+
|
844
|
+
return tolower_lithuianian_i(buf, base, combiner);
|
845
|
+
}
|
846
|
+
|
847
|
+
no_lithuanian_i_casing:
|
848
|
+
|
849
|
+
if (c == GREEK_CAPITAL_LETTER_SIGMA)
|
850
|
+
return tolower_sigma(p, buf, end, use_end);
|
851
|
+
|
852
|
+
if (IS(type, OR(UNICODE_UPPERCASE_LETTER,
|
853
|
+
OR(UNICODE_TITLECASE_LETTER, 0))))
|
854
|
+
return real_do_tolower(c, type, buf);
|
855
|
+
|
856
|
+
size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
|
857
|
+
|
858
|
+
if (buf != NULL)
|
859
|
+
memcpy(buf, prev, len);
|
860
|
+
|
861
|
+
return len;
|
862
|
+
}
|
863
|
+
|
864
|
+
static size_t
|
865
|
+
real_tolower(const char *str, size_t max, bool use_max, char *buf,
|
866
|
+
LocaleType locale_type)
|
867
|
+
{
|
868
|
+
const char *p = str;
|
869
|
+
const char *end = str + max;
|
870
|
+
size_t len = 0;
|
871
|
+
|
872
|
+
while ((!use_max || p < end) && *p != '\0') {
|
873
|
+
const char *prev = p;
|
874
|
+
p = utf_next(p);
|
875
|
+
|
876
|
+
len += real_tolower_one(&p, prev, OFFSET_IF(buf, len),
|
877
|
+
locale_type, end, use_max);
|
878
|
+
}
|
879
|
+
|
880
|
+
return len;
|
881
|
+
}
|
882
|
+
|
883
|
+
|
884
|
+
/* {{{1 */
|
885
|
+
static char *
|
886
|
+
utf_downcase_impl(const char *str, size_t max, bool use_max)
|
887
|
+
{
|
888
|
+
assert(str != NULL);
|
889
|
+
|
890
|
+
LocaleType locale_type = get_locale_type();
|
891
|
+
|
892
|
+
size_t len = real_tolower(str, max, use_max, NULL, locale_type);
|
893
|
+
char *result = ALLOC_N(char, len + 1);
|
894
|
+
real_tolower(str, max, use_max, result, locale_type);
|
895
|
+
result[len] = '\0';
|
896
|
+
|
897
|
+
return result;
|
898
|
+
}
|
899
|
+
|
900
|
+
|
901
|
+
/* {{{1
 * Return a freshly allocated copy of ‘str’ in which every character has been
 * replaced by its lowercase representation, where one applies.  No byte
 * limit is imposed; the whole NUL-terminated string is processed.
 */
char *
utf_downcase(const char *str)
{
        char *downcased = utf_downcase_impl(str, 0, false);

        return downcased;
}
|
910
|
+
|
911
|
+
|
912
|
+
/* {{{1
 * Return a freshly allocated copy of ‘str’ in which every character has been
 * replaced by its lowercase representation, where one applies.  At most
 * ‘len’ bytes of ‘str’ are considered.
 */
char *
utf_downcase_n(const char *str, size_t len)
{
        char *downcased = utf_downcase_impl(str, len, true);

        return downcased;
}
|
922
|
+
|
923
|
+
|
924
|
+
/* {{{1
|
925
|
+
* The real implementation of case folding below.
|
926
|
+
*/
|
927
|
+
|
928
|
+
static bool
|
929
|
+
casefold_table_lookup(unichar c, char *folded, size_t *len)
|
930
|
+
{
|
931
|
+
int index;
|
932
|
+
|
933
|
+
if (!unicode_table_lookup(casefold_table, c, &index))
|
934
|
+
return false;
|
935
|
+
|
936
|
+
char const *folded_c = casefold_table[index].data;
|
937
|
+
|
938
|
+
if (folded != NULL)
|
939
|
+
strcpy(folded, folded_c);
|
940
|
+
|
941
|
+
*len += utf_byte_length(folded_c);
|
942
|
+
|
943
|
+
return true;
|
944
|
+
}
|
945
|
+
|
946
|
+
static char *
|
947
|
+
utf_foldcase_impl(const char *str, size_t max, bool use_max)
|
948
|
+
{
|
949
|
+
assert(str != NULL);
|
950
|
+
|
951
|
+
char *folded = NULL;
|
952
|
+
size_t len = 0;
|
953
|
+
|
954
|
+
again:
|
955
|
+
for (const char *p = str; (!use_max || p < str + max) && *p != '\0'; p = utf_next(p)) {
|
956
|
+
unichar c = utf_char(p);
|
957
|
+
|
958
|
+
if (casefold_table_lookup(c, OFFSET_IF(folded, len), &len))
|
959
|
+
continue;
|
960
|
+
|
961
|
+
len += unichar_to_utf(unichar_tolower(c), OFFSET_IF(folded, len));
|
962
|
+
}
|
963
|
+
|
964
|
+
if (folded == NULL) {
|
965
|
+
folded = ALLOC_N(char, len + 1);
|
966
|
+
folded[0] = NUL;
|
967
|
+
len = 0;
|
968
|
+
goto again;
|
969
|
+
}
|
970
|
+
|
971
|
+
folded[len] = '\0';
|
972
|
+
|
973
|
+
return folded;
|
974
|
+
}
|
975
|
+
|
976
|
+
|
977
|
+
/* {{{1
 * Return a freshly allocated, case-independent form of ‘str’.  No byte
 * limit is imposed; the whole NUL-terminated string is processed.
 */
char *
utf_foldcase(const char *str)
{
        char *folded = utf_foldcase_impl(str, 0, false);

        return folded;
}
|
986
|
+
|
987
|
+
|
988
|
+
/* {{{1
 * Return a freshly allocated, case-independent form of ‘str’.  At most
 * ‘len’ bytes of the string are considered.
 */
char *
utf_foldcase_n(const char *str, size_t len)
{
        char *folded = utf_foldcase_impl(str, len, true);

        return folded;
}
|
998
|
+
|
999
|
+
|
1000
|
+
/* {{{1
|
1001
|
+
* The real implementation of utf_width() and utf_width_n() below.
|
1002
|
+
*/
|
1003
|
+
static size_t
|
1004
|
+
utf_width_impl(const char *str, size_t len, bool use_len)
|
1005
|
+
{
|
1006
|
+
assert(str != NULL);
|
1007
|
+
|
1008
|
+
size_t width = 0;
|
1009
|
+
|
1010
|
+
for (const char *p = str; (!use_len || p < str + len) && *p != NUL; p = utf_next(p))
|
1011
|
+
width += unichar_iswide(utf_char(p)) ? 2 : 1;
|
1012
|
+
|
1013
|
+
return width;
|
1014
|
+
}
|
1015
|
+
|
1016
|
+
|
1017
|
+
/* {{{1
 * Return the width in cells of the NUL-terminated string ‘str’.
 */
size_t
utf_width(const char *str)
{
        size_t cells = utf_width_impl(str, 0, false);

        return cells;
}
|
1025
|
+
|
1026
|
+
|
1027
|
+
/* {{{1
 * Return the width in cells of ‘str’, looking at no more than ‘len’ bytes
 * of it.
 */
size_t
utf_width_n(const char *str, size_t len)
{
        size_t cells = utf_width_impl(str, len, true);

        return cells;
}
|
1035
|
+
|
1036
|
+
|
1037
|
+
/* {{{1
|
1038
|
+
* Retrieve the mirrored representation of ‘c’ (if any) and store it in
|
1039
|
+
* ‘mirrored’.
|
1040
|
+
*/
|
1041
|
+
bool
|
1042
|
+
unichar_mirror(unichar c, unichar *mirrored)
|
1043
|
+
{
|
1044
|
+
int index;
|
1045
|
+
|
1046
|
+
if (!unicode_table_lookup(bidi_mirroring_table, c, &index))
|
1047
|
+
return false;
|
1048
|
+
|
1049
|
+
if (mirrored != NULL)
|
1050
|
+
*mirrored = bidi_mirroring_table[index].mirrored_ch;
|
1051
|
+
|
1052
|
+
return true;
|
1053
|
+
}
|
1054
|
+
|
1055
|
+
|
1056
|
+
/* }}}1 */
|