character-encodings 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +26 -0
- data/Rakefile +157 -0
- data/ext/encoding/character/unicode/codepoint.c +48 -0
- data/ext/encoding/character/utf-8/break.c +38 -0
- data/ext/encoding/character/utf-8/data/break.h +22931 -0
- data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
- data/ext/encoding/character/utf-8/data/compose.h +1607 -0
- data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
- data/ext/encoding/character/utf-8/decompose.c +476 -0
- data/ext/encoding/character/utf-8/depend +64 -0
- data/ext/encoding/character/utf-8/extconf.rb +47 -0
- data/ext/encoding/character/utf-8/private.h +68 -0
- data/ext/encoding/character/utf-8/properties.c +1061 -0
- data/ext/encoding/character/utf-8/rb_includes.h +18 -0
- data/ext/encoding/character/utf-8/rb_methods.h +49 -0
- data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
- data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
- data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
- data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
- data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
- data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
- data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
- data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
- data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
- data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
- data/ext/encoding/character/utf-8/unicode.c +319 -0
- data/ext/encoding/character/utf-8/unicode.h +208 -0
- data/ext/encoding/character/utf-8/utf.c +1332 -0
- data/lib/encoding/character/utf-8.rb +201 -0
- data/specifications/aref.rb +45 -0
- data/specifications/count.rb +29 -0
- data/specifications/delete.rb +25 -0
- data/specifications/each_char.rb +28 -0
- data/specifications/index.rb +35 -0
- data/specifications/insert.rb +67 -0
- data/specifications/length.rb +45 -0
- data/specifications/rindex.rb +52 -0
- data/specifications/squeeze.rb +25 -0
- data/specifications/to_i.rb +54 -0
- data/specifications/tr.rb +39 -0
- data/tests/foldcase.rb +28 -0
- data/tests/normalize.rb +101 -0
- data/tests/unicodedatatestbase.rb +45 -0
- metadata +112 -0
@@ -0,0 +1,64 @@
|
|
1
|
+
# Header dependencies of every object file built in this directory.
# One rule per object; prerequisites are the source file followed by the
# headers it depends on.  (Presumably consumed by mkmf when extconf.rb
# generates the Makefile -- confirm.)

# Core Unicode/UTF-8 library.
break.o: break.c unicode.h data/break.h
decompose.o: decompose.c unicode.h private.h data/decompose.h data/compose.h
properties.o: properties.c unicode.h private.h data/character-tables.h

# Ruby method implementations.
rb_utf_aref.o: rb_utf_aref.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_aset.o: rb_utf_aset.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_casecmp.o: rb_utf_casecmp.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_chomp.o: rb_utf_chomp.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_chop.o: rb_utf_chop.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_collate.o: rb_utf_collate.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_count.o: rb_utf_count.c rb_includes.h unicode.h private.h rb_methods.h rb_utf_internal_tr.h
rb_utf_delete.o: rb_utf_delete.c rb_includes.h unicode.h private.h rb_methods.h rb_utf_internal_tr.h
rb_utf_downcase.o: rb_utf_downcase.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_each_char.o: rb_utf_each_char.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_foldcase.o: rb_utf_foldcase.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_hex.o: rb_utf_hex.c rb_includes.h unicode.h private.h rb_methods.h rb_utf_internal_bignum.h
rb_utf_index.o: rb_utf_index.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_insert.o: rb_utf_insert.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_internal_bignum.o: rb_utf_internal_bignum.c rb_includes.h unicode.h private.h rb_methods.h rb_utf_internal_bignum.h
rb_utf_internal_tr.o: rb_utf_internal_tr.c rb_includes.h unicode.h private.h rb_methods.h rb_utf_internal_tr.h
rb_utf_justify.o: rb_utf_justify.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_length.o: rb_utf_length.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_lstrip.o: rb_utf_lstrip.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_normalize.o: rb_utf_normalize.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_oct.o: rb_utf_oct.c rb_includes.h unicode.h private.h rb_methods.h rb_utf_internal_bignum.h
rb_utf_reverse.o: rb_utf_reverse.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_rindex.o: rb_utf_rindex.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_rstrip.o: rb_utf_rstrip.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_squeeze.o: rb_utf_squeeze.c rb_includes.h unicode.h private.h rb_methods.h rb_utf_internal_tr.h
rb_utf_strip.o: rb_utf_strip.c rb_includes.h unicode.h private.h rb_methods.h
rb_utf_to_i.o: rb_utf_to_i.c rb_includes.h unicode.h private.h rb_methods.h rb_utf_internal_bignum.h
rb_utf_tr.o: rb_utf_tr.c rb_includes.h unicode.h private.h rb_methods.h rb_utf_internal_tr.h
rb_utf_upcase.o: rb_utf_upcase.c rb_includes.h unicode.h private.h rb_methods.h

# Extension entry point and UTF-8 primitives.
unicode.o: unicode.c unicode.h private.h rb_methods.h
utf.o: utf.c unicode.h private.h
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'mkmf'
|
2
|
+
|
3
|
+
# Probe whether the compiler accepts +opt+ by compiling an empty program
# with it.  When it does, +opt+ is appended to $CFLAGS.
#
# Returns true if the option was accepted and added, false otherwise.
def try_compiler_option(opt, &b)
  checking_for "‘#{opt}’ option to compiler" do
    accepted = try_compile('', opt, &b) ? true : false
    $CFLAGS += " #{opt}" if accepted
    accepted
  end
end
|
13
|
+
|
14
|
+
# Language standard and warning flags; each is probed and only added to
# $CFLAGS when the compiler accepts it.  The order matches the order in which
# the flags end up on the command line.
#
# Deliberately left out:
#   -Wshadow      XXX: sadly a bit too strict.  It will, for example, whine
#                 about local variables called “index” on FreeBSD.
#   -Wconversion  XXX: This is also too strict.
%w[
  -std=c99
  -Wall
  -Wextra
  -Wwrite-strings
  -Waggregate-return
  -Wmissing-prototypes
  -Wmissing-declarations
  -Wnested-externs
  -Wundef
  -Wpointer-arith
  -Wcast-align
  -Werror
].each { |option| try_compiler_option(option) }

# Standard headers whose availability is checked.
%w[
  assert.h
  limits.h
  locale.h
  stdbool.h
  stddef.h
  stdint.h
  stdio.h
  stdlib.h
  string.h
  sys/types.h
  wchar.h
].each { |header| have_header(header) }

# Install the public header alongside the compiled extension.
($INSTALLFILES ||= []) << ['unicode.h', '$(RUBYARCHDIR)', 'lib']

create_makefile('encoding/character/utf-8/utf8')
|
@@ -0,0 +1,68 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Private Unicode related information.
|
3
|
+
*
|
4
|
+
* Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#ifndef PRIVATE_H
|
8
|
+
#define PRIVATE_H
|
9
|
+
|
10
|
+
/* The character that terminates a C string. */
#define NUL '\0'

/* Number of elements in ‘ary’, which must be a real array (not a pointer). */
#define lengthof(ary) (sizeof(ary) / sizeof(*(ary)))
|
12
|
+
|
13
|
+
/*
 * Compiler-attribute helpers.
 *
 * UNUSED(u) declares ‘u’ and, on GNU-compatible compilers, marks it as
 * intentionally unused.
 *
 * HIDDEN is appended to a declaration (‹type f(args) HIDDEN;›) and, on
 * GNU-compatible compilers, gives the symbol hidden visibility.  It is an
 * object-like macro in both branches: it is always written without an
 * argument list, so a function-like fallback would never expand and would
 * leave a stray ‘HIDDEN’ token in every declaration.
 */
#if defined(__GNUC__)
#  define UNUSED(u) \
	u __attribute__((__unused__))
#  define HIDDEN \
	__attribute__((visibility("hidden")))
#else
#  define UNUSED(u) \
	u
#  define HIDDEN
#endif
|
23
|
+
|
24
|
+
unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
|
25
|
+
NormalizeMode mode) HIDDEN;
|
26
|
+
inline int _unichar_combining_class(unichar c) HIDDEN;
|
27
|
+
|
28
|
+
void need_at_least_n_arguments(int argc, int n) HIDDEN;
|
29
|
+
|
30
|
+
unichar _utf_char_validated(char const *const str,
|
31
|
+
char const *const str_end) HIDDEN;
|
32
|
+
char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
|
33
|
+
const char *limit, bool noisy) HIDDEN;
|
34
|
+
|
35
|
+
char *_utf_offset_to_pointer_validated(const char *str, long offset,
|
36
|
+
const char *end) HIDDEN;
|
37
|
+
|
38
|
+
char *_utf_offset_to_pointer_failable(const char *str, long offset,
|
39
|
+
const char *end) HIDDEN;
|
40
|
+
|
41
|
+
VALUE rb_utf_new(const char *str, long len) HIDDEN;
|
42
|
+
|
43
|
+
VALUE rb_utf_new2(const char *str) HIDDEN;
|
44
|
+
|
45
|
+
VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
|
46
|
+
|
47
|
+
VALUE rb_utf_alloc_using(char *str) HIDDEN;
|
48
|
+
|
49
|
+
VALUE rb_utf_dup(VALUE str) HIDDEN;
|
50
|
+
|
51
|
+
long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
|
52
|
+
|
53
|
+
bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
|
54
|
+
char **limit) HIDDEN;
|
55
|
+
|
56
|
+
void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
|
57
|
+
char **limit) HIDDEN;
|
58
|
+
|
59
|
+
char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
|
60
|
+
|
61
|
+
VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
|
62
|
+
|
63
|
+
char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
|
64
|
+
|
65
|
+
long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
|
66
|
+
long offset, bool reverse) HIDDEN;
|
67
|
+
|
68
|
+
#endif /* PRIVATE_H */
|
@@ -0,0 +1,1061 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Unicode character properties.
|
3
|
+
*
|
4
|
+
* Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include <ruby.h>
|
8
|
+
#include <assert.h>
|
9
|
+
#include <locale.h>
|
10
|
+
#include <stdbool.h>
|
11
|
+
#include <stddef.h>
|
12
|
+
#include <stdint.h>
|
13
|
+
#include <string.h>
|
14
|
+
#include "unicode.h"
|
15
|
+
#include "private.h"
|
16
|
+
#include "data/character-tables.h"
|
17
|
+
|
18
|
+
|
19
|
+
/* Named Unicode code points. */

/* Combining marks. */
#define COMBINING_GRAVE_ACCENT			((unichar)0x0300)
#define COMBINING_ACUTE_ACCENT			((unichar)0x0301)
#define COMBINING_TILDE				((unichar)0x0303)
#define COMBINING_DOT_ABOVE			((unichar)0x0307)
#define COMBINING_GREEK_YPOGEGRAMMENI		((unichar)0x0345)

/* Variants of the Latin letter I. */
#define LATIN_SMALL_LETTER_I			((unichar)0x0069)
#define LATIN_CAPITAL_LETTER_I_WITH_GRAVE	((unichar)0x00cc)
#define LATIN_CAPITAL_LETTER_I_WITH_ACUTE	((unichar)0x00cd)
#define LATIN_CAPITAL_LETTER_I_WITH_TILDE	((unichar)0x0128)
#define LATIN_CAPITAL_LETTER_I_WITH_OGONEK	((unichar)0x012e)
#define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE	((unichar)0x0130)
#define LATIN_SMALL_LETTER_DOTLESS_I		((unichar)0x0131)

/* Greek letters. */
#define GREEK_CAPITAL_LETTER_IOTA		((unichar)0x0399)
#define GREEK_CAPITAL_LETTER_SIGMA		((unichar)0x03a3)
#define GREEK_SMALL_LETTER_FINAL_SIGMA		((unichar)0x03c2)
#define GREEK_SMALL_LETTER_SIGMA		((unichar)0x03c3)

/* ‘buf’ advanced by ‘len’ elements, or NULL when ‘buf’ itself is NULL. */
#define OFFSET_IF(buf, len) (((buf) == NULL) ? NULL : (buf) + (len))
|
37
|
+
|
38
|
+
/* {{{1
 * Macros for accessing the Unicode character attribute table.
 *
 * ATTR_TABLE(page) yields the index recorded for ‘page’: it is read from
 * attr_table_part1 for pages up to UNICODE_LAST_PAGE_PART1 and from
 * attr_table_part2 (after subtracting 0xe00) for any later page.
 *
 * ATTTABLE(page, ch) yields 0 when that index equals
 * UNICODE_MAX_TABLE_INDEX and attr_data[index][ch] otherwise.
 *
 * Both macros expand ‘page’ more than once, so the argument must be free of
 * side effects.
 *
 * TODO: Turn these macros into full-fledged functions, as this is rather silly
 * when we have ‹inline› in C99.
 */
#define ATTR_TABLE(page) \
	(((page) > UNICODE_LAST_PAGE_PART1) \
	 ? attr_table_part2[(page) - 0xe00] \
	 : attr_table_part1[page])

#define ATTTABLE(page, ch) \
	((ATTR_TABLE(page) != UNICODE_MAX_TABLE_INDEX) \
	 ? (attr_data[ATTR_TABLE(page)][ch]) : 0)
|
52
|
+
|
53
|
+
|
54
|
+
/* {{{1
|
55
|
+
* Internal function used for figuring out the type of a given character.
|
56
|
+
*/
|
57
|
+
static inline int
|
58
|
+
s_type(unichar c)
|
59
|
+
{
|
60
|
+
const int16_t *table;
|
61
|
+
unsigned int page;
|
62
|
+
|
63
|
+
if (c <= UNICODE_LAST_CHAR_PART1) {
|
64
|
+
page = c >> 8;
|
65
|
+
table = type_table_part1;
|
66
|
+
} else if (c >= 0xe0000 && c <= UNICODE_LAST_CHAR) {
|
67
|
+
page = (c - 0xe0000) >> 8;
|
68
|
+
table = type_table_part2;
|
69
|
+
} else {
|
70
|
+
return UNICODE_UNASSIGNED;
|
71
|
+
}
|
72
|
+
|
73
|
+
if (table[page] >= UNICODE_MAX_TABLE_INDEX)
|
74
|
+
return table[page] - UNICODE_MAX_TABLE_INDEX;
|
75
|
+
else
|
76
|
+
return type_data[table[page]][c & 0xff];
|
77
|
+
}
|
78
|
+
|
79
|
+
|
80
|
+
/* {{{1
 * Bit-fiddling macros for testing the class of a type.
 *
 * A class is a bit mask with one bit per type.  OR(type, rest) adds the bit
 * for ‘type’ to the mask ‘rest’; IS(type, class) is non-zero exactly when the
 * bit for ‘type’ is set in ‘class’.
 */
#define IS(type, class)	((class) & (1u << (type)))
#define OR(type, rest)	((rest) | (1u << (type)))
|
85
|
+
|
86
|
+
|
87
|
+
/* {{{1
|
88
|
+
* Internal function used to check if the given type represents a digit type.
|
89
|
+
*/
|
90
|
+
static inline bool
|
91
|
+
s_isdigit(int type)
|
92
|
+
{
|
93
|
+
return IS(type,
|
94
|
+
OR(UNICODE_DECIMAL_NUMBER,
|
95
|
+
OR(UNICODE_LETTER_NUMBER,
|
96
|
+
OR(UNICODE_OTHER_NUMBER, 0))));
|
97
|
+
}
|
98
|
+
|
99
|
+
|
100
|
+
/* {{{1
|
101
|
+
* Internal function used to check if the given type represents an alphabetic
|
102
|
+
* type.
|
103
|
+
*/
|
104
|
+
static inline bool
|
105
|
+
s_isalpha(int type)
|
106
|
+
{
|
107
|
+
return IS(type,
|
108
|
+
OR(UNICODE_LOWERCASE_LETTER,
|
109
|
+
OR(UNICODE_UPPERCASE_LETTER,
|
110
|
+
OR(UNICODE_TITLECASE_LETTER,
|
111
|
+
OR(UNICODE_MODIFIER_LETTER,
|
112
|
+
OR(UNICODE_OTHER_LETTER, 0))))));
|
113
|
+
}
|
114
|
+
|
115
|
+
|
116
|
+
/* {{{1
|
117
|
+
* Internal function used to check if the given type represents a mark type.
|
118
|
+
*/
|
119
|
+
static inline bool
|
120
|
+
s_ismark(int type)
|
121
|
+
{
|
122
|
+
return IS(type,
|
123
|
+
OR(UNICODE_NON_SPACING_MARK,
|
124
|
+
OR(UNICODE_COMBINING_MARK,
|
125
|
+
OR(UNICODE_ENCLOSING_MARK, 0))));
|
126
|
+
}
|
127
|
+
|
128
|
+
|
129
|
+
/* {{{1
|
130
|
+
* Determine whether ‘c’ is an alphanumeric, such as A, B, C, 0, 1, or 2.
|
131
|
+
*/
|
132
|
+
bool
|
133
|
+
unichar_isalnum(unichar c)
|
134
|
+
{
|
135
|
+
int type = s_type(c);
|
136
|
+
|
137
|
+
return s_isdigit(type) || s_isalpha(type);
|
138
|
+
}
|
139
|
+
|
140
|
+
|
141
|
+
/* {{{1
|
142
|
+
* Determine whether ‘c’ is an alphabetic (i.e. a letter), such as A, B, or C.
|
143
|
+
*/
|
144
|
+
bool
|
145
|
+
unichar_isalpha(unichar c)
|
146
|
+
{
|
147
|
+
return s_isalpha(s_type(c));
|
148
|
+
}
|
149
|
+
|
150
|
+
|
151
|
+
/* {{{1
|
152
|
+
* Determine whether ‘c’ is a control character, such as ‹NUL›.
|
153
|
+
*/
|
154
|
+
bool
|
155
|
+
unichar_iscntrl(unichar c)
|
156
|
+
{
|
157
|
+
return s_type(c) == UNICODE_CONTROL;
|
158
|
+
}
|
159
|
+
|
160
|
+
|
161
|
+
/* {{{1
|
162
|
+
* Determine whether ‘c’ is a digit, such as 0, 1, or 2.
|
163
|
+
*/
|
164
|
+
bool
|
165
|
+
unichar_isdigit(unichar c)
|
166
|
+
{
|
167
|
+
return s_type(c) == UNICODE_DECIMAL_NUMBER;
|
168
|
+
}
|
169
|
+
|
170
|
+
|
171
|
+
/* {{{1
|
172
|
+
* Determine whether ‘c’ is printable and not a space or control character such
|
173
|
+
* as tab or <NUL›, such as A, B, or C.
|
174
|
+
*/
|
175
|
+
bool
|
176
|
+
unichar_isgraph(unichar c)
|
177
|
+
{
|
178
|
+
return !IS(s_type(c),
|
179
|
+
OR(UNICODE_CONTROL,
|
180
|
+
OR(UNICODE_FORMAT,
|
181
|
+
OR(UNICODE_UNASSIGNED,
|
182
|
+
OR(UNICODE_PRIVATE_USE,
|
183
|
+
OR(UNICODE_SURROGATE,
|
184
|
+
OR(UNICODE_SPACE_SEPARATOR, 0)))))));
|
185
|
+
}
|
186
|
+
|
187
|
+
|
188
|
+
/* {{{1
|
189
|
+
* Determine whether ‘c’ is a lowercase letter, such as a, b, or c.
|
190
|
+
*/
|
191
|
+
bool
|
192
|
+
unichar_islower(unichar c)
|
193
|
+
{
|
194
|
+
return s_type(c) == UNICODE_LOWERCASE_LETTER;
|
195
|
+
}
|
196
|
+
|
197
|
+
|
198
|
+
/* {{{1
|
199
|
+
* Determine whether ‘c’ is printable, which works the same as
|
200
|
+
* unichar_isgraph(), except that space characters are also printable.
|
201
|
+
*/
|
202
|
+
bool
|
203
|
+
unichar_isprint(unichar c)
|
204
|
+
{
|
205
|
+
return !IS(s_type(c),
|
206
|
+
OR(UNICODE_CONTROL,
|
207
|
+
OR(UNICODE_FORMAT,
|
208
|
+
OR(UNICODE_UNASSIGNED,
|
209
|
+
OR(UNICODE_PRIVATE_USE,
|
210
|
+
OR(UNICODE_SURROGATE, 0))))));
|
211
|
+
}
|
212
|
+
|
213
|
+
|
214
|
+
/* {{{1
|
215
|
+
* Determine whether ‘c’ is some form of punctuation or other symbol.
|
216
|
+
*/
|
217
|
+
bool
|
218
|
+
unichar_ispunct(unichar c)
|
219
|
+
{
|
220
|
+
return IS(s_type(c),
|
221
|
+
OR(UNICODE_CONNECT_PUNCTUATION,
|
222
|
+
OR(UNICODE_DASH_PUNCTUATION,
|
223
|
+
OR(UNICODE_OPEN_PUNCTUATION,
|
224
|
+
OR(UNICODE_CLOSE_PUNCTUATION,
|
225
|
+
OR(UNICODE_INITIAL_PUNCTUATION,
|
226
|
+
OR(UNICODE_FINAL_PUNCTUATION,
|
227
|
+
OR(UNICODE_OTHER_PUNCTUATION,
|
228
|
+
OR(UNICODE_MODIFIER_SYMBOL,
|
229
|
+
OR(UNICODE_MATH_SYMBOL,
|
230
|
+
OR(UNICODE_CURRENCY_SYMBOL,
|
231
|
+
OR(UNICODE_OTHER_SYMBOL, 0)))))))))))) ? true : false;
|
232
|
+
}
|
233
|
+
|
234
|
+
|
235
|
+
/* {{{1
|
236
|
+
* Determine whether ‘c’ is some form of whitespace, such as space, tab or a
|
237
|
+
* line separator (newline, carriage return, etc.).
|
238
|
+
*/
|
239
|
+
bool
|
240
|
+
unichar_isspace(unichar c)
|
241
|
+
{
|
242
|
+
switch (c) {
|
243
|
+
case '\t':
|
244
|
+
case '\n':
|
245
|
+
case '\r':
|
246
|
+
case '\f':
|
247
|
+
return true;
|
248
|
+
default:
|
249
|
+
return IS(s_type(c),
|
250
|
+
OR(UNICODE_SPACE_SEPARATOR,
|
251
|
+
OR(UNICODE_LINE_SEPARATOR,
|
252
|
+
OR(UNICODE_PARAGRAPH_SEPARATOR, 0)))) ? true : false;
|
253
|
+
}
|
254
|
+
}
|
255
|
+
|
256
|
+
|
257
|
+
/* {{{1
|
258
|
+
* Determine whether ‘c’ is an uppeercase letter, such as A, B, or C
|
259
|
+
*/
|
260
|
+
bool
|
261
|
+
unichar_isupper(unichar c)
|
262
|
+
{
|
263
|
+
return s_type(c) == UNICODE_UPPERCASE_LETTER;
|
264
|
+
}
|
265
|
+
|
266
|
+
|
267
|
+
/* {{{1
|
268
|
+
* Determine whether ‘c’ is a titlecase letter, such as the slavic digraph DZ,
|
269
|
+
* which at the beginning of a word is written as Dz, where only the initial D
|
270
|
+
* is capitalized. (Complicated huh?)
|
271
|
+
*/
|
272
|
+
bool
|
273
|
+
unichar_istitle(unichar c)
|
274
|
+
{
|
275
|
+
/* TODO: binary search helpful? */
|
276
|
+
for (size_t i = 0; i < lengthof(title_table); i++)
|
277
|
+
if (title_table[i][0] == c)
|
278
|
+
return true;
|
279
|
+
|
280
|
+
return false;
|
281
|
+
}
|
282
|
+
|
283
|
+
|
284
|
+
/* {{{1
|
285
|
+
* Determine whether ‘c’ is a new-line.
|
286
|
+
*/
|
287
|
+
#define UNICHAR_NEXT_LINE ((unichar)0x0085)
|
288
|
+
#define UNICHAR_LINE_SEPARATOR ((unichar)0x2028)
|
289
|
+
#define UNICHAR_PARAGRAPH_SEPARATOR ((unichar)0x2029)
|
290
|
+
|
291
|
+
bool
|
292
|
+
unichar_isnewline(unichar c)
|
293
|
+
{
|
294
|
+
switch (c) {
|
295
|
+
case '\n': case '\f': case '\r': case UNICHAR_NEXT_LINE:
|
296
|
+
case UNICHAR_LINE_SEPARATOR: case UNICHAR_PARAGRAPH_SEPARATOR:
|
297
|
+
return true;
|
298
|
+
default:
|
299
|
+
return false;
|
300
|
+
}
|
301
|
+
}
|
302
|
+
|
303
|
+
/* {{{1
|
304
|
+
* Determine whether ‘c’ is a hexadecimal digit, such as 0, 1, ..., 9, a, b,
|
305
|
+
* ..., f, or A, B, ..., F.
|
306
|
+
*/
|
307
|
+
#define UNICHAR_FULLWIDTH_A 0xff21
|
308
|
+
#define UNICHAR_FULLWIDTH_F 0xff26
|
309
|
+
#define UNICHAR_FULLWIDTH_a 0xff41
|
310
|
+
#define UNICHAR_FULLWIDTH_f 0xff46
|
311
|
+
bool
|
312
|
+
unichar_isxdigit(unichar c)
|
313
|
+
{
|
314
|
+
return ((c >= 'a' && c <= 'f') ||
|
315
|
+
(c >= 'A' && c <= 'F') ||
|
316
|
+
(c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f) ||
|
317
|
+
(c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F) ||
|
318
|
+
(s_type(c) == UNICODE_DECIMAL_NUMBER));
|
319
|
+
// s_isdigit(s_type(c)));
|
320
|
+
}
|
321
|
+
|
322
|
+
|
323
|
+
/* {{{1
|
324
|
+
* Determine whether code point ‘c’ has been assigned a code value.
|
325
|
+
*/
|
326
|
+
bool
|
327
|
+
unichar_isassigned(unichar c)
|
328
|
+
{
|
329
|
+
return s_type(c) != UNICODE_UNASSIGNED;
|
330
|
+
}
|
331
|
+
|
332
|
+
|
333
|
+
/* {{{1
|
334
|
+
* Determine whether ‘c’ is a wide character, thus is typically rendered in a
|
335
|
+
* double-width cell on a terminal.
|
336
|
+
*/
|
337
|
+
bool
|
338
|
+
unichar_iswide(unichar c)
|
339
|
+
{
|
340
|
+
if (c < 0x1100)
|
341
|
+
return false;
|
342
|
+
|
343
|
+
return (c <= 0x115f || /* Hangul Jamo init. consonants */
|
344
|
+
c == 0x2329 || c == 0x232a || /* angle brackets */
|
345
|
+
(c >= 0x2e80 && c <= 0xa4cf && /* CJK ... Yi */
|
346
|
+
(c < 0x302a || c > 0x302f) &&
|
347
|
+
c != 0x303f && c != 0x3099 && c != 0x309a) ||
|
348
|
+
(c >= 0xac00 && c <= 0xd7a3) || /* Hangul syllables */
|
349
|
+
(c >= 0xf900 && c <= 0xfaff) || /* CJK comp. graphs */
|
350
|
+
(c >= 0xfe30 && c <= 0xfe6f) || /* CJK comp. forms */
|
351
|
+
(c >= 0xff00 && c <= 0xff60) || /* fullwidth forms */
|
352
|
+
(c >= 0xffe0 && c <= 0xffe6) || /* -"- */
|
353
|
+
(c >= 0x20000 && c <= 0x2fffd) || /* CJK extra stuff */
|
354
|
+
(c >= 0x30000 && c <= 0x3fffd)); /* -"- */
|
355
|
+
}
|
356
|
+
|
357
|
+
|
358
|
+
/* {{{1
|
359
|
+
* Convert ‘c’ to its uppercase representation (if any).
|
360
|
+
*/
|
361
|
+
static unichar
|
362
|
+
special_case_table_lookup(unichar c)
|
363
|
+
{
|
364
|
+
unichar tv = ATTTABLE(c >> 8, c & 0xff);
|
365
|
+
|
366
|
+
if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
|
367
|
+
return utf_char(special_case_table +
|
368
|
+
tv - UNICODE_SPECIAL_CASE_TABLE_START);
|
369
|
+
|
370
|
+
if (tv == '\0')
|
371
|
+
return c;
|
372
|
+
|
373
|
+
return tv;
|
374
|
+
}
|
375
|
+
|
376
|
+
static unichar
|
377
|
+
titlecase_table_lookup(unichar c, bool want_upper)
|
378
|
+
{
|
379
|
+
for (size_t i = 0; i < lengthof(title_table); i++)
|
380
|
+
if (title_table[i][0] == c)
|
381
|
+
return title_table[i][want_upper ? 1 : 2];
|
382
|
+
|
383
|
+
return c;
|
384
|
+
}
|
385
|
+
|
386
|
+
unichar
|
387
|
+
unichar_toupper(unichar c)
|
388
|
+
{
|
389
|
+
int type = s_type(c);
|
390
|
+
|
391
|
+
if (type == UNICODE_LOWERCASE_LETTER)
|
392
|
+
return special_case_table_lookup(c);
|
393
|
+
|
394
|
+
if (type == UNICODE_TITLECASE_LETTER)
|
395
|
+
return titlecase_table_lookup(c, true);
|
396
|
+
|
397
|
+
return c;
|
398
|
+
}
|
399
|
+
|
400
|
+
|
401
|
+
/* {{{1
|
402
|
+
* Convert ‘c’ to its lowercase representation (if any).
|
403
|
+
*/
|
404
|
+
unichar
|
405
|
+
unichar_tolower(unichar c)
|
406
|
+
{
|
407
|
+
int type = s_type(c);
|
408
|
+
|
409
|
+
if (type == UNICODE_UPPERCASE_LETTER)
|
410
|
+
return special_case_table_lookup(c);
|
411
|
+
|
412
|
+
if (type == UNICODE_TITLECASE_LETTER)
|
413
|
+
return titlecase_table_lookup(c, false);
|
414
|
+
|
415
|
+
return c;
|
416
|
+
}
|
417
|
+
|
418
|
+
|
419
|
+
/* {{{1
|
420
|
+
* Convert ‘c’ to its titlecase representation (if any).
|
421
|
+
*/
|
422
|
+
unichar
|
423
|
+
unichar_totitle(unichar c)
|
424
|
+
{
|
425
|
+
for (size_t i = 0; i < lengthof(title_table); i++)
|
426
|
+
if (title_table[i][0] == c ||
|
427
|
+
title_table[i][1] == c ||
|
428
|
+
title_table[i][2] == c)
|
429
|
+
return title_table[i][0];
|
430
|
+
|
431
|
+
if (s_type(c) == UNICODE_LOWERCASE_LETTER)
|
432
|
+
return ATTTABLE(c >> 8, c & 0xff);
|
433
|
+
|
434
|
+
return c;
|
435
|
+
}
|
436
|
+
|
437
|
+
|
438
|
+
/* {{{1
|
439
|
+
* Return the numeric value of ‘c’ if it's a decimal digit, or -1 if not.
|
440
|
+
*/
|
441
|
+
int
|
442
|
+
unichar_digit_value(unichar c)
|
443
|
+
{
|
444
|
+
if (s_type(c) == UNICODE_DECIMAL_NUMBER)
|
445
|
+
return ATTTABLE(c >> 8, c & 0xff);
|
446
|
+
|
447
|
+
return -1;
|
448
|
+
}
|
449
|
+
|
450
|
+
|
451
|
+
/* {{{1
|
452
|
+
* Return the numeric value of ‘c’ if it's a hexadecimal digit, or -1 if not.
|
453
|
+
*/
|
454
|
+
int
|
455
|
+
unichar_xdigit_value(unichar c)
|
456
|
+
{
|
457
|
+
if (c >= 'a' && c <= 'f')
|
458
|
+
return c - 'a' + 10;
|
459
|
+
else if (c >= 'A' && c <= 'F')
|
460
|
+
return c - 'A' + 10;
|
461
|
+
else if (c >= UNICHAR_FULLWIDTH_a && c <= UNICHAR_FULLWIDTH_f)
|
462
|
+
return c - UNICHAR_FULLWIDTH_a + 10;
|
463
|
+
else if (c >= UNICHAR_FULLWIDTH_A && c <= UNICHAR_FULLWIDTH_F)
|
464
|
+
return c - UNICHAR_FULLWIDTH_A + 10;
|
465
|
+
else
|
466
|
+
return unichar_digit_value(c);
|
467
|
+
}
|
468
|
+
|
469
|
+
|
470
|
+
/* {{{1
|
471
|
+
* Determine the Unicode character type of ‘c’.
|
472
|
+
*/
|
473
|
+
UnicodeType
|
474
|
+
unichar_type(unichar c)
|
475
|
+
{
|
476
|
+
return s_type(c);
|
477
|
+
}
|
478
|
+
|
479
|
+
|
480
|
+
/* {{{1
 * LocaleType: This ‹enum› is used for dealing with different locales for
 * turning strings into uppercase or lowercase.
 */
typedef enum {
	LOCALE_NORMAL,
	LOCALE_TURKIC,
	LOCALE_LITHUANIAN
} LocaleType;


/* {{{1
 * Retrieve the locale type from the environment (LC_CTYPE).
 *
 * Only the first two characters of the current LC_CTYPE locale name are
 * examined: “az” and “tr” select LOCALE_TURKIC, “lt” selects
 * LOCALE_LITHUANIAN, and anything else yields LOCALE_NORMAL.
 */
static LocaleType
get_locale_type(void)
{
	const char *locale = setlocale(LC_CTYPE, NULL);

	/* setlocale() returns NULL when it cannot honor the request; treat
	 * that like an unrecognized locale instead of dereferencing it. */
	if (locale == NULL)
		return LOCALE_NORMAL;

	if ((locale[0] == 'a' && locale[1] == 'z') ||
	    (locale[0] == 't' && locale[1] == 'r'))
		return LOCALE_TURKIC;

	if (locale[0] == 'l' && locale[1] == 't')
		return LOCALE_LITHUANIAN;

	return LOCALE_NORMAL;
}
|
508
|
+
|
509
|
+
|
510
|
+
/* {{{1
|
511
|
+
* Put character marks found in ‘p_inout’ into itself. If ‘remove_dot’ is
|
512
|
+
* true, remove the dot over an uppercase I for a turkish locale.
|
513
|
+
*/
|
514
|
+
static size_t
|
515
|
+
output_marks(const char **p_inout, char *buf, bool remove_dot)
|
516
|
+
{
|
517
|
+
size_t len = 0;
|
518
|
+
const char *p = *p_inout;
|
519
|
+
|
520
|
+
for ( ; *p != '\0'; p = utf_next(p)) {
|
521
|
+
unichar c = utf_char(p);
|
522
|
+
|
523
|
+
if (!s_ismark(s_type(c)))
|
524
|
+
break;
|
525
|
+
|
526
|
+
if (!remove_dot || c != COMBINING_DOT_ABOVE)
|
527
|
+
len += unichar_to_utf(c, (buf != NULL) ? buf + len : NULL);
|
528
|
+
}
|
529
|
+
|
530
|
+
*p_inout = p;
|
531
|
+
|
532
|
+
return len;
|
533
|
+
}
|
534
|
+
|
535
|
+
/* {{{1
|
536
|
+
* Output titlecases where appropriate.
|
537
|
+
*/
|
538
|
+
static size_t
|
539
|
+
output_special_case(char *buf, int offset, int type, bool upper)
|
540
|
+
{
|
541
|
+
const char *p = special_case_table + offset;
|
542
|
+
|
543
|
+
if (type != UNICODE_TITLECASE_LETTER)
|
544
|
+
p = utf_next(p);
|
545
|
+
|
546
|
+
if (upper)
|
547
|
+
p += utf_byte_length(p) + 1;
|
548
|
+
|
549
|
+
size_t len = utf_byte_length(p);
|
550
|
+
|
551
|
+
if (buf != NULL)
|
552
|
+
memcpy(buf, p, len);
|
553
|
+
|
554
|
+
return len;
|
555
|
+
}
|
556
|
+
|
557
|
+
/* {{{1
|
558
|
+
* Do uppercasing of ‘p’ for Lithuanian locales.
|
559
|
+
*/
|
560
|
+
static size_t
|
561
|
+
remove_all_combining_dot_above(unichar c, char *buf)
|
562
|
+
{
|
563
|
+
size_t decomp_len;
|
564
|
+
unichar *decomp = unicode_canonical_decomposition(c, &decomp_len);
|
565
|
+
|
566
|
+
size_t len = 0;
|
567
|
+
for (size_t i = 0; i < decomp_len; i++)
|
568
|
+
if (decomp[i] != COMBINING_DOT_ABOVE)
|
569
|
+
len += unichar_to_utf(unichar_toupper(decomp[i]),
|
570
|
+
OFFSET_IF(buf, len));
|
571
|
+
|
572
|
+
free(decomp);
|
573
|
+
|
574
|
+
return len;
|
575
|
+
}
|
576
|
+
|
577
|
+
static size_t
|
578
|
+
real_toupper_lithuanian(const char **p, unichar c, int type, char *buf,
|
579
|
+
bool *was_i)
|
580
|
+
{
|
581
|
+
if (c == 'i') {
|
582
|
+
*was_i = true;
|
583
|
+
return 0;
|
584
|
+
}
|
585
|
+
|
586
|
+
if (*was_i) {
|
587
|
+
size_t len = remove_all_combining_dot_above(c, buf);
|
588
|
+
return len + output_marks(p, (buf != NULL) ? buf + len : NULL,
|
589
|
+
true);
|
590
|
+
}
|
591
|
+
|
592
|
+
if (!s_ismark(type))
|
593
|
+
*was_i = false;
|
594
|
+
|
595
|
+
return 0;
|
596
|
+
}
|
597
|
+
|
598
|
+
/* {{{1
|
599
|
+
* Do real upcasing. */
|
600
|
+
static inline size_t
|
601
|
+
real_do_toupper(unichar c, int type, char *buf)
|
602
|
+
{
|
603
|
+
bool upper = (type != UNICODE_LOWERCASE_LETTER);
|
604
|
+
unichar tv = ATTTABLE(c >> 8, c & 0xff);
|
605
|
+
|
606
|
+
if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
|
607
|
+
return output_special_case(buf,
|
608
|
+
tv - UNICODE_SPECIAL_CASE_TABLE_START,
|
609
|
+
type, upper);
|
610
|
+
|
611
|
+
/* TODO: this should really use titlecase_table_lookup somehow. */
|
612
|
+
if (type == UNICODE_TITLECASE_LETTER)
|
613
|
+
for (size_t i = 0; i < lengthof(title_table); i++)
|
614
|
+
if (title_table[i][0] == c)
|
615
|
+
return unichar_to_utf(title_table[i][1], buf);
|
616
|
+
|
617
|
+
return unichar_to_utf(tv != '\0' ? tv : c, buf);
|
618
|
+
}
|
619
|
+
|
620
|
+
/* {{{1
|
621
|
+
* Do real uppercasing of ‘str’.
|
622
|
+
*/
|
623
|
+
static size_t
|
624
|
+
real_toupper_one(const char **p, const char *prev, char *buf,
|
625
|
+
LocaleType locale_type, bool *was_i)
|
626
|
+
{
|
627
|
+
unichar c = utf_char(prev);
|
628
|
+
int type = s_type(c);
|
629
|
+
|
630
|
+
if (locale_type == LOCALE_LITHUANIAN) {
|
631
|
+
size_t len = real_toupper_lithuanian(p, c, type, buf, was_i);
|
632
|
+
if (len > 0)
|
633
|
+
return len;
|
634
|
+
}
|
635
|
+
|
636
|
+
if (locale_type == LOCALE_TURKIC && c == 'i')
|
637
|
+
return unichar_to_utf(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
|
638
|
+
buf);
|
639
|
+
|
640
|
+
if (c == COMBINING_GREEK_YPOGEGRAMMENI) {
|
641
|
+
/* Nasty, need to move it after other combining marks...this
|
642
|
+
* would go away if we normalized first. */
|
643
|
+
/* TODO: don’t we need to make sure we don’t go beyond the end
|
644
|
+
* of ‘p’? */
|
645
|
+
size_t len = output_marks(p, buf, false);
|
646
|
+
return len + unichar_to_utf(GREEK_CAPITAL_LETTER_IOTA,
|
647
|
+
OFFSET_IF(buf, len));
|
648
|
+
}
|
649
|
+
|
650
|
+
if (IS(type, OR(UNICODE_LOWERCASE_LETTER,
|
651
|
+
OR(UNICODE_TITLECASE_LETTER, 0))))
|
652
|
+
return real_do_toupper(c, type, buf);
|
653
|
+
|
654
|
+
size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
|
655
|
+
|
656
|
+
if (buf != NULL)
|
657
|
+
memcpy(buf, prev, len);
|
658
|
+
|
659
|
+
return len;
|
660
|
+
}
|
661
|
+
|
662
|
+
static size_t
|
663
|
+
real_toupper(const char *str, size_t max, bool use_max, char *buf,
|
664
|
+
LocaleType locale_type)
|
665
|
+
{
|
666
|
+
const char *p = str;
|
667
|
+
size_t len = 0;
|
668
|
+
bool p_was_i = false;
|
669
|
+
|
670
|
+
while ((!use_max || p < str + max) && *p != '\0') {
|
671
|
+
const char *prev = p;
|
672
|
+
p = utf_next(p);
|
673
|
+
|
674
|
+
len += real_toupper_one(&p, prev, OFFSET_IF(buf, len),
|
675
|
+
locale_type, &p_was_i);
|
676
|
+
}
|
677
|
+
|
678
|
+
return len;
|
679
|
+
}
|
680
|
+
|
681
|
+
/* {{{1
|
682
|
+
* Wrapper around real_toupper() for dealing with memory allocation and such.
|
683
|
+
*/
|
684
|
+
static char *
|
685
|
+
utf_upcase_impl(const char *str, size_t max, bool use_max)
|
686
|
+
{
|
687
|
+
assert(str != NULL);
|
688
|
+
|
689
|
+
LocaleType locale_type = get_locale_type();
|
690
|
+
|
691
|
+
size_t len = real_toupper(str, max, use_max, NULL, locale_type);
|
692
|
+
char *result = ALLOC_N(char, len + 1);
|
693
|
+
real_toupper(str, max, use_max, result, locale_type);
|
694
|
+
result[len] = '\0';
|
695
|
+
|
696
|
+
return result;
|
697
|
+
}
|
698
|
+
|
699
|
+
|
700
|
+
/* {{{1
 * Return a freshly allocated copy of ‘str’ in which every character has been
 * replaced by its uppercase representation, where one applies.
 */
char *
utf_upcase(const char *str)
{
        /* No byte limit is in effect, so the limit value itself is unused. */
        const size_t unused_max = 0;

        return utf_upcase_impl(str, unused_max, false);
}
|
709
|
+
|
710
|
+
|
711
|
+
/* {{{1
 * Return a freshly allocated copy of ‘str’ in which every character has been
 * replaced by its uppercase representation, where one applies.  At most
 * ‘len’ bytes of ‘str’ are converted.
 */
char *
utf_upcase_n(const char *str, size_t len)
{
        const bool limit_to_len = true;

        return utf_upcase_impl(str, len, limit_to_len);
}
|
721
|
+
|
722
|
+
|
723
|
+
/* {{{1
 * Walk ‘str’ character by character.  Return true as soon as a character
 * with combining class 230 is seen; return false when a character with
 * combining class 0 is reached first or the string ends.
 */
static bool
has_more_above(const char *str)
{
        const char *cursor = str;

        while (*cursor != '\0') {
                int combining_class = _unichar_combining_class(utf_char(cursor));

                switch (combining_class) {
                case 230:
                        return true;
                case 0:
                        return false;
                default:
                        break;
                }

                cursor = utf_next(cursor);
        }

        return false;
}
|
742
|
+
|
743
|
+
static inline size_t
|
744
|
+
real_do_tolower(unichar c, int type, char *buf)
|
745
|
+
{
|
746
|
+
unichar tv = ATTTABLE(c >> 8, c & 0xff);
|
747
|
+
|
748
|
+
if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
|
749
|
+
return output_special_case(buf,
|
750
|
+
tv - UNICODE_SPECIAL_CASE_TABLE_START,
|
751
|
+
type, false);
|
752
|
+
|
753
|
+
/* TODO: this should really use titlecase_table_lookup somehow. */
|
754
|
+
if (type == UNICODE_TITLECASE_LETTER)
|
755
|
+
for (size_t i = 0; i < lengthof(title_table); i++)
|
756
|
+
if (title_table[i][0] == c)
|
757
|
+
return unichar_to_utf(title_table[i][2], buf);
|
758
|
+
|
759
|
+
return unichar_to_utf(tv != '\0' ? tv : c, buf);
|
760
|
+
}
|
761
|
+
|
762
|
+
/* {{{1
|
763
|
+
* The real implementation of downcase.
|
764
|
+
*
|
765
|
+
* TODO: this needs a cleanup.
|
766
|
+
*/
|
767
|
+
static size_t
|
768
|
+
real_tolower_one(const char **p, const char *prev, char *buf,
|
769
|
+
LocaleType locale_type, const char *end, bool use_end)
|
770
|
+
{
|
771
|
+
unichar c = utf_char(prev);
|
772
|
+
int type = s_type(c);
|
773
|
+
|
774
|
+
if (locale_type == LOCALE_TURKIC && c == 'I') {
|
775
|
+
if (utf_char(*p) == COMBINING_DOT_ABOVE) {
|
776
|
+
/* TODO: don’t we need to make sure we don’t go beyond the end
|
777
|
+
* of ‘p’? */
|
778
|
+
*p = utf_next(*p);
|
779
|
+
return unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
|
780
|
+
}
|
781
|
+
|
782
|
+
return unichar_to_utf(LATIN_SMALL_LETTER_DOTLESS_I, buf);
|
783
|
+
}
|
784
|
+
|
785
|
+
if (locale_type == LOCALE_LITHUANIAN &&
|
786
|
+
(c == LATIN_CAPITAL_LETTER_I_WITH_GRAVE ||
|
787
|
+
c == LATIN_CAPITAL_LETTER_I_WITH_ACUTE ||
|
788
|
+
c == LATIN_CAPITAL_LETTER_I_WITH_TILDE)) {
|
789
|
+
/* Introduce an explicit dot above the lowercasing capital I's
|
790
|
+
* and J's whenever there are more accents above.
|
791
|
+
* [SpecialCasing.txt] */
|
792
|
+
size_t len = unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
|
793
|
+
len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
|
794
|
+
switch (c) {
|
795
|
+
case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
|
796
|
+
len += unichar_to_utf(COMBINING_GRAVE_ACCENT,
|
797
|
+
OFFSET_IF(buf, len));
|
798
|
+
break;
|
799
|
+
case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
|
800
|
+
len += unichar_to_utf(COMBINING_ACUTE_ACCENT,
|
801
|
+
OFFSET_IF(buf, len));
|
802
|
+
break;
|
803
|
+
case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
|
804
|
+
len += unichar_to_utf(COMBINING_TILDE,
|
805
|
+
OFFSET_IF(buf, len));
|
806
|
+
break;
|
807
|
+
}
|
808
|
+
|
809
|
+
return len;
|
810
|
+
}
|
811
|
+
|
812
|
+
if (locale_type == LOCALE_LITHUANIAN &&
|
813
|
+
(c == 'I' || c == 'J' || c == LATIN_CAPITAL_LETTER_I_WITH_OGONEK) &&
|
814
|
+
has_more_above(*p)) {
|
815
|
+
size_t len = unichar_to_utf(unichar_tolower(c), buf);
|
816
|
+
return len + unichar_to_utf(COMBINING_DOT_ABOVE,
|
817
|
+
OFFSET_IF(buf, len));
|
818
|
+
}
|
819
|
+
|
820
|
+
if (c == GREEK_CAPITAL_LETTER_SIGMA) {
|
821
|
+
unichar tv = GREEK_SMALL_LETTER_FINAL_SIGMA;
|
822
|
+
|
823
|
+
if ((!use_end || *p < end) && **p != '\0') {
|
824
|
+
unichar next_c = utf_char(*p);
|
825
|
+
int next_type = s_type(next_c);
|
826
|
+
|
827
|
+
/* SIGMA maps differently depending on whether it is
|
828
|
+
* final or not. The following simplified test would
|
829
|
+
* fail in the case of combining marks following the
|
830
|
+
* sigma, but I don't think that occurs in real text.
|
831
|
+
* The test here matches that in ICU. */
|
832
|
+
if (s_isalpha(next_type))
|
833
|
+
tv = GREEK_SMALL_LETTER_SIGMA;
|
834
|
+
}
|
835
|
+
|
836
|
+
return unichar_to_utf(tv, buf);
|
837
|
+
}
|
838
|
+
|
839
|
+
if (IS(type, OR(UNICODE_UPPERCASE_LETTER,
|
840
|
+
OR(UNICODE_TITLECASE_LETTER, 0))))
|
841
|
+
return real_do_tolower(c, type, buf);
|
842
|
+
|
843
|
+
size_t len = s_utf_skip_lengths[*(const unsigned char *)prev];
|
844
|
+
|
845
|
+
if (buf != NULL)
|
846
|
+
memcpy(buf, prev, len);
|
847
|
+
|
848
|
+
return len;
|
849
|
+
}
|
850
|
+
|
851
|
+
static size_t
|
852
|
+
real_tolower(const char *str, size_t max, bool use_max, char *buf,
|
853
|
+
LocaleType locale_type)
|
854
|
+
{
|
855
|
+
const char *p = str;
|
856
|
+
const char *end = str + max;
|
857
|
+
size_t len = 0;
|
858
|
+
|
859
|
+
while ((!use_max || p < end) && *p != '\0') {
|
860
|
+
const char *prev = p;
|
861
|
+
p = utf_next(p);
|
862
|
+
|
863
|
+
len += real_tolower_one(&p, prev, OFFSET_IF(buf, len),
|
864
|
+
locale_type, end, use_max);
|
865
|
+
}
|
866
|
+
|
867
|
+
return len;
|
868
|
+
}
|
869
|
+
|
870
|
+
|
871
|
+
/* {{{1 */
|
872
|
+
static char *
|
873
|
+
utf_downcase_impl(const char *str, size_t max, bool use_max)
|
874
|
+
{
|
875
|
+
assert(str != NULL);
|
876
|
+
|
877
|
+
LocaleType locale_type = get_locale_type();
|
878
|
+
|
879
|
+
size_t len = real_tolower(str, max, use_max, NULL, locale_type);
|
880
|
+
char *result = ALLOC_N(char, len + 1);
|
881
|
+
real_tolower(str, max, use_max, result, locale_type);
|
882
|
+
result[len] = NUL;
|
883
|
+
|
884
|
+
return result;
|
885
|
+
}
|
886
|
+
|
887
|
+
|
888
|
+
/* {{{1
 * Return a freshly allocated copy of ‘str’ in which every character has been
 * replaced by its lowercase representation, where one applies.
 */
char *
utf_downcase(const char *str)
{
        /* No byte limit is in effect, so the limit value itself is unused. */
        const size_t unused_max = 0;

        return utf_downcase_impl(str, unused_max, false);
}
|
897
|
+
|
898
|
+
|
899
|
+
/* {{{1
 * Return a freshly allocated copy of ‘str’ in which every character has been
 * replaced by its lowercase representation, where one applies.  At most
 * ‘len’ bytes of ‘str’ are converted.
 */
char *
utf_downcase_n(const char *str, size_t len)
{
        const bool limit_to_len = true;

        return utf_downcase_impl(str, len, limit_to_len);
}
|
909
|
+
|
910
|
+
|
911
|
+
/* {{{1
|
912
|
+
* The real implementation of case folding below.
|
913
|
+
*/
|
914
|
+
|
915
|
+
static bool
|
916
|
+
casefold_table_lookup(unichar c, char *folded, size_t *len)
|
917
|
+
{
|
918
|
+
int begin = 0;
|
919
|
+
int end = lengthof(casefold_table);
|
920
|
+
|
921
|
+
if (c < casefold_table[begin].ch || c > casefold_table[end - 1].ch)
|
922
|
+
return false;
|
923
|
+
|
924
|
+
while (true) {
|
925
|
+
int mid = (begin + end) / 2;
|
926
|
+
|
927
|
+
if (c == casefold_table[mid].ch) {
|
928
|
+
if (folded != NULL)
|
929
|
+
strcpy(folded, casefold_table[mid].data);
|
930
|
+
*len += utf_byte_length(casefold_table[mid].data);
|
931
|
+
return true;
|
932
|
+
} else if (mid == begin) {
|
933
|
+
return false;
|
934
|
+
} else if (c > casefold_table[mid].ch) {
|
935
|
+
begin = mid;
|
936
|
+
} else {
|
937
|
+
end = mid;
|
938
|
+
}
|
939
|
+
}
|
940
|
+
}
|
941
|
+
|
942
|
+
static char *
|
943
|
+
utf_foldcase_impl(const char *str, size_t max, bool use_max)
|
944
|
+
{
|
945
|
+
assert(str != NULL);
|
946
|
+
|
947
|
+
char *folded = NULL;
|
948
|
+
size_t len = 0;
|
949
|
+
|
950
|
+
again:
|
951
|
+
for (const char *p = str; (!use_max || p < str + max) && *p != '\0'; p = utf_next(p)) {
|
952
|
+
unichar c = utf_char(p);
|
953
|
+
|
954
|
+
if (casefold_table_lookup(c, OFFSET_IF(folded, len), &len))
|
955
|
+
continue;
|
956
|
+
|
957
|
+
len += unichar_to_utf(unichar_tolower(c), OFFSET_IF(folded, len));
|
958
|
+
}
|
959
|
+
|
960
|
+
if (folded == NULL) {
|
961
|
+
folded = ALLOC_N(char, len + 1);
|
962
|
+
folded[0] = NUL;
|
963
|
+
len = 0;
|
964
|
+
goto again;
|
965
|
+
}
|
966
|
+
|
967
|
+
folded[len] = '\0';
|
968
|
+
|
969
|
+
return folded;
|
970
|
+
}
|
971
|
+
|
972
|
+
|
973
|
+
/* {{{1
 * Return a freshly allocated, case-independent (case-folded) form of ‘str’.
 */
char *
utf_foldcase(const char *str)
{
        /* No byte limit is in effect, so the limit value itself is unused. */
        const size_t unused_max = 0;

        return utf_foldcase_impl(str, unused_max, false);
}
|
982
|
+
|
983
|
+
|
984
|
+
/* {{{1
 * Return a freshly allocated, case-independent (case-folded) form of ‘str’,
 * looking at no more than the first ‘len’ bytes of the string.
 */
char *
utf_foldcase_n(const char *str, size_t len)
{
        const bool limit_to_len = true;

        return utf_foldcase_impl(str, len, limit_to_len);
}
|
994
|
+
|
995
|
+
|
996
|
+
/* {{{1
|
997
|
+
* The real implementation of utf_width() and utf_width_n() below.
|
998
|
+
*/
|
999
|
+
static size_t
|
1000
|
+
utf_width_impl(const char *str, size_t len, bool use_len)
|
1001
|
+
{
|
1002
|
+
assert(str != NULL);
|
1003
|
+
|
1004
|
+
size_t width = 0;
|
1005
|
+
|
1006
|
+
for (const char *p = str; (!use_len || p < str + len) && *p != NUL; p = utf_next(p))
|
1007
|
+
width += unichar_iswide(utf_char(p)) ? 2 : 1;
|
1008
|
+
|
1009
|
+
return width;
|
1010
|
+
}
|
1011
|
+
|
1012
|
+
|
1013
|
+
/* {{{1
 * Return the width of ‘str’ measured in cells.
 */
size_t
utf_width(const char *str)
{
        /* No byte limit is in effect, so the limit value itself is unused. */
        const size_t unused_len = 0;

        return utf_width_impl(str, unused_len, false);
}
|
1021
|
+
|
1022
|
+
|
1023
|
+
/* {{{1
 * Return the width, measured in cells, of the first ‘len’ bytes of ‘str’.
 */
size_t
utf_width_n(const char *str, size_t len)
{
        const bool limit_to_len = true;

        return utf_width_impl(str, len, limit_to_len);
}
|
1031
|
+
|
1032
|
+
|
1033
|
+
/* {{{1
|
1034
|
+
* Retrieve the mirrored representation of ‘c’ (if any) and store it in
|
1035
|
+
* ‘mirrored’.
|
1036
|
+
*/
|
1037
|
+
bool
|
1038
|
+
unichar_mirror(unichar c, unichar *mirrored)
|
1039
|
+
{
|
1040
|
+
int begin = 0;
|
1041
|
+
int end = lengthof(bidi_mirroring_table);
|
1042
|
+
|
1043
|
+
while (true) {
|
1044
|
+
int mid = (begin + end) / 2;
|
1045
|
+
|
1046
|
+
if (c == bidi_mirroring_table[mid].ch) {
|
1047
|
+
if (mirrored != NULL)
|
1048
|
+
*mirrored = bidi_mirroring_table[mid].mirrored_ch;
|
1049
|
+
return true;
|
1050
|
+
} else if (mid == begin) {
|
1051
|
+
return false;
|
1052
|
+
} else if (c > bidi_mirroring_table[mid].ch) {
|
1053
|
+
begin = mid;
|
1054
|
+
} else {
|
1055
|
+
end = mid;
|
1056
|
+
}
|
1057
|
+
}
|
1058
|
+
}
|
1059
|
+
|
1060
|
+
|
1061
|
+
/* }}}1 */
|