character-encodings 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +26 -0
- data/Rakefile +157 -0
- data/ext/encoding/character/unicode/codepoint.c +48 -0
- data/ext/encoding/character/utf-8/break.c +38 -0
- data/ext/encoding/character/utf-8/data/break.h +22931 -0
- data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
- data/ext/encoding/character/utf-8/data/compose.h +1607 -0
- data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
- data/ext/encoding/character/utf-8/decompose.c +476 -0
- data/ext/encoding/character/utf-8/depend +64 -0
- data/ext/encoding/character/utf-8/extconf.rb +47 -0
- data/ext/encoding/character/utf-8/private.h +68 -0
- data/ext/encoding/character/utf-8/properties.c +1061 -0
- data/ext/encoding/character/utf-8/rb_includes.h +18 -0
- data/ext/encoding/character/utf-8/rb_methods.h +49 -0
- data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
- data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
- data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
- data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
- data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
- data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
- data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
- data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
- data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
- data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
- data/ext/encoding/character/utf-8/unicode.c +319 -0
- data/ext/encoding/character/utf-8/unicode.h +208 -0
- data/ext/encoding/character/utf-8/utf.c +1332 -0
- data/lib/encoding/character/utf-8.rb +201 -0
- data/specifications/aref.rb +45 -0
- data/specifications/count.rb +29 -0
- data/specifications/delete.rb +25 -0
- data/specifications/each_char.rb +28 -0
- data/specifications/index.rb +35 -0
- data/specifications/insert.rb +67 -0
- data/specifications/length.rb +45 -0
- data/specifications/rindex.rb +52 -0
- data/specifications/squeeze.rb +25 -0
- data/specifications/to_i.rb +54 -0
- data/specifications/tr.rb +39 -0
- data/tests/foldcase.rb +28 -0
- data/tests/normalize.rb +101 -0
- data/tests/unicodedatatestbase.rb +45 -0
- metadata +112 -0
@@ -0,0 +1,208 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Unicode handling.
|
3
|
+
*
|
4
|
+
* Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
|
5
|
+
*/
|
6
|
+
|
7
|
+
|
8
|
+
#ifndef UNICODE_H
|
9
|
+
#define UNICODE_H
|
10
|
+
|
11
|
+
|
12
|
+
/*
 * Standard headers providing the fixed-width integer, boolean and size types
 * used by the declarations in this header, so that it can be included on its
 * own.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* A single character value, stored in 32 bits. */
typedef uint32_t unichar;

/* Largest value a unichar can hold. */
#define MAXUNICHAR UINT32_MAX

/* NOTE(review): presumably the longest UTF-8 sequence, in bytes, that the
 * library will read or write for one unichar -- confirm against utf.c. */
#define MAX_UNICHAR_BYTE_LENGTH 6

/* Number of code points in the range 0..0x10ffff. */
#define UNICODE_N_CODEPOINTS (0x10ffff + 1)

/* unichar return used for representing bad input to a function. */
#define UTF_BAD_INPUT_UNICHAR ((unichar)-1)

/* unichar return used for representing an incomplete input to a function. */
#define UTF_INCOMPLETE_INPUT_UNICHAR ((unichar)-2)
|
26
|
+
|
27
|
+
|
28
|
+
/*
 * Character classes, as reported by unichar_type().  The numeric values are
 * spelled out so that the ordering cannot change by accident.
 */
typedef enum {
        UNICODE_CONTROL             = 0,
        UNICODE_FORMAT              = 1,
        UNICODE_UNASSIGNED          = 2,
        UNICODE_PRIVATE_USE         = 3,
        UNICODE_SURROGATE           = 4,
        UNICODE_LOWERCASE_LETTER    = 5,
        UNICODE_MODIFIER_LETTER     = 6,
        UNICODE_OTHER_LETTER        = 7,
        UNICODE_TITLECASE_LETTER    = 8,
        UNICODE_UPPERCASE_LETTER    = 9,
        UNICODE_COMBINING_MARK      = 10,
        UNICODE_ENCLOSING_MARK      = 11,
        UNICODE_NON_SPACING_MARK    = 12,
        UNICODE_DECIMAL_NUMBER      = 13,
        UNICODE_LETTER_NUMBER       = 14,
        UNICODE_OTHER_NUMBER        = 15,
        UNICODE_CONNECT_PUNCTUATION = 16,
        UNICODE_DASH_PUNCTUATION    = 17,
        UNICODE_CLOSE_PUNCTUATION   = 18,
        UNICODE_FINAL_PUNCTUATION   = 19,
        UNICODE_INITIAL_PUNCTUATION = 20,
        UNICODE_OTHER_PUNCTUATION   = 21,
        UNICODE_OPEN_PUNCTUATION    = 22,
        UNICODE_CURRENCY_SYMBOL     = 23,
        UNICODE_MODIFIER_SYMBOL     = 24,
        UNICODE_MATH_SYMBOL         = 25,
        UNICODE_OTHER_SYMBOL        = 26,
        UNICODE_LINE_SEPARATOR      = 27,
        UNICODE_PARAGRAPH_SEPARATOR = 28,
        UNICODE_SPACE_SEPARATOR     = 29
} UnicodeType;
|
60
|
+
|
61
|
+
bool unichar_isalnum(unichar c);
|
62
|
+
bool unichar_isalpha(unichar c);
|
63
|
+
bool unichar_iscntrl(unichar c);
|
64
|
+
bool unichar_isdigit(unichar c);
|
65
|
+
bool unichar_isgraph(unichar c);
|
66
|
+
bool unichar_islower(unichar c);
|
67
|
+
bool unichar_isprint(unichar c);
|
68
|
+
bool unichar_ispunct(unichar c);
|
69
|
+
bool unichar_isspace(unichar c);
|
70
|
+
bool unichar_isupper(unichar c);
|
71
|
+
bool unichar_istitle(unichar c);
|
72
|
+
bool unichar_isnewline(unichar c);
|
73
|
+
bool unichar_isxdigit(unichar c);
|
74
|
+
bool unichar_isassigned(unichar c);
|
75
|
+
bool unichar_iswide(unichar c);
|
76
|
+
bool unichar_isvalid(unichar c);
|
77
|
+
|
78
|
+
unichar unichar_toupper(unichar c);
|
79
|
+
unichar unichar_tolower(unichar c);
|
80
|
+
unichar unichar_totitle(unichar c);
|
81
|
+
|
82
|
+
int unichar_digit_value(unichar c);
|
83
|
+
int unichar_xdigit_value(unichar c);
|
84
|
+
|
85
|
+
UnicodeType unichar_type(unichar c);
|
86
|
+
|
87
|
+
bool unichar_mirror(unichar c, unichar *mirrored);
|
88
|
+
|
89
|
+
|
90
|
+
/*
 * Line-breaking classes, as reported by unichar_break_type().  The numeric
 * values are spelled out so that the ordering cannot change by accident.
 */
typedef enum {
        UNICODE_BREAK_MANDATORY           = 0,
        UNICODE_BREAK_CARRIAGE_RETURN     = 1,
        UNICODE_BREAK_LINE_FEED           = 2,
        UNICODE_BREAK_COMBINING_MARK      = 3,
        UNICODE_BREAK_SURROGATE           = 4,
        UNICODE_BREAK_ZERO_WIDTH_SPACE    = 5,
        UNICODE_BREAK_INSEPARABLE         = 6,
        UNICODE_BREAK_NON_BREAKING_GLUE   = 7,
        UNICODE_BREAK_CONTINGENT          = 8,
        UNICODE_BREAK_SPACE               = 9,
        UNICODE_BREAK_AFTER               = 10,
        UNICODE_BREAK_BEFORE              = 11,
        UNICODE_BREAK_BEFORE_AND_AFTER    = 12,
        UNICODE_BREAK_HYPHEN              = 13,
        UNICODE_BREAK_NON_STARTER         = 14,
        UNICODE_BREAK_OPEN_PUNCTUATION    = 15,
        UNICODE_BREAK_CLOSE_PUNCTUATION   = 16,
        UNICODE_BREAK_QUOTATION           = 17,
        UNICODE_BREAK_EXCLAMATION         = 18,
        UNICODE_BREAK_IDEOGRAPHIC         = 19,
        UNICODE_BREAK_NUMERIC             = 20,
        UNICODE_BREAK_INFIX_SEPARATOR     = 21,
        UNICODE_BREAK_SYMBOL              = 22,
        UNICODE_BREAK_ALPHABETIC          = 23,
        UNICODE_BREAK_PREFIX              = 24,
        UNICODE_BREAK_POSTFIX             = 25,
        UNICODE_BREAK_COMPLEX_CONTEXT     = 26,
        UNICODE_BREAK_AMBIGUOUS           = 27,
        UNICODE_BREAK_UNKNOWN             = 28,
        UNICODE_BREAK_NEXT_LINE           = 29,
        UNICODE_BREAK_WORD_JOINER         = 30,
        UNICODE_BREAK_HANGUL_L_JAMO       = 31,
        UNICODE_BREAK_HANGUL_V_JAMO       = 32,
        UNICODE_BREAK_HANGUL_T_JAMO       = 33,
        UNICODE_BREAK_HANGUL_LV_SYLLABLE  = 34,
        UNICODE_BREAK_HANGUL_LVT_SYLLABLE = 35
} UnicodeBreakType;
|
128
|
+
|
129
|
+
UnicodeBreakType unichar_break_type(unichar c);
|
130
|
+
|
131
|
+
|
132
|
+
/*
 * Normalization forms accepted by utf_normalize() and utf_normalize_n().
 * Each form is available under two names that share one value: the standard
 * NFx abbreviation and a descriptive alias.
 */
typedef enum {
        NORMALIZE_NFD  = 0,
        NORMALIZE_NFC  = 1,
        NORMALIZE_NFKD = 2,
        NORMALIZE_NFKC = 3,

        NORMALIZE_DEFAULT         = NORMALIZE_NFD,
        NORMALIZE_DEFAULT_COMPOSE = NORMALIZE_NFC,
        NORMALIZE_ALL             = NORMALIZE_NFKD,
        NORMALIZE_ALL_COMPOSE     = NORMALIZE_NFKC
} NormalizeMode;
|
142
|
+
|
143
|
+
void unicode_canonical_ordering(unichar *str, size_t len);
|
144
|
+
unichar *unicode_canonical_decomposition(unichar c, size_t *result_len);
|
145
|
+
|
146
|
+
char *utf_normalize(const char *str, NormalizeMode mode);
|
147
|
+
char *utf_normalize_n(const char *str, NormalizeMode mode, size_t len);
|
148
|
+
|
149
|
+
|
150
|
+
|
151
|
+
|
152
|
+
char *utf_upcase(const char *str);
|
153
|
+
char *utf_upcase_n(const char *str, size_t len);
|
154
|
+
char *utf_downcase(const char *str);
|
155
|
+
char *utf_downcase_n(const char *str, size_t len);
|
156
|
+
char *utf_foldcase(const char *str);
|
157
|
+
char *utf_foldcase_n(const char *str, size_t len);
|
158
|
+
|
159
|
+
unichar utf_char(const char *str);
|
160
|
+
unichar utf_char_n(const char *str, size_t max);
|
161
|
+
unichar utf_char_validated(const char *str);
|
162
|
+
unichar utf_char_validated_n(const char *str, size_t max);
|
163
|
+
|
164
|
+
extern const char * const s_utf_skip_lengths;
|
165
|
+
#define utf_next(str) ((str) + s_utf_skip_lengths[*(const unsigned char *)(str)])
|
166
|
+
char *utf_find_next(const char *p, const char *end);
|
167
|
+
char *utf_prev(const char *p);
|
168
|
+
char *utf_find_prev(const char *begin, const char *p);
|
169
|
+
char *utf_offset_to_pointer(const char *str, long offset);
|
170
|
+
long utf_pointer_to_offset(const char *str, const char *pos);
|
171
|
+
|
172
|
+
void utf_copy(char *dest, const char *src);
|
173
|
+
void utf_copy_n(char *dest, const char *src, size_t n);
|
174
|
+
void utf_append(char *dest, const char *src);
|
175
|
+
void utf_append_n(char *dest, const char *src, size_t n);
|
176
|
+
int utf_collate(const char *a, const char *b);
|
177
|
+
char *utf_collate_key(const char *str);
|
178
|
+
char *utf_collate_key_n(const char *str, size_t len);
|
179
|
+
int utf_char_index(const char *str, unichar c);
|
180
|
+
int utf_char_index_n(const char *str, unichar c, size_t len);
|
181
|
+
int utf_char_rindex(const char *str, unichar c);
|
182
|
+
int utf_char_rindex_n(const char *str, unichar c, size_t len);
|
183
|
+
int utf_index(const char *haystack, const char *needle);
|
184
|
+
int utf_index_n(const char *haystack, const char *needle, size_t len);
|
185
|
+
int utf_rindex(const char *haystack, const char *needle);
|
186
|
+
int utf_rindex_n(const char *haystack, const char *needle, size_t len);
|
187
|
+
bool utf_has_prefix(const char *str, const char *prefix);
|
188
|
+
long utf_length(const char *str);
|
189
|
+
long utf_length_n(const char *str, long len);
|
190
|
+
size_t utf_width(const char *str);
|
191
|
+
size_t utf_width_n(const char *str, size_t len);
|
192
|
+
size_t utf_byte_length(const char *str);
|
193
|
+
char *utf_reverse(const char *str);
|
194
|
+
char *utf_reverse_n(const char *str, size_t len);
|
195
|
+
|
196
|
+
bool utf_isvalid(const char *str);
|
197
|
+
bool utf_isvalid_n(const char *str, size_t max, const char **end);
|
198
|
+
|
199
|
+
/* XXX: should probably name stuff utf32 instead of ucs4 */
|
200
|
+
int unichar_to_utf(unichar c, char *result);
|
201
|
+
char *ucs4_to_utf8(unichar *str, size_t *items_read, size_t *items_written);
|
202
|
+
char *ucs4_to_utf8_n(unichar *str, size_t len, size_t *items_read, size_t *items_written);
|
203
|
+
unichar *utf8_to_ucs4_fast(const char *str, size_t *items_written);
|
204
|
+
unichar *utf8_to_ucs4_fast_n(const char *str, size_t len, size_t *items_written);
|
205
|
+
unichar *utf8_to_ucs4(const char *str, size_t *items_read, size_t *items_written);
|
206
|
+
unichar *utf8_to_ucs4_n(const char *str, int len, size_t *items_read, size_t *items_written);
|
207
|
+
|
208
|
+
#endif /* UNICODE_H */
|
@@ -0,0 +1,1332 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF-8 string operations.
|
3
|
+
*
|
4
|
+
* Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
|
5
|
+
*/
|
6
|
+
|
7
|
+
|
8
|
+
#include <ruby.h>
|
9
|
+
#include <assert.h>
|
10
|
+
#include <locale.h>
|
11
|
+
#include <stdbool.h>
|
12
|
+
#include <stddef.h>
|
13
|
+
#include <stdint.h>
|
14
|
+
#include <stdlib.h>
|
15
|
+
#include <string.h>
|
16
|
+
#include <wchar.h>
|
17
|
+
|
18
|
+
#include "unicode.h"
|
19
|
+
#include "private.h"
|
20
|
+
|
21
|
+
|
22
|
+
/*
 * True if 'c' is below 0x110000 and is neither a surrogate (0xd800..0xdfff),
 * nor in 0xfdd0..0xfdef, nor a value whose low 16 bits are 0xfffe or 0xffff.
 * The argument is evaluated several times, so it must be free of side
 * effects.
 */
#define UNICODE_ISVALID(c)                              \
        ((c) < 0x110000 &&                              \
         (((c) & 0xfffff800) != 0xd800) &&              \
         ((c) < 0xfdd0 || (c) > 0xfdef) &&              \
         ((c) & 0xfffe) != 0xfffe)
|
27
|
+
|
28
|
+
|
29
|
+
/* {{{1
|
30
|
+
* These are a couple of constants we use for dealing with the bit-twiddling
|
31
|
+
* necessary when dealing with UTF-8 character sequences.
|
32
|
+
*/
|
33
|
+
enum {
        /* Index of the highest bit that is clear in each kind of byte:
         * BIT_1 for a one-byte sequence, BIT_X for a continuation byte and
         * BIT_n for the lead byte of an n-byte sequence. */
        BIT_1 = 7,
        BIT_X = 6,
        BIT_2 = 5,
        BIT_3 = 4,
        BIT_4 = 3,
        BIT_5 = 2,
        BIT_6 = 1,

        /* Marker bits: every bit of the byte above the matching BIT_*. */
        OCT_1 = 0xff & ~((1 << (BIT_1 + 1)) - 1),      /* 0000 0000 */
        OCT_X = 0xff & ~((1 << (BIT_X + 1)) - 1),      /* 1000 0000 */
        OCT_2 = 0xff & ~((1 << (BIT_2 + 1)) - 1),      /* 1100 0000 */
        OCT_3 = 0xff & ~((1 << (BIT_3 + 1)) - 1),      /* 1110 0000 */
        OCT_4 = 0xff & ~((1 << (BIT_4 + 1)) - 1),      /* 1111 0000 */
        OCT_5 = 0xff & ~((1 << (BIT_5 + 1)) - 1),      /* 1111 1000 */
        OCT_6 = 0xff & ~((1 << (BIT_6 + 1)) - 1),      /* 1111 1100 */

        /* Smallest value that no longer fits in an n-byte sequence. */
        UNI_LEN1 = 0x80,
        UNI_LEN2 = 0x800,
        UNI_LEN3 = 0x10000,
        UNI_LEN4 = 0x200000,
        UNI_LEN5 = 0x4000000,

        /* Payload bits of a continuation byte, and their complement. */
        MASK_X = (1 << BIT_X) - 1,                      /* 0011 1111 */
        TEST_X = 0xff ^ MASK_X                          /* 1100 0000 */
};
|
59
|
+
|
60
|
+
/* {{{1
 * Determine whether the byte 'p' is a UTF-8 continuation byte, i.e. whether
 * its two top bits are 10.  The argument is parenthesized so that any
 * expression may be passed.
 */
#define CONT_X(p) ((((unsigned char)(p)) & TEST_X) == OCT_X)
|
64
|
+
|
65
|
+
/* {{{1
 * Add the payload bits of the continuation byte 'p' to 'c', which is first
 * shifted left by BIT_X bits to make room for them.  Yields the new value;
 * 'c' itself is not modified.
 */
#define ADD_X(c, p) (((c) << BIT_X) | (((unsigned char)(p)) & MASK_X))
|
70
|
+
|
71
|
+
/* {{{1
 * Store the low BIT_X bits of 'c' in 'p' as a continuation byte and yield
 * 'c' with those bits shifted off.  'c' itself is not modified; assign the
 * result back to it to consume the bits.
 */
#define PUT_X(c, p) ((p) = OCT_X | ((c) & MASK_X), (c) >> BIT_X)
|
75
|
+
|
76
|
+
|
77
|
+
/* {{{1
|
78
|
+
* s_utf_skip_lengths: This table is used for keeping track of how long a given
|
79
|
+
* UTF-8 character sequence is from the contents of the first byte.
|
80
|
+
*/
|
81
|
+
/* Sequence length in bytes, indexed by the value of the first byte.  Bytes
 * that cannot start a sequence (0x80..0xbf, 0xfe, 0xff) map to 1. */
static const uint8_t s_utf_skip_length_data[256] = {
        /* 0x00..0x7f */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x80..0xbf */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0xc0..0xdf */
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        /* 0xe0..0xef */
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        /* 0xf0..0xff */
        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* Public view of the table above, as used by the utf_next() macro. */
const char * const s_utf_skip_lengths = (const char *)s_utf_skip_length_data;
|
94
|
+
|
95
|
+
|
96
|
+
|
97
|
+
/* {{{1
|
98
|
+
* Private function used to calculate the length and mask to use when dealing
|
99
|
+
* with a given UTF-8 character sequence.
|
100
|
+
*/
|
101
|
+
/*
 * Classify the lead byte 'c' of a UTF-8 sequence.
 *
 * On return '*len' holds the sequence length in bytes (1..6) and '*mask'
 * the bits of the lead byte that carry payload.  If 'c' cannot start a
 * sequence, '*len' is -1 and '*mask' is 0, so neither output is ever left
 * indeterminate.
 */
static inline void
_utf_compute(unsigned char c, int *mask, int *len)
{
        if (c < 0x80) {
                *len = 1;
                *mask = 0x7f;
        } else if ((c & 0xe0) == 0xc0) {
                *len = 2;
                *mask = 0x1f;
        } else if ((c & 0xf0) == 0xe0) {
                *len = 3;
                *mask = 0x0f;
        } else if ((c & 0xf8) == 0xf0) {
                *len = 4;
                *mask = 0x07;
        } else if ((c & 0xfc) == 0xf8) {
                *len = 5;
                *mask = 0x03;
        } else if ((c & 0xfe) == 0xfc) {
                *len = 6;
                *mask = 0x01;
        } else {
                /* Continuation byte (10xxxxxx) or 0xfe/0xff. */
                *len = -1;
                *mask = 0;
        }
}
|
126
|
+
|
127
|
+
/* {{{1
|
128
|
+
* Private function used to figure out the length of the UTF-8 representation
|
129
|
+
* of a given Unicode character (UTF-32).
|
130
|
+
*/
|
131
|
+
static inline unsigned short
|
132
|
+
_utf_length(const unichar c)
|
133
|
+
{
|
134
|
+
if (c < UNI_LEN1)
|
135
|
+
return 1;
|
136
|
+
else if (c < UNI_LEN2)
|
137
|
+
return 2;
|
138
|
+
else if (c < UNI_LEN3)
|
139
|
+
return 3;
|
140
|
+
else if (c < UNI_LEN4)
|
141
|
+
return 4;
|
142
|
+
else if (c < UNI_LEN5)
|
143
|
+
return 5;
|
144
|
+
else
|
145
|
+
return 6;
|
146
|
+
}
|
147
|
+
|
148
|
+
/* {{{1
|
149
|
+
* Private function used to retrieve a UTF-32 character from an UTF-8 character
|
150
|
+
* sequence given a mask and length previously retrieved with _utf_compute().
|
151
|
+
*/
|
152
|
+
static inline unichar
|
153
|
+
_utf_get(const char *str, int mask, int len)
|
154
|
+
{
|
155
|
+
unichar c = (unsigned char)str[0] & mask;
|
156
|
+
|
157
|
+
for (int i = 1; i < len; i++) {
|
158
|
+
unsigned char ch = ((const unsigned char *)str)[i];
|
159
|
+
|
160
|
+
if (CONT_X(ch)) {
|
161
|
+
c = ADD_X(c, ch);
|
162
|
+
} else {
|
163
|
+
c = UTF_BAD_INPUT_UNICHAR;
|
164
|
+
break;
|
165
|
+
}
|
166
|
+
}
|
167
|
+
|
168
|
+
return c;
|
169
|
+
}
|
170
|
+
|
171
|
+
|
172
|
+
/* {{{1
|
173
|
+
* Retrieve a UTF-32 character from a UTF-8 character sequence.
|
174
|
+
*/
|
175
|
+
unichar
|
176
|
+
utf_char(const char *str)
|
177
|
+
{
|
178
|
+
int mask;
|
179
|
+
int len;
|
180
|
+
|
181
|
+
_utf_compute(*str, &mask, &len);
|
182
|
+
|
183
|
+
return (len > -1) ? _utf_get(str, mask, len) : UTF_BAD_INPUT_UNICHAR;
|
184
|
+
}
|
185
|
+
|
186
|
+
|
187
|
+
/* {{{1
|
188
|
+
* TODO
|
189
|
+
*/
|
190
|
+
unichar
|
191
|
+
utf_char_n(const char *str, size_t max)
|
192
|
+
{
|
193
|
+
size_t len;
|
194
|
+
unichar c = (unsigned char)*str;
|
195
|
+
|
196
|
+
/* TODO: _utf_compute() here */
|
197
|
+
if (c < 0x80) {
|
198
|
+
return c;
|
199
|
+
} else if (c < 0xc0) {
|
200
|
+
return UTF_BAD_INPUT_UNICHAR;
|
201
|
+
} else if (c < 0xe0) {
|
202
|
+
len = 2;
|
203
|
+
c &= 0x1f;
|
204
|
+
} else if (c < 0xf0) {
|
205
|
+
len = 3;
|
206
|
+
c &= 0x0f;
|
207
|
+
} else if (c < 0xf8) {
|
208
|
+
len = 4;
|
209
|
+
c &= 0x07;
|
210
|
+
} else if (c < 0xfc) {
|
211
|
+
len = 5;
|
212
|
+
c &= 0x03;
|
213
|
+
} else if (c < 0xfe) {
|
214
|
+
len = 6;
|
215
|
+
c &= 0x01;
|
216
|
+
} else {
|
217
|
+
return UTF_BAD_INPUT_UNICHAR;
|
218
|
+
}
|
219
|
+
|
220
|
+
if (len > max) {
|
221
|
+
for (size_t i = 1; i < max; i++) {
|
222
|
+
if (!CONT_X(str[i]))
|
223
|
+
return UTF_BAD_INPUT_UNICHAR;
|
224
|
+
}
|
225
|
+
|
226
|
+
return UTF_INCOMPLETE_INPUT_UNICHAR;
|
227
|
+
}
|
228
|
+
|
229
|
+
for (size_t i = 1; i < len; i++) {
|
230
|
+
unsigned char ch = ((const unsigned char *)str)[i];
|
231
|
+
|
232
|
+
if (!CONT_X(ch))
|
233
|
+
return (ch != NUL) ? UTF_BAD_INPUT_UNICHAR : UTF_INCOMPLETE_INPUT_UNICHAR;
|
234
|
+
|
235
|
+
c = ADD_X(c, ch);
|
236
|
+
}
|
237
|
+
|
238
|
+
return (_utf_length(c) == len) ? c : UTF_BAD_INPUT_UNICHAR;
|
239
|
+
}
|
240
|
+
|
241
|
+
|
242
|
+
/* {{{1
|
243
|
+
* Retrieve a UTF-32 character from a UTF-8 character sequence. This function
|
244
|
+
* does additional checking while converitng, such as not overruning a maximum
|
245
|
+
* length and checks for incomplete, invalid or out-of-range characters.
|
246
|
+
*/
|
247
|
+
unichar
|
248
|
+
utf_char_validated(const char *str)
|
249
|
+
{
|
250
|
+
unichar result = utf_char(str);
|
251
|
+
|
252
|
+
if (result & 0x80000000) {
|
253
|
+
return result;
|
254
|
+
} else if (!unichar_isvalid(result)) {
|
255
|
+
return UTF_BAD_INPUT_UNICHAR;
|
256
|
+
} else {
|
257
|
+
return result;
|
258
|
+
}
|
259
|
+
}
|
260
|
+
|
261
|
+
|
262
|
+
/* {{{1 */
|
263
|
+
unichar
|
264
|
+
utf_char_validated_n(const char *str, size_t max)
|
265
|
+
{
|
266
|
+
unichar result = utf_char_n(str, max);
|
267
|
+
|
268
|
+
if (result & 0x80000000) {
|
269
|
+
return result;
|
270
|
+
} else if (!unichar_isvalid(result)) {
|
271
|
+
return UTF_BAD_INPUT_UNICHAR;
|
272
|
+
} else {
|
273
|
+
return result;
|
274
|
+
}
|
275
|
+
}
|
276
|
+
|
277
|
+
|
278
|
+
/* {{{1
|
279
|
+
* Return a pointer to the next UTF-8 character sequence in ‘str’. This
|
280
|
+
* requires that it is at the start of the previous one already and no
|
281
|
+
* additional error checking is done.
|
282
|
+
*/
|
283
|
+
/*
|
284
|
+
inline char *
|
285
|
+
utf_next(const char *str)
|
286
|
+
{
|
287
|
+
return (char *)str + s_utf_skip_lengths[*(const uchar *)str];
|
288
|
+
}
|
289
|
+
*/
|
290
|
+
|
291
|
+
|
292
|
+
/* {{{1
|
293
|
+
* Synchronize and go to the next UTF-8 character sequence in ‘p’. This search
|
294
|
+
* will not go beyond ‘end’. ‹NULL› is returned if it couldn't be found.
|
295
|
+
*/
|
296
|
+
char *
|
297
|
+
utf_find_next(const char *p, const char *end)
|
298
|
+
{
|
299
|
+
if (*p != NUL) {
|
300
|
+
if (end != NULL) {
|
301
|
+
for (p++; p < end && CONT_X(*p); p++) {
|
302
|
+
/* this loop intentionally left empty */
|
303
|
+
}
|
304
|
+
} else {
|
305
|
+
for (p++; CONT_X(*p); p++) {
|
306
|
+
/* this loop intentionally left empty */
|
307
|
+
}
|
308
|
+
}
|
309
|
+
}
|
310
|
+
return (p == end) ? NULL : (char *)p;
|
311
|
+
}
|
312
|
+
|
313
|
+
|
314
|
+
/* {{{1
|
315
|
+
* Return a pointer to the previous UTF-8 character sequence in ‘str’.
|
316
|
+
*/
|
317
|
+
char *
utf_prev(const char *p)
{
        /* Walk backwards until a byte that is not a continuation byte is
         * found.  No lower bound is checked, so a sequence start must exist
         * before 'p'. */
        do {
                p--;
        } while (CONT_X(*p));

        return (char *)p;
}
|
327
|
+
|
328
|
+
|
329
|
+
/* {{{1
|
330
|
+
* Synchronize and go to the previous UTF-8 character sequence in ‘p’. This
|
331
|
+
* search will not go beyond ‘begin’. ‹NULL› is returned if it couldn't be
|
332
|
+
* found.
|
333
|
+
*/
|
334
|
+
char *
utf_find_prev(const char *begin, const char *p)
{
        /* Compare before stepping back so that the pointer never moves below
         * 'begin'; forming such a pointer is undefined behaviour even if it
         * is never dereferenced. */
        while (p > begin) {
                p--;

                if (!CONT_X(*p))
                        return (char *)p;
        }

        /* Only continuation bytes (or nothing at all) between 'begin' and
         * the starting position. */
        return NULL;
}
|
344
|
+
|
345
|
+
|
346
|
+
/* {{{1
|
347
|
+
* Convert an integer offset to a pointer within ‘str’.
|
348
|
+
*
|
349
|
+
*/
|
350
|
+
char *
utf_offset_to_pointer(const char *str, long offset)
{
        const char *p = str;

        /* Forward: step over one character at a time. */
        if (offset > 0) {
                for (; offset > 0; offset--)
                        p = utf_next(p);

                return (char *)p;
        }

        /* Backward (or zero): jump back by the remaining count in bytes,
         * settle on the start of a sequence, then add the number of
         * characters actually skipped to 'offset' until it reaches zero. */
        while (offset != 0) {
                const char *anchor = p;

                p += offset;
                while ((*p & 0xc0) == 0x80)
                        p--;

                offset += utf_pointer_to_offset(p, anchor);
        }

        return (char *)p;
}
|
371
|
+
|
372
|
+
|
373
|
+
/* {{{1
|
374
|
+
* Convert a pointer to an integer offset within ‘str’.
|
375
|
+
*/
|
376
|
+
long
utf_pointer_to_offset(const char *str, const char *pos)
{
        /* A position before 'str' yields the negated distance measured in
         * the other direction. */
        if (pos < str)
                return -utf_pointer_to_offset(pos, str);

        long count = 0;
        const char *cursor = str;

        while (cursor < pos) {
                cursor = utf_next(cursor);
                count++;
        }

        return count;
}
|
388
|
+
|
389
|
+
|
390
|
+
/* {{{1
|
391
|
+
* Copy the contents of an UTF-8 string to another.
|
392
|
+
*/
|
393
|
+
void
utf_copy(char *dest, const char *src)
{
        /* Copy every byte of 'src' including its terminator.  'dest' must be
         * large enough and must not overlap 'src'. */
        memcpy(dest, src, strlen(src) + 1);
}
|
398
|
+
|
399
|
+
|
400
|
+
/* {{{1
|
401
|
+
* Copy at most n Unicode characters from an UTF-8 string to another. The
|
402
|
+
* destination string will be ‹NUL›-terminated properly.
|
403
|
+
*/
|
404
|
+
void
|
405
|
+
utf_copy_n(char *dest, const char *src, size_t n)
|
406
|
+
{
|
407
|
+
const char *p;
|
408
|
+
|
409
|
+
for (p = src; n > 0 && *p != NUL; p = utf_next(p), n--) {
|
410
|
+
/* this loop intentionally left empty */;
|
411
|
+
}
|
412
|
+
|
413
|
+
strncpy(dest, src, p - src);
|
414
|
+
dest[p - src] = NUL;
|
415
|
+
}
|
416
|
+
|
417
|
+
|
418
|
+
/* {{{1
|
419
|
+
* Append an UTF-8 string onto another.
|
420
|
+
*/
|
421
|
+
void
utf_append(char *dest, const char *src)
{
        /* Copy 'src', terminator included, over the terminator of 'dest'.
         * 'dest' must have room for the combined string. */
        strcpy(dest + strlen(dest), src);
}
|
426
|
+
|
427
|
+
|
428
|
+
/* {{{1
|
429
|
+
* Append at most ‘n’ Unicode character from an UTF-8 string onto another.
|
430
|
+
*/
|
431
|
+
void
|
432
|
+
utf_append_n(char *dest, const char *src, size_t n)
|
433
|
+
{
|
434
|
+
const char *p;
|
435
|
+
|
436
|
+
for (p = src; n > 0 && *p != NUL; p = utf_next(p), n--) {
|
437
|
+
/* this loop intentionally left empty */;
|
438
|
+
}
|
439
|
+
|
440
|
+
strncat(dest, src, p - src);
|
441
|
+
dest[p - src] = NUL;
|
442
|
+
}
|
443
|
+
|
444
|
+
|
445
|
+
/* {{{1
|
446
|
+
* Compare two strings for ordering using the linguistically correct rules of
|
447
|
+
* the current locale.
|
448
|
+
*/
|
449
|
+
int
|
450
|
+
utf_collate(const char *a, const char *b)
|
451
|
+
{
|
452
|
+
assert(a != NULL);
|
453
|
+
assert(b != NULL);
|
454
|
+
|
455
|
+
unichar *a_norm = _utf_normalize_wc(a, 0, false, NORMALIZE_ALL_COMPOSE);
|
456
|
+
unichar *b_norm = _utf_normalize_wc(b, 0, false, NORMALIZE_ALL_COMPOSE);
|
457
|
+
setlocale(LC_COLLATE, "");
|
458
|
+
int result = wcscoll((wchar_t *)a_norm, (wchar_t *)b_norm);
|
459
|
+
|
460
|
+
free(a_norm);
|
461
|
+
free(b_norm);
|
462
|
+
|
463
|
+
return result;
|
464
|
+
}
|
465
|
+
|
466
|
+
|
467
|
+
/* {{{1
|
468
|
+
* We need UTF-8 encoding of numbers to encode the weights if
|
469
|
+
* we are using wcsxfrm. However, we aren't encoding Unicode
|
470
|
+
* characters, so we can't simply use unichar_to_utf.
|
471
|
+
*
|
472
|
+
* The following routine is taken (with modification) from GNU
|
473
|
+
* libc's strxfrm routine:
|
474
|
+
*
|
475
|
+
* Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
|
476
|
+
* Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
|
477
|
+
*/
|
478
|
+
/*
 * Encode the number 'c' in a UTF-8-like byte sequence and return how many
 * bytes that takes.  If 'buf' is NULL nothing is written and only the length
 * is computed.
 */
static inline int
_utf_encode(char *buf, wchar_t c)
{
        /* Values below 0x80 are stored as a single byte. */
        if (c < 0x80) {
                if (buf != NULL)
                        buf[0] = (char)c;
                return 1;
        }

        /* Find the shortest length whose payload bits can hold 'c'. */
        int nbytes = 2;
        while (nbytes < 6 && (c & (~(uint32_t)0 << (5 * nbytes + 1))) != 0)
                nbytes++;

        if (buf == NULL)
                return nbytes;

        /* Lead-byte marker first, then the continuation bytes from last to
         * first; whatever is left of 'c' goes into the lead byte. */
        buf[0] = (unsigned char)(~0xff >> nbytes);
        for (int i = nbytes - 1; i > 0; i--)
                c = PUT_X(c, buf[i]);
        buf[0] |= c;

        return nbytes;
}
|
509
|
+
|
510
|
+
|
511
|
+
/* {{{1
|
512
|
+
* Generate a collation key from a string which can be compared with other
|
513
|
+
* collation keys using str_compare().
|
514
|
+
*/
|
515
|
+
static char *
|
516
|
+
utf_collate_key_impl(const char *str, size_t len, bool use_len)
|
517
|
+
{
|
518
|
+
assert(str != NULL);
|
519
|
+
|
520
|
+
unichar *str_norm = _utf_normalize_wc(str, len, use_len, NORMALIZE_ALL_COMPOSE);
|
521
|
+
setlocale(LC_COLLATE, "");
|
522
|
+
size_t xfrm_len = wcsxfrm(NULL, (wchar_t *)str_norm, 0);
|
523
|
+
wchar_t result_wc[xfrm_len + 1];
|
524
|
+
wcsxfrm(result_wc, (wchar_t *)str_norm, xfrm_len + 1);
|
525
|
+
|
526
|
+
int result_len = 0;
|
527
|
+
for (size_t i = 0; i < xfrm_len; i++)
|
528
|
+
result_len += _utf_encode(NULL, result_wc[i]);
|
529
|
+
|
530
|
+
char *result = ALLOC_N(char, result_len + 1);
|
531
|
+
result_len = 0;
|
532
|
+
for (size_t i = 0; i < xfrm_len; i++)
|
533
|
+
result_len += _utf_encode(result + result_len, result_wc[i]);
|
534
|
+
result[result_len] = NUL;
|
535
|
+
|
536
|
+
free(str_norm);
|
537
|
+
|
538
|
+
return result;
|
539
|
+
}
|
540
|
+
|
541
|
+
|
542
|
+
/* {{{1
|
543
|
+
* Generate a collation key from a string which can be compared with other
|
544
|
+
* collation keys using str_compare().
|
545
|
+
*/
|
546
|
+
char *
utf_collate_key(const char *str)
{
        /* No explicit length is given: 'use_len' is false, so the length
         * argument is a placeholder. */
        char *key = utf_collate_key_impl(str, 0, false);

        return key;
}
|
551
|
+
|
552
|
+
|
553
|
+
/* {{{1
|
554
|
+
* Generate a collation key from a string (of length ‘len’) which can be
|
555
|
+
* compared with other collation keys using str_compare().
|
556
|
+
*/
|
557
|
+
char *
utf_collate_key_n(const char *str, size_t len)
{
        /* Same as utf_collate_key(), but with 'len' passed on and enabled. */
        char *key = utf_collate_key_impl(str, len, true);

        return key;
}
|
562
|
+
|
563
|
+
|
564
|
+
/* {{{1
|
565
|
+
* Retrieve the offset/index of ‘needle’ in ‘haystack’ which is of size
|
566
|
+
* ‘haystack_len’.
|
567
|
+
*/
|
568
|
+
/*
 * Byte offset of the first occurrence of 'needle' within the first
 * 'haystack_len' bytes of 'haystack', or -1 if there is none.  The search
 * also stops at a NUL byte in 'haystack'.  An empty 'needle' is found at
 * offset 0.
 */
static int
str_index_n(const char *haystack, const char *needle, size_t haystack_len)
{
        assert(haystack != NULL);
        assert(needle != NULL);

        const size_t nlen = strlen(needle);

        if (nlen == 0)
                return 0;

        if (nlen > haystack_len)
                return -1;

        /* Last position at which 'needle' still fits inside the window. */
        const char *last = haystack + haystack_len - nlen;
        const char *cur = haystack;

        while (*cur != '\0' && cur <= last) {
                /* strncmp() stops at the first difference or NUL, so it
                 * never reads past the end of a shorter haystack. */
                if (strncmp(cur, needle, nlen) == 0)
                        return cur - haystack;
                cur++;
        }

        return -1;
}
|
597
|
+
|
598
|
+
|
599
|
+
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of 'needle' in
 * 'haystack', or -1 if it doesn't exist.  An empty 'needle' is found at the
 * end of 'haystack'.
 */
static int
str_rindex(const char *haystack, const char *needle)
{
        assert(haystack != NULL);
        assert(needle != NULL);

        size_t needle_len = strlen(needle);
        size_t haystack_len = strlen(haystack);

        if (needle_len == 0)
                return haystack_len;

        if (haystack_len < needle_len)
                return -1;

        /* Count an offset down to zero instead of walking a pointer down to
         * 'haystack - 1'; forming a pointer in front of the array is
         * undefined behaviour. */
        for (size_t start = haystack_len - needle_len + 1; start-- > 0; ) {
                if (memcmp(haystack + start, needle, needle_len) == 0)
                        return start;
        }

        return -1;
}
|
632
|
+
|
633
|
+
|
634
|
+
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of 'needle' within
 * the first 'haystack_len' bytes of 'haystack', or -1 if it doesn't exist.
 * The search area also ends at a NUL byte in 'haystack'.  An empty 'needle'
 * is found at the end of the search area.
 */
static int
str_rindex_n(const char *haystack, const char *needle, size_t haystack_len)
{
        assert(haystack != NULL);
        assert(needle != NULL);

        size_t needle_len = strlen(needle);

        /* Length of the area to search: up to 'haystack_len' bytes, cut
         * short by a terminator. */
        size_t area_len = 0;
        while (area_len < haystack_len && haystack[area_len] != '\0')
                area_len++;

        if (area_len < needle_len)
                return -1;

        /* Count an offset down to zero instead of walking a pointer down to
         * 'haystack - 1'; forming a pointer in front of the array is
         * undefined behaviour. */
        for (size_t start = area_len - needle_len + 1; start-- > 0; ) {
                if (memcmp(haystack + start, needle, needle_len) == 0)
                        return start;
        }

        return -1;
}
|
670
|
+
|
671
|
+
|
672
|
+
/* {{{1
|
673
|
+
* Retrieve the index of the left-most occurence of ‘c’ in ‘str’, or -1 if it
|
674
|
+
* doesn't exist.
|
675
|
+
*/
|
676
|
+
int
|
677
|
+
utf_char_index(const char *str, unichar c)
|
678
|
+
{
|
679
|
+
char ch[7];
|
680
|
+
|
681
|
+
ch[unichar_to_utf(c, ch)] = NUL;
|
682
|
+
char *p = strstr(str, ch);
|
683
|
+
return (p != NULL) ? p - str : -1;
|
684
|
+
}
|
685
|
+
|
686
|
+
|
687
|
+
/* {{{1
|
688
|
+
* Retrieve the index of the left-most occurence of ‘c’ in ‘str’, or -1 if it
|
689
|
+
* doesn't exist, going over at most ‘len’ bytes in ‘str’.
|
690
|
+
*/
|
691
|
+
int
|
692
|
+
utf_char_index_n(const char *str, unichar c, size_t len)
|
693
|
+
{
|
694
|
+
char ch[7];
|
695
|
+
|
696
|
+
ch[unichar_to_utf(c, ch)] = NUL;
|
697
|
+
|
698
|
+
return str_index_n(str, ch, len);
|
699
|
+
}
|
700
|
+
|
701
|
+
|
702
|
+
/* {{{1
|
703
|
+
* Retrieve the index of the right-most occurence of ‘c’ in ‘str’, or -1 if it
|
704
|
+
* doesn't exist.
|
705
|
+
*/
|
706
|
+
int
|
707
|
+
utf_char_rindex(const char *str, unichar c)
|
708
|
+
{
|
709
|
+
char ch[7];
|
710
|
+
|
711
|
+
ch[unichar_to_utf(c, ch)] = NUL;
|
712
|
+
|
713
|
+
return str_rindex(str, ch);
|
714
|
+
}
|
715
|
+
|
716
|
+
|
717
|
+
/* {{{1
|
718
|
+
* Retrieve the index of the right-most occurence of ‘c’ in ‘str’, or -1 if it
|
719
|
+
* doesn't exist, going over at most ‘len’ bytes in ‘str’.
|
720
|
+
*/
|
721
|
+
int
|
722
|
+
utf_char_rindex_n(const char *str, unichar c, size_t len)
|
723
|
+
{
|
724
|
+
char ch[7];
|
725
|
+
|
726
|
+
ch[unichar_to_utf(c, ch)] = NUL;
|
727
|
+
|
728
|
+
return str_rindex_n(str, ch, len);
|
729
|
+
}
|
730
|
+
|
731
|
+
|
732
|
+
/* {{{1
 * Retrieve the index of the left-most occurrence of ‘needle’ in ‘haystack’,
 * or -1 if it doesn't exist.
 *
 * The index is a byte offset into ‘haystack’.  An empty ‘needle’ is found at
 * index 0.
 */
int
utf_index(const char *haystack, const char *needle)
{
        const char *match = strstr(haystack, needle);

        /* strstr() reports "not found" as NULL; subtracting ‘haystack’ from
         * that would be undefined, so map it to -1 explicitly. */
        if (match == NULL)
                return -1;

        return (int)(match - haystack);
}
|
741
|
+
|
742
|
+
|
743
|
+
/* {{{1
 * Retrieve the index of the left-most occurrence of ‘needle’ in ‘haystack’,
 * or -1 if it doesn't exist, going over at most ‘len’ bytes in ‘haystack’.
 *
 * Thin public wrapper around str_index_n().
 */
int
utf_index_n(const char *haystack, const char *needle, size_t len)
{
        int index = str_index_n(haystack, needle, len);

        return index;
}
|
752
|
+
|
753
|
+
|
754
|
+
/* {{{1
 * Retrieve the index of the right-most occurrence of ‘needle’ in ‘haystack’,
 * or -1 if it doesn't exist.
 *
 * Thin public wrapper around str_rindex().
 */
int
utf_rindex(const char *haystack, const char *needle)
{
        int index = str_rindex(haystack, needle);

        return index;
}
|
763
|
+
|
764
|
+
|
765
|
+
/* {{{1
 * Retrieve the index of the right-most occurrence of ‘needle’ in ‘haystack’,
 * or -1 if it doesn't exist, going over at most ‘len’ bytes in ‘haystack’.
 *
 * Thin public wrapper around str_rindex_n().
 */
int
utf_rindex_n(const char *haystack, const char *needle, size_t len)
{
        int index = str_rindex_n(haystack, needle, len);

        return index;
}
|
774
|
+
|
775
|
+
|
776
|
+
/* {{{1
 * Check if the given string begins with ‘prefix’.
 *
 * The comparison is byte-wise.  An empty ‘prefix’ is a prefix of every
 * string, including the empty one.
 */
bool
utf_has_prefix(const char *str, const char *prefix)
{
        assert(str != NULL);
        assert(prefix != NULL);

        /* Consume ‘prefix’ byte by byte.  If ‘str’ ends first, its
         * terminator differs from the pending (non-NUL) prefix byte, so the
         * mismatch test below also covers a too-short ‘str’. */
        while (*prefix != '\0') {
                if (*str++ != *prefix++)
                        return false;
        }

        return true;
}
|
794
|
+
|
795
|
+
|
796
|
+
/* {{{1
 * Retrieve the number of UTF-8 encoded Unicode characters in ‘str’.
 *
 * The string is walked with utf_next() until the terminating NUL byte is
 * reached; every step counts as one character.
 */
long
utf_length(const char *str)
{
        assert(str != NULL);

        long count = 0;
        for (const char *cur = str; *cur != '\0'; cur = utf_next(cur))
                count++;

        return count;
}
|
813
|
+
|
814
|
+
|
815
|
+
/* {{{1
 * Retrieve the number of UTF-8 encoded Unicode characters in ‘str’, examining
 * ‘len’ bytes.
 *
 * ‘str’ may be NULL only when ‘len’ is 0.  NUL bytes are not treated
 * specially here; only ‘len’ bounds the walk.
 */
long
utf_length_n(const char *str, long len)
{
        assert(str != NULL || len == 0);

        if (len == 0)
                return 0;

        const char *const limit = str + len;
        const char *cur = str;
        long count = 0;

        for (; cur < limit; cur = utf_next(cur))
                count++;

        /* Stepping past ‘limit’ means the last character counted was
         * incomplete, so it is taken back.  This does not protect against
         * illegal UTF-8 sequences. */
        return (cur > limit) ? count - 1 : count;
}
|
842
|
+
|
843
|
+
|
844
|
+
/* {{{1
 * Retrieve the number of bytes making up the given UTF-8 string, not counting
 * the terminating NUL byte.
 */
size_t
utf_byte_length(const char *str)
{
        const size_t n_bytes = strlen(str);

        return n_bytes;
}
|
852
|
+
|
853
|
+
|
854
|
+
/* {{{1
|
855
|
+
* The real implementation of utf_reverse() and utf_reverse_n() below.
|
856
|
+
*/
|
857
|
+
static char *
|
858
|
+
utf_reverse_impl(const char *str, size_t len, bool use_len)
|
859
|
+
{
|
860
|
+
if (!use_len)
|
861
|
+
len = utf_byte_length(str);
|
862
|
+
|
863
|
+
char *result = ALLOC_N(char, len + 1);
|
864
|
+
char *r = result + len;
|
865
|
+
const char *p = str;
|
866
|
+
while (*p != NUL) {
|
867
|
+
uint8_t skip = s_utf_skip_lengths[*(unsigned char *)p];
|
868
|
+
r -= skip;
|
869
|
+
for (char *m = r; skip > 0; skip--)
|
870
|
+
*m++ = *p++;
|
871
|
+
}
|
872
|
+
result[len] = 0;
|
873
|
+
|
874
|
+
return result;
|
875
|
+
}
|
876
|
+
|
877
|
+
|
878
|
+
/* {{{1
 * Return a new string which is ‘str’ reversed.
 *
 * Delegates to utf_reverse_impl() with length checking turned off.
 */
char *
utf_reverse(const char *str)
{
        char *reversed = utf_reverse_impl(str, 0, false);

        return reversed;
}
|
886
|
+
|
887
|
+
|
888
|
+
/* {{{1
 * Return a new string which is ‘str’ reversed, examining at most ‘len’ bytes
 * of it.
 *
 * Delegates to utf_reverse_impl() with length checking turned on.
 */
char *
utf_reverse_n(const char *str, size_t len)
{
        char *reversed = utf_reverse_impl(str, len, true);

        return reversed;
}
|
897
|
+
|
898
|
+
|
899
|
+
/* {{{1
 * The real implementation of utf_isvalid() and utf_isvalid_n() below.
 *
 * TODO: this needs optimizing.  Look at glib's new optimized implementation
 * (2.6.0) and also separate the ‘use_max’ into two cases.
 *
 * CONTINUATION_CHAR checks that the byte at ‘p’ is a continuation byte
 * (10xxxxxx) and shifts its six payload bits into ‘val’.  It expects ‘p’,
 * ‘val’ and an ‘error’ label in the scope where it is expanded and jumps to
 * ‘error’ when the byte is not a continuation byte.  It is used as a
 * statement, with the caller supplying the terminating semicolon.
 */
#define CONTINUATION_CHAR do {                                          \
        if ((*(const unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */  \
                goto error;                                             \
        val <<= 6;                                                      \
        val |= (*(const unsigned char *)p) & 0x3f;                      \
} while (0)
|
911
|
+
|
912
|
+
static const char *
|
913
|
+
fast_validate(const char *str)
|
914
|
+
{
|
915
|
+
unichar val = 0;
|
916
|
+
unichar min = 0;
|
917
|
+
const char *p;
|
918
|
+
|
919
|
+
for (p = str; *p != NUL; p++) {
|
920
|
+
if (*(unsigned char *)p < 128)
|
921
|
+
continue;
|
922
|
+
|
923
|
+
const char *last = p;
|
924
|
+
|
925
|
+
if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
|
926
|
+
if ((*(unsigned char *)p & 0x1e) == 0)
|
927
|
+
goto error;
|
928
|
+
p++;
|
929
|
+
if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */
|
930
|
+
goto error;
|
931
|
+
} else {
|
932
|
+
if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
|
933
|
+
min = (1 << 11);
|
934
|
+
val = *(unsigned char *)p & 0x0f;
|
935
|
+
goto two_remaining;
|
936
|
+
} else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
|
937
|
+
min = (1 << 16);
|
938
|
+
val = *(unsigned char *)p & 0x07;
|
939
|
+
} else {
|
940
|
+
goto error;
|
941
|
+
}
|
942
|
+
|
943
|
+
p++;
|
944
|
+
CONTINUATION_CHAR;
|
945
|
+
two_remaining:
|
946
|
+
p++;
|
947
|
+
CONTINUATION_CHAR;
|
948
|
+
p++;
|
949
|
+
CONTINUATION_CHAR;
|
950
|
+
|
951
|
+
if (val < min)
|
952
|
+
goto error;
|
953
|
+
|
954
|
+
if (!UNICODE_ISVALID(val))
|
955
|
+
goto error;
|
956
|
+
}
|
957
|
+
|
958
|
+
continue;
|
959
|
+
error:
|
960
|
+
return last;
|
961
|
+
}
|
962
|
+
|
963
|
+
return p;
|
964
|
+
}
|
965
|
+
|
966
|
+
static const char *
|
967
|
+
fast_validate_len(const char *str, size_t max_len)
|
968
|
+
{
|
969
|
+
unichar val = 0;
|
970
|
+
unichar min = 0;
|
971
|
+
const char *p;
|
972
|
+
|
973
|
+
for (p = str; (size_t)(p - str) < max_len && *p != NUL; p++) {
|
974
|
+
if (*(unsigned char *)p < 128)
|
975
|
+
continue;
|
976
|
+
|
977
|
+
const char *last = p;
|
978
|
+
|
979
|
+
if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
|
980
|
+
if (max_len - (p - str) < 2)
|
981
|
+
goto error;
|
982
|
+
|
983
|
+
if ((*(unsigned char *)p & 0x1e) == 0)
|
984
|
+
goto error;
|
985
|
+
p++;
|
986
|
+
if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */
|
987
|
+
goto error;
|
988
|
+
} else {
|
989
|
+
if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
|
990
|
+
if (max_len - (p - str) < 3)
|
991
|
+
goto error;
|
992
|
+
|
993
|
+
min = (1 << 11);
|
994
|
+
val = *(unsigned char *)p & 0x0f;
|
995
|
+
goto two_remaining;
|
996
|
+
} else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
|
997
|
+
if (max_len - (p - str) < 4)
|
998
|
+
goto error;
|
999
|
+
|
1000
|
+
min = (1 << 16);
|
1001
|
+
val = *(unsigned char *)p & 0x07;
|
1002
|
+
} else {
|
1003
|
+
goto error;
|
1004
|
+
}
|
1005
|
+
|
1006
|
+
p++;
|
1007
|
+
CONTINUATION_CHAR;
|
1008
|
+
two_remaining:
|
1009
|
+
p++;
|
1010
|
+
CONTINUATION_CHAR;
|
1011
|
+
p++;
|
1012
|
+
CONTINUATION_CHAR;
|
1013
|
+
|
1014
|
+
if (val < min)
|
1015
|
+
goto error;
|
1016
|
+
if (!UNICODE_ISVALID(val))
|
1017
|
+
goto error;
|
1018
|
+
}
|
1019
|
+
|
1020
|
+
continue;
|
1021
|
+
error:
|
1022
|
+
return last;
|
1023
|
+
}
|
1024
|
+
|
1025
|
+
return p;
|
1026
|
+
}
|
1027
|
+
|
1028
|
+
|
1029
|
+
/* {{{1
|
1030
|
+
* Check if ‘str’ constitutes a valid UTF-8 character sequence.
|
1031
|
+
*/
|
1032
|
+
bool
|
1033
|
+
utf_isvalid(const char *str)
|
1034
|
+
{
|
1035
|
+
const char *p = fast_validate(str);
|
1036
|
+
|
1037
|
+
return *p == NUL;
|
1038
|
+
}
|
1039
|
+
|
1040
|
+
|
1041
|
+
/* {{{1
 * Check if ‘str’ constitutes a valid UTF-8 character sequence, examining at
 * most ‘max’ bytes.  If ‘end’ is non-‹NULL›, it is set to the position where
 * validation stopped, i.e. the end of the valid range of bytes in ‘str’;
 * this happens whether or not the string turned out to be valid.
 *
 * The result is true only when validation stopped exactly ‘max’ bytes into
 * ‘str’.
 */
bool
utf_isvalid_n(const char *str, size_t max, const char **end)
{
        const char *stopped_at = fast_validate_len(str, max);

        if (end != NULL)
                *end = stopped_at;

        const bool consumed_all = (stopped_at == str + max);
        return consumed_all;
}
|
1057
|
+
|
1058
|
+
|
1059
|
+
/* {{{1
|
1060
|
+
* Check whether ‘c’ is a valid Unicode character.
|
1061
|
+
*/
|
1062
|
+
bool
|
1063
|
+
unichar_isvalid(unichar c)
|
1064
|
+
{
|
1065
|
+
return UNICODE_ISVALID(c);
|
1066
|
+
}
|
1067
|
+
|
1068
|
+
|
1069
|
+
/* {{{1
|
1070
|
+
* Turn an Unicode character (UTF-32) into an UTF-8 character sequence and
|
1071
|
+
* store it in ‘result’, returning the length of the stored sequence.
|
1072
|
+
*/
|
1073
|
+
int
|
1074
|
+
unichar_to_utf(unichar c, char *result)
|
1075
|
+
{
|
1076
|
+
int len = 0;
|
1077
|
+
int first;
|
1078
|
+
|
1079
|
+
if (c < UNI_LEN1) {
|
1080
|
+
first = 0;
|
1081
|
+
len = 1;
|
1082
|
+
} else if (c < UNI_LEN2) {
|
1083
|
+
first = 0xc0;
|
1084
|
+
len = 2;
|
1085
|
+
} else if (c < UNI_LEN3) {
|
1086
|
+
first = 0xe0;
|
1087
|
+
len = 3;
|
1088
|
+
} else if (c < UNI_LEN4) {
|
1089
|
+
first = 0xf0;
|
1090
|
+
len = 4;
|
1091
|
+
} else if (c < UNI_LEN5) {
|
1092
|
+
first = 0xf8;
|
1093
|
+
len = 5;
|
1094
|
+
} else {
|
1095
|
+
first = 0xfc;
|
1096
|
+
len = 6;
|
1097
|
+
}
|
1098
|
+
|
1099
|
+
if (result != NULL) {
|
1100
|
+
for (int i = len - 1; i > 0; i--)
|
1101
|
+
c = PUT_X(c, result[i]);
|
1102
|
+
|
1103
|
+
result[0] = c | first;
|
1104
|
+
}
|
1105
|
+
|
1106
|
+
return len;
|
1107
|
+
}
|
1108
|
+
|
1109
|
+
|
1110
|
+
/* {{{1
|
1111
|
+
* The real implementation of ucs4_to_utf8() and ucs4_to_utf8_n() below.
|
1112
|
+
*/
|
1113
|
+
static char *
|
1114
|
+
ucs4_to_utf8_n_impl(unichar *str, size_t len, bool use_len,
|
1115
|
+
size_t *items_read, size_t *items_written)
|
1116
|
+
{
|
1117
|
+
size_t result_len = 0;
|
1118
|
+
char *result = NULL, *p;
|
1119
|
+
|
1120
|
+
for (size_t i = 0; (!use_len || i < len) && str[i] != NUL; i++) {
|
1121
|
+
if (str[i] >= 0x80000000) {
|
1122
|
+
if (items_read != NULL)
|
1123
|
+
*items_read = i;
|
1124
|
+
|
1125
|
+
rb_raise(rb_eArgError, "UCS-4 input contains character outside of range for UTF-8 (%lc))", str[i]);
|
1126
|
+
}
|
1127
|
+
|
1128
|
+
result_len += _utf_length(str[i]);
|
1129
|
+
}
|
1130
|
+
|
1131
|
+
p = result = ALLOC_N(char, result_len + 1);
|
1132
|
+
size_t i;
|
1133
|
+
for (i = 0; p < result + result_len; i++)
|
1134
|
+
p += unichar_to_utf(str[i], p);
|
1135
|
+
*p = NUL;
|
1136
|
+
|
1137
|
+
if (items_written != NULL)
|
1138
|
+
*items_written = p - result;
|
1139
|
+
if (items_read != NULL)
|
1140
|
+
*items_read = i;
|
1141
|
+
|
1142
|
+
return result;
|
1143
|
+
}
|
1144
|
+
|
1145
|
+
/* {{{1
|
1146
|
+
* Turn an UTF-32 encoded string into an UTF-8 encoded one. If non-‹NULL›,
|
1147
|
+
* store the number of characters read and bytes written in ‘items_read’ and
|
1148
|
+
* ‘items_written’ respectivelly.
|
1149
|
+
*/
|
1150
|
+
char *
|
1151
|
+
ucs4_to_utf8(unichar *str, size_t *items_read, size_t *items_written)
|
1152
|
+
{
|
1153
|
+
return ucs4_to_utf8_n_impl(str, 0, false, items_read, items_written);
|
1154
|
+
}
|
1155
|
+
|
1156
|
+
/* {{{1
|
1157
|
+
* Turn an UTF-32 encoded string into an UTF-8 encoded one. If non-‹NULL›,
|
1158
|
+
* store the number of characters read and bytes written in ‘items_read’ and
|
1159
|
+
* ‘items_written’ respectivelly. Examine at most ‘len’ characters from ‘str’.
|
1160
|
+
*/
|
1161
|
+
char *
|
1162
|
+
ucs4_to_utf8_n(unichar *str, size_t len, size_t *items_read, size_t *items_written)
|
1163
|
+
{
|
1164
|
+
return ucs4_to_utf8_n_impl(str, len, true, items_read, items_written);
|
1165
|
+
}
|
1166
|
+
|
1167
|
+
|
1168
|
+
/* {{{1
|
1169
|
+
* The real implementation of utf8_to_ucs4_fast() and utf8_to_ucs4_fast_n()
|
1170
|
+
* below.
|
1171
|
+
*/
|
1172
|
+
static unichar *
|
1173
|
+
utf8_to_ucs4_fast_impl(const char *str, size_t len, bool use_len, size_t *items_written)
|
1174
|
+
{
|
1175
|
+
assert(str != NULL);
|
1176
|
+
|
1177
|
+
const char *p = str;
|
1178
|
+
size_t n = 0;
|
1179
|
+
if (use_len) {
|
1180
|
+
while (p < str + len && *p != NUL) {
|
1181
|
+
p = utf_next(p);
|
1182
|
+
n++;
|
1183
|
+
}
|
1184
|
+
} else {
|
1185
|
+
while (p != NUL) {
|
1186
|
+
p = utf_next(p);
|
1187
|
+
n++;
|
1188
|
+
}
|
1189
|
+
}
|
1190
|
+
|
1191
|
+
unichar *result = ALLOC_N(unichar, n + 1);
|
1192
|
+
p = str;
|
1193
|
+
size_t i;
|
1194
|
+
for (i = 0; i < n; i++) {
|
1195
|
+
unichar c = ((unsigned char *)p)[0];
|
1196
|
+
int c_len;
|
1197
|
+
|
1198
|
+
if (c < 0x80) {
|
1199
|
+
result[i] = c;
|
1200
|
+
p++;
|
1201
|
+
} else {
|
1202
|
+
/* TODO: use _utf_compute() here */
|
1203
|
+
if (c < 0xe0) {
|
1204
|
+
c_len = 2;
|
1205
|
+
c &= 0x1f;
|
1206
|
+
} else if (c < 0xf0) {
|
1207
|
+
c_len = 3;
|
1208
|
+
c &= 0x0f;
|
1209
|
+
} else if (c < 0xf8) {
|
1210
|
+
c_len = 4;
|
1211
|
+
c &= 0x07;
|
1212
|
+
} else if (c < 0xfc) {
|
1213
|
+
c_len = 5;
|
1214
|
+
c &= 0x03;
|
1215
|
+
} else {
|
1216
|
+
c_len = 6;
|
1217
|
+
c &= 0x01;
|
1218
|
+
}
|
1219
|
+
|
1220
|
+
for (int j = 1; j < c_len; j++) {
|
1221
|
+
c <<= BIT_X;
|
1222
|
+
c |= ((unsigned char *)p)[j] & MASK_X;
|
1223
|
+
}
|
1224
|
+
|
1225
|
+
result[i] = c;
|
1226
|
+
p += c_len;
|
1227
|
+
}
|
1228
|
+
}
|
1229
|
+
result[i] = NUL;
|
1230
|
+
|
1231
|
+
if (items_written != NULL)
|
1232
|
+
*items_written = i;
|
1233
|
+
|
1234
|
+
return result;
|
1235
|
+
}
|
1236
|
+
|
1237
|
+
|
1238
|
+
/* {{{1
|
1239
|
+
* Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
|
1240
|
+
* the number of characters written in ‘items_written’.
|
1241
|
+
*/
|
1242
|
+
unichar *
|
1243
|
+
utf8_to_ucs4_fast(const char *str, size_t *items_written)
|
1244
|
+
{
|
1245
|
+
return utf8_to_ucs4_fast_impl(str, 0, false, items_written);
|
1246
|
+
}
|
1247
|
+
|
1248
|
+
|
1249
|
+
/* {{{1
|
1250
|
+
* Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
|
1251
|
+
* the number of characters written in ‘items_written’. Examine at most ‘len’
|
1252
|
+
* bytes from ‘str’.
|
1253
|
+
*/
|
1254
|
+
unichar *
|
1255
|
+
utf8_to_ucs4_fast_n(const char *str, size_t len, size_t *items_written)
|
1256
|
+
{
|
1257
|
+
return utf8_to_ucs4_fast_impl(str, len, true, items_written);
|
1258
|
+
}
|
1259
|
+
|
1260
|
+
|
1261
|
+
/* {{{1
|
1262
|
+
* The real implementation of utf8_to_ucs4() and utf8_to_ucs4_n() below.
|
1263
|
+
*/
|
1264
|
+
static unichar *
|
1265
|
+
utf8_to_ucs4_impl(const char *str, size_t len, bool use_len, size_t *items_read, size_t *items_written)
|
1266
|
+
{
|
1267
|
+
size_t n = 0;
|
1268
|
+
const char *p = str;
|
1269
|
+
for (; (!use_len || str + len - p > 0) && *p != NUL; p = utf_next(p)) {
|
1270
|
+
unichar c = utf_char_n(p, str + len - p);
|
1271
|
+
if (c & 0x80000000) {
|
1272
|
+
if (c == UTF_INCOMPLETE_INPUT_UNICHAR) {
|
1273
|
+
if (items_read != NULL)
|
1274
|
+
break;
|
1275
|
+
|
1276
|
+
rb_raise(rb_eArgError, "partial character sequence in UTF-8 input");
|
1277
|
+
} else {
|
1278
|
+
rb_raise(rb_eArgError, "UTF-8 input contains character outside of range for UTF-8 (%lc))", c);
|
1279
|
+
}
|
1280
|
+
|
1281
|
+
if (items_read != NULL)
|
1282
|
+
*items_read = p - str;
|
1283
|
+
|
1284
|
+
return NULL;
|
1285
|
+
} else {
|
1286
|
+
n++;
|
1287
|
+
}
|
1288
|
+
}
|
1289
|
+
|
1290
|
+
unichar *result = ALLOC_N(unichar, n + 1);
|
1291
|
+
size_t i;
|
1292
|
+
for (i = 0, p = str; i < n; i++) {
|
1293
|
+
result[i] = utf_char(p);
|
1294
|
+
p = utf_next(p);
|
1295
|
+
}
|
1296
|
+
result[i] = NUL;
|
1297
|
+
|
1298
|
+
if (items_written != NULL)
|
1299
|
+
*items_written = n;
|
1300
|
+
if (items_read != NULL)
|
1301
|
+
*items_read = p - str;
|
1302
|
+
|
1303
|
+
return result;
|
1304
|
+
}
|
1305
|
+
|
1306
|
+
|
1307
|
+
/* {{{1
|
1308
|
+
* Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
|
1309
|
+
* the number of characters written in ‘items_written’. This function does
|
1310
|
+
* additional error-checking on the input.
|
1311
|
+
*/
|
1312
|
+
unichar *
|
1313
|
+
utf8_to_ucs4(const char *str, size_t *items_read, size_t *items_written)
|
1314
|
+
{
|
1315
|
+
return utf8_to_ucs4_impl(str, 0, false, items_read, items_written);
|
1316
|
+
}
|
1317
|
+
|
1318
|
+
|
1319
|
+
/* {{{1
|
1320
|
+
* Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
|
1321
|
+
* the number of characters written in ‘items_written’. Examine at most ‘len’
|
1322
|
+
* bytes from ‘str’. This function does additional error-checking on the
|
1323
|
+
* input.
|
1324
|
+
*/
|
1325
|
+
unichar *
|
1326
|
+
utf8_to_ucs4_n(const char *str, int len, size_t *items_read, size_t *items_written)
|
1327
|
+
{
|
1328
|
+
return utf8_to_ucs4_impl(str, len, true, items_read, items_written);
|
1329
|
+
}
|
1330
|
+
|
1331
|
+
|
1332
|
+
/* }}}1 */
|