character-encodings 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,208 @@
1
+ /*
2
+ * contents: Unicode handling.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+
8
+ #ifndef UNICODE_H
9
+ #define UNICODE_H
10
+
11
+
/* A single Unicode code point, stored as one 32-bit unit. */
typedef uint32_t unichar;

/* Largest value a unichar can hold. */
#define MAXUNICHAR UINT32_MAX

/* Upper bound on the number of bytes one encoded character may occupy. */
#define MAX_UNICHAR_BYTE_LENGTH 6

/* Number of code points in the range U+0000..U+10FFFF. */
#define UNICODE_N_CODEPOINTS 0x110000

/* Sentinel returned by decoding functions when the input is malformed. */
#define UTF_BAD_INPUT_UNICHAR ((unichar)-1)

/* Sentinel returned by decoding functions when the input ends too early. */
#define UTF_INCOMPLETE_INPUT_UNICHAR ((unichar)-2)
26
+
27
+
/*
 * Classification of a code point, as returned by unichar_type().
 * The enumerators are numbered consecutively starting from zero.
 */
typedef enum {
	/* other */
	UNICODE_CONTROL = 0,
	UNICODE_FORMAT,
	UNICODE_UNASSIGNED,
	UNICODE_PRIVATE_USE,
	UNICODE_SURROGATE,

	/* letters */
	UNICODE_LOWERCASE_LETTER,
	UNICODE_MODIFIER_LETTER,
	UNICODE_OTHER_LETTER,
	UNICODE_TITLECASE_LETTER,
	UNICODE_UPPERCASE_LETTER,

	/* marks */
	UNICODE_COMBINING_MARK,
	UNICODE_ENCLOSING_MARK,
	UNICODE_NON_SPACING_MARK,

	/* numbers */
	UNICODE_DECIMAL_NUMBER,
	UNICODE_LETTER_NUMBER,
	UNICODE_OTHER_NUMBER,

	/* punctuation */
	UNICODE_CONNECT_PUNCTUATION,
	UNICODE_DASH_PUNCTUATION,
	UNICODE_CLOSE_PUNCTUATION,
	UNICODE_FINAL_PUNCTUATION,
	UNICODE_INITIAL_PUNCTUATION,
	UNICODE_OTHER_PUNCTUATION,
	UNICODE_OPEN_PUNCTUATION,

	/* symbols */
	UNICODE_CURRENCY_SYMBOL,
	UNICODE_MODIFIER_SYMBOL,
	UNICODE_MATH_SYMBOL,
	UNICODE_OTHER_SYMBOL,

	/* separators */
	UNICODE_LINE_SEPARATOR,
	UNICODE_PARAGRAPH_SEPARATOR,
	UNICODE_SPACE_SEPARATOR
} UnicodeType;
60
+
61
+ bool unichar_isalnum(unichar c);
62
+ bool unichar_isalpha(unichar c);
63
+ bool unichar_iscntrl(unichar c);
64
+ bool unichar_isdigit(unichar c);
65
+ bool unichar_isgraph(unichar c);
66
+ bool unichar_islower(unichar c);
67
+ bool unichar_isprint(unichar c);
68
+ bool unichar_ispunct(unichar c);
69
+ bool unichar_isspace(unichar c);
70
+ bool unichar_isupper(unichar c);
71
+ bool unichar_istitle(unichar c);
72
+ bool unichar_isnewline(unichar c);
73
+ bool unichar_isxdigit(unichar c);
74
+ bool unichar_isassigned(unichar c);
75
+ bool unichar_iswide(unichar c);
76
+ bool unichar_isvalid(unichar c);
77
+
78
+ unichar unichar_toupper(unichar c);
79
+ unichar unichar_tolower(unichar c);
80
+ unichar unichar_totitle(unichar c);
81
+
82
+ int unichar_digit_value(unichar c);
83
+ int unichar_xdigit_value(unichar c);
84
+
85
+ UnicodeType unichar_type(unichar c);
86
+
87
+ bool unichar_mirror(unichar c, unichar *mirrored);
88
+
89
+
/*
 * Line-breaking classification of a code point, as returned by
 * unichar_break_type().  The enumerators are numbered consecutively starting
 * from zero.
 */
typedef enum {
	UNICODE_BREAK_MANDATORY = 0,
	UNICODE_BREAK_CARRIAGE_RETURN,
	UNICODE_BREAK_LINE_FEED,
	UNICODE_BREAK_COMBINING_MARK,
	UNICODE_BREAK_SURROGATE,
	UNICODE_BREAK_ZERO_WIDTH_SPACE,
	UNICODE_BREAK_INSEPARABLE,
	UNICODE_BREAK_NON_BREAKING_GLUE,
	UNICODE_BREAK_CONTINGENT,
	UNICODE_BREAK_SPACE,

	UNICODE_BREAK_AFTER,
	UNICODE_BREAK_BEFORE,
	UNICODE_BREAK_BEFORE_AND_AFTER,
	UNICODE_BREAK_HYPHEN,
	UNICODE_BREAK_NON_STARTER,
	UNICODE_BREAK_OPEN_PUNCTUATION,
	UNICODE_BREAK_CLOSE_PUNCTUATION,
	UNICODE_BREAK_QUOTATION,
	UNICODE_BREAK_EXCLAMATION,
	UNICODE_BREAK_IDEOGRAPHIC,

	UNICODE_BREAK_NUMERIC,
	UNICODE_BREAK_INFIX_SEPARATOR,
	UNICODE_BREAK_SYMBOL,
	UNICODE_BREAK_ALPHABETIC,
	UNICODE_BREAK_PREFIX,
	UNICODE_BREAK_POSTFIX,
	UNICODE_BREAK_COMPLEX_CONTEXT,
	UNICODE_BREAK_AMBIGUOUS,
	UNICODE_BREAK_UNKNOWN,
	UNICODE_BREAK_NEXT_LINE,
	UNICODE_BREAK_WORD_JOINER,

	/* Hangul jamo and syllables */
	UNICODE_BREAK_HANGUL_L_JAMO,
	UNICODE_BREAK_HANGUL_V_JAMO,
	UNICODE_BREAK_HANGUL_T_JAMO,
	UNICODE_BREAK_HANGUL_LV_SYLLABLE,
	UNICODE_BREAK_HANGUL_LVT_SYLLABLE
} UnicodeBreakType;
128
+
129
+ UnicodeBreakType unichar_break_type(unichar c);
130
+
131
+
/*
 * Normalization form selector for utf_normalize() and utf_normalize_n().
 * Each form is available under two names that share one value: a
 * descriptive name and the corresponding NF* abbreviation.
 */
typedef enum {
	NORMALIZE_DEFAULT = 0,
	NORMALIZE_DEFAULT_COMPOSE = 1,
	NORMALIZE_ALL = 2,
	NORMALIZE_ALL_COMPOSE = 3,

	/* aliases */
	NORMALIZE_NFD = NORMALIZE_DEFAULT,
	NORMALIZE_NFC = NORMALIZE_DEFAULT_COMPOSE,
	NORMALIZE_NFKD = NORMALIZE_ALL,
	NORMALIZE_NFKC = NORMALIZE_ALL_COMPOSE
} NormalizeMode;
142
+
143
+ void unicode_canonical_ordering(unichar *str, size_t len);
144
+ unichar *unicode_canonical_decomposition(unichar c, size_t *result_len);
145
+
146
+ char *utf_normalize(const char *str, NormalizeMode mode);
147
+ char *utf_normalize_n(const char *str, NormalizeMode mode, size_t len);
148
+
149
+
150
+
151
+
152
+ char *utf_upcase(const char *str);
153
+ char *utf_upcase_n(const char *str, size_t len);
154
+ char *utf_downcase(const char *str);
155
+ char *utf_downcase_n(const char *str, size_t len);
156
+ char *utf_foldcase(const char *str);
157
+ char *utf_foldcase_n(const char *str, size_t len);
158
+
159
+ unichar utf_char(const char *str);
160
+ unichar utf_char_n(const char *str, size_t max);
161
+ unichar utf_char_validated(const char *str);
162
+ unichar utf_char_validated_n(const char *str, size_t max);
163
+
164
+ extern const char * const s_utf_skip_lengths; /* table indexed by a byte value (0-255); utf_next() adds the entry to its argument */
165
+ #define utf_next(str) ((str) + s_utf_skip_lengths[*(const unsigned char *)(str)]) /* NOTE: evaluates ‘str’ twice, so avoid arguments with side effects; no validation or bounds checking is done */
166
+ char *utf_find_next(const char *p, const char *end);
167
+ char *utf_prev(const char *p);
168
+ char *utf_find_prev(const char *begin, const char *p);
169
+ char *utf_offset_to_pointer(const char *str, long offset);
170
+ long utf_pointer_to_offset(const char *str, const char *pos);
171
+
172
+ void utf_copy(char *dest, const char *src);
173
+ void utf_copy_n(char *dest, const char *src, size_t n);
174
+ void utf_append(char *dest, const char *src);
175
+ void utf_append_n(char *dest, const char *src, size_t n);
176
+ int utf_collate(const char *a, const char *b);
177
+ char *utf_collate_key(const char *str);
178
+ char *utf_collate_key_n(const char *str, size_t len);
179
+ int utf_char_index(const char *str, unichar c);
180
+ int utf_char_index_n(const char *str, unichar c, size_t len);
181
+ int utf_char_rindex(const char *str, unichar c);
182
+ int utf_char_rindex_n(const char *str, unichar c, size_t len);
183
+ int utf_index(const char *haystack, const char *needle);
184
+ int utf_index_n(const char *haystack, const char *needle, size_t len);
185
+ int utf_rindex(const char *haystack, const char *needle);
186
+ int utf_rindex_n(const char *haystack, const char *needle, size_t len);
187
+ bool utf_has_prefix(const char *str, const char *prefix);
188
+ long utf_length(const char *str);
189
+ long utf_length_n(const char *str, long len);
190
+ size_t utf_width(const char *str);
191
+ size_t utf_width_n(const char *str, size_t len);
192
+ size_t utf_byte_length(const char *str);
193
+ char *utf_reverse(const char *str);
194
+ char *utf_reverse_n(const char *str, size_t len);
195
+
196
+ bool utf_isvalid(const char *str);
197
+ bool utf_isvalid_n(const char *str, size_t max, const char **end);
198
+
199
+ /* XXX: should probably name stuff utf32 instead of ucs4 */
200
+ int unichar_to_utf(unichar c, char *result);
201
+ char *ucs4_to_utf8(unichar *str, size_t *items_read, size_t *items_written);
202
+ char *ucs4_to_utf8_n(unichar *str, size_t len, size_t *items_read, size_t *items_written);
203
+ unichar *utf8_to_ucs4_fast(const char *str, size_t *items_written);
204
+ unichar *utf8_to_ucs4_fast_n(const char *str, size_t len, size_t *items_written);
205
+ unichar *utf8_to_ucs4(const char *str, size_t *items_read, size_t *items_written);
206
+ unichar *utf8_to_ucs4_n(const char *str, int len, size_t *items_read, size_t *items_written);
207
+
208
+ #endif /* UNICODE_H */
@@ -0,0 +1,1332 @@
1
+ /*
2
+ * contents: UTF-8 string operations.
3
+ *
4
+ * Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
5
+ */
6
+
7
+
8
+ #include <ruby.h>
9
+ #include <assert.h>
10
+ #include <locale.h>
11
+ #include <stdbool.h>
12
+ #include <stddef.h>
13
+ #include <stdint.h>
14
+ #include <stdlib.h>
15
+ #include <string.h>
16
+ #include <wchar.h>
17
+
18
+ #include "unicode.h"
19
+ #include "private.h"
20
+
21
+
/*
 * True if ‘c’ lies inside the Unicode range (below 0x110000), is not a
 * surrogate (0xD800..0xDFFF), is not in 0xFDD0..0xFDEF, and does not have
 * 0xFFFE or 0xFFFF as its low 16 bits.
 *
 * The surrogate test masks off the low 11 bits; the mask is 0xfffff800 (the
 * previous nine-digit constant 0xffffff800 only worked through promotion to
 * a wider type).  ‘c’ is evaluated several times, so it must not have side
 * effects.
 */
#define UNICODE_ISVALID(c) \
	((c) < 0x110000 && \
	 (((c) & 0xfffff800) != 0xd800) && \
	 ((c) < 0xfdd0 || (c) > 0xfdef) && \
	 ((c) & 0xfffe) != 0xfffe)
27
+
28
+
/* {{{1
 * Constants used for the bit-twiddling necessary when dealing with UTF-8
 * character sequences.
 *
 *   BIT_n     position of the zero bit that ends the length marker in the
 *             lead byte of an n-byte sequence (BIT_X: in a continuation byte)
 *   OCT_n     the marker itself, i.e. all bits above BIT_n set
 *   UNI_LENn  first code point that no longer fits in an n-byte sequence
 *   MASK_X    payload bits of a continuation byte
 *   TEST_X    bits inspected to recognise a continuation byte
 */
enum {
	BIT_1 = 7,
	BIT_X = 6,
	BIT_2 = 5,
	BIT_3 = 4,
	BIT_4 = 3,
	BIT_5 = 2,
	BIT_6 = 1,

	OCT_1 = ((1 << (BIT_1 + 1)) - 1) ^ 0xff,	/* 0000 0000 */
	OCT_X = ((1 << (BIT_X + 1)) - 1) ^ 0xff,	/* 1000 0000 */
	OCT_2 = ((1 << (BIT_2 + 1)) - 1) ^ 0xff,	/* 1100 0000 */
	OCT_3 = ((1 << (BIT_3 + 1)) - 1) ^ 0xff,	/* 1110 0000 */
	OCT_4 = ((1 << (BIT_4 + 1)) - 1) ^ 0xff,	/* 1111 0000 */
	OCT_5 = ((1 << (BIT_5 + 1)) - 1) ^ 0xff,	/* 1111 1000 */
	OCT_6 = ((1 << (BIT_6 + 1)) - 1) ^ 0xff,	/* 1111 1100 */

	UNI_LEN1 = 0x80,
	UNI_LEN2 = 0x800,
	UNI_LEN3 = 0x10000,
	UNI_LEN4 = 0x200000,
	UNI_LEN5 = 0x4000000,

	MASK_X = (1 << BIT_X) - 1,		/* 0011 1111 */
	TEST_X = MASK_X ^ 0xff,			/* 1100 0000 */
};

/* {{{1
 * Determine whether the byte ‘p’ is a continuation byte (10xx xxxx) of a
 * UTF-8 multi-byte sequence.  The argument is parenthesised so that the cast
 * applies to the whole expression passed in.
 */
#define CONT_X(p) ((((unsigned char)(p)) & TEST_X) == OCT_X)

/* {{{1
 * Add the payload bits of continuation byte ‘p’ to ‘c’, which is first
 * shifted left by BIT_X bits to make room for them.  Yields the new value;
 * ‘c’ itself is not modified.
 */
#define ADD_X(c, p) (((c) << BIT_X) | (((unsigned char)(p)) & MASK_X))

/* {{{1
 * Store the low BIT_X bits of ‘c’ into ‘p’ as a continuation byte and yield
 * ‘c’ with those bits shifted off.  ‘c’ is evaluated twice.
 */
#define PUT_X(c, p) ((p) = OCT_X | ((c) & MASK_X), (c) >> BIT_X)
75
+
76
+
/* {{{1
 * s_utf_skip_lengths: maps the first byte of a UTF-8 character sequence to
 * the number of bytes to advance.  Bytes that cannot start a sequence
 * (0x80..0xbf, 0xfe and 0xff) map to 1.
 */
static const uint8_t s_utf_skip_length_data[256] = {
	/* 0x00..0x7f: single byte */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80..0xbf: continuation bytes */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xc0..0xdf: two bytes */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xe0..0xef: three bytes */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xf0..0xff: four, five and six bytes, then 0xfe and 0xff */
	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};


const char * const s_utf_skip_lengths = (const char *)s_utf_skip_length_data;
94
+
95
+
96
+
/* {{{1
 * Private function that classifies the lead byte ‘c’ of a UTF-8 character
 * sequence.  On success ‘*len’ receives the sequence length (1..6) and
 * ‘*mask’ the mask selecting the payload bits of the lead byte.  If ‘c’
 * cannot start a sequence, ‘*len’ is set to -1 and ‘*mask’ is left untouched.
 */
static inline void
_utf_compute(unsigned char c, int *mask, int *len)
{
	static const struct {
		unsigned char select;	/* bits of the lead byte to inspect */
		unsigned char marker;	/* value those bits must have */
		int mask;
		int len;
	} forms[] = {
		{ 0x80, 0x00, 0x7f, 1 },
		{ 0xe0, 0xc0, 0x1f, 2 },
		{ 0xf0, 0xe0, 0x0f, 3 },
		{ 0xf8, 0xf0, 0x07, 4 },
		{ 0xfc, 0xf8, 0x03, 5 },
		{ 0xfe, 0xfc, 0x01, 6 },
	};

	for (size_t i = 0; i < sizeof(forms) / sizeof(forms[0]); i++) {
		if ((c & forms[i].select) == forms[i].marker) {
			*len = forms[i].len;
			*mask = forms[i].mask;
			return;
		}
	}

	*len = -1;
}
126
+
127
+ /* {{{1
128
+ * Private function used to figure out the length of the UTF-8 representation
129
+ * of a given Unicode character (UTF-32).
130
+ */
131
+ static inline unsigned short
132
+ _utf_length(const unichar c)
133
+ {
134
+ if (c < UNI_LEN1)
135
+ return 1;
136
+ else if (c < UNI_LEN2)
137
+ return 2;
138
+ else if (c < UNI_LEN3)
139
+ return 3;
140
+ else if (c < UNI_LEN4)
141
+ return 4;
142
+ else if (c < UNI_LEN5)
143
+ return 5;
144
+ else
145
+ return 6;
146
+ }
147
+
148
+ /* {{{1
149
+ * Private function used to retrieve a UTF-32 character from an UTF-8 character
150
+ * sequence given a mask and length previously retrieved with _utf_compute().
151
+ */
152
+ static inline unichar
153
+ _utf_get(const char *str, int mask, int len)
154
+ {
155
+ unichar c = (unsigned char)str[0] & mask;
156
+
157
+ for (int i = 1; i < len; i++) {
158
+ unsigned char ch = ((const unsigned char *)str)[i];
159
+
160
+ if (CONT_X(ch)) {
161
+ c = ADD_X(c, ch);
162
+ } else {
163
+ c = UTF_BAD_INPUT_UNICHAR;
164
+ break;
165
+ }
166
+ }
167
+
168
+ return c;
169
+ }
170
+
171
+
172
+ /* {{{1
173
+ * Retrieve a UTF-32 character from a UTF-8 character sequence.
174
+ */
175
+ unichar
176
+ utf_char(const char *str)
177
+ {
178
+ int mask;
179
+ int len;
180
+
181
+ _utf_compute(*str, &mask, &len);
182
+
183
+ return (len > -1) ? _utf_get(str, mask, len) : UTF_BAD_INPUT_UNICHAR;
184
+ }
185
+
186
+
187
+ /* {{{1
188
+ * TODO
189
+ */
190
+ unichar
191
+ utf_char_n(const char *str, size_t max)
192
+ {
193
+ size_t len;
194
+ unichar c = (unsigned char)*str;
195
+
196
+ /* TODO: _utf_compute() here */
197
+ if (c < 0x80) {
198
+ return c;
199
+ } else if (c < 0xc0) {
200
+ return UTF_BAD_INPUT_UNICHAR;
201
+ } else if (c < 0xe0) {
202
+ len = 2;
203
+ c &= 0x1f;
204
+ } else if (c < 0xf0) {
205
+ len = 3;
206
+ c &= 0x0f;
207
+ } else if (c < 0xf8) {
208
+ len = 4;
209
+ c &= 0x07;
210
+ } else if (c < 0xfc) {
211
+ len = 5;
212
+ c &= 0x03;
213
+ } else if (c < 0xfe) {
214
+ len = 6;
215
+ c &= 0x01;
216
+ } else {
217
+ return UTF_BAD_INPUT_UNICHAR;
218
+ }
219
+
220
+ if (len > max) {
221
+ for (size_t i = 1; i < max; i++) {
222
+ if (!CONT_X(str[i]))
223
+ return UTF_BAD_INPUT_UNICHAR;
224
+ }
225
+
226
+ return UTF_INCOMPLETE_INPUT_UNICHAR;
227
+ }
228
+
229
+ for (size_t i = 1; i < len; i++) {
230
+ unsigned char ch = ((const unsigned char *)str)[i];
231
+
232
+ if (!CONT_X(ch))
233
+ return (ch != NUL) ? UTF_BAD_INPUT_UNICHAR : UTF_INCOMPLETE_INPUT_UNICHAR;
234
+
235
+ c = ADD_X(c, ch);
236
+ }
237
+
238
+ return (_utf_length(c) == len) ? c : UTF_BAD_INPUT_UNICHAR;
239
+ }
240
+
241
+
242
+ /* {{{1
243
+ * Retrieve a UTF-32 character from a UTF-8 character sequence. This function
244
+ * does additional checking while converitng, such as not overruning a maximum
245
+ * length and checks for incomplete, invalid or out-of-range characters.
246
+ */
247
+ unichar
248
+ utf_char_validated(const char *str)
249
+ {
250
+ unichar result = utf_char(str);
251
+
252
+ if (result & 0x80000000) {
253
+ return result;
254
+ } else if (!unichar_isvalid(result)) {
255
+ return UTF_BAD_INPUT_UNICHAR;
256
+ } else {
257
+ return result;
258
+ }
259
+ }
260
+
261
+
262
+ /* {{{1 */
263
+ unichar
264
+ utf_char_validated_n(const char *str, size_t max)
265
+ {
266
+ unichar result = utf_char_n(str, max);
267
+
268
+ if (result & 0x80000000) {
269
+ return result;
270
+ } else if (!unichar_isvalid(result)) {
271
+ return UTF_BAD_INPUT_UNICHAR;
272
+ } else {
273
+ return result;
274
+ }
275
+ }
276
+
277
+
278
+ /* {{{1
279
+ * Return a pointer to the next UTF-8 character sequence in ‘str’. This
280
+ * requires that it is at the start of the previous one already and no
281
+ * additional error checking is done.
282
+ */
283
+ /*
284
+ inline char *
285
+ utf_next(const char *str)
286
+ {
287
+ return (char *)str + s_utf_skip_lengths[*(const uchar *)str];
288
+ }
289
+ */
290
+
291
+
292
+ /* {{{1
293
+ * Synchronize and go to the next UTF-8 character sequence in ‘p’. This search
294
+ * will not go beyond ‘end’. ‹NULL› is returned if it couldn't be found.
295
+ */
296
+ char *
297
+ utf_find_next(const char *p, const char *end)
298
+ {
299
+ if (*p != NUL) {
300
+ if (end != NULL) {
301
+ for (p++; p < end && CONT_X(*p); p++) {
302
+ /* this loop intentionally left empty */
303
+ }
304
+ } else {
305
+ for (p++; CONT_X(*p); p++) {
306
+ /* this loop intentionally left empty */
307
+ }
308
+ }
309
+ }
310
+ return (p == end) ? NULL : (char *)p;
311
+ }
312
+
313
+
/* {{{1
 * Return a pointer to the start of the UTF-8 character sequence preceding
 * ‘p’: step backwards until a byte that is not a continuation byte is found.
 * No lower bound is checked; see utf_find_prev() for a bounded variant.
 */
char *
utf_prev(const char *p)
{
	do {
		p--;
	} while (CONT_X(*p));

	return (char *)p;
}
327
+
328
+
/* {{{1
 * Synchronize and go to the previous UTF-8 character sequence before ‘p’,
 * never looking at bytes before ‘begin’.  Returns ‹NULL› if no byte that
 * starts a sequence is found in that range.
 */
char *
utf_find_prev(const char *begin, const char *p)
{
	while (--p >= begin) {
		if (!CONT_X(*p))
			return (char *)p;
	}

	return NULL;
}
344
+
345
+
/* {{{1
 * Convert a character offset into a pointer within ‘str’.  A positive
 * ‘offset’ walks forward that many characters; a negative one walks
 * backwards from ‘str’.  No bounds checking is done in either direction.
 */
char *
utf_offset_to_pointer(const char *str, long offset)
{
	const char *p = str;

	if (offset > 0) {
		for (; offset > 0; offset--)
			p = utf_next(p);

		return (char *)p;
	}

	/* Going backwards: jump back one byte per outstanding character, move
	 * on to the start of the sequence we landed in, then add back the
	 * number of characters actually crossed.  Repeat until none remain. */
	while (offset != 0) {
		const char *base = p;

		p += offset;
		while ((*p & 0xc0) == 0x80)
			p--;

		offset += utf_pointer_to_offset(p, base);
	}

	return (char *)p;
}
371
+
372
+
/* {{{1
 * Convert the pointer ‘pos’ into a character offset relative to ‘str’.  The
 * result is negative when ‘pos’ lies before ‘str’.
 */
long
utf_pointer_to_offset(const char *str, const char *pos)
{
	if (pos < str)
		return -utf_pointer_to_offset(pos, str);

	long count = 0;
	const char *p = str;

	while (p < pos) {
		p = utf_next(p);
		count++;
	}

	return count;
}
388
+
389
+
/* {{{1
 * Copy the NUL-terminated UTF-8 string ‘src’, terminator included, into
 * ‘dest’.  ‘dest’ must be large enough; no bounds checking is done.
 */
void
utf_copy(char *dest, const char *src)
{
	memcpy(dest, src, strlen(src) + 1);
}
398
+
399
+
400
+ /* {{{1
401
+ * Copy at most n Unicode characters from an UTF-8 string to another. The
402
+ * destination string will be ‹NUL›-terminated properly.
403
+ */
404
+ void
405
+ utf_copy_n(char *dest, const char *src, size_t n)
406
+ {
407
+ const char *p;
408
+
409
+ for (p = src; n > 0 && *p != NUL; p = utf_next(p), n--) {
410
+ /* this loop intentionally left empty */;
411
+ }
412
+
413
+ strncpy(dest, src, p - src);
414
+ dest[p - src] = NUL;
415
+ }
416
+
417
+
/* {{{1
 * Append the NUL-terminated UTF-8 string ‘src’ to the end of the
 * NUL-terminated string in ‘dest’.  ‘dest’ must be large enough; no bounds
 * checking is done.
 */
void
utf_append(char *dest, const char *src)
{
	strcpy(dest + strlen(dest), src);
}
426
+
427
+
428
+ /* {{{1
429
+ * Append at most ‘n’ Unicode character from an UTF-8 string onto another.
430
+ */
431
+ void
432
+ utf_append_n(char *dest, const char *src, size_t n)
433
+ {
434
+ const char *p;
435
+
436
+ for (p = src; n > 0 && *p != NUL; p = utf_next(p), n--) {
437
+ /* this loop intentionally left empty */;
438
+ }
439
+
440
+ strncat(dest, src, p - src);
441
+ dest[p - src] = NUL;
442
+ }
443
+
444
+
445
+ /* {{{1
446
+ * Compare two strings for ordering using the linguistically correct rules of
447
+ * the current locale.
448
+ */
449
+ int
450
+ utf_collate(const char *a, const char *b)
451
+ {
452
+ assert(a != NULL);
453
+ assert(b != NULL);
454
+
455
+ unichar *a_norm = _utf_normalize_wc(a, 0, false, NORMALIZE_ALL_COMPOSE);
456
+ unichar *b_norm = _utf_normalize_wc(b, 0, false, NORMALIZE_ALL_COMPOSE);
457
+ setlocale(LC_COLLATE, "");
458
+ int result = wcscoll((wchar_t *)a_norm, (wchar_t *)b_norm);
459
+
460
+ free(a_norm);
461
+ free(b_norm);
462
+
463
+ return result;
464
+ }
465
+
466
+
/* {{{1
 * We need a UTF-8-style encoding of numbers to encode the weights produced
 * by wcsxfrm.  However, the values are not Unicode characters, so we can't
 * simply use unichar_to_utf.
 *
 * Writes the encoding of ‘c’ to ‘buf’ and returns the number of bytes it
 * occupies.  If ‘buf’ is ‹NULL› nothing is written and only the length is
 * returned.
 *
 * The following routine is taken (with modification) from GNU
 * libc's strxfrm routine:
 *
 * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
 * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
 */
static inline int
_utf_encode(char *buf, wchar_t c)
{
	if (c < 0x80) {
		if (buf != NULL)
			*buf = (char)c;
		return 1;
	}

	/* Smallest length in 2..5 whose payload can hold ‘c’, otherwise 6. */
	int step = 2;
	while (step < 6 && (c & (~(uint32_t)0 << (5 * step + 1))) != 0)
		step++;

	if (buf == NULL)
		return step;

	const int length = step;

	/* Lead byte marker first, then the continuation bytes from the last
	 * one backwards; what is left of ‘c’ goes into the lead byte. */
	*buf = (unsigned char)(~0xff >> step);
	for (int i = step - 1; i > 0; i--)
		c = PUT_X(c, buf[i]);
	*buf |= c;

	return length;
}
509
+
510
+
511
+ /* {{{1
512
+ * Generate a collation key from a string which can be compared with other
513
+ * collation keys using str_compare().
514
+ */
515
+ static char *
516
+ utf_collate_key_impl(const char *str, size_t len, bool use_len)
517
+ {
518
+ assert(str != NULL);
519
+
520
+ unichar *str_norm = _utf_normalize_wc(str, len, use_len, NORMALIZE_ALL_COMPOSE);
521
+ setlocale(LC_COLLATE, "");
522
+ size_t xfrm_len = wcsxfrm(NULL, (wchar_t *)str_norm, 0);
523
+ wchar_t result_wc[xfrm_len + 1];
524
+ wcsxfrm(result_wc, (wchar_t *)str_norm, xfrm_len + 1);
525
+
526
+ int result_len = 0;
527
+ for (size_t i = 0; i < xfrm_len; i++)
528
+ result_len += _utf_encode(NULL, result_wc[i]);
529
+
530
+ char *result = ALLOC_N(char, result_len + 1);
531
+ result_len = 0;
532
+ for (size_t i = 0; i < xfrm_len; i++)
533
+ result_len += _utf_encode(result + result_len, result_wc[i]);
534
+ result[result_len] = NUL;
535
+
536
+ free(str_norm);
537
+
538
+ return result;
539
+ }
540
+
541
+
542
+ /* {{{1
543
+ * Generate a collation key from the string ‘str’.  Calls
544
+ * utf_collate_key_impl() with a length of 0 and use_len set to false.
545
+ */
546
+ char *
547
+ utf_collate_key(const char *str)
548
+ {
549
+ return utf_collate_key_impl(str, 0, false);
550
+ }
551
+
552
+
553
+ /* {{{1
554
+ * Generate a collation key from the string ‘str’ (of length ‘len’).  Calls
555
+ * utf_collate_key_impl() with ‘len’ and use_len set to true.
556
+ */
557
+ char *
558
+ utf_collate_key_n(const char *str, size_t len)
559
+ {
560
+ return utf_collate_key_impl(str, len, true);
561
+ }
562
+
563
+
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of ‘needle’ in
 * ‘haystack’, looking at no more than ‘haystack_len’ bytes and stopping at
 * the first NUL byte.  Returns -1 if there is no match and 0 for an empty
 * ‘needle’.
 */
static int
str_index_n(const char *haystack, const char *needle, size_t haystack_len)
{
	assert(haystack != NULL);
	assert(needle != NULL);

	size_t needle_len = strlen(needle);

	if (needle_len == 0)
		return 0;

	if (haystack_len < needle_len)
		return -1;

	/* Last position at which ‘needle’ still fits inside the window. */
	const char *last = haystack + (haystack_len - needle_len);

	/* The bound is tested before ‘*p’ is read, so no byte beyond the
	 * ‘haystack_len’ window is ever touched. */
	for (const char *p = haystack; p <= last && *p != '\0'; p++) {
		if (strncmp(p, needle, needle_len) == 0)
			return (int)(p - haystack);
	}

	return -1;
}
597
+
598
+
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of ‘needle’ in the
 * NUL-terminated ‘haystack’, or -1 if it doesn't exist.  An empty ‘needle’
 * matches at the end, so the length of ‘haystack’ is returned.
 */
static int
str_rindex(const char *haystack, const char *needle)
{
	assert(haystack != NULL);
	assert(needle != NULL);

	size_t needle_len = strlen(needle);
	size_t haystack_len = strlen(haystack);

	if (needle_len == 0)
		return (int)haystack_len;

	if (haystack_len < needle_len)
		return -1;

	/* Walk the candidate positions from right to left by index, so that no
	 * pointer before the start of ‘haystack’ is ever formed. */
	for (size_t i = haystack_len - needle_len + 1; i-- > 0; ) {
		if (strncmp(haystack + i, needle, needle_len) == 0)
			return (int)i;
	}

	return -1;
}
632
+
633
+
/* {{{1
 * Retrieve the byte offset of the right-most occurrence of ‘needle’ in
 * ‘haystack’, looking at no more than ‘haystack_len’ bytes and stopping at
 * the first NUL byte.  Returns -1 if it doesn't exist.  An empty ‘needle’
 * matches at the end of the examined region.
 */
static int
str_rindex_n(const char *haystack, const char *needle, size_t haystack_len)
{
	assert(haystack != NULL);
	assert(needle != NULL);

	size_t needle_len = strlen(needle);

	/* Number of bytes actually available: up to the limit or the NUL. */
	size_t scanned = 0;
	while (scanned < haystack_len && haystack[scanned] != '\0')
		scanned++;

	if (scanned < needle_len)
		return -1;

	/* Walk the candidate positions from right to left by index, so that no
	 * pointer outside ‘haystack’ is ever formed. */
	for (size_t i = scanned - needle_len + 1; i-- > 0; ) {
		if (strncmp(haystack + i, needle, needle_len) == 0)
			return (int)i;
	}

	return -1;
}
670
+
671
+
672
+ /* {{{1
673
+ * Retrieve the index of the left-most occurence of ‘c’ in ‘str’, or -1 if it
674
+ * doesn't exist.
675
+ */
676
+ int
677
+ utf_char_index(const char *str, unichar c)
678
+ {
679
+ char ch[7];
680
+
681
+ ch[unichar_to_utf(c, ch)] = NUL;
682
+ char *p = strstr(str, ch);
683
+ return (p != NULL) ? p - str : -1;
684
+ }
685
+
686
+
687
+ /* {{{1
688
+ * Retrieve the index of the left-most occurence of ‘c’ in ‘str’, or -1 if it
689
+ * doesn't exist, going over at most ‘len’ bytes in ‘str’.
690
+ */
691
+ int
692
+ utf_char_index_n(const char *str, unichar c, size_t len)
693
+ {
694
+ char ch[7];
695
+
696
+ ch[unichar_to_utf(c, ch)] = NUL;
697
+
698
+ return str_index_n(str, ch, len);
699
+ }
700
+
701
+
702
+ /* {{{1
703
+ * Retrieve the index of the right-most occurence of ‘c’ in ‘str’, or -1 if it
704
+ * doesn't exist.
705
+ */
706
+ int
707
+ utf_char_rindex(const char *str, unichar c)
708
+ {
709
+ char ch[7];
710
+
711
+ ch[unichar_to_utf(c, ch)] = NUL;
712
+
713
+ return str_rindex(str, ch);
714
+ }
715
+
716
+
717
+ /* {{{1
718
+ * Retrieve the index of the right-most occurence of ‘c’ in ‘str’, or -1 if it
719
+ * doesn't exist, going over at most ‘len’ bytes in ‘str’.
720
+ */
721
+ int
722
+ utf_char_rindex_n(const char *str, unichar c, size_t len)
723
+ {
724
+ char ch[7];
725
+
726
+ ch[unichar_to_utf(c, ch)] = NUL;
727
+
728
+ return str_rindex_n(str, ch, len);
729
+ }
730
+
731
+
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of ‘needle’ in the
 * NUL-terminated ‘haystack’, or -1 if it doesn't exist.  An empty ‘needle’
 * matches at offset 0.
 */
int
utf_index(const char *haystack, const char *needle)
{
	const char *hit = strstr(haystack, needle);

	/* strstr() returns NULL when there is no match; subtracting ‘haystack’
	 * from that would be undefined, so report -1 explicitly. */
	return (hit != NULL) ? (int)(hit - haystack) : -1;
}
741
+
742
+
743
+ /* {{{1
744
+ * Retrieve the byte offset of the left-most occurrence of ‘needle’ in
745
+ * ‘haystack’, or -1 if it doesn't exist, going over at most ‘len’ bytes in
+ * ‘haystack’.  Thin wrapper around str_index_n().
746
+ */
747
+ int
748
+ utf_index_n(const char *haystack, const char *needle, size_t len)
749
+ {
750
+ return str_index_n(haystack, needle, len);
751
+ }
752
+
753
+
754
+ /* {{{1
755
+ * Retrieve the byte offset of the right-most occurrence of ‘needle’ in the
756
+ * NUL-terminated ‘haystack’, or -1 if it doesn't exist.  Thin wrapper
+ * around str_rindex().
757
+ */
758
+ int
759
+ utf_rindex(const char *haystack, const char *needle)
760
+ {
761
+ return str_rindex(haystack, needle);
762
+ }
763
+
764
+
765
+ /* {{{1
766
+ * Retrieve the byte offset of the right-most occurrence of ‘needle’ in
767
+ * ‘haystack’, or -1 if it doesn't exist, going over at most ‘len’ bytes in
+ * ‘haystack’.  Thin wrapper around str_rindex_n().
768
+ */
769
+ int
770
+ utf_rindex_n(const char *haystack, const char *needle, size_t len)
771
+ {
772
+ return str_rindex_n(haystack, needle, len);
773
+ }
774
+
775
+
/* {{{1
 * Check if the NUL-terminated string ‘str’ begins with ‘prefix’.  The
 * comparison is byte-wise; an empty ‘prefix’ always matches.
 */
bool
utf_has_prefix(const char *str, const char *prefix)
{
	assert(str != NULL);
	assert(prefix != NULL);

	for (;; str++, prefix++) {
		if (*prefix == '\0')
			return true;

		if (*str == '\0' || *str != *prefix)
			return false;
	}
}
794
+
795
+
/* {{{1
 * Retrieve the number of UTF-8 encoded Unicode characters in the
 * NUL-terminated string ‘str’.  Sequence lengths are taken from the lead
 * bytes via utf_next(); the input is not validated.
 */
long
utf_length(const char *str)
{
	assert(str != NULL);

	long count = 0;

	for (const char *p = str; *p != '\0'; p = utf_next(p))
		count++;

	return count;
}
813
+
814
+
815
/* {{{1
 * Count the UTF-8 encoded Unicode characters found in the first ‘len’ bytes
 * of ‘str’.  ‘str’ may be ‹NULL› only when ‘len’ is 0.
 *
 * A character whose bytes extend beyond ‘len’ is not counted.  Illegal UTF-8
 * sequences are not detected.
 */
long
utf_length_n(const char *str, long len)
{
        assert(str != NULL || len == 0);

        if (len == 0)
                return 0;

        const char *const limit = str + len;
        const char *cursor = str;
        long count = 0;

        for (; cursor < limit; cursor = utf_next(cursor))
                count++;

        /* Stepping past ‘limit’ means the last character counted was
         * incomplete, so take it back out of the total. */
        return (cursor > limit) ? count - 1 : count;
}
842
+
843
+
844
/* {{{1
 * Report how many bytes the NUL-terminated UTF-8 string ‘str’ occupies, not
 * counting the terminator (this is simply strlen()).
 */
size_t
utf_byte_length(const char *str)
{
        const size_t n_bytes = strlen(str);

        return n_bytes;
}
852
+
853
+
854
+ /* {{{1
855
+ * The real implementation of utf_reverse() and utf_reverse_n() below.
856
+ */
857
+ static char *
858
+ utf_reverse_impl(const char *str, size_t len, bool use_len)
859
+ {
860
+ if (!use_len)
861
+ len = utf_byte_length(str);
862
+
863
+ char *result = ALLOC_N(char, len + 1);
864
+ char *r = result + len;
865
+ const char *p = str;
866
+ while (*p != NUL) {
867
+ uint8_t skip = s_utf_skip_lengths[*(unsigned char *)p];
868
+ r -= skip;
869
+ for (char *m = r; skip > 0; skip--)
870
+ *m++ = *p++;
871
+ }
872
+ result[len] = 0;
873
+
874
+ return result;
875
+ }
876
+
877
+
878
/* {{{1
 * Build a new string holding the characters of the NUL-terminated string
 * ‘str’ in reverse order.  The buffer is allocated by utf_reverse_impl().
 */
char *
utf_reverse(const char *str)
{
        char *reversed = utf_reverse_impl(str, 0, false);

        return reversed;
}
886
+
887
+
888
/* {{{1
 * Build a new string holding the characters of ‘str’ in reverse order,
 * looking at no more than ‘len’ bytes of it.  The buffer is allocated by
 * utf_reverse_impl().
 */
char *
utf_reverse_n(const char *str, size_t len)
{
        char *reversed = utf_reverse_impl(str, len, true);

        return reversed;
}
897
+
898
+
899
/* {{{1
 * The real implementation of utf_isvalid() and utf_isvalid_n() below.
 *
 * TODO: this needs optimizing.  Look at glib's new optimized implementation
 * (2.6.0) and also separate the ‘use_max’ into two cases.
 */

/*
 * Check that the byte at ‘p’ is a UTF-8 continuation byte (10xxxxxx) and fold
 * its six payload bits into ‘val’; otherwise jump to the ‘error’ label.
 *
 * The expansion site must provide ‘p’, ‘val’ and an ‘error’ label.  There is
 * deliberately no semicolon after ‘while (0)’: the caller supplies it, which
 * keeps the macro usable as a single statement (e.g. before an ‘else’).
 */
#define CONTINUATION_CHAR do { \
        if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */ \
                goto error; \
        val <<= 6; \
        val |= (*(unsigned char *)p) & 0x3f; \
} while (0)
911
+
912
+ static const char *
913
+ fast_validate(const char *str)
914
+ {
915
+ unichar val = 0;
916
+ unichar min = 0;
917
+ const char *p;
918
+
919
+ for (p = str; *p != NUL; p++) {
920
+ if (*(unsigned char *)p < 128)
921
+ continue;
922
+
923
+ const char *last = p;
924
+
925
+ if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
926
+ if ((*(unsigned char *)p & 0x1e) == 0)
927
+ goto error;
928
+ p++;
929
+ if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */
930
+ goto error;
931
+ } else {
932
+ if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
933
+ min = (1 << 11);
934
+ val = *(unsigned char *)p & 0x0f;
935
+ goto two_remaining;
936
+ } else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
937
+ min = (1 << 16);
938
+ val = *(unsigned char *)p & 0x07;
939
+ } else {
940
+ goto error;
941
+ }
942
+
943
+ p++;
944
+ CONTINUATION_CHAR;
945
+ two_remaining:
946
+ p++;
947
+ CONTINUATION_CHAR;
948
+ p++;
949
+ CONTINUATION_CHAR;
950
+
951
+ if (val < min)
952
+ goto error;
953
+
954
+ if (!UNICODE_ISVALID(val))
955
+ goto error;
956
+ }
957
+
958
+ continue;
959
+ error:
960
+ return last;
961
+ }
962
+
963
+ return p;
964
+ }
965
+
966
+ static const char *
967
+ fast_validate_len(const char *str, size_t max_len)
968
+ {
969
+ unichar val = 0;
970
+ unichar min = 0;
971
+ const char *p;
972
+
973
+ for (p = str; (size_t)(p - str) < max_len && *p != NUL; p++) {
974
+ if (*(unsigned char *)p < 128)
975
+ continue;
976
+
977
+ const char *last = p;
978
+
979
+ if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
980
+ if (max_len - (p - str) < 2)
981
+ goto error;
982
+
983
+ if ((*(unsigned char *)p & 0x1e) == 0)
984
+ goto error;
985
+ p++;
986
+ if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */
987
+ goto error;
988
+ } else {
989
+ if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
990
+ if (max_len - (p - str) < 3)
991
+ goto error;
992
+
993
+ min = (1 << 11);
994
+ val = *(unsigned char *)p & 0x0f;
995
+ goto two_remaining;
996
+ } else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
997
+ if (max_len - (p - str) < 4)
998
+ goto error;
999
+
1000
+ min = (1 << 16);
1001
+ val = *(unsigned char *)p & 0x07;
1002
+ } else {
1003
+ goto error;
1004
+ }
1005
+
1006
+ p++;
1007
+ CONTINUATION_CHAR;
1008
+ two_remaining:
1009
+ p++;
1010
+ CONTINUATION_CHAR;
1011
+ p++;
1012
+ CONTINUATION_CHAR;
1013
+
1014
+ if (val < min)
1015
+ goto error;
1016
+ if (!UNICODE_ISVALID(val))
1017
+ goto error;
1018
+ }
1019
+
1020
+ continue;
1021
+ error:
1022
+ return last;
1023
+ }
1024
+
1025
+ return p;
1026
+ }
1027
+
1028
+
1029
+ /* {{{1
1030
+ * Check if ‘str’ constitutes a valid UTF-8 character sequence.
1031
+ */
1032
+ bool
1033
+ utf_isvalid(const char *str)
1034
+ {
1035
+ const char *p = fast_validate(str);
1036
+
1037
+ return *p == NUL;
1038
+ }
1039
+
1040
+
1041
/* {{{1
 * Tell whether the first ‘max’ bytes of ‘str’ form a valid UTF-8 character
 * sequence.  When ‘end’ is non-‹NULL› it always receives the position where
 * validation stopped, i.e. the end of the valid range of bytes in ‘str’.
 *
 * The answer is true only when validation reached ‘str’ + ‘max’; since
 * fast_validate_len() also stops at a NUL byte, a NUL before ‘max’ bytes
 * makes the result false.
 */
bool
utf_isvalid_n(const char *str, size_t max, const char **end)
{
        const char *const stop = fast_validate_len(str, max);
        const bool consumed_all = (stop == str + max);

        if (end != NULL)
                *end = stop;

        return consumed_all;
}
1057
+
1058
+
1059
+ /* {{{1
1060
+ * Check whether ‘c’ is a valid Unicode character.
1061
+ */
1062
+ bool
1063
+ unichar_isvalid(unichar c)
1064
+ {
1065
+ return UNICODE_ISVALID(c);
1066
+ }
1067
+
1068
+
1069
+ /* {{{1
1070
+ * Turn an Unicode character (UTF-32) into an UTF-8 character sequence and
1071
+ * store it in ‘result’, returning the length of the stored sequence.
1072
+ */
1073
+ int
1074
+ unichar_to_utf(unichar c, char *result)
1075
+ {
1076
+ int len = 0;
1077
+ int first;
1078
+
1079
+ if (c < UNI_LEN1) {
1080
+ first = 0;
1081
+ len = 1;
1082
+ } else if (c < UNI_LEN2) {
1083
+ first = 0xc0;
1084
+ len = 2;
1085
+ } else if (c < UNI_LEN3) {
1086
+ first = 0xe0;
1087
+ len = 3;
1088
+ } else if (c < UNI_LEN4) {
1089
+ first = 0xf0;
1090
+ len = 4;
1091
+ } else if (c < UNI_LEN5) {
1092
+ first = 0xf8;
1093
+ len = 5;
1094
+ } else {
1095
+ first = 0xfc;
1096
+ len = 6;
1097
+ }
1098
+
1099
+ if (result != NULL) {
1100
+ for (int i = len - 1; i > 0; i--)
1101
+ c = PUT_X(c, result[i]);
1102
+
1103
+ result[0] = c | first;
1104
+ }
1105
+
1106
+ return len;
1107
+ }
1108
+
1109
+
1110
+ /* {{{1
1111
+ * The real implementation of ucs4_to_utf8() and ucs4_to_utf8_n() below.
1112
+ */
1113
+ static char *
1114
+ ucs4_to_utf8_n_impl(unichar *str, size_t len, bool use_len,
1115
+ size_t *items_read, size_t *items_written)
1116
+ {
1117
+ size_t result_len = 0;
1118
+ char *result = NULL, *p;
1119
+
1120
+ for (size_t i = 0; (!use_len || i < len) && str[i] != NUL; i++) {
1121
+ if (str[i] >= 0x80000000) {
1122
+ if (items_read != NULL)
1123
+ *items_read = i;
1124
+
1125
+ rb_raise(rb_eArgError, "UCS-4 input contains character outside of range for UTF-8 (%lc))", str[i]);
1126
+ }
1127
+
1128
+ result_len += _utf_length(str[i]);
1129
+ }
1130
+
1131
+ p = result = ALLOC_N(char, result_len + 1);
1132
+ size_t i;
1133
+ for (i = 0; p < result + result_len; i++)
1134
+ p += unichar_to_utf(str[i], p);
1135
+ *p = NUL;
1136
+
1137
+ if (items_written != NULL)
1138
+ *items_written = p - result;
1139
+ if (items_read != NULL)
1140
+ *items_read = i;
1141
+
1142
+ return result;
1143
+ }
1144
+
1145
+ /* {{{1
1146
+ * Turn an UTF-32 encoded string into an UTF-8 encoded one. If non-‹NULL›,
1147
+ * store the number of characters read and bytes written in ‘items_read’ and
1148
+ * ‘items_written’ respectivelly.
1149
+ */
1150
+ char *
1151
+ ucs4_to_utf8(unichar *str, size_t *items_read, size_t *items_written)
1152
+ {
1153
+ return ucs4_to_utf8_n_impl(str, 0, false, items_read, items_written);
1154
+ }
1155
+
1156
+ /* {{{1
1157
+ * Turn an UTF-32 encoded string into an UTF-8 encoded one. If non-‹NULL›,
1158
+ * store the number of characters read and bytes written in ‘items_read’ and
1159
+ * ‘items_written’ respectivelly. Examine at most ‘len’ characters from ‘str’.
1160
+ */
1161
+ char *
1162
+ ucs4_to_utf8_n(unichar *str, size_t len, size_t *items_read, size_t *items_written)
1163
+ {
1164
+ return ucs4_to_utf8_n_impl(str, len, true, items_read, items_written);
1165
+ }
1166
+
1167
+
1168
+ /* {{{1
1169
+ * The real implementation of utf8_to_ucs4_fast() and utf8_to_ucs4_fast_n()
1170
+ * below.
1171
+ */
1172
+ static unichar *
1173
+ utf8_to_ucs4_fast_impl(const char *str, size_t len, bool use_len, size_t *items_written)
1174
+ {
1175
+ assert(str != NULL);
1176
+
1177
+ const char *p = str;
1178
+ size_t n = 0;
1179
+ if (use_len) {
1180
+ while (p < str + len && *p != NUL) {
1181
+ p = utf_next(p);
1182
+ n++;
1183
+ }
1184
+ } else {
1185
+ while (p != NUL) {
1186
+ p = utf_next(p);
1187
+ n++;
1188
+ }
1189
+ }
1190
+
1191
+ unichar *result = ALLOC_N(unichar, n + 1);
1192
+ p = str;
1193
+ size_t i;
1194
+ for (i = 0; i < n; i++) {
1195
+ unichar c = ((unsigned char *)p)[0];
1196
+ int c_len;
1197
+
1198
+ if (c < 0x80) {
1199
+ result[i] = c;
1200
+ p++;
1201
+ } else {
1202
+ /* TODO: use _utf_compute() here */
1203
+ if (c < 0xe0) {
1204
+ c_len = 2;
1205
+ c &= 0x1f;
1206
+ } else if (c < 0xf0) {
1207
+ c_len = 3;
1208
+ c &= 0x0f;
1209
+ } else if (c < 0xf8) {
1210
+ c_len = 4;
1211
+ c &= 0x07;
1212
+ } else if (c < 0xfc) {
1213
+ c_len = 5;
1214
+ c &= 0x03;
1215
+ } else {
1216
+ c_len = 6;
1217
+ c &= 0x01;
1218
+ }
1219
+
1220
+ for (int j = 1; j < c_len; j++) {
1221
+ c <<= BIT_X;
1222
+ c |= ((unsigned char *)p)[j] & MASK_X;
1223
+ }
1224
+
1225
+ result[i] = c;
1226
+ p += c_len;
1227
+ }
1228
+ }
1229
+ result[i] = NUL;
1230
+
1231
+ if (items_written != NULL)
1232
+ *items_written = i;
1233
+
1234
+ return result;
1235
+ }
1236
+
1237
+
1238
+ /* {{{1
1239
+ * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1240
+ * the number of characters written in ‘items_written’.
1241
+ */
1242
+ unichar *
1243
+ utf8_to_ucs4_fast(const char *str, size_t *items_written)
1244
+ {
1245
+ return utf8_to_ucs4_fast_impl(str, 0, false, items_written);
1246
+ }
1247
+
1248
+
1249
+ /* {{{1
1250
+ * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1251
+ * the number of characters written in ‘items_written’. Examine at most ‘len’
1252
+ * bytes from ‘str’.
1253
+ */
1254
+ unichar *
1255
+ utf8_to_ucs4_fast_n(const char *str, size_t len, size_t *items_written)
1256
+ {
1257
+ return utf8_to_ucs4_fast_impl(str, len, true, items_written);
1258
+ }
1259
+
1260
+
1261
+ /* {{{1
1262
+ * The real implementation of utf8_to_ucs4() and utf8_to_ucs4_n() below.
1263
+ */
1264
+ static unichar *
1265
+ utf8_to_ucs4_impl(const char *str, size_t len, bool use_len, size_t *items_read, size_t *items_written)
1266
+ {
1267
+ size_t n = 0;
1268
+ const char *p = str;
1269
+ for (; (!use_len || str + len - p > 0) && *p != NUL; p = utf_next(p)) {
1270
+ unichar c = utf_char_n(p, str + len - p);
1271
+ if (c & 0x80000000) {
1272
+ if (c == UTF_INCOMPLETE_INPUT_UNICHAR) {
1273
+ if (items_read != NULL)
1274
+ break;
1275
+
1276
+ rb_raise(rb_eArgError, "partial character sequence in UTF-8 input");
1277
+ } else {
1278
+ rb_raise(rb_eArgError, "UTF-8 input contains character outside of range for UTF-8 (%lc))", c);
1279
+ }
1280
+
1281
+ if (items_read != NULL)
1282
+ *items_read = p - str;
1283
+
1284
+ return NULL;
1285
+ } else {
1286
+ n++;
1287
+ }
1288
+ }
1289
+
1290
+ unichar *result = ALLOC_N(unichar, n + 1);
1291
+ size_t i;
1292
+ for (i = 0, p = str; i < n; i++) {
1293
+ result[i] = utf_char(p);
1294
+ p = utf_next(p);
1295
+ }
1296
+ result[i] = NUL;
1297
+
1298
+ if (items_written != NULL)
1299
+ *items_written = n;
1300
+ if (items_read != NULL)
1301
+ *items_read = p - str;
1302
+
1303
+ return result;
1304
+ }
1305
+
1306
+
1307
+ /* {{{1
1308
+ * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1309
+ * the number of characters written in ‘items_written’. This function does
1310
+ * additional error-checking on the input.
1311
+ */
1312
+ unichar *
1313
+ utf8_to_ucs4(const char *str, size_t *items_read, size_t *items_written)
1314
+ {
1315
+ return utf8_to_ucs4_impl(str, 0, false, items_read, items_written);
1316
+ }
1317
+
1318
+
1319
+ /* {{{1
1320
+ * Turn an UTF-8 character sequence into an UTF-32 one. If non-‹NULL›, store
1321
+ * the number of characters written in ‘items_written’. Examine at most ‘len’
1322
+ * bytes from ‘str’. This function does additional error-checking on the
1323
+ * input.
1324
+ */
1325
+ unichar *
1326
+ utf8_to_ucs4_n(const char *str, int len, size_t *items_read, size_t *items_written)
1327
+ {
1328
+ return utf8_to_ucs4_impl(str, len, true, items_read, items_written);
1329
+ }
1330
+
1331
+
1332
+ /* }}}1 */