character-encodings 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +26 -0
- data/Rakefile +157 -0
- data/ext/encoding/character/unicode/codepoint.c +48 -0
- data/ext/encoding/character/utf-8/break.c +38 -0
- data/ext/encoding/character/utf-8/data/break.h +22931 -0
- data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
- data/ext/encoding/character/utf-8/data/compose.h +1607 -0
- data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
- data/ext/encoding/character/utf-8/decompose.c +476 -0
- data/ext/encoding/character/utf-8/depend +64 -0
- data/ext/encoding/character/utf-8/extconf.rb +47 -0
- data/ext/encoding/character/utf-8/private.h +68 -0
- data/ext/encoding/character/utf-8/properties.c +1061 -0
- data/ext/encoding/character/utf-8/rb_includes.h +18 -0
- data/ext/encoding/character/utf-8/rb_methods.h +49 -0
- data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
- data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
- data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
- data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
- data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
- data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
- data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
- data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
- data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
- data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
- data/ext/encoding/character/utf-8/unicode.c +319 -0
- data/ext/encoding/character/utf-8/unicode.h +208 -0
- data/ext/encoding/character/utf-8/utf.c +1332 -0
- data/lib/encoding/character/utf-8.rb +201 -0
- data/specifications/aref.rb +45 -0
- data/specifications/count.rb +29 -0
- data/specifications/delete.rb +25 -0
- data/specifications/each_char.rb +28 -0
- data/specifications/index.rb +35 -0
- data/specifications/insert.rb +67 -0
- data/specifications/length.rb +45 -0
- data/specifications/rindex.rb +52 -0
- data/specifications/squeeze.rb +25 -0
- data/specifications/to_i.rb +54 -0
- data/specifications/tr.rb +39 -0
- data/tests/foldcase.rb +28 -0
- data/tests/normalize.rb +101 -0
- data/tests/unicodedatatestbase.rb +45 -0
- metadata +112 -0
@@ -0,0 +1,51 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.rstrip module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
VALUE
|
10
|
+
rb_utf_rstrip_bang(UNUSED(VALUE self), VALUE str)
|
11
|
+
{
|
12
|
+
StringValue(str);
|
13
|
+
const char *begin = RSTRING(str)->ptr;
|
14
|
+
if (begin == NULL || RSTRING(str)->len == 0)
|
15
|
+
return Qnil;
|
16
|
+
|
17
|
+
const char *end = begin + RSTRING(str)->len;
|
18
|
+
const char *t = end;
|
19
|
+
|
20
|
+
/* Remove trailing '\0'’s. */
|
21
|
+
while (t > begin && t[-1] == '\0')
|
22
|
+
t--;
|
23
|
+
|
24
|
+
/* Remove trailing spaces. */
|
25
|
+
while (t > begin) {
|
26
|
+
/* FIXME: Should we be validating here? */
|
27
|
+
const char *prev = rb_utf_prev_validated(begin, t);
|
28
|
+
|
29
|
+
if (!unichar_isspace(utf_char(prev)))
|
30
|
+
break;
|
31
|
+
|
32
|
+
t = prev;
|
33
|
+
}
|
34
|
+
|
35
|
+
if (t == end)
|
36
|
+
return Qnil;
|
37
|
+
|
38
|
+
rb_str_modify(str);
|
39
|
+
RSTRING(str)->len = t - begin;
|
40
|
+
RSTRING(str)->ptr[RSTRING(str)->len] = '\0';
|
41
|
+
|
42
|
+
return str;
|
43
|
+
}
|
44
|
+
|
45
|
+
VALUE
|
46
|
+
rb_utf_rstrip(VALUE self, VALUE str)
|
47
|
+
{
|
48
|
+
str = rb_utf_dup(str);
|
49
|
+
rb_utf_rstrip_bang(self, str);
|
50
|
+
return str;
|
51
|
+
}
|
@@ -0,0 +1,70 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.squeeze module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
#include "rb_utf_internal_tr.h"
|
9
|
+
|
10
|
+
VALUE
|
11
|
+
rb_utf_squeeze_bang(int argc, VALUE *argv, UNUSED(VALUE self))
|
12
|
+
{
|
13
|
+
need_at_least_n_arguments(argc, 1);
|
14
|
+
|
15
|
+
VALUE str = argv[0];
|
16
|
+
StringValue(str);
|
17
|
+
if (RSTRING(str)->len == 0)
|
18
|
+
return Qnil;
|
19
|
+
|
20
|
+
unsigned int table[TR_TABLE_SIZE];
|
21
|
+
if (argc == 1)
|
22
|
+
for (int i = 0; i < TR_TABLE_SIZE; i++)
|
23
|
+
table[i] = ~0U;
|
24
|
+
else
|
25
|
+
tr_setup_table_from_strings(table, argc - 1, &argv[1]);
|
26
|
+
|
27
|
+
rb_str_modify(str);
|
28
|
+
|
29
|
+
char *begin = RSTRING(str)->ptr;
|
30
|
+
char const *end = begin + RSTRING(str)->len;
|
31
|
+
|
32
|
+
/* We know that there is a character to eat (if the input isn’t
|
33
|
+
* invalid), as we’ve already verified that RSTRING(str)->len > 0, so
|
34
|
+
* ‘s_end’ must lay beyond ‘s’. Also, as we validate when we fetch the
|
35
|
+
* character, there’s no need to validate the call to utf_next(). */
|
36
|
+
unichar previous = _utf_char_validated(begin, end);
|
37
|
+
char *s = utf_next(begin);
|
38
|
+
char *t = s;
|
39
|
+
while (s < end) {
|
40
|
+
unichar c = _utf_char_validated(s, end);
|
41
|
+
char *next = utf_next(s);
|
42
|
+
|
43
|
+
if (c != previous || !tr_table_lookup(table, c)) {
|
44
|
+
memmove(t, s, next - s);
|
45
|
+
t += next - s;
|
46
|
+
previous = c;
|
47
|
+
}
|
48
|
+
|
49
|
+
s = next;
|
50
|
+
}
|
51
|
+
*t = '\0';
|
52
|
+
|
53
|
+
if (t - begin != RSTRING(str)->len) {
|
54
|
+
RSTRING(str)->len = t - begin;
|
55
|
+
return str;
|
56
|
+
}
|
57
|
+
|
58
|
+
return Qnil;
|
59
|
+
}
|
60
|
+
|
61
|
+
VALUE
|
62
|
+
rb_utf_squeeze(int argc, VALUE *argv, VALUE self)
|
63
|
+
{
|
64
|
+
need_at_least_n_arguments(argc, 1);
|
65
|
+
|
66
|
+
StringValue(argv[0]);
|
67
|
+
argv[0] = rb_utf_dup(argv[0]);
|
68
|
+
rb_utf_squeeze_bang(argc, argv, self);
|
69
|
+
return argv[0];
|
70
|
+
}
|
@@ -0,0 +1,27 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.strip module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
VALUE
|
10
|
+
rb_utf_strip_bang(VALUE self, VALUE str)
|
11
|
+
{
|
12
|
+
VALUE left = rb_utf_lstrip_bang(self, str);
|
13
|
+
VALUE right = rb_utf_rstrip_bang(self, str);
|
14
|
+
|
15
|
+
if (NIL_P(left) && NIL_P(right))
|
16
|
+
return Qnil;
|
17
|
+
|
18
|
+
return str;
|
19
|
+
}
|
20
|
+
|
21
|
+
VALUE
|
22
|
+
rb_utf_strip(VALUE self, VALUE str)
|
23
|
+
{
|
24
|
+
str = rb_utf_dup(str);
|
25
|
+
rb_utf_strip_bang(self, str);
|
26
|
+
return str;
|
27
|
+
}
|
@@ -0,0 +1,25 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.to_i module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
#include "rb_utf_internal_bignum.h"
|
9
|
+
|
10
|
+
VALUE
|
11
|
+
rb_utf_to_i(int argc, VALUE *argv, UNUSED(VALUE self))
|
12
|
+
{
|
13
|
+
VALUE str, rbbase;
|
14
|
+
|
15
|
+
int base = 10;
|
16
|
+
if (rb_scan_args(argc, argv, "11", &str, &rbbase) == 2)
|
17
|
+
base = NUM2INT(rbbase);
|
18
|
+
|
19
|
+
/* XXX: this test is actually unnecessary, as this will be checked in
|
20
|
+
* rb_utf_to_inum() as well. */
|
21
|
+
if (base < 0)
|
22
|
+
rb_raise(rb_eArgError, "illegal radix %d", base);
|
23
|
+
|
24
|
+
return rb_utf_to_inum(str, base, false);
|
25
|
+
}
|
@@ -0,0 +1,250 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.tr module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
#include "rb_utf_internal_tr.h"
|
9
|
+
|
10
|
+
struct tr_range
|
11
|
+
{
|
12
|
+
unichar begin;
|
13
|
+
unichar end;
|
14
|
+
};
|
15
|
+
|
16
|
+
static int
|
17
|
+
tr_ranges_setup(struct tr *tr, struct tr_range *ranges)
|
18
|
+
{
|
19
|
+
int n = 0;
|
20
|
+
bool was_inside_range = false;
|
21
|
+
while (tr_next(tr) != TR_FINISHED) {
|
22
|
+
if (tr->inside_range) {
|
23
|
+
if (!was_inside_range) {
|
24
|
+
ranges[n].begin = tr->now;
|
25
|
+
was_inside_range = true;
|
26
|
+
}
|
27
|
+
} else {
|
28
|
+
if (was_inside_range)
|
29
|
+
ranges[n].end = tr->now;
|
30
|
+
else
|
31
|
+
ranges[n].begin = ranges[n].end = tr->now;
|
32
|
+
n++;
|
33
|
+
was_inside_range = false;
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
37
|
+
return n;
|
38
|
+
}
|
39
|
+
|
40
|
+
|
41
|
+
/* State handed to tr_trans_replace_include(): the parsed "from" and "to"
 * specifications as arrays of ranges, each with its entry count. */
struct tr_trans_closure
{
	struct tr_range *from;	/* ranges of characters to translate */
	int n_from;		/* number of entries in from */
	struct tr_range *to;	/* ranges of replacement characters */
	int n_to;		/* number of entries in to */
};
|
48
|
+
|
49
|
+
static unichar
|
50
|
+
tr_trans_replace_exclude(UNUSED(unichar c), void *closure)
|
51
|
+
{
|
52
|
+
return *((unichar *)closure);
|
53
|
+
}
|
54
|
+
|
55
|
+
static int
|
56
|
+
tr_trans_replace_include_offset_of(struct tr_range *ranges, int range)
|
57
|
+
{
|
58
|
+
int offset = 0;
|
59
|
+
|
60
|
+
for (int i = 0; i < range; i++)
|
61
|
+
offset += ranges[i].end - ranges[i].begin + 1;
|
62
|
+
|
63
|
+
return offset;
|
64
|
+
}
|
65
|
+
|
66
|
+
static unichar
|
67
|
+
tr_trans_replace_include(unichar c, void *v_closure)
|
68
|
+
{
|
69
|
+
struct tr_trans_closure *closure = (struct tr_trans_closure *)v_closure;
|
70
|
+
|
71
|
+
for (int i = closure->n_from - 1; i >= 0; i--) {
|
72
|
+
if (closure->from[i].begin >= c && closure->from[i].end <= c) {
|
73
|
+
int offset = tr_trans_replace_include_offset_of(closure->from, i);
|
74
|
+
int j;
|
75
|
+
for (j = 0; j < closure->n_to && offset > 0; j++)
|
76
|
+
offset -= closure->to[j].end - closure->to[j].begin + 1;
|
77
|
+
|
78
|
+
if (offset > 0)
|
79
|
+
return closure->to[closure->n_to - 1].end;
|
80
|
+
|
81
|
+
return closure->to[j].end - offset;
|
82
|
+
}
|
83
|
+
}
|
84
|
+
|
85
|
+
return closure->to[closure->n_to - 1].end;
|
86
|
+
}
|
87
|
+
|
88
|
+
static VALUE
|
89
|
+
tr_trans_do(VALUE src, unsigned int *translation,
|
90
|
+
unichar (*replace)(unichar, void *), void *closure, bool squeeze,
|
91
|
+
UNUSED(bool replace_content))
|
92
|
+
{
|
93
|
+
VALUE dst = Qnil;
|
94
|
+
long len;
|
95
|
+
|
96
|
+
again:
|
97
|
+
len = 0;
|
98
|
+
|
99
|
+
const char *s = RSTRING(src)->ptr;
|
100
|
+
const char *s_end = s + RSTRING(src)->len;
|
101
|
+
|
102
|
+
char *t = NULL;
|
103
|
+
|
104
|
+
if (dst != Qnil)
|
105
|
+
t = RSTRING(dst)->ptr;
|
106
|
+
|
107
|
+
bool modified = false;
|
108
|
+
|
109
|
+
/* TODO: this should really be refactored… */
|
110
|
+
if (squeeze) {
|
111
|
+
unichar prev_c = -1;
|
112
|
+
|
113
|
+
while (s < s_end) {
|
114
|
+
unichar c0 = utf_char(s);
|
115
|
+
|
116
|
+
const char *prev = s;
|
117
|
+
s = utf_next(s);
|
118
|
+
|
119
|
+
if (tr_table_lookup(translation, c0)) {
|
120
|
+
unichar c = replace(c0, closure);
|
121
|
+
if (prev_c == c)
|
122
|
+
continue;
|
123
|
+
prev_c = c;
|
124
|
+
len += unichar_to_utf(c, (t != NULL) ? t + len : NULL);
|
125
|
+
modified = true;
|
126
|
+
} else {
|
127
|
+
prev_c = -1;
|
128
|
+
if (t != NULL)
|
129
|
+
memcpy(t + len, prev, s - prev);
|
130
|
+
len += s - prev;
|
131
|
+
}
|
132
|
+
|
133
|
+
}
|
134
|
+
|
135
|
+
if (RSTRING(src)->len > (t + len - RSTRING(src)->ptr))
|
136
|
+
modified = true;
|
137
|
+
} else {
|
138
|
+
while (s < s_end) {
|
139
|
+
unichar c = utf_char(s);
|
140
|
+
|
141
|
+
const char *prev = s;
|
142
|
+
s = utf_next(s);
|
143
|
+
|
144
|
+
if (tr_table_lookup(translation, c)) {
|
145
|
+
len += unichar_to_utf(replace(c, closure),
|
146
|
+
(t != NULL) ? t + len : NULL);
|
147
|
+
modified = true;
|
148
|
+
} else {
|
149
|
+
if (t != NULL)
|
150
|
+
memcpy(t + len, prev, s - prev);
|
151
|
+
len += s - prev;
|
152
|
+
}
|
153
|
+
}
|
154
|
+
}
|
155
|
+
|
156
|
+
#ifdef RB_STR_REPLACE_IS_EXTERN
|
157
|
+
if (replace_content && !modified)
|
158
|
+
return Qnil;
|
159
|
+
#endif
|
160
|
+
|
161
|
+
if (dst == Qnil) {
|
162
|
+
#ifdef RB_STR_REPLACE_IS_EXTERN
|
163
|
+
if (replace_content && len <= RSTRING(src)->len)
|
164
|
+
dst = src;
|
165
|
+
else
|
166
|
+
#endif
|
167
|
+
dst = rb_str_buf_new(len);
|
168
|
+
goto again;
|
169
|
+
}
|
170
|
+
|
171
|
+
t[len] = '\0';
|
172
|
+
RSTRING(dst)->len = len;
|
173
|
+
|
174
|
+
#ifdef RB_STR_REPLACE_IS_EXTERN
|
175
|
+
if (dst != src && replace_content) {
|
176
|
+
rb_str_replace(src, dst);
|
177
|
+
return src;
|
178
|
+
}
|
179
|
+
#endif
|
180
|
+
|
181
|
+
return dst;
|
182
|
+
}
|
183
|
+
|
184
|
+
static VALUE
|
185
|
+
tr_trans(VALUE str, VALUE from, VALUE to, bool squeeze, bool replace_content)
|
186
|
+
{
|
187
|
+
StringValue(str);
|
188
|
+
StringValue(from);
|
189
|
+
StringValue(to);
|
190
|
+
|
191
|
+
if (RSTRING(str)->ptr == NULL || RSTRING(str)->len == 0)
|
192
|
+
return replace_content ? Qnil : str;
|
193
|
+
|
194
|
+
if (RSTRING(to)->len == 0)
|
195
|
+
return rb_utf_delete_bang(1, &from, str);
|
196
|
+
|
197
|
+
struct tr tr_from;
|
198
|
+
tr_init(&tr_from,
|
199
|
+
RSTRING(from)->ptr,
|
200
|
+
RSTRING(from)->ptr + RSTRING(from)->len);
|
201
|
+
|
202
|
+
struct tr tr_to;
|
203
|
+
tr_init(&tr_to,
|
204
|
+
RSTRING(to)->ptr,
|
205
|
+
RSTRING(to)->ptr + RSTRING(to)->len);
|
206
|
+
|
207
|
+
unsigned int translation[TR_TABLE_SIZE];
|
208
|
+
tr_setup_table(from, translation, true);
|
209
|
+
|
210
|
+
tr_init(&tr_from,
|
211
|
+
RSTRING(from)->ptr,
|
212
|
+
RSTRING(from)->ptr + RSTRING(from)->len);
|
213
|
+
if (tr_should_exclude(&tr_from)) {
|
214
|
+
/* This case is easy. Just include everything by default and
|
215
|
+
* exclude the rest as always. Replace characters found by the
|
216
|
+
* last character found in tr_to. */
|
217
|
+
while (tr_next(&tr_to) != TR_FINISHED)
|
218
|
+
; /* We just need the last replacement character. */
|
219
|
+
return tr_trans_do(str, translation, tr_trans_replace_exclude,
|
220
|
+
&tr_to.now, squeeze, replace_content);
|
221
|
+
} else {
|
222
|
+
/* This case is hard. We need a full-fledged lookup of what
|
223
|
+
* character to translate to, not simply a check whether to
|
224
|
+
* include it or not. */
|
225
|
+
struct tr_trans_closure trans_closure;
|
226
|
+
|
227
|
+
struct tr_range from_ranges[utf_length_n(RSTRING(from)->ptr, RSTRING(from)->len)];
|
228
|
+
trans_closure.from = from_ranges;
|
229
|
+
trans_closure.n_from = tr_ranges_setup(&tr_from, from_ranges);
|
230
|
+
|
231
|
+
struct tr_range to_ranges[utf_length_n(RSTRING(to)->ptr, RSTRING(to)->len)];
|
232
|
+
trans_closure.to = to_ranges;
|
233
|
+
trans_closure.n_to = tr_ranges_setup(&tr_to, to_ranges);
|
234
|
+
|
235
|
+
return tr_trans_do(str, translation, tr_trans_replace_include,
|
236
|
+
&trans_closure, squeeze, replace_content);
|
237
|
+
}
|
238
|
+
}
|
239
|
+
|
240
|
+
VALUE
|
241
|
+
rb_utf_tr(UNUSED(VALUE self), VALUE str, VALUE from, VALUE to)
|
242
|
+
{
|
243
|
+
return tr_trans(str, from, to, false, false);
|
244
|
+
}
|
245
|
+
|
246
|
+
VALUE
|
247
|
+
rb_utf_tr_s(UNUSED(VALUE self), VALUE str, VALUE from, VALUE to)
|
248
|
+
{
|
249
|
+
return tr_trans(str, from, to, true, false);
|
250
|
+
}
|
@@ -0,0 +1,13 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.upcase module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
VALUE
|
10
|
+
rb_utf_upcase(UNUSED(VALUE self), VALUE str)
|
11
|
+
{
|
12
|
+
return rb_utf_alloc_using(utf_upcase(StringValuePtr(str)));
|
13
|
+
}
|
@@ -0,0 +1,319 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Unicode class.
|
3
|
+
*
|
4
|
+
* Copyright © 2005 Nikolai Weibull <work@rawuncut.elitemail.org>
|
5
|
+
*/
|
6
|
+
|
7
|
+
|
8
|
+
#include <ruby.h>
|
9
|
+
#include <re.h>
|
10
|
+
#include <stdbool.h>
|
11
|
+
#include <stddef.h>
|
12
|
+
#include <stdint.h>
|
13
|
+
#include <limits.h>
|
14
|
+
#include "unicode.h"
|
15
|
+
#include "private.h"
|
16
|
+
#include "rb_methods.h"
|
17
|
+
|
18
|
+
static VALUE mUTF8Methods;
|
19
|
+
|
20
|
+
void
|
21
|
+
need_at_least_n_arguments(int argc, int n)
|
22
|
+
{
|
23
|
+
static const char *const words[] = {
|
24
|
+
NULL, NULL, "two", "three", "four",
|
25
|
+
"five", "six", "seven", "eight", "nine"
|
26
|
+
};
|
27
|
+
|
28
|
+
if (argc >= n)
|
29
|
+
return;
|
30
|
+
|
31
|
+
if (n == 1)
|
32
|
+
rb_raise(rb_eArgError, "need at least one argument");
|
33
|
+
else if (1 < n && n < 10)
|
34
|
+
rb_raise(rb_eArgError, "need at least %s arguments", words[n]);
|
35
|
+
else
|
36
|
+
rb_raise(rb_eArgError, "need at least %d arguments", n);
|
37
|
+
}
|
38
|
+
|
39
|
+
unichar
|
40
|
+
_utf_char_validated(char const *const str, char const *const str_end)
|
41
|
+
{
|
42
|
+
unichar c = utf_char_validated_n(str, str_end - str);
|
43
|
+
switch (c) {
|
44
|
+
case UTF_BAD_INPUT_UNICHAR:
|
45
|
+
rb_raise(rb_eArgError, "input isn’t valid UTF-8");
|
46
|
+
case UTF_INCOMPLETE_INPUT_UNICHAR:
|
47
|
+
rb_raise(rb_eArgError,
|
48
|
+
"input contains an incomplete UTF-8-encoded character");
|
49
|
+
default:
|
50
|
+
return c;
|
51
|
+
}
|
52
|
+
}
|
53
|
+
|
54
|
+
/* TODO: instead of ‘end’, perhaps use a len/max-type parameter? */
|
55
|
+
char *
|
56
|
+
_utf_offset_to_pointer_validated_impl(const char *str, long offset,
|
57
|
+
const char *limit, bool noisy)
|
58
|
+
{
|
59
|
+
const char *p = str;
|
60
|
+
long saved_offset = offset;
|
61
|
+
|
62
|
+
if (offset > 0) {
|
63
|
+
while (p < limit && offset-- > 0)
|
64
|
+
p = utf_next(p);
|
65
|
+
|
66
|
+
if (offset > 0) {
|
67
|
+
if (noisy)
|
68
|
+
rb_raise(rb_eIndexError,
|
69
|
+
"index %ld lays beyond end of string",
|
70
|
+
saved_offset);
|
71
|
+
else
|
72
|
+
return NULL;
|
73
|
+
}
|
74
|
+
} else {
|
75
|
+
while (offset != 0) {
|
76
|
+
const char *base = p;
|
77
|
+
p += offset;
|
78
|
+
while (p >= limit && (*p & 0xc0) == 0x80)
|
79
|
+
p--;
|
80
|
+
|
81
|
+
if (p < limit) {
|
82
|
+
if (noisy)
|
83
|
+
rb_raise(rb_eIndexError,
|
84
|
+
"index %ld lays before beginning of string",
|
85
|
+
saved_offset);
|
86
|
+
else
|
87
|
+
return NULL;
|
88
|
+
break;
|
89
|
+
}
|
90
|
+
|
91
|
+
offset += utf_pointer_to_offset(p, base);
|
92
|
+
}
|
93
|
+
}
|
94
|
+
|
95
|
+
return (char *)p;
|
96
|
+
}
|
97
|
+
|
98
|
+
/* Raising flavour of _utf_offset_to_pointer_validated_impl(): an offset that
 * leaves the string results in IndexError. */
char *
_utf_offset_to_pointer_validated(const char *str, long offset, const char *end)
{
	const bool noisy = true;

	return _utf_offset_to_pointer_validated_impl(str, offset, end, noisy);
}
|
103
|
+
|
104
|
+
/* Quiet flavour of _utf_offset_to_pointer_validated_impl(): an offset that
 * leaves the string results in a NULL return instead of an exception. */
char *
_utf_offset_to_pointer_failable(const char *str, long offset, const char *end)
{
	const bool noisy = false;

	return _utf_offset_to_pointer_validated_impl(str, offset, end, noisy);
}
|
109
|
+
|
110
|
+
static char *
|
111
|
+
rb_utf_begin_setup(VALUE str, long offset, char **base_limit, char **limit)
|
112
|
+
{
|
113
|
+
char *base = RSTRING(str)->ptr;
|
114
|
+
|
115
|
+
*base_limit = RSTRING(str)->ptr + RSTRING(str)->len;
|
116
|
+
*limit = *base_limit;
|
117
|
+
|
118
|
+
if (offset < 0) {
|
119
|
+
char *tmp = base;
|
120
|
+
base = *base_limit;
|
121
|
+
*base_limit = tmp;
|
122
|
+
}
|
123
|
+
|
124
|
+
return base;
|
125
|
+
}
|
126
|
+
|
127
|
+
bool
|
128
|
+
rb_utf_begin_from_offset(VALUE str, long offset, char **begin, char **limit)
|
129
|
+
{
|
130
|
+
char *base_limit;
|
131
|
+
char *base = rb_utf_begin_setup(str, offset, &base_limit, limit);
|
132
|
+
|
133
|
+
*begin = _utf_offset_to_pointer_failable(base, offset, base_limit);
|
134
|
+
|
135
|
+
return (*begin != NULL);
|
136
|
+
}
|
137
|
+
|
138
|
+
void
|
139
|
+
rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
|
140
|
+
char **limit)
|
141
|
+
{
|
142
|
+
char *base_limit;
|
143
|
+
char *base = rb_utf_begin_setup(str, offset, &base_limit, limit);
|
144
|
+
|
145
|
+
*begin = _utf_offset_to_pointer_validated(base, offset, base_limit);
|
146
|
+
}
|
147
|
+
|
148
|
+
char *
|
149
|
+
rb_utf_prev_validated(const char *begin, const char *p)
|
150
|
+
{
|
151
|
+
char *prev = utf_find_prev(begin, p);
|
152
|
+
if (prev == NULL)
|
153
|
+
rb_raise(rb_eArgError, "input isn’t valid UTF-8");
|
154
|
+
return prev;
|
155
|
+
}
|
156
|
+
|
157
|
+
char *
|
158
|
+
rb_utf_next_validated(const char *p, const char *end)
|
159
|
+
{
|
160
|
+
char *next = (char *)utf_next(p);
|
161
|
+
if (next > end)
|
162
|
+
rb_raise(rb_eArgError, "input isn’t valid UTF-8");
|
163
|
+
return next;
|
164
|
+
}
|
165
|
+
|
166
|
+
VALUE
|
167
|
+
rb_utf_update(VALUE str, long offset, long len, VALUE replacement)
|
168
|
+
{
|
169
|
+
if (len < 0)
|
170
|
+
rb_raise(rb_eIndexError, "negative length %ld", len);
|
171
|
+
|
172
|
+
char *begin, *limit;
|
173
|
+
rb_utf_begin_from_offset_validated(str, offset, &begin, &limit);
|
174
|
+
char *end = _utf_offset_to_pointer_failable(begin, len, limit);
|
175
|
+
if (end == NULL)
|
176
|
+
end = limit;
|
177
|
+
|
178
|
+
rb_str_update(str, begin - RSTRING(str)->ptr, end - begin, replacement);
|
179
|
+
|
180
|
+
return replacement;
|
181
|
+
}
|
182
|
+
|
183
|
+
VALUE
|
184
|
+
rb_utf_new(const char *str, long len)
|
185
|
+
{
|
186
|
+
VALUE rbstr = rb_str_new(str, len);
|
187
|
+
rb_extend_object(rbstr, mUTF8Methods);
|
188
|
+
return rbstr;
|
189
|
+
}
|
190
|
+
|
191
|
+
VALUE
|
192
|
+
rb_utf_new2(const char *str)
|
193
|
+
{
|
194
|
+
VALUE rbstr = rb_str_new2(str);
|
195
|
+
rb_extend_object(rbstr, mUTF8Methods);
|
196
|
+
return rbstr;
|
197
|
+
}
|
198
|
+
|
199
|
+
VALUE
|
200
|
+
rb_utf_new5(VALUE obj, const char *str, long len)
|
201
|
+
{
|
202
|
+
VALUE rbstr = rb_str_new5(obj, str, len);
|
203
|
+
rb_extend_object(rbstr, mUTF8Methods);
|
204
|
+
return rbstr;
|
205
|
+
}
|
206
|
+
|
207
|
+
VALUE
|
208
|
+
rb_utf_alloc_using(char *str)
|
209
|
+
{
|
210
|
+
VALUE rbstr = rb_utf_new(NULL, 0);
|
211
|
+
long len = strlen(str);
|
212
|
+
|
213
|
+
RSTRING(rbstr)->ptr = str;
|
214
|
+
RSTRING(rbstr)->aux.capa = len;
|
215
|
+
RSTRING(rbstr)->len = len;
|
216
|
+
RSTRING(rbstr)->ptr[len] = '\0';
|
217
|
+
|
218
|
+
return rbstr;
|
219
|
+
}
|
220
|
+
|
221
|
+
VALUE
|
222
|
+
rb_utf_dup(VALUE str)
|
223
|
+
{
|
224
|
+
str = rb_str_dup(str);
|
225
|
+
rb_extend_object(str, mUTF8Methods);
|
226
|
+
return str;
|
227
|
+
}
|
228
|
+
|
229
|
+
/* TODO: rewrite this using the new offset-calculating functions. */
|
230
|
+
long
|
231
|
+
rb_utf_index(VALUE str, VALUE sub, long offset)
|
232
|
+
{
|
233
|
+
long n_chars = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
|
234
|
+
|
235
|
+
if (offset < 0) {
|
236
|
+
offset += n_chars;
|
237
|
+
|
238
|
+
if (offset < 0)
|
239
|
+
return -1;
|
240
|
+
}
|
241
|
+
|
242
|
+
if (n_chars - offset < utf_length(RSTRING(sub)->ptr))
|
243
|
+
return -1;
|
244
|
+
|
245
|
+
if (RSTRING(sub)->len == 0)
|
246
|
+
return offset;
|
247
|
+
|
248
|
+
char *begin = utf_offset_to_pointer(RSTRING(str)->ptr, offset);
|
249
|
+
long pos = rb_memsearch(RSTRING(sub)->ptr, RSTRING(sub)->len,
|
250
|
+
begin, RSTRING(str)->len - (begin - RSTRING(str)->ptr));
|
251
|
+
|
252
|
+
if (pos < 0)
|
253
|
+
return -1;
|
254
|
+
|
255
|
+
return offset + utf_pointer_to_offset(begin, begin + pos);
|
256
|
+
}
|
257
|
+
|
258
|
+
long
|
259
|
+
rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
|
260
|
+
long offset, bool reverse)
|
261
|
+
{
|
262
|
+
long byte_offset = _utf_offset_to_pointer_validated(s, offset, end) - s;
|
263
|
+
long byte_startpos = rb_reg_adjust_startpos(sub, str, byte_offset, reverse);
|
264
|
+
long byte_index = rb_reg_search(sub, str, byte_startpos, reverse);
|
265
|
+
if (byte_index == -1)
|
266
|
+
return -1;
|
267
|
+
return utf_pointer_to_offset(s, s + byte_index);
|
268
|
+
}
|
269
|
+
|
270
|
+
void Init_utf8(void);
|
271
|
+
void
|
272
|
+
Init_utf8(void)
|
273
|
+
{
|
274
|
+
VALUE mEncoding = rb_define_module("Encoding");
|
275
|
+
VALUE mCharacter = rb_define_module_under(mEncoding, "Character");
|
276
|
+
VALUE mUTF8 = rb_define_module_under(mCharacter, "UTF8");
|
277
|
+
|
278
|
+
mUTF8Methods = rb_define_module_under(mUTF8, "Methods");
|
279
|
+
|
280
|
+
rb_define_module_function(mUTF8, "collate", rb_utf_collate, 2);
|
281
|
+
rb_define_module_function(mUTF8, "aref", rb_utf_aref_m, -1);
|
282
|
+
rb_define_module_function(mUTF8, "aset", rb_utf_aset_m, -1);
|
283
|
+
rb_define_module_function(mUTF8, "casecmp", rb_utf_casecmp, 2);
|
284
|
+
rb_define_module_function(mUTF8, "center", rb_utf_center, -1);
|
285
|
+
rb_define_module_function(mUTF8, "chomp", rb_utf_chomp, -1);
|
286
|
+
rb_define_module_function(mUTF8, "chomp!", rb_utf_chomp_bang, -1);
|
287
|
+
rb_define_module_function(mUTF8, "chop", rb_utf_chop, 1);
|
288
|
+
rb_define_module_function(mUTF8, "chop!", rb_utf_chop_bang, 1);
|
289
|
+
rb_define_module_function(mUTF8, "count", rb_utf_count, -1);
|
290
|
+
rb_define_module_function(mUTF8, "delete", rb_utf_delete, -1);
|
291
|
+
rb_define_module_function(mUTF8, "delete!", rb_utf_delete_bang, -1);
|
292
|
+
rb_define_module_function(mUTF8, "each_char", rb_utf_each_char, 1);
|
293
|
+
rb_define_module_function(mUTF8, "index", rb_utf_index_m, -1);
|
294
|
+
rb_define_module_function(mUTF8, "insert", rb_utf_insert, 3);
|
295
|
+
rb_define_module_function(mUTF8, "lstrip", rb_utf_lstrip, 1);
|
296
|
+
rb_define_module_function(mUTF8, "lstrip!", rb_utf_lstrip_bang, 1);
|
297
|
+
rb_define_module_function(mUTF8, "rindex", rb_utf_rindex_m, -1);
|
298
|
+
rb_define_module_function(mUTF8, "rstrip", rb_utf_rstrip, 1);
|
299
|
+
rb_define_module_function(mUTF8, "rstrip!", rb_utf_rstrip_bang, 1);
|
300
|
+
rb_define_module_function(mUTF8, "squeeze", rb_utf_squeeze, -1);
|
301
|
+
rb_define_module_function(mUTF8, "squeeze!", rb_utf_squeeze_bang, -1);
|
302
|
+
rb_define_module_function(mUTF8, "strip", rb_utf_strip, 1);
|
303
|
+
rb_define_module_function(mUTF8, "strip!", rb_utf_strip_bang, 1);
|
304
|
+
rb_define_module_function(mUTF8, "to_i", rb_utf_to_i, -1);
|
305
|
+
rb_define_module_function(mUTF8, "hex", rb_utf_hex, 1);
|
306
|
+
rb_define_module_function(mUTF8, "oct", rb_utf_oct, 1);
|
307
|
+
rb_define_module_function(mUTF8, "tr", rb_utf_tr, 3);
|
308
|
+
rb_define_module_function(mUTF8, "tr_s", rb_utf_tr_s, 3);
|
309
|
+
|
310
|
+
rb_define_module_function(mUTF8, "downcase", rb_utf_downcase, 1);
|
311
|
+
rb_define_module_function(mUTF8, "ljust", rb_utf_ljust, -1);
|
312
|
+
rb_define_module_function(mUTF8, "length", rb_utf_length, 1);
|
313
|
+
rb_define_module_function(mUTF8, "reverse", rb_utf_reverse, 1);
|
314
|
+
rb_define_module_function(mUTF8, "rjust", rb_utf_rjust, -1);
|
315
|
+
rb_define_module_function(mUTF8, "upcase", rb_utf_upcase, 1);
|
316
|
+
|
317
|
+
rb_define_module_function(mUTF8, "foldcase", rb_utf_foldcase, 1);
|
318
|
+
rb_define_module_function(mUTF8, "normalize", rb_utf_normalize, -1);
|
319
|
+
}
|