character-encodings 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,51 @@
1
+ /*
2
+ * contents: UTF8.rstrip module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ VALUE
10
+ rb_utf_rstrip_bang(UNUSED(VALUE self), VALUE str)
11
+ {
12
+ StringValue(str);
13
+ const char *begin = RSTRING(str)->ptr;
14
+ if (begin == NULL || RSTRING(str)->len == 0)
15
+ return Qnil;
16
+
17
+ const char *end = begin + RSTRING(str)->len;
18
+ const char *t = end;
19
+
20
+ /* Remove trailing '\0'’s. */
21
+ while (t > begin && t[-1] == '\0')
22
+ t--;
23
+
24
+ /* Remove trailing spaces. */
25
+ while (t > begin) {
26
+ /* FIXME: Should we be validating here? */
27
+ const char *prev = rb_utf_prev_validated(begin, t);
28
+
29
+ if (!unichar_isspace(utf_char(prev)))
30
+ break;
31
+
32
+ t = prev;
33
+ }
34
+
35
+ if (t == end)
36
+ return Qnil;
37
+
38
+ rb_str_modify(str);
39
+ RSTRING(str)->len = t - begin;
40
+ RSTRING(str)->ptr[RSTRING(str)->len] = '\0';
41
+
42
+ return str;
43
+ }
44
+
45
+ VALUE
46
+ rb_utf_rstrip(VALUE self, VALUE str)
47
+ {
48
+ str = rb_utf_dup(str);
49
+ rb_utf_rstrip_bang(self, str);
50
+ return str;
51
+ }
@@ -0,0 +1,70 @@
1
+ /*
2
+ * contents: UTF8.squeeze module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+ #include "rb_utf_internal_tr.h"
9
+
10
+ VALUE
11
+ rb_utf_squeeze_bang(int argc, VALUE *argv, UNUSED(VALUE self))
12
+ {
13
+ need_at_least_n_arguments(argc, 1);
14
+
15
+ VALUE str = argv[0];
16
+ StringValue(str);
17
+ if (RSTRING(str)->len == 0)
18
+ return Qnil;
19
+
20
+ unsigned int table[TR_TABLE_SIZE];
21
+ if (argc == 1)
22
+ for (int i = 0; i < TR_TABLE_SIZE; i++)
23
+ table[i] = ~0U;
24
+ else
25
+ tr_setup_table_from_strings(table, argc - 1, &argv[1]);
26
+
27
+ rb_str_modify(str);
28
+
29
+ char *begin = RSTRING(str)->ptr;
30
+ char const *end = begin + RSTRING(str)->len;
31
+
32
+ /* We know that there is a character to eat (if the input isn’t
33
+ * invalid), as we’ve already verified that RSTRING(str)->len > 0, so
34
+ * ‘s_end’ must lay beyond ‘s’. Also, as we validate when we fetch the
35
+ * character, there’s no need to validate the call to utf_next(). */
36
+ unichar previous = _utf_char_validated(begin, end);
37
+ char *s = utf_next(begin);
38
+ char *t = s;
39
+ while (s < end) {
40
+ unichar c = _utf_char_validated(s, end);
41
+ char *next = utf_next(s);
42
+
43
+ if (c != previous || !tr_table_lookup(table, c)) {
44
+ memmove(t, s, next - s);
45
+ t += next - s;
46
+ previous = c;
47
+ }
48
+
49
+ s = next;
50
+ }
51
+ *t = '\0';
52
+
53
+ if (t - begin != RSTRING(str)->len) {
54
+ RSTRING(str)->len = t - begin;
55
+ return str;
56
+ }
57
+
58
+ return Qnil;
59
+ }
60
+
61
+ VALUE
62
+ rb_utf_squeeze(int argc, VALUE *argv, VALUE self)
63
+ {
64
+ need_at_least_n_arguments(argc, 1);
65
+
66
+ StringValue(argv[0]);
67
+ argv[0] = rb_utf_dup(argv[0]);
68
+ rb_utf_squeeze_bang(argc, argv, self);
69
+ return argv[0];
70
+ }
@@ -0,0 +1,27 @@
1
+ /*
2
+ * contents: UTF8.strip module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ VALUE
10
+ rb_utf_strip_bang(VALUE self, VALUE str)
11
+ {
12
+ VALUE left = rb_utf_lstrip_bang(self, str);
13
+ VALUE right = rb_utf_rstrip_bang(self, str);
14
+
15
+ if (NIL_P(left) && NIL_P(right))
16
+ return Qnil;
17
+
18
+ return str;
19
+ }
20
+
21
+ VALUE
22
+ rb_utf_strip(VALUE self, VALUE str)
23
+ {
24
+ str = rb_utf_dup(str);
25
+ rb_utf_strip_bang(self, str);
26
+ return str;
27
+ }
@@ -0,0 +1,25 @@
1
+ /*
2
+ * contents: UTF8.to_i module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+ #include "rb_utf_internal_bignum.h"
9
+
10
+ VALUE
11
+ rb_utf_to_i(int argc, VALUE *argv, UNUSED(VALUE self))
12
+ {
13
+ VALUE str, rbbase;
14
+
15
+ int base = 10;
16
+ if (rb_scan_args(argc, argv, "11", &str, &rbbase) == 2)
17
+ base = NUM2INT(rbbase);
18
+
19
+ /* XXX: this test is actually unnecessary, as this will be checked in
20
+ * rb_utf_to_inum() as well. */
21
+ if (base < 0)
22
+ rb_raise(rb_eArgError, "illegal radix %d", base);
23
+
24
+ return rb_utf_to_inum(str, base, false);
25
+ }
@@ -0,0 +1,250 @@
1
+ /*
2
+ * contents: UTF8.tr module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+ #include "rb_utf_internal_tr.h"
9
+
10
+ struct tr_range
11
+ {
12
+ unichar begin;
13
+ unichar end;
14
+ };
15
+
16
+ static int
17
+ tr_ranges_setup(struct tr *tr, struct tr_range *ranges)
18
+ {
19
+ int n = 0;
20
+ bool was_inside_range = false;
21
+ while (tr_next(tr) != TR_FINISHED) {
22
+ if (tr->inside_range) {
23
+ if (!was_inside_range) {
24
+ ranges[n].begin = tr->now;
25
+ was_inside_range = true;
26
+ }
27
+ } else {
28
+ if (was_inside_range)
29
+ ranges[n].end = tr->now;
30
+ else
31
+ ranges[n].begin = ranges[n].end = tr->now;
32
+ n++;
33
+ was_inside_range = false;
34
+ }
35
+ }
36
+
37
+ return n;
38
+ }
39
+
40
+
41
/*
 * State handed to tr_trans_replace_include(): the source ranges and the
 * replacement ranges, each as an array plus its element count.
 */
struct tr_trans_closure
{
	struct tr_range *from;	/* ranges of characters to translate */
	int n_from;		/* number of entries in 'from' */
	struct tr_range *to;	/* ranges of replacement characters */
	int n_to;		/* number of entries in 'to' */
};
48
+
49
+ static unichar
50
+ tr_trans_replace_exclude(UNUSED(unichar c), void *closure)
51
+ {
52
+ return *((unichar *)closure);
53
+ }
54
+
55
+ static int
56
+ tr_trans_replace_include_offset_of(struct tr_range *ranges, int range)
57
+ {
58
+ int offset = 0;
59
+
60
+ for (int i = 0; i < range; i++)
61
+ offset += ranges[i].end - ranges[i].begin + 1;
62
+
63
+ return offset;
64
+ }
65
+
66
+ static unichar
67
+ tr_trans_replace_include(unichar c, void *v_closure)
68
+ {
69
+ struct tr_trans_closure *closure = (struct tr_trans_closure *)v_closure;
70
+
71
+ for (int i = closure->n_from - 1; i >= 0; i--) {
72
+ if (closure->from[i].begin >= c && closure->from[i].end <= c) {
73
+ int offset = tr_trans_replace_include_offset_of(closure->from, i);
74
+ int j;
75
+ for (j = 0; j < closure->n_to && offset > 0; j++)
76
+ offset -= closure->to[j].end - closure->to[j].begin + 1;
77
+
78
+ if (offset > 0)
79
+ return closure->to[closure->n_to - 1].end;
80
+
81
+ return closure->to[j].end - offset;
82
+ }
83
+ }
84
+
85
+ return closure->to[closure->n_to - 1].end;
86
+ }
87
+
88
+ static VALUE
89
+ tr_trans_do(VALUE src, unsigned int *translation,
90
+ unichar (*replace)(unichar, void *), void *closure, bool squeeze,
91
+ UNUSED(bool replace_content))
92
+ {
93
+ VALUE dst = Qnil;
94
+ long len;
95
+
96
+ again:
97
+ len = 0;
98
+
99
+ const char *s = RSTRING(src)->ptr;
100
+ const char *s_end = s + RSTRING(src)->len;
101
+
102
+ char *t = NULL;
103
+
104
+ if (dst != Qnil)
105
+ t = RSTRING(dst)->ptr;
106
+
107
+ bool modified = false;
108
+
109
+ /* TODO: this should really be refactored… */
110
+ if (squeeze) {
111
+ unichar prev_c = -1;
112
+
113
+ while (s < s_end) {
114
+ unichar c0 = utf_char(s);
115
+
116
+ const char *prev = s;
117
+ s = utf_next(s);
118
+
119
+ if (tr_table_lookup(translation, c0)) {
120
+ unichar c = replace(c0, closure);
121
+ if (prev_c == c)
122
+ continue;
123
+ prev_c = c;
124
+ len += unichar_to_utf(c, (t != NULL) ? t + len : NULL);
125
+ modified = true;
126
+ } else {
127
+ prev_c = -1;
128
+ if (t != NULL)
129
+ memcpy(t + len, prev, s - prev);
130
+ len += s - prev;
131
+ }
132
+
133
+ }
134
+
135
+ if (RSTRING(src)->len > (t + len - RSTRING(src)->ptr))
136
+ modified = true;
137
+ } else {
138
+ while (s < s_end) {
139
+ unichar c = utf_char(s);
140
+
141
+ const char *prev = s;
142
+ s = utf_next(s);
143
+
144
+ if (tr_table_lookup(translation, c)) {
145
+ len += unichar_to_utf(replace(c, closure),
146
+ (t != NULL) ? t + len : NULL);
147
+ modified = true;
148
+ } else {
149
+ if (t != NULL)
150
+ memcpy(t + len, prev, s - prev);
151
+ len += s - prev;
152
+ }
153
+ }
154
+ }
155
+
156
+ #ifdef RB_STR_REPLACE_IS_EXTERN
157
+ if (replace_content && !modified)
158
+ return Qnil;
159
+ #endif
160
+
161
+ if (dst == Qnil) {
162
+ #ifdef RB_STR_REPLACE_IS_EXTERN
163
+ if (replace_content && len <= RSTRING(src)->len)
164
+ dst = src;
165
+ else
166
+ #endif
167
+ dst = rb_str_buf_new(len);
168
+ goto again;
169
+ }
170
+
171
+ t[len] = '\0';
172
+ RSTRING(dst)->len = len;
173
+
174
+ #ifdef RB_STR_REPLACE_IS_EXTERN
175
+ if (dst != src && replace_content) {
176
+ rb_str_replace(src, dst);
177
+ return src;
178
+ }
179
+ #endif
180
+
181
+ return dst;
182
+ }
183
+
184
+ static VALUE
185
+ tr_trans(VALUE str, VALUE from, VALUE to, bool squeeze, bool replace_content)
186
+ {
187
+ StringValue(str);
188
+ StringValue(from);
189
+ StringValue(to);
190
+
191
+ if (RSTRING(str)->ptr == NULL || RSTRING(str)->len == 0)
192
+ return replace_content ? Qnil : str;
193
+
194
+ if (RSTRING(to)->len == 0)
195
+ return rb_utf_delete_bang(1, &from, str);
196
+
197
+ struct tr tr_from;
198
+ tr_init(&tr_from,
199
+ RSTRING(from)->ptr,
200
+ RSTRING(from)->ptr + RSTRING(from)->len);
201
+
202
+ struct tr tr_to;
203
+ tr_init(&tr_to,
204
+ RSTRING(to)->ptr,
205
+ RSTRING(to)->ptr + RSTRING(to)->len);
206
+
207
+ unsigned int translation[TR_TABLE_SIZE];
208
+ tr_setup_table(from, translation, true);
209
+
210
+ tr_init(&tr_from,
211
+ RSTRING(from)->ptr,
212
+ RSTRING(from)->ptr + RSTRING(from)->len);
213
+ if (tr_should_exclude(&tr_from)) {
214
+ /* This case is easy. Just include everything by default and
215
+ * exclude the rest as always. Replace characters found by the
216
+ * last character found in tr_to. */
217
+ while (tr_next(&tr_to) != TR_FINISHED)
218
+ ; /* We just need the last replacement character. */
219
+ return tr_trans_do(str, translation, tr_trans_replace_exclude,
220
+ &tr_to.now, squeeze, replace_content);
221
+ } else {
222
+ /* This case is hard. We need a full-fledged lookup of what
223
+ * character to translate to, not simply a check whether to
224
+ * include it or not. */
225
+ struct tr_trans_closure trans_closure;
226
+
227
+ struct tr_range from_ranges[utf_length_n(RSTRING(from)->ptr, RSTRING(from)->len)];
228
+ trans_closure.from = from_ranges;
229
+ trans_closure.n_from = tr_ranges_setup(&tr_from, from_ranges);
230
+
231
+ struct tr_range to_ranges[utf_length_n(RSTRING(to)->ptr, RSTRING(to)->len)];
232
+ trans_closure.to = to_ranges;
233
+ trans_closure.n_to = tr_ranges_setup(&tr_to, to_ranges);
234
+
235
+ return tr_trans_do(str, translation, tr_trans_replace_include,
236
+ &trans_closure, squeeze, replace_content);
237
+ }
238
+ }
239
+
240
+ VALUE
241
+ rb_utf_tr(UNUSED(VALUE self), VALUE str, VALUE from, VALUE to)
242
+ {
243
+ return tr_trans(str, from, to, false, false);
244
+ }
245
+
246
+ VALUE
247
+ rb_utf_tr_s(UNUSED(VALUE self), VALUE str, VALUE from, VALUE to)
248
+ {
249
+ return tr_trans(str, from, to, true, false);
250
+ }
@@ -0,0 +1,13 @@
1
+ /*
2
+ * contents: UTF8.upcase module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ VALUE
10
+ rb_utf_upcase(UNUSED(VALUE self), VALUE str)
11
+ {
12
+ return rb_utf_alloc_using(utf_upcase(StringValuePtr(str)));
13
+ }
@@ -0,0 +1,319 @@
1
+ /*
2
+ * contents: Unicode class.
3
+ *
4
+ * Copyright © 2005 Nikolai Weibull <work@rawuncut.elitemail.org>
5
+ */
6
+
7
+
8
+ #include <ruby.h>
9
+ #include <re.h>
10
+ #include <stdbool.h>
11
+ #include <stddef.h>
12
+ #include <stdint.h>
13
+ #include <limits.h>
14
+ #include "unicode.h"
15
+ #include "private.h"
16
+ #include "rb_methods.h"
17
+
18
+ static VALUE mUTF8Methods;
19
+
20
+ void
21
+ need_at_least_n_arguments(int argc, int n)
22
+ {
23
+ static const char *const words[] = {
24
+ NULL, NULL, "two", "three", "four",
25
+ "five", "six", "seven", "eight", "nine"
26
+ };
27
+
28
+ if (argc >= n)
29
+ return;
30
+
31
+ if (n == 1)
32
+ rb_raise(rb_eArgError, "need at least one argument");
33
+ else if (1 < n && n < 10)
34
+ rb_raise(rb_eArgError, "need at least %s arguments", words[n]);
35
+ else
36
+ rb_raise(rb_eArgError, "need at least %d arguments", n);
37
+ }
38
+
39
+ unichar
40
+ _utf_char_validated(char const *const str, char const *const str_end)
41
+ {
42
+ unichar c = utf_char_validated_n(str, str_end - str);
43
+ switch (c) {
44
+ case UTF_BAD_INPUT_UNICHAR:
45
+ rb_raise(rb_eArgError, "input isn’t valid UTF-8");
46
+ case UTF_INCOMPLETE_INPUT_UNICHAR:
47
+ rb_raise(rb_eArgError,
48
+ "input contains an incomplete UTF-8-encoded character");
49
+ default:
50
+ return c;
51
+ }
52
+ }
53
+
54
+ /* TODO: instead of ‘end’, perhaps use a len/max-type parameter? */
55
+ char *
56
+ _utf_offset_to_pointer_validated_impl(const char *str, long offset,
57
+ const char *limit, bool noisy)
58
+ {
59
+ const char *p = str;
60
+ long saved_offset = offset;
61
+
62
+ if (offset > 0) {
63
+ while (p < limit && offset-- > 0)
64
+ p = utf_next(p);
65
+
66
+ if (offset > 0) {
67
+ if (noisy)
68
+ rb_raise(rb_eIndexError,
69
+ "index %ld lays beyond end of string",
70
+ saved_offset);
71
+ else
72
+ return NULL;
73
+ }
74
+ } else {
75
+ while (offset != 0) {
76
+ const char *base = p;
77
+ p += offset;
78
+ while (p >= limit && (*p & 0xc0) == 0x80)
79
+ p--;
80
+
81
+ if (p < limit) {
82
+ if (noisy)
83
+ rb_raise(rb_eIndexError,
84
+ "index %ld lays before beginning of string",
85
+ saved_offset);
86
+ else
87
+ return NULL;
88
+ break;
89
+ }
90
+
91
+ offset += utf_pointer_to_offset(p, base);
92
+ }
93
+ }
94
+
95
+ return (char *)p;
96
+ }
97
+
98
/*
 * Raising front end of _utf_offset_to_pointer_validated_impl(): an offset
 * outside the string results in IndexError.
 */
char *
_utf_offset_to_pointer_validated(const char *str, long offset, const char *end)
{
	const bool noisy = true;

	return _utf_offset_to_pointer_validated_impl(str, offset, end, noisy);
}
103
+
104
/*
 * Quiet front end of _utf_offset_to_pointer_validated_impl(): an offset
 * outside the string results in a NULL return instead of an exception.
 */
char *
_utf_offset_to_pointer_failable(const char *str, long offset, const char *end)
{
	const bool noisy = false;

	return _utf_offset_to_pointer_validated_impl(str, offset, end, noisy);
}
109
+
110
+ static char *
111
+ rb_utf_begin_setup(VALUE str, long offset, char **base_limit, char **limit)
112
+ {
113
+ char *base = RSTRING(str)->ptr;
114
+
115
+ *base_limit = RSTRING(str)->ptr + RSTRING(str)->len;
116
+ *limit = *base_limit;
117
+
118
+ if (offset < 0) {
119
+ char *tmp = base;
120
+ base = *base_limit;
121
+ *base_limit = tmp;
122
+ }
123
+
124
+ return base;
125
+ }
126
+
127
+ bool
128
+ rb_utf_begin_from_offset(VALUE str, long offset, char **begin, char **limit)
129
+ {
130
+ char *base_limit;
131
+ char *base = rb_utf_begin_setup(str, offset, &base_limit, limit);
132
+
133
+ *begin = _utf_offset_to_pointer_failable(base, offset, base_limit);
134
+
135
+ return (*begin != NULL);
136
+ }
137
+
138
+ void
139
+ rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
140
+ char **limit)
141
+ {
142
+ char *base_limit;
143
+ char *base = rb_utf_begin_setup(str, offset, &base_limit, limit);
144
+
145
+ *begin = _utf_offset_to_pointer_validated(base, offset, base_limit);
146
+ }
147
+
148
+ char *
149
+ rb_utf_prev_validated(const char *begin, const char *p)
150
+ {
151
+ char *prev = utf_find_prev(begin, p);
152
+ if (prev == NULL)
153
+ rb_raise(rb_eArgError, "input isn’t valid UTF-8");
154
+ return prev;
155
+ }
156
+
157
+ char *
158
+ rb_utf_next_validated(const char *p, const char *end)
159
+ {
160
+ char *next = (char *)utf_next(p);
161
+ if (next > end)
162
+ rb_raise(rb_eArgError, "input isn’t valid UTF-8");
163
+ return next;
164
+ }
165
+
166
+ VALUE
167
+ rb_utf_update(VALUE str, long offset, long len, VALUE replacement)
168
+ {
169
+ if (len < 0)
170
+ rb_raise(rb_eIndexError, "negative length %ld", len);
171
+
172
+ char *begin, *limit;
173
+ rb_utf_begin_from_offset_validated(str, offset, &begin, &limit);
174
+ char *end = _utf_offset_to_pointer_failable(begin, len, limit);
175
+ if (end == NULL)
176
+ end = limit;
177
+
178
+ rb_str_update(str, begin - RSTRING(str)->ptr, end - begin, replacement);
179
+
180
+ return replacement;
181
+ }
182
+
183
+ VALUE
184
+ rb_utf_new(const char *str, long len)
185
+ {
186
+ VALUE rbstr = rb_str_new(str, len);
187
+ rb_extend_object(rbstr, mUTF8Methods);
188
+ return rbstr;
189
+ }
190
+
191
+ VALUE
192
+ rb_utf_new2(const char *str)
193
+ {
194
+ VALUE rbstr = rb_str_new2(str);
195
+ rb_extend_object(rbstr, mUTF8Methods);
196
+ return rbstr;
197
+ }
198
+
199
+ VALUE
200
+ rb_utf_new5(VALUE obj, const char *str, long len)
201
+ {
202
+ VALUE rbstr = rb_str_new5(obj, str, len);
203
+ rb_extend_object(rbstr, mUTF8Methods);
204
+ return rbstr;
205
+ }
206
+
207
+ VALUE
208
+ rb_utf_alloc_using(char *str)
209
+ {
210
+ VALUE rbstr = rb_utf_new(NULL, 0);
211
+ long len = strlen(str);
212
+
213
+ RSTRING(rbstr)->ptr = str;
214
+ RSTRING(rbstr)->aux.capa = len;
215
+ RSTRING(rbstr)->len = len;
216
+ RSTRING(rbstr)->ptr[len] = '\0';
217
+
218
+ return rbstr;
219
+ }
220
+
221
+ VALUE
222
+ rb_utf_dup(VALUE str)
223
+ {
224
+ str = rb_str_dup(str);
225
+ rb_extend_object(str, mUTF8Methods);
226
+ return str;
227
+ }
228
+
229
+ /* TODO: rewrite this using the new offset-calculating functions. */
230
+ long
231
+ rb_utf_index(VALUE str, VALUE sub, long offset)
232
+ {
233
+ long n_chars = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
234
+
235
+ if (offset < 0) {
236
+ offset += n_chars;
237
+
238
+ if (offset < 0)
239
+ return -1;
240
+ }
241
+
242
+ if (n_chars - offset < utf_length(RSTRING(sub)->ptr))
243
+ return -1;
244
+
245
+ if (RSTRING(sub)->len == 0)
246
+ return offset;
247
+
248
+ char *begin = utf_offset_to_pointer(RSTRING(str)->ptr, offset);
249
+ long pos = rb_memsearch(RSTRING(sub)->ptr, RSTRING(sub)->len,
250
+ begin, RSTRING(str)->len - (begin - RSTRING(str)->ptr));
251
+
252
+ if (pos < 0)
253
+ return -1;
254
+
255
+ return offset + utf_pointer_to_offset(begin, begin + pos);
256
+ }
257
+
258
+ long
259
+ rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
260
+ long offset, bool reverse)
261
+ {
262
+ long byte_offset = _utf_offset_to_pointer_validated(s, offset, end) - s;
263
+ long byte_startpos = rb_reg_adjust_startpos(sub, str, byte_offset, reverse);
264
+ long byte_index = rb_reg_search(sub, str, byte_startpos, reverse);
265
+ if (byte_index == -1)
266
+ return -1;
267
+ return utf_pointer_to_offset(s, s + byte_index);
268
+ }
269
+
270
+ void Init_utf8(void);
271
+ void
272
+ Init_utf8(void)
273
+ {
274
+ VALUE mEncoding = rb_define_module("Encoding");
275
+ VALUE mCharacter = rb_define_module_under(mEncoding, "Character");
276
+ VALUE mUTF8 = rb_define_module_under(mCharacter, "UTF8");
277
+
278
+ mUTF8Methods = rb_define_module_under(mUTF8, "Methods");
279
+
280
+ rb_define_module_function(mUTF8, "collate", rb_utf_collate, 2);
281
+ rb_define_module_function(mUTF8, "aref", rb_utf_aref_m, -1);
282
+ rb_define_module_function(mUTF8, "aset", rb_utf_aset_m, -1);
283
+ rb_define_module_function(mUTF8, "casecmp", rb_utf_casecmp, 2);
284
+ rb_define_module_function(mUTF8, "center", rb_utf_center, -1);
285
+ rb_define_module_function(mUTF8, "chomp", rb_utf_chomp, -1);
286
+ rb_define_module_function(mUTF8, "chomp!", rb_utf_chomp_bang, -1);
287
+ rb_define_module_function(mUTF8, "chop", rb_utf_chop, 1);
288
+ rb_define_module_function(mUTF8, "chop!", rb_utf_chop_bang, 1);
289
+ rb_define_module_function(mUTF8, "count", rb_utf_count, -1);
290
+ rb_define_module_function(mUTF8, "delete", rb_utf_delete, -1);
291
+ rb_define_module_function(mUTF8, "delete!", rb_utf_delete_bang, -1);
292
+ rb_define_module_function(mUTF8, "each_char", rb_utf_each_char, 1);
293
+ rb_define_module_function(mUTF8, "index", rb_utf_index_m, -1);
294
+ rb_define_module_function(mUTF8, "insert", rb_utf_insert, 3);
295
+ rb_define_module_function(mUTF8, "lstrip", rb_utf_lstrip, 1);
296
+ rb_define_module_function(mUTF8, "lstrip!", rb_utf_lstrip_bang, 1);
297
+ rb_define_module_function(mUTF8, "rindex", rb_utf_rindex_m, -1);
298
+ rb_define_module_function(mUTF8, "rstrip", rb_utf_rstrip, 1);
299
+ rb_define_module_function(mUTF8, "rstrip!", rb_utf_rstrip_bang, 1);
300
+ rb_define_module_function(mUTF8, "squeeze", rb_utf_squeeze, -1);
301
+ rb_define_module_function(mUTF8, "squeeze!", rb_utf_squeeze_bang, -1);
302
+ rb_define_module_function(mUTF8, "strip", rb_utf_strip, 1);
303
+ rb_define_module_function(mUTF8, "strip!", rb_utf_strip_bang, 1);
304
+ rb_define_module_function(mUTF8, "to_i", rb_utf_to_i, -1);
305
+ rb_define_module_function(mUTF8, "hex", rb_utf_hex, 1);
306
+ rb_define_module_function(mUTF8, "oct", rb_utf_oct, 1);
307
+ rb_define_module_function(mUTF8, "tr", rb_utf_tr, 3);
308
+ rb_define_module_function(mUTF8, "tr_s", rb_utf_tr_s, 3);
309
+
310
+ rb_define_module_function(mUTF8, "downcase", rb_utf_downcase, 1);
311
+ rb_define_module_function(mUTF8, "ljust", rb_utf_ljust, -1);
312
+ rb_define_module_function(mUTF8, "length", rb_utf_length, 1);
313
+ rb_define_module_function(mUTF8, "reverse", rb_utf_reverse, 1);
314
+ rb_define_module_function(mUTF8, "rjust", rb_utf_rjust, -1);
315
+ rb_define_module_function(mUTF8, "upcase", rb_utf_upcase, 1);
316
+
317
+ rb_define_module_function(mUTF8, "foldcase", rb_utf_foldcase, 1);
318
+ rb_define_module_function(mUTF8, "normalize", rb_utf_normalize, -1);
319
+ }