character-encodings 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +26 -0
- data/Rakefile +157 -0
- data/ext/encoding/character/unicode/codepoint.c +48 -0
- data/ext/encoding/character/utf-8/break.c +38 -0
- data/ext/encoding/character/utf-8/data/break.h +22931 -0
- data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
- data/ext/encoding/character/utf-8/data/compose.h +1607 -0
- data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
- data/ext/encoding/character/utf-8/decompose.c +476 -0
- data/ext/encoding/character/utf-8/depend +64 -0
- data/ext/encoding/character/utf-8/extconf.rb +47 -0
- data/ext/encoding/character/utf-8/private.h +68 -0
- data/ext/encoding/character/utf-8/properties.c +1061 -0
- data/ext/encoding/character/utf-8/rb_includes.h +18 -0
- data/ext/encoding/character/utf-8/rb_methods.h +49 -0
- data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
- data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
- data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
- data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
- data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
- data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
- data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
- data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
- data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
- data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
- data/ext/encoding/character/utf-8/unicode.c +319 -0
- data/ext/encoding/character/utf-8/unicode.h +208 -0
- data/ext/encoding/character/utf-8/utf.c +1332 -0
- data/lib/encoding/character/utf-8.rb +201 -0
- data/specifications/aref.rb +45 -0
- data/specifications/count.rb +29 -0
- data/specifications/delete.rb +25 -0
- data/specifications/each_char.rb +28 -0
- data/specifications/index.rb +35 -0
- data/specifications/insert.rb +67 -0
- data/specifications/length.rb +45 -0
- data/specifications/rindex.rb +52 -0
- data/specifications/squeeze.rb +25 -0
- data/specifications/to_i.rb +54 -0
- data/specifications/tr.rb +39 -0
- data/tests/foldcase.rb +28 -0
- data/tests/normalize.rb +101 -0
- data/tests/unicodedatatestbase.rb +45 -0
- metadata +112 -0
@@ -0,0 +1,12 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Internal functionality for turning strings into Bignums.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#ifndef RB_UTF_INTERNAL_BIGNUM_H
|
8
|
+
#define RB_UTF_INTERNAL_BIGNUM_H
|
9
|
+
|
10
|
+
VALUE rb_utf_to_inum(VALUE str, int base, bool verify) HIDDEN;
|
11
|
+
|
12
|
+
#endif /* RB_UTF_INTERNAL_BIGNUM_H */
|
@@ -0,0 +1,142 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Translation (#tr) related functions.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
#include "rb_utf_internal_tr.h"
|
9
|
+
|
10
|
+
void
|
11
|
+
tr_init(struct tr *tr, char *p, char *p_end)
|
12
|
+
{
|
13
|
+
tr->p = p;
|
14
|
+
tr->p_end = p_end;
|
15
|
+
tr->inside_range = false;
|
16
|
+
}
|
17
|
+
|
18
|
+
bool
|
19
|
+
tr_should_exclude(struct tr *tr)
|
20
|
+
{
|
21
|
+
if (tr->p + 1 < tr->p_end && *tr->p == '^') {
|
22
|
+
tr->p++;
|
23
|
+
return true;
|
24
|
+
}
|
25
|
+
|
26
|
+
return false;
|
27
|
+
}
|
28
|
+
|
29
|
+
static enum tr_state
|
30
|
+
tr_next_char(struct tr *t)
|
31
|
+
{
|
32
|
+
if (t->p == t->p_end)
|
33
|
+
return TR_FINISHED;
|
34
|
+
|
35
|
+
if (_utf_char_validated(t->p, t->p_end) == '\\') {
|
36
|
+
char *next = utf_find_next(t->p, t->p_end);
|
37
|
+
|
38
|
+
if (next == NULL) {
|
39
|
+
t->now = '\\';
|
40
|
+
t->p = t->p_end;
|
41
|
+
return TR_FOUND;
|
42
|
+
}
|
43
|
+
|
44
|
+
t->p = next;
|
45
|
+
}
|
46
|
+
|
47
|
+
t->now = _utf_char_validated(t->p, t->p_end);
|
48
|
+
|
49
|
+
char *next = utf_find_next(t->p, t->p_end);
|
50
|
+
if (next == NULL) {
|
51
|
+
t->p = t->p_end;
|
52
|
+
return TR_FOUND;
|
53
|
+
}
|
54
|
+
t->p = next;
|
55
|
+
|
56
|
+
if (_utf_char_validated(t->p, t->p_end) == '-') {
|
57
|
+
next = utf_find_next(t->p, t->p_end);
|
58
|
+
|
59
|
+
if (next != NULL) {
|
60
|
+
unichar max = utf_char(next);
|
61
|
+
|
62
|
+
if (max < t->now) {
|
63
|
+
t->p = next;
|
64
|
+
return TR_READ_ANOTHER;
|
65
|
+
}
|
66
|
+
|
67
|
+
t->inside_range = true;
|
68
|
+
t->max = max;
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
return TR_FOUND;
|
73
|
+
}
|
74
|
+
|
75
|
+
enum tr_state
|
76
|
+
tr_next(struct tr *t)
|
77
|
+
{
|
78
|
+
while (true) {
|
79
|
+
if (!t->inside_range) {
|
80
|
+
enum tr_state state;
|
81
|
+
|
82
|
+
if ((state = tr_next_char(t)) == TR_READ_ANOTHER)
|
83
|
+
continue;
|
84
|
+
|
85
|
+
return state;
|
86
|
+
} else if (++t->now < t->max) {
|
87
|
+
return TR_FOUND;
|
88
|
+
} else {
|
89
|
+
t->inside_range = false;
|
90
|
+
return TR_FOUND;
|
91
|
+
}
|
92
|
+
}
|
93
|
+
}
|
94
|
+
|
95
|
+
static void
|
96
|
+
tr_table_set(unsigned int *table, unichar c, unsigned int value)
|
97
|
+
{
|
98
|
+
unsigned int offset = c / WORD_BIT;
|
99
|
+
unsigned int bit = c % WORD_BIT;
|
100
|
+
|
101
|
+
table[offset] = (table[offset] & ~(1U << bit)) | ((value & 1U) << bit);
|
102
|
+
}
|
103
|
+
|
104
|
+
void
|
105
|
+
tr_setup_table(VALUE str, unsigned int *table, bool initialize)
|
106
|
+
{
|
107
|
+
unsigned int buf[TR_TABLE_SIZE];
|
108
|
+
|
109
|
+
struct tr tr;
|
110
|
+
tr_init(&tr, RSTRING(str)->ptr, RSTRING(str)->ptr + RSTRING(str)->len);
|
111
|
+
|
112
|
+
bool exclude = tr_should_exclude(&tr);
|
113
|
+
|
114
|
+
if (initialize)
|
115
|
+
for (int i = 0; i < TR_TABLE_SIZE; i++)
|
116
|
+
table[i] = ~0U;
|
117
|
+
|
118
|
+
unsigned int buf_initializer = exclude ? ~0U : 0U;
|
119
|
+
for (int i = 0; i < TR_TABLE_SIZE; i++)
|
120
|
+
buf[i] = buf_initializer;
|
121
|
+
|
122
|
+
unsigned int buf_setter = !exclude;
|
123
|
+
while (tr_next(&tr) != TR_FINISHED)
|
124
|
+
tr_table_set(buf, tr.now, buf_setter);
|
125
|
+
|
126
|
+
for (int i = 0; i < TR_TABLE_SIZE; i++)
|
127
|
+
table[i] &= buf[i];
|
128
|
+
}
|
129
|
+
|
130
|
+
void
|
131
|
+
tr_setup_table_from_strings(unsigned int *table, int argc, VALUE *argv)
|
132
|
+
{
|
133
|
+
bool initialize = true;
|
134
|
+
for (int i = 0; i < argc; i++) {
|
135
|
+
VALUE s = argv[i];
|
136
|
+
|
137
|
+
StringValue(s);
|
138
|
+
tr_setup_table(s, table, initialize);
|
139
|
+
initialize = false;
|
140
|
+
}
|
141
|
+
}
|
142
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Translation (#tr) related functions
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#ifndef TR_H
|
8
|
+
#define TR_H
|
9
|
+
|
10
|
+
/* Number of bits in an int, unless the system headers already define it. */
#ifndef WORD_BIT
#  define WORD_BIT (sizeof(int) * CHAR_BIT)
#endif

/* Number of unsigned ints in a bit set with one bit per code point. */
#define TR_TABLE_SIZE ((int)(UNICODE_N_CODEPOINTS / WORD_BIT))

/* Non-zero if the bit for the code point OFFSET is set in TABLE. */
#define tr_table_lookup(table, offset) \
        ((table)[(offset) / WORD_BIT] & (1U << ((offset) % WORD_BIT)))
|
18
|
+
|
19
|
+
struct tr {
|
20
|
+
bool inside_range;
|
21
|
+
unichar now;
|
22
|
+
unichar max;
|
23
|
+
char *p;
|
24
|
+
char *p_end;
|
25
|
+
};
|
26
|
+
|
27
|
+
/* Result of trying to read from a translation (#tr) specification. */
enum tr_state {
        TR_FOUND,               /* a character was produced */
        TR_READ_ANOTHER,        /* nothing was produced; read again */
        TR_FINISHED             /* the specification is exhausted */
};
|
33
|
+
|
34
|
+
void tr_init(struct tr *tr, char *p, char *p_end) HIDDEN;
|
35
|
+
bool tr_should_exclude(struct tr *tr) HIDDEN;
|
36
|
+
enum tr_state tr_next(struct tr *t) HIDDEN;
|
37
|
+
void tr_setup_table(VALUE str, unsigned int *table, bool initialize) HIDDEN;
|
38
|
+
void tr_setup_table_from_strings(unsigned int *table, int argc,
|
39
|
+
VALUE *argv) HIDDEN;
|
40
|
+
|
41
|
+
#endif /* TR_H */
|
@@ -0,0 +1,96 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.center, UTF8.ljust, and UTF8.rjust.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
/*
 * Write N characters of padding at P and return the position after the last
 * byte written.  The padding is made up of repetitions of F, which is F_LEN
 * characters and F_SIZE bytes long, cut off after N characters.
 */
static char *
rb_utf_justify_one_side(char *p, const char *f, long f_len, long f_size, long n)
{
        long written = 0;

        /* Copy F in its entirety for as long as more than F_LEN characters
         * remain to be written. */
        while (written + f_len < n) {
                memcpy(p, f, f_size);
                p += f_size;
                written += f_len;
        }

        /* Copy what remains, at most F_LEN characters, one at a time. */
        for (const char *src = f; written < n; written++) {
                const char *src_end = utf_next(src);
                long size = src_end - src;

                memcpy(p, src, size);
                p += size;
                src = src_end;
        }

        return p;
}
|
27
|
+
|
28
|
+
static VALUE
|
29
|
+
rb_utf_justify(int argc, VALUE *argv, char jflag)
|
30
|
+
{
|
31
|
+
VALUE str, w, pad;
|
32
|
+
const char *f = " ";
|
33
|
+
long f_len = 1;
|
34
|
+
long f_size = 1;
|
35
|
+
bool infect_from_pad = false;
|
36
|
+
|
37
|
+
if (rb_scan_args(argc, argv, "21", &str, &w, &pad) == 3) {
|
38
|
+
StringValue(pad);
|
39
|
+
f = RSTRING(pad)->ptr;
|
40
|
+
f_len = utf_length_n(f, RSTRING(pad)->len);
|
41
|
+
if (f_len == 0)
|
42
|
+
rb_raise(rb_eArgError, "zero width padding");
|
43
|
+
f_size = RSTRING(pad)->len;
|
44
|
+
infect_from_pad = true;
|
45
|
+
}
|
46
|
+
|
47
|
+
long len = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
|
48
|
+
|
49
|
+
long width = NUM2LONG(w);
|
50
|
+
if (width < 0 || len >= width)
|
51
|
+
return rb_utf_dup(str);
|
52
|
+
|
53
|
+
VALUE res = rb_utf_new5(str, 0, RSTRING(str)->len + (width - len) * f_size);
|
54
|
+
char *p = RSTRING(res)->ptr;
|
55
|
+
|
56
|
+
long n_remaining = width - len;
|
57
|
+
if (jflag != 'l') {
|
58
|
+
long n = n_remaining;
|
59
|
+
if (jflag == 'c')
|
60
|
+
n /= 2;
|
61
|
+
n_remaining -= n;
|
62
|
+
|
63
|
+
p = rb_utf_justify_one_side(p, f, f_len, f_size, n);
|
64
|
+
}
|
65
|
+
|
66
|
+
memcpy(p, RSTRING(str)->ptr, RSTRING(str)->len);
|
67
|
+
p += RSTRING(str)->len;
|
68
|
+
|
69
|
+
if (jflag != 'r')
|
70
|
+
p = rb_utf_justify_one_side(p, f, f_len, f_size, n_remaining);
|
71
|
+
|
72
|
+
OBJ_INFECT(res, str);
|
73
|
+
|
74
|
+
if (infect_from_pad)
|
75
|
+
OBJ_INFECT(res, pad);
|
76
|
+
|
77
|
+
return res;
|
78
|
+
}
|
79
|
+
|
80
|
+
VALUE
|
81
|
+
rb_utf_center(int argc, VALUE *argv, UNUSED(VALUE self))
|
82
|
+
{
|
83
|
+
return rb_utf_justify(argc, argv, 'c');
|
84
|
+
}
|
85
|
+
|
86
|
+
VALUE
|
87
|
+
rb_utf_ljust(int argc, VALUE *argv, UNUSED(VALUE self))
|
88
|
+
{
|
89
|
+
return rb_utf_justify(argc, argv, 'l');
|
90
|
+
}
|
91
|
+
|
92
|
+
VALUE
|
93
|
+
rb_utf_rjust(int argc, VALUE *argv, UNUSED(VALUE self))
|
94
|
+
{
|
95
|
+
return rb_utf_justify(argc, argv, 'r');
|
96
|
+
}
|
@@ -0,0 +1,14 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.length module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
VALUE
|
10
|
+
rb_utf_length(UNUSED(VALUE self), VALUE str)
|
11
|
+
{
|
12
|
+
StringValue(str);
|
13
|
+
return UINT2NUM(utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len));
|
14
|
+
}
|
@@ -0,0 +1,41 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.lstrip module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
VALUE
|
10
|
+
rb_utf_lstrip_bang(UNUSED(VALUE self), VALUE str)
|
11
|
+
{
|
12
|
+
StringValue(str);
|
13
|
+
char *s = RSTRING(str)->ptr;
|
14
|
+
if (s == NULL || RSTRING(str)->len == 0)
|
15
|
+
return Qnil;
|
16
|
+
|
17
|
+
char *end = s + RSTRING(str)->len;
|
18
|
+
|
19
|
+
/* Remove spaces at head. */
|
20
|
+
while (s < end && unichar_isspace(_utf_char_validated(s, end)))
|
21
|
+
s = utf_next(s);
|
22
|
+
|
23
|
+
/* If there weren’t any spaces at head, return Qnil. */
|
24
|
+
if (s == RSTRING(str)->ptr)
|
25
|
+
return Qnil;
|
26
|
+
|
27
|
+
rb_str_modify(str);
|
28
|
+
RSTRING(str)->len = end - s;
|
29
|
+
memmove(RSTRING(str)->ptr, s, RSTRING(str)->len);
|
30
|
+
RSTRING(str)->ptr[RSTRING(str)->len] = '\0';
|
31
|
+
|
32
|
+
return str;
|
33
|
+
}
|
34
|
+
|
35
|
+
VALUE
|
36
|
+
rb_utf_lstrip(VALUE self, VALUE str)
|
37
|
+
{
|
38
|
+
str = rb_utf_dup(str);
|
39
|
+
rb_utf_lstrip_bang(self, str);
|
40
|
+
return str;
|
41
|
+
}
|
@@ -0,0 +1,51 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Encoding::Character::UTF8.normalize module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
#define SYMBOL2MODE(symbol, mode, id) do { \
|
10
|
+
static ID id_##symbol; \
|
11
|
+
if (id_##symbol == 0) \
|
12
|
+
id_##symbol = rb_intern(#symbol); \
|
13
|
+
if (id == id_##symbol) \
|
14
|
+
return mode; \
|
15
|
+
} while (0)
|
16
|
+
|
17
|
+
static NormalizeMode
|
18
|
+
symbol_to_mode(VALUE symbol)
|
19
|
+
{
|
20
|
+
if (!SYMBOL_P(symbol))
|
21
|
+
rb_raise(rb_eTypeError, "not a symbol");
|
22
|
+
|
23
|
+
ID id = SYM2ID(symbol);
|
24
|
+
|
25
|
+
SYMBOL2MODE(default, NORMALIZE_DEFAULT, id);
|
26
|
+
SYMBOL2MODE(nfd, NORMALIZE_NFD, id);
|
27
|
+
SYMBOL2MODE(default_compose, NORMALIZE_DEFAULT_COMPOSE, id);
|
28
|
+
SYMBOL2MODE(nfc, NORMALIZE_NFC, id);
|
29
|
+
SYMBOL2MODE(all, NORMALIZE_ALL, id);
|
30
|
+
SYMBOL2MODE(nfkd, NORMALIZE_NFKD, id);
|
31
|
+
SYMBOL2MODE(all_compose, NORMALIZE_ALL_COMPOSE, id);
|
32
|
+
SYMBOL2MODE(nfkc, NORMALIZE_NFKC, id);
|
33
|
+
|
34
|
+
rb_raise(rb_eArgError, "unknown symbol");
|
35
|
+
}
|
36
|
+
|
37
|
+
VALUE
|
38
|
+
rb_utf_normalize(int argc, VALUE *argv, UNUSED(VALUE self))
|
39
|
+
{
|
40
|
+
VALUE str, rbmode;
|
41
|
+
|
42
|
+
NormalizeMode mode = NORMALIZE_DEFAULT;
|
43
|
+
if (rb_scan_args(argc, argv, "11", &str, &rbmode) == 2)
|
44
|
+
mode = symbol_to_mode(rbmode);
|
45
|
+
|
46
|
+
StringValue(str);
|
47
|
+
|
48
|
+
return rb_utf_alloc_using(utf_normalize_n(RSTRING(str)->ptr,
|
49
|
+
mode,
|
50
|
+
RSTRING(str)->len));
|
51
|
+
}
|
@@ -0,0 +1,14 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.oct module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
#include "rb_utf_internal_bignum.h"
|
9
|
+
|
10
|
+
VALUE
|
11
|
+
rb_utf_oct(UNUSED(VALUE self), VALUE str)
|
12
|
+
{
|
13
|
+
return rb_utf_to_inum(str, -8, false);
|
14
|
+
}
|
@@ -0,0 +1,13 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.reverse module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
VALUE
|
10
|
+
rb_utf_reverse(UNUSED(VALUE self), VALUE str)
|
11
|
+
{
|
12
|
+
return rb_utf_alloc_using(utf_reverse(StringValuePtr(str)));
|
13
|
+
}
|
@@ -0,0 +1,88 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.rindex module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
static long
|
10
|
+
rb_utf_rindex(VALUE str, VALUE sub, long offset)
|
11
|
+
{
|
12
|
+
if (RSTRING(str)->len < RSTRING(sub)->len)
|
13
|
+
return -1;
|
14
|
+
|
15
|
+
char *s, *end;
|
16
|
+
rb_utf_begin_from_offset_validated(str, offset, &s, &end);
|
17
|
+
|
18
|
+
if (RSTRING(sub)->len == 0)
|
19
|
+
return utf_pointer_to_offset(RSTRING(str)->ptr, s);
|
20
|
+
|
21
|
+
char *s_begin = RSTRING(str)->ptr;
|
22
|
+
char *t = RSTRING(sub)->ptr;
|
23
|
+
long len = RSTRING(sub)->len;
|
24
|
+
while (s >= s_begin) {
|
25
|
+
if (rb_memcmp(s, t, len) == 0)
|
26
|
+
return utf_pointer_to_offset(s_begin, s);
|
27
|
+
s--;
|
28
|
+
}
|
29
|
+
|
30
|
+
return -1;
|
31
|
+
}
|
32
|
+
|
33
|
+
VALUE
|
34
|
+
rb_utf_rindex_m(int argc, VALUE *argv, UNUSED(VALUE self))
|
35
|
+
{
|
36
|
+
VALUE str, sub, rboffset;
|
37
|
+
|
38
|
+
rb_scan_args(argc, argv, "21", &str, &sub, &rboffset);
|
39
|
+
|
40
|
+
StringValue(str);
|
41
|
+
|
42
|
+
long offset = (argc == 3) ? NUM2LONG(rboffset) : RSTRING(str)->len;
|
43
|
+
|
44
|
+
char *begin, *end;
|
45
|
+
rb_utf_begin_from_offset(str, offset, &begin, &end);
|
46
|
+
if (begin == NULL) {
|
47
|
+
if (offset <= 0) {
|
48
|
+
if (TYPE(sub) == T_REGEXP)
|
49
|
+
rb_backref_set(Qnil);
|
50
|
+
|
51
|
+
return Qnil;
|
52
|
+
}
|
53
|
+
|
54
|
+
begin = end;
|
55
|
+
/* TODO: this converting back and forward can be optimized away
|
56
|
+
* if rb_utf_index_regexp() and rb_utf_rindex() were split up
|
57
|
+
* into two additional functions, adding
|
58
|
+
* rb_utf_index_regexp_pointer() and rb_utf_rindex_pointer(),
|
59
|
+
* so that one can pass a pointer to start at immediately
|
60
|
+
* instead of an offset that gets calculated into a pointer. */
|
61
|
+
offset = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
|
62
|
+
}
|
63
|
+
|
64
|
+
switch (TYPE(sub)) {
|
65
|
+
case T_REGEXP:
|
66
|
+
if (RREGEXP(sub)->len > 0)
|
67
|
+
offset = rb_utf_index_regexp(str, begin, end, sub,
|
68
|
+
offset, true);
|
69
|
+
break;
|
70
|
+
default: {
|
71
|
+
VALUE tmp = rb_check_string_type(sub);
|
72
|
+
if (NIL_P(tmp))
|
73
|
+
rb_raise(rb_eTypeError, "type mismatch: %s given",
|
74
|
+
rb_obj_classname(sub));
|
75
|
+
|
76
|
+
sub = tmp;
|
77
|
+
}
|
78
|
+
/* fall through */
|
79
|
+
case T_STRING:
|
80
|
+
offset = rb_utf_rindex(str, sub, offset);
|
81
|
+
break;
|
82
|
+
}
|
83
|
+
|
84
|
+
if (offset < 0)
|
85
|
+
return Qnil;
|
86
|
+
|
87
|
+
return LONG2NUM(offset);
|
88
|
+
}
|