utf8 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/MIT-LICENSE +20 -0
- data/README.rdoc +46 -0
- data/Rakefile +11 -0
- data/benchmark/active_support.rb +61 -0
- data/benchmark/test.txt +693 -0
- data/ext/utf8/ext.c +21 -0
- data/ext/utf8/ext.h +17 -0
- data/ext/utf8/extconf.rb +7 -0
- data/ext/utf8/string_scanner_utf8.c +68 -0
- data/ext/utf8/string_scanner_utf8.h +6 -0
- data/ext/utf8/string_utf8.c +224 -0
- data/ext/utf8/string_utf8.h +6 -0
- data/ext/utf8/utf8.c +80 -0
- data/ext/utf8/utf8.h +7 -0
- data/lib/utf8.rb +5 -0
- data/lib/utf8/string.rb +19 -0
- data/lib/utf8/string_scanner.rb +21 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/string_scanner_spec.rb +48 -0
- data/spec/string_spec.rb +151 -0
- data/utf8.gemspec +37 -0
- metadata +120 -0
data/ext/utf8/ext.c
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
#include "ext.h"
|
2
|
+
|
3
|
+
#include "string_utf8.h"
|
4
|
+
#include "string_scanner_utf8.h"
|
5
|
+
|
6
|
+
VALUE intern_as_utf8;
|
7
|
+
|
8
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
9
|
+
rb_encoding *utf8Encoding;
|
10
|
+
#endif
|
11
|
+
|
12
|
+
void Init_utf8() {
|
13
|
+
init_String_UTF8();
|
14
|
+
init_StringScanner_UTF8();
|
15
|
+
|
16
|
+
intern_as_utf8 = rb_intern("as_utf8");
|
17
|
+
|
18
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
19
|
+
utf8Encoding = rb_utf8_encoding();
|
20
|
+
#endif
|
21
|
+
}
|
data/ext/utf8/ext.h
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#ifndef UTF8_EXT_H
|
2
|
+
#define UTF8_EXT_H
|
3
|
+
|
4
|
+
#include <ruby.h>
|
5
|
+
|
6
|
+
#ifdef HAVE_RUBY_ENCODING_H
#include <ruby/encoding.h>
extern rb_encoding *utf8Encoding;
/*
 * Replaces _str with the result of calling its as_utf8 method and then
 * associates that result with the UTF-8 encoding.
 *
 * _str must be an assignable VALUE variable. The body is wrapped in
 * do { } while (0) so the macro expands to exactly one statement and stays
 * correct when used as the body of an unbraced if/else.
 */
#define AS_UTF8(_str) do { \
  (_str) = rb_funcall((_str), intern_as_utf8, 0); \
  rb_enc_associate((_str), utf8Encoding); \
} while (0)

#else
/*
 * Replaces _str with the result of calling its as_utf8 method.
 * There is no encoding API in this configuration, so nothing else is done.
 */
#define AS_UTF8(_str) do { \
  (_str) = rb_funcall((_str), intern_as_utf8, 0); \
} while (0)
#endif
|
16
|
+
|
17
|
+
#endif
|
data/ext/utf8/string_scanner_utf8.c
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
#include "ext.h"
|
2
|
+
#include "ruby/regex.h"
|
3
|
+
#include "utf8.h"
|
4
|
+
|
5
|
+
extern ID intern_as_utf8;
|
6
|
+
|
7
|
+
struct strscanner {
|
8
|
+
/* multi-purpose flags */
|
9
|
+
unsigned long flags;
|
10
|
+
|
11
|
+
/* the string to scan */
|
12
|
+
VALUE str;
|
13
|
+
|
14
|
+
/* scan pointers */
|
15
|
+
long prev; /* legal only when MATCHED_P(s) */
|
16
|
+
long curr; /* always legal */
|
17
|
+
|
18
|
+
/* the regexp register; legal only when MATCHED_P(s) */
|
19
|
+
struct re_registers regs;
|
20
|
+
};
|
21
|
+
|
22
|
+
/*
 * Loads the struct strscanner wrapped by obj into var and raises
 * ArgumentError when the scanner has no string yet.
 *
 * Wrapped in do { } while (0) so the two steps always travel together as a
 * single statement, even under an unbraced if/else.
 */
#define GET_SCANNER(obj, var) do { \
  Data_Get_Struct((obj), struct strscanner, (var)); \
  if (NIL_P((var)->str)) rb_raise(rb_eArgError, "uninitialized StringScanner object"); \
} while (0)
|
25
|
+
|
26
|
+
/*
|
27
|
+
* Document-class: StringScanner::UTF8
|
28
|
+
*/
|
29
|
+
|
30
|
+
/*
|
31
|
+
* call-seq: getch
|
32
|
+
*
|
33
|
+
* Works like StringScanner#getch but is UTF8-aware
|
34
|
+
*/
|
35
|
+
static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
|
36
|
+
unsigned char *str;
|
37
|
+
size_t len;
|
38
|
+
struct strscanner *scanner;
|
39
|
+
VALUE utf8Str;
|
40
|
+
int8_t lastCharLen=0;
|
41
|
+
GET_SCANNER(self, scanner);
|
42
|
+
|
43
|
+
str = (unsigned char *)RSTRING_PTR(scanner->str);
|
44
|
+
len = RSTRING_LEN(scanner->str);
|
45
|
+
|
46
|
+
if (len > 0 && len > scanner->curr) {
|
47
|
+
lastCharLen = utf8CharLen(str, len);
|
48
|
+
utf8Str = rb_str_new((char *)str+scanner->curr, lastCharLen);
|
49
|
+
scanner->curr += lastCharLen;
|
50
|
+
AS_UTF8(utf8Str);
|
51
|
+
return utf8Str;
|
52
|
+
} else {
|
53
|
+
return Qnil;
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
void init_StringScanner_UTF8() {
|
58
|
+
ID intern_string_scanner = rb_intern("StringScanner");
|
59
|
+
VALUE rb_cStringScanner, rb_cStringScanner_UTF8;
|
60
|
+
|
61
|
+
if (!rb_const_defined(rb_cObject, intern_string_scanner)) {
|
62
|
+
rb_require("strscan");
|
63
|
+
}
|
64
|
+
rb_cStringScanner = rb_const_get(rb_cObject, intern_string_scanner);
|
65
|
+
rb_cStringScanner_UTF8 = rb_define_class_under(rb_cStringScanner, "UTF8", rb_cStringScanner);
|
66
|
+
|
67
|
+
rb_define_method(rb_cStringScanner_UTF8, "getch", rb_cStringScanner_UTF8_getch, 0);
|
68
|
+
}
|
data/ext/utf8/string_utf8.c
ADDED
@@ -0,0 +1,224 @@
|
|
1
|
+
#include "ext.h"
|
2
|
+
#include "utf8.h"
|
3
|
+
|
4
|
+
extern VALUE intern_as_utf8;
|
5
|
+
|
6
|
+
/*
|
7
|
+
* Document-class: String::UTF8
|
8
|
+
*/
|
9
|
+
|
10
|
+
/*
|
11
|
+
* call-seq: length
|
12
|
+
*
|
13
|
+
* Returns the number of UTF8 characters in this string
|
14
|
+
*/
|
15
|
+
static VALUE rb_cString_UTF8_length(VALUE self) {
|
16
|
+
unsigned char *str = (unsigned char *)RSTRING_PTR(self);
|
17
|
+
size_t len = RSTRING_LEN(self);
|
18
|
+
size_t utf8_len = 0;
|
19
|
+
|
20
|
+
utf8_len = utf8CharCount(str, len);
|
21
|
+
|
22
|
+
return INT2FIX(utf8_len);
|
23
|
+
}
|
24
|
+
|
25
|
+
/*
|
26
|
+
* call-seq: each_char {|utf8_char| ...}
|
27
|
+
*
|
28
|
+
* Iterates over the string, yielding one UTF8 character at a time
|
29
|
+
*/
|
30
|
+
static VALUE rb_cString_UTF8_each_char(VALUE self) {
|
31
|
+
unsigned char *str = (unsigned char *)RSTRING_PTR(self);
|
32
|
+
size_t len = RSTRING_LEN(self), i=0;
|
33
|
+
int8_t lastCharLen=0;
|
34
|
+
VALUE utf8Str;
|
35
|
+
|
36
|
+
// this will return an Enumerator wrapping this string, yielding this method
|
37
|
+
// when Enumerator#each is called
|
38
|
+
RETURN_ENUMERATOR(self, 0, 0);
|
39
|
+
|
40
|
+
for(; i<len; i+=lastCharLen) {
|
41
|
+
lastCharLen = utf8CharLen(str, len);
|
42
|
+
utf8Str = rb_str_new((char *)str+i, lastCharLen);
|
43
|
+
AS_UTF8(utf8Str);
|
44
|
+
rb_yield(utf8Str);
|
45
|
+
}
|
46
|
+
|
47
|
+
return self;
|
48
|
+
}
|
49
|
+
|
50
|
+
/*
|
51
|
+
* Works like String#[] but taking into account UTF8 character boundaries
|
52
|
+
*
|
53
|
+
* This method doesn't currently (and may never) support Regexp parameters
|
54
|
+
* It also doesn't support a String parameter (yet)
|
55
|
+
*/
|
56
|
+
static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
57
|
+
unsigned char *str = (unsigned char *)RSTRING_PTR(self), *start = str;
|
58
|
+
size_t len = RSTRING_LEN(self);
|
59
|
+
VALUE utf8Str;
|
60
|
+
|
61
|
+
if (len == 0) return Qnil;
|
62
|
+
|
63
|
+
if (argc == 2) {
|
64
|
+
if (TYPE(argv[0]) == T_REGEXP) {
|
65
|
+
rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
|
66
|
+
}
|
67
|
+
|
68
|
+
// [offset, length] syntax
|
69
|
+
long wantPos = NUM2LONG(argv[0]), curPos = 0, wantLen = NUM2LONG(argv[1]);
|
70
|
+
int8_t curCharLen = 0;
|
71
|
+
unsigned char *offset = str;
|
72
|
+
|
73
|
+
if (wantLen < 0) {
|
74
|
+
return Qnil;
|
75
|
+
} else if (wantLen == 0) {
|
76
|
+
utf8Str = rb_str_new("", 0);
|
77
|
+
AS_UTF8(utf8Str);
|
78
|
+
return utf8Str;
|
79
|
+
}
|
80
|
+
|
81
|
+
if (wantPos < 0) {
|
82
|
+
long char_cnt = utf8CharCount(str, len);
|
83
|
+
if ((wantPos * -1) > char_cnt) {
|
84
|
+
return Qnil;
|
85
|
+
}
|
86
|
+
wantPos = char_cnt + wantPos;
|
87
|
+
}
|
88
|
+
|
89
|
+
// scan until starting position
|
90
|
+
curCharLen = utf8CharLen(str, len);
|
91
|
+
while (curPos < wantPos) {
|
92
|
+
// if we're about to step out of bounds, return nil
|
93
|
+
if ((size_t)(str-start) >= len) {
|
94
|
+
return Qnil;
|
95
|
+
}
|
96
|
+
|
97
|
+
str += curCharLen;
|
98
|
+
curCharLen = utf8CharLen(str, len);
|
99
|
+
curPos++;
|
100
|
+
}
|
101
|
+
|
102
|
+
// now scan until we have the number of chars asked for
|
103
|
+
curPos = 1;
|
104
|
+
offset = str;
|
105
|
+
str += curCharLen;
|
106
|
+
curCharLen = utf8CharLen(str, len);
|
107
|
+
while (curPos < wantLen) {
|
108
|
+
// if we're about to step out of bounds, stop
|
109
|
+
if ((size_t)(str-start) >= len) {
|
110
|
+
break;
|
111
|
+
}
|
112
|
+
|
113
|
+
str += curCharLen;
|
114
|
+
curCharLen = utf8CharLen(str, len);
|
115
|
+
curPos++;
|
116
|
+
}
|
117
|
+
|
118
|
+
utf8Str = rb_str_new((char *)offset, str-offset);
|
119
|
+
AS_UTF8(utf8Str);
|
120
|
+
return utf8Str;
|
121
|
+
}
|
122
|
+
|
123
|
+
if (argc != 1) {
|
124
|
+
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
|
125
|
+
}
|
126
|
+
|
127
|
+
// [Fixnum] syntax
|
128
|
+
if (TYPE(argv[0]) == T_FIXNUM) {
|
129
|
+
long wantPos = NUM2LONG(argv[0]), curPos = 0;
|
130
|
+
int8_t curCharLen = 0;
|
131
|
+
|
132
|
+
if (wantPos < 0) {
|
133
|
+
long char_cnt = utf8CharCount(str, len);
|
134
|
+
if ((wantPos * -1) > char_cnt) {
|
135
|
+
return Qnil;
|
136
|
+
}
|
137
|
+
wantPos = char_cnt + wantPos;
|
138
|
+
}
|
139
|
+
|
140
|
+
curCharLen = utf8CharLen(str, len);
|
141
|
+
while (curPos < wantPos) {
|
142
|
+
// if we're about to step out of bounds, return nil
|
143
|
+
if ((size_t)(str-start) >= len) {
|
144
|
+
return Qnil;
|
145
|
+
}
|
146
|
+
|
147
|
+
str += curCharLen;
|
148
|
+
curCharLen = utf8CharLen(str, len);
|
149
|
+
curPos++;
|
150
|
+
}
|
151
|
+
|
152
|
+
utf8Str = rb_str_new((char *)str, curCharLen);
|
153
|
+
AS_UTF8(utf8Str);
|
154
|
+
return utf8Str;
|
155
|
+
} else {
|
156
|
+
if (TYPE(argv[0]) == T_REGEXP) {
|
157
|
+
rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
|
158
|
+
}
|
159
|
+
|
160
|
+
// [Range] syntax
|
161
|
+
long wantPos, curPos = 0, wantLen, char_cnt = 0;
|
162
|
+
int8_t curCharLen = 0;
|
163
|
+
unsigned char *offset = str;
|
164
|
+
VALUE ret;
|
165
|
+
|
166
|
+
char_cnt = utf8CharCount(str, len);
|
167
|
+
ret = rb_range_beg_len(argv[0], &wantPos, &wantLen, char_cnt, 0);
|
168
|
+
|
169
|
+
if (ret == Qnil) {
|
170
|
+
return Qnil;
|
171
|
+
} else if (ret == Qfalse) {
|
172
|
+
// TODO: wtf do we do :P
|
173
|
+
}
|
174
|
+
|
175
|
+
if (wantLen == 0) {
|
176
|
+
utf8Str = rb_str_new("", 0);
|
177
|
+
AS_UTF8(utf8Str);
|
178
|
+
return utf8Str;
|
179
|
+
}
|
180
|
+
|
181
|
+
// scan until starting position
|
182
|
+
curCharLen = utf8CharLen(str, len);
|
183
|
+
while (curPos < wantPos) {
|
184
|
+
// if we're about to step out of bounds, return ""
|
185
|
+
if ((size_t)(str-start) >= len) {
|
186
|
+
utf8Str = rb_str_new("", 0);
|
187
|
+
AS_UTF8(utf8Str);
|
188
|
+
return utf8Str;
|
189
|
+
}
|
190
|
+
|
191
|
+
str += curCharLen;
|
192
|
+
curCharLen = utf8CharLen(str, len);
|
193
|
+
curPos++;
|
194
|
+
}
|
195
|
+
|
196
|
+
// now scan until we have the number of chars asked for
|
197
|
+
curPos = 1;
|
198
|
+
offset = str;
|
199
|
+
str += curCharLen;
|
200
|
+
curCharLen = utf8CharLen(str, len);
|
201
|
+
while (curPos < wantLen) {
|
202
|
+
// if we're about to step out of bounds, stop
|
203
|
+
if ((size_t)(str-start) >= len) {
|
204
|
+
break;
|
205
|
+
}
|
206
|
+
|
207
|
+
str += curCharLen;
|
208
|
+
curCharLen = utf8CharLen(str, len);
|
209
|
+
curPos++;
|
210
|
+
}
|
211
|
+
|
212
|
+
utf8Str = rb_str_new((char *)offset, str-offset);
|
213
|
+
AS_UTF8(utf8Str);
|
214
|
+
return utf8Str;
|
215
|
+
}
|
216
|
+
}
|
217
|
+
|
218
|
+
void init_String_UTF8() {
|
219
|
+
VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
|
220
|
+
|
221
|
+
rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
|
222
|
+
rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, 0);
|
223
|
+
rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
|
224
|
+
}
|
data/ext/utf8/utf8.c
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <stdint.h>
|
3
|
+
|
4
|
+
/*
 * Scans the current position of the buffer
 * returning the length of this UTF8 character
 *
 * in     - first byte of the character to measure
 * in_len - number of readable bytes starting at in
 *
 * Returns:
 *   1..4  the byte length of a sequence whose lead byte and continuation
 *         bytes all have the expected bit patterns
 *   0     the lead byte announces more bytes than in_len provides
 *   -1    in_len is 0, the lead byte is not a valid lead byte, or one of
 *         the following bytes is not a continuation byte (10xxxxxx)
 *
 * Only the bit patterns are checked; nothing here rejects particular code
 * point values.
 *
 * Deliberately not declared inline: a plain C99/C11 "inline" definition
 * does not emit an external symbol, which leaves callers in other
 * translation units without anything to link against.
 */
int8_t utf8CharLen(unsigned char *in, size_t in_len) {
  unsigned char lead;
  int8_t expected;
  size_t i;

  if (in_len == 0) {
    return -1;
  }

  lead = in[0];
  if (lead <= 0x7f) {
    /* single byte */
    return 1;
  } else if ((lead >> 5) == 0x06) {
    /* two byte */
    expected = 2;
  } else if ((lead >> 4) == 0x0e) {
    /* three byte */
    expected = 3;
  } else if ((lead >> 3) == 0x1e) {
    /* four byte */
    expected = 4;
  } else {
    /* stray continuation byte or unsupported lead byte */
    return -1;
  }

  /* check each continuation byte, never reading at or past in_len */
  for (i = 1; i < (size_t)expected; i++) {
    if (i >= in_len) {
      return 0;
    }
    if ((in[i] >> 6) != 0x02) {
      return -1;
    }
  }

  return expected;
}
|
60
|
+
|
61
|
+
/*
 * Scans the current position of the buffer
 * returning the total number of UTF8 characters found
 *
 * in     - first byte of the buffer
 * in_len - number of readable bytes starting at in
 *
 * A byte that does not begin a complete, well-formed sequence is counted as
 * one character of its own. utf8CharLen reports such bytes as -1 or 0;
 * applying those values to the cursor directly would move it backwards or
 * leave it in place, so they are replaced by a one-byte step here.
 */
size_t utf8CharCount(unsigned char *in, size_t in_len) {
  size_t total = 0, leftOver = in_len;
  unsigned char *cursor = in;

  while (leftOver > 0) {
    int8_t len = utf8CharLen(cursor, leftOver);
    if (len < 1) {
      len = 1;
    }

    /* utf8CharLen never reports more bytes than leftOver, so no underflow */
    leftOver -= (size_t)len;
    cursor += len;
    total++;
  }

  return total;
}
|