utf8 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/MIT-LICENSE +20 -0
- data/README.rdoc +46 -0
- data/Rakefile +11 -0
- data/benchmark/active_support.rb +61 -0
- data/benchmark/test.txt +693 -0
- data/ext/utf8/ext.c +21 -0
- data/ext/utf8/ext.h +17 -0
- data/ext/utf8/extconf.rb +7 -0
- data/ext/utf8/string_scanner_utf8.c +68 -0
- data/ext/utf8/string_scanner_utf8.h +6 -0
- data/ext/utf8/string_utf8.c +224 -0
- data/ext/utf8/string_utf8.h +6 -0
- data/ext/utf8/utf8.c +80 -0
- data/ext/utf8/utf8.h +7 -0
- data/lib/utf8.rb +5 -0
- data/lib/utf8/string.rb +19 -0
- data/lib/utf8/string_scanner.rb +21 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/string_scanner_spec.rb +48 -0
- data/spec/string_spec.rb +151 -0
- data/utf8.gemspec +37 -0
- metadata +120 -0
data/ext/utf8/ext.c
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
#include "ext.h"
|
2
|
+
|
3
|
+
#include "string_utf8.h"
|
4
|
+
#include "string_scanner_utf8.h"
|
5
|
+
|
6
|
+
VALUE intern_as_utf8;
|
7
|
+
|
8
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
9
|
+
rb_encoding *utf8Encoding;
|
10
|
+
#endif
|
11
|
+
|
12
|
+
void Init_utf8() {
|
13
|
+
init_String_UTF8();
|
14
|
+
init_StringScanner_UTF8();
|
15
|
+
|
16
|
+
intern_as_utf8 = rb_intern("as_utf8");
|
17
|
+
|
18
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
19
|
+
utf8Encoding = rb_utf8_encoding();
|
20
|
+
#endif
|
21
|
+
}
|
data/ext/utf8/ext.h
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#ifndef UTF8_EXT_H
|
2
|
+
#define UTF8_EXT_H
|
3
|
+
|
4
|
+
#include <ruby.h>
|
5
|
+
|
6
|
+
/*
 * AS_UTF8(str): replaces the lvalue `str` with the result of calling
 * `as_utf8` on it. On Rubies that ship <ruby/encoding.h> the result is also
 * associated with the UTF-8 encoding.
 *
 * The encoding-aware form is wrapped in do { } while (0) so that it expands
 * to exactly one statement and stays correct inside an unbraced if/else.
 * `str` is evaluated more than once, so it must be a plain variable.
 */
#ifdef HAVE_RUBY_ENCODING_H
#include <ruby/encoding.h>
extern rb_encoding *utf8Encoding;
#define AS_UTF8(_str) \
  do { \
    (_str) = rb_funcall((_str), intern_as_utf8, 0); \
    rb_enc_associate((_str), utf8Encoding); \
  } while (0)

#else
#define AS_UTF8(_str) ((_str) = rb_funcall((_str), intern_as_utf8, 0))
#endif
|
16
|
+
|
17
|
+
#endif
|
data/ext/utf8/string_scanner_utf8.c
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
#include "ext.h"
|
2
|
+
#include "ruby/regex.h"
|
3
|
+
#include "utf8.h"
|
4
|
+
|
5
|
+
extern ID intern_as_utf8;
|
6
|
+
|
7
|
+
struct strscanner {
|
8
|
+
/* multi-purpose flags */
|
9
|
+
unsigned long flags;
|
10
|
+
|
11
|
+
/* the string to scan */
|
12
|
+
VALUE str;
|
13
|
+
|
14
|
+
/* scan pointers */
|
15
|
+
long prev; /* legal only when MATCHED_P(s) */
|
16
|
+
long curr; /* always legal */
|
17
|
+
|
18
|
+
/* the regexp register; legal only when MATCHED_P(s) */
|
19
|
+
struct re_registers regs;
|
20
|
+
};
|
21
|
+
|
22
|
+
#define GET_SCANNER(obj, var) \
|
23
|
+
Data_Get_Struct(obj, struct strscanner, var); \
|
24
|
+
if (NIL_P(var->str)) rb_raise(rb_eArgError, "uninitialized StringScanner object");
|
25
|
+
|
26
|
+
/*
|
27
|
+
* Document-class: StringScanner::UTF8
|
28
|
+
*/
|
29
|
+
|
30
|
+
/*
|
31
|
+
* call-seq: getch
|
32
|
+
*
|
33
|
+
* Works like StringScanner#getch but is UTF8-aware
|
34
|
+
*/
|
35
|
+
static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
|
36
|
+
unsigned char *str;
|
37
|
+
size_t len;
|
38
|
+
struct strscanner *scanner;
|
39
|
+
VALUE utf8Str;
|
40
|
+
int8_t lastCharLen=0;
|
41
|
+
GET_SCANNER(self, scanner);
|
42
|
+
|
43
|
+
str = (unsigned char *)RSTRING_PTR(scanner->str);
|
44
|
+
len = RSTRING_LEN(scanner->str);
|
45
|
+
|
46
|
+
if (len > 0 && len > scanner->curr) {
|
47
|
+
lastCharLen = utf8CharLen(str, len);
|
48
|
+
utf8Str = rb_str_new((char *)str+scanner->curr, lastCharLen);
|
49
|
+
scanner->curr += lastCharLen;
|
50
|
+
AS_UTF8(utf8Str);
|
51
|
+
return utf8Str;
|
52
|
+
} else {
|
53
|
+
return Qnil;
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
void init_StringScanner_UTF8() {
|
58
|
+
ID intern_string_scanner = rb_intern("StringScanner");
|
59
|
+
VALUE rb_cStringScanner, rb_cStringScanner_UTF8;
|
60
|
+
|
61
|
+
if (!rb_const_defined(rb_cObject, intern_string_scanner)) {
|
62
|
+
rb_require("strscan");
|
63
|
+
}
|
64
|
+
rb_cStringScanner = rb_const_get(rb_cObject, intern_string_scanner);
|
65
|
+
rb_cStringScanner_UTF8 = rb_define_class_under(rb_cStringScanner, "UTF8", rb_cStringScanner);
|
66
|
+
|
67
|
+
rb_define_method(rb_cStringScanner_UTF8, "getch", rb_cStringScanner_UTF8_getch, 0);
|
68
|
+
}
|
data/ext/utf8/string_utf8.c
ADDED
@@ -0,0 +1,224 @@
|
|
1
|
+
#include "ext.h"
|
2
|
+
#include "utf8.h"
|
3
|
+
|
4
|
+
extern VALUE intern_as_utf8;
|
5
|
+
|
6
|
+
/*
|
7
|
+
* Document-class: String::UTF8
|
8
|
+
*/
|
9
|
+
|
10
|
+
/*
|
11
|
+
* call-seq: length
|
12
|
+
*
|
13
|
+
* Returns the number of UTF8 characters in this string
|
14
|
+
*/
|
15
|
+
static VALUE rb_cString_UTF8_length(VALUE self) {
|
16
|
+
unsigned char *str = (unsigned char *)RSTRING_PTR(self);
|
17
|
+
size_t len = RSTRING_LEN(self);
|
18
|
+
size_t utf8_len = 0;
|
19
|
+
|
20
|
+
utf8_len = utf8CharCount(str, len);
|
21
|
+
|
22
|
+
return INT2FIX(utf8_len);
|
23
|
+
}
|
24
|
+
|
25
|
+
/*
|
26
|
+
* call-seq: each_char {|utf8_char| ...}
|
27
|
+
*
|
28
|
+
* Iterates over the string, yielding one UTF8 character at a time
|
29
|
+
*/
|
30
|
+
static VALUE rb_cString_UTF8_each_char(VALUE self) {
|
31
|
+
unsigned char *str = (unsigned char *)RSTRING_PTR(self);
|
32
|
+
size_t len = RSTRING_LEN(self), i=0;
|
33
|
+
int8_t lastCharLen=0;
|
34
|
+
VALUE utf8Str;
|
35
|
+
|
36
|
+
// this will return an Enumerator wrapping this string, yielding this method
|
37
|
+
// when Enumerator#each is called
|
38
|
+
RETURN_ENUMERATOR(self, 0, 0);
|
39
|
+
|
40
|
+
for(; i<len; i+=lastCharLen) {
|
41
|
+
lastCharLen = utf8CharLen(str, len);
|
42
|
+
utf8Str = rb_str_new((char *)str+i, lastCharLen);
|
43
|
+
AS_UTF8(utf8Str);
|
44
|
+
rb_yield(utf8Str);
|
45
|
+
}
|
46
|
+
|
47
|
+
return self;
|
48
|
+
}
|
49
|
+
|
50
|
+
/*
|
51
|
+
* Works like String#[] but taking into account UTF8 character boundaries
|
52
|
+
*
|
53
|
+
* This method doesn't currently (and may never) support Regexp parameters
|
54
|
+
* It also doesn't support a String parameter (yet)
|
55
|
+
*/
|
56
|
+
static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
57
|
+
unsigned char *str = (unsigned char *)RSTRING_PTR(self), *start = str;
|
58
|
+
size_t len = RSTRING_LEN(self);
|
59
|
+
VALUE utf8Str;
|
60
|
+
|
61
|
+
if (len == 0) return Qnil;
|
62
|
+
|
63
|
+
if (argc == 2) {
|
64
|
+
if (TYPE(argv[0]) == T_REGEXP) {
|
65
|
+
rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
|
66
|
+
}
|
67
|
+
|
68
|
+
// [offset, length] syntax
|
69
|
+
long wantPos = NUM2LONG(argv[0]), curPos = 0, wantLen = NUM2LONG(argv[1]);
|
70
|
+
int8_t curCharLen = 0;
|
71
|
+
unsigned char *offset = str;
|
72
|
+
|
73
|
+
if (wantLen < 0) {
|
74
|
+
return Qnil;
|
75
|
+
} else if (wantLen == 0) {
|
76
|
+
utf8Str = rb_str_new("", 0);
|
77
|
+
AS_UTF8(utf8Str);
|
78
|
+
return utf8Str;
|
79
|
+
}
|
80
|
+
|
81
|
+
if (wantPos < 0) {
|
82
|
+
long char_cnt = utf8CharCount(str, len);
|
83
|
+
if ((wantPos * -1) > char_cnt) {
|
84
|
+
return Qnil;
|
85
|
+
}
|
86
|
+
wantPos = char_cnt + wantPos;
|
87
|
+
}
|
88
|
+
|
89
|
+
// scan until starting position
|
90
|
+
curCharLen = utf8CharLen(str, len);
|
91
|
+
while (curPos < wantPos) {
|
92
|
+
// if we're about to step out of bounds, return nil
|
93
|
+
if ((size_t)(str-start) >= len) {
|
94
|
+
return Qnil;
|
95
|
+
}
|
96
|
+
|
97
|
+
str += curCharLen;
|
98
|
+
curCharLen = utf8CharLen(str, len);
|
99
|
+
curPos++;
|
100
|
+
}
|
101
|
+
|
102
|
+
// now scan until we have the number of chars asked for
|
103
|
+
curPos = 1;
|
104
|
+
offset = str;
|
105
|
+
str += curCharLen;
|
106
|
+
curCharLen = utf8CharLen(str, len);
|
107
|
+
while (curPos < wantLen) {
|
108
|
+
// if we're about to step out of bounds, stop
|
109
|
+
if ((size_t)(str-start) >= len) {
|
110
|
+
break;
|
111
|
+
}
|
112
|
+
|
113
|
+
str += curCharLen;
|
114
|
+
curCharLen = utf8CharLen(str, len);
|
115
|
+
curPos++;
|
116
|
+
}
|
117
|
+
|
118
|
+
utf8Str = rb_str_new((char *)offset, str-offset);
|
119
|
+
AS_UTF8(utf8Str);
|
120
|
+
return utf8Str;
|
121
|
+
}
|
122
|
+
|
123
|
+
if (argc != 1) {
|
124
|
+
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
|
125
|
+
}
|
126
|
+
|
127
|
+
// [Fixnum] syntax
|
128
|
+
if (TYPE(argv[0]) == T_FIXNUM) {
|
129
|
+
long wantPos = NUM2LONG(argv[0]), curPos = 0;
|
130
|
+
int8_t curCharLen = 0;
|
131
|
+
|
132
|
+
if (wantPos < 0) {
|
133
|
+
long char_cnt = utf8CharCount(str, len);
|
134
|
+
if ((wantPos * -1) > char_cnt) {
|
135
|
+
return Qnil;
|
136
|
+
}
|
137
|
+
wantPos = char_cnt + wantPos;
|
138
|
+
}
|
139
|
+
|
140
|
+
curCharLen = utf8CharLen(str, len);
|
141
|
+
while (curPos < wantPos) {
|
142
|
+
// if we're about to step out of bounds, return nil
|
143
|
+
if ((size_t)(str-start) >= len) {
|
144
|
+
return Qnil;
|
145
|
+
}
|
146
|
+
|
147
|
+
str += curCharLen;
|
148
|
+
curCharLen = utf8CharLen(str, len);
|
149
|
+
curPos++;
|
150
|
+
}
|
151
|
+
|
152
|
+
utf8Str = rb_str_new((char *)str, curCharLen);
|
153
|
+
AS_UTF8(utf8Str);
|
154
|
+
return utf8Str;
|
155
|
+
} else {
|
156
|
+
if (TYPE(argv[0]) == T_REGEXP) {
|
157
|
+
rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
|
158
|
+
}
|
159
|
+
|
160
|
+
// [Range] syntax
|
161
|
+
long wantPos, curPos = 0, wantLen, char_cnt = 0;
|
162
|
+
int8_t curCharLen = 0;
|
163
|
+
unsigned char *offset = str;
|
164
|
+
VALUE ret;
|
165
|
+
|
166
|
+
char_cnt = utf8CharCount(str, len);
|
167
|
+
ret = rb_range_beg_len(argv[0], &wantPos, &wantLen, char_cnt, 0);
|
168
|
+
|
169
|
+
if (ret == Qnil) {
|
170
|
+
return Qnil;
|
171
|
+
} else if (ret == Qfalse) {
|
172
|
+
// TODO: wtf do we do :P
|
173
|
+
}
|
174
|
+
|
175
|
+
if (wantLen == 0) {
|
176
|
+
utf8Str = rb_str_new("", 0);
|
177
|
+
AS_UTF8(utf8Str);
|
178
|
+
return utf8Str;
|
179
|
+
}
|
180
|
+
|
181
|
+
// scan until starting position
|
182
|
+
curCharLen = utf8CharLen(str, len);
|
183
|
+
while (curPos < wantPos) {
|
184
|
+
// if we're about to step out of bounds, return ""
|
185
|
+
if ((size_t)(str-start) >= len) {
|
186
|
+
utf8Str = rb_str_new("", 0);
|
187
|
+
AS_UTF8(utf8Str);
|
188
|
+
return utf8Str;
|
189
|
+
}
|
190
|
+
|
191
|
+
str += curCharLen;
|
192
|
+
curCharLen = utf8CharLen(str, len);
|
193
|
+
curPos++;
|
194
|
+
}
|
195
|
+
|
196
|
+
// now scan until we have the number of chars asked for
|
197
|
+
curPos = 1;
|
198
|
+
offset = str;
|
199
|
+
str += curCharLen;
|
200
|
+
curCharLen = utf8CharLen(str, len);
|
201
|
+
while (curPos < wantLen) {
|
202
|
+
// if we're about to step out of bounds, stop
|
203
|
+
if ((size_t)(str-start) >= len) {
|
204
|
+
break;
|
205
|
+
}
|
206
|
+
|
207
|
+
str += curCharLen;
|
208
|
+
curCharLen = utf8CharLen(str, len);
|
209
|
+
curPos++;
|
210
|
+
}
|
211
|
+
|
212
|
+
utf8Str = rb_str_new((char *)offset, str-offset);
|
213
|
+
AS_UTF8(utf8Str);
|
214
|
+
return utf8Str;
|
215
|
+
}
|
216
|
+
}
|
217
|
+
|
218
|
+
void init_String_UTF8() {
|
219
|
+
VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
|
220
|
+
|
221
|
+
rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
|
222
|
+
rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, 0);
|
223
|
+
rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
|
224
|
+
}
|
data/ext/utf8/utf8.c
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <stdint.h>
|
3
|
+
|
4
|
+
/*
 * Scans the current position of the buffer
 * returning the length of this UTF8 character
 *
 * in     - pointer to the first byte of the character
 * in_len - number of readable bytes starting at in
 *
 * Returns 1..4 for a well-formed sequence, 0 when the lead byte announces
 * more bytes than in_len provides (truncated sequence), and -1 when in_len
 * is 0, the lead byte is not a valid UTF8 lead byte, or a continuation byte
 * is malformed.
 *
 * Deliberately not declared `inline`: under C99 and later, a plain `inline`
 * definition does not emit an external symbol, so calls from the other
 * translation units of the extension would fail to link.
 */
int8_t utf8CharLen(unsigned char *in, size_t in_len) {
  unsigned char lead;
  int8_t expected, i;

  if (in_len == 0) {
    return -1;
  }

  lead = in[0];
  if (lead <= 0x7f) {
    /* single byte */
    return 1;
  } else if ((lead >> 5) == 0x06) {
    /* two byte: 110xxxxx */
    expected = 2;
  } else if ((lead >> 4) == 0x0e) {
    /* three byte: 1110xxxx */
    expected = 3;
  } else if ((lead >> 3) == 0x1e) {
    /* four byte: 11110xxx */
    expected = 4;
  } else {
    /* stray continuation byte or unsupported lead byte */
    return -1;
  }

  /* every trailing byte must exist and look like 10xxxxxx */
  for (i = 1; i < expected; i++) {
    if ((size_t)i >= in_len) {
      return 0;
    }
    if ((in[i] >> 6) != 0x02) {
      return -1;
    }
  }

  return expected;
}
|
60
|
+
|
61
|
+
/*
 * Scans the current position of the buffer
 * returning the total number of UTF8 characters found
 *
 * in     - pointer to the first byte of the buffer
 * in_len - number of bytes in the buffer
 *
 * A byte that does not begin a complete, well-formed sequence (utf8CharLen
 * returns 0 or -1) is counted as one character and skipped on its own.
 * Without that guard a non-positive length would stall the cursor or move
 * it backwards, looping forever or reading outside the buffer.
 */
size_t utf8CharCount(unsigned char *in, size_t in_len) {
  size_t total = 0, leftOver = in_len;
  unsigned char *cursor = in;

  while (leftOver > 0) {
    int8_t len = utf8CharLen(cursor, leftOver);
    /* a positive len never exceeds leftOver, so this cannot underflow */
    size_t step = (len > 0) ? (size_t)len : 1;

    cursor += step;
    leftOver -= step;
    total++;
  }

  return total;
}
|