utf8 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/ext/utf8/ext.c ADDED
@@ -0,0 +1,21 @@
1
+ #include "ext.h"
2
+
3
+ #include "string_utf8.h"
4
+ #include "string_scanner_utf8.h"
5
+
6
+ VALUE intern_as_utf8;
7
+
8
+ #ifdef HAVE_RUBY_ENCODING_H
9
+ rb_encoding *utf8Encoding;
10
+ #endif
11
+
12
+ void Init_utf8() {
13
+ init_String_UTF8();
14
+ init_StringScanner_UTF8();
15
+
16
+ intern_as_utf8 = rb_intern("as_utf8");
17
+
18
+ #ifdef HAVE_RUBY_ENCODING_H
19
+ utf8Encoding = rb_utf8_encoding();
20
+ #endif
21
+ }
data/ext/utf8/ext.h ADDED
@@ -0,0 +1,17 @@
1
+ #ifndef UTF8_EXT_H
2
+ #define UTF8_EXT_H
3
+
4
+ #include <ruby.h>
5
+
6
#ifdef HAVE_RUBY_ENCODING_H
#include <ruby/encoding.h>
extern rb_encoding *utf8Encoding;
/*
 * Replaces _str with the result of calling its `as_utf8` method and then
 * associates that result with the UTF-8 encoding.
 *
 * _str must be an assignable VALUE and is evaluated more than once, so pass
 * a plain variable. An `intern_as_utf8` declaration must be visible where
 * the macro is used.
 *
 * Wrapped in do/while(0) so both statements stay together when the macro is
 * used as the body of a brace-less if/else.
 */
#define AS_UTF8(_str) \
  do { \
    (_str) = rb_funcall((_str), intern_as_utf8, 0); \
    rb_enc_associate((_str), utf8Encoding); \
  } while (0)

#else
/*
 * Replaces _str with the result of calling its `as_utf8` method.
 * No encoding API is available in this build, so nothing is tagged.
 */
#define AS_UTF8(_str) ((_str) = rb_funcall((_str), intern_as_utf8, 0))
#endif
16
+
17
+ #endif
@@ -0,0 +1,7 @@
1
# Build configuration for the utf8 native extension
require 'mkmf'
require 'rbconfig'

# Flags used for every build
$CFLAGS.concat(' -Wall -funroll-loops')

# Extra warnings, no optimisation and full debug info when DEBUG is set
$CFLAGS.concat(' -Wextra -O0 -ggdb3') unless ENV['DEBUG'].nil?

create_makefile("utf8")
@@ -0,0 +1,68 @@
1
+ #include "ext.h"
2
+ #include "ruby/regex.h"
3
+ #include "utf8.h"
4
+
5
+ extern ID intern_as_utf8;
6
+
7
+ struct strscanner {
8
+ /* multi-purpose flags */
9
+ unsigned long flags;
10
+
11
+ /* the string to scan */
12
+ VALUE str;
13
+
14
+ /* scan pointers */
15
+ long prev; /* legal only when MATCHED_P(s) */
16
+ long curr; /* always legal */
17
+
18
+ /* the regexp register; legal only when MATCHED_P(s) */
19
+ struct re_registers regs;
20
+ };
21
+
22
/*
 * Unwraps the struct strscanner held by obj into var and raises
 * ArgumentError when the scanner has no string attached yet.
 *
 * Wrapped in do/while(0) so the unwrap and the check behave as one
 * statement wherever the macro is used.
 */
#define GET_SCANNER(obj, var) \
  do { \
    Data_Get_Struct((obj), struct strscanner, (var)); \
    if (NIL_P((var)->str)) rb_raise(rb_eArgError, "uninitialized StringScanner object"); \
  } while (0)
25
+
26
+ /*
27
+ * Document-class: StringScanner::UTF8
28
+ */
29
+
30
+ /*
31
+ * call-seq: getch
32
+ *
33
+ * Works like StringScanner#getch but is UTF8-aware
34
+ */
35
+ static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
36
+ unsigned char *str;
37
+ size_t len;
38
+ struct strscanner *scanner;
39
+ VALUE utf8Str;
40
+ int8_t lastCharLen=0;
41
+ GET_SCANNER(self, scanner);
42
+
43
+ str = (unsigned char *)RSTRING_PTR(scanner->str);
44
+ len = RSTRING_LEN(scanner->str);
45
+
46
+ if (len > 0 && len > scanner->curr) {
47
+ lastCharLen = utf8CharLen(str, len);
48
+ utf8Str = rb_str_new((char *)str+scanner->curr, lastCharLen);
49
+ scanner->curr += lastCharLen;
50
+ AS_UTF8(utf8Str);
51
+ return utf8Str;
52
+ } else {
53
+ return Qnil;
54
+ }
55
+ }
56
+
57
+ void init_StringScanner_UTF8() {
58
+ ID intern_string_scanner = rb_intern("StringScanner");
59
+ VALUE rb_cStringScanner, rb_cStringScanner_UTF8;
60
+
61
+ if (!rb_const_defined(rb_cObject, intern_string_scanner)) {
62
+ rb_require("strscan");
63
+ }
64
+ rb_cStringScanner = rb_const_get(rb_cObject, intern_string_scanner);
65
+ rb_cStringScanner_UTF8 = rb_define_class_under(rb_cStringScanner, "UTF8", rb_cStringScanner);
66
+
67
+ rb_define_method(rb_cStringScanner_UTF8, "getch", rb_cStringScanner_UTF8_getch, 0);
68
+ }
@@ -0,0 +1,6 @@
1
#ifndef UTF8_STRING_SCANNER_H
#define UTF8_STRING_SCANNER_H

/*
 * Registers the StringScanner::UTF8 class.
 * Declared with (void) so this is a real prototype: in C an empty parameter
 * list leaves the arguments unchecked.
 */
void init_StringScanner_UTF8(void);

#endif
@@ -0,0 +1,224 @@
1
+ #include "ext.h"
2
+ #include "utf8.h"
3
+
4
+ extern VALUE intern_as_utf8;
5
+
6
+ /*
7
+ * Document-class: String::UTF8
8
+ */
9
+
10
+ /*
11
+ * call-seq: length
12
+ *
13
+ * Returns the number of UTF8 characters in this string
14
+ */
15
+ static VALUE rb_cString_UTF8_length(VALUE self) {
16
+ unsigned char *str = (unsigned char *)RSTRING_PTR(self);
17
+ size_t len = RSTRING_LEN(self);
18
+ size_t utf8_len = 0;
19
+
20
+ utf8_len = utf8CharCount(str, len);
21
+
22
+ return INT2FIX(utf8_len);
23
+ }
24
+
25
+ /*
26
+ * call-seq: each_char {|utf8_char| ...}
27
+ *
28
+ * Iterates over the string, yielding one UTF8 character at a time
29
+ */
30
+ static VALUE rb_cString_UTF8_each_char(VALUE self) {
31
+ unsigned char *str = (unsigned char *)RSTRING_PTR(self);
32
+ size_t len = RSTRING_LEN(self), i=0;
33
+ int8_t lastCharLen=0;
34
+ VALUE utf8Str;
35
+
36
+ // this will return an Enumerator wrapping this string, yielding this method
37
+ // when Enumerator#each is called
38
+ RETURN_ENUMERATOR(self, 0, 0);
39
+
40
+ for(; i<len; i+=lastCharLen) {
41
+ lastCharLen = utf8CharLen(str, len);
42
+ utf8Str = rb_str_new((char *)str+i, lastCharLen);
43
+ AS_UTF8(utf8Str);
44
+ rb_yield(utf8Str);
45
+ }
46
+
47
+ return self;
48
+ }
49
+
50
+ /*
51
+ * Works like String#[] but taking into account UTF8 character boundaries
52
+ *
53
+ * This method doesn't currently (and may never) support Regexp parameters
54
+ * It also doesn't support a String parameter (yet)
55
+ */
56
+ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
57
+ unsigned char *str = (unsigned char *)RSTRING_PTR(self), *start = str;
58
+ size_t len = RSTRING_LEN(self);
59
+ VALUE utf8Str;
60
+
61
+ if (len == 0) return Qnil;
62
+
63
+ if (argc == 2) {
64
+ if (TYPE(argv[0]) == T_REGEXP) {
65
+ rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
66
+ }
67
+
68
+ // [offset, length] syntax
69
+ long wantPos = NUM2LONG(argv[0]), curPos = 0, wantLen = NUM2LONG(argv[1]);
70
+ int8_t curCharLen = 0;
71
+ unsigned char *offset = str;
72
+
73
+ if (wantLen < 0) {
74
+ return Qnil;
75
+ } else if (wantLen == 0) {
76
+ utf8Str = rb_str_new("", 0);
77
+ AS_UTF8(utf8Str);
78
+ return utf8Str;
79
+ }
80
+
81
+ if (wantPos < 0) {
82
+ long char_cnt = utf8CharCount(str, len);
83
+ if ((wantPos * -1) > char_cnt) {
84
+ return Qnil;
85
+ }
86
+ wantPos = char_cnt + wantPos;
87
+ }
88
+
89
+ // scan until starting position
90
+ curCharLen = utf8CharLen(str, len);
91
+ while (curPos < wantPos) {
92
+ // if we're about to step out of bounds, return nil
93
+ if ((size_t)(str-start) >= len) {
94
+ return Qnil;
95
+ }
96
+
97
+ str += curCharLen;
98
+ curCharLen = utf8CharLen(str, len);
99
+ curPos++;
100
+ }
101
+
102
+ // now scan until we have the number of chars asked for
103
+ curPos = 1;
104
+ offset = str;
105
+ str += curCharLen;
106
+ curCharLen = utf8CharLen(str, len);
107
+ while (curPos < wantLen) {
108
+ // if we're about to step out of bounds, stop
109
+ if ((size_t)(str-start) >= len) {
110
+ break;
111
+ }
112
+
113
+ str += curCharLen;
114
+ curCharLen = utf8CharLen(str, len);
115
+ curPos++;
116
+ }
117
+
118
+ utf8Str = rb_str_new((char *)offset, str-offset);
119
+ AS_UTF8(utf8Str);
120
+ return utf8Str;
121
+ }
122
+
123
+ if (argc != 1) {
124
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
125
+ }
126
+
127
+ // [Fixnum] syntax
128
+ if (TYPE(argv[0]) == T_FIXNUM) {
129
+ long wantPos = NUM2LONG(argv[0]), curPos = 0;
130
+ int8_t curCharLen = 0;
131
+
132
+ if (wantPos < 0) {
133
+ long char_cnt = utf8CharCount(str, len);
134
+ if ((wantPos * -1) > char_cnt) {
135
+ return Qnil;
136
+ }
137
+ wantPos = char_cnt + wantPos;
138
+ }
139
+
140
+ curCharLen = utf8CharLen(str, len);
141
+ while (curPos < wantPos) {
142
+ // if we're about to step out of bounds, return nil
143
+ if ((size_t)(str-start) >= len) {
144
+ return Qnil;
145
+ }
146
+
147
+ str += curCharLen;
148
+ curCharLen = utf8CharLen(str, len);
149
+ curPos++;
150
+ }
151
+
152
+ utf8Str = rb_str_new((char *)str, curCharLen);
153
+ AS_UTF8(utf8Str);
154
+ return utf8Str;
155
+ } else {
156
+ if (TYPE(argv[0]) == T_REGEXP) {
157
+ rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
158
+ }
159
+
160
+ // [Range] syntax
161
+ long wantPos, curPos = 0, wantLen, char_cnt = 0;
162
+ int8_t curCharLen = 0;
163
+ unsigned char *offset = str;
164
+ VALUE ret;
165
+
166
+ char_cnt = utf8CharCount(str, len);
167
+ ret = rb_range_beg_len(argv[0], &wantPos, &wantLen, char_cnt, 0);
168
+
169
+ if (ret == Qnil) {
170
+ return Qnil;
171
+ } else if (ret == Qfalse) {
172
+ // TODO: wtf do we do :P
173
+ }
174
+
175
+ if (wantLen == 0) {
176
+ utf8Str = rb_str_new("", 0);
177
+ AS_UTF8(utf8Str);
178
+ return utf8Str;
179
+ }
180
+
181
+ // scan until starting position
182
+ curCharLen = utf8CharLen(str, len);
183
+ while (curPos < wantPos) {
184
+ // if we're about to step out of bounds, return ""
185
+ if ((size_t)(str-start) >= len) {
186
+ utf8Str = rb_str_new("", 0);
187
+ AS_UTF8(utf8Str);
188
+ return utf8Str;
189
+ }
190
+
191
+ str += curCharLen;
192
+ curCharLen = utf8CharLen(str, len);
193
+ curPos++;
194
+ }
195
+
196
+ // now scan until we have the number of chars asked for
197
+ curPos = 1;
198
+ offset = str;
199
+ str += curCharLen;
200
+ curCharLen = utf8CharLen(str, len);
201
+ while (curPos < wantLen) {
202
+ // if we're about to step out of bounds, stop
203
+ if ((size_t)(str-start) >= len) {
204
+ break;
205
+ }
206
+
207
+ str += curCharLen;
208
+ curCharLen = utf8CharLen(str, len);
209
+ curPos++;
210
+ }
211
+
212
+ utf8Str = rb_str_new((char *)offset, str-offset);
213
+ AS_UTF8(utf8Str);
214
+ return utf8Str;
215
+ }
216
+ }
217
+
218
+ void init_String_UTF8() {
219
+ VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
220
+
221
+ rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
222
+ rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, 0);
223
+ rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
224
+ }
@@ -0,0 +1,6 @@
1
#ifndef UTF8_STRING_H
#define UTF8_STRING_H

/*
 * Registers the String::UTF8 class.
 * Declared with (void) so this is a real prototype: in C an empty parameter
 * list leaves the arguments unchecked.
 */
void init_String_UTF8(void);

#endif
data/ext/utf8/utf8.c ADDED
@@ -0,0 +1,80 @@
1
+ #include <stdio.h>
2
+ #include <stdint.h>
3
+
4
/*
 * Scans the current position of the buffer
 * returning the length of this UTF8 character
 *
 * in     - first byte of the character to measure
 * in_len - number of readable bytes starting at in
 *
 * Returns 1 to 4 for a complete sequence, 0 when the sequence is cut short
 * by in_len, and -1 when in_len is 0, the first byte is not a valid lead
 * byte or a following byte is not a continuation byte.
 *
 * Only the bit patterns are checked; overlong forms, surrogates and values
 * above U+10FFFF are not rejected.
 *
 * Not marked `inline` on purpose: under C99/C11 rules a plain inline
 * definition emits no external symbol, which leaves the other translation
 * units with an unresolved reference whenever the call is not inlined.
 */
int8_t utf8CharLen(unsigned char *in, size_t in_len) {
  unsigned char lead;
  int8_t expected, i;

  if (in_len == 0) {
    return -1;
  }

  lead = in[0];
  if (lead <= 0x7f) {
    /* single byte */
    return 1;
  } else if ((lead >> 5) == 0x06) {
    expected = 2;
  } else if ((lead >> 4) == 0x0e) {
    expected = 3;
  } else if ((lead >> 3) == 0x1e) {
    expected = 4;
  } else {
    /* stray continuation byte or invalid lead byte */
    return -1;
  }

  for (i = 1; i < expected; i++) {
    if ((size_t)i >= in_len) {
      /* ran out of buffer before the sequence was complete */
      return 0;
    }
    if ((in[i] >> 6) != 0x02) {
      return -1;
    }
  }

  return expected;
}

/*
 * Scans the current position of the buffer
 * returning the total number of UTF8 characters found
 *
 * in     - start of the buffer
 * in_len - number of bytes in the buffer
 *
 * A byte that does not start a well-formed sequence is counted as one
 * character of its own. This keeps the scan moving forward on malformed
 * input instead of spinning or stepping backwards.
 */
size_t utf8CharCount(unsigned char *in, size_t in_len) {
  size_t total = 0;

  while (in_len > 0) {
    int8_t len = utf8CharLen(in, in_len);
    /* a positive length never exceeds in_len, so this cannot overrun */
    size_t step = (len > 0) ? (size_t)len : 1;

    in += step;
    in_len -= step;
    total++;
  }

  return total;
}
data/ext/utf8/utf8.h ADDED
@@ -0,0 +1,7 @@
1
#ifndef UTF8_UTF8_H
#define UTF8_UTF8_H

/* size_t and int8_t are used below; include them so this header stands alone */
#include <stddef.h>
#include <stdint.h>

/*
 * Length in bytes of the UTF8 character that starts at in, looking at no
 * more than in_len bytes.
 *
 * Declared as an ordinary external function: an `inline` prototype with no
 * definition in the including file cannot be inlined anyway and only
 * produces "declared but never defined" diagnostics.
 */
int8_t utf8CharLen(unsigned char *in, size_t in_len);

/* Number of UTF8 characters in the in_len bytes starting at in */
size_t utf8CharCount(unsigned char *in, size_t in_len);

#endif