utf8 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/utf8/ext.c ADDED
@@ -0,0 +1,21 @@
1
+ #include "ext.h"
2
+
3
+ #include "string_utf8.h"
4
+ #include "string_scanner_utf8.h"
5
+
6
+ VALUE intern_as_utf8;
7
+
8
+ #ifdef HAVE_RUBY_ENCODING_H
9
+ rb_encoding *utf8Encoding;
10
+ #endif
11
+
12
+ void Init_utf8() {
13
+ init_String_UTF8();
14
+ init_StringScanner_UTF8();
15
+
16
+ intern_as_utf8 = rb_intern("as_utf8");
17
+
18
+ #ifdef HAVE_RUBY_ENCODING_H
19
+ utf8Encoding = rb_utf8_encoding();
20
+ #endif
21
+ }
data/ext/utf8/ext.h ADDED
@@ -0,0 +1,17 @@
1
+ #ifndef UTF8_EXT_H
2
+ #define UTF8_EXT_H
3
+
4
+ #include <ruby.h>
5
+
6
+ #ifdef HAVE_RUBY_ENCODING_H
7
+ #include <ruby/encoding.h>
8
+ extern rb_encoding *utf8Encoding;
9
+ #define AS_UTF8(_str) \
10
+ _str = rb_funcall(_str, intern_as_utf8, 0); \
11
+ rb_enc_associate(_str, utf8Encoding);
12
+
13
+ #else
14
+ #define AS_UTF8(_str) _str = rb_funcall(_str, intern_as_utf8, 0)
15
+ #endif
16
+
17
+ #endif
@@ -0,0 +1,7 @@
1
+ require 'mkmf'
2
+ require 'rbconfig'
3
+
4
# Flags used for every build: common warnings plus loop unrolling.
$CFLAGS << ' -Wall -funroll-loops'

# When the DEBUG environment variable is set, additionally ask for extra
# warnings, no optimization and full gdb debug information.
if ENV['DEBUG']
  $CFLAGS << ' -Wextra -O0 -ggdb3'
end

# Generate the Makefile for the "utf8" extension.
create_makefile("utf8")
@@ -0,0 +1,68 @@
1
+ #include "ext.h"
2
+ #include "ruby/regex.h"
3
+ #include "utf8.h"
4
+
5
+ extern ID intern_as_utf8;
6
+
7
/*
 * State of a StringScanner object as read by this extension through
 * Data_Get_Struct().
 *
 * NOTE(review): the objects unwrapped with this struct are created by the
 * `strscan` library, so the field order and types here presumably have to
 * match strscan's own private struct for the Ruby version being built
 * against - confirm before changing anything.
 */
struct strscanner {
  /* multi-purpose flags */
  unsigned long flags;

  /* the string to scan */
  VALUE str;

  /* scan pointers (used here as byte offsets into str) */
  long prev; /* legal only when MATCHED_P(s) */
  long curr; /* always legal */

  /* the regexp register; legal only when MATCHED_P(s) */
  struct re_registers regs;
};
21
+
22
/*
 * GET_SCANNER(obj, var)
 *
 * Unwraps the struct strscanner stored in `obj` into the pointer `var` and
 * raises ArgumentError when the scanner has no string attached yet.
 *
 * Wrapped in do { } while (0) so the two steps always act as one statement;
 * a bare trailing `if` would capture a following `else` at the call site.
 */
#define GET_SCANNER(obj, var)                                          \
  do {                                                                 \
    Data_Get_Struct(obj, struct strscanner, var);                      \
    if (NIL_P((var)->str)) {                                           \
      rb_raise(rb_eArgError, "uninitialized StringScanner object");    \
    }                                                                  \
  } while (0)
25
+
26
+ /*
27
+ * Document-class: StringScanner::UTF8
28
+ */
29
+
30
+ /*
31
+ * call-seq: getch
32
+ *
33
+ * Works like StringScanner#getch but is UTF8-aware
34
+ */
35
+ static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
36
+ unsigned char *str;
37
+ size_t len;
38
+ struct strscanner *scanner;
39
+ VALUE utf8Str;
40
+ int8_t lastCharLen=0;
41
+ GET_SCANNER(self, scanner);
42
+
43
+ str = (unsigned char *)RSTRING_PTR(scanner->str);
44
+ len = RSTRING_LEN(scanner->str);
45
+
46
+ if (len > 0 && len > scanner->curr) {
47
+ lastCharLen = utf8CharLen(str, len);
48
+ utf8Str = rb_str_new((char *)str+scanner->curr, lastCharLen);
49
+ scanner->curr += lastCharLen;
50
+ AS_UTF8(utf8Str);
51
+ return utf8Str;
52
+ } else {
53
+ return Qnil;
54
+ }
55
+ }
56
+
57
+ void init_StringScanner_UTF8() {
58
+ ID intern_string_scanner = rb_intern("StringScanner");
59
+ VALUE rb_cStringScanner, rb_cStringScanner_UTF8;
60
+
61
+ if (!rb_const_defined(rb_cObject, intern_string_scanner)) {
62
+ rb_require("strscan");
63
+ }
64
+ rb_cStringScanner = rb_const_get(rb_cObject, intern_string_scanner);
65
+ rb_cStringScanner_UTF8 = rb_define_class_under(rb_cStringScanner, "UTF8", rb_cStringScanner);
66
+
67
+ rb_define_method(rb_cStringScanner_UTF8, "getch", rb_cStringScanner_UTF8_getch, 0);
68
+ }
@@ -0,0 +1,6 @@
1
#ifndef UTF8_STRING_SCANNER_H
#define UTF8_STRING_SCANNER_H

/*
 * Defines the StringScanner::UTF8 class and its methods.
 * Declared with (void) so it is a real prototype: in C an empty parameter
 * list leaves the arguments unchecked.
 */
void init_StringScanner_UTF8(void);

#endif
@@ -0,0 +1,224 @@
1
+ #include "ext.h"
2
+ #include "utf8.h"
3
+
4
+ extern VALUE intern_as_utf8;
5
+
6
+ /*
7
+ * Document-class: String::UTF8
8
+ */
9
+
10
+ /*
11
+ * call-seq: length
12
+ *
13
+ * Returns the number of UTF8 characters in this string
14
+ */
15
+ static VALUE rb_cString_UTF8_length(VALUE self) {
16
+ unsigned char *str = (unsigned char *)RSTRING_PTR(self);
17
+ size_t len = RSTRING_LEN(self);
18
+ size_t utf8_len = 0;
19
+
20
+ utf8_len = utf8CharCount(str, len);
21
+
22
+ return INT2FIX(utf8_len);
23
+ }
24
+
25
+ /*
26
+ * call-seq: each_char {|utf8_char| ...}
27
+ *
28
+ * Iterates over the string, yielding one UTF8 character at a time
29
+ */
30
+ static VALUE rb_cString_UTF8_each_char(VALUE self) {
31
+ unsigned char *str = (unsigned char *)RSTRING_PTR(self);
32
+ size_t len = RSTRING_LEN(self), i=0;
33
+ int8_t lastCharLen=0;
34
+ VALUE utf8Str;
35
+
36
+ // this will return an Enumerator wrapping this string, yielding this method
37
+ // when Enumerator#each is called
38
+ RETURN_ENUMERATOR(self, 0, 0);
39
+
40
+ for(; i<len; i+=lastCharLen) {
41
+ lastCharLen = utf8CharLen(str, len);
42
+ utf8Str = rb_str_new((char *)str+i, lastCharLen);
43
+ AS_UTF8(utf8Str);
44
+ rb_yield(utf8Str);
45
+ }
46
+
47
+ return self;
48
+ }
49
+
50
+ /*
51
+ * Works like String#[] but taking into account UTF8 character boundaries
52
+ *
53
+ * This method doesn't currently (and may never) support Regexp parameters
54
+ * It also doesn't support a String parameter (yet)
55
+ */
56
+ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
57
+ unsigned char *str = (unsigned char *)RSTRING_PTR(self), *start = str;
58
+ size_t len = RSTRING_LEN(self);
59
+ VALUE utf8Str;
60
+
61
+ if (len == 0) return Qnil;
62
+
63
+ if (argc == 2) {
64
+ if (TYPE(argv[0]) == T_REGEXP) {
65
+ rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
66
+ }
67
+
68
+ // [offset, length] syntax
69
+ long wantPos = NUM2LONG(argv[0]), curPos = 0, wantLen = NUM2LONG(argv[1]);
70
+ int8_t curCharLen = 0;
71
+ unsigned char *offset = str;
72
+
73
+ if (wantLen < 0) {
74
+ return Qnil;
75
+ } else if (wantLen == 0) {
76
+ utf8Str = rb_str_new("", 0);
77
+ AS_UTF8(utf8Str);
78
+ return utf8Str;
79
+ }
80
+
81
+ if (wantPos < 0) {
82
+ long char_cnt = utf8CharCount(str, len);
83
+ if ((wantPos * -1) > char_cnt) {
84
+ return Qnil;
85
+ }
86
+ wantPos = char_cnt + wantPos;
87
+ }
88
+
89
+ // scan until starting position
90
+ curCharLen = utf8CharLen(str, len);
91
+ while (curPos < wantPos) {
92
+ // if we're about to step out of bounds, return nil
93
+ if ((size_t)(str-start) >= len) {
94
+ return Qnil;
95
+ }
96
+
97
+ str += curCharLen;
98
+ curCharLen = utf8CharLen(str, len);
99
+ curPos++;
100
+ }
101
+
102
+ // now scan until we have the number of chars asked for
103
+ curPos = 1;
104
+ offset = str;
105
+ str += curCharLen;
106
+ curCharLen = utf8CharLen(str, len);
107
+ while (curPos < wantLen) {
108
+ // if we're about to step out of bounds, stop
109
+ if ((size_t)(str-start) >= len) {
110
+ break;
111
+ }
112
+
113
+ str += curCharLen;
114
+ curCharLen = utf8CharLen(str, len);
115
+ curPos++;
116
+ }
117
+
118
+ utf8Str = rb_str_new((char *)offset, str-offset);
119
+ AS_UTF8(utf8Str);
120
+ return utf8Str;
121
+ }
122
+
123
+ if (argc != 1) {
124
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
125
+ }
126
+
127
+ // [Fixnum] syntax
128
+ if (TYPE(argv[0]) == T_FIXNUM) {
129
+ long wantPos = NUM2LONG(argv[0]), curPos = 0;
130
+ int8_t curCharLen = 0;
131
+
132
+ if (wantPos < 0) {
133
+ long char_cnt = utf8CharCount(str, len);
134
+ if ((wantPos * -1) > char_cnt) {
135
+ return Qnil;
136
+ }
137
+ wantPos = char_cnt + wantPos;
138
+ }
139
+
140
+ curCharLen = utf8CharLen(str, len);
141
+ while (curPos < wantPos) {
142
+ // if we're about to step out of bounds, return nil
143
+ if ((size_t)(str-start) >= len) {
144
+ return Qnil;
145
+ }
146
+
147
+ str += curCharLen;
148
+ curCharLen = utf8CharLen(str, len);
149
+ curPos++;
150
+ }
151
+
152
+ utf8Str = rb_str_new((char *)str, curCharLen);
153
+ AS_UTF8(utf8Str);
154
+ return utf8Str;
155
+ } else {
156
+ if (TYPE(argv[0]) == T_REGEXP) {
157
+ rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
158
+ }
159
+
160
+ // [Range] syntax
161
+ long wantPos, curPos = 0, wantLen, char_cnt = 0;
162
+ int8_t curCharLen = 0;
163
+ unsigned char *offset = str;
164
+ VALUE ret;
165
+
166
+ char_cnt = utf8CharCount(str, len);
167
+ ret = rb_range_beg_len(argv[0], &wantPos, &wantLen, char_cnt, 0);
168
+
169
+ if (ret == Qnil) {
170
+ return Qnil;
171
+ } else if (ret == Qfalse) {
172
+ // TODO: wtf do we do :P
173
+ }
174
+
175
+ if (wantLen == 0) {
176
+ utf8Str = rb_str_new("", 0);
177
+ AS_UTF8(utf8Str);
178
+ return utf8Str;
179
+ }
180
+
181
+ // scan until starting position
182
+ curCharLen = utf8CharLen(str, len);
183
+ while (curPos < wantPos) {
184
+ // if we're about to step out of bounds, return ""
185
+ if ((size_t)(str-start) >= len) {
186
+ utf8Str = rb_str_new("", 0);
187
+ AS_UTF8(utf8Str);
188
+ return utf8Str;
189
+ }
190
+
191
+ str += curCharLen;
192
+ curCharLen = utf8CharLen(str, len);
193
+ curPos++;
194
+ }
195
+
196
+ // now scan until we have the number of chars asked for
197
+ curPos = 1;
198
+ offset = str;
199
+ str += curCharLen;
200
+ curCharLen = utf8CharLen(str, len);
201
+ while (curPos < wantLen) {
202
+ // if we're about to step out of bounds, stop
203
+ if ((size_t)(str-start) >= len) {
204
+ break;
205
+ }
206
+
207
+ str += curCharLen;
208
+ curCharLen = utf8CharLen(str, len);
209
+ curPos++;
210
+ }
211
+
212
+ utf8Str = rb_str_new((char *)offset, str-offset);
213
+ AS_UTF8(utf8Str);
214
+ return utf8Str;
215
+ }
216
+ }
217
+
218
+ void init_String_UTF8() {
219
+ VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
220
+
221
+ rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
222
+ rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, 0);
223
+ rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
224
+ }
@@ -0,0 +1,6 @@
1
#ifndef UTF8_STRING_H
#define UTF8_STRING_H

/*
 * Defines the String::UTF8 class and its methods.
 * Declared with (void) so it is a real prototype: in C an empty parameter
 * list leaves the arguments unchecked.
 */
void init_String_UTF8(void);

#endif
data/ext/utf8/utf8.c ADDED
@@ -0,0 +1,80 @@
1
+ #include <stdio.h>
2
+ #include <stdint.h>
3
+
4
/*
 * Scans the current position of the buffer
 * returning the length of this UTF8 character
 *
 * in     - first byte of the character to measure
 * in_len - number of bytes available starting at `in`
 *
 * Returns
 *   1..4  the byte length of a well-formed sequence
 *   0     the lead byte is valid but the buffer ends before the sequence
 *         is complete
 *  -1     in_len is 0, the lead byte is not a valid UTF8 lead byte, or a
 *         continuation byte does not have the 10xxxxxx form
 *
 * Callers must check for values <= 0 before using the result to advance.
 *
 * Deliberately not declared `inline`: under C99/C11 rules a plain `inline`
 * definition does not provide an external definition, which leaves callers
 * in other files (and non-inlined calls in this one) without a symbol to
 * link against.
 */
int8_t utf8CharLen(unsigned char *in, size_t in_len) {
  int8_t expected, i;

  if (in_len == 0) {
    // error case
    return -1;
  }

  /* the lead byte alone decides how long the sequence has to be */
  if (in[0] <= 0x7f) {
    /* single byte */
    return 1;
  } else if ((in[0] >> 5) == 0x6) {
    /* two byte */
    expected = 2;
  } else if ((in[0] >> 4) == 0x0e) {
    /* three byte */
    expected = 3;
  } else if ((in[0] >> 3) == 0x1e) {
    /* four byte */
    expected = 4;
  } else {
    // error case
    return -1;
  }

  /* every following byte must exist and look like 10xxxxxx */
  for (i = 1; i < expected; i++) {
    if ((size_t)i >= in_len) {
      return 0;
    }
    if ((in[i] >> 6) != 0x2) {
      // error case
      return -1;
    }
  }

  return expected;
}
60
+
61
/*
 * Scans the current position of the buffer
 * returning the total number of UTF8 characters found
 *
 * in     - start of the buffer
 * in_len - number of bytes in the buffer
 *
 * A byte that does not start a complete, well-formed sequence (utf8CharLen
 * reports 0 or -1 for it) is counted as one character of its own and
 * skipped. Using such a result as the step size would otherwise stall the
 * loop (0) or move the cursor backwards out of the buffer (-1).
 */
size_t utf8CharCount(unsigned char *in, size_t in_len) {
  size_t total = 0;
  unsigned char *cur = in;
  unsigned char *end = in + in_len;

  while (cur < end) {
    int8_t len = utf8CharLen(cur, (size_t)(end - cur));

    /* always make forward progress, even on bad input */
    cur += (len > 0) ? len : 1;
    total++;
  }

  return total;
}
data/ext/utf8/utf8.h ADDED
@@ -0,0 +1,7 @@
1
#ifndef UTF8_UTF8_H
#define UTF8_UTF8_H

/* size_t and int8_t are used below; pull them in so the header stands alone */
#include <stddef.h>
#include <stdint.h>

/*
 * Returns the byte length of the UTF8 character starting at `in`, looking
 * at no more than `in_len` bytes. Returns 0 when the buffer ends in the
 * middle of the sequence and -1 when the bytes are not valid UTF8 (or
 * in_len is 0).
 *
 * Declared as an ordinary external function: an `inline` declaration with
 * no body in the including file cannot be inlined there anyway.
 */
int8_t utf8CharLen(unsigned char *in, size_t in_len);

/* Returns the total number of UTF8 characters found in the buffer */
size_t utf8CharCount(unsigned char *in, size_t in_len);

#endif