utf8 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/utf8/string_utf8.c +40 -1
- data/lib/utf8/version.rb +1 -1
- data/spec/string_spec.rb +8 -0
- metadata +5 -5
data/ext/utf8/string_utf8.c
CHANGED
@@ -3,6 +3,8 @@
|
|
3
3
|
|
4
4
|
extern VALUE intern_as_utf8;
|
5
5
|
|
6
|
+
#define REPLACEMENT_CHAR '?'
|
7
|
+
|
6
8
|
/*
|
7
9
|
* Document-class: String::UTF-8
|
8
10
|
*/
|
@@ -112,7 +114,7 @@ static VALUE rb_cString_UTF8_valid(int argc, VALUE *argv, VALUE self) {
|
|
112
114
|
}
|
113
115
|
|
114
116
|
for(; i<len; i+=lastCharLen) {
|
115
|
-
lastCharLen = utf8CharLen(str, len);
|
117
|
+
lastCharLen = utf8CharLen(str+i, len);
|
116
118
|
if (lastCharLen < 0) {
|
117
119
|
return Qfalse;
|
118
120
|
}
|
@@ -327,6 +329,42 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
327
329
|
}
|
328
330
|
}
|
329
331
|
|
332
|
+
/*
|
333
|
+
* call-seq: clean
|
334
|
+
*
|
335
|
+
* Iterates over the string, replacing invalid UTF-8 characters with '?'
|
336
|
+
*
|
337
|
+
* Returns: a new String
|
338
|
+
*/
|
339
|
+
static VALUE rb_cString_UTF8_clean(VALUE self) {
|
340
|
+
unsigned char *str;
|
341
|
+
unsigned char *out;
|
342
|
+
unsigned char replace;
|
343
|
+
size_t len;
|
344
|
+
int8_t curCharLen;
|
345
|
+
size_t i;
|
346
|
+
VALUE rb_out;
|
347
|
+
|
348
|
+
str = (unsigned char *)RSTRING_PTR(self);
|
349
|
+
len = RSTRING_LEN(self);
|
350
|
+
replace = REPLACEMENT_CHAR;
|
351
|
+
out = xmalloc(len);
|
352
|
+
|
353
|
+
for(i=0; i<len; i++) {
|
354
|
+
curCharLen = utf8CharLen(str+i, len);
|
355
|
+
if (curCharLen < 0) {
|
356
|
+
*(out+i) = replace;
|
357
|
+
} else {
|
358
|
+
*(out+i) = *(str+i);
|
359
|
+
}
|
360
|
+
}
|
361
|
+
|
362
|
+
rb_out = rb_str_new((const char*)out, len);
|
363
|
+
AS_UTF8(rb_out);
|
364
|
+
|
365
|
+
return rb_out;
|
366
|
+
}
|
367
|
+
|
330
368
|
void init_String_UTF8() {
|
331
369
|
VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
|
332
370
|
|
@@ -335,4 +373,5 @@ void init_String_UTF8() {
|
|
335
373
|
rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
|
336
374
|
rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
|
337
375
|
rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
|
376
|
+
rb_define_method(rb_cString_UTF8, "clean", rb_cString_UTF8_clean, 0);
|
338
377
|
}
|
data/lib/utf8/version.rb
CHANGED
data/spec/string_spec.rb
CHANGED
@@ -50,6 +50,12 @@ describe String::UTF8 do
|
|
50
50
|
@utf8.chars.to_a[0].class.should eql(String::UTF8)
|
51
51
|
end
|
52
52
|
|
53
|
+
it "clean should replace invalid utf8 chars with '?'" do
  # "\355" is the single byte 0xED; followed by a plain ASCII "g" it cannot
  # start a valid UTF-8 sequence, so #clean is expected to turn it into "?".
  dirty = "provided by Cristian Rodr\355guez."
  expected = "provided by Cristian Rodr?guez."
  dirty.as_utf8.clean.should eql(expected)
end
|
58
|
+
|
53
59
|
context "#length and #size" do
|
54
60
|
it "should be utf8-aware" do
|
55
61
|
@utf8.length.should eql(@utf8_len)
|
@@ -185,6 +191,8 @@ describe String::UTF8 do
|
|
185
191
|
|
186
192
|
utf8.valid?.should be_false
|
187
193
|
@utf8.valid?.should be_true
|
194
|
+
|
195
|
+
"provided by Cristian Rodr\355guez.".as_utf8.should_not be_valid
|
188
196
|
end
|
189
197
|
|
190
198
|
it "should test validity using a maximum codepoint" do
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 4
|
10
|
+
version: 0.1.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Brian Lopez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-07-21 00:00:00 -07:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -129,7 +129,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
129
129
|
requirements: []
|
130
130
|
|
131
131
|
rubyforge_project:
|
132
|
-
rubygems_version: 1.
|
132
|
+
rubygems_version: 1.3.10
|
133
133
|
signing_key:
|
134
134
|
specification_version: 3
|
135
135
|
summary: A lightweight UTF8-aware String class meant for use with Ruby 1.8
|