utf8 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,6 +3,8 @@
3
3
 
4
4
  extern VALUE intern_as_utf8;
5
5
 
6
+ #define REPLACEMENT_CHAR '?'
7
+
6
8
  /*
7
9
  * Document-class: String::UTF-8
8
10
  */
@@ -112,7 +114,7 @@ static VALUE rb_cString_UTF8_valid(int argc, VALUE *argv, VALUE self) {
112
114
  }
113
115
 
114
116
  for(; i<len; i+=lastCharLen) {
115
- lastCharLen = utf8CharLen(str, len);
117
+ lastCharLen = utf8CharLen(str+i, len);
116
118
  if (lastCharLen < 0) {
117
119
  return Qfalse;
118
120
  }
@@ -327,6 +329,42 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
327
329
  }
328
330
  }
329
331
 
332
+ /*
333
+ * call-seq: clean
334
+ *
335
+ * Iterates over the string, replacing invalid UTF-8 characters with '?'
336
+ *
337
+ * Returns: a new String
338
+ */
339
+ static VALUE rb_cString_UTF8_clean(VALUE self) {
340
+ unsigned char *str;
341
+ unsigned char *out;
342
+ unsigned char replace;
343
+ size_t len;
344
+ int8_t curCharLen;
345
+ size_t i;
346
+ VALUE rb_out;
347
+
348
+ str = (unsigned char *)RSTRING_PTR(self);
349
+ len = RSTRING_LEN(self);
350
+ replace = REPLACEMENT_CHAR;
351
+ out = xmalloc(len);
352
+
353
+ for(i=0; i<len; i++) {
354
+ curCharLen = utf8CharLen(str+i, len);
355
+ if (curCharLen < 0) {
356
+ *(out+i) = replace;
357
+ } else {
358
+ *(out+i) = *(str+i);
359
+ }
360
+ }
361
+
362
+ rb_out = rb_str_new((const char*)out, len);
363
+ AS_UTF8(rb_out);
364
+
365
+ return rb_out;
366
+ }
367
+
330
368
  void init_String_UTF8() {
331
369
  VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
332
370
 
@@ -335,4 +373,5 @@ void init_String_UTF8() {
335
373
  rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
336
374
  rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
337
375
  rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
376
+ rb_define_method(rb_cString_UTF8, "clean", rb_cString_UTF8_clean, 0);
338
377
  }
data/lib/utf8/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  class String
2
2
  class UTF8 < ::String
3
- VERSION = "0.1.3"
3
+ VERSION = "0.1.4"
4
4
  end
5
5
  end
data/spec/string_spec.rb CHANGED
@@ -50,6 +50,12 @@ describe String::UTF8 do
50
50
  @utf8.chars.to_a[0].class.should eql(String::UTF8)
51
51
  end
52
52
 
53
+ it "clean should replace invalid utf8 chars with '?'" do
54
+ orig = "provided by Cristian Rodr\355guez."
55
+ clean = "provided by Cristian Rodr?guez."
56
+ orig.as_utf8.clean.should eql(clean)
57
+ end
58
+
53
59
  context "#length and #size" do
54
60
  it "should be utf8-aware" do
55
61
  @utf8.length.should eql(@utf8_len)
@@ -185,6 +191,8 @@ describe String::UTF8 do
185
191
 
186
192
  utf8.valid?.should be_false
187
193
  @utf8.valid?.should be_true
194
+
195
+ "provided by Cristian Rodr\355guez.".as_utf8.should_not be_valid
188
196
  end
189
197
 
190
198
  it "should test validity using a maximum codepoint" do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 19
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 3
10
- version: 0.1.3
9
+ - 4
10
+ version: 0.1.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brian Lopez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-06-02 00:00:00 -07:00
18
+ date: 2011-07-21 00:00:00 -07:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -129,7 +129,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
129
129
  requirements: []
130
130
 
131
131
  rubyforge_project:
132
- rubygems_version: 1.6.2
132
+ rubygems_version: 1.3.10
133
133
  signing_key:
134
134
  specification_version: 3
135
135
  summary: A lightweight UTF8-aware String class meant for use with Ruby 1.8