utf8 0.1.7 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,52 @@
1
+ # A lightweight UTF8-aware String class meant for use with Ruby 1.8.x
2
+
3
+ The scanning code is implemented in C (would love a Java version as well, if any of you want to help with that).
4
+
5
+ At the moment, this gem is tested on Ruby 1.8.7, 1.9.2, 1.9.3 and Rubinius; it may work on other versions or implementations, but your mileage may vary.
6
+
7
+ ## String::UTF8 Example
8
+
9
+ The String::UTF8#[] API isn't fully supported yet, and may never support regular expressions. I'll update this readme as I make progress.
10
+
11
+ ``` ruby
12
+ # String (on 1.8)
13
+ str = "你好世界"
14
+ str.length # => 12
15
+ str[0] # => 228
16
+ str.chars.to_a.inspect # => ["\344", "\275", "\240", "\345", "\245", "\275", "\344", "\270", "\226", "\347", "\225", "\214"]
17
+
18
+
19
+ # String::UTF8 (on 1.8)
20
+ utf8_str = "你好世界".as_utf8
21
+ utf8_str.length # => 4
22
+ utf8_str.chars.to_a.inspect # => ["你", "好", "世", "界"]
23
+
24
+ utf8_str[0] # => "你"
25
+ utf8_str[0, 3] # => "你好世"
26
+ utf8_str[0..3] # => "你好世界"
27
+ ```
28
+
29
+ ## StringScanner::UTF8 Example
30
+
31
+ The full StringScanner API hasn't been made UTF8-aware yet. I may try to finish it later, but the getch method is all I need at the moment.
32
+
33
+ ``` ruby
34
+ # StringScanner (on 1.8)
35
+ scanner = StringScanner.new("你好世界")
36
+ scanner.getch # => "\344"
37
+
38
+ # StringScanner::UTF8
39
+ require 'utf8/string_scanner'
40
+
41
+ # NOTE: we could have used scanner.as_utf8 instead
42
+ utf8_scanner = StringScanner::UTF8.new("你好世界")
43
+ utf8_scanner.getch # => "你"
44
+ utf8_scanner.getch # => "好"
45
+ utf8_scanner.getch # => "世"
46
+ utf8_scanner.reset
47
+ utf8_scanner.getch # => "你"
48
+ ```
49
+
50
+ ## Disclaimer
51
+
52
+ I'm only planning on making the methods I need UTF8-aware; if you need others, I welcome pull requests :)
@@ -343,15 +343,23 @@ static VALUE rb_cString_UTF8_clean(VALUE self) {
343
343
  int8_t curCharLen;
344
344
  VALUE rb_out;
345
345
 
346
+ outBuf = NULL;
347
+ outBufCur = NULL;
346
348
  inBuf = (unsigned char *)RSTRING_PTR(self);
347
349
  inBufCur = inBuf;
348
350
  len = RSTRING_LEN(self);
349
- outBuf = malloc(len);
350
- outBufCur = outBuf;
351
351
 
352
352
  for(i=0; i<len; i+=curCharLen) {
353
353
  curCharLen = utf8CharLen(inBufCur, len);
354
354
  if (curCharLen < 0) {
355
+ if (!outBuf) {
356
+ outBuf = xmalloc(len);
357
+ outBufCur = outBuf;
358
+ if (inBufCur-inBuf > 0) {
359
+ memcpy(outBufCur, inBuf, inBufCur-inBuf);
360
+ }
361
+ }
362
+
355
363
  if (inBufCur-inBuf > 0) {
356
364
  memcpy(outBufCur, inBuf, inBufCur-inBuf);
357
365
  outBufCur += inBufCur-inBuf;
@@ -364,14 +372,18 @@ static VALUE rb_cString_UTF8_clean(VALUE self) {
364
372
  inBufCur += curCharLen;
365
373
  }
366
374
 
367
- if (inBufCur-inBuf > 0) {
368
- memcpy(outBufCur, inBuf, inBufCur-inBuf);
369
- }
375
+ if (outBuf) {
376
+ if (inBufCur-inBuf > 0) {
377
+ memcpy(outBufCur, inBuf, inBufCur-inBuf);
378
+ }
370
379
 
371
- rb_out = rb_str_new((const char*)outBuf, len);
372
- AS_UTF8(rb_out);
380
+ rb_out = rb_str_new((const char*)outBuf, len);
381
+ xfree(outBuf);
373
382
 
374
- free(outBuf);
383
+ AS_UTF8(rb_out);
384
+ } else {
385
+ rb_out = self;
386
+ }
375
387
 
376
388
  return rb_out;
377
389
  }
@@ -1,5 +1,5 @@
1
1
  class String
2
2
  class UTF8 < ::String
3
- VERSION = "0.1.7"
3
+ VERSION = "0.1.8"
4
4
  end
5
5
  end
@@ -64,6 +64,11 @@ describe String::UTF8 do
64
64
  assert_equal "asdf24\342\206\222asdf24", "asdf24\342\206\222asdf24".as_utf8.clean
65
65
  end
66
66
 
67
+ test "clean should return the passed value if it was valid UTF-8" do
68
+ orig = "asdfasdf".as_utf8
69
+ assert_equal orig.object_id, orig.clean.object_id
70
+ end
71
+
67
72
  context "#length and #size" do
68
73
  test "should be utf8-aware" do
69
74
  assert_equal @utf8_len, @utf8.length
@@ -7,9 +7,6 @@ Gem::Specification.new do |s|
7
7
  s.date = Time.now.utc.strftime("%Y-%m-%d")
8
8
  s.email = %q{seniorlopez@gmail.com}
9
9
  s.extensions = ["ext/utf8/extconf.rb"]
10
- s.extra_rdoc_files = [
11
- "README.rdoc"
12
- ]
13
10
  s.files = `git ls-files`.split("\n")
14
11
  s.homepage = %q{http://github.com/brianmario/utf8}
15
12
  s.rdoc_options = ["--charset=UTF-8"]
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 11
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 7
10
- version: 0.1.7
9
+ - 8
10
+ version: 0.1.8
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brian Lopez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-11-22 00:00:00 Z
18
+ date: 2011-12-17 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: rake-compiler
@@ -69,15 +69,15 @@ executables: []
69
69
 
70
70
  extensions:
71
71
  - ext/utf8/extconf.rb
72
- extra_rdoc_files:
73
- - README.rdoc
72
+ extra_rdoc_files: []
73
+
74
74
  files:
75
75
  - .gitignore
76
76
  - .rspec
77
77
  - .travis.yml
78
78
  - Gemfile
79
79
  - MIT-LICENSE
80
- - README.rdoc
80
+ - README.md
81
81
  - Rakefile
82
82
  - benchmark/active_support.rb
83
83
  - benchmark/test.txt
@@ -1,48 +0,0 @@
1
- = A lightweight UTF8-aware String class meant for use with Ruby 1.8.x
2
-
3
- The scanning code is implemented in C (would love a Java version as well, if any of you want to help with that).
4
-
5
- At the moment, this gem is tested on 1.8.7, 1.9.2 and Rubinius 1.2.1dev - it may work on others but ymmv.
6
-
7
- == String::UTF8 Example
8
-
9
- The String::UTF8#[] API isn't fully supported yet, and may never support regular expressions. I'll update this readme as I make progress.
10
-
11
- # String (on 1.8)
12
- str = "你好世界"
13
- str.length # 12
14
- str[0] # 228
15
- str.chars.to_a.inspect # ["\344", "\275", "\240", "\345", "\245", "\275", "\344", "\270", "\226", "\347", "\225", "\214"]
16
-
17
-
18
- # String::UTF8 (on 1.8)
19
- utf8_str = "你好世界".as_utf8
20
- utf8_str.length # 4
21
- utf8.chars.to_a.inspect # => ["你", "好", "世", "界"]
22
-
23
- utf8[0] # => "你"
24
- utf8_str[0, 3] # => "你好世"
25
- utf8_str[0..3] # => "你好世界"
26
-
27
- == StringScanner::UTF8 Example
28
-
29
- The full StringScanner API hasn't been made UTF8-aware yet. I may try and finish it later, but the getch method is all I need at the moment.
30
-
31
- # StringScanner (on 1.8)
32
- scanner = StringScanner.new("你好世界")
33
- scanner.getch # => "\344"
34
-
35
- # StringScanner::UTF8
36
- require 'utf8/string_scanner'
37
-
38
- # NOTE: we could have used scanner.as_utf8 instead
39
- utf8_scanner = StringScanner::UTF8.new("你好世界")
40
- utf8_scanner.getch # => "你"
41
- utf8_scanner.getch # => "好"
42
- utf8_scanner.getch # => "世"
43
- utf8_scanner.reset
44
- utf8_scanner.getch # => "你"
45
-
46
- == Disclaimer
47
-
48
- I'm only planning on making the methods I need UTF8-aware, if you need others I welcome pull requests :)