utf8 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
1
+ # A lightweight UTF8-aware String class meant for use with Ruby 1.8.x
2
+
3
+ The scanning code is implemented in C (would love a Java version as well, if any of you want to help with that).
4
+
5
+ At the moment, this gem is tested on 1.8.7, 1.9.2, 1.9.3 and Rubinius - it may work on others but ymmv.
6
+
7
+ ## String::UTF8 Example
8
+
9
+ The String::UTF8#[] API isn't fully supported yet, and may never support regular expressions. I'll update this readme as I make progress.
10
+
11
+ ``` ruby
12
+ # String (on 1.8)
13
+ str = "你好世界"
14
+ str.length # => 12
15
+ str[0] # => 228
16
+ str.chars.to_a.inspect # => ["\344", "\275", "\240", "\345", "\245", "\275", "\344", "\270", "\226", "\347", "\225", "\214"]
17
+
18
+
19
+ # String::UTF8 (on 1.8)
20
+ utf8_str = "你好世界".as_utf8
21
+ utf8_str.length # => 4
22
+ utf8_str.chars.to_a.inspect # => ["你", "好", "世", "界"]
23
+
24
+ utf8_str[0] # => "你"
25
+ utf8_str[0, 3] # => "你好世"
26
+ utf8_str[0..3] # => "你好世界"
27
+ ```
28
+
29
+ ## StringScanner::UTF8 Example
30
+
31
+ The full StringScanner API hasn't been made UTF8-aware yet. I may try to finish it later, but the getch method is all I need at the moment.
32
+
33
+ ``` ruby
34
+ # StringScanner (on 1.8)
35
+ scanner = StringScanner.new("你好世界")
36
+ scanner.getch # => "\344"
37
+
38
+ # StringScanner::UTF8
39
+ require 'utf8/string_scanner'
40
+
41
+ # NOTE: we could have used scanner.as_utf8 instead
42
+ utf8_scanner = StringScanner::UTF8.new("你好世界")
43
+ utf8_scanner.getch # => "你"
44
+ utf8_scanner.getch # => "好"
45
+ utf8_scanner.getch # => "世"
46
+ utf8_scanner.reset
47
+ utf8_scanner.getch # => "你"
48
+ ```
49
+
50
+ ## Disclaimer
51
+
52
+ I'm only planning on making the methods I need UTF8-aware; if you need others, I welcome pull requests :)
@@ -343,15 +343,23 @@ static VALUE rb_cString_UTF8_clean(VALUE self) {
343
343
  int8_t curCharLen;
344
344
  VALUE rb_out;
345
345
 
346
+ outBuf = NULL;
347
+ outBufCur = NULL;
346
348
  inBuf = (unsigned char *)RSTRING_PTR(self);
347
349
  inBufCur = inBuf;
348
350
  len = RSTRING_LEN(self);
349
- outBuf = malloc(len);
350
- outBufCur = outBuf;
351
351
 
352
352
  for(i=0; i<len; i+=curCharLen) {
353
353
  curCharLen = utf8CharLen(inBufCur, len);
354
354
  if (curCharLen < 0) {
355
+ if (!outBuf) {
356
+ outBuf = xmalloc(len);
357
+ outBufCur = outBuf;
358
+ if (inBufCur-inBuf > 0) {
359
+ memcpy(outBufCur, inBuf, inBufCur-inBuf);
360
+ }
361
+ }
362
+
355
363
  if (inBufCur-inBuf > 0) {
356
364
  memcpy(outBufCur, inBuf, inBufCur-inBuf);
357
365
  outBufCur += inBufCur-inBuf;
@@ -364,14 +372,18 @@ static VALUE rb_cString_UTF8_clean(VALUE self) {
364
372
  inBufCur += curCharLen;
365
373
  }
366
374
 
367
- if (inBufCur-inBuf > 0) {
368
- memcpy(outBufCur, inBuf, inBufCur-inBuf);
369
- }
375
+ if (outBuf) {
376
+ if (inBufCur-inBuf > 0) {
377
+ memcpy(outBufCur, inBuf, inBufCur-inBuf);
378
+ }
370
379
 
371
- rb_out = rb_str_new((const char*)outBuf, len);
372
- AS_UTF8(rb_out);
380
+ rb_out = rb_str_new((const char*)outBuf, len);
381
+ xfree(outBuf);
373
382
 
374
- free(outBuf);
383
+ AS_UTF8(rb_out);
384
+ } else {
385
+ rb_out = self;
386
+ }
375
387
 
376
388
  return rb_out;
377
389
  }
@@ -1,5 +1,5 @@
1
1
  class String
2
2
  class UTF8 < ::String
3
- VERSION = "0.1.7"
3
+ VERSION = "0.1.8"
4
4
  end
5
5
  end
@@ -64,6 +64,11 @@ describe String::UTF8 do
64
64
  assert_equal "asdf24\342\206\222asdf24", "asdf24\342\206\222asdf24".as_utf8.clean
65
65
  end
66
66
 
67
+ test "clean should return the passed value if it was valid UTF-8" do
68
+ orig = "asdfasdf".as_utf8
69
+ assert_equal orig.object_id, orig.clean.object_id
70
+ end
71
+
67
72
  context "#length and #size" do
68
73
  test "should be utf8-aware" do
69
74
  assert_equal @utf8_len, @utf8.length
@@ -7,9 +7,6 @@ Gem::Specification.new do |s|
7
7
  s.date = Time.now.utc.strftime("%Y-%m-%d")
8
8
  s.email = %q{seniorlopez@gmail.com}
9
9
  s.extensions = ["ext/utf8/extconf.rb"]
10
- s.extra_rdoc_files = [
11
- "README.rdoc"
12
- ]
13
10
  s.files = `git ls-files`.split("\n")
14
11
  s.homepage = %q{http://github.com/brianmario/utf8}
15
12
  s.rdoc_options = ["--charset=UTF-8"]
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 11
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 7
10
- version: 0.1.7
9
+ - 8
10
+ version: 0.1.8
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brian Lopez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-11-22 00:00:00 Z
18
+ date: 2011-12-17 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: rake-compiler
@@ -69,15 +69,15 @@ executables: []
69
69
 
70
70
  extensions:
71
71
  - ext/utf8/extconf.rb
72
- extra_rdoc_files:
73
- - README.rdoc
72
+ extra_rdoc_files: []
73
+
74
74
  files:
75
75
  - .gitignore
76
76
  - .rspec
77
77
  - .travis.yml
78
78
  - Gemfile
79
79
  - MIT-LICENSE
80
- - README.rdoc
80
+ - README.md
81
81
  - Rakefile
82
82
  - benchmark/active_support.rb
83
83
  - benchmark/test.txt
@@ -1,48 +0,0 @@
1
- = A lightweight UTF8-aware String class meant for use with Ruby 1.8.x
2
-
3
- The scanning code is implemented in C (would love a Java version as well, if any of you want to help with that).
4
-
5
- At the moment, this gem is tested on 1.8.7, 1.9.2 and Rubinius 1.2.1dev - it may work on others but ymmv.
6
-
7
- == String::UTF8 Example
8
-
9
- The String::UTF8#[] API isn't fully supported yet, and may never support regular expressions. I'll update this readme as I make progress.
10
-
11
- # String (on 1.8)
12
- str = "你好世界"
13
- str.length # 12
14
- str[0] # 228
15
- str.chars.to_a.inspect # ["\344", "\275", "\240", "\345", "\245", "\275", "\344", "\270", "\226", "\347", "\225", "\214"]
16
-
17
-
18
- # String::UTF8 (on 1.8)
19
- utf8_str = "你好世界".as_utf8
20
- utf8_str.length # 4
21
- utf8.chars.to_a.inspect # => ["你", "好", "世", "界"]
22
-
23
- utf8[0] # => "你"
24
- utf8_str[0, 3] # => "你好世"
25
- utf8_str[0..3] # => "你好世界"
26
-
27
- == StringScanner::UTF8 Example
28
-
29
- The full StringScanner API hasn't been made UTF8-aware yet. I may try and finish it later, but the getch method is all I need at the moment.
30
-
31
- # StringScanner (on 1.8)
32
- scanner = StringScanner.new("你好世界")
33
- scanner.getch # => "\344"
34
-
35
- # StringScanner::UTF8
36
- require 'utf8/string_scanner'
37
-
38
- # NOTE: we could have used scanner.as_utf8 instead
39
- utf8_scanner = StringScanner::UTF8.new("你好世界")
40
- utf8_scanner.getch # => "你"
41
- utf8_scanner.getch # => "好"
42
- utf8_scanner.getch # => "世"
43
- utf8_scanner.reset
44
- utf8_scanner.getch # => "你"
45
-
46
- == Disclaimer
47
-
48
- I'm only planning on making the methods I need UTF8-aware, if you need others I welcome pull requests :)