utf8 0.1.7 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +52 -0
- data/ext/utf8/string_utf8.c +20 -8
- data/lib/utf8/version.rb +1 -1
- data/spec/string_spec.rb +5 -0
- data/utf8.gemspec +0 -3
- metadata +7 -7
- data/README.rdoc +0 -48
data/README.md
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# A lightweight UTF8-aware String class meant for use with Ruby 1.8.x
|
2
|
+
|
3
|
+
The scanning code is implemented in C (would love a Java version as well, if any of you want to help with that).
|
4
|
+
|
5
|
+
At the moment, this gem is tested on 1.8.7, 1.9.2, 1.9.3 and Rubinius - it may work on others but ymmv.
|
6
|
+
|
7
|
+
## String::UTF8 Example
|
8
|
+
|
9
|
+
The String::UTF8#[] API isn't fully supported yet, and may never support regular expressions. I'll update this readme as I make progress.
|
10
|
+
|
11
|
+
``` ruby
|
12
|
+
# String (on 1.8)
|
13
|
+
str = "你好世界"
|
14
|
+
str.length # 12
|
15
|
+
str[0] # 228
|
16
|
+
str.chars.to_a.inspect # ["\344", "\275", "\240", "\345", "\245", "\275", "\344", "\270", "\226", "\347", "\225", "\214"]
|
17
|
+
|
18
|
+
|
19
|
+
# String::UTF8 (on 1.8)
|
20
|
+
utf8_str = "你好世界".as_utf8
|
21
|
+
utf8_str.length # 4
|
22
|
+
utf8_str.chars.to_a.inspect # => ["你", "好", "世", "界"]
|
23
|
+
|
24
|
+
utf8_str[0] # => "你"
|
25
|
+
utf8_str[0, 3] # => "你好世"
|
26
|
+
utf8_str[0..3] # => "你好世界"
|
27
|
+
```
|
28
|
+
|
29
|
+
## StringScanner::UTF8 Example
|
30
|
+
|
31
|
+
The full StringScanner API hasn't been made UTF8-aware yet. I may try to finish it later, but the getch method is all I need at the moment.
|
32
|
+
|
33
|
+
``` ruby
|
34
|
+
# StringScanner (on 1.8)
|
35
|
+
scanner = StringScanner.new("你好世界")
|
36
|
+
scanner.getch # => "\344"
|
37
|
+
|
38
|
+
# StringScanner::UTF8
|
39
|
+
require 'utf8/string_scanner'
|
40
|
+
|
41
|
+
# NOTE: we could have used scanner.as_utf8 instead
|
42
|
+
utf8_scanner = StringScanner::UTF8.new("你好世界")
|
43
|
+
utf8_scanner.getch # => "你"
|
44
|
+
utf8_scanner.getch # => "好"
|
45
|
+
utf8_scanner.getch # => "世"
|
46
|
+
utf8_scanner.reset
|
47
|
+
utf8_scanner.getch # => "你"
|
48
|
+
```
|
49
|
+
|
50
|
+
## Disclaimer
|
51
|
+
|
52
|
+
I'm only planning on making the methods I need UTF8-aware; if you need others, I welcome pull requests :)
|
data/ext/utf8/string_utf8.c
CHANGED
@@ -343,15 +343,23 @@ static VALUE rb_cString_UTF8_clean(VALUE self) {
|
|
343
343
|
int8_t curCharLen;
|
344
344
|
VALUE rb_out;
|
345
345
|
|
346
|
+
outBuf = NULL;
|
347
|
+
outBufCur = NULL;
|
346
348
|
inBuf = (unsigned char *)RSTRING_PTR(self);
|
347
349
|
inBufCur = inBuf;
|
348
350
|
len = RSTRING_LEN(self);
|
349
|
-
outBuf = malloc(len);
|
350
|
-
outBufCur = outBuf;
|
351
351
|
|
352
352
|
for(i=0; i<len; i+=curCharLen) {
|
353
353
|
curCharLen = utf8CharLen(inBufCur, len);
|
354
354
|
if (curCharLen < 0) {
|
355
|
+
if (!outBuf) {
|
356
|
+
outBuf = xmalloc(len);
|
357
|
+
outBufCur = outBuf;
|
358
|
+
if (inBufCur-inBuf > 0) {
|
359
|
+
memcpy(outBufCur, inBuf, inBufCur-inBuf);
|
360
|
+
}
|
361
|
+
}
|
362
|
+
|
355
363
|
if (inBufCur-inBuf > 0) {
|
356
364
|
memcpy(outBufCur, inBuf, inBufCur-inBuf);
|
357
365
|
outBufCur += inBufCur-inBuf;
|
@@ -364,14 +372,18 @@ static VALUE rb_cString_UTF8_clean(VALUE self) {
|
|
364
372
|
inBufCur += curCharLen;
|
365
373
|
}
|
366
374
|
|
367
|
-
if (
|
368
|
-
|
369
|
-
|
375
|
+
if (outBuf) {
|
376
|
+
if (inBufCur-inBuf > 0) {
|
377
|
+
memcpy(outBufCur, inBuf, inBufCur-inBuf);
|
378
|
+
}
|
370
379
|
|
371
|
-
|
372
|
-
|
380
|
+
rb_out = rb_str_new((const char*)outBuf, len);
|
381
|
+
xfree(outBuf);
|
373
382
|
|
374
|
-
|
383
|
+
AS_UTF8(rb_out);
|
384
|
+
} else {
|
385
|
+
rb_out = self;
|
386
|
+
}
|
375
387
|
|
376
388
|
return rb_out;
|
377
389
|
}
|
data/lib/utf8/version.rb
CHANGED
data/spec/string_spec.rb
CHANGED
@@ -64,6 +64,11 @@ describe String::UTF8 do
|
|
64
64
|
assert_equal "asdf24\342\206\222asdf24", "asdf24\342\206\222asdf24".as_utf8.clean
|
65
65
|
end
|
66
66
|
|
67
|
+
test "clean should return the passed value if it was valid UTF-8" do
|
68
|
+
orig = "asdfasdf".as_utf8
|
69
|
+
assert_equal orig.object_id, orig.clean.object_id
|
70
|
+
end
|
71
|
+
|
67
72
|
context "#length and #size" do
|
68
73
|
test "should be utf8-aware" do
|
69
74
|
assert_equal @utf8_len, @utf8.length
|
data/utf8.gemspec
CHANGED
@@ -7,9 +7,6 @@ Gem::Specification.new do |s|
|
|
7
7
|
s.date = Time.now.utc.strftime("%Y-%m-%d")
|
8
8
|
s.email = %q{seniorlopez@gmail.com}
|
9
9
|
s.extensions = ["ext/utf8/extconf.rb"]
|
10
|
-
s.extra_rdoc_files = [
|
11
|
-
"README.rdoc"
|
12
|
-
]
|
13
10
|
s.files = `git ls-files`.split("\n")
|
14
11
|
s.homepage = %q{http://github.com/brianmario/utf8}
|
15
12
|
s.rdoc_options = ["--charset=UTF-8"]
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 11
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 7
|
10
|
-
version: 0.1.7
|
9
|
+
- 8
|
10
|
+
version: 0.1.8
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Brian Lopez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-12-17 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: rake-compiler
|
@@ -69,15 +69,15 @@ executables: []
|
|
69
69
|
|
70
70
|
extensions:
|
71
71
|
- ext/utf8/extconf.rb
|
72
|
-
extra_rdoc_files:
|
73
|
-
|
72
|
+
extra_rdoc_files: []
|
73
|
+
|
74
74
|
files:
|
75
75
|
- .gitignore
|
76
76
|
- .rspec
|
77
77
|
- .travis.yml
|
78
78
|
- Gemfile
|
79
79
|
- MIT-LICENSE
|
80
|
-
- README.rdoc
|
80
|
+
- README.md
|
81
81
|
- Rakefile
|
82
82
|
- benchmark/active_support.rb
|
83
83
|
- benchmark/test.txt
|
data/README.rdoc
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
= A lightweight UTF8-aware String class meant for use with Ruby 1.8.x
|
2
|
-
|
3
|
-
The scanning code is implemented in C (would love a Java version as well, if any of you want to help with that).
|
4
|
-
|
5
|
-
At the moment, this gem is tested on 1.8.7, 1.9.2 and Rubinius 1.2.1dev - it may work on others but ymmv.
|
6
|
-
|
7
|
-
== String::UTF8 Example
|
8
|
-
|
9
|
-
The String::UTF8#[] API isn't fully supported yet, and may never support regular expressions. I'll update this readme as I make progress.
|
10
|
-
|
11
|
-
# String (on 1.8)
|
12
|
-
str = "你好世界"
|
13
|
-
str.length # 12
|
14
|
-
str[0] # 228
|
15
|
-
str.chars.to_a.inspect # ["\344", "\275", "\240", "\345", "\245", "\275", "\344", "\270", "\226", "\347", "\225", "\214"]
|
16
|
-
|
17
|
-
|
18
|
-
# String::UTF8 (on 1.8)
|
19
|
-
utf8_str = "你好世界".as_utf8
|
20
|
-
utf8_str.length # 4
|
21
|
-
utf8.chars.to_a.inspect # => ["你", "好", "世", "界"]
|
22
|
-
|
23
|
-
utf8[0] # => "你"
|
24
|
-
utf8_str[0, 3] # => "你好世"
|
25
|
-
utf8_str[0..3] # => "你好世界"
|
26
|
-
|
27
|
-
== StringScanner::UTF8 Example
|
28
|
-
|
29
|
-
The full StringScanner API hasn't been made UTF8-aware yet. I may try and finish it later, but the getch method is all I need at the moment.
|
30
|
-
|
31
|
-
# StringScanner (on 1.8)
|
32
|
-
scanner = StringScanner.new("你好世界")
|
33
|
-
scanner.getch # => "\344"
|
34
|
-
|
35
|
-
# StringScanner::UTF8
|
36
|
-
require 'utf8/string_scanner'
|
37
|
-
|
38
|
-
# NOTE: we could have used scanner.as_utf8 instead
|
39
|
-
utf8_scanner = StringScanner::UTF8.new("你好世界")
|
40
|
-
utf8_scanner.getch # => "你"
|
41
|
-
utf8_scanner.getch # => "好"
|
42
|
-
utf8_scanner.getch # => "世"
|
43
|
-
utf8_scanner.reset
|
44
|
-
utf8_scanner.getch # => "你"
|
45
|
-
|
46
|
-
== Disclaimer
|
47
|
-
|
48
|
-
I'm only planning on making the methods I need UTF8-aware, if you need others I welcome pull requests :)
|