utf8 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +52 -0
- data/ext/utf8/string_utf8.c +20 -8
- data/lib/utf8/version.rb +1 -1
- data/spec/string_spec.rb +5 -0
- data/utf8.gemspec +0 -3
- metadata +7 -7
- data/README.rdoc +0 -48
data/README.md
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# A lightweight UTF8-aware String class meant for use with Ruby 1.8.x
|
2
|
+
|
3
|
+
The scanning code is implemented in C (would love a Java version as well, if any of you want to help with that).
|
4
|
+
|
5
|
+
At the moment, this gem is tested on 1.8.7, 1.9.2, 1.9.3 and Rubinius - it may work on others but ymmv.
|
6
|
+
|
7
|
+
## String::UTF8 Example
|
8
|
+
|
9
|
+
The String::UTF8#[] API isn't fully supported yet, and may never support regular expressions. I'll update this readme as I make progress.
|
10
|
+
|
11
|
+
``` ruby
|
12
|
+
# String (on 1.8)
|
13
|
+
str = "你好世界"
|
14
|
+
str.length # 12
|
15
|
+
str[0] # 228
|
16
|
+
str.chars.to_a.inspect # ["\344", "\275", "\240", "\345", "\245", "\275", "\344", "\270", "\226", "\347", "\225", "\214"]
|
17
|
+
|
18
|
+
|
19
|
+
# String::UTF8 (on 1.8)
|
20
|
+
utf8_str = "你好世界".as_utf8
|
21
|
+
utf8_str.length # 4
|
22
|
+
utf8_str.chars.to_a.inspect # => ["你", "好", "世", "界"]
|
23
|
+
|
24
|
+
utf8_str[0] # => "你"
|
25
|
+
utf8_str[0, 3] # => "你好世"
|
26
|
+
utf8_str[0..3] # => "你好世界"
|
27
|
+
```
|
28
|
+
|
29
|
+
## StringScanner::UTF8 Example
|
30
|
+
|
31
|
+
The full StringScanner API hasn't been made UTF8-aware yet. I may try and finish it later, but the getch method is all I need at the moment.
|
32
|
+
|
33
|
+
``` ruby
|
34
|
+
# StringScanner (on 1.8)
|
35
|
+
scanner = StringScanner.new("你好世界")
|
36
|
+
scanner.getch # => "\344"
|
37
|
+
|
38
|
+
# StringScanner::UTF8
|
39
|
+
require 'utf8/string_scanner'
|
40
|
+
|
41
|
+
# NOTE: we could have used scanner.as_utf8 instead
|
42
|
+
utf8_scanner = StringScanner::UTF8.new("你好世界")
|
43
|
+
utf8_scanner.getch # => "你"
|
44
|
+
utf8_scanner.getch # => "好"
|
45
|
+
utf8_scanner.getch # => "世"
|
46
|
+
utf8_scanner.reset
|
47
|
+
utf8_scanner.getch # => "你"
|
48
|
+
```
|
49
|
+
|
50
|
+
## Disclaimer
|
51
|
+
|
52
|
+
I'm only planning on making the methods I need UTF8-aware; if you need others, I welcome pull requests :)
|
data/ext/utf8/string_utf8.c
CHANGED
@@ -343,15 +343,23 @@ static VALUE rb_cString_UTF8_clean(VALUE self) {
|
|
343
343
|
int8_t curCharLen;
|
344
344
|
VALUE rb_out;
|
345
345
|
|
346
|
+
outBuf = NULL;
|
347
|
+
outBufCur = NULL;
|
346
348
|
inBuf = (unsigned char *)RSTRING_PTR(self);
|
347
349
|
inBufCur = inBuf;
|
348
350
|
len = RSTRING_LEN(self);
|
349
|
-
outBuf = malloc(len);
|
350
|
-
outBufCur = outBuf;
|
351
351
|
|
352
352
|
for(i=0; i<len; i+=curCharLen) {
|
353
353
|
curCharLen = utf8CharLen(inBufCur, len);
|
354
354
|
if (curCharLen < 0) {
|
355
|
+
if (!outBuf) {
|
356
|
+
outBuf = xmalloc(len);
|
357
|
+
outBufCur = outBuf;
|
358
|
+
if (inBufCur-inBuf > 0) {
|
359
|
+
memcpy(outBufCur, inBuf, inBufCur-inBuf);
|
360
|
+
}
|
361
|
+
}
|
362
|
+
|
355
363
|
if (inBufCur-inBuf > 0) {
|
356
364
|
memcpy(outBufCur, inBuf, inBufCur-inBuf);
|
357
365
|
outBufCur += inBufCur-inBuf;
|
@@ -364,14 +372,18 @@ static VALUE rb_cString_UTF8_clean(VALUE self) {
|
|
364
372
|
inBufCur += curCharLen;
|
365
373
|
}
|
366
374
|
|
367
|
-
if (
|
368
|
-
|
369
|
-
|
375
|
+
if (outBuf) {
|
376
|
+
if (inBufCur-inBuf > 0) {
|
377
|
+
memcpy(outBufCur, inBuf, inBufCur-inBuf);
|
378
|
+
}
|
370
379
|
|
371
|
-
|
372
|
-
|
380
|
+
rb_out = rb_str_new((const char*)outBuf, len);
|
381
|
+
xfree(outBuf);
|
373
382
|
|
374
|
-
|
383
|
+
AS_UTF8(rb_out);
|
384
|
+
} else {
|
385
|
+
rb_out = self;
|
386
|
+
}
|
375
387
|
|
376
388
|
return rb_out;
|
377
389
|
}
|
data/lib/utf8/version.rb
CHANGED
data/spec/string_spec.rb
CHANGED
@@ -64,6 +64,11 @@ describe String::UTF8 do
|
|
64
64
|
assert_equal "asdf24\342\206\222asdf24", "asdf24\342\206\222asdf24".as_utf8.clean
|
65
65
|
end
|
66
66
|
|
67
|
+
test "clean should return the passed value if it was valid UTF-8" do
|
68
|
+
orig = "asdfasdf".as_utf8
|
69
|
+
assert_equal orig.object_id, orig.clean.object_id
|
70
|
+
end
|
71
|
+
|
67
72
|
context "#length and #size" do
|
68
73
|
test "should be utf8-aware" do
|
69
74
|
assert_equal @utf8_len, @utf8.length
|
data/utf8.gemspec
CHANGED
@@ -7,9 +7,6 @@ Gem::Specification.new do |s|
|
|
7
7
|
s.date = Time.now.utc.strftime("%Y-%m-%d")
|
8
8
|
s.email = %q{seniorlopez@gmail.com}
|
9
9
|
s.extensions = ["ext/utf8/extconf.rb"]
|
10
|
-
s.extra_rdoc_files = [
|
11
|
-
"README.rdoc"
|
12
|
-
]
|
13
10
|
s.files = `git ls-files`.split("\n")
|
14
11
|
s.homepage = %q{http://github.com/brianmario/utf8}
|
15
12
|
s.rdoc_options = ["--charset=UTF-8"]
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 11
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 8
|
10
|
+
version: 0.1.8
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Brian Lopez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-12-17 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: rake-compiler
|
@@ -69,15 +69,15 @@ executables: []
|
|
69
69
|
|
70
70
|
extensions:
|
71
71
|
- ext/utf8/extconf.rb
|
72
|
-
extra_rdoc_files:
|
73
|
-
|
72
|
+
extra_rdoc_files: []
|
73
|
+
|
74
74
|
files:
|
75
75
|
- .gitignore
|
76
76
|
- .rspec
|
77
77
|
- .travis.yml
|
78
78
|
- Gemfile
|
79
79
|
- MIT-LICENSE
|
80
|
-
- README.
|
80
|
+
- README.md
|
81
81
|
- Rakefile
|
82
82
|
- benchmark/active_support.rb
|
83
83
|
- benchmark/test.txt
|
data/README.rdoc
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
= A lightweight UTF8-aware String class meant for use with Ruby 1.8.x
|
2
|
-
|
3
|
-
The scanning code is implemented in C (would love a Java version as well, if any of you want to help with that).
|
4
|
-
|
5
|
-
At the moment, this gem is tested on 1.8.7, 1.9.2 and Rubinius 1.2.1dev - it may work on others but ymmv.
|
6
|
-
|
7
|
-
== String::UTF8 Example
|
8
|
-
|
9
|
-
The String::UTF8#[] API isn't fully supported yet, and may never support regular expressions. I'll update this readme as I make progress.
|
10
|
-
|
11
|
-
# String (on 1.8)
|
12
|
-
str = "你好世界"
|
13
|
-
str.length # 12
|
14
|
-
str[0] # 228
|
15
|
-
str.chars.to_a.inspect # ["\344", "\275", "\240", "\345", "\245", "\275", "\344", "\270", "\226", "\347", "\225", "\214"]
|
16
|
-
|
17
|
-
|
18
|
-
# String::UTF8 (on 1.8)
|
19
|
-
utf8_str = "你好世界".as_utf8
|
20
|
-
utf8_str.length # 4
|
21
|
-
utf8.chars.to_a.inspect # => ["你", "好", "世", "界"]
|
22
|
-
|
23
|
-
utf8[0] # => "你"
|
24
|
-
utf8_str[0, 3] # => "你好世"
|
25
|
-
utf8_str[0..3] # => "你好世界"
|
26
|
-
|
27
|
-
== StringScanner::UTF8 Example
|
28
|
-
|
29
|
-
The full StringScanner API hasn't been made UTF8-aware yet. I may try and finish it later, but the getch method is all I need at the moment.
|
30
|
-
|
31
|
-
# StringScanner (on 1.8)
|
32
|
-
scanner = StringScanner.new("你好世界")
|
33
|
-
scanner.getch # => "\344"
|
34
|
-
|
35
|
-
# StringScanner::UTF8
|
36
|
-
require 'utf8/string_scanner'
|
37
|
-
|
38
|
-
# NOTE: we could have used scanner.as_utf8 instead
|
39
|
-
utf8_scanner = StringScanner::UTF8.new("你好世界")
|
40
|
-
utf8_scanner.getch # => "你"
|
41
|
-
utf8_scanner.getch # => "好"
|
42
|
-
utf8_scanner.getch # => "世"
|
43
|
-
utf8_scanner.reset
|
44
|
-
utf8_scanner.getch # => "你"
|
45
|
-
|
46
|
-
== Disclaimer
|
47
|
-
|
48
|
-
I'm only planning on making the methods I need UTF8-aware, if you need others I welcome pull requests :)
|