utf8 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/MIT-LICENSE +20 -0
- data/README.rdoc +46 -0
- data/Rakefile +11 -0
- data/benchmark/active_support.rb +61 -0
- data/benchmark/test.txt +693 -0
- data/ext/utf8/ext.c +21 -0
- data/ext/utf8/ext.h +17 -0
- data/ext/utf8/extconf.rb +7 -0
- data/ext/utf8/string_scanner_utf8.c +68 -0
- data/ext/utf8/string_scanner_utf8.h +6 -0
- data/ext/utf8/string_utf8.c +224 -0
- data/ext/utf8/string_utf8.h +6 -0
- data/ext/utf8/utf8.c +80 -0
- data/ext/utf8/utf8.h +7 -0
- data/lib/utf8.rb +5 -0
- data/lib/utf8/string.rb +19 -0
- data/lib/utf8/string_scanner.rb +21 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/string_scanner_spec.rb +48 -0
- data/spec/string_spec.rb +151 -0
- data/utf8.gemspec +37 -0
- metadata +120 -0
data/lib/utf8.rb
ADDED
data/lib/utf8/string.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
class String
|
2
|
+
# Wraps your string in an UTF8-aware version of String
|
3
|
+
def as_utf8
|
4
|
+
String::UTF8.new(self)
|
5
|
+
end
|
6
|
+
|
7
|
+
class UTF8
|
8
|
+
VERSION = "0.1.0"
|
9
|
+
|
10
|
+
# Gives you access to the raw non-UTF8-aware version of the string
|
11
|
+
def as_raw
|
12
|
+
::String.new(self)
|
13
|
+
end
|
14
|
+
|
15
|
+
alias :size :length
|
16
|
+
alias :chars :each_char
|
17
|
+
alias :slice :[]
|
18
|
+
end
|
19
|
+
end
|
data/lib/utf8/string_scanner.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'utf8'
|
2
|
+
|
3
|
+
class StringScanner
|
4
|
+
# Returns an UTF8-aware version of StringScanner wrapping your original string
|
5
|
+
#
|
6
|
+
# NOTE: this will lose all state associated with the current StringScanner instance
|
7
|
+
# (like the current scan position)
|
8
|
+
def as_utf8
|
9
|
+
StringScanner::UTF8.new(self.string.as_utf8)
|
10
|
+
end
|
11
|
+
|
12
|
+
class UTF8
|
13
|
+
# Returns a non-UTF8-aware version of StringScanner wrapping your original string
|
14
|
+
#
|
15
|
+
# NOTE: this will lose all state associated with the current StringScanner::UTF8 instance
|
16
|
+
# (like the current scan position)
|
17
|
+
def as_raw
|
18
|
+
StringScanner.new(self.string)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/spec/spec_helper.rb
ADDED
data/spec/string_scanner_spec.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require File.expand_path('../spec_helper', __FILE__)
|
3
|
+
|
4
|
+
describe StringScanner::UTF8 do
|
5
|
+
before(:all) do
|
6
|
+
@char_array = ["怎", "麼", "也", "沒", "人", "寫", "了", "這", "個", "嗎"]
|
7
|
+
@scanner = StringScanner.new(@char_array.join)
|
8
|
+
@utf8_scanner = @scanner.as_utf8
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should extend StringScanner, adding an as_utf8 method that returns a StringScanner::UTF8 instance" do
|
12
|
+
@scanner.should respond_to(:as_utf8)
|
13
|
+
@scanner.as_utf8.class.should eql(StringScanner::UTF8)
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should allow access to a regular (non-utf8-aware) StringScanner based on it's string" do
|
17
|
+
raw = @utf8_scanner.as_raw
|
18
|
+
raw.class.should eql(StringScanner)
|
19
|
+
raw.string.should eql(@utf8_scanner.string)
|
20
|
+
end
|
21
|
+
|
22
|
+
it "#getch should be utf8-aware" do
|
23
|
+
i=0
|
24
|
+
while char = @utf8_scanner.getch
|
25
|
+
char.should eql(@char_array[i])
|
26
|
+
i+=1
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should be able to be reset" do
|
31
|
+
i=0
|
32
|
+
while char = @utf8_scanner.getch
|
33
|
+
char.should eql(@char_array[i])
|
34
|
+
if i == 4
|
35
|
+
break
|
36
|
+
end
|
37
|
+
i+=1
|
38
|
+
end
|
39
|
+
|
40
|
+
@utf8_scanner.reset
|
41
|
+
|
42
|
+
i=0
|
43
|
+
while char = @utf8_scanner.getch
|
44
|
+
char.should eql(@char_array[i])
|
45
|
+
i+=1
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
data/spec/string_spec.rb
ADDED
@@ -0,0 +1,151 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require File.expand_path('../spec_helper', __FILE__)
|
3
|
+
|
4
|
+
describe String::UTF8 do
|
5
|
+
before(:all) do
|
6
|
+
@char_array = ["怎", "麼", "也", "沒", "人", "寫", "了", "這", "個", "嗎"]
|
7
|
+
@str = @char_array.join
|
8
|
+
@utf8 = @str.as_utf8
|
9
|
+
@utf8_len = @char_array.size
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should extend String, adding an as_utf8 method that returns a String::UTF8 instance" do
|
13
|
+
"".should respond_to(:as_utf8)
|
14
|
+
"".as_utf8.class.should eql(String::UTF8)
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should allow access to the underlying raw string" do
|
18
|
+
raw = @utf8.as_raw
|
19
|
+
raw.class.should eql(String)
|
20
|
+
if defined? Encoding
|
21
|
+
raw.length.should eql(@utf8_len)
|
22
|
+
else
|
23
|
+
raw.length.should eql(@str.size)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should wrap all returned strings to be utf8-aware" do
|
28
|
+
@utf8[0].class.should eql(String::UTF8)
|
29
|
+
@utf8.chars.to_a[0].class.should eql(String::UTF8)
|
30
|
+
end
|
31
|
+
|
32
|
+
context "#length and #size" do
|
33
|
+
it "should be utf8-aware" do
|
34
|
+
@utf8.length.should eql(@utf8_len)
|
35
|
+
@utf8.size.should eql(@utf8_len)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
context "#chars and #each_char" do
|
40
|
+
it "should be utf8-aware" do
|
41
|
+
klass = begin
|
42
|
+
if defined? Encoding
|
43
|
+
Enumerator
|
44
|
+
else
|
45
|
+
Enumerable::Enumerator
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
@utf8.chars.class.should eql(klass)
|
50
|
+
i=0
|
51
|
+
@utf8.chars do |char|
|
52
|
+
char.should_not be_nil
|
53
|
+
i+=1
|
54
|
+
end
|
55
|
+
joined = @utf8.chars.to_a.join
|
56
|
+
@utf8.should eql(joined)
|
57
|
+
@utf8.chars.to_a.size.should eql(@utf8_len)
|
58
|
+
@utf8.chars.to_a.should eql(@char_array)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
context "[offset] syntax" do
|
63
|
+
it "should be utf8-aware" do
|
64
|
+
@char_array.each_with_index do |char, i|
|
65
|
+
utf8_char = @utf8[i]
|
66
|
+
utf8_char.should eql(char)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
it "should support negative indices" do
|
71
|
+
utf8_char = @utf8[-5]
|
72
|
+
utf8_char.should eql(@char_array[-5])
|
73
|
+
end
|
74
|
+
|
75
|
+
it "should return nil for out of range indices" do
|
76
|
+
@utf8[100].should be_nil
|
77
|
+
@utf8[-100].should be_nil
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
context "[offset, length] syntax" do
|
82
|
+
it "should be utf8-aware" do
|
83
|
+
utf8_char = @utf8[1, 4]
|
84
|
+
utf8_char.should eql(@char_array[1, 4].join)
|
85
|
+
|
86
|
+
utf8_char = @utf8[0, 6]
|
87
|
+
utf8_char.should eql(@char_array[0, 6].join)
|
88
|
+
|
89
|
+
# this will fail due to a bug in 1.9
|
90
|
+
unless defined? Encoding
|
91
|
+
utf8_char = @utf8[6, 100]
|
92
|
+
utf8_char.should eql(@char_array[6, 100].join)
|
93
|
+
end
|
94
|
+
|
95
|
+
utf8_char = @utf8[-1, 2]
|
96
|
+
utf8_char.should eql(@char_array[-1, 2].join)
|
97
|
+
|
98
|
+
utf8_char = @utf8[-1, 100]
|
99
|
+
utf8_char.should eql(@char_array[-1, 100].join)
|
100
|
+
|
101
|
+
utf8_char = @utf8[0, 0]
|
102
|
+
utf8_char.should eql(@char_array[0, 0].join)
|
103
|
+
end
|
104
|
+
|
105
|
+
it "should return nil for an out of range offset or length" do
|
106
|
+
@utf8[100, 100].should be_nil
|
107
|
+
@utf8[-100, 100].should be_nil
|
108
|
+
@utf8[0, -100].should be_nil
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
context "[Range] syntax" do
|
113
|
+
it "should be utf8-aware" do
|
114
|
+
utf8_char = @utf8[1..4]
|
115
|
+
utf8_char.should eql(@char_array[1..4].join)
|
116
|
+
|
117
|
+
utf8_char = @utf8[0..6]
|
118
|
+
utf8_char.should eql(@char_array[0..6].join)
|
119
|
+
|
120
|
+
# this will fail due to a bug in 1.9
|
121
|
+
unless defined? Encoding
|
122
|
+
utf8_char = @utf8[6..100]
|
123
|
+
utf8_char.should eql(@char_array[6..100].join)
|
124
|
+
end
|
125
|
+
|
126
|
+
utf8_char = @utf8[-1..2]
|
127
|
+
utf8_char.should eql(@char_array[-1..2].join)
|
128
|
+
|
129
|
+
utf8_char = @utf8[-1..100]
|
130
|
+
utf8_char.should eql(@char_array[-1..100].join)
|
131
|
+
end
|
132
|
+
|
133
|
+
it "should return nil for an out of range offset or length" do
|
134
|
+
@utf8[100..100].should be_nil
|
135
|
+
@utf8[-100..100].should be_nil
|
136
|
+
@utf8[0..-100].should eql("")
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
it "[Regexp] syntax shouldn't be supported yet" do
|
141
|
+
lambda {
|
142
|
+
@utf8[/a/]
|
143
|
+
}.should raise_error(ArgumentError)
|
144
|
+
end
|
145
|
+
|
146
|
+
it "[Regexp, match_index] syntax shouldn't be supported yet" do
|
147
|
+
lambda {
|
148
|
+
@utf8[/(a)/, 1]
|
149
|
+
}.should raise_error(ArgumentError)
|
150
|
+
end
|
151
|
+
end
|
data/utf8.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.name = %q{utf8}
|
5
|
+
s.version = "0.1.0"
|
6
|
+
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
|
+
s.authors = ["Brian Lopez"]
|
9
|
+
s.date = %q{2011-01-12}
|
10
|
+
s.email = %q{seniorlopez@gmail.com}
|
11
|
+
s.extensions = ["ext/utf8/extconf.rb"]
|
12
|
+
s.extra_rdoc_files = [
|
13
|
+
"README.rdoc"
|
14
|
+
]
|
15
|
+
s.files = [".gitignore", "MIT-LICENSE", "README.rdoc", "Rakefile", "benchmark/active_support.rb", "benchmark/test.txt", "ext/utf8/ext.c", "ext/utf8/ext.h", "ext/utf8/extconf.rb", "ext/utf8/string_scanner_utf8.c", "ext/utf8/string_scanner_utf8.h", "ext/utf8/string_utf8.c", "ext/utf8/string_utf8.h", "ext/utf8/utf8.c", "ext/utf8/utf8.h", "lib/utf8.rb", "lib/utf8/string.rb", "lib/utf8/string_scanner.rb", "spec/spec_helper.rb", "spec/string_scanner_spec.rb", "spec/string_spec.rb", "utf8.gemspec"]
|
16
|
+
s.homepage = %q{http://github.com/brianmario/utf8}
|
17
|
+
s.require_paths = ["lib", "ext"]
|
18
|
+
s.rubygems_version = %q{1.4.2}
|
19
|
+
s.summary = %q{A lightweight UTF8-aware String class meant for use with Ruby 1.8}
|
20
|
+
s.test_files = ["spec/spec_helper.rb", "spec/string_scanner_spec.rb", "spec/string_spec.rb"]
|
21
|
+
|
22
|
+
if s.respond_to? :specification_version then
|
23
|
+
s.specification_version = 3
|
24
|
+
|
25
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
26
|
+
s.add_development_dependency(%q<rake-compiler>, [">= 0.7.5"])
|
27
|
+
s.add_development_dependency(%q<rspec>, [">= 0"])
|
28
|
+
else
|
29
|
+
s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
|
30
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
31
|
+
end
|
32
|
+
else
|
33
|
+
s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
|
34
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
metadata
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: utf8
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Brian Lopez
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-01-12 00:00:00 -08:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: rake-compiler
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 9
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
- 7
|
33
|
+
- 5
|
34
|
+
version: 0.7.5
|
35
|
+
type: :development
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: rspec
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
hash: 3
|
46
|
+
segments:
|
47
|
+
- 0
|
48
|
+
version: "0"
|
49
|
+
type: :development
|
50
|
+
version_requirements: *id002
|
51
|
+
description:
|
52
|
+
email: seniorlopez@gmail.com
|
53
|
+
executables: []
|
54
|
+
|
55
|
+
extensions:
|
56
|
+
- ext/utf8/extconf.rb
|
57
|
+
extra_rdoc_files:
|
58
|
+
- README.rdoc
|
59
|
+
files:
|
60
|
+
- .gitignore
|
61
|
+
- MIT-LICENSE
|
62
|
+
- README.rdoc
|
63
|
+
- Rakefile
|
64
|
+
- benchmark/active_support.rb
|
65
|
+
- benchmark/test.txt
|
66
|
+
- ext/utf8/ext.c
|
67
|
+
- ext/utf8/ext.h
|
68
|
+
- ext/utf8/extconf.rb
|
69
|
+
- ext/utf8/string_scanner_utf8.c
|
70
|
+
- ext/utf8/string_scanner_utf8.h
|
71
|
+
- ext/utf8/string_utf8.c
|
72
|
+
- ext/utf8/string_utf8.h
|
73
|
+
- ext/utf8/utf8.c
|
74
|
+
- ext/utf8/utf8.h
|
75
|
+
- lib/utf8.rb
|
76
|
+
- lib/utf8/string.rb
|
77
|
+
- lib/utf8/string_scanner.rb
|
78
|
+
- spec/spec_helper.rb
|
79
|
+
- spec/string_scanner_spec.rb
|
80
|
+
- spec/string_spec.rb
|
81
|
+
- utf8.gemspec
|
82
|
+
has_rdoc: true
|
83
|
+
homepage: http://github.com/brianmario/utf8
|
84
|
+
licenses: []
|
85
|
+
|
86
|
+
post_install_message:
|
87
|
+
rdoc_options: []
|
88
|
+
|
89
|
+
require_paths:
|
90
|
+
- lib
|
91
|
+
- ext
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
none: false
|
94
|
+
requirements:
|
95
|
+
- - ">="
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
hash: 3
|
98
|
+
segments:
|
99
|
+
- 0
|
100
|
+
version: "0"
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
none: false
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
hash: 3
|
107
|
+
segments:
|
108
|
+
- 0
|
109
|
+
version: "0"
|
110
|
+
requirements: []
|
111
|
+
|
112
|
+
rubyforge_project:
|
113
|
+
rubygems_version: 1.4.2
|
114
|
+
signing_key:
|
115
|
+
specification_version: 3
|
116
|
+
summary: A lightweight UTF8-aware String class meant for use with Ruby 1.8
|
117
|
+
test_files:
|
118
|
+
- spec/spec_helper.rb
|
119
|
+
- spec/string_scanner_spec.rb
|
120
|
+
- spec/string_spec.rb
|