utf8 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --format=documentation
3
+ --fail-fast
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source :rubygems
2
+
3
+ gemspec
data/Rakefile CHANGED
@@ -1,6 +1,25 @@
1
+ # rspec
2
+ begin
3
+ require 'rspec'
4
+ require 'rspec/core/rake_task'
5
+
6
+ desc "Run all examples with RCov"
7
+ RSpec::Core::RakeTask.new('spec:rcov') do |t|
8
+ t.rcov = true
9
+ end
10
+ RSpec::Core::RakeTask.new('spec') do |t|
11
+ t.verbose = true
12
+ end
13
+
14
+ task :default => :spec
15
+ rescue LoadError
16
+ puts "rspec, or one of its dependencies, is not available. Install it with: sudo gem install rspec"
17
+ end
18
+
19
+ # rake-compiler
1
20
  require 'rake' unless defined? Rake
2
21
 
3
- gem 'rake-compiler', '~> 0.7.1'
22
+ gem 'rake-compiler', '>= 0.7.5'
4
23
  require "rake/extensiontask"
5
24
 
6
25
  Rake::ExtensionTask.new('utf8') do |ext|
@@ -9,3 +28,5 @@ Rake::ExtensionTask.new('utf8') do |ext|
9
28
 
10
29
  ext.lib_dir = File.join 'lib', 'utf8'
11
30
  end
31
+
32
+ Rake::Task[:spec].prerequisites << :compile
@@ -13,49 +13,39 @@ as_mb = ActiveSupport::Multibyte::Chars.new(raw)
13
13
  times = 1000
14
14
 
15
15
  puts "String::UTF8"
16
- Benchmark.bmbm do |x|
17
- x.report {
18
- puts "#length"
16
+ Benchmark.bmbm { |x|
17
+ x.report("#length") {
19
18
  times.times {utf8.length}
20
19
  }
21
- x.report {
22
- puts "#[index]"
20
+ x.report("#[index]") {
23
21
  times.times {utf8[1024]}
24
22
  }
25
- x.report {
26
- puts "#[-index]"
23
+ x.report("#[-index]") {
27
24
  times.times {utf8[-1024]}
28
25
  }
29
- x.report {
30
- puts "#[start, len]"
26
+ x.report("#[start, len]") {
31
27
  times.times {utf8[1024, 1024]}
32
28
  }
33
- x.report {
34
- puts "#[-start, len]"
29
+ x.report("#[-start, len]") {
35
30
  times.times {utf8[-1024, 1024]}
36
31
  }
37
- end
32
+ }
38
33
 
39
34
  puts "\n\nActiveSupport::Multibyte::Chars"
40
- Benchmark.bmbm do |x|
41
- x.report {
42
- puts "#length"
35
+ Benchmark.bmbm { |x|
36
+ x.report("#length") {
43
37
  times.times {as_mb.length}
44
38
  }
45
- x.report {
46
- puts "#[index]"
39
+ x.report("#[index]") {
47
40
  times.times {as_mb[1024]}
48
41
  }
49
- x.report {
50
- puts "#[-index]"
42
+ x.report("#[-index]") {
51
43
  times.times {as_mb[-1024]}
52
44
  }
53
- x.report {
54
- puts "#[start, len]"
45
+ x.report("#[start, len]") {
55
46
  times.times {as_mb[1024, 1024]}
56
47
  }
57
- x.report {
58
- puts "#[-start, len]"
48
+ x.report("#[-start, len]") {
59
49
  times.times {as_mb[-1024, 1024]}
60
50
  }
61
- end
51
+ }
data/ext/utf8/ext.h CHANGED
@@ -1,6 +1,11 @@
1
1
  #ifndef UTF8_EXT_H
2
2
  #define UTF8_EXT_H
3
3
 
4
+ // tell rbx not to use its caching compat layer
5
+ // by doing this we're making a promise to RBX that
6
+ // we'll never modify the pointers we get back from RSTRING_PTR
7
+ #define RSTRING_NOT_MODIFIED
8
+
4
9
  #include <ruby.h>
5
10
 
6
11
  #ifdef HAVE_RUBY_ENCODING_H
@@ -4,13 +4,13 @@
4
4
  extern VALUE intern_as_utf8;
5
5
 
6
6
  /*
7
- * Document-class: String::UTF8
7
+ * Document-class: String::UTF8
8
8
  */
9
9
 
10
10
  /*
11
11
  * call-seq: length
12
12
  *
13
- * Returns the number of UTF8 characters in this string
13
+ * Returns: a Fixnum - the number of UTF-8 characters in this string
14
14
  */
15
15
  static VALUE rb_cString_UTF8_length(VALUE self) {
16
16
  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
@@ -28,7 +28,9 @@ static VALUE rb_cString_UTF8_length(VALUE self) {
28
28
  /*
29
29
  * call-seq: each_char {|utf8_char| ...}
30
30
  *
31
- * Iterates over the string, yielding one UTF8 character at a time
31
+ * Iterates over the string, yielding one UTF-8 character at a time
32
+ *
33
+ * Returns: self
32
34
  */
33
35
  static VALUE rb_cString_UTF8_each_char(int argc, VALUE *argv, VALUE self) {
34
36
  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
@@ -56,7 +58,75 @@ static VALUE rb_cString_UTF8_each_char(int argc, VALUE *argv, VALUE self) {
56
58
  }
57
59
 
58
60
  /*
59
- * Works like String#[] but taking into account UTF8 character boundaries
61
+ * call-seq: each_codepoint {|utf8_codepoint| ...}
62
+ *
63
+ * Iterates over the string, yielding one UTF-8 codepoint at a time
64
+ *
65
+ * Returns: self
66
+ */
67
+ static VALUE rb_cString_UTF8_each_codepoint(int argc, VALUE *argv, VALUE self) {
68
+ unsigned char *str = (unsigned char *)RSTRING_PTR(self);
69
+ size_t len = RSTRING_LEN(self), i=0;
70
+ int8_t lastCharLen=0;
71
+ int32_t cp;
72
+
73
+ // this will return an Enumerator wrapping this string, yielding this method
74
+ // when Enumerator#each is called
75
+ if (!rb_block_given_p()) {
76
+ return rb_funcall(self, rb_intern("to_enum"), 1, ID2SYM(rb_intern("each_codepoint")));
77
+ }
78
+
79
+ for(; i<len; i+=lastCharLen) {
80
+ lastCharLen = utf8CharLen(str, len);
81
+ if (lastCharLen < 0) {
82
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
83
+ }
84
+ cp = utf8CharToCodepoint(str+i, lastCharLen);
85
+ rb_yield(INT2FIX(cp));
86
+ }
87
+
88
+ return self;
89
+ }
90
+
91
+ /*
92
+ * call-seq: valid?(max_codepoint=nil)
93
+ *
94
+ * Checks whether the string is a valid sequence of UTF-8 characters
95
+ *
96
+ * max_codepoint - an optional Fixnum used to declare this string invalid
97
+ * if a codepoint higher than that value is found
98
+ * if nothing is passed, the UTF-8 maximum of 0x10FFFF is assumed
99
+ *
100
+ * Returns: a Boolean - true if the string is valid, false if not
101
+ */
102
+ static VALUE rb_cString_UTF8_valid(int argc, VALUE *argv, VALUE self) {
103
+ unsigned char *str = (unsigned char *)RSTRING_PTR(self);
104
+ size_t len = RSTRING_LEN(self), i=0;
105
+ int8_t lastCharLen=0;
106
+ int32_t cp, cp_max = -1;
107
+ VALUE rb_cp_max;
108
+
109
+ if (rb_scan_args(argc, argv, "01", &rb_cp_max) == 1) {
110
+ Check_Type(rb_cp_max, T_FIXNUM);
111
+ cp_max = FIX2INT(rb_cp_max);
112
+ }
113
+
114
+ for(; i<len; i+=lastCharLen) {
115
+ lastCharLen = utf8CharLen(str, len);
116
+ if (lastCharLen < 0) {
117
+ return Qfalse;
118
+ }
119
+ cp = utf8CharToCodepoint(str+i, lastCharLen);
120
+ if (cp_max >= 0 && cp > cp_max) {
121
+ return Qfalse;
122
+ }
123
+ }
124
+
125
+ return Qtrue;
126
+ }
127
+
128
+ /*
129
+ * Works like String#[] but taking into account UTF-8 character boundaries
60
130
  *
61
131
  * This method doesn't currently (and may never) support Regexp parameters
62
132
  * It also doesn't support a String parameter (yet)
@@ -263,4 +333,6 @@ void init_String_UTF8() {
263
333
  rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
264
334
  rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
265
335
  rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
336
+ rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
337
+ rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
266
338
  }
data/ext/utf8/utf8.c CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  /*
7
7
  * Scans the current position of the buffer
8
- * returning the length of this UTF8 character
8
+ * returning the length of this UTF-8 character
9
9
  */
10
10
  inline int8_t utf8CharLen(unsigned char *in, size_t in_len) {
11
11
  if (in_len > 0) {
@@ -60,7 +60,7 @@ inline int8_t utf8CharLen(unsigned char *in, size_t in_len) {
60
60
 
61
61
  /*
62
62
  * Scans the current position of the buffer
63
- * returning the total number of UTF8 characters found
63
+ * returning the total number of UTF-8 characters found
64
64
  */
65
65
  int64_t utf8CharCount(unsigned char *in, size_t in_len) {
66
66
  int64_t total = 0, leftOver = in_len;
@@ -81,3 +81,25 @@ int64_t utf8CharCount(unsigned char *in, size_t in_len) {
81
81
 
82
82
  return total;
83
83
  }
84
+
85
+ /*
86
+ * Scans the current position of the buffer
87
+ * returning the codepoint for the UTF-8 character at this position
88
+ */
89
+ int32_t utf8CharToCodepoint(unsigned char *in, size_t in_len) {
90
+ int32_t cp, ncp, len;
91
+
92
+ len = utf8CharLen(in, in_len);
93
+ cp = *in++;
94
+ if (len > 1) {
95
+ len--;
96
+ ncp = cp & ((1 << (6 - len)) - 1);
97
+ while (len--) {
98
+ cp = *in++;
99
+ ncp = (ncp << 6) | (cp & ((1 << 6) - 1));
100
+ }
101
+ return ncp;
102
+ } else {
103
+ return cp;
104
+ }
105
+ }
data/ext/utf8/utf8.h CHANGED
@@ -3,5 +3,6 @@
3
3
 
4
4
  inline int8_t utf8CharLen(unsigned char *in, size_t in_len);
5
5
  int64_t utf8CharCount(unsigned char *in, size_t in_len);
6
+ int32_t utf8CharToCodepoint(unsigned char *in, size_t in_len);
6
7
 
7
8
  #endif
data/lib/utf8.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'utf8/utf8'
2
2
  require 'utf8/string'
3
+ require 'utf8/version' unless defined? String::UTF8::VERSION
3
4
 
4
5
  # explicitly require this in your app if you want to use it
5
6
  # require 'utf8/string_scanner'
data/lib/utf8/string.rb CHANGED
@@ -4,9 +4,7 @@ class String
4
4
  String::UTF8.new(self)
5
5
  end
6
6
 
7
- class UTF8
8
- VERSION = "0.1.2"
9
-
7
+ class UTF8 < ::String
10
8
  # Gives you access to the raw non-UTF8-aware version of the string
11
9
  def as_raw
12
10
  ::String.new(self)
@@ -15,5 +13,6 @@ class String
15
13
  alias :size :length
16
14
  alias :chars :each_char
17
15
  alias :slice :[]
16
+ alias :codepoints :each_codepoint
18
17
  end
19
18
  end
@@ -9,7 +9,7 @@ class StringScanner
9
9
  StringScanner::UTF8.new(self.string.as_utf8)
10
10
  end
11
11
 
12
- class UTF8
12
+ class UTF8 < ::StringScanner
13
13
  # Returns a non-UTF8-aware version of StringScanner wrapping your original string
14
14
  #
15
15
  # NOTE: this will lose all state associated with the current StringScanner::UTF8 instance
@@ -0,0 +1,5 @@
1
+ class String
2
+ class UTF8 < ::String
3
+ VERSION = "0.1.3"
4
+ end
5
+ end
data/spec/string_spec.rb CHANGED
@@ -7,6 +7,7 @@ describe String::UTF8 do
7
7
  @str = @char_array.join
8
8
  @utf8 = @str.as_utf8
9
9
  @utf8_len = @char_array.size
10
+ @codepoints = @char_array.map{|c| c.unpack('U').first}
10
11
  end
11
12
 
12
13
  it "should blow up on invalid utf8 chars" do
@@ -67,10 +68,8 @@ describe String::UTF8 do
67
68
  end
68
69
 
69
70
  @utf8.chars.class.should eql(klass)
70
- i=0
71
71
  @utf8.chars do |char|
72
72
  char.should_not be_nil
73
- i+=1
74
73
  end
75
74
  joined = @utf8.chars.to_a.join
76
75
  @utf8.should eql(joined)
@@ -79,6 +78,25 @@ describe String::UTF8 do
79
78
  end
80
79
  end
81
80
 
81
+ context "#codepoints and #each_codepoint" do
82
+ it "should be utf8-aware" do
83
+ klass = begin
84
+ if defined? Encoding
85
+ Enumerator
86
+ else
87
+ Enumerable::Enumerator
88
+ end
89
+ end
90
+
91
+ @utf8.codepoints.class.should eql(klass)
92
+ @utf8.codepoints do |codepoint|
93
+ codepoint.should_not be_nil
94
+ end
95
+ @utf8.codepoints.to_a.size.should eql(@codepoints.size)
96
+ @utf8.codepoints.to_a.should eql(@codepoints)
97
+ end
98
+ end
99
+
82
100
  context "[offset] syntax" do
83
101
  it "should be utf8-aware" do
84
102
  @char_array.each_with_index do |char, i|
@@ -157,6 +175,26 @@ describe String::UTF8 do
157
175
  end
158
176
  end
159
177
 
178
+ context "#valid?" do
179
+ it "should test validity" do
180
+ # let's cut right into the middle of a sequence so we know it's bad
181
+ @str.force_encoding('binary') if @str.respond_to?(:force_encoding)
182
+ utf8 = @str[0..1]
183
+ utf8.force_encoding('utf-8') if utf8.respond_to?(:force_encoding)
184
+ utf8 = utf8.as_utf8
185
+
186
+ utf8.valid?.should be_false
187
+ @utf8.valid?.should be_true
188
+ end
189
+
190
+ it "should test validity using a maximum codepoint" do
191
+ highest_codepoint = @utf8.codepoints.to_a.max
192
+
193
+ @utf8.valid?(highest_codepoint).should be_true
194
+ @utf8.valid?(highest_codepoint-1).should be_false
195
+ end
196
+ end
197
+
160
198
  it "[Regexp] syntax shouldn't be supported yet" do
161
199
  lambda {
162
200
  @utf8[/a/]
data/utf8.gemspec CHANGED
@@ -1,37 +1,27 @@
1
- # encoding: utf-8
1
+ require './lib/utf8/version'
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{utf8}
5
- s.version = "0.1.2"
6
-
7
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
5
+ s.version = String::UTF8::VERSION
8
6
  s.authors = ["Brian Lopez"]
9
- s.date = %q{2011-01-12}
7
+ s.date = Time.now.utc.strftime("%Y-%m-%d")
10
8
  s.email = %q{seniorlopez@gmail.com}
11
9
  s.extensions = ["ext/utf8/extconf.rb"]
12
10
  s.extra_rdoc_files = [
13
11
  "README.rdoc"
14
12
  ]
15
- s.files = [".gitignore", "MIT-LICENSE", "README.rdoc", "Rakefile", "benchmark/active_support.rb", "benchmark/test.txt", "ext/utf8/ext.c", "ext/utf8/ext.h", "ext/utf8/extconf.rb", "ext/utf8/string_scanner_utf8.c", "ext/utf8/string_scanner_utf8.h", "ext/utf8/string_utf8.c", "ext/utf8/string_utf8.h", "ext/utf8/utf8.c", "ext/utf8/utf8.h", "lib/utf8.rb", "lib/utf8/string.rb", "lib/utf8/string_scanner.rb", "spec/spec_helper.rb", "spec/string_scanner_spec.rb", "spec/string_spec.rb", "utf8.gemspec"]
13
+ s.files = `git ls-files`.split("\n")
16
14
  s.homepage = %q{http://github.com/brianmario/utf8}
15
+ s.rdoc_options = ["--charset=UTF-8"]
17
16
  s.require_paths = ["lib", "ext"]
18
17
  s.rubygems_version = %q{1.4.2}
19
18
  s.summary = %q{A lightweight UTF8-aware String class meant for use with Ruby 1.8}
20
- s.test_files = ["spec/spec_helper.rb", "spec/string_scanner_spec.rb", "spec/string_spec.rb"]
21
-
22
- if s.respond_to? :specification_version then
23
- s.specification_version = 3
19
+ s.test_files = `git ls-files spec`.split("\n")
24
20
 
25
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
26
- s.add_development_dependency(%q<rake-compiler>, [">= 0.7.5"])
27
- s.add_development_dependency(%q<rspec>, [">= 2.0.0"])
28
- else
29
- s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
30
- s.add_dependency(%q<rspec>, [">= 2.0.0"])
31
- end
32
- else
33
- s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
34
- s.add_dependency(%q<rspec>, [">= 2.0.0"])
35
- end
21
+ # tests
22
+ s.add_development_dependency 'rake-compiler', ">= 0.7.5"
23
+ s.add_development_dependency 'rspec', ">= 2.0.0"
24
+ # benchmarks
25
+ s.add_development_dependency 'activesupport'
36
26
  end
37
27
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8
3
3
  version: !ruby/object:Gem::Version
4
- hash: 31
4
+ hash: 29
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 2
10
- version: 0.1.2
9
+ - 3
10
+ version: 0.1.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brian Lopez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-01-12 00:00:00 -08:00
18
+ date: 2011-06-02 00:00:00 -07:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -50,6 +50,20 @@ dependencies:
50
50
  version: 2.0.0
51
51
  type: :development
52
52
  version_requirements: *id002
53
+ - !ruby/object:Gem::Dependency
54
+ name: activesupport
55
+ prerelease: false
56
+ requirement: &id003 !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ hash: 3
62
+ segments:
63
+ - 0
64
+ version: "0"
65
+ type: :development
66
+ version_requirements: *id003
53
67
  description:
54
68
  email: seniorlopez@gmail.com
55
69
  executables: []
@@ -60,6 +74,8 @@ extra_rdoc_files:
60
74
  - README.rdoc
61
75
  files:
62
76
  - .gitignore
77
+ - .rspec
78
+ - Gemfile
63
79
  - MIT-LICENSE
64
80
  - README.rdoc
65
81
  - Rakefile
@@ -77,6 +93,7 @@ files:
77
93
  - lib/utf8.rb
78
94
  - lib/utf8/string.rb
79
95
  - lib/utf8/string_scanner.rb
96
+ - lib/utf8/version.rb
80
97
  - spec/spec_helper.rb
81
98
  - spec/string_scanner_spec.rb
82
99
  - spec/string_spec.rb
@@ -86,8 +103,8 @@ homepage: http://github.com/brianmario/utf8
86
103
  licenses: []
87
104
 
88
105
  post_install_message:
89
- rdoc_options: []
90
-
106
+ rdoc_options:
107
+ - --charset=UTF-8
91
108
  require_paths:
92
109
  - lib
93
110
  - ext
@@ -112,7 +129,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
112
129
  requirements: []
113
130
 
114
131
  rubyforge_project:
115
- rubygems_version: 1.4.2
132
+ rubygems_version: 1.6.2
116
133
  signing_key:
117
134
  specification_version: 3
118
135
  summary: A lightweight UTF8-aware String class meant for use with Ruby 1.8