utf8 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --format=documentation
3
+ --fail-fast
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source :rubygems
2
+
3
+ gemspec
data/Rakefile CHANGED
@@ -1,6 +1,25 @@
1
+ # rspec
2
+ begin
3
+ require 'rspec'
4
+ require 'rspec/core/rake_task'
5
+
6
+ desc "Run all examples with RCov"
7
+ RSpec::Core::RakeTask.new('spec:rcov') do |t|
8
+ t.rcov = true
9
+ end
10
+ RSpec::Core::RakeTask.new('spec') do |t|
11
+ t.verbose = true
12
+ end
13
+
14
+ task :default => :spec
15
+ rescue LoadError
16
+ puts "rspec, or one of its dependencies, is not available. Install it with: sudo gem install rspec"
17
+ end
18
+
19
+ # rake-compiler
1
20
  require 'rake' unless defined? Rake
2
21
 
3
- gem 'rake-compiler', '~> 0.7.1'
22
+ gem 'rake-compiler', '>= 0.7.5'
4
23
  require "rake/extensiontask"
5
24
 
6
25
  Rake::ExtensionTask.new('utf8') do |ext|
@@ -9,3 +28,5 @@ Rake::ExtensionTask.new('utf8') do |ext|
9
28
 
10
29
  ext.lib_dir = File.join 'lib', 'utf8'
11
30
  end
31
+
32
+ Rake::Task[:spec].prerequisites << :compile if Rake::Task.task_defined?(:spec) # compile before specs; the spec task does not exist when the rspec require above hit LoadError
@@ -13,49 +13,39 @@ as_mb = ActiveSupport::Multibyte::Chars.new(raw)
13
13
  times = 1000
14
14
 
15
15
  puts "String::UTF8"
16
- Benchmark.bmbm do |x|
17
- x.report {
18
- puts "#length"
16
+ Benchmark.bmbm { |x|
17
+ x.report("#length") {
19
18
  times.times {utf8.length}
20
19
  }
21
- x.report {
22
- puts "#[index]"
20
+ x.report("#[index]") {
23
21
  times.times {utf8[1024]}
24
22
  }
25
- x.report {
26
- puts "#[-index]"
23
+ x.report("#[-index]") {
27
24
  times.times {utf8[-1024]}
28
25
  }
29
- x.report {
30
- puts "#[start, len]"
26
+ x.report("#[start, len]") {
31
27
  times.times {utf8[1024, 1024]}
32
28
  }
33
- x.report {
34
- puts "#[-start, len]"
29
+ x.report("#[-start, len]") {
35
30
  times.times {utf8[-1024, 1024]}
36
31
  }
37
- end
32
+ }
38
33
 
39
34
  puts "\n\nActiveSupport::Multibyte::Chars"
40
- Benchmark.bmbm do |x|
41
- x.report {
42
- puts "#length"
35
+ Benchmark.bmbm { |x|
36
+ x.report("#length") {
43
37
  times.times {as_mb.length}
44
38
  }
45
- x.report {
46
- puts "#[index]"
39
+ x.report("#[index]") {
47
40
  times.times {as_mb[1024]}
48
41
  }
49
- x.report {
50
- puts "#[-index]"
42
+ x.report("#[-index]") {
51
43
  times.times {as_mb[-1024]}
52
44
  }
53
- x.report {
54
- puts "#[start, len]"
45
+ x.report("#[start, len]") {
55
46
  times.times {as_mb[1024, 1024]}
56
47
  }
57
- x.report {
58
- puts "#[-start, len]"
48
+ x.report("#[-start, len]") {
59
49
  times.times {as_mb[-1024, 1024]}
60
50
  }
61
- end
51
+ }
data/ext/utf8/ext.h CHANGED
@@ -1,6 +1,11 @@
1
1
  #ifndef UTF8_EXT_H
2
2
  #define UTF8_EXT_H
3
3
 
4
+ // tell rbx not to use its caching compat layer
5
+ // by doing this we're making a promise to RBX that
6
+ // we'll never modify the pointers we get back from RSTRING_PTR
7
+ #define RSTRING_NOT_MODIFIED
8
+
4
9
  #include <ruby.h>
5
10
 
6
11
  #ifdef HAVE_RUBY_ENCODING_H
@@ -4,13 +4,13 @@
4
4
  extern VALUE intern_as_utf8;
5
5
 
6
6
  /*
7
- * Document-class: String::UTF8
7
+ * Document-class: String::UTF-8
8
8
  */
9
9
 
10
10
  /*
11
11
  * call-seq: length
12
12
  *
13
- * Returns the number of UTF8 characters in this string
13
+ * Returns: a Fixnum - the number of UTF-8 characters in this string
14
14
  */
15
15
  static VALUE rb_cString_UTF8_length(VALUE self) {
16
16
  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
@@ -28,7 +28,9 @@ static VALUE rb_cString_UTF8_length(VALUE self) {
28
28
  /*
29
29
  * call-seq: each_char {|utf8_char| ...}
30
30
  *
31
- * Iterates over the string, yielding one UTF8 character at a time
31
+ * Iterates over the string, yielding one UTF-8 character at a time
32
+ *
33
+ * Returns: self
32
34
  */
33
35
  static VALUE rb_cString_UTF8_each_char(int argc, VALUE *argv, VALUE self) {
34
36
  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
@@ -56,7 +58,75 @@ static VALUE rb_cString_UTF8_each_char(int argc, VALUE *argv, VALUE self) {
56
58
  }
57
59
 
58
60
  /*
59
- * Works like String#[] but taking into account UTF8 character boundaries
61
+ * call-seq: each_codepoint {|utf8_codepoint| ...}
62
+ *
63
+ * Iterates over the string, yielding one UTF-8 codepoint at a time
64
+ *
65
+ * Returns: self
66
+ */
67
+ static VALUE rb_cString_UTF8_each_codepoint(int argc, VALUE *argv, VALUE self) {
68
+ unsigned char *str = (unsigned char *)RSTRING_PTR(self); // raw bytes of the receiver
69
+ size_t len = RSTRING_LEN(self), i=0; // len: byte length, i: current byte offset
70
+ int8_t lastCharLen=0; // byte width of the character at offset i
71
+ int32_t cp; // decoded codepoint of that character
72
+
73
+ // this will return an Enumerator wrapping this string, yielding this method
74
+ // when Enumerator#each is called
75
+ if (!rb_block_given_p()) {
76
+ return rb_funcall(self, rb_intern("to_enum"), 1, ID2SYM(rb_intern("each_codepoint")));
77
+ }
78
+
79
+ for(; i<len; i+=lastCharLen) {
80
+ lastCharLen = utf8CharLen(str+i, len-i); // measure the character at offset i (not the first one) against the bytes that remain
81
+ if (lastCharLen < 0) {
82
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
83
+ }
84
+ cp = utf8CharToCodepoint(str+i, lastCharLen); // decode exactly the bytes of this character
85
+ rb_yield(INT2FIX(cp)); // the block receives the codepoint as a Fixnum
86
+ }
87
+
88
+ return self;
89
+ }
90
+
91
+ /*
92
+ * call-seq: valid?(max_codepoint=nil)
93
+ *
94
+ * Checks whether the string is a valid UTF-8 byte sequence
95
+ *
96
+ * max_codepoint - an optional Fixnum used to declare this string invalid
97
+ * if a codepoint higher than that value is found
98
+ * if nothing is passed, the UTF-8 maximum of 0x10FFFF is assumed
99
+ *
100
+ * Returns: a Boolean - true if the string is valid, false if not
101
+ */
102
+ static VALUE rb_cString_UTF8_valid(int argc, VALUE *argv, VALUE self) {
103
+ unsigned char *str = (unsigned char *)RSTRING_PTR(self); // raw bytes of the receiver
104
+ size_t len = RSTRING_LEN(self), i=0; // len: byte length, i: current byte offset
105
+ int8_t lastCharLen=0; // byte width of the character at offset i
106
+ int32_t cp, cp_max = -1; // -1 means no ceiling was passed; NOTE(review): the documented 0x10FFFF default is not enforced here
107
+ VALUE rb_cp_max;
108
+
109
+ if (rb_scan_args(argc, argv, "01", &rb_cp_max) == 1) { // one optional argument: the codepoint ceiling
110
+ Check_Type(rb_cp_max, T_FIXNUM); // only a Fixnum ceiling is accepted
111
+ cp_max = FIX2INT(rb_cp_max);
112
+ }
113
+
114
+ for(; i<len; i+=lastCharLen) {
115
+ lastCharLen = utf8CharLen(str+i, len-i); // measure the character at offset i (not the first one) against the bytes that remain
116
+ if (lastCharLen < 0) {
117
+ return Qfalse; // malformed sequence
118
+ }
119
+ cp = utf8CharToCodepoint(str+i, lastCharLen); // decode exactly the bytes of this character
120
+ if (cp_max >= 0 && cp > cp_max) { // the ceiling is inclusive and only applied when one was given
121
+ return Qfalse;
122
+ }
123
+ }
124
+
125
+ return Qtrue;
126
+ }
127
+
128
+ /*
129
+ * Works like String#[] but taking into account UTF-8 character boundaries
60
130
  *
61
131
  * This method doesn't currently (and may never) support Regexp parameters
62
132
  * It also doesn't support a String parameter (yet)
@@ -263,4 +333,6 @@ void init_String_UTF8() {
263
333
  rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
264
334
  rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
265
335
  rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
336
+ rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
337
+ rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
266
338
  }
data/ext/utf8/utf8.c CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  /*
7
7
  * Scans the current position of the buffer
8
- * returning the length of this UTF8 character
8
+ * returning the length of this UTF-8 character
9
9
  */
10
10
  inline int8_t utf8CharLen(unsigned char *in, size_t in_len) {
11
11
  if (in_len > 0) {
@@ -60,7 +60,7 @@ inline int8_t utf8CharLen(unsigned char *in, size_t in_len) {
60
60
 
61
61
  /*
62
62
  * Scans the current position of the buffer
63
- * returning the total number of UTF8 characters found
63
+ * returning the total number of UTF-8 characters found
64
64
  */
65
65
  int64_t utf8CharCount(unsigned char *in, size_t in_len) {
66
66
  int64_t total = 0, leftOver = in_len;
@@ -81,3 +81,25 @@ int64_t utf8CharCount(unsigned char *in, size_t in_len) {
81
81
 
82
82
  return total;
83
83
  }
84
+
85
+ /*
86
+ * Scans the current position of the buffer
87
+ * returning the codepoint for the UTF-8 character at this position
88
+ */
89
+ int32_t utf8CharToCodepoint(unsigned char *in, size_t in_len) {
90
+ int32_t cp, ncp, len; // cp: byte just read, ncp: codepoint being assembled, len: byte width of the character
91
+
92
+ len = utf8CharLen(in, in_len);
93
+ cp = *in++; // lead byte; read unconditionally, so the caller must pass at least one readable byte
94
+ if (len > 1) {
95
+ len--; // from here on len counts the continuation bytes
96
+ ncp = cp & ((1 << (6 - len)) - 1); // keep the low (6 - len) payload bits of the lead byte: 5, 4 or 3 bits for 2-, 3- or 4-byte characters
97
+ while (len--) {
98
+ cp = *in++;
99
+ ncp = (ncp << 6) | (cp & ((1 << 6) - 1)); // each continuation byte contributes its low 6 bits
100
+ }
101
+ return ncp;
102
+ } else {
103
+ return cp; // single-byte character; also reached when utf8CharLen reports 0 or a negative (error) length
104
+ }
105
+ }
data/ext/utf8/utf8.h CHANGED
@@ -3,5 +3,6 @@
3
3
 
4
4
  inline int8_t utf8CharLen(unsigned char *in, size_t in_len);
5
5
  int64_t utf8CharCount(unsigned char *in, size_t in_len);
6
+ int32_t utf8CharToCodepoint(unsigned char *in, size_t in_len);
6
7
 
7
8
  #endif
data/lib/utf8.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'utf8/utf8'
2
2
  require 'utf8/string'
3
+ require 'utf8/version' unless defined? String::UTF8::VERSION
3
4
 
4
5
  # explicitly require this in your app if you want to use it
5
6
  # require 'utf8/string_scanner'
data/lib/utf8/string.rb CHANGED
@@ -4,9 +4,7 @@ class String
4
4
  String::UTF8.new(self)
5
5
  end
6
6
 
7
- class UTF8
8
- VERSION = "0.1.2"
9
-
7
+ class UTF8 < ::String
10
8
  # Gives you access to the raw non-UTF8-aware version of the string
11
9
  def as_raw
12
10
  ::String.new(self)
@@ -15,5 +13,6 @@ class String
15
13
  alias :size :length
16
14
  alias :chars :each_char
17
15
  alias :slice :[]
16
+ alias :codepoints :each_codepoint
18
17
  end
19
18
  end
@@ -9,7 +9,7 @@ class StringScanner
9
9
  StringScanner::UTF8.new(self.string.as_utf8)
10
10
  end
11
11
 
12
- class UTF8
12
+ class UTF8 < ::StringScanner
13
13
  # Returns a non-UTF8-aware version of StringScanner wrapping your original string
14
14
  #
15
15
  # NOTE: this will lose all state associated with the current StringScanner::UTF8 instance
@@ -0,0 +1,5 @@
1
+ class String
2
+ class UTF8 < ::String
3
+ VERSION = "0.1.3" # gem version; utf8.gemspec reads this constant for s.version
4
+ end
5
+ end
data/spec/string_spec.rb CHANGED
@@ -7,6 +7,7 @@ describe String::UTF8 do
7
7
  @str = @char_array.join
8
8
  @utf8 = @str.as_utf8
9
9
  @utf8_len = @char_array.size
10
+ @codepoints = @char_array.map{|c| c.unpack('U').first}
10
11
  end
11
12
 
12
13
  it "should blow up on invalid utf8 chars" do
@@ -67,10 +68,8 @@ describe String::UTF8 do
67
68
  end
68
69
 
69
70
  @utf8.chars.class.should eql(klass)
70
- i=0
71
71
  @utf8.chars do |char|
72
72
  char.should_not be_nil
73
- i+=1
74
73
  end
75
74
  joined = @utf8.chars.to_a.join
76
75
  @utf8.should eql(joined)
@@ -79,6 +78,25 @@ describe String::UTF8 do
79
78
  end
80
79
  end
81
80
 
81
+ context "#codepoints and #each_codepoint" do
82
+ it "should be utf8-aware" do
83
+ klass = begin # enumerator class expected from a block-less call on the running Ruby
84
+ if defined? Encoding # presumably Ruby 1.9+, where Enumerator is a top-level constant
85
+ Enumerator
86
+ else
87
+ Enumerable::Enumerator
88
+ end
89
+ end
90
+
91
+ @utf8.codepoints.class.should eql(klass) # no block given: an enumerator comes back
92
+ @utf8.codepoints do |codepoint|
93
+ codepoint.should_not be_nil
94
+ end
95
+ @utf8.codepoints.to_a.size.should eql(@codepoints.size)
96
+ @utf8.codepoints.to_a.should eql(@codepoints) # expected values were built with unpack('U') in the before block
97
+ end
98
+ end
99
+
82
100
  context "[offset] syntax" do
83
101
  it "should be utf8-aware" do
84
102
  @char_array.each_with_index do |char, i|
@@ -157,6 +175,26 @@ describe String::UTF8 do
157
175
  end
158
176
  end
159
177
 
178
+ context "#valid?" do
179
+ it "should test validity" do
180
+ # let's cut right into the middle of a sequence so we know it's bad
181
+ @str.force_encoding('binary') if @str.respond_to?(:force_encoding) # slice by byte rather than by character where encodings exist
182
+ utf8 = @str[0..1] # first two bytes only; assumes this lands inside a multi-byte character of the fixture
183
+ utf8.force_encoding('utf-8') if utf8.respond_to?(:force_encoding)
184
+ utf8 = utf8.as_utf8
185
+
186
+ utf8.valid?.should be_false
187
+ @utf8.valid?.should be_true
188
+ end
189
+
190
+ it "should test validity using a maximum codepoint" do
191
+ highest_codepoint = @utf8.codepoints.to_a.max
192
+
193
+ @utf8.valid?(highest_codepoint).should be_true # the ceiling itself is still allowed
194
+ @utf8.valid?(highest_codepoint-1).should be_false # one below the highest codepoint present must fail
195
+ end
196
+ end
197
+
160
198
  it "[Regexp] syntax shouldn't be supported yet" do
161
199
  lambda {
162
200
  @utf8[/a/]
data/utf8.gemspec CHANGED
@@ -1,37 +1,27 @@
1
- # encoding: utf-8
1
+ require './lib/utf8/version'
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{utf8}
5
- s.version = "0.1.2"
6
-
7
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
5
+ s.version = String::UTF8::VERSION
8
6
  s.authors = ["Brian Lopez"]
9
- s.date = %q{2011-01-12}
7
+ s.date = Time.now.utc.strftime("%Y-%m-%d")
10
8
  s.email = %q{seniorlopez@gmail.com}
11
9
  s.extensions = ["ext/utf8/extconf.rb"]
12
10
  s.extra_rdoc_files = [
13
11
  "README.rdoc"
14
12
  ]
15
- s.files = [".gitignore", "MIT-LICENSE", "README.rdoc", "Rakefile", "benchmark/active_support.rb", "benchmark/test.txt", "ext/utf8/ext.c", "ext/utf8/ext.h", "ext/utf8/extconf.rb", "ext/utf8/string_scanner_utf8.c", "ext/utf8/string_scanner_utf8.h", "ext/utf8/string_utf8.c", "ext/utf8/string_utf8.h", "ext/utf8/utf8.c", "ext/utf8/utf8.h", "lib/utf8.rb", "lib/utf8/string.rb", "lib/utf8/string_scanner.rb", "spec/spec_helper.rb", "spec/string_scanner_spec.rb", "spec/string_spec.rb", "utf8.gemspec"]
13
+ s.files = `git ls-files`.split("\n")
16
14
  s.homepage = %q{http://github.com/brianmario/utf8}
15
+ s.rdoc_options = ["--charset=UTF-8"]
17
16
  s.require_paths = ["lib", "ext"]
18
17
  s.rubygems_version = %q{1.4.2}
19
18
  s.summary = %q{A lightweight UTF8-aware String class meant for use with Ruby 1.8}
20
- s.test_files = ["spec/spec_helper.rb", "spec/string_scanner_spec.rb", "spec/string_spec.rb"]
21
-
22
- if s.respond_to? :specification_version then
23
- s.specification_version = 3
19
+ s.test_files = `git ls-files spec`.split("\n")
24
20
 
25
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
26
- s.add_development_dependency(%q<rake-compiler>, [">= 0.7.5"])
27
- s.add_development_dependency(%q<rspec>, [">= 2.0.0"])
28
- else
29
- s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
30
- s.add_dependency(%q<rspec>, [">= 2.0.0"])
31
- end
32
- else
33
- s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
34
- s.add_dependency(%q<rspec>, [">= 2.0.0"])
35
- end
21
+ # tests
22
+ s.add_development_dependency 'rake-compiler', ">= 0.7.5"
23
+ s.add_development_dependency 'rspec', ">= 2.0.0"
24
+ # benchmarks
25
+ s.add_development_dependency 'activesupport'
36
26
  end
37
27
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8
3
3
  version: !ruby/object:Gem::Version
4
- hash: 31
4
+ hash: 29
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 2
10
- version: 0.1.2
9
+ - 3
10
+ version: 0.1.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brian Lopez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-01-12 00:00:00 -08:00
18
+ date: 2011-06-02 00:00:00 -07:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -50,6 +50,20 @@ dependencies:
50
50
  version: 2.0.0
51
51
  type: :development
52
52
  version_requirements: *id002
53
+ - !ruby/object:Gem::Dependency
54
+ name: activesupport
55
+ prerelease: false
56
+ requirement: &id003 !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ hash: 3
62
+ segments:
63
+ - 0
64
+ version: "0"
65
+ type: :development
66
+ version_requirements: *id003
53
67
  description:
54
68
  email: seniorlopez@gmail.com
55
69
  executables: []
@@ -60,6 +74,8 @@ extra_rdoc_files:
60
74
  - README.rdoc
61
75
  files:
62
76
  - .gitignore
77
+ - .rspec
78
+ - Gemfile
63
79
  - MIT-LICENSE
64
80
  - README.rdoc
65
81
  - Rakefile
@@ -77,6 +93,7 @@ files:
77
93
  - lib/utf8.rb
78
94
  - lib/utf8/string.rb
79
95
  - lib/utf8/string_scanner.rb
96
+ - lib/utf8/version.rb
80
97
  - spec/spec_helper.rb
81
98
  - spec/string_scanner_spec.rb
82
99
  - spec/string_spec.rb
@@ -86,8 +103,8 @@ homepage: http://github.com/brianmario/utf8
86
103
  licenses: []
87
104
 
88
105
  post_install_message:
89
- rdoc_options: []
90
-
106
+ rdoc_options:
107
+ - --charset=UTF-8
91
108
  require_paths:
92
109
  - lib
93
110
  - ext
@@ -112,7 +129,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
112
129
  requirements: []
113
130
 
114
131
  rubyforge_project:
115
- rubygems_version: 1.4.2
132
+ rubygems_version: 1.6.2
116
133
  signing_key:
117
134
  specification_version: 3
118
135
  summary: A lightweight UTF8-aware String class meant for use with Ruby 1.8