coder 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,34 @@
1
+ # encoding: UTF-8
2
+
3
+ # JRuby 1.7.0-preview2
4
+ # user system total real
5
+ # Coder::Cleaner::Java 0.360000 0.010000 0.370000 ( 0.121000)
6
+ # Coder::Cleaner::Iconv 0.290000 0.010000 0.300000 ( 0.103000)
7
+ # Coder::Cleaner::Simple 1.010000 0.020000 1.030000 ( 0.367000)
8
+
9
+ # MRI 1.9.3
10
+ # user system total real
11
+ # Coder::Cleaner::Builtin 0.060000 0.000000 0.060000 ( 0.057767)
12
+ # Coder::Cleaner::Iconv 0.020000 0.000000 0.020000 ( 0.022351)
13
+ # Coder::Cleaner::Simple 0.480000 0.000000 0.480000 ( 0.486451)
14
+
15
+ require 'benchmark'
16
+ require 'coder'
17
+
18
+ strings = [
19
+ "yummy\xE2 \xF0\x9F\x8D\x94 \x9F\x8D\x94",
20
+ "{foo \xC3 'bar'}",
21
+ "yummy 🍔 " * 10
22
+ ]
23
+
24
+ Benchmark.bmbm do |x|
25
+ Coder::Cleaner::AVAILABLE.each do |cleaner|
26
+ x.report cleaner.to_s do
27
+ 1000.times do
28
+ strings.each do |str|
29
+ cleaner.new('UTF-8').clean(str)
30
+ end
31
+ end
32
+ end
33
+ end
34
+ end
@@ -5,10 +5,21 @@ require 'coder/cleaner/simple'
5
5
 
6
6
  module Coder
7
7
  module Cleaner
8
- Default = [Builtin, Java, Iconv, Simple].detect { |c| c.available? }
8
+ ALL = [ Builtin, Java, Iconv, Simple ]
9
+ AVAILABLE = ALL.select { |e| e.available? }
10
+
11
+ def self.available?
12
+ AVAILABLE.any?
13
+ end
14
+
15
+ def self.supports?(encoding)
16
+ AVAILABLE.any? { |e| e.supports? encoding }
17
+ end
9
18
 
10
19
  def self.new(encoding)
11
- Default.new(encoding)
20
+ cleaner = AVAILABLE.detect { |e| e.supports? encoding }
21
+ raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}" unless cleaner
22
+ cleaner.new(encoding)
12
23
  end
13
24
  end
14
25
  end
@@ -13,6 +13,12 @@ module Coder
13
13
  !defined?(RUBY_ENGINE) or RUBY_ENGINE == 'ruby'
14
14
  end
15
15
 
16
+ def self.supports?(encoding)
17
+ Encoding.find(encoding)
18
+ rescue ArgumentError
19
+ false
20
+ end
21
+
16
22
  def self.has_encoding?
17
23
  defined? Encoding.find and
18
24
  defined? EncodingError and
@@ -30,7 +36,7 @@ module Coder
30
36
 
31
37
  def clean(str)
32
38
  str = str.dup.force_encoding(@encoding)
33
- str.encode(@dummy, OPTIONS).encode(@encoding).gsub("\0", "")
39
+ str.encode(@dummy, OPTIONS).encode(@encoding).gsub("\0".encode(@encoding), "")
34
40
  rescue EncodingError => e
35
41
  raise Coder::Error, e.message
36
42
  end
@@ -38,9 +44,8 @@ module Coder
38
44
  private
39
45
 
40
46
  def check_encoding
41
- Encoding.find(@encoding)
42
- rescue ArgumentError => e
43
- raise Coder::InvalidEncoding, e.message
47
+ return if self.class.supports? @encoding
48
+ raise Coder::InvalidEncoding, "unknown encoding name - #{@encoding}"
44
49
  end
45
50
 
46
51
  def needs_dummy?
@@ -17,6 +17,12 @@ module Coder
17
17
  super
18
18
  end
19
19
 
20
+ def self.supports?(encoding)
21
+ encoding.to_s !~ /^ucs/i and ::Iconv.new("#{encoding}//ignore", encoding.to_s)
22
+ rescue Exception
23
+ false
24
+ end
25
+
20
26
  def self.available?
21
27
  load_iconv
22
28
  !!::Iconv.conv("iso-8859-1//ignore", "utf-8", "\305\253" + "a"*8160)
@@ -25,13 +31,15 @@ module Coder
25
31
  end
26
32
 
27
33
  def initialize(encoding)
28
- @iconv = ::Iconv.new("#{encoding}//ignore", encoding.to_s)
34
+ @nullbyte = "\0"
35
+ @iconv = ::Iconv.new("#{encoding}//ignore", encoding.to_s)
36
+ @nullbyte.encode! encoding if @nullbyte.respond_to? :encode!
29
37
  rescue ::Iconv::InvalidEncoding => e
30
38
  raise Coder::InvalidEncoding, e.message
31
39
  end
32
40
 
33
41
  def clean(str)
34
- @iconv.iconv(str).gsub("\0", "")
42
+ @iconv.iconv(str).gsub(@nullbyte, "")
35
43
  rescue ::Iconv::Failure => e
36
44
  raise Coder::Error, e.message
37
45
  end
@@ -10,12 +10,18 @@ module Coder
10
10
  false
11
11
  end
12
12
 
13
+ def self.supports?(encoding)
14
+ encoding.to_s =~ /^utf-8$/i
15
+ end
16
+
13
17
  def initialize(encoding)
14
- encoding = encoding.to_s.upcase
15
- @charset = ::Java::JavaNioCharset::Charset.for_name(encoding)
16
- @decoder = @charset.new_decoder
18
+ encoding = encoding.to_s.upcase
19
+ @nullbyte = "\0"
20
+ @charset = ::Java::JavaNioCharset::Charset.for_name(encoding)
21
+ @decoder = @charset.new_decoder
17
22
  @decoder.on_malformed_input(::Java::JavaNioCharset::CodingErrorAction::IGNORE)
18
23
  @decoder.on_unmappable_character(::Java::JavaNioCharset::CodingErrorAction::IGNORE)
24
+ @nullbyte.encode! encoding if @nullbyte.respond_to? :encode!
19
25
  rescue ::Java::JavaNioCharset::UnsupportedCharsetException, ::Java::JavaNioCharset::IllegalCharsetNameException
20
26
  raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}"
21
27
  rescue ::Java::JavaLang::RuntimeException => e
@@ -24,7 +30,7 @@ module Coder
24
30
 
25
31
  def clean(str)
26
32
  buffer = ::Java::JavaNio::ByteBuffer.wrap(str.to_java_bytes)
27
- @decoder.decode(buffer).to_s
33
+ @decoder.decode(buffer).to_s.gsub(@nullbyte, '')
28
34
  rescue Java::JavaLang::RuntimeException => e
29
35
  raise Coder::Error, e.message, e.backtrace
30
36
  end
@@ -9,9 +9,16 @@ module Coder
9
9
  true
10
10
  end
11
11
 
12
+ def self.supports?(encoding)
13
+ const_name = encoding.to_s.upcase.gsub('-', '_')
14
+ Encodings.const_defined? const_name
15
+ rescue NameError
16
+ false
17
+ end
18
+
12
19
  def initialize(encoding)
13
20
  const_name = encoding.to_s.upcase.gsub('-', '_')
14
- raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}" unless coding_available? const_name
21
+ raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}" unless self.class.supports? const_name
15
22
  @encoding, @name = Encodings.const_get(const_name), encoding
16
23
  end
17
24
 
@@ -23,12 +30,6 @@ module Coder
23
30
 
24
31
  private
25
32
 
26
- def coding_available?(const_name)
27
- Encodings.const_defined? const_name
28
- rescue NameError
29
- false
30
- end
31
-
32
33
  def force_encoding(str)
33
34
  return str unless str.respond_to? :force_encoding
34
35
  str.force_encoding(@name)
@@ -39,11 +39,12 @@ module Coder
39
39
  end
40
40
  end
41
41
 
42
- module UCS_2
42
+ module UCS_2BE
43
43
  extend self
44
44
 
45
45
  def garbage?(input, buffered)
46
46
  return false unless buffered.size + 1 == multibyte_size
47
+ return true if codepoint(buffered + [input]) > 0x10FFFF
47
48
  input == 0 and buffered.all? { |b| b == 0 }
48
49
  end
49
50
 
@@ -62,16 +63,16 @@ module Coder
62
63
  def multibyte_size(*)
63
64
  2
64
65
  end
66
+
67
+ def codepoint(values)
68
+ values.inject { |a,b| (a << 8) + b }
69
+ end
65
70
  end
66
71
 
67
- module UCS_4
68
- include UCS_2
72
+ module UCS_4BE
73
+ include UCS_2BE
69
74
  extend self
70
75
 
71
- def garbage?(input, buffered)
72
- super or input > 0x10FFFF
73
- end
74
-
75
76
  def multibyte_size(*)
76
77
  4
77
78
  end
@@ -1,3 +1,3 @@
1
1
  module Coder
2
- VERSION = "0.1.2"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -1,43 +1,80 @@
1
1
  # encoding: UTF-8
2
2
  require 'coder/cleaner'
3
3
  require 'coder/error'
4
+ require 'support/clean_helpers'
4
5
 
5
6
  shared_examples Coder::Cleaner do
6
- let(:encoding) { example.example_group.description }
7
- subject { described_class.new(encoding) }
8
-
9
- def self.cleans(from, to = from)
10
- it "cleans #{from.inspect} to #{to.inspect}" do
11
- subject.clean(from).should be == to
12
- end
13
- end
14
-
15
- context "UTF-8" do
7
+ encoding "UTF-8" do
16
8
  cleans "foo"
17
9
  cleans ""
18
10
  cleans "yummy 🍔 "
19
-
11
+ cleans "\0", ""
20
12
  cleans "{foo \xC3 'bar'}", "{foo 'bar'}"
21
13
  cleans "yummy\xE2 \xF0\x9F\x8D\x94 \x9F\x8D\x94", "yummy 🍔 "
22
14
  end
23
15
 
24
- context "Unknown Encoding" do
25
- it { expect { subject }.to raise_error(Coder::InvalidEncoding) }
16
+ encoding "UCS-2BE" do
17
+ cleans "\x00f\x00o\x00o"
18
+ cleans "\x00f\x00ox", "\x00f\x00o"
19
+ cleans "\x00f\x00o\x00\x00", "\x00f\x00o"
26
20
  end
27
- end
28
21
 
29
- describe Coder::Cleaner::Builtin do
30
- it_behaves_like Coder::Cleaner if described_class.available?
31
- end
22
+ encoding "UCS-4BE" do
23
+ cleans "\x00\x00\x00f\x00\x00\x00o\x00\x00\x00o"
24
+ cleans "\x00\x00\x00f\x00\x00\x00o\x00\x00x", "\x00\x00\x00f\x00\x00\x00o"
25
+ cleans "\x00\x00\x00f\x00\x00\x00o\x00\x00\x00\x00", "\x00\x00\x00f\x00\x00\x00o"
26
+ cleans "\xFF\xFF\x10\x10", ""
27
+ end
32
28
 
33
- describe Coder::Cleaner::Java do
34
- it_behaves_like Coder::Cleaner if described_class.available?
29
+ context "unknown encoding" do
30
+ it "raises an exception" do
31
+ expect { described_class.new('foobar') }.
32
+ to raise_error(Coder::InvalidEncoding)
33
+ end
34
+ end
35
35
  end
36
36
 
37
- describe Coder::Cleaner::Iconv do
38
- it_behaves_like Coder::Cleaner if described_class.available?
37
+ describe Coder::Cleaner do
38
+ include CleanHelpers
39
+ it_behaves_like Coder::Cleaner
40
+
41
+ it { should support('UTF-8') }
42
+ it { should support('UCS-2BE') }
43
+ it { should support('UCS-4BE') }
39
44
  end
40
45
 
41
46
  describe Coder::Cleaner::Simple do
42
- it_behaves_like Coder::Cleaner if described_class.available?
47
+ include CleanHelpers
48
+ it_behaves_like Coder::Cleaner
49
+
50
+ it { should support('UTF-8') }
51
+ it { should support('UCS-2BE') }
52
+ it { should support('UCS-4BE') }
43
53
  end
54
+
55
+ describe Coder::Cleaner::Builtin do
56
+ include CleanHelpers
57
+ it_behaves_like Coder::Cleaner
58
+
59
+ it { should support('UTF-8') }
60
+ it { should support('UCS-2BE') }
61
+ it { should support('UCS-4BE') }
62
+ end if Coder::Cleaner::Builtin.available?
63
+
64
+ describe Coder::Cleaner::Java do
65
+ include CleanHelpers
66
+ it_behaves_like Coder::Cleaner
67
+
68
+ it { should support('UTF-8') }
69
+ it { should_not support('UCS-2BE') }
70
+ it { should_not support('UCS-4BE') }
71
+ end if Coder::Cleaner::Java.available?
72
+
73
+ describe Coder::Cleaner::Iconv do
74
+ include CleanHelpers
75
+ it_behaves_like Coder::Cleaner
76
+
77
+ it { should support('UTF-8') }
78
+ it { should_not support('UCS-2BE') }
79
+ it { should_not support('UCS-4BE') }
80
+ end if Coder::Cleaner::Iconv.available?
@@ -0,0 +1,33 @@
1
+ module CleanHelpers
2
+ module ClassMethods
3
+ def encoding(encoding, &block)
4
+ return unless described_class.supports? encoding
5
+ context(encoding) do
6
+ let(:encoding) { encoding}
7
+ instance_eval(&block)
8
+ end
9
+ end
10
+
11
+ def cleans(from, to = from)
12
+ it "cleans #{from.inspect} to #{to.inspect}" do
13
+ result = described_class.new(encoding).clean(binary(from))
14
+ binary(result).should be == binary(to)
15
+ end
16
+ end
17
+ end
18
+
19
+ def binary(str)
20
+ return str unless str.respond_to? :force_encoding
21
+ str.force_encoding('binary')
22
+ end
23
+
24
+ def support(encoding)
25
+ be_supports(encoding)
26
+ end
27
+
28
+ def self.append_features(obj)
29
+ obj.extend ClassMethods
30
+ obj.subject { obj.described_class }
31
+ super
32
+ end
33
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-23 00:00:00.000000000 Z
12
+ date: 2012-09-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -56,6 +56,7 @@ files:
56
56
  - LICENSE.txt
57
57
  - README.md
58
58
  - Rakefile
59
+ - bench.rb
59
60
  - coder.gemspec
60
61
  - lib/coder.rb
61
62
  - lib/coder/cleaner.rb
@@ -68,6 +69,7 @@ files:
68
69
  - lib/coder/error.rb
69
70
  - lib/coder/version.rb
70
71
  - spec/coder/cleaner_spec.rb
72
+ - spec/support/clean_helpers.rb
71
73
  homepage: http://github.com/rkh/coder
72
74
  licenses: []
73
75
  post_install_message:
@@ -94,4 +96,5 @@ specification_version: 3
94
96
  summary: library to handle encodings
95
97
  test_files:
96
98
  - spec/coder/cleaner_spec.rb
99
+ - spec/support/clean_helpers.rb
97
100
  has_rdoc: