coder 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,34 @@
1
+ # encoding: UTF-8
2
+
3
+ # JRuby 1.7.0-preview2
4
+ # user system total real
5
+ # Coder::Cleaner::Java 0.360000 0.010000 0.370000 ( 0.121000)
6
+ # Coder::Cleaner::Iconv 0.290000 0.010000 0.300000 ( 0.103000)
7
+ # Coder::Cleaner::Simple 1.010000 0.020000 1.030000 ( 0.367000)
8
+
9
+ # MRI 1.9.3
10
+ # user system total real
11
+ # Coder::Cleaner::Builtin 0.060000 0.000000 0.060000 ( 0.057767)
12
+ # Coder::Cleaner::Iconv 0.020000 0.000000 0.020000 ( 0.022351)
13
+ # Coder::Cleaner::Simple 0.480000 0.000000 0.480000 ( 0.486451)
14
+
15
+ require 'benchmark'
16
+ require 'coder'
17
+
18
# Sample inputs: two strings containing stray bytes that are not valid UTF-8,
# plus one longer string made of valid text only.
samples = [
  "yummy\xE2 \xF0\x9F\x8D\x94 \x9F\x8D\x94",
  "{foo \xC3 'bar'}",
  "yummy 🍔 " * 10
]

# One report per available cleaner class. Each report makes 1000 passes over
# the samples and builds a fresh UTF-8 cleaner for every single string.
Benchmark.bmbm do |bench|
  Coder::Cleaner::AVAILABLE.each do |impl|
    bench.report(impl.to_s) do
      1000.times do
        samples.each { |sample| impl.new('UTF-8').clean(sample) }
      end
    end
  end
end
@@ -5,10 +5,21 @@ require 'coder/cleaner/simple'
5
5
 
6
6
  module Coder
7
7
  module Cleaner
8
- Default = [Builtin, Java, Iconv, Simple].detect { |c| c.available? }
8
+ ALL = [ Builtin, Java, Iconv, Simple ]
9
+ AVAILABLE = ALL.select { |e| e.available? }
10
+
11
# Tells whether at least one cleaner implementation is listed in AVAILABLE.
def self.available?
  !AVAILABLE.empty?
end
14
+
15
# Tells whether any cleaner in AVAILABLE reports support for +encoding+.
# Stops at the first implementation that answers with a truthy value.
def self.supports?(encoding)
  AVAILABLE.each do |impl|
    return true if impl.supports?(encoding)
  end
  false
end
9
18
 
10
19
  def self.new(encoding)
11
- Default.new(encoding)
20
+ cleaner = AVAILABLE.detect { |e| e.supports? encoding }
21
+ raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}" unless cleaner
22
+ cleaner.new(encoding)
12
23
  end
13
24
  end
14
25
  end
@@ -13,6 +13,12 @@ module Coder
13
13
  !defined?(RUBY_ENGINE) or RUBY_ENGINE == 'ruby'
14
14
  end
15
15
 
16
# Tells whether the running Ruby can resolve +encoding+ through Encoding.find.
#
# encoding - the encoding name (or Encoding object) to look up.
#
# Returns true when the lookup succeeds, false otherwise.
def self.supports?(encoding)
  Encoding.find(encoding)
  true
rescue ArgumentError, TypeError
  # ArgumentError: the name is unknown to Ruby.
  # TypeError: the argument is not string-like (e.g. nil); report it as
  # unsupported instead of letting the error escape from a predicate.
  false
end
21
+
16
22
  def self.has_encoding?
17
23
  defined? Encoding.find and
18
24
  defined? EncodingError and
@@ -30,7 +36,7 @@ module Coder
30
36
 
31
37
  def clean(str)
32
38
  str = str.dup.force_encoding(@encoding)
33
- str.encode(@dummy, OPTIONS).encode(@encoding).gsub("\0", "")
39
+ str.encode(@dummy, OPTIONS).encode(@encoding).gsub("\0".encode(@encoding), "")
34
40
  rescue EncodingError => e
35
41
  raise Coder::Error, e.message
36
42
  end
@@ -38,9 +44,8 @@ module Coder
38
44
  private
39
45
 
40
46
  def check_encoding
41
- Encoding.find(@encoding)
42
- rescue ArgumentError => e
43
- raise Coder::InvalidEncoding, e.message
47
+ return if self.class.supports? @encoding
48
+ raise Coder::InvalidEncoding, "unknown encoding name - #{@encoding}"
44
49
  end
45
50
 
46
51
  def needs_dummy?
@@ -17,6 +17,12 @@ module Coder
17
17
  super
18
18
  end
19
19
 
20
# Tells whether Iconv can build a converter from +encoding+ to itself.
#
# encoding - the encoding name; converted with to_s before use.
#
# Returns true when a converter could be created, false otherwise. Names
# starting with "ucs" (any case) are rejected before Iconv is consulted.
def self.supports?(encoding)
  name = encoding.to_s
  return false if name =~ /\Aucs/i

  # Only the ability to construct the converter matters here, so release the
  # handle right away instead of leaving it for the garbage collector.
  ::Iconv.new("#{name}//ignore", name).close
  true
rescue StandardError, ScriptError
  # Any failure to set up the converter (unknown name, Iconv not loaded, ...)
  # means "unsupported". Interrupt, SystemExit and NoMemoryError are
  # deliberately left alone so they keep propagating.
  false
end
25
+
20
26
  def self.available?
21
27
  load_iconv
22
28
  !!::Iconv.conv("iso-8859-1//ignore", "utf-8", "\305\253" + "a"*8160)
@@ -25,13 +31,15 @@ module Coder
25
31
  end
26
32
 
27
33
  def initialize(encoding)
28
- @iconv = ::Iconv.new("#{encoding}//ignore", encoding.to_s)
34
+ @nullbyte = "\0"
35
+ @iconv = ::Iconv.new("#{encoding}//ignore", encoding.to_s)
36
+ @nullbyte.encode! encoding if @nullbyte.respond_to? :encode!
29
37
  rescue ::Iconv::InvalidEncoding => e
30
38
  raise Coder::InvalidEncoding, e.message
31
39
  end
32
40
 
33
41
  def clean(str)
34
- @iconv.iconv(str).gsub("\0", "")
42
+ @iconv.iconv(str).gsub(@nullbyte, "")
35
43
  rescue ::Iconv::Failure => e
36
44
  raise Coder::Error, e.message
37
45
  end
@@ -10,12 +10,18 @@ module Coder
10
10
  false
11
11
  end
12
12
 
13
# Tells whether +encoding+ names UTF-8, the only encoding this check accepts.
#
# encoding - the encoding name; converted with to_s and compared
#            case-insensitively.
#
# Returns true or false.
def self.supports?(encoding)
  # \A and \z anchor the whole string; ^ and $ would also accept names such as
  # "junk\nUTF-8" or "UTF-8\n".
  !!(encoding.to_s =~ /\Autf-8\z/i)
end
16
+
13
17
  def initialize(encoding)
14
- encoding = encoding.to_s.upcase
15
- @charset = ::Java::JavaNioCharset::Charset.for_name(encoding)
16
- @decoder = @charset.new_decoder
18
+ encoding = encoding.to_s.upcase
19
+ @nullbyte = "\0"
20
+ @charset = ::Java::JavaNioCharset::Charset.for_name(encoding)
21
+ @decoder = @charset.new_decoder
17
22
  @decoder.on_malformed_input(::Java::JavaNioCharset::CodingErrorAction::IGNORE)
18
23
  @decoder.on_unmappable_character(::Java::JavaNioCharset::CodingErrorAction::IGNORE)
24
+ @nullbyte.encode! encoding if @nullbyte.respond_to? :encode!
19
25
  rescue ::Java::JavaNioCharset::UnsupportedCharsetException, ::Java::JavaNioCharset::IllegalCharsetNameException
20
26
  raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}"
21
27
  rescue ::Java::JavaLang::RuntimeException => e
@@ -24,7 +30,7 @@ module Coder
24
30
 
25
31
  def clean(str)
26
32
  buffer = ::Java::JavaNio::ByteBuffer.wrap(str.to_java_bytes)
27
- @decoder.decode(buffer).to_s
33
+ @decoder.decode(buffer).to_s.gsub(@nullbyte, '')
28
34
  rescue Java::JavaLang::RuntimeException => e
29
35
  raise Coder::Error, e.message, e.backtrace
30
36
  end
@@ -9,9 +9,16 @@ module Coder
9
9
  true
10
10
  end
11
11
 
12
# Tells whether the Encodings namespace provides a constant for +encoding+.
#
# encoding - the encoding name; upcased and with "-" replaced by "_" to form
#            the constant name (e.g. "ucs-2be" becomes "UCS_2BE").
#
# Returns true or false.
def self.supports?(encoding)
  const_name = encoding.to_s.upcase.gsub('-', '_')
  # Compare against the constants listed by Encodings itself.
  # Module#const_defined? also searches Object when called on a module, which
  # would report names like "ENV" or "ARGV" as supported encodings.
  Encodings.constants.any? { |const| const.to_s == const_name }
end
18
+
12
19
  def initialize(encoding)
13
20
  const_name = encoding.to_s.upcase.gsub('-', '_')
14
- raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}" unless coding_available? const_name
21
+ raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}" unless self.class.supports? const_name
15
22
  @encoding, @name = Encodings.const_get(const_name), encoding
16
23
  end
17
24
 
@@ -23,12 +30,6 @@ module Coder
23
30
 
24
31
  private
25
32
 
26
- def coding_available?(const_name)
27
- Encodings.const_defined? const_name
28
- rescue NameError
29
- false
30
- end
31
-
32
33
  def force_encoding(str)
33
34
  return str unless str.respond_to? :force_encoding
34
35
  str.force_encoding(@name)
@@ -39,11 +39,12 @@ module Coder
39
39
  end
40
40
  end
41
41
 
42
- module UCS_2
42
+ module UCS_2BE
43
43
  extend self
44
44
 
45
45
  def garbage?(input, buffered)
46
46
  return false unless buffered.size + 1 == multibyte_size
47
+ return true if codepoint(buffered + [input]) > 0x10FFFF
47
48
  input == 0 and buffered.all? { |b| b == 0 }
48
49
  end
49
50
 
@@ -62,16 +63,16 @@ module Coder
62
63
  def multibyte_size(*)
63
64
  2
64
65
  end
66
+
67
# Combines +values+ into one integer, first value most significant: the
# running total is shifted left by 8 bits before the next value is added.
# Yields nil for an empty list.
def codepoint(values)
  values.reduce do |total, value|
    (total << 8) + value
  end
end
65
70
  end
66
71
 
67
- module UCS_4
68
- include UCS_2
72
+ module UCS_4BE
73
+ include UCS_2BE
69
74
  extend self
70
75
 
71
- def garbage?(input, buffered)
72
- super or input > 0x10FFFF
73
- end
74
-
75
76
  def multibyte_size(*)
76
77
  4
77
78
  end
@@ -1,3 +1,3 @@
1
1
  module Coder
2
- VERSION = "0.1.2"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -1,43 +1,80 @@
1
1
  # encoding: UTF-8
2
2
  require 'coder/cleaner'
3
3
  require 'coder/error'
4
+ require 'support/clean_helpers'
4
5
 
5
6
  shared_examples Coder::Cleaner do
6
- let(:encoding) { example.example_group.description }
7
- subject { described_class.new(encoding) }
8
-
9
- def self.cleans(from, to = from)
10
- it "cleans #{from.inspect} to #{to.inspect}" do
11
- subject.clean(from).should be == to
12
- end
13
- end
14
-
15
- context "UTF-8" do
7
+ encoding "UTF-8" do
16
8
  cleans "foo"
17
9
  cleans ""
18
10
  cleans "yummy 🍔 "
19
-
11
+ cleans "\0", ""
20
12
  cleans "{foo \xC3 'bar'}", "{foo 'bar'}"
21
13
  cleans "yummy\xE2 \xF0\x9F\x8D\x94 \x9F\x8D\x94", "yummy 🍔 "
22
14
  end
23
15
 
24
- context "Unknown Encoding" do
25
- it { expect { subject }.to raise_error(Coder::InvalidEncoding) }
16
+ encoding "UCS-2BE" do
17
+ cleans "\x00f\x00o\x00o"
18
+ cleans "\x00f\x00ox", "\x00f\x00o"
19
+ cleans "\x00f\x00o\x00\x00", "\x00f\x00o"
26
20
  end
27
- end
28
21
 
29
- describe Coder::Cleaner::Builtin do
30
- it_behaves_like Coder::Cleaner if described_class.available?
31
- end
22
+ encoding "UCS-4BE" do
23
+ cleans "\x00\x00\x00f\x00\x00\x00o\x00\x00\x00o"
24
+ cleans "\x00\x00\x00f\x00\x00\x00o\x00\x00x", "\x00\x00\x00f\x00\x00\x00o"
25
+ cleans "\x00\x00\x00f\x00\x00\x00o\x00\x00\x00\x00", "\x00\x00\x00f\x00\x00\x00o"
26
+ cleans "\xFF\xFF\x10\x10", ""
27
+ end
32
28
 
33
- describe Coder::Cleaner::Java do
34
- it_behaves_like Coder::Cleaner if described_class.available?
29
+ context "unknown encoding" do
30
+ it "raises an exception" do
31
+ expect { described_class.new('foobar') }.
32
+ to raise_error(Coder::InvalidEncoding)
33
+ end
34
+ end
35
35
  end
36
36
 
37
- describe Coder::Cleaner::Iconv do
38
- it_behaves_like Coder::Cleaner if described_class.available?
37
+ describe Coder::Cleaner do
38
+ include CleanHelpers
39
+ it_behaves_like Coder::Cleaner
40
+
41
+ it { should support('UTF-8') }
42
+ it { should support('UCS-2BE') }
43
+ it { should support('UCS-4BE') }
39
44
  end
40
45
 
41
46
  describe Coder::Cleaner::Simple do
42
- it_behaves_like Coder::Cleaner if described_class.available?
47
+ include CleanHelpers
48
+ it_behaves_like Coder::Cleaner
49
+
50
+ it { should support('UTF-8') }
51
+ it { should support('UCS-2BE') }
52
+ it { should support('UCS-4BE') }
43
53
  end
54
+
55
+ describe Coder::Cleaner::Builtin do
56
+ include CleanHelpers
57
+ it_behaves_like Coder::Cleaner
58
+
59
+ it { should support('UTF-8') }
60
+ it { should support('UCS-2BE') }
61
+ it { should support('UCS-4BE') }
62
+ end if Coder::Cleaner::Builtin.available?
63
+
64
+ describe Coder::Cleaner::Java do
65
+ include CleanHelpers
66
+ it_behaves_like Coder::Cleaner
67
+
68
+ it { should support('UTF-8') }
69
+ it { should_not support('UCS-2BE') }
70
+ it { should_not support('UCS-4BE') }
71
+ end if Coder::Cleaner::Java.available?
72
+
73
+ describe Coder::Cleaner::Iconv do
74
+ include CleanHelpers
75
+ it_behaves_like Coder::Cleaner
76
+
77
+ it { should support('UTF-8') }
78
+ it { should_not support('UCS-2BE') }
79
+ it { should_not support('UCS-4BE') }
80
+ end if Coder::Cleaner::Iconv.available?
@@ -0,0 +1,33 @@
1
# Spec helpers for the cleaner examples. Including this module into an example
# group extends the group with ClassMethods and sets its subject to the
# described class.
module CleanHelpers
  # Group-level macros added to the including example group.
  module ClassMethods
    # Opens a context named after +encoding+ and evaluates +block+ inside it,
    # with +encoding+ exposed to the examples through `let`. Nothing is defined
    # when the described class does not support the encoding.
    def encoding(encoding, &block)
      return unless described_class.supports? encoding
      context(encoding) do
        let(:encoding) { encoding }
        instance_eval(&block)
      end
    end

    # Defines an example asserting that cleaning +from+ yields +to+ (by
    # default the input itself). Both sides are compared as binary strings.
    def cleans(from, to = from)
      it "cleans #{from.inspect} to #{to.inspect}" do
        result = described_class.new(encoding).clean(binary(from))
        binary(result).should be == binary(to)
      end
    end
  end

  # Returns a binary-encoded copy of +str+. Objects that do not respond to
  # force_encoding are returned unchanged.
  def binary(str)
    return str unless str.respond_to? :force_encoding
    # Work on a copy: force_encoding mutates its receiver, which would retag
    # the caller's string (the literals handed to `cleans`) and raise on
    # frozen strings.
    str.dup.force_encoding('binary')
  end

  # Matcher shortcut: `should support(enc)` delegates to `be_supports(enc)`.
  def support(encoding)
    be_supports(encoding)
  end

  # Hook run on include: installs the group-level macros and the subject
  # before handing over to the regular include machinery.
  def self.append_features(obj)
    obj.extend ClassMethods
    obj.subject { obj.described_class }
    super
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-23 00:00:00.000000000 Z
12
+ date: 2012-09-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -56,6 +56,7 @@ files:
56
56
  - LICENSE.txt
57
57
  - README.md
58
58
  - Rakefile
59
+ - bench.rb
59
60
  - coder.gemspec
60
61
  - lib/coder.rb
61
62
  - lib/coder/cleaner.rb
@@ -68,6 +69,7 @@ files:
68
69
  - lib/coder/error.rb
69
70
  - lib/coder/version.rb
70
71
  - spec/coder/cleaner_spec.rb
72
+ - spec/support/clean_helpers.rb
71
73
  homepage: http://github.com/rkh/coder
72
74
  licenses: []
73
75
  post_install_message:
@@ -94,4 +96,5 @@ specification_version: 3
94
96
  summary: library to handle encodings
95
97
  test_files:
96
98
  - spec/coder/cleaner_spec.rb
99
+ - spec/support/clean_helpers.rb
97
100
  has_rdoc: