coder 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bench.rb +34 -0
- data/lib/coder/cleaner.rb +13 -2
- data/lib/coder/cleaner/builtin.rb +9 -4
- data/lib/coder/cleaner/iconv.rb +10 -2
- data/lib/coder/cleaner/java.rb +10 -4
- data/lib/coder/cleaner/simple.rb +8 -7
- data/lib/coder/cleaner/simple/encodings.rb +8 -7
- data/lib/coder/version.rb +1 -1
- data/spec/coder/cleaner_spec.rb +59 -22
- data/spec/support/clean_helpers.rb +33 -0
- metadata +5 -2
data/bench.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# JRuby 1.7.0-preview2
|
4
|
+
# user system total real
|
5
|
+
# Coder::Cleaner::Java 0.360000 0.010000 0.370000 ( 0.121000)
|
6
|
+
# Coder::Cleaner::Iconv 0.290000 0.010000 0.300000 ( 0.103000)
|
7
|
+
# Coder::Cleaner::Simple 1.010000 0.020000 1.030000 ( 0.367000)
|
8
|
+
|
9
|
+
# MRI 1.9.3
|
10
|
+
# user system total real
|
11
|
+
# Coder::Cleaner::Builtin 0.060000 0.000000 0.060000 ( 0.057767)
|
12
|
+
# Coder::Cleaner::Iconv 0.020000 0.000000 0.020000 ( 0.022351)
|
13
|
+
# Coder::Cleaner::Simple 0.480000 0.000000 0.480000 ( 0.486451)
|
14
|
+
|
15
|
+
require 'benchmark'
|
16
|
+
require 'coder'
|
17
|
+
|
18
|
+
strings = [
|
19
|
+
"yummy\xE2 \xF0\x9F\x8D\x94 \x9F\x8D\x94",
|
20
|
+
"{foo \xC3 'bar'}",
|
21
|
+
"yummy 🍔 " * 10
|
22
|
+
]
|
23
|
+
|
24
|
+
Benchmark.bmbm do |x|
|
25
|
+
Coder::Cleaner::AVAILABLE.each do |cleaner|
|
26
|
+
x.report cleaner.to_s do
|
27
|
+
1000.times do
|
28
|
+
strings.each do |str|
|
29
|
+
cleaner.new('UTF-8').clean(str)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
data/lib/coder/cleaner.rb
CHANGED
@@ -5,10 +5,21 @@ require 'coder/cleaner/simple'
|
|
5
5
|
|
6
6
|
module Coder
|
7
7
|
module Cleaner
|
8
|
-
|
8
|
+
ALL = [ Builtin, Java, Iconv, Simple ]
|
9
|
+
AVAILABLE = ALL.select { |e| e.available? }
|
10
|
+
|
11
|
+
def self.available?
|
12
|
+
AVAILABLE.any?
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.supports?(encoding)
|
16
|
+
AVAILABLE.any? { |e| e.supports? encoding }
|
17
|
+
end
|
9
18
|
|
10
19
|
def self.new(encoding)
|
11
|
-
|
20
|
+
cleaner = AVAILABLE.detect { |e| e.supports? encoding }
|
21
|
+
raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}" unless cleaner
|
22
|
+
cleaner.new(encoding)
|
12
23
|
end
|
13
24
|
end
|
14
25
|
end
|
data/lib/coder/cleaner/builtin.rb
CHANGED
@@ -13,6 +13,12 @@ module Coder
|
|
13
13
|
!defined?(RUBY_ENGINE) or RUBY_ENGINE == 'ruby'
|
14
14
|
end
|
15
15
|
|
16
|
+
def self.supports?(encoding)
|
17
|
+
Encoding.find(encoding)
|
18
|
+
rescue ArgumentError
|
19
|
+
false
|
20
|
+
end
|
21
|
+
|
16
22
|
def self.has_encoding?
|
17
23
|
defined? Encoding.find and
|
18
24
|
defined? EncodingError and
|
@@ -30,7 +36,7 @@ module Coder
|
|
30
36
|
|
31
37
|
def clean(str)
|
32
38
|
str = str.dup.force_encoding(@encoding)
|
33
|
-
str.encode(@dummy, OPTIONS).encode(@encoding).gsub("\0", "")
|
39
|
+
str.encode(@dummy, OPTIONS).encode(@encoding).gsub("\0".encode(@encoding), "")
|
34
40
|
rescue EncodingError => e
|
35
41
|
raise Coder::Error, e.message
|
36
42
|
end
|
@@ -38,9 +44,8 @@ module Coder
|
|
38
44
|
private
|
39
45
|
|
40
46
|
def check_encoding
|
41
|
-
|
42
|
-
|
43
|
-
raise Coder::InvalidEncoding, e.message
|
47
|
+
return if self.class.supports? @encoding
|
48
|
+
raise Coder::InvalidEncoding, "unknown encoding name - #{@encoding}"
|
44
49
|
end
|
45
50
|
|
46
51
|
def needs_dummy?
|
data/lib/coder/cleaner/iconv.rb
CHANGED
@@ -17,6 +17,12 @@ module Coder
|
|
17
17
|
super
|
18
18
|
end
|
19
19
|
|
20
|
+
def self.supports?(encoding)
|
21
|
+
encoding.to_s !~ /^ucs/i and ::Iconv.new("#{encoding}//ignore", encoding.to_s)
|
22
|
+
rescue Exception
|
23
|
+
false
|
24
|
+
end
|
25
|
+
|
20
26
|
def self.available?
|
21
27
|
load_iconv
|
22
28
|
!!::Iconv.conv("iso-8859-1//ignore", "utf-8", "\305\253" + "a"*8160)
|
@@ -25,13 +31,15 @@ module Coder
|
|
25
31
|
end
|
26
32
|
|
27
33
|
def initialize(encoding)
|
28
|
-
@
|
34
|
+
@nullbyte = "\0"
|
35
|
+
@iconv = ::Iconv.new("#{encoding}//ignore", encoding.to_s)
|
36
|
+
@nullbyte.encode! encoding if @nullbyte.respond_to? :encode!
|
29
37
|
rescue ::Iconv::InvalidEncoding => e
|
30
38
|
raise Coder::InvalidEncoding, e.message
|
31
39
|
end
|
32
40
|
|
33
41
|
def clean(str)
|
34
|
-
@iconv.iconv(str).gsub(
|
42
|
+
@iconv.iconv(str).gsub(@nullbyte, "")
|
35
43
|
rescue ::Iconv::Failure => e
|
36
44
|
raise Coder::Error, e.message
|
37
45
|
end
|
data/lib/coder/cleaner/java.rb
CHANGED
@@ -10,12 +10,18 @@ module Coder
|
|
10
10
|
false
|
11
11
|
end
|
12
12
|
|
13
|
+
def self.supports?(encoding)
|
14
|
+
encoding.to_s =~ /^utf-8$/i
|
15
|
+
end
|
16
|
+
|
13
17
|
def initialize(encoding)
|
14
|
-
encoding
|
15
|
-
@
|
16
|
-
@
|
18
|
+
encoding = encoding.to_s.upcase
|
19
|
+
@nullbyte = "\0"
|
20
|
+
@charset = ::Java::JavaNioCharset::Charset.for_name(encoding)
|
21
|
+
@decoder = @charset.new_decoder
|
17
22
|
@decoder.on_malformed_input(::Java::JavaNioCharset::CodingErrorAction::IGNORE)
|
18
23
|
@decoder.on_unmappable_character(::Java::JavaNioCharset::CodingErrorAction::IGNORE)
|
24
|
+
@nullbyte.encode! encoding if @nullbyte.respond_to? :encode!
|
19
25
|
rescue ::Java::JavaNioCharset::UnsupportedCharsetException, ::Java::JavaNioCharset::IllegalCharsetNameException
|
20
26
|
raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}"
|
21
27
|
rescue ::Java::JavaLang::RuntimeException => e
|
@@ -24,7 +30,7 @@ module Coder
|
|
24
30
|
|
25
31
|
def clean(str)
|
26
32
|
buffer = ::Java::JavaNio::ByteBuffer.wrap(str.to_java_bytes)
|
27
|
-
@decoder.decode(buffer).to_s
|
33
|
+
@decoder.decode(buffer).to_s.gsub(@nullbyte, '')
|
28
34
|
rescue Java::JavaLang::RuntimeException => e
|
29
35
|
raise Coder::Error, e.message, e.backtrace
|
30
36
|
end
|
data/lib/coder/cleaner/simple.rb
CHANGED
@@ -9,9 +9,16 @@ module Coder
|
|
9
9
|
true
|
10
10
|
end
|
11
11
|
|
12
|
+
def self.supports?(encoding)
|
13
|
+
const_name = encoding.to_s.upcase.gsub('-', '_')
|
14
|
+
Encodings.const_defined? const_name
|
15
|
+
rescue NameError
|
16
|
+
false
|
17
|
+
end
|
18
|
+
|
12
19
|
def initialize(encoding)
|
13
20
|
const_name = encoding.to_s.upcase.gsub('-', '_')
|
14
|
-
raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}" unless
|
21
|
+
raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}" unless self.class.supports? const_name
|
15
22
|
@encoding, @name = Encodings.const_get(const_name), encoding
|
16
23
|
end
|
17
24
|
|
@@ -23,12 +30,6 @@ module Coder
|
|
23
30
|
|
24
31
|
private
|
25
32
|
|
26
|
-
def coding_available?(const_name)
|
27
|
-
Encodings.const_defined? const_name
|
28
|
-
rescue NameError
|
29
|
-
false
|
30
|
-
end
|
31
|
-
|
32
33
|
def force_encoding(str)
|
33
34
|
return str unless str.respond_to? :force_encoding
|
34
35
|
str.force_encoding(@name)
|
data/lib/coder/cleaner/simple/encodings.rb
CHANGED
@@ -39,11 +39,12 @@ module Coder
|
|
39
39
|
end
|
40
40
|
end
|
41
41
|
|
42
|
-
module
|
42
|
+
module UCS_2BE
|
43
43
|
extend self
|
44
44
|
|
45
45
|
def garbage?(input, buffered)
|
46
46
|
return false unless buffered.size + 1 == multibyte_size
|
47
|
+
return true if codepoint(buffered + [input]) > 0x10FFFF
|
47
48
|
input == 0 and buffered.all? { |b| b == 0 }
|
48
49
|
end
|
49
50
|
|
@@ -62,16 +63,16 @@ module Coder
|
|
62
63
|
def multibyte_size(*)
|
63
64
|
2
|
64
65
|
end
|
66
|
+
|
67
|
+
def codepoint(values)
|
68
|
+
values.inject { |a,b| (a << 8) + b }
|
69
|
+
end
|
65
70
|
end
|
66
71
|
|
67
|
-
module
|
68
|
-
include
|
72
|
+
module UCS_4BE
|
73
|
+
include UCS_2BE
|
69
74
|
extend self
|
70
75
|
|
71
|
-
def garbage?(input, buffered)
|
72
|
-
super or input > 0x10FFFF
|
73
|
-
end
|
74
|
-
|
75
76
|
def multibyte_size(*)
|
76
77
|
4
|
77
78
|
end
|
data/lib/coder/version.rb
CHANGED
data/spec/coder/cleaner_spec.rb
CHANGED
@@ -1,43 +1,80 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
require 'coder/cleaner'
|
3
3
|
require 'coder/error'
|
4
|
+
require 'support/clean_helpers'
|
4
5
|
|
5
6
|
shared_examples Coder::Cleaner do
|
6
|
-
|
7
|
-
subject { described_class.new(encoding) }
|
8
|
-
|
9
|
-
def self.cleans(from, to = from)
|
10
|
-
it "cleans #{from.inspect} to #{to.inspect}" do
|
11
|
-
subject.clean(from).should be == to
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
context "UTF-8" do
|
7
|
+
encoding "UTF-8" do
|
16
8
|
cleans "foo"
|
17
9
|
cleans ""
|
18
10
|
cleans "yummy 🍔 "
|
19
|
-
|
11
|
+
cleans "\0", ""
|
20
12
|
cleans "{foo \xC3 'bar'}", "{foo 'bar'}"
|
21
13
|
cleans "yummy\xE2 \xF0\x9F\x8D\x94 \x9F\x8D\x94", "yummy 🍔 "
|
22
14
|
end
|
23
15
|
|
24
|
-
|
25
|
-
|
16
|
+
encoding "UCS-2BE" do
|
17
|
+
cleans "\x00f\x00o\x00o"
|
18
|
+
cleans "\x00f\x00ox", "\x00f\x00o"
|
19
|
+
cleans "\x00f\x00o\x00\x00", "\x00f\x00o"
|
26
20
|
end
|
27
|
-
end
|
28
21
|
|
29
|
-
|
30
|
-
|
31
|
-
|
22
|
+
encoding "UCS-4BE" do
|
23
|
+
cleans "\x00\x00\x00f\x00\x00\x00o\x00\x00\x00o"
|
24
|
+
cleans "\x00\x00\x00f\x00\x00\x00o\x00\x00x", "\x00\x00\x00f\x00\x00\x00o"
|
25
|
+
cleans "\x00\x00\x00f\x00\x00\x00o\x00\x00\x00\x00", "\x00\x00\x00f\x00\x00\x00o"
|
26
|
+
cleans "\xFF\xFF\x10\x10", ""
|
27
|
+
end
|
32
28
|
|
33
|
-
|
34
|
-
|
29
|
+
context "unknown encoding" do
|
30
|
+
it "raises an exception" do
|
31
|
+
expect { described_class.new('foobar') }.
|
32
|
+
to raise_error(Coder::InvalidEncoding)
|
33
|
+
end
|
34
|
+
end
|
35
35
|
end
|
36
36
|
|
37
|
-
describe Coder::Cleaner
|
38
|
-
|
37
|
+
describe Coder::Cleaner do
|
38
|
+
include CleanHelpers
|
39
|
+
it_behaves_like Coder::Cleaner
|
40
|
+
|
41
|
+
it { should support('UTF-8') }
|
42
|
+
it { should support('UCS-2BE') }
|
43
|
+
it { should support('UCS-4BE') }
|
39
44
|
end
|
40
45
|
|
41
46
|
describe Coder::Cleaner::Simple do
|
42
|
-
|
47
|
+
include CleanHelpers
|
48
|
+
it_behaves_like Coder::Cleaner
|
49
|
+
|
50
|
+
it { should support('UTF-8') }
|
51
|
+
it { should support('UCS-2BE') }
|
52
|
+
it { should support('UCS-4BE') }
|
43
53
|
end
|
54
|
+
|
55
|
+
describe Coder::Cleaner::Builtin do
|
56
|
+
include CleanHelpers
|
57
|
+
it_behaves_like Coder::Cleaner
|
58
|
+
|
59
|
+
it { should support('UTF-8') }
|
60
|
+
it { should support('UCS-2BE') }
|
61
|
+
it { should support('UCS-4BE') }
|
62
|
+
end if Coder::Cleaner::Builtin.available?
|
63
|
+
|
64
|
+
describe Coder::Cleaner::Java do
|
65
|
+
include CleanHelpers
|
66
|
+
it_behaves_like Coder::Cleaner
|
67
|
+
|
68
|
+
it { should support('UTF-8') }
|
69
|
+
it { should_not support('UCS-2BE') }
|
70
|
+
it { should_not support('UCS-4BE') }
|
71
|
+
end if Coder::Cleaner::Java.available?
|
72
|
+
|
73
|
+
describe Coder::Cleaner::Iconv do
|
74
|
+
include CleanHelpers
|
75
|
+
it_behaves_like Coder::Cleaner
|
76
|
+
|
77
|
+
it { should support('UTF-8') }
|
78
|
+
it { should_not support('UCS-2BE') }
|
79
|
+
it { should_not support('UCS-4BE') }
|
80
|
+
end if Coder::Cleaner::Iconv.available?
|
data/spec/support/clean_helpers.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
module CleanHelpers
|
2
|
+
module ClassMethods
|
3
|
+
def encoding(encoding, &block)
|
4
|
+
return unless described_class.supports? encoding
|
5
|
+
context(encoding) do
|
6
|
+
let(:encoding) { encoding}
|
7
|
+
instance_eval(&block)
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
def cleans(from, to = from)
|
12
|
+
it "cleans #{from.inspect} to #{to.inspect}" do
|
13
|
+
result = described_class.new(encoding).clean(binary(from))
|
14
|
+
binary(result).should be == binary(to)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def binary(str)
|
20
|
+
return str unless str.respond_to? :force_encoding
|
21
|
+
str.force_encoding('binary')
|
22
|
+
end
|
23
|
+
|
24
|
+
def support(encoding)
|
25
|
+
be_supports(encoding)
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.append_features(obj)
|
29
|
+
obj.extend ClassMethods
|
30
|
+
obj.subject { obj.described_class }
|
31
|
+
super
|
32
|
+
end
|
33
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: coder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -56,6 +56,7 @@ files:
|
|
56
56
|
- LICENSE.txt
|
57
57
|
- README.md
|
58
58
|
- Rakefile
|
59
|
+
- bench.rb
|
59
60
|
- coder.gemspec
|
60
61
|
- lib/coder.rb
|
61
62
|
- lib/coder/cleaner.rb
|
@@ -68,6 +69,7 @@ files:
|
|
68
69
|
- lib/coder/error.rb
|
69
70
|
- lib/coder/version.rb
|
70
71
|
- spec/coder/cleaner_spec.rb
|
72
|
+
- spec/support/clean_helpers.rb
|
71
73
|
homepage: http://github.com/rkh/coder
|
72
74
|
licenses: []
|
73
75
|
post_install_message:
|
@@ -94,4 +96,5 @@ specification_version: 3
|
|
94
96
|
summary: library to handle encodings
|
95
97
|
test_files:
|
96
98
|
- spec/coder/cleaner_spec.rb
|
99
|
+
- spec/support/clean_helpers.rb
|
97
100
|
has_rdoc:
|