coder 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bench.rb +34 -0
- data/lib/coder/cleaner.rb +13 -2
- data/lib/coder/cleaner/builtin.rb +9 -4
- data/lib/coder/cleaner/iconv.rb +10 -2
- data/lib/coder/cleaner/java.rb +10 -4
- data/lib/coder/cleaner/simple.rb +8 -7
- data/lib/coder/cleaner/simple/encodings.rb +8 -7
- data/lib/coder/version.rb +1 -1
- data/spec/coder/cleaner_spec.rb +59 -22
- data/spec/support/clean_helpers.rb +33 -0
- metadata +5 -2
data/bench.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# JRuby 1.7.0-preview2
|
4
|
+
# user system total real
|
5
|
+
# Coder::Cleaner::Java 0.360000 0.010000 0.370000 ( 0.121000)
|
6
|
+
# Coder::Cleaner::Iconv 0.290000 0.010000 0.300000 ( 0.103000)
|
7
|
+
# Coder::Cleaner::Simple 1.010000 0.020000 1.030000 ( 0.367000)
|
8
|
+
|
9
|
+
# MRI 1.9.3
|
10
|
+
# user system total real
|
11
|
+
# Coder::Cleaner::Builtin 0.060000 0.000000 0.060000 ( 0.057767)
|
12
|
+
# Coder::Cleaner::Iconv 0.020000 0.000000 0.020000 ( 0.022351)
|
13
|
+
# Coder::Cleaner::Simple 0.480000 0.000000 0.480000 ( 0.486451)
|
14
|
+
|
15
|
+
require 'benchmark'
|
16
|
+
require 'coder'
|
17
|
+
|
18
|
+
strings = [
|
19
|
+
"yummy\xE2 \xF0\x9F\x8D\x94 \x9F\x8D\x94",
|
20
|
+
"{foo \xC3 'bar'}",
|
21
|
+
"yummy 🍔 " * 10
|
22
|
+
]
|
23
|
+
|
24
|
+
Benchmark.bmbm do |x|
|
25
|
+
Coder::Cleaner::AVAILABLE.each do |cleaner|
|
26
|
+
x.report cleaner.to_s do
|
27
|
+
1000.times do
|
28
|
+
strings.each do |str|
|
29
|
+
cleaner.new('UTF-8').clean(str)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
data/lib/coder/cleaner.rb
CHANGED
@@ -5,10 +5,21 @@ require 'coder/cleaner/simple'
|
|
5
5
|
|
6
6
|
module Coder
|
7
7
|
module Cleaner
|
8
|
-
|
8
|
+
ALL = [ Builtin, Java, Iconv, Simple ]
|
9
|
+
AVAILABLE = ALL.select { |e| e.available? }
|
10
|
+
|
11
|
+
def self.available?
|
12
|
+
AVAILABLE.any?
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.supports?(encoding)
|
16
|
+
AVAILABLE.any? { |e| e.supports? encoding }
|
17
|
+
end
|
9
18
|
|
10
19
|
def self.new(encoding)
|
11
|
-
|
20
|
+
cleaner = AVAILABLE.detect { |e| e.supports? encoding }
|
21
|
+
raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}" unless cleaner
|
22
|
+
cleaner.new(encoding)
|
12
23
|
end
|
13
24
|
end
|
14
25
|
end
|
data/lib/coder/cleaner/builtin.rb
CHANGED
@@ -13,6 +13,12 @@ module Coder
|
|
13
13
|
!defined?(RUBY_ENGINE) or RUBY_ENGINE == 'ruby'
|
14
14
|
end
|
15
15
|
|
16
|
+
def self.supports?(encoding)
|
17
|
+
Encoding.find(encoding)
|
18
|
+
rescue ArgumentError
|
19
|
+
false
|
20
|
+
end
|
21
|
+
|
16
22
|
def self.has_encoding?
|
17
23
|
defined? Encoding.find and
|
18
24
|
defined? EncodingError and
|
@@ -30,7 +36,7 @@ module Coder
|
|
30
36
|
|
31
37
|
def clean(str)
|
32
38
|
str = str.dup.force_encoding(@encoding)
|
33
|
-
str.encode(@dummy, OPTIONS).encode(@encoding).gsub("\0", "")
|
39
|
+
str.encode(@dummy, OPTIONS).encode(@encoding).gsub("\0".encode(@encoding), "")
|
34
40
|
rescue EncodingError => e
|
35
41
|
raise Coder::Error, e.message
|
36
42
|
end
|
@@ -38,9 +44,8 @@ module Coder
|
|
38
44
|
private
|
39
45
|
|
40
46
|
def check_encoding
|
41
|
-
|
42
|
-
|
43
|
-
raise Coder::InvalidEncoding, e.message
|
47
|
+
return if self.class.supports? @encoding
|
48
|
+
raise Coder::InvalidEncoding, "unknown encoding name - #{@encoding}"
|
44
49
|
end
|
45
50
|
|
46
51
|
def needs_dummy?
|
data/lib/coder/cleaner/iconv.rb
CHANGED
@@ -17,6 +17,12 @@ module Coder
|
|
17
17
|
super
|
18
18
|
end
|
19
19
|
|
20
|
+
def self.supports?(encoding)
|
21
|
+
encoding.to_s !~ /^ucs/i and ::Iconv.new("#{encoding}//ignore", encoding.to_s)
|
22
|
+
rescue Exception
|
23
|
+
false
|
24
|
+
end
|
25
|
+
|
20
26
|
def self.available?
|
21
27
|
load_iconv
|
22
28
|
!!::Iconv.conv("iso-8859-1//ignore", "utf-8", "\305\253" + "a"*8160)
|
@@ -25,13 +31,15 @@ module Coder
|
|
25
31
|
end
|
26
32
|
|
27
33
|
def initialize(encoding)
|
28
|
-
@
|
34
|
+
@nullbyte = "\0"
|
35
|
+
@iconv = ::Iconv.new("#{encoding}//ignore", encoding.to_s)
|
36
|
+
@nullbyte.encode! encoding if @nullbyte.respond_to? :encode!
|
29
37
|
rescue ::Iconv::InvalidEncoding => e
|
30
38
|
raise Coder::InvalidEncoding, e.message
|
31
39
|
end
|
32
40
|
|
33
41
|
def clean(str)
|
34
|
-
@iconv.iconv(str).gsub(
|
42
|
+
@iconv.iconv(str).gsub(@nullbyte, "")
|
35
43
|
rescue ::Iconv::Failure => e
|
36
44
|
raise Coder::Error, e.message
|
37
45
|
end
|
data/lib/coder/cleaner/java.rb
CHANGED
@@ -10,12 +10,18 @@ module Coder
|
|
10
10
|
false
|
11
11
|
end
|
12
12
|
|
13
|
+
def self.supports?(encoding)
|
14
|
+
encoding.to_s =~ /^utf-8$/i
|
15
|
+
end
|
16
|
+
|
13
17
|
def initialize(encoding)
|
14
|
-
encoding
|
15
|
-
@
|
16
|
-
@
|
18
|
+
encoding = encoding.to_s.upcase
|
19
|
+
@nullbyte = "\0"
|
20
|
+
@charset = ::Java::JavaNioCharset::Charset.for_name(encoding)
|
21
|
+
@decoder = @charset.new_decoder
|
17
22
|
@decoder.on_malformed_input(::Java::JavaNioCharset::CodingErrorAction::IGNORE)
|
18
23
|
@decoder.on_unmappable_character(::Java::JavaNioCharset::CodingErrorAction::IGNORE)
|
24
|
+
@nullbyte.encode! encoding if @nullbyte.respond_to? :encode!
|
19
25
|
rescue ::Java::JavaNioCharset::UnsupportedCharsetException, ::Java::JavaNioCharset::IllegalCharsetNameException
|
20
26
|
raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}"
|
21
27
|
rescue ::Java::JavaLang::RuntimeException => e
|
@@ -24,7 +30,7 @@ module Coder
|
|
24
30
|
|
25
31
|
def clean(str)
|
26
32
|
buffer = ::Java::JavaNio::ByteBuffer.wrap(str.to_java_bytes)
|
27
|
-
@decoder.decode(buffer).to_s
|
33
|
+
@decoder.decode(buffer).to_s.gsub(@nullbyte, '')
|
28
34
|
rescue Java::JavaLang::RuntimeException => e
|
29
35
|
raise Coder::Error, e.message, e.backtrace
|
30
36
|
end
|
data/lib/coder/cleaner/simple.rb
CHANGED
@@ -9,9 +9,16 @@ module Coder
|
|
9
9
|
true
|
10
10
|
end
|
11
11
|
|
12
|
+
def self.supports?(encoding)
|
13
|
+
const_name = encoding.to_s.upcase.gsub('-', '_')
|
14
|
+
Encodings.const_defined? const_name
|
15
|
+
rescue NameError
|
16
|
+
false
|
17
|
+
end
|
18
|
+
|
12
19
|
def initialize(encoding)
|
13
20
|
const_name = encoding.to_s.upcase.gsub('-', '_')
|
14
|
-
raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}" unless coding_available? const_name
|
21
|
+
raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}" unless self.class.supports? const_name
|
15
22
|
@encoding, @name = Encodings.const_get(const_name), encoding
|
16
23
|
end
|
17
24
|
|
@@ -23,12 +30,6 @@ module Coder
|
|
23
30
|
|
24
31
|
private
|
25
32
|
|
26
|
-
def coding_available?(const_name)
|
27
|
-
Encodings.const_defined? const_name
|
28
|
-
rescue NameError
|
29
|
-
false
|
30
|
-
end
|
31
|
-
|
32
33
|
def force_encoding(str)
|
33
34
|
return str unless str.respond_to? :force_encoding
|
34
35
|
str.force_encoding(@name)
|
data/lib/coder/cleaner/simple/encodings.rb
CHANGED
@@ -39,11 +39,12 @@ module Coder
|
|
39
39
|
end
|
40
40
|
end
|
41
41
|
|
42
|
-
module
|
42
|
+
module UCS_2BE
|
43
43
|
extend self
|
44
44
|
|
45
45
|
def garbage?(input, buffered)
|
46
46
|
return false unless buffered.size + 1 == multibyte_size
|
47
|
+
return true if codepoint(buffered + [input]) > 0x10FFFF
|
47
48
|
input == 0 and buffered.all? { |b| b == 0 }
|
48
49
|
end
|
49
50
|
|
@@ -62,16 +63,16 @@ module Coder
|
|
62
63
|
def multibyte_size(*)
|
63
64
|
2
|
64
65
|
end
|
66
|
+
|
67
|
+
def codepoint(values)
|
68
|
+
values.inject { |a,b| (a << 8) + b }
|
69
|
+
end
|
65
70
|
end
|
66
71
|
|
67
|
-
module
|
68
|
-
include
|
72
|
+
module UCS_4BE
|
73
|
+
include UCS_2BE
|
69
74
|
extend self
|
70
75
|
|
71
|
-
def garbage?(input, buffered)
|
72
|
-
super or input > 0x10FFFF
|
73
|
-
end
|
74
|
-
|
75
76
|
def multibyte_size(*)
|
76
77
|
4
|
77
78
|
end
|
data/lib/coder/version.rb
CHANGED
data/spec/coder/cleaner_spec.rb
CHANGED
@@ -1,43 +1,80 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
require 'coder/cleaner'
|
3
3
|
require 'coder/error'
|
4
|
+
require 'support/clean_helpers'
|
4
5
|
|
5
6
|
shared_examples Coder::Cleaner do
|
6
|
-
|
7
|
-
subject { described_class.new(encoding) }
|
8
|
-
|
9
|
-
def self.cleans(from, to = from)
|
10
|
-
it "cleans #{from.inspect} to #{to.inspect}" do
|
11
|
-
subject.clean(from).should be == to
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
context "UTF-8" do
|
7
|
+
encoding "UTF-8" do
|
16
8
|
cleans "foo"
|
17
9
|
cleans ""
|
18
10
|
cleans "yummy 🍔 "
|
19
|
-
|
11
|
+
cleans "\0", ""
|
20
12
|
cleans "{foo \xC3 'bar'}", "{foo 'bar'}"
|
21
13
|
cleans "yummy\xE2 \xF0\x9F\x8D\x94 \x9F\x8D\x94", "yummy 🍔 "
|
22
14
|
end
|
23
15
|
|
24
|
-
|
25
|
-
|
16
|
+
encoding "UCS-2BE" do
|
17
|
+
cleans "\x00f\x00o\x00o"
|
18
|
+
cleans "\x00f\x00ox", "\x00f\x00o"
|
19
|
+
cleans "\x00f\x00o\x00\x00", "\x00f\x00o"
|
26
20
|
end
|
27
|
-
end
|
28
21
|
|
29
|
-
|
30
|
-
|
31
|
-
|
22
|
+
encoding "UCS-4BE" do
|
23
|
+
cleans "\x00\x00\x00f\x00\x00\x00o\x00\x00\x00o"
|
24
|
+
cleans "\x00\x00\x00f\x00\x00\x00o\x00\x00x", "\x00\x00\x00f\x00\x00\x00o"
|
25
|
+
cleans "\x00\x00\x00f\x00\x00\x00o\x00\x00\x00\x00", "\x00\x00\x00f\x00\x00\x00o"
|
26
|
+
cleans "\xFF\xFF\x10\x10", ""
|
27
|
+
end
|
32
28
|
|
33
|
-
|
34
|
-
|
29
|
+
context "unknown encoding" do
|
30
|
+
it "raises an exception" do
|
31
|
+
expect { described_class.new('foobar') }.
|
32
|
+
to raise_error(Coder::InvalidEncoding)
|
33
|
+
end
|
34
|
+
end
|
35
35
|
end
|
36
36
|
|
37
|
-
describe Coder::Cleaner
|
38
|
-
|
37
|
+
describe Coder::Cleaner do
|
38
|
+
include CleanHelpers
|
39
|
+
it_behaves_like Coder::Cleaner
|
40
|
+
|
41
|
+
it { should support('UTF-8') }
|
42
|
+
it { should support('UCS-2BE') }
|
43
|
+
it { should support('UCS-4BE') }
|
39
44
|
end
|
40
45
|
|
41
46
|
describe Coder::Cleaner::Simple do
|
42
|
-
|
47
|
+
include CleanHelpers
|
48
|
+
it_behaves_like Coder::Cleaner
|
49
|
+
|
50
|
+
it { should support('UTF-8') }
|
51
|
+
it { should support('UCS-2BE') }
|
52
|
+
it { should support('UCS-4BE') }
|
43
53
|
end
|
54
|
+
|
55
|
+
describe Coder::Cleaner::Builtin do
|
56
|
+
include CleanHelpers
|
57
|
+
it_behaves_like Coder::Cleaner
|
58
|
+
|
59
|
+
it { should support('UTF-8') }
|
60
|
+
it { should support('UCS-2BE') }
|
61
|
+
it { should support('UCS-4BE') }
|
62
|
+
end if Coder::Cleaner::Builtin.available?
|
63
|
+
|
64
|
+
describe Coder::Cleaner::Java do
|
65
|
+
include CleanHelpers
|
66
|
+
it_behaves_like Coder::Cleaner
|
67
|
+
|
68
|
+
it { should support('UTF-8') }
|
69
|
+
it { should_not support('UCS-2BE') }
|
70
|
+
it { should_not support('UCS-4BE') }
|
71
|
+
end if Coder::Cleaner::Java.available?
|
72
|
+
|
73
|
+
describe Coder::Cleaner::Iconv do
|
74
|
+
include CleanHelpers
|
75
|
+
it_behaves_like Coder::Cleaner
|
76
|
+
|
77
|
+
it { should support('UTF-8') }
|
78
|
+
it { should_not support('UCS-2BE') }
|
79
|
+
it { should_not support('UCS-4BE') }
|
80
|
+
end if Coder::Cleaner::Iconv.available?
|
data/spec/support/clean_helpers.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
module CleanHelpers
|
2
|
+
module ClassMethods
|
3
|
+
def encoding(encoding, &block)
|
4
|
+
return unless described_class.supports? encoding
|
5
|
+
context(encoding) do
|
6
|
+
let(:encoding) { encoding}
|
7
|
+
instance_eval(&block)
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
def cleans(from, to = from)
|
12
|
+
it "cleans #{from.inspect} to #{to.inspect}" do
|
13
|
+
result = described_class.new(encoding).clean(binary(from))
|
14
|
+
binary(result).should be == binary(to)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def binary(str)
|
20
|
+
return str unless str.respond_to? :force_encoding
|
21
|
+
str.force_encoding('binary')
|
22
|
+
end
|
23
|
+
|
24
|
+
def support(encoding)
|
25
|
+
be_supports(encoding)
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.append_features(obj)
|
29
|
+
obj.extend ClassMethods
|
30
|
+
obj.subject { obj.described_class }
|
31
|
+
super
|
32
|
+
end
|
33
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: coder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -56,6 +56,7 @@ files:
|
|
56
56
|
- LICENSE.txt
|
57
57
|
- README.md
|
58
58
|
- Rakefile
|
59
|
+
- bench.rb
|
59
60
|
- coder.gemspec
|
60
61
|
- lib/coder.rb
|
61
62
|
- lib/coder/cleaner.rb
|
@@ -68,6 +69,7 @@ files:
|
|
68
69
|
- lib/coder/error.rb
|
69
70
|
- lib/coder/version.rb
|
70
71
|
- spec/coder/cleaner_spec.rb
|
72
|
+
- spec/support/clean_helpers.rb
|
71
73
|
homepage: http://github.com/rkh/coder
|
72
74
|
licenses: []
|
73
75
|
post_install_message:
|
@@ -94,4 +96,5 @@ specification_version: 3
|
|
94
96
|
summary: library to handle encodings
|
95
97
|
test_files:
|
96
98
|
- spec/coder/cleaner_spec.rb
|
99
|
+
- spec/support/clean_helpers.rb
|
97
100
|
has_rdoc:
|