coder 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.travis.yml +16 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +37 -0
- data/Rakefile +2 -0
- data/coder.gemspec +22 -0
- data/lib/coder.rb +14 -0
- data/lib/coder/cleaner.rb +13 -0
- data/lib/coder/cleaner/builtin.rb +43 -0
- data/lib/coder/cleaner/iconv.rb +40 -0
- data/lib/coder/cleaner/simple.rb +32 -0
- data/lib/coder/cleaner/simple/byte_buffer.rb +53 -0
- data/lib/coder/cleaner/simple/encodings.rb +82 -0
- data/lib/coder/error.rb +7 -0
- data/lib/coder/version.rb +3 -0
- data/spec/coder/cleaner_spec.rb +33 -0
- metadata +96 -0
data/.gitignore
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
rvm:
|
2
|
+
- 1.8.7
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- rbx-18mode
|
6
|
+
- rbx-19mode
|
7
|
+
- jruby-18mode
|
8
|
+
- jruby-19mode
|
9
|
+
- jruby-head
|
10
|
+
- ruby-head
|
11
|
+
before_install:
|
12
|
+
- export JRUBY_OPTS="--server -Xcext.enabled=false -Xcompile.invokedynamic=false"
|
13
|
+
matrix:
|
14
|
+
allow_failures:
|
15
|
+
- rvm: ruby-head
|
16
|
+
- rvm: jruby-head
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Konstantin Haase
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# Coder
|
2
|
+
|
3
|
+
A Ruby library to deal with encodings. No matter if you're on 1.8 or 1.9, JRuby
|
4
|
+
or Rubinius, if you have a working iconv or not, it chooses the best way for you
|
5
|
+
to handle String encodings.
|
6
|
+
|
7
|
+
## Usage
|
8
|
+
|
9
|
+
At the moment, Coder only cleans strings for you. I plan to add string
|
10
|
+
conversion and encoding detection later.
|
11
|
+
|
12
|
+
### Cleaning Strings
|
13
|
+
|
14
|
+
``` ruby
|
15
|
+
clean_string = Coder.clean(dirty_string)
|
16
|
+
```
|
17
|
+
|
18
|
+
You can also specify the encoding:
|
19
|
+
|
20
|
+
|
21
|
+
``` ruby
|
22
|
+
clean_string = Coder.clean(dirty_string, 'UTF-8')
|
23
|
+
```
|
24
|
+
|
25
|
+
You can also modify a string in-place:
|
26
|
+
|
27
|
+
``` ruby
|
28
|
+
Coder.clean! some_string
|
29
|
+
```
|
30
|
+
|
31
|
+
## Contributing
|
32
|
+
|
33
|
+
1. Fork it
|
34
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
35
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
36
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
37
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/coder.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'coder/version'
|
5
|
+
|
6
|
+
# Gem specification for coder. The packaged file list comes from
# `git ls-files`, so the gem has to be built from inside a git checkout.
Gem::Specification.new do |gem|
  gem.name          = "coder"
  gem.version       = Coder::VERSION
  gem.authors       = ["Konstantin Haase"]
  gem.email         = ["konstantin.mailinglists@googlemail.com"]
  gem.description   = %q{handle encodings, no matter what}
  gem.summary       = %q{library to handle encodings}
  gem.homepage      = "http://github.com/rkh/coder"
  # The project ships an MIT LICENSE.txt; declare it so the built gem's
  # metadata no longer reports an empty license list.
  gem.licenses      = ["MIT"]

  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency("rspec", "~> 2.11")
  gem.add_development_dependency("rake")
end
|
data/lib/coder.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'coder/version'
|
2
|
+
require 'coder/cleaner'
|
3
|
+
|
4
|
+
# Public entry point of the library. Both methods are callable directly on
# the module (Coder.clean / Coder.clean!) because the module extends itself.
module Coder
  extend self

  # Returns the result of running +str+ through a Cleaner for +encoding+.
  # When +encoding+ is nil (or false) 'UTF-8' is used.
  def clean(str, encoding = nil)
    target  = encoding || 'UTF-8'
    cleaner = Cleaner.new(target)
    cleaner.clean(str)
  end

  # In-place variant of #clean: replaces the contents of +str+ with the
  # cleaned text and returns +str+.
  def clean!(str, encoding = nil)
    cleaned = clean(str, encoding)
    str.replace(cleaned)
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'coder/cleaner/builtin'
|
2
|
+
require 'coder/cleaner/iconv'
|
3
|
+
require 'coder/cleaner/simple'
|
4
|
+
|
5
|
+
module Coder
|
6
|
+
# Namespace for the cleaning backends and factory for the preferred one.
module Cleaner
  # The first backend, in order of preference (Builtin, Iconv, Simple), whose
  # available? check succeeds. Evaluated once, when this file is loaded.
  Default = [Builtin, Iconv, Simple].detect do |backend|
    backend.available?
  end

  class << self
    # Builds an instance of the Default backend for +encoding+.
    def new(encoding)
      Default.new(encoding)
    end
  end
end
|
13
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'coder/error'
|
2
|
+
|
3
|
+
module Coder
|
4
|
+
module Cleaner
|
5
|
+
# Cleaner backend built on Ruby's own transcoding support
# (String#encode and String#force_encoding).
class Builtin
  # Transcoding options: invalid and undefined sequences are replaced by the
  # empty string, i.e. dropped. Kept as a constant for existing references;
  # #clean spells the same options out at the call site (see the note there).
  OPTIONS = { :undef => :replace, :invalid => :replace, :replace => "" }

  # Truthy when the running Ruby provides the encoding API this backend uses.
  def self.available?
    defined?(Encoding.find) &&
      defined?(EncodingError) &&
      String.method_defined?(:encode) &&
      String.method_defined?(:force_encoding)
  end

  # encoding - name of the encoding to clean for (anything responding to to_s).
  #
  # Raises Coder::InvalidEncoding if Encoding.find does not know the name.
  def initialize(encoding)
    @encoding = encoding.to_s.upcase
    # Rubies before 2.0 are sent through an intermediate ("dummy") encoding
    # that differs from the target; newer ones transcode to the target itself.
    # Presumably a same-encoding String#encode does not scrub on old Rubies.
    @dummy = if needs_dummy?
      @encoding == 'UTF-8' ? 'UTF-16BE' : 'UTF-8'
    else
      @encoding
    end

    check_encoding
  end

  # Returns a copy of +str+ tagged as the target encoding, with invalid or
  # undefined sequences and NUL characters removed. +str+ is not modified.
  #
  # Raises Coder::Error if transcoding raises an EncodingError.
  def clean(str)
    str = str.dup.force_encoding(@encoding)
    # The options are written inline instead of passing OPTIONS: a brace-less
    # trailing hash is accepted as keyword arguments by Ruby 3 and still
    # parses on 1.8/1.9, whereas a positional Hash is no longer treated as
    # options by String#encode on Ruby 3.
    str.encode(@dummy, :undef => :replace, :invalid => :replace, :replace => "").
      encode(@encoding).
      gsub("\0", "")
  rescue EncodingError => e
    raise Coder::Error, e.message
  end

  private

  # Verifies that Ruby knows the requested encoding name.
  def check_encoding
    Encoding.find(@encoding)
  rescue ArgumentError => e
    raise Coder::InvalidEncoding, e.message
  end

  # True on Ruby 1.x. Compares the major version numerically so that a
  # two-digit major version is not mistaken for an old Ruby.
  def needs_dummy?
    RUBY_VERSION.to_i < 2
  end
end
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'coder/error'
|
2
|
+
require 'stringio'
|
3
|
+
|
4
|
+
module Coder
|
5
|
+
module Cleaner
|
6
|
+
# Cleaner backend that delegates to the ::Iconv library.
class Iconv
  # Requires 'iconv' unless ::Iconv is already defined. $stderr is swapped
  # for a StringIO while requiring (presumably to hide the warning some
  # Rubies print when iconv is loaded) and is always restored afterwards.
  def self.load_iconv
    return if defined? ::Iconv
    stderr_was, $stderr = $stderr, StringIO.new
    require 'iconv'
  ensure
    $stderr = stderr_was if stderr_was
  end

  # Makes sure ::Iconv is loaded before an instance is built.
  def self.new(*)
    load_iconv
    super
  end

  # True when ::Iconv can be loaded and completes a probe conversion using
  # "//ignore"; false when loading or converting fails.
  #
  # Only StandardError and ScriptError (which covers the LoadError raised when
  # iconv is missing) are treated as "not available", so Interrupt, SystemExit
  # and similar are no longer swallowed.
  def self.available?
    load_iconv
    !!::Iconv.conv("iso-8859-1//ignore", "utf-8", "\305\253" + "a"*8160)
  rescue StandardError, ScriptError
    false
  end

  # encoding - name of the encoding to clean for.
  #
  # Raises Coder::InvalidEncoding if ::Iconv rejects the encoding name.
  def initialize(encoding)
    # Same source and target encoding; "//ignore" is appended to the target.
    @iconv = ::Iconv.new("#{encoding}//ignore", encoding.to_s)
  rescue ::Iconv::InvalidEncoding => e
    raise Coder::InvalidEncoding, e.message
  end

  # Returns +str+ run through the converter, with NUL characters removed.
  #
  # Raises Coder::Error if the conversion raises ::Iconv::Failure.
  def clean(str)
    @iconv.iconv(str).gsub("\0", "")
  rescue ::Iconv::Failure => e
    raise Coder::Error, e.message
  end
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'coder/error'
|
2
|
+
require 'coder/cleaner/simple/byte_buffer'
|
3
|
+
require 'coder/cleaner/simple/encodings'
|
4
|
+
|
5
|
+
module Coder
|
6
|
+
module Cleaner
|
7
|
+
# Pure-Ruby cleaner backend. It feeds the input byte by byte through a
# ByteBuffer driven by one of the rule modules defined under Encodings.
class Simple
  # This backend has no external requirements.
  def self.available?
    true
  end

  # encoding - encoding name (anything responding to to_s). It is upcased and
  #            '-' becomes '_' to find the rule module, e.g. 'utf-8' => UTF_8.
  #
  # Raises Coder::InvalidEncoding if Encodings defines no such module.
  def initialize(encoding)
    const_name = encoding.to_s.upcase.gsub('-', '_')
    # Compare against the constants Encodings itself defines. Unlike
    # const_defined? this neither raises NameError for strings that are not
    # valid constant names nor matches top-level constants such as ENV.
    known = Encodings.constants.map { |c| c.to_s }
    unless known.include?(const_name)
      raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}"
    end
    # The name is stored as a String so Symbol arguments work in
    # #force_encoding, like in the other backends (which call to_s too).
    @encoding, @name = Encodings.const_get(const_name), encoding.to_s
  end

  # Returns a new string made of the bytes of +str+ that the ByteBuffer
  # accepted, tagged with the requested encoding where Ruby supports that.
  def clean(str)
    bytes = ByteBuffer.new(@encoding)
    str.each_byte { |b| bytes << b }
    force_encoding bytes.to_s
  end

  private

  # Tags +str+ with the requested encoding; a no-op on Rubies whose strings
  # do not respond to force_encoding.
  def force_encoding(str)
    return str unless str.respond_to? :force_encoding
    str.force_encoding(@name)
  end
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module Coder
|
2
|
+
module Cleaner
|
3
|
+
class Simple
|
4
|
+
# Collects the bytes an encoding rule object accepts.
#
# bytes       - accepted output bytes
# buffer      - bytes of the multibyte sequence currently being read
# outstanding - total size the sequence in +buffer+ has to reach
# encoding    - object answering garbage?, single_byte?, multibyte?,
#               multibyte_start? and multibyte_size for (byte, buffer)
class ByteBuffer
  attr_accessor :encoding, :bytes, :buffer, :outstanding

  def initialize(encoding)
    @encoding, @bytes = encoding, []
    clear_buffer
  end

  # Feeds one byte (an Integer) to the buffer. The rule object is asked, in
  # this order, whether the byte is garbage, a single-byte character, a
  # continuation of the pending sequence, or the start of a new sequence;
  # anything else discards the pending sequence.
  #
  # Returns self so appends can be chained (buf << a << b), as with the core
  # collection classes.
  def <<(byte)
    if encoding.garbage?(byte, buffer)
      clear_buffer
    elsif encoding.single_byte?(byte, buffer)
      add(byte)
    elsif encoding.multibyte?(byte, buffer)
      fill_buffer(byte)
    elsif encoding.multibyte_start?(byte, buffer)
      start_buffer(byte, encoding.multibyte_size(byte, buffer))
    else
      clear_buffer
    end
    self
  end

  # Returns the accepted bytes packed into a string.
  def to_s
    bytes.pack('C*')
  end

  private

  # Drops any pending multibyte sequence.
  def clear_buffer
    start_buffer(nil, 0)
  end

  # Starts a new pending sequence with +byte+ (nil means empty) that is
  # expected to grow to +size+ bytes.
  def start_buffer(byte, size)
    @buffer, @outstanding = Array(byte), size
  end

  # Appends a continuation byte. The sequence is accepted once it reaches the
  # expected size and discarded if it is longer than expected, which is how a
  # continuation byte without a preceding start byte gets dropped.
  def fill_buffer(byte)
    buffer << byte
    add(buffer) if buffer.size == outstanding
    clear_buffer if buffer.size > outstanding
  end

  # Accepts +input+ (a single byte or an array of bytes). The pending buffer
  # is reset first; +input+ may be the old buffer array, which stays intact
  # because resetting assigns a fresh array.
  def add(input)
    clear_buffer
    bytes.concat Array(input)
  end
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module Coder
|
2
|
+
module Cleaner
|
3
|
+
class Simple
|
4
|
+
module Encodings
|
5
|
+
# Byte classification rules for UTF-8. Each predicate receives the incoming
# byte (+input+, an Integer) and the bytes of the sequence read so far
# (+buffered+, an Array of Integers).
#
# Note: This currently does not remove most overlong forms
module UTF_8
  extend self

  # True for bytes that must be thrown away together with the pending
  # sequence:
  # * 0xC0, 0xC1 and everything above 0xF4, which never occur in UTF-8
  # * a continuation byte 0x90..0xBF directly after the lead byte 0xF4,
  #   which would encode a code point above U+10FFFF
  #
  # Only that continuation byte is rejected. Bytes following a broken
  # sequence (ASCII in particular) are judged on their own, and the final
  # byte of U+10FFFF (F4 8F BF BF) is accepted.
  def garbage?(input, buffered)
    return true if input > 244 || input == 192 || input == 193
    buffered == [244] && input.between?(144, 191)
  end

  # ASCII except NUL.
  def single_byte?(input, buffered)
    input.between? 1, 127
  end

  # Lead byte of a multibyte sequence.
  def multibyte_start?(input, buffered)
    input.between? 192, 244
  end

  # Continuation byte.
  def multibyte?(input, buffered)
    input.between? 128, 191
  end

  # Total length in bytes of the sequence introduced by +input+: 1 for ASCII,
  # 0 for bytes that cannot start a sequence. 0xF5..0xF7 still map to 4 as
  # before; garbage? rejects those bytes first.
  def multibyte_size(input, buffered)
    case input
    when 192..223 then 2
    when 224..239 then 3
    when 240..247 then 4
    when 1..127   then 1
    else 0
    end
  end
end
|
41
|
+
|
42
|
+
# Byte classification rules for fixed-width two-byte code units. Every
# predicate takes the incoming byte and the bytes buffered so far.
module UCS_2
  extend self

  # Only the byte that completes a unit can be garbage, and only when the
  # whole unit (buffered bytes plus this one) consists of zero bytes.
  def garbage?(input, buffered)
    return false if buffered.size + 1 != multibyte_size
    input.zero? && buffered.all? { |byte| byte.zero? }
  end

  # There are no single-byte characters in this encoding.
  def single_byte?(input, buffered)
    false
  end

  # A byte starts a unit when the buffered length is a multiple of the unit
  # size (which includes an empty buffer).
  def multibyte_start?(input, buffered)
    (buffered.size % multibyte_size).zero?
  end

  # Every byte that does not start a unit continues one.
  def multibyte?(input, buffered)
    !multibyte_start?(input, buffered)
  end

  # Unit size in bytes; arguments are accepted and ignored.
  def multibyte_size(*)
    2
  end
end
|
66
|
+
|
67
|
+
# Byte classification rules for four-byte code units. Reuses the UCS_2 rules
# with a unit size of 4 and one extra garbage condition.
module UCS_4
  include UCS_2
  extend self

  # Garbage when the UCS_2 rule says so (evaluated with this module's unit
  # size), or when +input+ exceeds 0x10FFFF.
  #
  # NOTE(review): when driven by ByteBuffer, +input+ looks like a single byte
  # value, so the second condition would never hold; checking the code point
  # presumably needs the buffered bytes and a known byte order - confirm
  # before changing.
  def garbage?(input, buffered)
    return true if super
    input > 0x10FFFF
  end

  # Unit size in bytes; arguments are accepted and ignored.
  def multibyte_size(*)
    4
  end
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
data/lib/coder/error.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'coder/cleaner'
|
2
|
+
|
3
|
+
# Behaviour shared by every cleaner backend. The name of the enclosing
# context ("UTF-8") doubles as the encoding handed to the backend.
shared_examples Coder::Cleaner do
  subject { described_class.new(encoding) }
  let(:encoding) { example.example_group.description }

  # Declares an example asserting that cleaning +from+ yields +to+
  # (by default the input is expected to come back unchanged).
  def self.cleans(from, to = from)
    it("cleans #{from.inspect} to #{to.inspect}") { subject.clean(from).should be == to }
  end

  context "UTF-8" do
    # Valid input passes through untouched.
    cleans "foo"
    cleans ""
    cleans "yummy 🍔 "

    # Invalid bytes are removed.
    cleans "{foo \xC3 'bar'}", "{foo 'bar'}"
    cleans "yummy\xE2 \xF0\x9F\x8D\x94 \x9F\x8D\x94", "yummy 🍔 "
  end
end

# Each backend is only exercised when it is usable on the running Ruby.
describe Coder::Cleaner::Builtin do
  it_behaves_like(Coder::Cleaner) if described_class.available?
end

describe Coder::Cleaner::Iconv do
  it_behaves_like(Coder::Cleaner) if described_class.available?
end

describe Coder::Cleaner::Simple do
  it_behaves_like(Coder::Cleaner) if described_class.available?
end
|
metadata
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: coder
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Konstantin Haase
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-21 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '2.11'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '2.11'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rake
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
description: handle encodings, no matter what
|
47
|
+
email:
|
48
|
+
- konstantin.mailinglists@googlemail.com
|
49
|
+
executables: []
|
50
|
+
extensions: []
|
51
|
+
extra_rdoc_files: []
|
52
|
+
files:
|
53
|
+
- .gitignore
|
54
|
+
- .travis.yml
|
55
|
+
- Gemfile
|
56
|
+
- LICENSE.txt
|
57
|
+
- README.md
|
58
|
+
- Rakefile
|
59
|
+
- coder.gemspec
|
60
|
+
- lib/coder.rb
|
61
|
+
- lib/coder/cleaner.rb
|
62
|
+
- lib/coder/cleaner/builtin.rb
|
63
|
+
- lib/coder/cleaner/iconv.rb
|
64
|
+
- lib/coder/cleaner/simple.rb
|
65
|
+
- lib/coder/cleaner/simple/byte_buffer.rb
|
66
|
+
- lib/coder/cleaner/simple/encodings.rb
|
67
|
+
- lib/coder/error.rb
|
68
|
+
- lib/coder/version.rb
|
69
|
+
- spec/coder/cleaner_spec.rb
|
70
|
+
homepage: http://github.com/rkh/coder
|
71
|
+
licenses: []
|
72
|
+
post_install_message:
|
73
|
+
rdoc_options: []
|
74
|
+
require_paths:
|
75
|
+
- lib
|
76
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
77
|
+
none: false
|
78
|
+
requirements:
|
79
|
+
- - ! '>='
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '0'
|
82
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
requirements: []
|
89
|
+
rubyforge_project:
|
90
|
+
rubygems_version: 1.8.23
|
91
|
+
signing_key:
|
92
|
+
specification_version: 3
|
93
|
+
summary: library to handle encodings
|
94
|
+
test_files:
|
95
|
+
- spec/coder/cleaner_spec.rb
|
96
|
+
has_rdoc:
|