coder 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/.travis.yml +16 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +37 -0
- data/Rakefile +2 -0
- data/coder.gemspec +22 -0
- data/lib/coder.rb +14 -0
- data/lib/coder/cleaner.rb +13 -0
- data/lib/coder/cleaner/builtin.rb +43 -0
- data/lib/coder/cleaner/iconv.rb +40 -0
- data/lib/coder/cleaner/simple.rb +32 -0
- data/lib/coder/cleaner/simple/byte_buffer.rb +53 -0
- data/lib/coder/cleaner/simple/encodings.rb +82 -0
- data/lib/coder/error.rb +7 -0
- data/lib/coder/version.rb +3 -0
- data/spec/coder/cleaner_spec.rb +33 -0
- metadata +96 -0
data/.gitignore
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
rvm:
|
2
|
+
- 1.8.7
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- rbx-18mode
|
6
|
+
- rbx-19mode
|
7
|
+
- jruby-18mode
|
8
|
+
- jruby-19mode
|
9
|
+
- jruby-head
|
10
|
+
- ruby-head
|
11
|
+
before_install:
|
12
|
+
- export JRUBY_OPTS="--server -Xcext.enabled=false -Xcompile.invokedynamic=false"
|
13
|
+
matrix:
|
14
|
+
allow_failures:
|
15
|
+
- rvm: ruby-head
|
16
|
+
- rvm: jruby-head
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Konstantin Haase
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# Coder
|
2
|
+
|
3
|
+
A Ruby library for dealing with encodings. Whether you're on 1.8 or 1.9, JRuby
|
4
|
+
or Rubinius, and whether or not you have a working iconv, it chooses the best
|
5
|
+
way for you to handle String encodings.
|
6
|
+
|
7
|
+
## Usage
|
8
|
+
|
9
|
+
At the moment, Coder only cleans strings for you. I plan to add string
|
10
|
+
conversion and encoding detection later.
|
11
|
+
|
12
|
+
### Cleaning Strings
|
13
|
+
|
14
|
+
``` ruby
|
15
|
+
clean_string = Coder.clean(dirty_string)
|
16
|
+
```
|
17
|
+
|
18
|
+
You can also specify the encoding:
|
19
|
+
|
20
|
+
|
21
|
+
``` ruby
|
22
|
+
clean_string = Coder.clean(dirty_string, 'UTF-8')
|
23
|
+
```
|
24
|
+
|
25
|
+
You can also modify a string in-place:
|
26
|
+
|
27
|
+
``` ruby
|
28
|
+
Coder.clean! some_string
|
29
|
+
```
|
30
|
+
|
31
|
+
## Contributing
|
32
|
+
|
33
|
+
1. Fork it
|
34
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
35
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
36
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
37
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/coder.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'coder/version'
|
5
|
+
|
6
|
+
# Gem specification for coder. The version is read from Coder::VERSION and the
# packaged file list is taken from the files tracked by git.
Gem::Specification.new do |gem|
  gem.name          = "coder"
  gem.version       = Coder::VERSION
  gem.authors       = ["Konstantin Haase"]
  gem.email         = ["konstantin.mailinglists@googlemail.com"]
  gem.description   = %q{handle encodings, no matter what}
  gem.summary       = %q{library to handle encodings}
  gem.homepage      = "http://github.com/rkh/coder"
  # Declare the license so the packaged metadata matches the bundled
  # LICENSE.txt (MIT) instead of shipping an empty license list.
  gem.license       = "MIT"

  # Everything tracked by git is packaged; executables and test files are
  # derived from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency("rspec", "~> 2.11")
  gem.add_development_dependency("rake")
end
|
data/lib/coder.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'coder/version'
|
2
|
+
require 'coder/cleaner'
|
3
|
+
|
4
|
+
# Public entry point of the library. All methods are callable directly on the
# module (Coder.clean, Coder.clean!).
module Coder
  extend self

  # Returns a cleaned copy of +str+.
  #
  # str      - the String to clean.
  # encoding - name of the encoding to clean for; nil means 'UTF-8'.
  def clean(str, encoding = nil)
    cleaner = Cleaner.new(encoding || 'UTF-8')
    cleaner.clean(str)
  end

  # Cleans +str+ in place by replacing its content with the cleaned version.
  # Returns +str+ itself.
  def clean!(str, encoding = nil)
    cleaned = clean(str, encoding)
    str.replace(cleaned)
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'coder/cleaner/builtin'
|
2
|
+
require 'coder/cleaner/iconv'
|
3
|
+
require 'coder/cleaner/simple'
|
4
|
+
|
5
|
+
module Coder
  # Factory for cleaner objects. Picks one backend when the file is loaded and
  # hands out instances of it.
  module Cleaner
    # First backend, in the listed order, that reports itself as available.
    Default = [Builtin, Iconv, Simple].find { |backend| backend.available? }

    class << self
      # Builds a cleaner for +encoding+ using the Default backend.
      def new(encoding)
        Default.new(encoding)
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'coder/error'
|
2
|
+
|
3
|
+
module Coder
  module Cleaner
    # Cleaner backed by Ruby's own String#encode / String#force_encoding.
    class Builtin
      # Transcoding options: drop undefined and invalid sequences. Kept as a
      # public constant for compatibility; #clean spells the same options out
      # inline so that they reach String#encode as keyword options.
      OPTIONS = { :undef => :replace, :invalid => :replace, :replace => "" }

      # Truthy when the running Ruby offers the Encoding API this backend uses.
      def self.available?
        defined? Encoding.find and
        defined? EncodingError and
        String.method_defined? :encode and
        String.method_defined? :force_encoding
      end

      # encoding - name of the encoding to clean for (anything responding to
      #            to_s); it is upcased before use.
      #
      # Raises Coder::InvalidEncoding if Ruby does not know the encoding.
      def initialize(encoding)
        @encoding = encoding.to_s.upcase
        # On Rubies that need it, transcode through a different encoding.
        # NOTE(review): presumably because a same-encoding encode is a no-op
        # on Ruby < 2.0 -- confirm.
        @dummy = @encoding == 'UTF-8' ? 'UTF-16BE' : 'UTF-8' if needs_dummy?
        @dummy ||= @encoding

        check_encoding
      end

      # Returns a copy of +str+ tagged with the target encoding, with invalid
      # or undefined sequences and NUL characters removed.
      #
      # Raises Coder::Error when Ruby reports an EncodingError.
      def clean(str)
        str = str.dup.force_encoding(@encoding)
        # The options are written as a brace-less trailing hash on purpose: it
        # parses on every Ruby version and is passed as keyword arguments on
        # Ruby 3, where a positional Hash would be taken for the
        # source-encoding argument instead.
        replaced = str.encode(@dummy, :undef => :replace, :invalid => :replace, :replace => "")
        replaced.encode(@encoding).gsub("\0", "")
      rescue EncodingError => e
        raise Coder::Error, e.message
      end

      private

      # Verifies that Ruby knows the requested encoding.
      def check_encoding
        Encoding.find(@encoding)
      rescue ArgumentError => e
        raise Coder::InvalidEncoding, e.message
      end

      # True on Ruby 1.x. Compares the numeric major version; a plain string
      # comparison against '2.0' would misorder versions such as "10.0".
      def needs_dummy?
        RUBY_VERSION.to_i < 2
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'coder/error'
|
2
|
+
require 'stringio'
|
3
|
+
|
4
|
+
module Coder
  module Cleaner
    # Cleaner backed by the iconv library, which is loaded lazily.
    class Iconv
      # Requires 'iconv' unless ::Iconv is already defined. $stderr is swapped
      # for a StringIO while requiring, so anything the require prints there is
      # discarded, and it is restored afterwards even if the require raises.
      def self.load_iconv
        return if defined? ::Iconv
        stderr_was, $stderr = $stderr, StringIO.new
        require 'iconv'
      ensure
        $stderr = stderr_was if stderr_was
      end

      # Makes sure iconv is loaded before an instance is created.
      def self.new(*)
        load_iconv
        super
      end

      # Returns true when iconv can be loaded and a probe conversion with
      # //ignore succeeds, false otherwise.
      def self.available?
        load_iconv
        !!::Iconv.conv("iso-8859-1//ignore", "utf-8", "\305\253" + "a"*8160)
      rescue StandardError, ScriptError
        # ScriptError covers the LoadError of a missing iconv. Interrupts and
        # exit requests are deliberately no longer swallowed.
        false
      end

      # encoding - name of the encoding to clean for.
      #
      # Raises Coder::InvalidEncoding if iconv rejects the encoding.
      def initialize(encoding)
        @iconv = ::Iconv.new("#{encoding}//ignore", encoding.to_s)
      rescue ::Iconv::InvalidEncoding => e
        raise Coder::InvalidEncoding, e.message
      end

      # Returns +str+ run through iconv (with //ignore on the target) and with
      # NUL characters removed.
      #
      # Raises Coder::Error on an iconv failure.
      def clean(str)
        @iconv.iconv(str).gsub("\0", "")
      rescue ::Iconv::Failure => e
        raise Coder::Error, e.message
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'coder/error'
|
2
|
+
require 'coder/cleaner/simple/byte_buffer'
|
3
|
+
require 'coder/cleaner/simple/encodings'
|
4
|
+
|
5
|
+
module Coder
  module Cleaner
    # Pure-Ruby cleaner that filters a string byte by byte using the rule
    # modules in Encodings. It is always available.
    class Simple
      def self.available?
        true
      end

      # encoding - name of the encoding to clean for. It is mapped to a rule
      #            module by upcasing and replacing '-' with '_'
      #            (e.g. 'utf-8' => Encodings::UTF_8).
      #
      # Raises Coder::InvalidEncoding if no such rule module exists.
      def initialize(encoding)
        const_name = encoding.to_s.upcase.gsub('-', '_')
        # Compare against the constants Encodings itself provides. Unlike
        # const_defined?, this neither raises NameError for names that are not
        # legal constant names nor matches top-level constants such as ENV.
        known = Encodings.constants.map { |name| name.to_s }
        raise Coder::InvalidEncoding, "unknown encoding name - #{encoding}" unless known.include? const_name
        @encoding, @name = Encodings.const_get(const_name), encoding
      end

      # Feeds every byte of +str+ through a ByteBuffer and returns the bytes
      # that survive as a new String, tagged with the requested encoding where
      # Strings support that.
      def clean(str)
        bytes = ByteBuffer.new(@encoding)
        str.each_byte { |b| bytes << b }
        force_encoding bytes.to_s
      end

      private

      # Tags +str+ with the requested encoding name; does nothing on Rubies
      # whose Strings have no force_encoding.
      def force_encoding(str)
        return str unless str.respond_to? :force_encoding
        str.force_encoding(@name)
      end
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module Coder
  module Cleaner
    class Simple
      # Collects the bytes of a string that an encoding rule object accepts.
      #
      # Bytes are pushed one at a time with <<. Single-byte characters go
      # straight to +bytes+; the parts of a multi-byte character are held in
      # +buffer+ until +outstanding+ bytes have arrived, and are discarded if
      # the sequence is broken off.
      class ByteBuffer
        attr_accessor :encoding, :bytes, :buffer, :outstanding

        # encoding - object answering garbage?, single_byte?, multibyte?,
        #            multibyte_start? and multibyte_size for (byte, buffer).
        def initialize(encoding)
          @encoding = encoding
          @bytes    = []
          clear_buffer
        end

        # Classifies +byte+ with the encoding rules, asked in a fixed order,
        # and either stores it, buffers it or throws the pending buffer away.
        def <<(byte)
          return clear_buffer if encoding.garbage?(byte, buffer)
          return add(byte) if encoding.single_byte?(byte, buffer)
          return fill_buffer(byte) if encoding.multibyte?(byte, buffer)
          return clear_buffer unless encoding.multibyte_start?(byte, buffer)

          start_buffer(byte, encoding.multibyte_size(byte, buffer))
        end

        # The accepted bytes packed into a String.
        def to_s
          bytes.pack('C*')
        end

        private

        # Drops any partially collected character.
        def clear_buffer
          start_buffer(nil, 0)
        end

        # Starts collecting a character of +size+ bytes, beginning with +byte+
        # (a nil byte yields an empty buffer).
        def start_buffer(byte, size)
          @buffer      = Array(byte)
          @outstanding = size
        end

        # Appends a continuation byte. A completed character is moved to
        # +bytes+; a buffer that has grown past the expected size is dropped.
        def fill_buffer(byte)
          buffer.push(byte)
          add(buffer) if buffer.size == outstanding
          # After a successful add the buffer is empty again, so this only
          # fires for a byte that arrived without a matching start byte.
          clear_buffer if buffer.size > outstanding
        end

        # Moves +input+ (a byte or an array of bytes) to the accepted bytes and
        # resets the pending buffer.
        def add(input)
          accepted = Array(input) # keep a handle on the old buffer array
          clear_buffer
          bytes.concat(accepted)
        end
      end
    end
  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module Coder
  module Cleaner
    class Simple
      # Byte classification rules, one module per supported encoding. Every
      # method receives the incoming byte (+input+) and the bytes of the
      # character collected so far (+buffered+).
      module Encodings
        # Note: This currently does not remove most overlong forms
        module UTF_8
          extend self

          # True when +input+ can never be part of well-formed UTF-8, or when
          # appending it to +buffered+ would encode something above U+10FFFF.
          def garbage?(input, buffered)
            return true if input > 244 || input == 192 || input == 193
            # Only a continuation byte can extend the pending sequence. Any
            # other byte ends it and has to be judged on its own, otherwise a
            # valid byte following a broken sequence would be thrown away.
            return false unless multibyte?(input, buffered)
            # F4 8F BF BF (U+10FFFF) is the largest valid sequence.
            ((buffered + [input]) <=> [244, 143, 191, 191]) == 1
          end

          # ASCII except NUL.
          def single_byte?(input, buffered)
            input.between? 1, 127
          end

          # Lead byte of a multi-byte sequence.
          def multibyte_start?(input, buffered)
            input.between? 192, 244
          end

          # Continuation byte.
          def multibyte?(input, buffered)
            input.between? 128, 191
          end

          # Number of bytes in the character introduced by +input+, or 0 if
          # +input+ cannot start a character.
          def multibyte_size(input, buffered)
            case input
            when 1..127   then 1
            when 192..223 then 2
            when 224..239 then 3
            when 240..244 then 4
            else 0
            end
          end
        end

        module UCS_2
          extend self

          # True when +input+ would complete an all-zero code unit, i.e. a NUL
          # character.
          def garbage?(input, buffered)
            return false unless buffered.size + 1 == multibyte_size
            input == 0 && buffered.all? { |b| b == 0 }
          end

          # There are no single-byte characters in this encoding.
          def single_byte?(input, buffered)
            false
          end

          # A new character starts whenever the buffer holds a whole number of
          # code units (including none).
          def multibyte_start?(input, buffered)
            buffered.size % multibyte_size == 0
          end

          def multibyte?(input, buffered)
            !multibyte_start?(input, buffered)
          end

          # Every character is two bytes wide.
          def multibyte_size(*)
            2
          end
        end

        # Same rules as UCS_2, with four-byte characters.
        module UCS_4
          include UCS_2
          extend self

          def garbage?(input, buffered)
            # NOTE(review): +input+ is a single byte when called from
            # ByteBuffer, so the range check below can never be true there. It
            # looks like it was meant to test the assembled code point, but
            # the byte order is not visible from here -- confirm before
            # changing it.
            super(input, buffered) || input > 0x10FFFF
          end

          # Every character is four bytes wide.
          def multibyte_size(*)
            4
          end
        end
      end
    end
  end
end
|
data/lib/coder/error.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'coder/cleaner'
|
2
|
+
|
3
|
+
# Shared contract for cleaner backends. The description of the surrounding
# context ("UTF-8") doubles as the encoding handed to the backend.
shared_examples Coder::Cleaner do
  let(:encoding) { example.example_group.description }
  subject { described_class.new(encoding) }

  # Defines an example asserting that cleaning +from+ yields +to+; when +to+
  # is omitted the input is expected to come back unchanged.
  def self.cleans(from, to = from)
    it "cleans #{from.inspect} to #{to.inspect}" do
      subject.clean(from).should be == to
    end
  end

  context "UTF-8" do
    cleans "foo"
    cleans ""
    cleans "yummy 🍔 "

    # Only the stray \xC3 byte is removed, so both surrounding spaces remain.
    cleans "{foo \xC3 'bar'}", "{foo  'bar'}"
    cleans "yummy\xE2 \xF0\x9F\x8D\x94 \x9F\x8D\x94", "yummy 🍔 "
  end
end

# Each backend is only exercised where it is usable.
describe Coder::Cleaner::Builtin do
  it_behaves_like Coder::Cleaner if described_class.available?
end

describe Coder::Cleaner::Iconv do
  it_behaves_like Coder::Cleaner if described_class.available?
end

describe Coder::Cleaner::Simple do
  it_behaves_like Coder::Cleaner if described_class.available?
end
|
metadata
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: coder
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Konstantin Haase
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-21 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '2.11'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '2.11'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rake
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
description: handle encodings, no matter what
|
47
|
+
email:
|
48
|
+
- konstantin.mailinglists@googlemail.com
|
49
|
+
executables: []
|
50
|
+
extensions: []
|
51
|
+
extra_rdoc_files: []
|
52
|
+
files:
|
53
|
+
- .gitignore
|
54
|
+
- .travis.yml
|
55
|
+
- Gemfile
|
56
|
+
- LICENSE.txt
|
57
|
+
- README.md
|
58
|
+
- Rakefile
|
59
|
+
- coder.gemspec
|
60
|
+
- lib/coder.rb
|
61
|
+
- lib/coder/cleaner.rb
|
62
|
+
- lib/coder/cleaner/builtin.rb
|
63
|
+
- lib/coder/cleaner/iconv.rb
|
64
|
+
- lib/coder/cleaner/simple.rb
|
65
|
+
- lib/coder/cleaner/simple/byte_buffer.rb
|
66
|
+
- lib/coder/cleaner/simple/encodings.rb
|
67
|
+
- lib/coder/error.rb
|
68
|
+
- lib/coder/version.rb
|
69
|
+
- spec/coder/cleaner_spec.rb
|
70
|
+
homepage: http://github.com/rkh/coder
|
71
|
+
licenses: []
|
72
|
+
post_install_message:
|
73
|
+
rdoc_options: []
|
74
|
+
require_paths:
|
75
|
+
- lib
|
76
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
77
|
+
none: false
|
78
|
+
requirements:
|
79
|
+
- - ! '>='
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '0'
|
82
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
requirements: []
|
89
|
+
rubyforge_project:
|
90
|
+
rubygems_version: 1.8.23
|
91
|
+
signing_key:
|
92
|
+
specification_version: 3
|
93
|
+
summary: library to handle encodings
|
94
|
+
test_files:
|
95
|
+
- spec/coder/cleaner_spec.rb
|
96
|
+
has_rdoc:
|