ensure_valid_encoding 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +120 -0
- data/Rakefile +10 -0
- data/ensure_valid_encoding.gemspec +17 -0
- data/lib/ensure_valid_encoding.rb +91 -0
- data/lib/ensure_valid_encoding/version.rb +3 -0
- data/test/ensure_valid_encoding_test.rb +80 -0
- metadata +57 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Jonathan Rochkind
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
# EnsureValidEncoding
|
2
|
+
|
3
|
+
For ruby 1.9 strings, replace bad bytes in given encoding with replacement strings, _or_ fail quickly on invalid encodings -- _without_ a transcode to a different encoding.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'ensure_valid_encoding'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install ensure_valid_encoding
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
~~~ruby
|
22
|
+
# \xE9 is not valid UTF-8
|
23
|
+
bad_utf8 = "M\xE9xico".force_encoding("UTF-8")
|
24
|
+
|
25
|
+
EnsureValidEncoding.ensure_valid_encoding(bad_utf8)
|
26
|
+
# => raises an Encoding::InvalidByteSequenceError
|
27
|
+
# Note well, sadly, for performance and pain-in-the-neck reasons,
|
28
|
+
# this will not be filled out with byte number, preceding or succeeding
|
29
|
+
# bytes, or any other metadata normally included with an InvalidByteSequenceError
|
30
|
+
# from stdlib.
|
31
|
+
~~~~
|
32
|
+
|
33
|
+
Uses the same options as String#encode, `:invalid => :replace`, possibly
|
34
|
+
combined with `:replace => custom_replace_string` (which can be empty
|
35
|
+
string if you like).
|
36
|
+
|
37
|
+
~~~ruby
|
38
|
+
fixed = EnsureValidEncoding.ensure_valid_encoding(bad_utf8, :invalid => :replace)
|
39
|
+
# => Replaces invalid bytes with default replacement char.
|
40
|
+
# For unicode encodings, that's unicode replacement code, "\uFFFD",
|
41
|
+
# otherwise, '?'
|
42
|
+
|
43
|
+
fixed = EnsureValidEncoding.ensure_valid_encoding(bad_utf8, :invalid => :replace, :replace => "*")
|
44
|
+
# => "M*xico"
|
45
|
+
~~~
|
46
|
+
|
47
|
+
Mutate a string in-place with replacement chars? No problem, use the bang
|
48
|
+
version.
|
49
|
+
|
50
|
+
~~~ruby
|
51
|
+
EnsureValidEncoding.ensure_valid_encoding!(bad_utf8, :invalid => :replace)
|
52
|
+
# bad_utf8 has been mutated
|
53
|
+
~~~
|
54
|
+
|
55
|
+
For convenience, to save you some typing, the methods are defined as module instance
|
56
|
+
methods too:
|
57
|
+
|
58
|
+
~~~ruby
|
59
|
+
include EnsureValidEncoding
|
60
|
+
fixed = ensure_valid_encoding(bad_str)
|
61
|
+
~~~
|
62
|
+
|
63
|
+
## Rationale
|
64
|
+
|
65
|
+
You are taking textual input from some external source. Could be user input,
|
66
|
+
could be a user-uploaded file of some kind, could be a third-party API, could
|
67
|
+
be anything.
|
68
|
+
|
69
|
+
You know what character encoding the textual data _claims_ to be, what it
|
70
|
+
_should_ be, and what it _usually_ is. But occasionally it may have bad bytes
|
71
|
+
in it, due to data corruption, due to mistakes, due to mis-advertised encoding,
|
72
|
+
due to bugs upstream, due to anything.
|
73
|
+
|
74
|
+
What do you do? If you do nothing, in cases of such corruption, then
|
75
|
+
eventually your code will _probably_ (but not necessarily) do something
|
76
|
+
that causes some kind of exception to be raised, could be an
|
77
|
+
Encoding::InvalidByteSequenceError, could be something else from somewhere else.
|
78
|
+
May be hard to predict exactly when, if, and what will be raised.
|
79
|
+
But when it does happen, if you're not rescue'ing the exception,
|
80
|
+
your application dies hard.
|
81
|
+
|
82
|
+
Okay, so maybe you manage to catch the exceptions. Or more
|
83
|
+
conveniently you guard by testing `input_str.valid_encoding?` instead of waiting
|
84
|
+
for an exception to be raised. Then what? You could ignore this particular
|
85
|
+
file/stream of input, and have your application go on its merry way.
|
86
|
+
|
87
|
+
But what if you want to do what most _every other_ mature application that
|
88
|
+
displays textual streams does in the case of bad bytes? Display the parts of the
|
89
|
+
string that _can_ be displayed, replace the other parts with a replacement
|
90
|
+
string of some sort.
|
91
|
+
|
92
|
+
String#encode gives you an API for using a replacement char when converting/transcoding
|
93
|
+
from one encoding to another, but that's not where we are. We know what encoding
|
94
|
+
the string is supposed to be, we don't know any other better encoding to
|
95
|
+
transcode it to -- we just want to do the best we can with it, substituting
|
96
|
+
any illegal bytes. It's what `bash` does. It's what `vim` does. It is
|
97
|
+
surprisingly tricky to do with the ruby 1.9.3 stdlib, or even with 'iconv'.
|
98
|
+
|
99
|
+
So there you go, now you can do it with this gem. Maybe not as performant
|
100
|
+
as if it were implemented in C like stdlib char encoding routines, but, hey.
|
101
|
+
|
102
|
+
**Note well:** A buncha [people on reddit](http://www.reddit.com/r/ruby/comments/sfceq) don't see why you'd ever want to do
|
103
|
+
this. If you agree, then, well, don't do it.
|
104
|
+
|
105
|
+
**Also:** I have [filed a feature request](https://bugs.ruby-lang.org/issues/6321) for ruby stdlib. Developer from
|
106
|
+
ruby core team also thinks it's an unusually odd thing to do. shrug.
|
107
|
+
|
108
|
+
## Developing/Contributing
|
109
|
+
|
110
|
+
Some tests written with minitest/spec. Run with `rake test`.
|
111
|
+
|
112
|
+
Gem built with bundler rake tasks: `rake build`, `rake install`, `rake release`.
|
113
|
+
|
114
|
+
Suggestions/improvements welcome.
|
115
|
+
|
116
|
+
1. Fork it
|
117
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
118
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
119
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
120
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/ensure_valid_encoding/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for ensure_valid_encoding.
#
# Relies on EnsureValidEncoding::VERSION having been loaded (the version file
# is required above this block) and on `git` being available, because the
# file list is taken from `git ls-files`.
Gem::Specification.new do |gem|
  gem.name          = "ensure_valid_encoding"
  gem.version       = EnsureValidEncoding::VERSION
  gem.authors       = ["Jonathan Rochkind"]
  gem.email         = ["jonathan@dnil.net"]
  # Single-line summary; no trailing newline inside the literal.
  gem.summary       = %q{For ruby 1.9 strings, replace bad bytes in given encoding with replacement strings, _or_ fail quickly on invalid encodings -- _without_ a transcode to a different encoding.}
  gem.homepage      = "https://github.com/jrochkind/ensure_valid_encoding"
  # The gem ships an MIT LICENSE file; declare it so it shows up in metadata.
  gem.license       = "MIT"

  # `git ls-files` prints one path per line. Split on newlines explicitly:
  # `$\` is nil by default, which makes String#split break on *any*
  # whitespace and would mangle paths containing spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require "ensure_valid_encoding/version"
|
2
|
+
|
3
|
+
module EnsureValidEncoding

  # Pass in a string; this method promises the returned string will be
  # #valid_encoding? for the input's existing #encoding, or an exception
  # will be raised.
  #
  # With no options, an Encoding::InvalidByteSequenceError is raised
  # unless str.valid_encoding?. Unlike InvalidByteSequenceErrors raised by
  # stdlib, the exception carries _no_ byte position or preceding/succeeding
  # byte info.
  #
  # Or, just like String#encode, pass in :invalid => :replace to replace
  # each invalid byte sequence with a replacement string.
  #
  # Just like String#encode, the default replacement string is the Unicode
  # replacement char (U+FFFD) for Unicode encodings, or ascii "?" otherwise.
  # The default is transcoded into the string's encoding, so the result is
  # also valid for wide encodings such as UTF-16LE.
  #
  # Just like String#encode, you can set your own replacement string
  # (including the empty string) with `:replace => your_string`. A custom
  # replacement is used exactly as given; it is up to the caller to supply
  # one that is compatible with the string's encoding.
  #
  # Under ruby 1.8.x (or any ruby without String#encoding),
  # this method no-ops and just returns its input.
  #
  #     EnsureValidEncoding.ensure_valid_encoding( some_string )
  #
  #     include EnsureValidEncoding
  #     ensure_valid_encoding( some_string, :invalid => :replace)
  #     ensure_valid_encoding( some_string, :invalid => :replace, :replace => '')
  #     ensure_valid_encoding( some_string, :invalid => :replace, :replace => "*")
  #
  # Returns the input itself when it is already valid (no copy is made),
  # otherwise a new string.
  # Raises Encoding::InvalidByteSequenceError when the input is invalid and
  # :invalid => :replace was not given.
  def self.ensure_valid_encoding(str, options = {})
    # Can do nothing in ruby 1.8.x
    return str unless str.respond_to?(:encoding)

    # We believe it's fastest to use the built in #valid_encoding?
    # with its C implementation, and bail out immediately if there is
    # nothing more to do, rather than stepping through char by char
    # in cases where the string was valid in the first place.
    return str if str.valid_encoding?

    unless options[:invalid] == :replace
      # Not replacing: raise right away without walking the chars, for
      # performance. That means we cannot say which byte was bad, and the
      # exception isn't filled out with its usual attributes.
      raise Encoding::InvalidByteSequenceError.new("invalid byte in string for source encoding #{str.encoding.name}")
    end

    # :invalid => :replace -- walk the chars and swap out the bad ones.
    # The replacement is the same for every bad char, so work it out once.
    replacement = options[:replace] || default_replacement(str.encoding)

    str.each_char.map { |c| c.valid_encoding? ? c : replacement }.join
  end

  # Default replacement string for +encoding+: U+FFFD for encodings whose
  # name starts with "UTF", "?" otherwise.
  #
  # The character is transcoded with String#encode rather than relabelled
  # with #force_encoding, because relabelling the UTF-8 bytes of U+FFFD as
  # e.g. UTF-16LE yields an invalid byte sequence. If no conversion is
  # available we fall back to relabelling.
  def self.default_replacement(encoding)
    # surely there's a better way to tell if an encoding is a
    # 'Unicode encoding form' than looking at its name?
    char = encoding.name.start_with?('UTF') ? "\uFFFD" : "?"
    begin
      char.encode(encoding)
    rescue EncodingError
      char.dup.force_encoding(encoding)
    end
  end
  private_class_method :default_replacement

  # Just like #ensure_valid_encoding, but mutates the input string
  # if necessary to ensure validity (using String#replace), rather than
  # returning a new valid string. Returns the (possibly mutated) input.
  #
  # The input is only written to when something actually had to change, so
  # an already-valid frozen string is accepted without raising.
  #
  #     ensure_valid_encoding!( some_string, :invalid => :replace )
  def self.ensure_valid_encoding!(str, options = {})
    fixed = ensure_valid_encoding(str, options)
    str.replace(fixed) unless fixed.equal?(str)
    str
  end

  # Instance version, so you can type less.
  #
  #     include EnsureValidEncoding
  #     ensure_valid_encoding(bad_str)
  def ensure_valid_encoding(*args)
    EnsureValidEncoding.ensure_valid_encoding(*args)
  end

  # Instance version of the mutating variant.
  def ensure_valid_encoding!(*args)
    EnsureValidEncoding.ensure_valid_encoding!(*args)
  end

end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'minitest/spec'
|
2
|
+
require 'minitest/autorun'
|
3
|
+
|
4
|
+
require 'ensure_valid_encoding'
|
5
|
+
|
6
|
+
describe EnsureValidEncoding do
|
7
|
+
before do
|
8
|
+
@bad_bytes_utf8 = "M\xE9xico".force_encoding("UTF-8")
|
9
|
+
@bad_bytes_ascii = "M\xA1xico".force_encoding("ASCII")
|
10
|
+
end
|
11
|
+
|
12
|
+
it "raises on invalid bytes" do
|
13
|
+
proc do
|
14
|
+
EnsureValidEncoding.ensure_valid_encoding( @bad_bytes_utf8 )
|
15
|
+
end.must_raise Encoding::InvalidByteSequenceError
|
16
|
+
end
|
17
|
+
|
18
|
+
it "replaces with unicode replacement string" do
|
19
|
+
EnsureValidEncoding.ensure_valid_encoding(@bad_bytes_utf8,
|
20
|
+
:invalid => :replace).
|
21
|
+
must_equal("M\uFFFDxico")
|
22
|
+
end
|
23
|
+
|
24
|
+
it "replaces with chosen replacement string" do
|
25
|
+
EnsureValidEncoding.ensure_valid_encoding(@bad_bytes_utf8,
|
26
|
+
:invalid => :replace, :replace => "*").
|
27
|
+
must_equal("M*xico")
|
28
|
+
end
|
29
|
+
|
30
|
+
it "replaces with empty string" do
|
31
|
+
EnsureValidEncoding.ensure_valid_encoding(@bad_bytes_utf8,
|
32
|
+
:invalid => :replace, :replace => '').
|
33
|
+
must_equal("Mxico")
|
34
|
+
end
|
35
|
+
|
36
|
+
it "replaces non-unicode encoding with ? replacement str" do
|
37
|
+
EnsureValidEncoding.ensure_valid_encoding(@bad_bytes_ascii,
|
38
|
+
:invalid => :replace).
|
39
|
+
must_equal("M?xico")
|
40
|
+
end
|
41
|
+
|
42
|
+
describe "edge cases" do
|
43
|
+
it "works with first byte bad" do
|
44
|
+
str = "\xE9xico".force_encoding("UTF-8")
|
45
|
+
EnsureValidEncoding.ensure_valid_encoding(str,
|
46
|
+
:invalid => :replace,
|
47
|
+
:replace => "?").must_equal("?xico")
|
48
|
+
end
|
49
|
+
|
50
|
+
it "works with last bad byte" do
|
51
|
+
str = "Mexico\xE9".force_encoding("UTF-8")
|
52
|
+
EnsureValidEncoding.ensure_valid_encoding(str,
|
53
|
+
:invalid => :replace,
|
54
|
+
:replace => "?").must_equal("Mexico?")
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
it "mutates" do
|
59
|
+
EnsureValidEncoding.ensure_valid_encoding!(@bad_bytes_utf8, :invalid => :replace)
|
60
|
+
@bad_bytes_utf8.must_equal("M\uFFFDxico")
|
61
|
+
end
|
62
|
+
|
63
|
+
describe "instance method versions" do
|
64
|
+
before do
|
65
|
+
@klass = Class.new do
|
66
|
+
include EnsureValidEncoding
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
it "ensure_valid_encoding" do
|
71
|
+
@klass.new.ensure_valid_encoding("foo", :invalid => :replace)
|
72
|
+
end
|
73
|
+
|
74
|
+
it "ensure_valid_encoding!" do
|
75
|
+
@klass.new.ensure_valid_encoding!("foo", :invalid => :replace)
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
metadata
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ensure_valid_encoding
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.5.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jonathan Rochkind
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-07-10 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description:
|
15
|
+
email:
|
16
|
+
- jonathan@dnil.net
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- .gitignore
|
22
|
+
- Gemfile
|
23
|
+
- LICENSE
|
24
|
+
- README.md
|
25
|
+
- Rakefile
|
26
|
+
- ensure_valid_encoding.gemspec
|
27
|
+
- lib/ensure_valid_encoding.rb
|
28
|
+
- lib/ensure_valid_encoding/version.rb
|
29
|
+
- test/ensure_valid_encoding_test.rb
|
30
|
+
homepage: https://github.com/jrochkind/ensure_valid_encoding
|
31
|
+
licenses: []
|
32
|
+
post_install_message:
|
33
|
+
rdoc_options: []
|
34
|
+
require_paths:
|
35
|
+
- lib
|
36
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ! '>='
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
43
|
+
none: false
|
44
|
+
requirements:
|
45
|
+
- - ! '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
requirements: []
|
49
|
+
rubyforge_project:
|
50
|
+
rubygems_version: 1.8.24
|
51
|
+
signing_key:
|
52
|
+
specification_version: 3
|
53
|
+
summary: For ruby 1.9 strings, replace bad bytes in given encoding with replacement
|
54
|
+
strings, _or_ fail quickly on invalid encodings -- _without_ a transcode to a different
|
55
|
+
encoding.
|
56
|
+
test_files:
|
57
|
+
- test/ensure_valid_encoding_test.rb
|