ensure_valid_encoding 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +120 -0
- data/Rakefile +10 -0
- data/ensure_valid_encoding.gemspec +17 -0
- data/lib/ensure_valid_encoding.rb +91 -0
- data/lib/ensure_valid_encoding/version.rb +3 -0
- data/test/ensure_valid_encoding_test.rb +80 -0
- metadata +57 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Jonathan Rochkind
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
# EnsureValidEncoding
|
2
|
+
|
3
|
+
For ruby 1.9 strings, replace bad bytes in given encoding with replacement strings, _or_ fail quickly on invalid encodings -- _without_ a transcode to a different encoding.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'ensure_valid_encoding'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install ensure_valid_encoding
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
~~~ruby
|
22
|
+
# \xE9 is not valid UTF-8
|
23
|
+
bad_utf8 = "M\xE9xico".force_encoding("UTF-8")
|
24
|
+
|
25
|
+
EnsureValidEncoding.ensure_valid_encoding(bad_utf8)
|
26
|
+
# => raises an Encoding::InvalidByteSequenceError
|
27
|
+
# Note well, sadly, for performance and pain-in-the-neck reasons,
|
28
|
+
# this will not be filled out with byte number, preceding or succeeding
|
29
|
+
# bytes, or any other metadata normally included with an InvalidByteSequenceError
|
30
|
+
# from stdlib.
|
31
|
+
~~~~
|
32
|
+
|
33
|
+
Uses the same options as String#encode, `:invalid => :replace`, possibly
|
34
|
+
combined with `:replace => custom_replace_string` (which can be empty
|
35
|
+
string if you like).
|
36
|
+
|
37
|
+
~~~ruby
|
38
|
+
fixed = EnsureValidEncoding.ensure_valid_encoding(bad_utf8, :invalid => :replace)
|
39
|
+
# => Replaces invalid bytes with default replacement char.
|
40
|
+
# For unicode encodings, that's unicode replacement code, "\uFFFD",
|
41
|
+
# otherwise, '?'
|
42
|
+
|
43
|
+
fixed = EnsureValidEncoding.ensure_valid_encoding(bad_utf8, :invalid => :replace, :replace => "*")
|
44
|
+
# => "M*xico"
|
45
|
+
~~~
|
46
|
+
|
47
|
+
Mutate a string in-place with replacement chars? No problem, use the bang
|
48
|
+
version.
|
49
|
+
|
50
|
+
~~~ruby
|
51
|
+
EnsureValidEncoding.ensure_valid_encoding!(bad_utf8, :invalid => :replace)
|
52
|
+
# bad_utf8 has been mutated
|
53
|
+
~~~
|
54
|
+
|
55
|
+
For convenience to save you some typing, methods defined as module instance
|
56
|
+
methods too:
|
57
|
+
|
58
|
+
~~~ruby
|
59
|
+
include EnsureValidEncoding
|
60
|
+
fixed = ensure_valid_encoding(bad_str)
|
61
|
+
~~~
|
62
|
+
|
63
|
+
## Rationale
|
64
|
+
|
65
|
+
You are taking textual input from some external source. Could be user input,
|
66
|
+
could be a user-uploaded file of some kind, could be a third-party API, could
|
67
|
+
be anything.
|
68
|
+
|
69
|
+
You know what character encoding the textual data _claims_ to be, what it
|
70
|
+
_should_ be, and what it _usually_ is. But occasionally it may have bad bytes
|
71
|
+
in it, due to data corruption, due to mistakes, due to mis-advertised encoding,
|
72
|
+
due to bugs upstream, due to anything.
|
73
|
+
|
74
|
+
What do you do? If you do nothing, in cases of such corruption, then
|
75
|
+
eventually your code will _probably_ (but not necessarily) do something
|
76
|
+
that causes some kind of exception to be raised, could be an
|
77
|
+
Encoding::InvalidByteSequenceError, could be something else from somewhere else.
|
78
|
+
May be hard to predict exactly when, if, and what will be raised.
|
79
|
+
But when it does happen, if you're not rescue'ing the exception,
|
80
|
+
your application dies hard.
|
81
|
+
|
82
|
+
Okay, so maybe you manage to catch the exceptions. Or more
|
83
|
+
conveniently you guard by testing `input_str.valid_encoding?` instead of waiting
|
84
|
+
for an exception to be raised. Then what? You could ignore this particular
|
85
|
+
file/stream of input, and have your application go on its merry way.
|
86
|
+
|
87
|
+
But what if you want to do what most _every other_ mature application that
|
88
|
+
displays textual streams does in the case of bad bytes? Display the parts of the
|
89
|
+
string that _can_ be displayed, replace the other parts with a replacement
|
90
|
+
string of some sort.
|
91
|
+
|
92
|
+
String#encode gives you an API for using a replacement char when converting/transcoding
|
93
|
+
from one encoding to another, but that's not where we are. We know what encoding
|
94
|
+
the string is supposed to be, we don't know any other better encoding to
|
95
|
+
transcode it to -- we just want to do the best we can with it, substituting
|
96
|
+
any illegal bytes. It's what `bash` does. It's what `vim` does. It is
|
97
|
+
surprisingly tricky to do with the ruby 1.9.3 stdlib, or even with 'iconv'.
|
98
|
+
|
99
|
+
So there you go, now you can do it with this gem. Maybe not as performant
|
100
|
+
as if it were implemented in C like stdlib char encoding routines, but, hey.
|
101
|
+
|
102
|
+
**Note well:** A buncha [people on reddit](http://www.reddit.com/r/ruby/comments/sfceq) don't see why you'd ever want to do
|
103
|
+
this. If you agree, then, well, don't do it.
|
104
|
+
|
105
|
+
**Also:** I have [filed a feature request](https://bugs.ruby-lang.org/issues/6321) for ruby stdlib. Developer from
|
106
|
+
ruby core team also thinks it's an unusually odd thing to do. shrug.
|
107
|
+
|
108
|
+
## Developing/Contributing
|
109
|
+
|
110
|
+
Some tests written with minitest/spec. Run with `rake test`.
|
111
|
+
|
112
|
+
Gem built with bundler rake tasks. `rake build`, `rake install`, `rake release`.
|
113
|
+
|
114
|
+
Suggestions/improvements welcome.
|
115
|
+
|
116
|
+
1. Fork it
|
117
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
118
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
119
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
120
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/ensure_valid_encoding/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for ensure_valid_encoding.
#
# The version comes from EnsureValidEncoding::VERSION, which is loaded by the
# `require` of lib/ensure_valid_encoding/version above this block.
Gem::Specification.new do |gem|
  gem.name          = "ensure_valid_encoding"
  gem.version       = EnsureValidEncoding::VERSION
  gem.authors       = ["Jonathan Rochkind"]
  gem.email         = ["jonathan@dnil.net"]
  gem.summary       = %q{For ruby 1.9 strings, replace bad bytes in given encoding with replacement strings, _or_ fail quickly on invalid encodings -- _without_ a transcode to a different encoding.}
  gem.homepage      = "https://github.com/jrochkind/ensure_valid_encoding"
  # The shipped LICENSE file is the MIT license; declare it so it shows up in
  # the gem metadata.
  gem.license       = "MIT"

  # Split on newlines explicitly. `$\` (the output record separator) is nil by
  # default, which makes String#split break on *any* whitespace and therefore
  # mangles tracked paths that contain spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require "ensure_valid_encoding/version"
|
2
|
+
|
3
|
+
# Helpers for guaranteeing that a String is #valid_encoding? in the encoding
# it is already tagged with -- either by raising on bad bytes or by replacing
# them -- without transcoding the string to a different encoding.
module EnsureValidEncoding

  # Pass in a string; this method promises that the returned string
  # will be #valid_encoding? for the input's existing #encoding, or that an
  # exception will be raised.
  #
  # With no options, an Encoding::InvalidByteSequenceError will be raised
  # unless str.valid_encoding? Unfortunately, unlike InvalidByteSequenceErrors
  # raised by stdlib, there will be _no_ byte position or preceding/succeeding
  # char info included in the exception, sorry.
  #
  # Or, just like String#encode, pass in :invalid => :replace to replace
  # invalid bytes with a replacement string.
  #
  # Just like String#encode, the default replacement string is the Unicode
  # replacement char for Unicode encodings or ascii "?" otherwise. The default
  # is transcoded into the string's own encoding, so the result stays valid
  # for non-ASCII-compatible encodings such as UTF-16LE as well.
  #
  # Just like String#encode, you can set your own replacement string (including
  # the empty string) with `:replace => your_string`. A caller-supplied
  # replacement is inserted as-is, without any transcoding.
  #
  # Under ruby 1.8.x (or any ruby without String#encoding),
  # this method no-ops and just returns its input.
  #
  # Returns the input object itself when it is already valid, otherwise a new
  # string built from the valid chars and the replacement.
  #
  #     EnsureValidEncoding.ensure_valid_encoding( some_string )
  #
  #     include EnsureValidEncoding
  #     ensure_valid_encoding( some_string, :invalid => :replace)
  #     ensure_valid_encoding( some_string, :invalid => :replace, :replace => '')
  #     ensure_valid_encoding( some_string, :invalid => :replace, :replace => "*")
  def self.ensure_valid_encoding(str, options = {})
    # Can do nothing in ruby 1.8.x
    return str unless str.respond_to?(:encoding)

    # We believe it's fastest to use the built in #valid_encoding?
    # with its C implementation, and bail out immediately if we need
    # to do nothing more, rather than stepping through char by char
    # in cases where the string was valid in the first place.
    return str if str.valid_encoding?

    unless options[:invalid] == :replace
      # If we're not replacing, just raise right away without going through
      # chars, for performance.
      #
      # That does mean we're not able to say exactly what byte was bad, and
      # the exception isn't filled out with all its usual attributes.
      raise Encoding::InvalidByteSequenceError, "invalid byte in string for source encoding #{str.encoding.name}"
    end

    # :invalid => :replace -- actually need to go through the chars to
    # replace the bad ones. The replacement is the same for every bad char,
    # so work it out once up front.
    replacement = options[:replace] || default_replacement(str.encoding)

    str.chars.map { |c| c.valid_encoding? ? c : replacement }.join
  end

  # Default replacement string for +encoding+: U+FFFD for encodings whose
  # name starts with 'UTF', "?" otherwise, expressed in +encoding+ itself.
  #
  # Surely there's a better way to tell if an encoding is a 'Unicode
  # encoding form' than looking at its name?
  def self.default_replacement(encoding)
    literal = encoding.name.start_with?('UTF') ? "\uFFFD" : "?"

    # Dummy encodings keep the historical behavior of simply relabelling
    # the literal's bytes.
    return literal.dup.force_encoding(encoding) if encoding.dummy?

    begin
      # Transcode rather than relabel: relabelling UTF-8/ASCII bytes as
      # e.g. UTF-16LE yields a replacement that is itself invalid.
      literal.encode(encoding)
    rescue Encoding::ConverterNotFoundError, Encoding::UndefinedConversionError
      # No usable converter for this encoding: fall back to relabelling.
      literal.dup.force_encoding(encoding)
    end
  end
  private_class_method :default_replacement

  # Just like #ensure_valid_encoding, but actually mutates
  # the input string if necessary to ensure validity (using String#replace),
  # rather than returning a new valid string. Returns the input string.
  #
  # A string that is already valid is left untouched, so a frozen but valid
  # string does not raise.
  #
  #     ensure_valid_encoding!( some_string, :invalid => :replace )
  def self.ensure_valid_encoding!(str, options = {})
    fixed = ensure_valid_encoding(str, options)
    # ensure_valid_encoding hands back the very same object when there was
    # nothing to fix; only mutate when we actually got a repaired copy.
    str.replace(fixed) unless fixed.equal?(str)
    str
  end

  # Instance version, so you can type less.
  #
  #     include EnsureValidEncoding
  #     ensure_valid_encoding(bad_str)
  def ensure_valid_encoding(*args)
    EnsureValidEncoding.ensure_valid_encoding(*args)
  end

  # Instance version of EnsureValidEncoding.ensure_valid_encoding!
  def ensure_valid_encoding!(*args)
    EnsureValidEncoding.ensure_valid_encoding!(*args)
  end

end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'minitest/spec'
|
2
|
+
require 'minitest/autorun'
|
3
|
+
|
4
|
+
require 'ensure_valid_encoding'
|
5
|
+
|
6
|
+
describe EnsureValidEncoding do
|
7
|
+
before do
|
8
|
+
@bad_bytes_utf8 = "M\xE9xico".force_encoding("UTF-8")
|
9
|
+
@bad_bytes_ascii = "M\xA1xico".force_encoding("ASCII")
|
10
|
+
end
|
11
|
+
|
12
|
+
it "raises on invalid bytes" do
|
13
|
+
proc do
|
14
|
+
EnsureValidEncoding.ensure_valid_encoding( @bad_bytes_utf8 )
|
15
|
+
end.must_raise Encoding::InvalidByteSequenceError
|
16
|
+
end
|
17
|
+
|
18
|
+
it "replaces with unicode replacement string" do
|
19
|
+
EnsureValidEncoding.ensure_valid_encoding(@bad_bytes_utf8,
|
20
|
+
:invalid => :replace).
|
21
|
+
must_equal("M\uFFFDxico")
|
22
|
+
end
|
23
|
+
|
24
|
+
it "replaces with chosen replacement string" do
|
25
|
+
EnsureValidEncoding.ensure_valid_encoding(@bad_bytes_utf8,
|
26
|
+
:invalid => :replace, :replace => "*").
|
27
|
+
must_equal("M*xico")
|
28
|
+
end
|
29
|
+
|
30
|
+
it "replaces with empty string" do
|
31
|
+
EnsureValidEncoding.ensure_valid_encoding(@bad_bytes_utf8,
|
32
|
+
:invalid => :replace, :replace => '').
|
33
|
+
must_equal("Mxico")
|
34
|
+
end
|
35
|
+
|
36
|
+
it "replaces non-unicode encoding with ? replacement str" do
|
37
|
+
EnsureValidEncoding.ensure_valid_encoding(@bad_bytes_ascii,
|
38
|
+
:invalid => :replace).
|
39
|
+
must_equal("M?xico")
|
40
|
+
end
|
41
|
+
|
42
|
+
describe "edge cases" do
|
43
|
+
it "works with first byte bad" do
|
44
|
+
str = "\xE9xico".force_encoding("UTF-8")
|
45
|
+
EnsureValidEncoding.ensure_valid_encoding(str,
|
46
|
+
:invalid => :replace,
|
47
|
+
:replace => "?").must_equal("?xico")
|
48
|
+
end
|
49
|
+
|
50
|
+
it "works with last bad byte" do
|
51
|
+
str = "Mexico\xE9".force_encoding("UTF-8")
|
52
|
+
EnsureValidEncoding.ensure_valid_encoding(str,
|
53
|
+
:invalid => :replace,
|
54
|
+
:replace => "?").must_equal("Mexico?")
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
it "mutates" do
|
59
|
+
EnsureValidEncoding.ensure_valid_encoding!(@bad_bytes_utf8, :invalid => :replace)
|
60
|
+
@bad_bytes_utf8.must_equal("M\uFFFDxico")
|
61
|
+
end
|
62
|
+
|
63
|
+
describe "instance method versions" do
|
64
|
+
before do
|
65
|
+
@klass = Class.new do
|
66
|
+
include EnsureValidEncoding
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
it "ensure_valid_encoding" do
|
71
|
+
@klass.new.ensure_valid_encoding("foo", :invalid => :replace)
|
72
|
+
end
|
73
|
+
|
74
|
+
it "ensure_valid_encoding!" do
|
75
|
+
@klass.new.ensure_valid_encoding!("foo", :invalid => :replace)
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
metadata
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ensure_valid_encoding
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.5.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jonathan Rochkind
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-07-10 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description:
|
15
|
+
email:
|
16
|
+
- jonathan@dnil.net
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- .gitignore
|
22
|
+
- Gemfile
|
23
|
+
- LICENSE
|
24
|
+
- README.md
|
25
|
+
- Rakefile
|
26
|
+
- ensure_valid_encoding.gemspec
|
27
|
+
- lib/ensure_valid_encoding.rb
|
28
|
+
- lib/ensure_valid_encoding/version.rb
|
29
|
+
- test/ensure_valid_encoding_test.rb
|
30
|
+
homepage: https://github.com/jrochkind/ensure_valid_encoding
|
31
|
+
licenses: []
|
32
|
+
post_install_message:
|
33
|
+
rdoc_options: []
|
34
|
+
require_paths:
|
35
|
+
- lib
|
36
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ! '>='
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
43
|
+
none: false
|
44
|
+
requirements:
|
45
|
+
- - ! '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
requirements: []
|
49
|
+
rubyforge_project:
|
50
|
+
rubygems_version: 1.8.24
|
51
|
+
signing_key:
|
52
|
+
specification_version: 3
|
53
|
+
summary: For ruby 1.9 strings, replace bad bytes in given encoding with replacement
|
54
|
+
strings, _or_ fail quickly on invalid encodings -- _without_ a transcode to a different
|
55
|
+
encoding.
|
56
|
+
test_files:
|
57
|
+
- test/ensure_valid_encoding_test.rb
|