guess_html_encoding 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +1 -1
- data/LICENSE +19 -0
- data/README.markdown +46 -52
- data/lib/guess_html_encoding/version.rb +1 -1
- data/lib/guess_html_encoding.rb +10 -9
- data/spec/guess_html_encoding_spec.rb +4 -1
- metadata +5 -4
data/Gemfile.lock
CHANGED
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2012, Andrew Cantino (Iteration Labs, LLC)
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README.markdown
CHANGED
@@ -1,52 +1,46 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
encoded = GuessHtmlEncoding.encode(data)
|
48
|
-
encoded.encoding.to_s.should == "UTF-8"
|
49
|
-
encoded.should be_valid_encoding
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
1
|
+
# GuessHTMLEncoding
|
2
|
+
|
3
|
+
GuessHTMLEncoding is a simple library to guess HTML encodings in Ruby 1.9. It considers HTTP headers and META tags.
|
4
|
+
|
5
|
+
# Install
|
6
|
+
|
7
|
+
(sudo) gem install guess_html_encoding
|
8
|
+
|
9
|
+
# Usage
|
10
|
+
|
11
|
+
GuessHTMLEncoding can guess the encoding of an HTML file based on the http-equiv content-type:
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'guess_html_encoding'
|
15
|
+
|
16
|
+
guess = GuessHtmlEncoding.guess(<<-HTML)
|
17
|
+
<html>
|
18
|
+
<head>
|
19
|
+
<meta http-equiv="content-type" content="text/html; charset=LATIN1">
|
20
|
+
</head>
|
21
|
+
<body>
|
22
|
+
<div>hi!</div>
|
23
|
+
</body>
|
24
|
+
</html>
|
25
|
+
HTML
|
26
|
+
guess.should == "ISO-8859-1"
|
27
|
+
|
28
|
+
You can also give it HTTP headers to guess from, which it will prefer, either as a string or as a hash:
|
29
|
+
|
30
|
+
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
31
|
+
"Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
|
32
|
+
guess.should == "ISO-8859-1"
|
33
|
+
|
34
|
+
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
35
|
+
{"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
|
36
|
+
guess.should == "ISO-8859-1"
|
37
|
+
|
38
|
+
It's likely that you want to force the encoding of the given HTML into the guessed encoding. This is easy to do:
|
39
|
+
|
40
|
+
data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
|
41
|
+
encoded = GuessHtmlEncoding.encode(data)
|
42
|
+
encoded.encoding.to_s.should == "UTF-8"
|
43
|
+
|
44
|
+
If an encoding cannot be guessed, or Ruby doesn't understand it, UTF-8 will be used and invalid or unknown characters will be replaced.
|
45
|
+
|
46
|
+
# Pull requests welcome!
|
data/lib/guess_html_encoding.rb
CHANGED
@@ -26,10 +26,10 @@ module GuessHtmlEncoding
|
|
26
26
|
|
27
27
|
# Translate encodings with other names.
|
28
28
|
if out
|
29
|
-
out = "UTF-8" if [
|
30
|
-
out = "CP1251" if out
|
31
|
-
out = "ISO-8859-1" if [
|
32
|
-
out = "Windows-1250" if [
|
29
|
+
out = "UTF-8" if %w[DEFAULT UTF8 UNICODE].include?(out)
|
30
|
+
out = "CP1251" if out == "CP-1251"
|
31
|
+
out = "ISO-8859-1" if %w[LATIN1 LATIN-1].include?(out)
|
32
|
+
out = "Windows-1250" if %w[WIN-1251 WIN1251].include?(out)
|
33
33
|
end
|
34
34
|
|
35
35
|
out
|
@@ -37,12 +37,13 @@ module GuessHtmlEncoding
|
|
37
37
|
|
38
38
|
# Force an HTML string into a guessed encoding.
|
39
39
|
def self.encode(html, headers = nil)
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
40
|
+
html_copy = html.dup
|
41
|
+
encoding = guess(html_copy, (headers || '').gsub(/[\r\n]+/, "\n"))
|
42
|
+
html_copy.force_encoding(encoding_loaded?(encoding) ? encoding : "UTF-8")
|
43
|
+
if html_copy.valid_encoding?
|
44
|
+
html_copy
|
44
45
|
else
|
45
|
-
|
46
|
+
html_copy.force_encoding('ASCII-8BIT').encode('UTF-8', :undef => :replace, :invalid => :replace)
|
46
47
|
end
|
47
48
|
end
|
48
49
|
|
@@ -76,7 +76,7 @@ describe "GuessHtmlEncoding" do
|
|
76
76
|
encoded.should be_valid_encoding
|
77
77
|
end
|
78
78
|
|
79
|
-
it "should work on pages encoded with an
|
79
|
+
it "should work on pages encoded with an unknown encoding by forcing them to utf8" do
|
80
80
|
data = "<html><head><meta http-equiv='content-type' content='text/html; charset=x-mac-roman;'></head><body><div>hi!</div></body></html>"
|
81
81
|
data.force_encoding("ASCII-8BIT")
|
82
82
|
data.should be_valid_encoding # everything is valid in binary
|
@@ -86,6 +86,8 @@ describe "GuessHtmlEncoding" do
|
|
86
86
|
encoded = GuessHtmlEncoding.encode(data)
|
87
87
|
encoded.encoding.to_s.should == "UTF-8"
|
88
88
|
encoded.should be_valid_encoding
|
89
|
+
|
90
|
+
data.encoding.to_s.should == "ASCII-8BIT"
|
89
91
|
end
|
90
92
|
end
|
91
93
|
|
@@ -95,6 +97,7 @@ describe "GuessHtmlEncoding" do
|
|
95
97
|
GuessHtmlEncoding.encoding_loaded?(name).should be_true
|
96
98
|
end
|
97
99
|
end
|
100
|
+
|
98
101
|
it 'returns false for irregular or unloaded encoding' do
|
99
102
|
GuessHtmlEncoding.encoding_loaded?('_WHY').should be_false
|
100
103
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: guess_html_encoding
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-04-04 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &75762100 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *75762100
|
25
25
|
description: This gem helps guess the encoding of an HTML page.
|
26
26
|
email:
|
27
27
|
- andrew@iterationlabs.com
|
@@ -33,6 +33,7 @@ files:
|
|
33
33
|
- .rvmrc
|
34
34
|
- Gemfile
|
35
35
|
- Gemfile.lock
|
36
|
+
- LICENSE
|
36
37
|
- README.markdown
|
37
38
|
- Rakefile
|
38
39
|
- guess_html_encoding.gemspec
|