guess_html_encoding 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/Gemfile.lock +1 -1
- data/README +52 -0
- data/lib/guess_html_encoding/version.rb +1 -1
- data/lib/guess_html_encoding.rb +12 -5
- data/spec/guess_html_encoding_spec.rb +25 -6
- metadata +4 -4
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README
CHANGED
@@ -0,0 +1,52 @@
|
|
1
|
+
GuessHtmlEncoding is a simple library for guessing the encoding of HTML in Ruby 1.9.
|
2
|
+
|
3
|
+
describe "GuessHtmlEncoding" do
|
4
|
+
describe "#guess" do
|
5
|
+
it "can use headers" do
|
6
|
+
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
7
|
+
"Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
|
8
|
+
guess.should == "ISO-8859-1"
|
9
|
+
end
|
10
|
+
|
11
|
+
it "accepts headers as a hash as well" do
|
12
|
+
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
13
|
+
{"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
|
14
|
+
guess.should == "ISO-8859-1"
|
15
|
+
end
|
16
|
+
|
17
|
+
it "accepts meta tags" do
|
18
|
+
guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=LATIN1"></head><body><div>hi!</div></body></html>')
|
19
|
+
guess.should == "ISO-8859-1"
|
20
|
+
end
|
21
|
+
|
22
|
+
...
|
23
|
+
end
|
24
|
+
|
25
|
+
describe "#encode" do
|
26
|
+
it "should work on correctly encoded pages" do
|
27
|
+
data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
|
28
|
+
data.force_encoding("ASCII-8BIT")
|
29
|
+
data.should be_valid_encoding # everything is valid in binary
|
30
|
+
|
31
|
+
GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
|
32
|
+
data.force_encoding("UTF-8").should be_valid_encoding # because it really is utf-8
|
33
|
+
|
34
|
+
encoded = GuessHtmlEncoding.encode(data)
|
35
|
+
encoded.encoding.to_s.should == "UTF-8"
|
36
|
+
encoded.should be_valid_encoding
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should work on incorrectly encoded pages" do
|
40
|
+
data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
|
41
|
+
data.force_encoding("ASCII-8BIT")
|
42
|
+
data.should be_valid_encoding # everything is valid in binary
|
43
|
+
|
44
|
+
GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
|
45
|
+
data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
|
46
|
+
|
47
|
+
encoded = GuessHtmlEncoding.encode(data)
|
48
|
+
encoded.encoding.to_s.should == "UTF-8"
|
49
|
+
encoded.should be_valid_encoding
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
data/lib/guess_html_encoding.rb
CHANGED
@@ -1,15 +1,20 @@
|
|
1
1
|
require "guess_html_encoding/version"
|
2
2
|
|
3
|
+
# A small and simple library for guessing the encoding of HTML in Ruby 1.9.
|
3
4
|
module GuessHtmlEncoding
|
5
|
+
# Guess the encoding of an HTML string, using HTTP headers if provided. HTTP headers can be a string or a hash.
|
4
6
|
def self.guess(html, headers = nil)
|
5
7
|
html = html.dup.force_encoding("ASCII-8BIT")
|
6
|
-
headers = headers.dup.force_encoding("ASCII-8BIT") if headers
|
7
|
-
|
8
8
|
out = nil
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
if headers
|
11
|
+
headers = headers.map {|k, v| "#{k}: #{v}" }.join("\n") if headers.is_a?(Hash)
|
12
|
+
headers = headers.dup.force_encoding("ASCII-8BIT")
|
13
|
+
headers.split("\n").map {|i| i.split(":")}.each do |k,v|
|
14
|
+
if k =~ /Content-Type/i && v =~ /charset=([\w\d-]+);?/i
|
15
|
+
out = $1.upcase
|
16
|
+
break
|
17
|
+
end
|
13
18
|
end
|
14
19
|
end
|
15
20
|
|
@@ -19,6 +24,7 @@ module GuessHtmlEncoding
|
|
19
24
|
end
|
20
25
|
end
|
21
26
|
|
27
|
+
# Translate encodings with other names.
|
22
28
|
if out
|
23
29
|
out = "UTF-8" if ["DEFAULT", "UTF8", "UNICODE"].include?(out.upcase)
|
24
30
|
out = "CP1251" if out.upcase == "CP-1251"
|
@@ -29,6 +35,7 @@ module GuessHtmlEncoding
|
|
29
35
|
out
|
30
36
|
end
|
31
37
|
|
38
|
+
# Force an HTML string into a guessed encoding.
|
32
39
|
def self.encode(html, headers = nil)
|
33
40
|
encoding = guess(html, (headers || '').gsub(/[\r\n]+/, "\n"))
|
34
41
|
html.force_encoding(encoding ? encoding : "UTF-8")
|
@@ -3,15 +3,21 @@ require 'spec_helper'
|
|
3
3
|
|
4
4
|
describe "GuessHtmlEncoding" do
|
5
5
|
describe "#guess" do
|
6
|
-
it "
|
6
|
+
it "can use headers" do
|
7
7
|
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
8
|
-
"Hello: world\nContent-Type: text/html; charset=
|
9
|
-
guess.should == "
|
8
|
+
"Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
|
9
|
+
guess.should == "ISO-8859-1"
|
10
|
+
end
|
11
|
+
|
12
|
+
it "accepts headers as a hash as well" do
|
13
|
+
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
14
|
+
{"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
|
15
|
+
guess.should == "ISO-8859-1"
|
10
16
|
end
|
11
17
|
|
12
18
|
it "accepts meta tags" do
|
13
|
-
guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=
|
14
|
-
guess.should == "
|
19
|
+
guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=LATIN1"></head><body><div>hi!</div></body></html>')
|
20
|
+
guess.should == "ISO-8859-1"
|
15
21
|
end
|
16
22
|
|
17
23
|
it "works okay when there is a semi-colon after the encoding with headers" do
|
@@ -44,13 +50,26 @@ describe "GuessHtmlEncoding" do
|
|
44
50
|
end
|
45
51
|
|
46
52
|
describe "#encode" do
|
53
|
+
it "should work on correctly encoded pages" do
|
54
|
+
data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
|
55
|
+
data.force_encoding("ASCII-8BIT")
|
56
|
+
data.should be_valid_encoding # everything is valid in binary
|
57
|
+
|
58
|
+
GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
|
59
|
+
data.force_encoding("UTF-8").should be_valid_encoding # because it really is utf-8
|
60
|
+
|
61
|
+
encoded = GuessHtmlEncoding.encode(data)
|
62
|
+
encoded.encoding.to_s.should == "UTF-8"
|
63
|
+
encoded.should be_valid_encoding
|
64
|
+
end
|
65
|
+
|
47
66
|
it "should work on incorrectly encoded pages" do
|
48
67
|
data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
|
49
68
|
data.force_encoding("ASCII-8BIT")
|
50
69
|
data.should be_valid_encoding # everything is valid in binary
|
51
70
|
|
52
71
|
GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
|
53
|
-
data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2
|
72
|
+
data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
|
54
73
|
|
55
74
|
encoded = GuessHtmlEncoding.encode(data)
|
56
75
|
encoded.encoding.to_s.should == "UTF-8"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: guess_html_encoding
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-
|
12
|
+
date: 2011-10-26 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &70345601879200 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70345601879200
|
25
25
|
description: This gem helps guess the encoding of an HTML page.
|
26
26
|
email:
|
27
27
|
- andrew@iterationlabs.com
|