guess_html_encoding 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Gemfile.lock +1 -1
- data/README +52 -0
- data/lib/guess_html_encoding/version.rb +1 -1
- data/lib/guess_html_encoding.rb +12 -5
- data/spec/guess_html_encoding_spec.rb +25 -6
- metadata +4 -4
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README
CHANGED
@@ -0,0 +1,52 @@
|
|
1
|
+
GuessHtmlEncoding is a simple library for guessing the encoding of HTML in Ruby 1.9.
|
2
|
+
|
3
|
+
describe "GuessHtmlEncoding" do
|
4
|
+
describe "#guess" do
|
5
|
+
it "can use headers" do
|
6
|
+
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
7
|
+
"Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
|
8
|
+
guess.should == "ISO-8859-1"
|
9
|
+
end
|
10
|
+
|
11
|
+
it "accepts headers as a hash as well" do
|
12
|
+
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
13
|
+
{"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
|
14
|
+
guess.should == "ISO-8859-1"
|
15
|
+
end
|
16
|
+
|
17
|
+
it "accepts meta tags" do
|
18
|
+
guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=LATIN1"></head><body><div>hi!</div></body></html>')
|
19
|
+
guess.should == "ISO-8859-1"
|
20
|
+
end
|
21
|
+
|
22
|
+
...
|
23
|
+
end
|
24
|
+
|
25
|
+
describe "#encode" do
|
26
|
+
it "should work on correctly encoded pages" do
|
27
|
+
data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
|
28
|
+
data.force_encoding("ASCII-8BIT")
|
29
|
+
data.should be_valid_encoding # everything is valid in binary
|
30
|
+
|
31
|
+
GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
|
32
|
+
data.force_encoding("UTF-8").should be_valid_encoding # because it really is utf-8
|
33
|
+
|
34
|
+
encoded = GuessHtmlEncoding.encode(data)
|
35
|
+
encoded.encoding.to_s.should == "UTF-8"
|
36
|
+
encoded.should be_valid_encoding
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should work on incorrectly encoded pages" do
|
40
|
+
data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
|
41
|
+
data.force_encoding("ASCII-8BIT")
|
42
|
+
data.should be_valid_encoding # everything is valid in binary
|
43
|
+
|
44
|
+
GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
|
45
|
+
data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
|
46
|
+
|
47
|
+
encoded = GuessHtmlEncoding.encode(data)
|
48
|
+
encoded.encoding.to_s.should == "UTF-8"
|
49
|
+
encoded.should be_valid_encoding
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
data/lib/guess_html_encoding.rb
CHANGED
@@ -1,15 +1,20 @@
|
|
1
1
|
require "guess_html_encoding/version"
|
2
2
|
|
3
|
+
# A small and simple library for guessing the encoding of HTML in Ruby 1.9.
|
3
4
|
module GuessHtmlEncoding
|
5
|
+
# Guess the encoding of an HTML string, using HTTP headers if provided. HTTP headers can be a string or a hash.
|
4
6
|
def self.guess(html, headers = nil)
|
5
7
|
html = html.dup.force_encoding("ASCII-8BIT")
|
6
|
-
headers = headers.dup.force_encoding("ASCII-8BIT") if headers
|
7
|
-
|
8
8
|
out = nil
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
if headers
|
11
|
+
headers = headers.map {|k, v| "#{k}: #{v}" }.join("\n") if headers.is_a?(Hash)
|
12
|
+
headers = headers.dup.force_encoding("ASCII-8BIT")
|
13
|
+
headers.split("\n").map {|i| i.split(":")}.each do |k,v|
|
14
|
+
if k =~ /Content-Type/i && v =~ /charset=([\w\d-]+);?/i
|
15
|
+
out = $1.upcase
|
16
|
+
break
|
17
|
+
end
|
13
18
|
end
|
14
19
|
end
|
15
20
|
|
@@ -19,6 +24,7 @@ module GuessHtmlEncoding
|
|
19
24
|
end
|
20
25
|
end
|
21
26
|
|
27
|
+
# Translate encodings with other names.
|
22
28
|
if out
|
23
29
|
out = "UTF-8" if ["DEFAULT", "UTF8", "UNICODE"].include?(out.upcase)
|
24
30
|
out = "CP1251" if out.upcase == "CP-1251"
|
@@ -29,6 +35,7 @@ module GuessHtmlEncoding
|
|
29
35
|
out
|
30
36
|
end
|
31
37
|
|
38
|
+
# Force an HTML string into a guessed encoding.
|
32
39
|
def self.encode(html, headers = nil)
|
33
40
|
encoding = guess(html, (headers || '').gsub(/[\r\n]+/, "\n"))
|
34
41
|
html.force_encoding(encoding ? encoding : "UTF-8")
|
@@ -3,15 +3,21 @@ require 'spec_helper'
|
|
3
3
|
|
4
4
|
describe "GuessHtmlEncoding" do
|
5
5
|
describe "#guess" do
|
6
|
-
it "
|
6
|
+
it "can use headers" do
|
7
7
|
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
8
|
-
"Hello: world\nContent-Type: text/html; charset=
|
9
|
-
guess.should == "
|
8
|
+
"Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
|
9
|
+
guess.should == "ISO-8859-1"
|
10
|
+
end
|
11
|
+
|
12
|
+
it "accepts headers as a hash as well" do
|
13
|
+
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
14
|
+
{"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
|
15
|
+
guess.should == "ISO-8859-1"
|
10
16
|
end
|
11
17
|
|
12
18
|
it "accepts meta tags" do
|
13
|
-
guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=
|
14
|
-
guess.should == "
|
19
|
+
guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=LATIN1"></head><body><div>hi!</div></body></html>')
|
20
|
+
guess.should == "ISO-8859-1"
|
15
21
|
end
|
16
22
|
|
17
23
|
it "works okay when there is a semi-colon after the encoding with headers" do
|
@@ -44,13 +50,26 @@ describe "GuessHtmlEncoding" do
|
|
44
50
|
end
|
45
51
|
|
46
52
|
describe "#encode" do
|
53
|
+
it "should work on correctly encoded pages" do
|
54
|
+
data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
|
55
|
+
data.force_encoding("ASCII-8BIT")
|
56
|
+
data.should be_valid_encoding # everything is valid in binary
|
57
|
+
|
58
|
+
GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
|
59
|
+
data.force_encoding("UTF-8").should be_valid_encoding # because it really is utf-8
|
60
|
+
|
61
|
+
encoded = GuessHtmlEncoding.encode(data)
|
62
|
+
encoded.encoding.to_s.should == "UTF-8"
|
63
|
+
encoded.should be_valid_encoding
|
64
|
+
end
|
65
|
+
|
47
66
|
it "should work on incorrectly encoded pages" do
|
48
67
|
data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
|
49
68
|
data.force_encoding("ASCII-8BIT")
|
50
69
|
data.should be_valid_encoding # everything is valid in binary
|
51
70
|
|
52
71
|
GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
|
53
|
-
data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2
|
72
|
+
data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
|
54
73
|
|
55
74
|
encoded = GuessHtmlEncoding.encode(data)
|
56
75
|
encoded.encoding.to_s.should == "UTF-8"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: guess_html_encoding
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-
|
12
|
+
date: 2011-10-26 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &70345601879200 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70345601879200
|
25
25
|
description: This gem helps guess the encoding of an HTML page.
|
26
26
|
email:
|
27
27
|
- andrew@iterationlabs.com
|