guess_html_encoding 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/Gemfile.lock +14 -10
- data/lib/guess_html_encoding.rb +2 -2
- data/lib/guess_html_encoding/version.rb +1 -1
- data/spec/guess_html_encoding_spec.rb +48 -44
- metadata +12 -15
- data/.rvmrc +0 -1
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: f24b82e186d3e1a58cd2061c7cb1eef2f5b5d1b0
|
4
|
+
data.tar.gz: cbfd0284000e074ef621763a36ca2be60cbed218
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 4d68030d7c0af216faa1e1dc029c65b6557287a8349aa89ec2a7a98833de4178a838693d2bf3e866b966edd597951e0d31c26f3a4c33daab30c7afa93692b7a5
|
7
|
+
data.tar.gz: e2ddc685bae62c4cc6e962dd79a4f69863aef512b556957b2ab91113b492c3b07d7315f880c57179c8a9aea30d0279823983a21bbdcc6cbbc697cfc9ef2ada30
|
data/.ruby-gemset
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
guess_html_encoding
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.1.5
|
data/Gemfile.lock
CHANGED
@@ -1,20 +1,24 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
guess_html_encoding (0.0.
|
4
|
+
guess_html_encoding (0.0.9)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: http://rubygems.org/
|
8
8
|
specs:
|
9
|
-
diff-lcs (1.
|
10
|
-
rspec (
|
11
|
-
rspec-core (~>
|
12
|
-
rspec-expectations (~>
|
13
|
-
rspec-mocks (~>
|
14
|
-
rspec-core (
|
15
|
-
|
16
|
-
|
17
|
-
|
9
|
+
diff-lcs (1.2.5)
|
10
|
+
rspec (3.1.0)
|
11
|
+
rspec-core (~> 3.1.0)
|
12
|
+
rspec-expectations (~> 3.1.0)
|
13
|
+
rspec-mocks (~> 3.1.0)
|
14
|
+
rspec-core (3.1.7)
|
15
|
+
rspec-support (~> 3.1.0)
|
16
|
+
rspec-expectations (3.1.2)
|
17
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
18
|
+
rspec-support (~> 3.1.0)
|
19
|
+
rspec-mocks (3.1.3)
|
20
|
+
rspec-support (~> 3.1.0)
|
21
|
+
rspec-support (3.1.2)
|
18
22
|
|
19
23
|
PLATFORMS
|
20
24
|
ruby
|
data/lib/guess_html_encoding.rb
CHANGED
@@ -10,7 +10,7 @@ module GuessHtmlEncoding
|
|
10
10
|
if headers
|
11
11
|
headers = headers.map {|k, v| "#{k}: #{v}" }.join("\n") if headers.is_a?(Hash)
|
12
12
|
headers = headers.dup.force_encoding("ASCII-8BIT")
|
13
|
-
headers.split("\n").map {|i| i.split(":")}.each do |k,v|
|
13
|
+
headers.gsub(/[\r\n]+/, "\n").split("\n").map {|i| i.split(":")}.each do |k,v|
|
14
14
|
if k =~ /Content-Type/i && v =~ /charset=([\w\d-]+);?/i
|
15
15
|
out = $1.upcase
|
16
16
|
break
|
@@ -42,7 +42,7 @@ module GuessHtmlEncoding
|
|
42
42
|
# Force an HTML string into a guessed encoding.
|
43
43
|
def self.encode(html, headers = nil)
|
44
44
|
html_copy = html.to_s.dup
|
45
|
-
encoding = guess(html_copy,
|
45
|
+
encoding = guess(html_copy, headers)
|
46
46
|
html_copy.force_encoding(encoding_loaded?(encoding) ? encoding : "UTF-8")
|
47
47
|
if html_copy.valid_encoding?
|
48
48
|
html_copy
|
@@ -6,59 +6,59 @@ describe "GuessHtmlEncoding" do
|
|
6
6
|
it "can use headers" do
|
7
7
|
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
8
8
|
"Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
|
9
|
-
guess.
|
9
|
+
expect(guess).to eq("ISO-8859-1")
|
10
10
|
end
|
11
11
|
|
12
12
|
it "accepts headers as a hash as well" do
|
13
13
|
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
14
14
|
{"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
|
15
|
-
guess.
|
15
|
+
expect(guess).to eq("ISO-8859-1")
|
16
16
|
end
|
17
17
|
|
18
18
|
it "accepts meta tags" do
|
19
19
|
guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=LATIN1"></head><body><div>hi!</div></body></html>')
|
20
|
-
guess.
|
20
|
+
expect(guess).to eq("ISO-8859-1")
|
21
21
|
end
|
22
22
|
|
23
23
|
it "works okay when there is a semi-colon after the encoding with headers" do
|
24
24
|
guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
|
25
25
|
"Hello: world\nContent-Type: text/html; charset=utf-8;\nFoo: bar")
|
26
|
-
guess.
|
26
|
+
expect(guess).to eq("UTF-8")
|
27
27
|
end
|
28
28
|
|
29
29
|
it "works okay when there is a semi-colon after the encoding with meta-tags" do
|
30
30
|
guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8;"></head><body><div>hi!</div></body></html>')
|
31
|
-
guess.
|
31
|
+
expect(guess).to eq("UTF-8")
|
32
32
|
end
|
33
33
|
|
34
34
|
it "converts UTF8 to UTF-8" do
|
35
35
|
guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=utf8;"></head><body><div>hi!</div></body></html>')
|
36
|
-
guess.
|
36
|
+
expect(guess).to eq("UTF-8")
|
37
37
|
end
|
38
38
|
|
39
39
|
it "converts CP-1251 to CP1251" do
|
40
40
|
guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=cp-1251;"></head><body><div>hi!</div></body></html>')
|
41
|
-
guess.
|
41
|
+
expect(guess).to eq("CP1251")
|
42
42
|
end
|
43
43
|
|
44
44
|
it "skips the header content type if it's invalid" do
|
45
45
|
guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=utf8;"></head><body><div>hi!</div></body></html>',
|
46
46
|
"Hello: world\nContent-Type: text/html; charset=RU;\nFoo: bar")
|
47
|
-
guess.
|
47
|
+
expect(guess).to eq("UTF-8")
|
48
48
|
end
|
49
49
|
|
50
50
|
it "translates WIN1251 to WINDOWS-1250" do
|
51
51
|
guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=WIN1251;"></head><body><div>hi!</div></body></html>')
|
52
|
-
guess.
|
52
|
+
expect(guess).to eq("WINDOWS-1250")
|
53
53
|
end
|
54
54
|
|
55
55
|
it "translates GB2312 to GB18030" do
|
56
56
|
guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=GB2312;"></head><body><div>hi!</div></body></html>')
|
57
|
-
guess.
|
57
|
+
expect(guess).to eq("GB18030")
|
58
58
|
end
|
59
59
|
|
60
60
|
it "should not raise an exception if data is nil" do
|
61
|
-
GuessHtmlEncoding.guess(nil).
|
61
|
+
expect { GuessHtmlEncoding.guess(nil) }.not_to raise_error
|
62
62
|
end
|
63
63
|
end
|
64
64
|
|
@@ -66,97 +66,101 @@ describe "GuessHtmlEncoding" do
|
|
66
66
|
it "should work on correctly encoded pages" do
|
67
67
|
data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
|
68
68
|
data.force_encoding("ASCII-8BIT")
|
69
|
-
data.
|
69
|
+
expect(data).to be_valid_encoding # everything is valid in binary
|
70
70
|
|
71
|
-
GuessHtmlEncoding.guess(data).
|
72
|
-
data.force_encoding("UTF-8").
|
71
|
+
expect(GuessHtmlEncoding.guess(data)).to eq("UTF-8") # because the page says so!
|
72
|
+
expect(data.force_encoding("UTF-8")).to be_valid_encoding # because it really is utf-8
|
73
73
|
|
74
74
|
encoded = GuessHtmlEncoding.encode(data)
|
75
|
-
encoded.encoding.to_s.
|
76
|
-
encoded.
|
75
|
+
expect(encoded.encoding.to_s).to eq("UTF-8")
|
76
|
+
expect(encoded).to be_valid_encoding
|
77
77
|
end
|
78
78
|
|
79
79
|
it "should work on incorrectly encoded pages" do
|
80
80
|
data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
|
81
81
|
data.force_encoding("ASCII-8BIT")
|
82
|
-
data.
|
82
|
+
expect(data).to be_valid_encoding # everything is valid in binary
|
83
83
|
|
84
|
-
GuessHtmlEncoding.guess(data).
|
85
|
-
data.force_encoding("UTF-8").
|
84
|
+
expect(GuessHtmlEncoding.guess(data)).to eq("UTF-8") # because the page says so!
|
85
|
+
expect(data.force_encoding("UTF-8")).not_to be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
|
86
86
|
|
87
87
|
encoded = GuessHtmlEncoding.encode(data)
|
88
|
-
encoded.encoding.to_s.
|
89
|
-
encoded.
|
88
|
+
expect(encoded.encoding.to_s).to eq("UTF-8")
|
89
|
+
expect(encoded).to be_valid_encoding
|
90
90
|
end
|
91
91
|
|
92
92
|
it "should work on pages encoded with an unknown encoding by forcing them to utf8" do
|
93
93
|
data = "<html><head><meta http-equiv='content-type' content='text/html; charset=x-mac-roman;'></head><body><div>hi!</div></body></html>"
|
94
94
|
data.force_encoding("ASCII-8BIT")
|
95
|
-
data.
|
95
|
+
expect(data).to be_valid_encoding # everything is valid in binary
|
96
96
|
|
97
|
-
GuessHtmlEncoding.guess(data).
|
97
|
+
expect(GuessHtmlEncoding.guess(data)).to eq("X-MAC-ROMAN") # because the page says so!
|
98
98
|
|
99
99
|
encoded = GuessHtmlEncoding.encode(data)
|
100
|
-
encoded.encoding.to_s.
|
101
|
-
encoded.
|
100
|
+
expect(encoded.encoding.to_s).to eq("UTF-8")
|
101
|
+
expect(encoded).to be_valid_encoding
|
102
102
|
|
103
|
-
data.encoding.to_s.
|
103
|
+
expect(data.encoding.to_s).to eq("ASCII-8BIT")
|
104
104
|
end
|
105
105
|
|
106
106
|
it "should not raise an exception if data is nil" do
|
107
|
-
GuessHtmlEncoding.encode(nil).
|
107
|
+
expect { GuessHtmlEncoding.encode(nil) }.not_to raise_error
|
108
108
|
end
|
109
109
|
|
110
|
-
|
111
110
|
it "should work on GB18030 (and translate GB2312 into GB18030)" do
|
112
111
|
data = File.read(File.join(File.dirname(__FILE__), "fixtures/gb18030.html"), :encoding => "binary")
|
113
|
-
GuessHtmlEncoding.encoding_loaded?("GB18030").
|
114
|
-
GuessHtmlEncoding.guess(data).
|
115
|
-
GuessHtmlEncoding.encode(data).encoding.to_s.
|
112
|
+
expect(GuessHtmlEncoding.encoding_loaded?("GB18030")).to be_truthy
|
113
|
+
expect(GuessHtmlEncoding.guess(data)).to eq("GB18030")
|
114
|
+
expect(GuessHtmlEncoding.encode(data).encoding.to_s).to eq("GB18030")
|
115
|
+
end
|
116
|
+
|
117
|
+
it "should work with headers as a hash" do
|
118
|
+
data = File.read(File.join(File.dirname(__FILE__), "fixtures/gb18030.html"), :encoding => "binary")
|
119
|
+
expect(lambda { GuessHtmlEncoding.encode(data, {}) }).not_to raise_error
|
116
120
|
end
|
117
121
|
end
|
118
122
|
|
119
123
|
describe "#encoding_loaded?" do
|
120
124
|
it 'returns true for all loaded encodings' do
|
121
125
|
(Encoding.name_list - ["internal"]).each do |name|
|
122
|
-
GuessHtmlEncoding.encoding_loaded?(name).
|
123
|
-
|
126
|
+
expect(GuessHtmlEncoding.encoding_loaded?(name)).to be_truthy
|
127
|
+
expect { Encoding.find(name) }.not_to raise_error
|
124
128
|
end
|
125
129
|
end
|
126
130
|
|
127
131
|
it 'returns true for uppercase encodings' do
|
128
|
-
GuessHtmlEncoding.encoding_loaded?("WINDOWS-1250").
|
129
|
-
|
132
|
+
expect(GuessHtmlEncoding.encoding_loaded?("WINDOWS-1250")).to be_truthy
|
133
|
+
expect { Encoding.find("WINDOWS-1250") }.not_to raise_error
|
130
134
|
end
|
131
135
|
|
132
136
|
it 'returns true for lowercase encodings' do
|
133
|
-
GuessHtmlEncoding.encoding_loaded?("windows-1250").
|
134
|
-
|
137
|
+
expect(GuessHtmlEncoding.encoding_loaded?("windows-1250")).to be_truthy
|
138
|
+
expect { Encoding.find("windows-1250") }.not_to raise_error
|
135
139
|
end
|
136
140
|
|
137
141
|
it 'returns true for encoding aliases' do
|
138
142
|
Encoding.aliases.keys.each do |key|
|
139
|
-
GuessHtmlEncoding.encoding_loaded?(key).
|
140
|
-
GuessHtmlEncoding.encoding_loaded?(key.upcase).
|
141
|
-
|
142
|
-
|
143
|
+
expect(GuessHtmlEncoding.encoding_loaded?(key)).to be_truthy
|
144
|
+
expect(GuessHtmlEncoding.encoding_loaded?(key.upcase)).to be_truthy
|
145
|
+
expect { Encoding.find(key) }.not_to raise_error
|
146
|
+
expect { Encoding.find(key.upcase) }.not_to raise_error
|
143
147
|
end
|
144
148
|
end
|
145
149
|
|
146
150
|
it 'returns false for irregular or unloaded encoding' do
|
147
|
-
GuessHtmlEncoding.encoding_loaded?('_WHY').
|
151
|
+
expect(GuessHtmlEncoding.encoding_loaded?('_WHY')).to be_falsy
|
148
152
|
end
|
149
153
|
|
150
154
|
it "accepts a simple meta tag" do
|
151
155
|
# Like http://www.taobao.com
|
152
156
|
guess = GuessHtmlEncoding.guess('<html><head><meta charset="gbk" /></head><body><div>hi!</div></body></html>')
|
153
|
-
guess.
|
157
|
+
expect(guess).to eq("GBK")
|
154
158
|
end
|
155
159
|
|
156
160
|
it "works as well when there is no double quotation marks with http-equiv in meta-tags" do
|
157
161
|
# Like http://www.frozentux.net/iptables-tutorial/cn/iptables-tutorial-cn-1.1.19.html
|
158
162
|
guess = GuessHtmlEncoding.guess('<html><head><META http-equiv=Content-Type content="text/html; charset=utf-8"></head><body><div>hi!</div></body></html>')
|
159
|
-
guess.
|
163
|
+
expect(guess).to eq("UTF-8")
|
160
164
|
end
|
161
165
|
end
|
162
166
|
end
|
metadata
CHANGED
@@ -1,30 +1,27 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: guess_html_encoding
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.10
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Andrew Cantino (Iteration Labs, LLC)
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-12-14 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: rspec
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- -
|
17
|
+
- - ">="
|
20
18
|
- !ruby/object:Gem::Version
|
21
19
|
version: '0'
|
22
20
|
type: :development
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- -
|
24
|
+
- - ">="
|
28
25
|
- !ruby/object:Gem::Version
|
29
26
|
version: '0'
|
30
27
|
description: ''
|
@@ -34,8 +31,9 @@ executables: []
|
|
34
31
|
extensions: []
|
35
32
|
extra_rdoc_files: []
|
36
33
|
files:
|
37
|
-
- .gitignore
|
38
|
-
- .
|
34
|
+
- ".gitignore"
|
35
|
+
- ".ruby-gemset"
|
36
|
+
- ".ruby-version"
|
39
37
|
- Gemfile
|
40
38
|
- Gemfile.lock
|
41
39
|
- LICENSE
|
@@ -50,27 +48,26 @@ files:
|
|
50
48
|
- spec/spec_helper.rb
|
51
49
|
homepage: http://github.com/cantino/guess_html_encoding
|
52
50
|
licenses: []
|
51
|
+
metadata: {}
|
53
52
|
post_install_message:
|
54
53
|
rdoc_options: []
|
55
54
|
require_paths:
|
56
55
|
- lib
|
57
56
|
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
-
none: false
|
59
57
|
requirements:
|
60
|
-
- -
|
58
|
+
- - ">="
|
61
59
|
- !ruby/object:Gem::Version
|
62
60
|
version: '0'
|
63
61
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
64
|
-
none: false
|
65
62
|
requirements:
|
66
|
-
- -
|
63
|
+
- - ">="
|
67
64
|
- !ruby/object:Gem::Version
|
68
65
|
version: '0'
|
69
66
|
requirements: []
|
70
67
|
rubyforge_project: guess_html_encoding
|
71
|
-
rubygems_version:
|
68
|
+
rubygems_version: 2.2.2
|
72
69
|
signing_key:
|
73
|
-
specification_version:
|
70
|
+
specification_version: 4
|
74
71
|
summary: A small gem that attempts to guess and then force encoding of HTML documents
|
75
72
|
for Ruby 1.9
|
76
73
|
test_files:
|
data/.rvmrc
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
rvm use 1.9.3@guess_html_encoding --create
|