guess_html_encoding 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/.rvmrc +1 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +24 -0
- data/README +0 -0
- data/Rakefile +6 -0
- data/guess_html_encoding.gemspec +24 -0
- data/lib/guess_html_encoding/version.rb +3 -0
- data/lib/guess_html_encoding.rb +41 -0
- data/spec/guess_html_encoding_spec.rb +60 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +5 -0
- metadata +71 -0
data/.gitignore
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm use 1.9.2@guess_html_encoding --create
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
guess_html_encoding (0.0.1)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: http://rubygems.org/
|
8
|
+
specs:
|
9
|
+
diff-lcs (1.1.3)
|
10
|
+
rspec (2.6.0)
|
11
|
+
rspec-core (~> 2.6.0)
|
12
|
+
rspec-expectations (~> 2.6.0)
|
13
|
+
rspec-mocks (~> 2.6.0)
|
14
|
+
rspec-core (2.6.4)
|
15
|
+
rspec-expectations (2.6.0)
|
16
|
+
diff-lcs (~> 1.1.2)
|
17
|
+
rspec-mocks (2.6.0)
|
18
|
+
|
19
|
+
PLATFORMS
|
20
|
+
ruby
|
21
|
+
|
22
|
+
DEPENDENCIES
|
23
|
+
guess_html_encoding!
|
24
|
+
rspec
|
data/README
ADDED
File without changes
|
data/Rakefile
ADDED
data/guess_html_encoding.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "guess_html_encoding/version"
|
4
|
+
|
5
|
+
# Packaging metadata for the guess_html_encoding gem.
Gem::Specification.new do |gem|
  # Identity and authorship.
  gem.name = "guess_html_encoding"
  gem.version = GuessHtmlEncoding::VERSION
  gem.authors = ["Andrew Cantino (Iteration Labs, LLC)"]
  gem.email = ["andrew@iterationlabs.com"]
  gem.homepage = "http://github.com/iterationlabs/guess_html_encoding"
  gem.summary = "This gem helps guess the encoding of an HTML page."
  gem.description = "This gem helps guess the encoding of an HTML page."

  gem.rubyforge_project = "guess_html_encoding"

  # Package contents are whatever git currently tracks.
  gem.files = `git ls-files`.split("\n")
  gem.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  gem.executables = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  # Dependencies; a runtime dependency would be declared like the commented example.
  gem.add_development_dependency "rspec"
  # gem.add_runtime_dependency "rest-client"
end
|
data/lib/guess_html_encoding/version.rb
ADDED
data/lib/guess_html_encoding.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require "guess_html_encoding/version"
|
2
|
+
|
3
|
+
# Guesses the character encoding of an HTML document from its HTTP response
# headers and its <meta http-equiv="Content-Type"> tag, and can re-tag or
# transcode the document accordingly.
module GuessHtmlEncoding
  # Matches a "charset=NAME" parameter; NAME is capture group 1.
  CHARSET_PATTERN = /charset=([\w\d-]+);?/i

  # Matches a Content-Type meta tag; the content attribute is capture group 1.
  # The http-equiv attribute must appear before the content attribute.
  META_CONTENT_TYPE_PATTERN = /<meta[^>]*HTTP-EQUIV=["']Content-Type["'][^>]*content=["']([^'"]*)["']/i

  # Upcased charset spellings seen in the wild, mapped to names Ruby accepts.
  ENCODING_ALIASES = {
    "DEFAULT"  => "UTF-8",
    "UTF8"     => "UTF-8",
    "UNICODE"  => "UTF-8",
    "CP-1251"  => "CP1251",
    "LATIN1"   => "ISO-8859-1",
    "LATIN-1"  => "ISO-8859-1",
    # WIN-1251 is the Cyrillic code page Windows-1251 (not Windows-1250).
    "WIN-1251" => "Windows-1251",
    "WIN1251"  => "Windows-1251"
  }.freeze

  # Guesses the encoding name declared for an HTML page.
  #
  # The charset from a Content-Type header wins when Ruby recognises it
  # (compared case-insensitively). Otherwise the page's Content-Type meta tag
  # is consulted; if that is absent too, the unrecognised header value is kept.
  #
  # @param html [String, nil] the raw page; it is not modified
  # @param headers [String, nil] newline-separated "Name: value" header lines
  # @return [String, nil] the upcased or alias-normalised encoding name, or
  #   nil when neither source declares a charset
  def self.guess(html, headers = nil)
    # Work on a binary copy so regex matching never trips on invalid bytes.
    html = html.to_s.dup.force_encoding("ASCII-8BIT")

    out = charset_from_headers(headers)
    if out.nil? || find_encoding(out).nil?
      out = charset_from_meta(html) || out
    end

    out && ENCODING_ALIASES.fetch(out, out)
  end

  # Returns a copy of +html+ tagged with its guessed encoding (UTF-8 when the
  # guess is missing or unknown to Ruby). When the bytes are not valid in that
  # encoding, the copy is transcoded from binary to UTF-8 with replacement
  # characters instead. The argument itself is left untouched.
  #
  # @param html [String, nil] the raw page
  # @param headers [String, nil] raw header text; CR/LF runs are normalised
  # @return [String] a new string with a valid encoding
  def self.encode(html, headers = nil)
    guessed = guess(html, (headers || '').gsub(/[\r\n]+/, "\n"))
    # An unknown declared encoding must not raise from force_encoding.
    encoding = (guessed && find_encoding(guessed)) || Encoding::UTF_8

    copy = html.to_s.dup.force_encoding(encoding)
    return copy if copy.valid_encoding?

    copy.force_encoding('ASCII-8BIT').encode('UTF-8', :undef => :replace, :invalid => :replace)
  end

  # Extracts the upcased charset of the last Content-Type header line, or nil.
  def self.charset_from_headers(headers)
    return nil unless headers

    found = nil
    headers.dup.force_encoding("ASCII-8BIT").split("\n").each do |line|
      # Limit the split so a value containing ":" is kept whole.
      name, value = line.split(":", 2)
      next unless name =~ /Content-Type/i

      charset = value.to_s[CHARSET_PATTERN, 1]
      found = charset.upcase if charset
    end
    found
  end

  # Extracts the upcased charset from the Content-Type meta tag, or nil.
  def self.charset_from_meta(html)
    content = html[META_CONTENT_TYPE_PATTERN, 1]
    charset = content && content[CHARSET_PATTERN, 1]
    charset && charset.upcase
  end

  # Looks up an Encoding by name (case-insensitive); nil when Ruby has none.
  def self.find_encoding(name)
    Encoding.find(name)
  rescue ArgumentError
    nil
  end

  private_class_method :charset_from_headers, :charset_from_meta, :find_encoding
end
|
data/spec/guess_html_encoding_spec.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe "GuessHtmlEncoding" do
  # A page that declares no encoding of its own.
  let(:bare_page) { "<html><body><div>hi!</div></body></html>" }

  # Builds a page whose Content-Type meta tag carries +content+.
  def page_declaring(content)
    %Q{<html><head><meta http-equiv="content-type" content="#{content}"></head><body><div>hi!</div></body></html>}
  end

  # Builds a header block with +content_type+ between two unrelated headers.
  def headers_declaring(content_type)
    "Hello: world\nContent-Type: #{content_type}\nFoo: bar"
  end

  describe "#guess" do
    it "prefers headers" do
      headers = headers_declaring("text/html; charset=utf-8")
      GuessHtmlEncoding.guess(bare_page, headers).should eq("UTF-8")
    end

    it "accepts meta tags" do
      page = page_declaring("text/html; charset=utf-8")
      GuessHtmlEncoding.guess(page).should eq("UTF-8")
    end

    it "works okay when there is a semi-colon after the encoding with headers" do
      headers = headers_declaring("text/html; charset=utf-8;")
      GuessHtmlEncoding.guess(bare_page, headers).should eq("UTF-8")
    end

    it "works okay when there is a semi-colon after the encoding with meta-tags" do
      page = page_declaring("text/html; charset=utf-8;")
      GuessHtmlEncoding.guess(page).should eq("UTF-8")
    end

    it "converts UTF8 to UTF-8" do
      page = page_declaring("text/html; charset=utf8;")
      GuessHtmlEncoding.guess(page).should eq("UTF-8")
    end

    it "converts CP-1251 to CP1251" do
      page = page_declaring("text/html; charset=cp-1251;")
      GuessHtmlEncoding.guess(page).should eq("CP1251")
    end

    it "skips the header content type if it's invalid" do
      page = page_declaring("text/html; charset=utf8;")
      headers = headers_declaring("text/html; charset=RU;")
      GuessHtmlEncoding.guess(page, headers).should eq("UTF-8")
    end
  end

  describe "#encode" do
    it "should work on incorrectly encoded pages" do
      # The stray \xc2 byte makes the page invalid as UTF-8.
      data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
      data.force_encoding("ASCII-8BIT")
      data.should be_valid_encoding # binary accepts any byte sequence

      GuessHtmlEncoding.guess(data).should eq("UTF-8") # the page declares it
      data.force_encoding("UTF-8").should_not be_valid_encoding # \xc2 is a truncated sequence

      encoded = GuessHtmlEncoding.encode(data)
      encoded.encoding.to_s.should eq("UTF-8")
      encoded.should be_valid_encoding
    end
  end
end
|
data/spec/spec.opts
ADDED
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: guess_html_encoding
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Andrew Cantino (Iteration Labs, LLC)
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-09-29 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: &70102956990840 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70102956990840
|
25
|
+
description: This gem helps guess the encoding of an HTML page.
|
26
|
+
email:
|
27
|
+
- andrew@iterationlabs.com
|
28
|
+
executables: []
|
29
|
+
extensions: []
|
30
|
+
extra_rdoc_files: []
|
31
|
+
files:
|
32
|
+
- .gitignore
|
33
|
+
- .rvmrc
|
34
|
+
- Gemfile
|
35
|
+
- Gemfile.lock
|
36
|
+
- README
|
37
|
+
- Rakefile
|
38
|
+
- guess_html_encoding.gemspec
|
39
|
+
- lib/guess_html_encoding.rb
|
40
|
+
- lib/guess_html_encoding/version.rb
|
41
|
+
- spec/guess_html_encoding_spec.rb
|
42
|
+
- spec/spec.opts
|
43
|
+
- spec/spec_helper.rb
|
44
|
+
homepage: http://github.com/iterationlabs/guess_html_encoding
|
45
|
+
licenses: []
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
requirements: []
|
63
|
+
rubyforge_project: guess_html_encoding
|
64
|
+
rubygems_version: 1.8.6
|
65
|
+
signing_key:
|
66
|
+
specification_version: 3
|
67
|
+
summary: This gem helps guess the encoding of an HTML page.
|
68
|
+
test_files:
|
69
|
+
- spec/guess_html_encoding_spec.rb
|
70
|
+
- spec/spec.opts
|
71
|
+
- spec/spec_helper.rb
|