guess_html_encoding 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ .idea
2
+ *.swp
3
+ .DS_Store
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use 1.9.2@guess_html_encoding --create
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in guess_html_encoding.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,24 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ guess_html_encoding (0.0.1)
5
+
6
+ GEM
7
+ remote: http://rubygems.org/
8
+ specs:
9
+ diff-lcs (1.1.3)
10
+ rspec (2.6.0)
11
+ rspec-core (~> 2.6.0)
12
+ rspec-expectations (~> 2.6.0)
13
+ rspec-mocks (~> 2.6.0)
14
+ rspec-core (2.6.4)
15
+ rspec-expectations (2.6.0)
16
+ diff-lcs (~> 1.1.2)
17
+ rspec-mocks (2.6.0)
18
+
19
+ PLATFORMS
20
+ ruby
21
+
22
+ DEPENDENCIES
23
+ guess_html_encoding!
24
+ rspec
data/README ADDED
File without changes
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "guess_html_encoding/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "guess_html_encoding"
7
+ s.version = GuessHtmlEncoding::VERSION
8
+ s.authors = ["Andrew Cantino (Iteration Labs, LLC)"]
9
+ s.email = ["andrew@iterationlabs.com"]
10
+ s.homepage = "http://github.com/iterationlabs/guess_html_encoding"
11
+ s.summary = %q{This gem helps guess the encoding of an HTML page.}
12
+ s.description = %q{This gem helps guess the encoding of an HTML page.}
13
+
14
+ s.rubyforge_project = "guess_html_encoding"
15
+
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
+ s.require_paths = ["lib"]
20
+
21
+ # specify any dependencies here; for example:
22
+ s.add_development_dependency "rspec"
23
+ # s.add_runtime_dependency "rest-client"
24
+ end
@@ -0,0 +1,3 @@
1
+ module GuessHtmlEncoding
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,41 @@
1
+ require "guess_html_encoding/version"
2
+
# Guesses the character encoding an HTML document claims to use, looking at
# the HTTP response headers first and the document's <meta> tags second.
module GuessHtmlEncoding
  # Guess the encoding name declared for an HTML page.
  #
  # The charset of the last Content-Type header line is used when Ruby
  # recognises it (compared case-insensitively). Otherwise the charset
  # declared in a <meta> tag is used, and when there is none either, the
  # unrecognised header value is kept as a last resort.
  #
  # @param html [String] raw page bytes; never modified
  # @param headers [String, nil] response headers, one "Name: value" per "\n"-separated line
  # @return [String, nil] upper-cased encoding name with common misspellings
  #   normalised, or nil when neither headers nor markup declare a charset
  def self.guess(html, headers = nil)
    # Work on binary copies so the regexps cannot trip over bytes that are
    # invalid in whatever encoding the caller's strings are tagged with.
    html = html.to_s.dup.force_encoding("ASCII-8BIT")
    headers = headers.dup.force_encoding("ASCII-8BIT") if headers

    out = charset_from_headers(headers)
    out = charset_from_meta(html) || out if find_encoding(out).nil?

    # Normalise spellings seen in the wild to names Ruby understands.
    case out
    when "DEFAULT", "UTF8", "UNICODE" then "UTF-8"
    when "CP-1251"                    then "CP1251"
    when "LATIN1", "LATIN-1"          then "ISO-8859-1"
    when "WIN-1251", "WIN1251"        then "WINDOWS-1251"
    else out
    end
  end

  # Tag a copy of the page with its guessed encoding.
  #
  # Falls back to UTF-8 when nothing is declared or the declared name is
  # unknown to Ruby. If the bytes are not valid in that encoding, the copy is
  # reinterpreted as binary and transcoded to UTF-8, replacing every byte
  # that has no UTF-8 mapping.
  #
  # @param html [String] raw page bytes; left untouched, so frozen input is fine
  # @param headers [String, nil] response headers; any run of CR/LF separates lines
  # @return [String] a new string that is valid in its encoding
  def self.encode(html, headers = nil)
    copy = html.to_s.dup
    name = guess(copy, (headers || '').gsub(/[\r\n]+/, "\n"))
    copy.force_encoding(find_encoding(name) || Encoding::UTF_8)

    if copy.valid_encoding?
      copy
    else
      copy.force_encoding('ASCII-8BIT').encode('UTF-8', :undef => :replace, :invalid => :replace)
    end
  end

  # Upper-cased charset of the last Content-Type header line, or nil.
  def self.charset_from_headers(headers)
    charset = nil
    headers.to_s.split("\n").each do |line|
      # Split on the first colon only, so colons inside the value survive.
      name, value = line.split(":", 2)
      next unless name && value && name =~ /Content-Type/i

      found = value[/charset=([\w-]+);?/i, 1]
      charset = found.upcase if found
    end
    charset
  end

  # Upper-cased charset declared by a <meta> tag, or nil. The classic
  # http-equiv="Content-Type" form is tried first; any other <meta> tag
  # carrying a charset (e.g. HTML5 <meta charset="...">) is the fallback.
  def self.charset_from_meta(html)
    content = html[/<meta[^>]*HTTP-EQUIV=["']Content-Type["'][^>]*content=["']([^'"]*)["']/i, 1]
    charset = content && content[/charset=([\w-]+);?/i, 1]
    charset ||= html[/<meta\s[^>]*?\bcharset\s*=\s*["']?([\w-]+)/i, 1]
    charset && charset.upcase
  end

  # The Encoding registered under +name+ (case-insensitive), or nil when the
  # name is nil or unknown. Encoding.find has no non-raising variant.
  def self.find_encoding(name)
    name && Encoding.find(name)
  rescue ArgumentError
    nil
  end

  private_class_method :charset_from_headers, :charset_from_meta, :find_encoding
end
@@ -0,0 +1,60 @@
1
+ # encoding: UTF-8
2
+ require 'spec_helper'
3
+
4
+ describe "GuessHtmlEncoding" do
5
+ describe "#guess" do
6
+ it "prefers headers" do
7
+ guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
8
+ "Hello: world\nContent-Type: text/html; charset=utf-8\nFoo: bar")
9
+ guess.should == "UTF-8"
10
+ end
11
+
12
+ it "accepts meta tags" do
13
+ guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head><body><div>hi!</div></body></html>')
14
+ guess.should == "UTF-8"
15
+ end
16
+
17
+ it "works okay when there is a semi-colon after the encoding with headers" do
18
+ guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
19
+ "Hello: world\nContent-Type: text/html; charset=utf-8;\nFoo: bar")
20
+ guess.should == "UTF-8"
21
+ end
22
+
23
+ it "works okay when there is a semi-colon after the encoding with meta-tags" do
24
+ guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8;"></head><body><div>hi!</div></body></html>')
25
+ guess.should == "UTF-8"
26
+ end
27
+
28
+ it "converts UTF8 to UTF-8" do
29
+ guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=utf8;"></head><body><div>hi!</div></body></html>')
30
+ guess.should == "UTF-8"
31
+ end
32
+
33
+ it "converts CP-1251 to CP1251" do
34
+ guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=cp-1251;"></head><body><div>hi!</div></body></html>')
35
+ guess.should == "CP1251"
36
+ end
37
+
38
+ it "skips the header content type if it's invalid" do
39
+ guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=utf8;"></head><body><div>hi!</div></body></html>',
40
+ "Hello: world\nContent-Type: text/html; charset=RU;\nFoo: bar")
41
+ guess.should == "UTF-8"
42
+ end
43
+
44
+ end
45
+
46
+ describe "#encode" do
47
+ it "should work on incorrectly encoded pages" do
48
+ data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
49
+ data.force_encoding("ASCII-8BIT")
50
+ data.should be_valid_encoding # everything is valid in binary
51
+
52
+ GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
53
+ data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2
54
+
55
+ encoded = GuessHtmlEncoding.encode(data)
56
+ encoded.encoding.to_s.should == "UTF-8"
57
+ encoded.should be_valid_encoding
58
+ end
59
+ end
60
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1,4 @@
1
+ --colour
2
+ --format s -c
3
+ --loadby mtime
4
+ --reverse
@@ -0,0 +1,5 @@
1
+ require 'rubygems'
2
+ require 'guess_html_encoding'
3
+
4
+ RSpec.configure do |c|
5
+ end
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: guess_html_encoding
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Andrew Cantino (Iteration Labs, LLC)
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-09-29 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &70102956990840 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70102956990840
25
+ description: This gem helps guess the encoding of an HTML page.
26
+ email:
27
+ - andrew@iterationlabs.com
28
+ executables: []
29
+ extensions: []
30
+ extra_rdoc_files: []
31
+ files:
32
+ - .gitignore
33
+ - .rvmrc
34
+ - Gemfile
35
+ - Gemfile.lock
36
+ - README
37
+ - Rakefile
38
+ - guess_html_encoding.gemspec
39
+ - lib/guess_html_encoding.rb
40
+ - lib/guess_html_encoding/version.rb
41
+ - spec/guess_html_encoding_spec.rb
42
+ - spec/spec.opts
43
+ - spec/spec_helper.rb
44
+ homepage: http://github.com/iterationlabs/guess_html_encoding
45
+ licenses: []
46
+ post_install_message:
47
+ rdoc_options: []
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirements: []
63
+ rubyforge_project: guess_html_encoding
64
+ rubygems_version: 1.8.6
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: This gem helps guess the encoding of an HTML page.
68
+ test_files:
69
+ - spec/guess_html_encoding_spec.rb
70
+ - spec/spec.opts
71
+ - spec/spec_helper.rb