guess_html_encoding 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ .idea
2
+ *.swp
3
+ .DS_Store
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use 1.9.2@guess_html_encoding --create
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in guess_html_encoding.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,24 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ guess_html_encoding (0.0.1)
5
+
6
+ GEM
7
+ remote: http://rubygems.org/
8
+ specs:
9
+ diff-lcs (1.1.3)
10
+ rspec (2.6.0)
11
+ rspec-core (~> 2.6.0)
12
+ rspec-expectations (~> 2.6.0)
13
+ rspec-mocks (~> 2.6.0)
14
+ rspec-core (2.6.4)
15
+ rspec-expectations (2.6.0)
16
+ diff-lcs (~> 1.1.2)
17
+ rspec-mocks (2.6.0)
18
+
19
+ PLATFORMS
20
+ ruby
21
+
22
+ DEPENDENCIES
23
+ guess_html_encoding!
24
+ rspec
data/README ADDED
File without changes
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "guess_html_encoding/version"
4
+
5
# Packaging manifest for the guess_html_encoding gem.
Gem::Specification.new do |spec|
  # Identity and release number (the version constant lives in lib/).
  spec.name    = "guess_html_encoding"
  spec.version = GuessHtmlEncoding::VERSION

  # Ownership and contact details.
  spec.authors  = ["Andrew Cantino (Iteration Labs, LLC)"]
  spec.email    = ["andrew@iterationlabs.com"]
  spec.homepage = "http://github.com/iterationlabs/guess_html_encoding"

  # Short and long blurbs shown by the registry; both use the same sentence.
  spec.summary     = "This gem helps guess the encoding of an HTML page."
  spec.description = "This gem helps guess the encoding of an HTML page."

  spec.rubyforge_project = "guess_html_encoding"

  # The packaged file lists are taken from what git tracks in the working copy.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = %w[lib]

  # Only a development-time dependency is declared; there are no runtime ones.
  spec.add_development_dependency "rspec"
end
@@ -0,0 +1,3 @@
1
module GuessHtmlEncoding
  # Release number of the gem, read by the gemspec.
  # Frozen so the shared constant cannot be mutated in place at runtime.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,41 @@
1
+ require "guess_html_encoding/version"
2
+
3
# Guesses the character encoding an HTML page declares for itself, using the
# HTTP response headers first and the page's <meta> tags second, and can
# re-tag/transcode the page accordingly.
module GuessHtmlEncoding
  # Captures the charset parameter of a Content-Type value.
  CHARSET_PATTERN = /charset=([\w\d-]+);?/i

  # Captures the content attribute of <meta http-equiv="Content-Type" ...>.
  HTTP_EQUIV_META_PATTERN = /<meta[^>]*HTTP-EQUIV=["']Content-Type["'][^>]*content=["']([^'"]*)["']/i

  # Captures a charset declared anywhere inside a <meta> tag, which covers the
  # HTML5 form <meta charset="..."> and attribute orders the pattern above misses.
  META_CHARSET_PATTERN = /<meta\s[^>]*?\bcharset\s*=\s*["']?([\w-]+)/i

  # Common misspellings / shorthand (upper-cased) mapped to names Ruby accepts.
  ENCODING_ALIASES = {
    "DEFAULT"  => "UTF-8",
    "UTF8"     => "UTF-8",
    "UNICODE"  => "UTF-8",
    "CP-1251"  => "CP1251",
    "LATIN1"   => "ISO-8859-1",
    "LATIN-1"  => "ISO-8859-1",
    "WIN-1251" => "WINDOWS-1251",
    "WIN1251"  => "WINDOWS-1251"
  }.freeze

  # Returns the encoding name declared for the page, or nil when none is found.
  #
  # The header charset wins when Ruby recognises it. Otherwise a charset from
  # the page's <meta> tags is used, falling back to the (unrecognised) header
  # value if the page declares nothing. Neither argument is modified.
  #
  # @param html [String] raw page body
  # @param headers [String, nil] raw response headers, one "Name: value" per line
  # @return [String, nil] upper-cased encoding name, e.g. "UTF-8"
  def self.guess(html, headers = nil)
    # Work on a binary copy so invalid byte sequences cannot break the regexps.
    html = html.to_s.dup.force_encoding("ASCII-8BIT")

    declared = canonical_name(charset_from_headers(headers))
    unless known_encoding?(declared)
      declared = canonical_name(charset_from_meta(html)) || declared
    end
    declared
  end

  # Returns a copy of the page tagged with its guessed encoding (UTF-8 when the
  # guess is missing or unknown to Ruby). If the bytes are not valid in that
  # encoding, the copy is converted to UTF-8 with offending bytes replaced.
  # The caller's string is left untouched, so frozen input is accepted.
  #
  # @param html [String] raw page body
  # @param headers [String, nil] raw response headers
  # @return [String] a new string with a valid encoding
  def self.encode(html, headers = nil)
    text = html.to_s.dup
    encoding = guess(text, headers)
    # An unrecognised name would make force_encoding raise ArgumentError.
    encoding = "UTF-8" unless known_encoding?(encoding)

    text.force_encoding(encoding)
    return text if text.valid_encoding?

    text.force_encoding("ASCII-8BIT").encode("UTF-8", :undef => :replace, :invalid => :replace)
  end

  # Upper-cased charset from the last Content-Type header line that has one.
  def self.charset_from_headers(headers)
    return nil unless headers

    raw = headers.to_s.dup.force_encoding("ASCII-8BIT")
    charsets = raw.split(/[\r\n]+/).map do |line|
      # Limit the split so colons inside the value (URLs, quoted params) survive.
      name, value = line.split(":", 2)
      value[CHARSET_PATTERN, 1] if value && name =~ /Content-Type/i
    end
    found = charsets.compact.last
    found && found.upcase
  end

  # Upper-cased charset declared by the page's <meta> tags, if any.
  def self.charset_from_meta(html)
    content = html[HTTP_EQUIV_META_PATTERN, 1]
    charset = content && content[CHARSET_PATTERN, 1]
    charset ||= html[META_CHARSET_PATTERN, 1]
    charset && charset.upcase
  end

  # Maps known aliases to a name Ruby understands; other names pass through.
  def self.canonical_name(name)
    return nil if name.nil?

    ENCODING_ALIASES.fetch(name.upcase, name)
  end

  # True when Ruby can resolve the name to an Encoding (lookup ignores case).
  def self.known_encoding?(name)
    return false if name.nil? || name.empty?

    Encoding.find(name).is_a?(Encoding)
  rescue ArgumentError
    # Encoding.find raises for names it does not know.
    false
  end

  private_class_method :charset_from_headers, :charset_from_meta,
                       :canonical_name, :known_encoding?
end
@@ -0,0 +1,60 @@
1
+ # encoding: UTF-8
2
+ require 'spec_helper'
3
+
4
describe "GuessHtmlEncoding" do
  # A page that declares no charset of its own.
  let(:bare_page) { "<html><body><div>hi!</div></body></html>" }

  describe "#guess" do
    it "prefers headers" do
      header_text = "Hello: world\nContent-Type: text/html; charset=utf-8\nFoo: bar"
      GuessHtmlEncoding.guess(bare_page, header_text).should == "UTF-8"
    end

    it "accepts meta tags" do
      page = '<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head><body><div>hi!</div></body></html>'
      GuessHtmlEncoding.guess(page).should == "UTF-8"
    end

    it "works okay when there is a semi-colon after the encoding with headers" do
      header_text = "Hello: world\nContent-Type: text/html; charset=utf-8;\nFoo: bar"
      GuessHtmlEncoding.guess(bare_page, header_text).should == "UTF-8"
    end

    it "works okay when there is a semi-colon after the encoding with meta-tags" do
      page = '<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8;"></head><body><div>hi!</div></body></html>'
      GuessHtmlEncoding.guess(page).should == "UTF-8"
    end

    it "converts UTF8 to UTF-8" do
      page = '<html><head><meta http-equiv="content-type" content="text/html; charset=utf8;"></head><body><div>hi!</div></body></html>'
      GuessHtmlEncoding.guess(page).should == "UTF-8"
    end

    it "converts CP-1251 to CP1251" do
      page = '<html><head><meta http-equiv="content-type" content="text/html; charset=cp-1251;"></head><body><div>hi!</div></body></html>'
      GuessHtmlEncoding.guess(page).should == "CP1251"
    end

    it "skips the header content type if it's invalid" do
      page = '<html><head><meta http-equiv="content-type" content="text/html; charset=utf8;"></head><body><div>hi!</div></body></html>'
      header_text = "Hello: world\nContent-Type: text/html; charset=RU;\nFoo: bar"
      GuessHtmlEncoding.guess(page, header_text).should == "UTF-8"
    end
  end

  describe "#encode" do
    it "should work on incorrectly encoded pages" do
      page = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"

      # Any byte sequence is valid when the string is tagged as binary.
      page.force_encoding("ASCII-8BIT")
      page.should be_valid_encoding

      # The page claims UTF-8, but the stray \xc2 byte makes that claim false.
      GuessHtmlEncoding.guess(page).should == "UTF-8"
      page.force_encoding("UTF-8").should_not be_valid_encoding

      result = GuessHtmlEncoding.encode(page)
      result.encoding.to_s.should == "UTF-8"
      result.should be_valid_encoding
    end
  end
end
data/spec/spec.opts ADDED
@@ -0,0 +1,4 @@
1
+ --colour
2
+ --format s -c
3
+ --loadby mtime
4
+ --reverse
@@ -0,0 +1,5 @@
1
+ require 'rubygems'
2
+ require 'guess_html_encoding'
3
+
4
+ RSpec.configure do |c|
5
+ end
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: guess_html_encoding
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Andrew Cantino (Iteration Labs, LLC)
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-09-29 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &70102956990840 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70102956990840
25
+ description: This gem helps guess the encoding of an HTML page.
26
+ email:
27
+ - andrew@iterationlabs.com
28
+ executables: []
29
+ extensions: []
30
+ extra_rdoc_files: []
31
+ files:
32
+ - .gitignore
33
+ - .rvmrc
34
+ - Gemfile
35
+ - Gemfile.lock
36
+ - README
37
+ - Rakefile
38
+ - guess_html_encoding.gemspec
39
+ - lib/guess_html_encoding.rb
40
+ - lib/guess_html_encoding/version.rb
41
+ - spec/guess_html_encoding_spec.rb
42
+ - spec/spec.opts
43
+ - spec/spec_helper.rb
44
+ homepage: http://github.com/iterationlabs/guess_html_encoding
45
+ licenses: []
46
+ post_install_message:
47
+ rdoc_options: []
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirements: []
63
+ rubyforge_project: guess_html_encoding
64
+ rubygems_version: 1.8.6
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: This gem helps guess the encoding of an HTML page.
68
+ test_files:
69
+ - spec/guess_html_encoding_spec.rb
70
+ - spec/spec.opts
71
+ - spec/spec_helper.rb