guess_html_encoding 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -1,3 +1,4 @@
1
1
  .idea
2
2
  *.swp
3
+ pkg
3
4
  .DS_Store
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- guess_html_encoding (0.0.1)
4
+ guess_html_encoding (0.0.2)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
data/README CHANGED
@@ -0,0 +1,52 @@
1
+ GuessHtmlEncoding is a simple library for guessing the encoding of HTML documents in Ruby 1.9.
2
+
3
+ describe "GuessHtmlEncoding" do
4
+ describe "#guess" do
5
+ it "can use headers" do
6
+ guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
7
+ "Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
8
+ guess.should == "ISO-8859-1"
9
+ end
10
+
11
+ it "accepts headers as a hash as well" do
12
+ guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
13
+ {"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
14
+ guess.should == "ISO-8859-1"
15
+ end
16
+
17
+ it "accepts meta tags" do
18
+ guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=LATIN1"></head><body><div>hi!</div></body></html>')
19
+ guess.should == "ISO-8859-1"
20
+ end
21
+
22
+ ...
23
+ end
24
+
25
+ describe "#encode" do
26
+ it "should work on correctly encoded pages" do
27
+ data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
28
+ data.force_encoding("ASCII-8BIT")
29
+ data.should be_valid_encoding # everything is valid in binary
30
+
31
+ GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
32
+ data.force_encoding("UTF-8").should be_valid_encoding # because it really is utf-8
33
+
34
+ encoded = GuessHtmlEncoding.encode(data)
35
+ encoded.encoding.to_s.should == "UTF-8"
36
+ encoded.should be_valid_encoding
37
+ end
38
+
39
+ it "should work on incorrectly encoded pages" do
40
+ data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
41
+ data.force_encoding("ASCII-8BIT")
42
+ data.should be_valid_encoding # everything is valid in binary
43
+
44
+ GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
45
+ data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
46
+
47
+ encoded = GuessHtmlEncoding.encode(data)
48
+ encoded.encoding.to_s.should == "UTF-8"
49
+ encoded.should be_valid_encoding
50
+ end
51
+ end
52
+ end
@@ -1,3 +1,3 @@
1
1
  module GuessHtmlEncoding
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -1,15 +1,20 @@
1
1
  require "guess_html_encoding/version"
2
2
 
3
+ # A small and simple library for guessing the encoding of HTML in Ruby 1.9.
3
4
  module GuessHtmlEncoding
5
+ # Guess the encoding of an HTML string, using HTTP headers if provided. HTTP headers can be a string or a hash.
4
6
  def self.guess(html, headers = nil)
5
7
  html = html.dup.force_encoding("ASCII-8BIT")
6
- headers = headers.dup.force_encoding("ASCII-8BIT") if headers
7
-
8
8
  out = nil
9
9
 
10
- (headers || "").split("\n").map {|i| i.split(":")}.each do |k,v|
11
- if k =~ /Content-Type/i && v =~ /charset=([\w\d-]+);?/i
12
- out = $1.upcase
10
+ if headers
11
+ headers = headers.map {|k, v| "#{k}: #{v}" }.join("\n") if headers.is_a?(Hash)
12
+ headers = headers.dup.force_encoding("ASCII-8BIT")
13
+ headers.split("\n").map {|i| i.split(":")}.each do |k,v|
14
+ if k =~ /Content-Type/i && v =~ /charset=([\w\d-]+);?/i
15
+ out = $1.upcase
16
+ break
17
+ end
13
18
  end
14
19
  end
15
20
 
@@ -19,6 +24,7 @@ module GuessHtmlEncoding
19
24
  end
20
25
  end
21
26
 
27
+ # Normalize alias names (e.g. "UTF8", "CP-1251") to their canonical encoding names.
22
28
  if out
23
29
  out = "UTF-8" if ["DEFAULT", "UTF8", "UNICODE"].include?(out.upcase)
24
30
  out = "CP1251" if out.upcase == "CP-1251"
@@ -29,6 +35,7 @@ module GuessHtmlEncoding
29
35
  out
30
36
  end
31
37
 
38
+ # Force an HTML string into a guessed encoding.
32
39
  def self.encode(html, headers = nil)
33
40
  encoding = guess(html, (headers || '').gsub(/[\r\n]+/, "\n"))
34
41
  html.force_encoding(encoding ? encoding : "UTF-8")
@@ -3,15 +3,21 @@ require 'spec_helper'
3
3
 
4
4
  describe "GuessHtmlEncoding" do
5
5
  describe "#guess" do
6
- it "prefers headers" do
6
+ it "can use headers" do
7
7
  guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
8
- "Hello: world\nContent-Type: text/html; charset=utf-8\nFoo: bar")
9
- guess.should == "UTF-8"
8
+ "Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
9
+ guess.should == "ISO-8859-1"
10
+ end
11
+
12
+ it "accepts headers as a hash as well" do
13
+ guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
14
+ {"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
15
+ guess.should == "ISO-8859-1"
10
16
  end
11
17
 
12
18
  it "accepts meta tags" do
13
- guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head><body><div>hi!</div></body></html>')
14
- guess.should == "UTF-8"
19
+ guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=LATIN1"></head><body><div>hi!</div></body></html>')
20
+ guess.should == "ISO-8859-1"
15
21
  end
16
22
 
17
23
  it "works okay when there is a semi-colon after the encoding with headers" do
@@ -44,13 +50,26 @@ describe "GuessHtmlEncoding" do
44
50
  end
45
51
 
46
52
  describe "#encode" do
53
+ it "should work on correctly encoded pages" do
54
+ data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
55
+ data.force_encoding("ASCII-8BIT")
56
+ data.should be_valid_encoding # everything is valid in binary
57
+
58
+ GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
59
+ data.force_encoding("UTF-8").should be_valid_encoding # because it really is utf-8
60
+
61
+ encoded = GuessHtmlEncoding.encode(data)
62
+ encoded.encoding.to_s.should == "UTF-8"
63
+ encoded.should be_valid_encoding
64
+ end
65
+
47
66
  it "should work on incorrectly encoded pages" do
48
67
  data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
49
68
  data.force_encoding("ASCII-8BIT")
50
69
  data.should be_valid_encoding # everything is valid in binary
51
70
 
52
71
  GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
53
- data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2
72
+ data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
54
73
 
55
74
  encoded = GuessHtmlEncoding.encode(data)
56
75
  encoded.encoding.to_s.should == "UTF-8"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: guess_html_encoding
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-09-29 00:00:00.000000000Z
12
+ date: 2011-10-26 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70102956990840 !ruby/object:Gem::Requirement
16
+ requirement: &70345601879200 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70102956990840
24
+ version_requirements: *70345601879200
25
25
  description: This gem helps guess the encoding of an HTML page.
26
26
  email:
27
27
  - andrew@iterationlabs.com