guess_html_encoding 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -1,3 +1,4 @@
1
1
  .idea
2
2
  *.swp
3
+ pkg
3
4
  .DS_Store
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- guess_html_encoding (0.0.1)
4
+ guess_html_encoding (0.0.2)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
data/README CHANGED
@@ -0,0 +1,52 @@
1
+ GuessHtmlEncoding is a simple library for guessing the encoding of HTML documents in Ruby 1.9.
2
+
3
+ describe "GuessHtmlEncoding" do
4
+ describe "#guess" do
5
+ it "can use headers" do
6
+ guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
7
+ "Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
8
+ guess.should == "ISO-8859-1"
9
+ end
10
+
11
+ it "accepts headers as a hash as well" do
12
+ guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
13
+ {"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
14
+ guess.should == "ISO-8859-1"
15
+ end
16
+
17
+ it "accepts meta tags" do
18
+ guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=LATIN1"></head><body><div>hi!</div></body></html>')
19
+ guess.should == "ISO-8859-1"
20
+ end
21
+
22
+ ...
23
+ end
24
+
25
+ describe "#encode" do
26
+ it "should work on correctly encoded pages" do
27
+ data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
28
+ data.force_encoding("ASCII-8BIT")
29
+ data.should be_valid_encoding # everything is valid in binary
30
+
31
+ GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
32
+ data.force_encoding("UTF-8").should be_valid_encoding # because it really is utf-8
33
+
34
+ encoded = GuessHtmlEncoding.encode(data)
35
+ encoded.encoding.to_s.should == "UTF-8"
36
+ encoded.should be_valid_encoding
37
+ end
38
+
39
+ it "should work on incorrectly encoded pages" do
40
+ data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
41
+ data.force_encoding("ASCII-8BIT")
42
+ data.should be_valid_encoding # everything is valid in binary
43
+
44
+ GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
45
+ data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
46
+
47
+ encoded = GuessHtmlEncoding.encode(data)
48
+ encoded.encoding.to_s.should == "UTF-8"
49
+ encoded.should be_valid_encoding
50
+ end
51
+ end
52
+ end
@@ -1,3 +1,3 @@
1
1
  module GuessHtmlEncoding
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -1,15 +1,20 @@
1
1
  require "guess_html_encoding/version"
2
2
 
3
+ # A small and simple library for guessing the encoding of HTML in Ruby 1.9.
3
4
  module GuessHtmlEncoding
5
+ # Guess the encoding of an HTML string, using HTTP headers if provided. HTTP headers can be a string or a hash.
4
6
  def self.guess(html, headers = nil)
5
7
  html = html.dup.force_encoding("ASCII-8BIT")
6
- headers = headers.dup.force_encoding("ASCII-8BIT") if headers
7
-
8
8
  out = nil
9
9
 
10
- (headers || "").split("\n").map {|i| i.split(":")}.each do |k,v|
11
- if k =~ /Content-Type/i && v =~ /charset=([\w\d-]+);?/i
12
- out = $1.upcase
10
+ if headers
11
+ headers = headers.map {|k, v| "#{k}: #{v}" }.join("\n") if headers.is_a?(Hash)
12
+ headers = headers.dup.force_encoding("ASCII-8BIT")
13
+ headers.split("\n").map {|i| i.split(":")}.each do |k,v|
14
+ if k =~ /Content-Type/i && v =~ /charset=([\w\d-]+);?/i
15
+ out = $1.upcase
16
+ break
17
+ end
13
18
  end
14
19
  end
15
20
 
@@ -19,6 +24,7 @@ module GuessHtmlEncoding
19
24
  end
20
25
  end
21
26
 
27
+ # Normalize alternative spellings/aliases of an encoding (e.g. "UTF8", "CP-1251") to their canonical names.
22
28
  if out
23
29
  out = "UTF-8" if ["DEFAULT", "UTF8", "UNICODE"].include?(out.upcase)
24
30
  out = "CP1251" if out.upcase == "CP-1251"
@@ -29,6 +35,7 @@ module GuessHtmlEncoding
29
35
  out
30
36
  end
31
37
 
38
+ # Force an HTML string into its guessed encoding, falling back to UTF-8 when no encoding could be guessed.
32
39
  def self.encode(html, headers = nil)
33
40
  encoding = guess(html, (headers || '').gsub(/[\r\n]+/, "\n"))
34
41
  html.force_encoding(encoding ? encoding : "UTF-8")
@@ -3,15 +3,21 @@ require 'spec_helper'
3
3
 
4
4
  describe "GuessHtmlEncoding" do
5
5
  describe "#guess" do
6
- it "prefers headers" do
6
+ it "can use headers" do
7
7
  guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
8
- "Hello: world\nContent-Type: text/html; charset=utf-8\nFoo: bar")
9
- guess.should == "UTF-8"
8
+ "Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
9
+ guess.should == "ISO-8859-1"
10
+ end
11
+
12
+ it "accepts headers as a hash as well" do
13
+ guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
14
+ {"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
15
+ guess.should == "ISO-8859-1"
10
16
  end
11
17
 
12
18
  it "accepts meta tags" do
13
- guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head><body><div>hi!</div></body></html>')
14
- guess.should == "UTF-8"
19
+ guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=LATIN1"></head><body><div>hi!</div></body></html>')
20
+ guess.should == "ISO-8859-1"
15
21
  end
16
22
 
17
23
  it "works okay when there is a semi-colon after the encoding with headers" do
@@ -44,13 +50,26 @@ describe "GuessHtmlEncoding" do
44
50
  end
45
51
 
46
52
  describe "#encode" do
53
+ it "should work on correctly encoded pages" do
54
+ data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
55
+ data.force_encoding("ASCII-8BIT")
56
+ data.should be_valid_encoding # everything is valid in binary
57
+
58
+ GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
59
+ data.force_encoding("UTF-8").should be_valid_encoding # because it really is utf-8
60
+
61
+ encoded = GuessHtmlEncoding.encode(data)
62
+ encoded.encoding.to_s.should == "UTF-8"
63
+ encoded.should be_valid_encoding
64
+ end
65
+
47
66
  it "should work on incorrectly encoded pages" do
48
67
  data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
49
68
  data.force_encoding("ASCII-8BIT")
50
69
  data.should be_valid_encoding # everything is valid in binary
51
70
 
52
71
  GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
53
- data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2
72
+ data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
54
73
 
55
74
  encoded = GuessHtmlEncoding.encode(data)
56
75
  encoded.encoding.to_s.should == "UTF-8"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: guess_html_encoding
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-09-29 00:00:00.000000000Z
12
+ date: 2011-10-26 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70102956990840 !ruby/object:Gem::Requirement
16
+ requirement: &70345601879200 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70102956990840
24
+ version_requirements: *70345601879200
25
25
  description: This gem helps guess the encoding of an HTML page.
26
26
  email:
27
27
  - andrew@iterationlabs.com