guess_html_encoding 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
 - data/Gemfile.lock +1 -1
 - data/README +52 -0
 - data/lib/guess_html_encoding/version.rb +1 -1
 - data/lib/guess_html_encoding.rb +12 -5
 - data/spec/guess_html_encoding_spec.rb +25 -6
 - metadata +4 -4
 
    
        data/.gitignore
    CHANGED
    
    
    
        data/Gemfile.lock
    CHANGED
    
    
    
        data/README
    CHANGED
    
    | 
         @@ -0,0 +1,52 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            GuessHtmlEncoding is a simple library to guess the HTML encodings in Ruby 1.9
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            describe "GuessHtmlEncoding" do
         
     | 
| 
      
 4 
     | 
    
         
            +
              describe "#guess" do
         
     | 
| 
      
 5 
     | 
    
         
            +
                it "can use headers" do
         
     | 
| 
      
 6 
     | 
    
         
            +
                  guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
         
     | 
| 
      
 7 
     | 
    
         
            +
                                                  "Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
         
     | 
| 
      
 8 
     | 
    
         
            +
                  guess.should == "ISO-8859-1"
         
     | 
| 
      
 9 
     | 
    
         
            +
                end
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
                it "accepts headers as a hash as well" do
         
     | 
| 
      
 12 
     | 
    
         
            +
                  guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
         
     | 
| 
      
 13 
     | 
    
         
            +
                      {"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
         
     | 
| 
      
 14 
     | 
    
         
            +
                  guess.should == "ISO-8859-1"
         
     | 
| 
      
 15 
     | 
    
         
            +
                end
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
                it "accepts meta tags" do
         
     | 
| 
      
 18 
     | 
    
         
            +
                  guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=LATIN1"></head><body><div>hi!</div></body></html>')
         
     | 
| 
      
 19 
     | 
    
         
            +
                  guess.should == "ISO-8859-1"
         
     | 
| 
      
 20 
     | 
    
         
            +
                end
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
                ...
         
     | 
| 
      
 23 
     | 
    
         
            +
              end
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
              describe "#encode" do
         
     | 
| 
      
 26 
     | 
    
         
            +
                it "should work on correctly encoded pages" do
         
     | 
| 
      
 27 
     | 
    
         
            +
                  data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
         
     | 
| 
      
 28 
     | 
    
         
            +
                  data.force_encoding("ASCII-8BIT")
         
     | 
| 
      
 29 
     | 
    
         
            +
                  data.should be_valid_encoding # everything is valid in binary
         
     | 
| 
      
 30 
     | 
    
         
            +
             
     | 
| 
      
 31 
     | 
    
         
            +
                  GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
         
     | 
| 
      
 32 
     | 
    
         
            +
                  data.force_encoding("UTF-8").should be_valid_encoding # because it really is utf-8
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
                  encoded = GuessHtmlEncoding.encode(data)
         
     | 
| 
      
 35 
     | 
    
         
            +
                  encoded.encoding.to_s.should == "UTF-8"
         
     | 
| 
      
 36 
     | 
    
         
            +
                  encoded.should be_valid_encoding
         
     | 
| 
      
 37 
     | 
    
         
            +
                end
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
                it "should work on incorrectly encoded pages" do
         
     | 
| 
      
 40 
     | 
    
         
            +
                  data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
         
     | 
| 
      
 41 
     | 
    
         
            +
                  data.force_encoding("ASCII-8BIT")
         
     | 
| 
      
 42 
     | 
    
         
            +
                  data.should be_valid_encoding # everything is valid in binary
         
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
                  GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
         
     | 
| 
      
 45 
     | 
    
         
            +
                  data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
         
     | 
| 
      
 46 
     | 
    
         
            +
             
     | 
| 
      
 47 
     | 
    
         
            +
                  encoded = GuessHtmlEncoding.encode(data)
         
     | 
| 
      
 48 
     | 
    
         
            +
                  encoded.encoding.to_s.should == "UTF-8"
         
     | 
| 
      
 49 
     | 
    
         
            +
                  encoded.should be_valid_encoding
         
     | 
| 
      
 50 
     | 
    
         
            +
                end
         
     | 
| 
      
 51 
     | 
    
         
            +
              end
         
     | 
| 
      
 52 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/guess_html_encoding.rb
    CHANGED
    
    | 
         @@ -1,15 +1,20 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            require "guess_html_encoding/version"
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
      
 3 
     | 
    
         
            +
            # A small and simple library for guessing the encoding of HTML in Ruby 1.9.
         
     | 
| 
       3 
4 
     | 
    
         
             
            module GuessHtmlEncoding
         
     | 
| 
      
 5 
     | 
    
         
            +
              # Guess the encoding of an HTML string, using HTTP headers if provided.  HTTP headers can be a string or a hash.
         
     | 
| 
       4 
6 
     | 
    
         
             
              def self.guess(html, headers = nil)
         
     | 
| 
       5 
7 
     | 
    
         
             
                html = html.dup.force_encoding("ASCII-8BIT")
         
     | 
| 
       6 
     | 
    
         
            -
                headers = headers.dup.force_encoding("ASCII-8BIT") if headers
         
     | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
       8 
8 
     | 
    
         
             
                out = nil
         
     | 
| 
       9 
9 
     | 
    
         | 
| 
       10 
     | 
    
         
            -
                 
     | 
| 
       11 
     | 
    
         
            -
                   
     | 
| 
       12 
     | 
    
         
            -
             
     | 
| 
      
 10 
     | 
    
         
            +
                if headers
         
     | 
| 
      
 11 
     | 
    
         
            +
                  headers = headers.map {|k, v| "#{k}: #{v}" }.join("\n") if headers.is_a?(Hash)
         
     | 
| 
      
 12 
     | 
    
         
            +
                  headers = headers.dup.force_encoding("ASCII-8BIT")
         
     | 
| 
      
 13 
     | 
    
         
            +
                  headers.split("\n").map {|i| i.split(":")}.each do |k,v|
         
     | 
| 
      
 14 
     | 
    
         
            +
                    if k =~ /Content-Type/i && v =~ /charset=([\w\d-]+);?/i
         
     | 
| 
      
 15 
     | 
    
         
            +
                      out = $1.upcase
         
     | 
| 
      
 16 
     | 
    
         
            +
                      break
         
     | 
| 
      
 17 
     | 
    
         
            +
                    end
         
     | 
| 
       13 
18 
     | 
    
         
             
                  end
         
     | 
| 
       14 
19 
     | 
    
         
             
                end
         
     | 
| 
       15 
20 
     | 
    
         | 
| 
         @@ -19,6 +24,7 @@ module GuessHtmlEncoding 
     | 
|
| 
       19 
24 
     | 
    
         
             
                  end
         
     | 
| 
       20 
25 
     | 
    
         
             
                end
         
     | 
| 
       21 
26 
     | 
    
         | 
| 
      
 27 
     | 
    
         
            +
                # Translate encodings with other names.
         
     | 
| 
       22 
28 
     | 
    
         
             
                if out
         
     | 
| 
       23 
29 
     | 
    
         
             
                  out = "UTF-8" if ["DEFAULT", "UTF8", "UNICODE"].include?(out.upcase)
         
     | 
| 
       24 
30 
     | 
    
         
             
                  out = "CP1251" if out.upcase == "CP-1251"
         
     | 
| 
         @@ -29,6 +35,7 @@ module GuessHtmlEncoding 
     | 
|
| 
       29 
35 
     | 
    
         
             
                out
         
     | 
| 
       30 
36 
     | 
    
         
             
              end
         
     | 
| 
       31 
37 
     | 
    
         | 
| 
      
 38 
     | 
    
         
            +
              # Force an HTML string into a guessed encoding.
         
     | 
| 
       32 
39 
     | 
    
         
             
              def self.encode(html, headers = nil)
         
     | 
| 
       33 
40 
     | 
    
         
             
                encoding = guess(html, (headers || '').gsub(/[\r\n]+/, "\n"))
         
     | 
| 
       34 
41 
     | 
    
         
             
                html.force_encoding(encoding ? encoding : "UTF-8")
         
     | 
| 
         @@ -3,15 +3,21 @@ require 'spec_helper' 
     | 
|
| 
       3 
3 
     | 
    
         | 
| 
       4 
4 
     | 
    
         
             
            describe "GuessHtmlEncoding" do
         
     | 
| 
       5 
5 
     | 
    
         
             
              describe "#guess" do
         
     | 
| 
       6 
     | 
    
         
            -
                it " 
     | 
| 
      
 6 
     | 
    
         
            +
                it "can use headers" do
         
     | 
| 
       7 
7 
     | 
    
         
             
                  guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
         
     | 
| 
       8 
     | 
    
         
            -
                                                  "Hello: world\nContent-Type: text/html; charset= 
     | 
| 
       9 
     | 
    
         
            -
                  guess.should == " 
     | 
| 
      
 8 
     | 
    
         
            +
                                                  "Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
         
     | 
| 
      
 9 
     | 
    
         
            +
                  guess.should == "ISO-8859-1"
         
     | 
| 
      
 10 
     | 
    
         
            +
                end
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
                it "accepts headers as a hash as well" do
         
     | 
| 
      
 13 
     | 
    
         
            +
                  guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
         
     | 
| 
      
 14 
     | 
    
         
            +
                      {"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
         
     | 
| 
      
 15 
     | 
    
         
            +
                  guess.should == "ISO-8859-1"
         
     | 
| 
       10 
16 
     | 
    
         
             
                end
         
     | 
| 
       11 
17 
     | 
    
         | 
| 
       12 
18 
     | 
    
         
             
                it "accepts meta tags" do
         
     | 
| 
       13 
     | 
    
         
            -
                  guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset= 
     | 
| 
       14 
     | 
    
         
            -
                  guess.should == " 
     | 
| 
      
 19 
     | 
    
         
            +
                  guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=LATIN1"></head><body><div>hi!</div></body></html>')
         
     | 
| 
      
 20 
     | 
    
         
            +
                  guess.should == "ISO-8859-1"
         
     | 
| 
       15 
21 
     | 
    
         
             
                end
         
     | 
| 
       16 
22 
     | 
    
         | 
| 
       17 
23 
     | 
    
         
             
                it "works okay when there is a semi-colon after the encoding with headers" do
         
     | 
| 
         @@ -44,13 +50,26 @@ describe "GuessHtmlEncoding" do 
     | 
|
| 
       44 
50 
     | 
    
         
             
              end
         
     | 
| 
       45 
51 
     | 
    
         | 
| 
       46 
52 
     | 
    
         
             
              describe "#encode" do
         
     | 
| 
      
 53 
     | 
    
         
            +
                it "should work on correctly encoded pages" do
         
     | 
| 
      
 54 
     | 
    
         
            +
                  data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
         
     | 
| 
      
 55 
     | 
    
         
            +
                  data.force_encoding("ASCII-8BIT")
         
     | 
| 
      
 56 
     | 
    
         
            +
                  data.should be_valid_encoding # everything is valid in binary
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
      
 58 
     | 
    
         
            +
                  GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
         
     | 
| 
      
 59 
     | 
    
         
            +
                  data.force_encoding("UTF-8").should be_valid_encoding # because it really is utf-8
         
     | 
| 
      
 60 
     | 
    
         
            +
             
     | 
| 
      
 61 
     | 
    
         
            +
                  encoded = GuessHtmlEncoding.encode(data)
         
     | 
| 
      
 62 
     | 
    
         
            +
                  encoded.encoding.to_s.should == "UTF-8"
         
     | 
| 
      
 63 
     | 
    
         
            +
                  encoded.should be_valid_encoding
         
     | 
| 
      
 64 
     | 
    
         
            +
                end
         
     | 
| 
      
 65 
     | 
    
         
            +
             
     | 
| 
       47 
66 
     | 
    
         
             
                it "should work on incorrectly encoded pages" do
         
     | 
| 
       48 
67 
     | 
    
         
             
                  data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
         
     | 
| 
       49 
68 
     | 
    
         
             
                  data.force_encoding("ASCII-8BIT")
         
     | 
| 
       50 
69 
     | 
    
         
             
                  data.should be_valid_encoding # everything is valid in binary
         
     | 
| 
       51 
70 
     | 
    
         | 
| 
       52 
71 
     | 
    
         
             
                  GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
         
     | 
| 
       53 
     | 
    
         
            -
                  data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2
         
     | 
| 
      
 72 
     | 
    
         
            +
                  data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
         
     | 
| 
       54 
73 
     | 
    
         | 
| 
       55 
74 
     | 
    
         
             
                  encoded = GuessHtmlEncoding.encode(data)
         
     | 
| 
       56 
75 
     | 
    
         
             
                  encoded.encoding.to_s.should == "UTF-8"
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: guess_html_encoding
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.0.1
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.0.2
         
     | 
| 
       5 
5 
     | 
    
         
             
              prerelease: 
         
     | 
| 
       6 
6 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       7 
7 
     | 
    
         
             
            authors:
         
     | 
| 
         @@ -9,11 +9,11 @@ authors: 
     | 
|
| 
       9 
9 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       10 
10 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       11 
11 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       12 
     | 
    
         
            -
            date: 2011- 
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2011-10-26 00:00:00.000000000Z
         
     | 
| 
       13 
13 
     | 
    
         
             
            dependencies:
         
     | 
| 
       14 
14 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       15 
15 
     | 
    
         
             
              name: rspec
         
     | 
| 
       16 
     | 
    
         
            -
              requirement: & 
     | 
| 
      
 16 
     | 
    
         
            +
              requirement: &70345601879200 !ruby/object:Gem::Requirement
         
     | 
| 
       17 
17 
     | 
    
         
             
                none: false
         
     | 
| 
       18 
18 
     | 
    
         
             
                requirements:
         
     | 
| 
       19 
19 
     | 
    
         
             
                - - ! '>='
         
     | 
| 
         @@ -21,7 +21,7 @@ dependencies: 
     | 
|
| 
       21 
21 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       22 
22 
     | 
    
         
             
              type: :development
         
     | 
| 
       23 
23 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       24 
     | 
    
         
            -
              version_requirements: * 
     | 
| 
      
 24 
     | 
    
         
            +
              version_requirements: *70345601879200
         
     | 
| 
       25 
25 
     | 
    
         
             
            description: This gem helps guess the encoding of an HTML page.
         
     | 
| 
       26 
26 
     | 
    
         
             
            email:
         
     | 
| 
       27 
27 
     | 
    
         
             
            - andrew@iterationlabs.com
         
     |