guess_html_encoding 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- guess_html_encoding (0.0.2)
4
+ guess_html_encoding (0.0.4)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2012, Andrew Cantino (Iteration Labs, LLC)
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/README.markdown CHANGED
@@ -1,52 +1,46 @@
1
- GuessHtmlEncoding is a simple library to guess the HTML encodings in Ruby 1.9
2
-
3
- describe "GuessHtmlEncoding" do
4
- describe "#guess" do
5
- it "can use headers" do
6
- guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
7
- "Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
8
- guess.should == "ISO-8859-1"
9
- end
10
-
11
- it "accepts headers as a hash as well" do
12
- guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
13
- {"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
14
- guess.should == "ISO-8859-1"
15
- end
16
-
17
- it "accepts meta tags" do
18
- guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=LATIN1"></head><body><div>hi!</div></body></html>')
19
- guess.should == "ISO-8859-1"
20
- end
21
-
22
- ...
23
- end
24
-
25
- describe "#encode" do
26
- it "should work on correctly encoded pages" do
27
- data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
28
- data.force_encoding("ASCII-8BIT")
29
- data.should be_valid_encoding # everything is valid in binary
30
-
31
- GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
32
- data.force_encoding("UTF-8").should be_valid_encoding # because it really is utf-8
33
-
34
- encoded = GuessHtmlEncoding.encode(data)
35
- encoded.encoding.to_s.should == "UTF-8"
36
- encoded.should be_valid_encoding
37
- end
38
-
39
- it "should work on incorrectly encoded pages" do
40
- data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
41
- data.force_encoding("ASCII-8BIT")
42
- data.should be_valid_encoding # everything is valid in binary
43
-
44
- GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
45
- data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
46
-
47
- encoded = GuessHtmlEncoding.encode(data)
48
- encoded.encoding.to_s.should == "UTF-8"
49
- encoded.should be_valid_encoding
50
- end
51
- end
52
- end
1
+ # GuessHTMLEncoding
2
+
3
+ GuessHTMLEncoding is a simple library to guess HTML encodings in Ruby 1.9. It considers HTTP headers and META tags.
4
+
5
+ # Install
6
+
7
+ (sudo) gem install guess_html_encoding
8
+
9
+ # Usage
10
+
11
+ GuessHTMLEncoding can guess the encoding of an HTML file based on the http-equiv content-type:
12
+
13
+ require 'rubygems'
14
+ require 'guess_html_encoding'
15
+
16
+ guess = GuessHtmlEncoding.guess(<<-HTML)
17
+ <html>
18
+ <head>
19
+ <meta http-equiv="content-type" content="text/html; charset=LATIN1">
20
+ </head>
21
+ <body>
22
+ <div>hi!</div>
23
+ </body>
24
+ </html>
25
+ HTML
26
+ guess.should == "ISO-8859-1"
27
+
28
+ You can also give it HTTP headers to guess from, which it will prefer over META tags. Headers can be passed either as a string or as a hash:
29
+
30
+ guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
31
+ "Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
32
+ guess.should == "ISO-8859-1"
33
+
34
+ guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
35
+ {"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
36
+ guess.should == "ISO-8859-1"
37
+
38
+ It's likely that you want to force the encoding of the given HTML into the guessed encoding. This is easy to do:
39
+
40
+ data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
41
+ encoded = GuessHtmlEncoding.encode(data)
42
+ encoded.encoding.to_s.should == "UTF-8"
43
+
44
+ If an encoding cannot be guessed, or Ruby doesn't understand it, UTF-8 will be used, and any invalid or undefined characters will be replaced rather than raising an error.
45
+
46
+ # Pull requests welcome!
@@ -1,3 +1,3 @@
1
1
  module GuessHtmlEncoding
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -26,10 +26,10 @@ module GuessHtmlEncoding
26
26
 
27
27
  # Translate encodings with other names.
28
28
  if out
29
- out = "UTF-8" if ["DEFAULT", "UTF8", "UNICODE"].include?(out.upcase)
30
- out = "CP1251" if out.upcase == "CP-1251"
31
- out = "ISO-8859-1" if ["LATIN1", "LATIN-1"].include?(out.upcase)
32
- out = "Windows-1250" if ["WIN-1251", "WIN1251"].include?(out.upcase)
29
+ out = "UTF-8" if %w[DEFAULT UTF8 UNICODE].include?(out)
30
+ out = "CP1251" if out == "CP-1251"
31
+ out = "ISO-8859-1" if %w[LATIN1 LATIN-1].include?(out)
32
+ out = "Windows-1250" if %w[WIN-1251 WIN1251].include?(out)
33
33
  end
34
34
 
35
35
  out
@@ -37,12 +37,13 @@ module GuessHtmlEncoding
37
37
 
38
38
  # Force an HTML string into a guessed encoding.
39
39
  def self.encode(html, headers = nil)
40
- encoding = guess(html, (headers || '').gsub(/[\r\n]+/, "\n"))
41
- html.force_encoding(encoding_loaded?(encoding) ? encoding : "UTF-8")
42
- if html.valid_encoding?
43
- html
40
+ html_copy = html.dup
41
+ encoding = guess(html_copy, (headers || '').gsub(/[\r\n]+/, "\n"))
42
+ html_copy.force_encoding(encoding_loaded?(encoding) ? encoding : "UTF-8")
43
+ if html_copy.valid_encoding?
44
+ html_copy
44
45
  else
45
- html.force_encoding('ASCII-8BIT').encode('UTF-8', :undef => :replace, :invalid => :replace)
46
+ html_copy.force_encoding('ASCII-8BIT').encode('UTF-8', :undef => :replace, :invalid => :replace)
46
47
  end
47
48
  end
48
49
 
@@ -76,7 +76,7 @@ describe "GuessHtmlEncoding" do
76
76
  encoded.should be_valid_encoding
77
77
  end
78
78
 
79
- it "should work on pages encoded with an unloaded encoding" do
79
+ it "should work on pages encoded with an unknown encoding by forcing them to utf8" do
80
80
  data = "<html><head><meta http-equiv='content-type' content='text/html; charset=x-mac-roman;'></head><body><div>hi!</div></body></html>"
81
81
  data.force_encoding("ASCII-8BIT")
82
82
  data.should be_valid_encoding # everything is valid in binary
@@ -86,6 +86,8 @@ describe "GuessHtmlEncoding" do
86
86
  encoded = GuessHtmlEncoding.encode(data)
87
87
  encoded.encoding.to_s.should == "UTF-8"
88
88
  encoded.should be_valid_encoding
89
+
90
+ data.encoding.to_s.should == "ASCII-8BIT"
89
91
  end
90
92
  end
91
93
 
@@ -95,6 +97,7 @@ describe "GuessHtmlEncoding" do
95
97
  GuessHtmlEncoding.encoding_loaded?(name).should be_true
96
98
  end
97
99
  end
100
+
98
101
  it 'returns false for irregular or unloaded encoding' do
99
102
  GuessHtmlEncoding.encoding_loaded?('_WHY').should be_false
100
103
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: guess_html_encoding
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-13 00:00:00.000000000Z
12
+ date: 2012-04-04 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70293930 !ruby/object:Gem::Requirement
16
+ requirement: &75762100 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70293930
24
+ version_requirements: *75762100
25
25
  description: This gem helps guess the encoding of an HTML page.
26
26
  email:
27
27
  - andrew@iterationlabs.com
@@ -33,6 +33,7 @@ files:
33
33
  - .rvmrc
34
34
  - Gemfile
35
35
  - Gemfile.lock
36
+ - LICENSE
36
37
  - README.markdown
37
38
  - Rakefile
38
39
  - guess_html_encoding.gemspec