guess_html_encoding 0.0.3 → 0.0.4

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- guess_html_encoding (0.0.2)
4
+ guess_html_encoding (0.0.4)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2012, Andrew Cantino (Iteration Labs, LLC)
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/README.markdown CHANGED
@@ -1,52 +1,46 @@
1
- GuessHtmlEncoding is a simple library to guess the HTML encodings in Ruby 1.9
2
-
3
- describe "GuessHtmlEncoding" do
4
- describe "#guess" do
5
- it "can use headers" do
6
- guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
7
- "Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
8
- guess.should == "ISO-8859-1"
9
- end
10
-
11
- it "accepts headers as a hash as well" do
12
- guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
13
- {"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
14
- guess.should == "ISO-8859-1"
15
- end
16
-
17
- it "accepts meta tags" do
18
- guess = GuessHtmlEncoding.guess('<html><head><meta http-equiv="content-type" content="text/html; charset=LATIN1"></head><body><div>hi!</div></body></html>')
19
- guess.should == "ISO-8859-1"
20
- end
21
-
22
- ...
23
- end
24
-
25
- describe "#encode" do
26
- it "should work on correctly encoded pages" do
27
- data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
28
- data.force_encoding("ASCII-8BIT")
29
- data.should be_valid_encoding # everything is valid in binary
30
-
31
- GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
32
- data.force_encoding("UTF-8").should be_valid_encoding # because it really is utf-8
33
-
34
- encoded = GuessHtmlEncoding.encode(data)
35
- encoded.encoding.to_s.should == "UTF-8"
36
- encoded.should be_valid_encoding
37
- end
38
-
39
- it "should work on incorrectly encoded pages" do
40
- data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!\xc2</div></body></html>"
41
- data.force_encoding("ASCII-8BIT")
42
- data.should be_valid_encoding # everything is valid in binary
43
-
44
- GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
45
- data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
46
-
47
- encoded = GuessHtmlEncoding.encode(data)
48
- encoded.encoding.to_s.should == "UTF-8"
49
- encoded.should be_valid_encoding
50
- end
51
- end
52
- end
1
+ # GuessHTMLEncoding
2
+
3
+ GuessHTMLEncoding is a simple library to guess HTML encodings in Ruby 1.9. It considers HTTP headers and META tags.
4
+
5
+ # Install
6
+
7
+ (sudo) gem install guess_html_encoding
8
+
9
+ # Usage
10
+
11
+ GuessHTMLEncoding can guess the encoding of an HTML file based on the http-equiv content-type:
12
+
13
+ require 'rubygems'
14
+ require 'guess_html_encoding'
15
+
16
+ guess = GuessHtmlEncoding.guess(<<-HTML)
17
+ <html>
18
+ <head>
19
+ <meta http-equiv="content-type" content="text/html; charset=LATIN1">
20
+ </head>
21
+ <body>
22
+ <div>hi!</div>
23
+ </body>
24
+ </html>
25
+ HTML
26
+ guess.should == "ISO-8859-1"
27
+
28
+ You can also give it HTTP headers to guess from, which it will prefer, either as a string or as a hash:
29
+
30
+ guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
31
+ "Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
32
+ guess.should == "ISO-8859-1"
33
+
34
+ guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
35
+ {"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
36
+ guess.should == "ISO-8859-1"
37
+
38
+ It's likely that you want to force the encoding of the given HTML into the guessed encoding. This is easy to do:
39
+
40
+ data = "<html><head><meta http-equiv='content-type' content='text/html; charset=utf8;'></head><body><div>hi!♥</div></body></html>"
41
+ encoded = GuessHtmlEncoding.encode(data)
42
+ encoded.encoding.to_s.should == "UTF-8"
43
+
44
+ If an encoding cannot be guessed, or Ruby doesn't understand it, UTF-8 will be used and invalid or undefined characters will be replaced.
45
+
46
+ # Pull requests welcome!
@@ -1,3 +1,3 @@
1
1
  module GuessHtmlEncoding
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -26,10 +26,10 @@ module GuessHtmlEncoding
26
26
 
27
27
  # Translate encodings with other names.
28
28
  if out
29
- out = "UTF-8" if ["DEFAULT", "UTF8", "UNICODE"].include?(out.upcase)
30
- out = "CP1251" if out.upcase == "CP-1251"
31
- out = "ISO-8859-1" if ["LATIN1", "LATIN-1"].include?(out.upcase)
32
- out = "Windows-1250" if ["WIN-1251", "WIN1251"].include?(out.upcase)
29
+ out = "UTF-8" if %w[DEFAULT UTF8 UNICODE].include?(out)
30
+ out = "CP1251" if out == "CP-1251"
31
+ out = "ISO-8859-1" if %w[LATIN1 LATIN-1].include?(out)
32
+ out = "Windows-1250" if %w[WIN-1251 WIN1251].include?(out)
33
33
  end
34
34
 
35
35
  out
@@ -37,12 +37,13 @@ module GuessHtmlEncoding
37
37
 
38
38
  # Force an HTML string into a guessed encoding.
39
39
  def self.encode(html, headers = nil)
40
- encoding = guess(html, (headers || '').gsub(/[\r\n]+/, "\n"))
41
- html.force_encoding(encoding_loaded?(encoding) ? encoding : "UTF-8")
42
- if html.valid_encoding?
43
- html
40
+ html_copy = html.dup
41
+ encoding = guess(html_copy, (headers || '').gsub(/[\r\n]+/, "\n"))
42
+ html_copy.force_encoding(encoding_loaded?(encoding) ? encoding : "UTF-8")
43
+ if html_copy.valid_encoding?
44
+ html_copy
44
45
  else
45
- html.force_encoding('ASCII-8BIT').encode('UTF-8', :undef => :replace, :invalid => :replace)
46
+ html_copy.force_encoding('ASCII-8BIT').encode('UTF-8', :undef => :replace, :invalid => :replace)
46
47
  end
47
48
  end
48
49
 
@@ -76,7 +76,7 @@ describe "GuessHtmlEncoding" do
76
76
  encoded.should be_valid_encoding
77
77
  end
78
78
 
79
- it "should work on pages encoded with an unloaded encoding" do
79
+ it "should work on pages encoded with an unknown encoding by forcing them to utf8" do
80
80
  data = "<html><head><meta http-equiv='content-type' content='text/html; charset=x-mac-roman;'></head><body><div>hi!</div></body></html>"
81
81
  data.force_encoding("ASCII-8BIT")
82
82
  data.should be_valid_encoding # everything is valid in binary
@@ -86,6 +86,8 @@ describe "GuessHtmlEncoding" do
86
86
  encoded = GuessHtmlEncoding.encode(data)
87
87
  encoded.encoding.to_s.should == "UTF-8"
88
88
  encoded.should be_valid_encoding
89
+
90
+ data.encoding.to_s.should == "ASCII-8BIT"
89
91
  end
90
92
  end
91
93
 
@@ -95,6 +97,7 @@ describe "GuessHtmlEncoding" do
95
97
  GuessHtmlEncoding.encoding_loaded?(name).should be_true
96
98
  end
97
99
  end
100
+
98
101
  it 'returns false for irregular or unloaded encoding' do
99
102
  GuessHtmlEncoding.encoding_loaded?('_WHY').should be_false
100
103
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: guess_html_encoding
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-13 00:00:00.000000000Z
12
+ date: 2012-04-04 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70293930 !ruby/object:Gem::Requirement
16
+ requirement: &75762100 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70293930
24
+ version_requirements: *75762100
25
25
  description: This gem helps guess the encoding of an HTML page.
26
26
  email:
27
27
  - andrew@iterationlabs.com
@@ -33,6 +33,7 @@ files:
33
33
  - .rvmrc
34
34
  - Gemfile
35
35
  - Gemfile.lock
36
+ - LICENSE
36
37
  - README.markdown
37
38
  - Rakefile
38
39
  - guess_html_encoding.gemspec