nokogumbo 0.5.2 → 0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +18 -5
- data/lib/nokogumbo.rb +81 -1
- metadata +2 -2
data/README.md
CHANGED
@@ -13,18 +13,31 @@ require 'nokogumbo'
|
|
13
13
|
doc = Nokogiri::HTML5(string)
|
14
14
|
```
|
15
15
|
|
16
|
+
Because HTML is often fetched via the web, a convenience interface is also
|
17
|
+
provided:
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
require 'nokogumbo'
|
21
|
+
doc = Nokogiri::HTML5.get(uri)
|
22
|
+
```
|
16
23
|
Notes:
|
17
24
|
-----
|
18
25
|
|
19
|
-
* The `
|
26
|
+
* The `Nokogiri::HTML5.parse` function takes a string and passes it to the
|
20
27
|
<code>gumbo_parse_with_options</code> method, using the default options.
|
21
|
-
The resulting Gumbo parse tree is the walked, producing a
|
22
|
-
The original Gumbo parse tree is then destroyed, and
|
23
|
-
is
|
28
|
+
The resulting Gumbo parse tree is then walked, producing a libxml2 parse tree.
|
29
|
+
The original Gumbo parse tree is then destroyed, and a single Nokogiri Ruby
|
30
|
+
object is constructed to wrap the libxml2 parse tree. Nokogiri only produces
|
31
|
+
Ruby objects as necessary, so all scanning is done using the underlying
|
32
|
+
libxml2 libraries.
|
33
|
+
|
34
|
+
* The `Nokogiri::HTML5.get` function takes care of following redirects,
|
35
|
+
https, and determining the character encoding of the result, based on the
|
36
|
+
rules defined in the HTML5 specification for doing so.
|
24
37
|
|
25
38
|
* Instead of uppercase element names, lowercase element names are produced.
|
26
39
|
|
27
|
-
* Instead of returning
|
40
|
+
* Instead of returning `unknown` as the element name for unknown tags, the
|
28
41
|
original tag name is returned verbatim.
|
29
42
|
|
30
43
|
* The gem itself includes a copy of the Gumbo HTML5 parser.
|
data/lib/nokogumbo.rb
CHANGED
@@ -8,12 +8,92 @@ module Nokogiri
|
|
8
8
|
|
9
9
|
module HTML5
|
10
10
|
def self.parse(string)
|
11
|
+
if string.respond_to? :read
|
12
|
+
string = string.read
|
13
|
+
end
|
14
|
+
|
11
15
|
# convert to UTF-8 (Ruby 1.9+)
|
12
16
|
if string.respond_to?(:encoding) and string.encoding != Encoding::UTF_8
|
13
|
-
string = string
|
17
|
+
string = reencode(string)
|
14
18
|
end
|
15
19
|
|
16
20
|
Nokogumbo.parse(string)
|
17
21
|
end
|
22
|
+
|
23
|
+
def self.get(uri, limit=10)
|
24
|
+
require 'net/http'
|
25
|
+
uri = URI(uri) unless URI === uri
|
26
|
+
|
27
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
28
|
+
if uri.scheme == 'https'
|
29
|
+
http.use_ssl = true
|
30
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
31
|
+
end
|
32
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
33
|
+
response = http.request(request)
|
34
|
+
|
35
|
+
case response
|
36
|
+
when Net::HTTPSuccess
|
37
|
+
parse(reencode(response.body, response['content-type']))
|
38
|
+
when Net::HTTPRedirection
|
39
|
+
response.value if limit <= 1
|
40
|
+
get(response['location'], limit-1)
|
41
|
+
else
|
42
|
+
response.value
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
private
|
47
|
+
|
48
|
+
# Charset sniffing is a complex and controversial topic that understandably
|
49
|
+
# isn't done _by default_ by the Ruby Net::HTTP library. This being said,
|
50
|
+
# it is a very real problem for consumers of HTML as the default for HTML
|
51
|
+
# is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser
|
52
|
+
# *only* supports utf-8.
|
53
|
+
#
|
54
|
+
# Accordingly, Nokogiri::HTML::Document.parse provides limited encoding
|
55
|
+
# detection. Following this lead, Nokogiri::HTML5 attempts to do likewise,
|
56
|
+
# while attempting to more closely follow the HTML5 standard.
|
57
|
+
#
|
58
|
+
# http://bugs.ruby-lang.org/issues/2567
|
59
|
+
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
|
60
|
+
#
|
61
|
+
def self.reencode(body, content_type=nil)
|
62
|
+
return body unless body.respond_to? :encoding
|
63
|
+
|
64
|
+
if body.encoding == Encoding::ASCII_8BIT
|
65
|
+
encoding = nil
|
66
|
+
|
67
|
+
# look for a Byte Order Mark (BOM)
|
68
|
+
if body[0..1] == "\xFE\xFF"
|
69
|
+
encoding = 'utf-16be'
|
70
|
+
elsif body[0..1] == "\xFF\xFE"
|
71
|
+
encoding = 'utf-16le'
|
72
|
+
elsif body[0..2] == "\xEF\xBB\xBF"
|
73
|
+
encoding = 'utf-8'
|
74
|
+
end
|
75
|
+
|
76
|
+
# look for a charset in a content-type header
|
77
|
+
if content_type
|
78
|
+
encoding ||= content_type[/charset=(.*?)($|\s|;)/i, 1]
|
79
|
+
end
|
80
|
+
|
81
|
+
# look for a charset in a meta tag in the first 1024 bytes
|
82
|
+
if not encoding
|
83
|
+
data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
|
84
|
+
data.scan(/<meta.*?>/m).each do |meta|
|
85
|
+
encoding ||= meta[/charset="?(.*?)($|"|\s|>)/im, 1]
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# if all else fails, default to the official default encoding for HTML
|
90
|
+
encoding ||= Encoding::ISO_8859_1
|
91
|
+
|
92
|
+
# change the encoding to match the detected or inferred encoding
|
93
|
+
body.force_encoding(encoding)
|
94
|
+
end
|
95
|
+
|
96
|
+
body.encode(Encoding::UTF_8)
|
97
|
+
end
|
18
98
|
end
|
19
99
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: '0.6'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-08-
|
12
|
+
date: 2013-08-22 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|