nokogumbo 0.5.2 → 0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +18 -5
- data/lib/nokogumbo.rb +81 -1
- metadata +2 -2
data/README.md
CHANGED
@@ -13,18 +13,31 @@ require 'nokogumbo'
|
|
13
13
|
doc = Nokogiri::HTML5(string)
|
14
14
|
```
|
15
15
|
|
16
|
+
Because HTML is often fetched via the web, a convenience interface is also
|
17
|
+
provided:
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
require 'nokogumbo'
|
21
|
+
doc = Nokogiri::HTML5.get(uri)
|
22
|
+
```
|
16
23
|
Notes:
|
17
24
|
-----
|
18
25
|
|
19
|
-
* The `
|
26
|
+
* The `Nokogiri::HTML5.parse` function takes a string and passes it to the
|
20
27
|
<code>gumbo_parse_with_options</code> method, using the default options.
|
21
|
-
The resulting Gumbo parse tree is the walked, producing a
|
22
|
-
The original Gumbo parse tree is then destroyed, and
|
23
|
-
is
|
28
|
+
The resulting Gumbo parse tree is then walked, producing a libxml2 parse tree.
|
29
|
+
The original Gumbo parse tree is then destroyed, and a single Nokogiri Ruby
|
30
|
+
object is constructed to wrap the libxml2 parse tree. Nokogiri only produces
|
31
|
+
Ruby objects as necessary, so all scanning is done using the underlying
|
32
|
+
libxml2 libraries.
|
33
|
+
|
34
|
+
* The `Nokogiri::HTML5.get` function takes care of following redirects,
|
35
|
+
https, and determining the character encoding of the result, based on the
|
36
|
+
rules defined in the HTML5 specification for doing so.
|
24
37
|
|
25
38
|
* Instead of uppercase element names, lowercase element names are produced.
|
26
39
|
|
27
|
-
* Instead of returning
|
40
|
+
* Instead of returning `unknown` as the element name for unknown tags, the
|
28
41
|
original tag name is returned verbatim.
|
29
42
|
|
30
43
|
* The gem itself includes a copy of the Gumbo HTML5 parser.
|
data/lib/nokogumbo.rb
CHANGED
@@ -8,12 +8,92 @@ module Nokogiri
|
|
8
8
|
|
9
9
|
module HTML5
|
10
10
|
def self.parse(string)
|
11
|
+
if string.respond_to? :read
|
12
|
+
string = string.read
|
13
|
+
end
|
14
|
+
|
11
15
|
# convert to UTF-8 (Ruby 1.9+)
|
12
16
|
if string.respond_to?(:encoding) and string.encoding != Encoding::UTF_8
|
13
|
-
string = string
|
17
|
+
string = reencode(string)
|
14
18
|
end
|
15
19
|
|
16
20
|
Nokogumbo.parse(string)
|
17
21
|
end
|
22
|
+
|
23
|
+
def self.get(uri, limit=10)
|
24
|
+
require 'net/http'
|
25
|
+
uri = URI(uri) unless URI === uri
|
26
|
+
|
27
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
28
|
+
if uri.scheme == 'https'
|
29
|
+
http.use_ssl = true
|
30
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
31
|
+
end
|
32
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
33
|
+
response = http.request(request)
|
34
|
+
|
35
|
+
case response
|
36
|
+
when Net::HTTPSuccess
|
37
|
+
parse(reencode(response.body, response['content-type']))
|
38
|
+
when Net::HTTPRedirection
|
39
|
+
response.value if limit <= 1
|
40
|
+
get(response['location'], limit-1)
|
41
|
+
else
|
42
|
+
response.value
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
private
|
47
|
+
|
48
|
+
# Charset sniffing is a complex and controversial topic that understandably
|
49
|
+
# isn't done _by default_ by the Ruby Net::HTTP library. This being said,
|
50
|
+
# it is a very real problem for consumers of HTML as the default for HTML
|
51
|
+
# is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser
|
52
|
+
# *only* supports utf-8.
|
53
|
+
#
|
54
|
+
# Accordingly, Nokogiri::HTML::Document.parse provides limited encoding
|
55
|
+
# detection. Following this lead, Nokogiri::HTML5 attempts to do likewise,
|
56
|
+
# while attempting to more closely follow the HTML5 standard.
|
57
|
+
#
|
58
|
+
# http://bugs.ruby-lang.org/issues/2567
|
59
|
+
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
|
60
|
+
#
|
61
|
+
def self.reencode(body, content_type=nil)
|
62
|
+
return body unless body.respond_to? :encoding
|
63
|
+
|
64
|
+
if body.encoding == Encoding::ASCII_8BIT
|
65
|
+
encoding = nil
|
66
|
+
|
67
|
+
# look for a Byte Order Mark (BOM)
|
68
|
+
if body[0..1] == "\xFE\xFF"
|
69
|
+
encoding = 'utf-16be'
|
70
|
+
elsif body[0..1] == "\xFF\xFE"
|
71
|
+
encoding = 'utf-16le'
|
72
|
+
elsif body[0..2] == "\xEF\xBB\xBF"
|
73
|
+
encoding = 'utf-8'
|
74
|
+
end
|
75
|
+
|
76
|
+
# look for a charset in a content-type header
|
77
|
+
if content_type
|
78
|
+
encoding ||= content_type[/charset=(.*?)($|\s|;)/i, 1]
|
79
|
+
end
|
80
|
+
|
81
|
+
# look for a charset in a meta tag in the first 1024 bytes
|
82
|
+
if not encoding
|
83
|
+
data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
|
84
|
+
data.scan(/<meta.*?>/m).each do |meta|
|
85
|
+
encoding ||= meta[/charset="?(.*?)($|"|\s|>)/im, 1]
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# if all else fails, default to the official default encoding for HTML
|
90
|
+
encoding ||= Encoding::ISO_8859_1
|
91
|
+
|
92
|
+
# change the encoding to match the detected or inferred encoding
|
93
|
+
body.force_encoding(encoding)
|
94
|
+
end
|
95
|
+
|
96
|
+
body.encode(Encoding::UTF_8)
|
97
|
+
end
|
18
98
|
end
|
19
99
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: '0.6'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-08-
|
12
|
+
date: 2013-08-22 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|