nokogumbo 1.5.0 → 2.0.0.pre.alpha
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +56 -0
- data/README.md +146 -22
- data/ext/nokogumbo/extconf.rb +116 -0
- data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
- data/gumbo-parser/src/ascii.c +33 -0
- data/gumbo-parser/src/ascii.h +31 -0
- data/gumbo-parser/src/attribute.c +26 -28
- data/gumbo-parser/src/attribute.h +3 -23
- data/gumbo-parser/src/char_ref.c +135 -2351
- data/gumbo-parser/src/char_ref.h +13 -29
- data/gumbo-parser/src/error.c +215 -133
- data/gumbo-parser/src/error.h +34 -49
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/gumbo.h +506 -304
- data/gumbo-parser/src/insertion_mode.h +4 -28
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +1989 -1431
- data/gumbo-parser/src/parser.h +6 -22
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +43 -50
- data/gumbo-parser/src/string_buffer.h +24 -40
- data/gumbo-parser/src/string_piece.c +39 -39
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/tag.c +186 -59
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_type.h +1 -25
- data/gumbo-parser/src/tokenizer.c +899 -495
- data/gumbo-parser/src/tokenizer.h +37 -37
- data/gumbo-parser/src/tokenizer_states.h +6 -22
- data/gumbo-parser/src/utf8.c +103 -86
- data/gumbo-parser/src/utf8.h +37 -41
- data/gumbo-parser/src/util.c +48 -38
- data/gumbo-parser/src/util.h +10 -40
- data/gumbo-parser/src/vector.c +45 -57
- data/gumbo-parser/src/vector.h +17 -39
- data/lib/nokogumbo.rb +10 -174
- data/lib/nokogumbo/html5.rb +250 -0
- data/lib/nokogumbo/html5/document.rb +37 -0
- data/lib/nokogumbo/html5/document_fragment.rb +46 -0
- data/lib/nokogumbo/version.rb +3 -0
- data/lib/nokogumbo/xml/node.rb +57 -0
- metadata +32 -19
- data/ext/nokogumboc/extconf.rb +0 -60
- data/gumbo-parser/src/char_ref.rl +0 -2554
- data/gumbo-parser/src/string_piece.h +0 -38
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -153
- data/gumbo-parser/src/tag_gperf.h +0 -105
- data/gumbo-parser/src/tag_sizes.h +0 -4
- data/gumbo-parser/src/tag_strings.h +0 -153
- data/gumbo-parser/visualc/include/strings.h +0 -4
- data/test-nokogumbo.rb +0 -190
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e0d434c0749d7922ba8f084c15ed7219ccbf0e07b715368ae846bc38e64aad17
|
4
|
+
data.tar.gz: 2770648e3e9e82d0ffb1877f1c06edc537688cf6a8405bc52dbdf5a6bb69bc1a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e6c3de49495bf55ccaa250e2a3275b6796b0f0565da2a930e3333d2a153f2a16312eb77cb28ca3e03c17720127c2ecc27a1f71cfd6acfd15407295c29973e9fb
|
7
|
+
data.tar.gz: e8ce6c80cb2327d2327f03c7e829156c1f0074ba4d6fce2b0d59305b80112b8fd5edc0932fad1fca13cb5f4bb6f2652fe52a2f090110aa76d06e1afbdebc334f
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
All notable changes to Nokogumbo will be documented in this file.
|
4
|
+
|
5
|
+
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
6
|
+
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
7
|
+
|
8
|
+
## [Unreleased]
|
9
|
+
### Added
|
10
|
+
- Experimental support for errors (it was supported in 1.5.0 but
|
11
|
+
undocumented).
|
12
|
+
- Added proper HTML5 serialization.
|
13
|
+
- Added option `:max_tree_depth` to control the maximum parse tree depth.
|
14
|
+
|
15
|
+
### Changed
|
16
|
+
- Integrated [Gumbo parser](https://github.com/google/gumbo-parser) into
|
17
|
+
Nokogumbo. A system version will not be used.
|
18
|
+
- The undocumented (but publicly mentioned) `:max_parse_errors` renamed to `:max_errors`;
|
19
|
+
`:max_parse_errors` is deprecated and will go away.
|
20
|
+
- The various `#parse` and `#fragment` (and `Nokogiri.HTML5`) methods return
|
21
|
+
`Nokogiri::HTML5::Document` and `Nokogiri::HTML5::DocumentFragment` classes
|
22
|
+
rather than `Nokogiri::HTML::Document` and
|
23
|
+
`Nokogiri::HTML::DocumentFragment`.
|
24
|
+
- Changed the top-level API to more closely match Nokogiri's while maintaining
|
25
|
+
backwards compatibility. The new APIs are
|
26
|
+
* `Nokogiri::HTML5(html, url = nil, encoding = nil, **options, &block)`
|
27
|
+
* `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, **options, &block)`
|
28
|
+
* `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, **options, &block)`
|
29
|
+
* `Nokogiri::HTML5.fragment(html, encoding = nil, **options)`
|
30
|
+
* `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **options)`
|
31
|
+
In all cases, `html` can be a string or an `IO` object (something that
|
32
|
+
responds to `#read`). The `url` parameter is entirely for error reporting,
|
33
|
+
as in Nokogiri. The `encoding` parameter only signals what encoding `html`
|
34
|
+
should have on input; the output `Document` or `DocumentFragment` will be in
|
35
|
+
UTF-8. Currently, the only option supported is `:max_errors`, which controls
|
36
|
+
the maximum number of errors reported by `#errors`.
|
37
|
+
|
38
|
+
### Deprecated
|
39
|
+
- `:max_parse_errors`; use `:max_errors`
|
40
|
+
|
41
|
+
### Removed
|
42
|
+
|
43
|
+
### Fixed
|
44
|
+
- Fixed documents failing to serialize (via `to_html`) if they contain certain
|
45
|
+
`meta` elements that set the `charset`.
|
46
|
+
- Documents are now properly marked as UTF-8 after parsing.
|
47
|
+
- Fixed `Nokogiri::HTML5.fragment` reporting an error due to a missing
|
48
|
+
`<!DOCTYPE html>`.
|
49
|
+
- Fixed crash when input contains U+0000 NULL bytes and error reporting is
|
50
|
+
enabled.
|
51
|
+
|
52
|
+
### Security
|
53
|
+
- The most recent, released version of Gumbo has a [potential security
|
54
|
+
issue](https://github.com/google/gumbo-parser/pull/375) that could result in
|
55
|
+
a cross-site scripting vulnerability. This has been fixed by integrating
|
56
|
+
Gumbo into Nokogumbo.
|
data/README.md
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
|
2
|
-
===========
|
1
|
+
# Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
|
3
2
|
|
4
3
|
Nokogumbo provides the ability for a Ruby program to invoke the
|
5
4
|
[Gumbo HTML5 parser](https://github.com/google/gumbo-parser#readme)
|
@@ -8,12 +7,11 @@ and to access the result as a
|
|
8
7
|
|
9
8
|
[![Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo)
|
10
9
|
|
11
|
-
Usage
|
12
|
-
-----
|
10
|
+
## Usage
|
13
11
|
|
14
12
|
```ruby
|
15
13
|
require 'nokogumbo'
|
16
|
-
doc = Nokogiri
|
14
|
+
doc = Nokogiri.HTML5(string)
|
17
15
|
```
|
18
16
|
|
19
17
|
An experimental _fragment_ method is also provided. While not HTML5
|
@@ -32,21 +30,150 @@ require 'nokogumbo'
|
|
32
30
|
doc = Nokogiri::HTML5.get(uri)
|
33
31
|
```
|
34
32
|
|
35
|
-
|
36
|
-
|
33
|
+
## Parsing options
|
34
|
+
The document and fragment parsing methods,
|
35
|
+
- `Nokogiri.HTML5(html, url = nil, encoding = nil, options = {})`
|
36
|
+
- `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {})`
|
37
|
+
- `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {})`
|
38
|
+
- `Nokogiri::HTML5.fragment(html, encoding = nil, options = {})`
|
39
|
+
- `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})`
|
40
|
+
support options that are different from Nokogiri's.
|
41
|
+
|
42
|
+
The two currently supported options are `:max_errors` and `:max_tree_depth`,
|
43
|
+
described below.
|
44
|
+
|
45
|
+
### Error reporting
|
46
|
+
Nokogumbo contains an experimental parse error reporting facility. By default,
|
47
|
+
no parse errors are reported but this can be configured by passing the
|
48
|
+
`:max_errors` option to `::parse` or `::fragment`.
|
49
|
+
|
37
50
|
```ruby
|
38
51
|
require 'nokogumbo'
|
39
|
-
|
52
|
+
doc = Nokogiri::HTML5.parse('Hi there!<body>', max_errors: 10)
|
53
|
+
doc.errors.each do |err|
|
54
|
+
puts err
|
55
|
+
end
|
56
|
+
```
|
57
|
+
|
58
|
+
This prints the following.
|
59
|
+
```
|
60
|
+
1:1: ERROR: @1:1: The doctype must be the first token in the document.
|
61
|
+
Hi there!<body>
|
62
|
+
^
|
63
|
+
1:10: ERROR: @1:10: That tag isn't allowed here Currently open tags: html, body..
|
64
|
+
Hi there!<body>
|
65
|
+
^
|
66
|
+
```
|
67
|
+
|
68
|
+
Using `max_errors: -1` results in an unlimited number of errors being
|
69
|
+
returned.
|
70
|
+
|
71
|
+
The errors returned by `#errors` are instances of
|
72
|
+
[`Nokogiri::XML::SyntaxError`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError).
|
73
|
+
|
74
|
+
### Maximum tree depth
|
75
|
+
The maximum depth of the DOM tree parsed by the various parsing methods is
|
76
|
+
configurable by the `:max_tree_depth` option. If the depth of the tree would
|
77
|
+
exceed this limit, then an
|
78
|
+
[ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is raised.
|
79
|
+
|
80
|
+
This limit (which defaults to `Nokogumbo::DEFAULT_MAX_TREE_DEPTH = 400`) can
|
81
|
+
be removed by giving the option `max_tree_depth: -1`.
|
82
|
+
|
83
|
+
``` ruby
|
84
|
+
html = '<!DOCTYPE html>' + '<div>' * 1000
|
85
|
+
doc = Nokogiri.HTML5(html)
|
86
|
+
# raises ArgumentError: Document tree depth limit exceeded
|
87
|
+
doc = Nokogiri.HTML5(html, max_tree_depth: -1)
|
40
88
|
```
|
41
89
|
|
42
|
-
|
90
|
+
## HTML Serialization
|
91
|
+
|
92
|
+
After parsing HTML, it may be serialized using any of the Nokogiri
|
93
|
+
[serialization
|
94
|
+
methods](https://www.rubydoc.info/gems/nokogiri/Nokogiri/XML/Node). In
|
95
|
+
particular, `#serialize`, `#to_html`, and `#to_s` will serialize a given node
|
96
|
+
and its children. (This is the equivalent of JavaScript's
|
97
|
+
`Element.outerHTML`.) Similarly, `#inner_html` will serialize the children of
|
98
|
+
a given node. (This is the equivalent of JavaScript's `Element.innerHTML`.)
|
99
|
+
|
100
|
+
``` ruby
|
101
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><span>Hello world!</span>")
|
102
|
+
puts doc.serialize
|
103
|
+
# Prints: <!DOCTYPE html><html><head></head><body><span>Hello world!</span></body></html>
|
104
|
+
```
|
105
|
+
|
106
|
+
Due to quirks in how HTML is parsed and serialized, it's possible for a DOM
|
107
|
+
tree to be serialized and then re-parsed, resulting in a different DOM.
|
108
|
+
Mostly, this happens with DOMs produced from invalid HTML. Unfortunately, even
|
109
|
+
valid HTML may not survive serialization and re-parsing.
|
110
|
+
|
111
|
+
In particular, a newline at the start of `pre`, `listing`, and `textarea`
|
112
|
+
elements is ignored by the parser.
|
113
|
+
|
114
|
+
``` ruby
|
115
|
+
doc = Nokogiri::HTML5(<<-EOF)
|
116
|
+
<!DOCTYPE html>
|
117
|
+
<pre>
|
118
|
+
Content</pre>
|
119
|
+
EOF
|
120
|
+
puts doc.at('/html/body/pre').serialize
|
121
|
+
# Prints: <pre>Content</pre>
|
43
122
|
```
|
123
|
+
|
124
|
+
In this case, the original HTML is semantically equivalent to the serialized
|
125
|
+
version. If the `pre`, `listing`, or `textarea` content starts with two
|
126
|
+
newlines, the first newline will be stripped on the first parse and the second
|
127
|
+
newline will be stripped on the second, leading to semantically different
|
128
|
+
DOMs. Passing the parameter `preserve_newline: true` will cause two or more
|
129
|
+
newlines to be preserved. (A single leading newline will still be removed.)
|
130
|
+
|
131
|
+
``` ruby
|
132
|
+
doc = Nokogiri::HTML5(<<-EOF)
|
133
|
+
<!DOCTYPE html>
|
134
|
+
<listing>
|
135
|
+
|
136
|
+
Content</listing>
|
137
|
+
EOF
|
138
|
+
puts doc.at('/html/body/listing').serialize(preserve_newline: true)
|
139
|
+
# Prints: <listing>
|
140
|
+
#
|
141
|
+
# Content</listing>
|
142
|
+
```
|
143
|
+
|
144
|
+
## Encodings
|
145
|
+
Nokogumbo always parses HTML using
|
146
|
+
[UTF-8](https://en.wikipedia.org/wiki/UTF-8); however, the encoding of the
|
147
|
+
input can be explicitly selected via the optional `encoding` parameter. This
|
148
|
+
is most useful when the input comes not from a string but from an IO object.
|
149
|
+
|
150
|
+
When serializing a document or node, the encoding of the output string can be
|
151
|
+
specified via the `:encoding` option. Characters that cannot be encoded in
|
152
|
+
the selected encoding will be encoded as [HTML numeric
|
153
|
+
entities](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references).
|
154
|
+
|
155
|
+
``` ruby
|
156
|
+
frag = Nokogiri::HTML5.fragment('<span>아는 길도 물어가라</span>')
|
157
|
+
html = frag.serialize(encoding: 'US-ASCII')
|
158
|
+
puts html
|
159
|
+
# Prints: <span>&#xc544;&#xb294; &#xae38;&#xb3c4; &#xbb3c;&#xc5b4;&#xac00;&#xb77c;</span>
|
160
|
+
frag = Nokogiri::HTML5.fragment(html)
|
161
|
+
puts frag.serialize
|
162
|
+
# Prints: <span>아는 길도 물어가라</span>
|
163
|
+
```
|
164
|
+
|
165
|
+
(There's a [bug](https://bugs.ruby-lang.org/issues/15033) in all current
|
166
|
+
versions of Ruby that can cause the entity encoding to fail. Of the mandated
|
167
|
+
supported encodings for HTML, the only encoding I'm aware of that has this bug
|
168
|
+
is `'ISO-2022-JP'`. I recommend avoiding this encoding.)
|
169
|
+
|
170
|
+
## Examples
|
171
|
+
```ruby
|
44
172
|
require 'nokogumbo'
|
45
|
-
Nokogiri::HTML5.
|
173
|
+
puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
|
46
174
|
```
|
47
175
|
|
48
|
-
Notes
|
49
|
-
-----
|
176
|
+
## Notes
|
50
177
|
|
51
178
|
* The `Nokogiri::HTML5.fragment` function takes a string and parses it
|
52
179
|
as an HTML5 document. The `<html>`, `<head>`, and `<body>` elements are
|
@@ -74,20 +201,17 @@ rules defined in the HTML5 specification for doing so.
|
|
74
201
|
* Instead of returning `unknown` as the element name for unknown tags, the
|
75
202
|
original tag name is returned verbatim.
|
76
203
|
|
77
|
-
|
78
|
-
parser will be downloaded and compiled into the Gem itself.
|
79
|
-
|
80
|
-
Installation
|
81
|
-
============
|
204
|
+
# Installation
|
82
205
|
|
83
|
-
git clone
|
206
|
+
git clone https://github.com/rubys/nokogumbo.git
|
84
207
|
cd nokogumbo
|
85
208
|
bundle install
|
86
209
|
rake gem
|
87
210
|
gem install pkg/nokogumbo*.gem
|
88
211
|
|
89
|
-
Related efforts
|
90
|
-
============
|
212
|
+
# Related efforts
|
91
213
|
|
92
|
-
* [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme)
|
93
|
-
for the Gumbo HTML5 parser.
|
214
|
+
* [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) -- a Ruby binding
|
215
|
+
for the Gumbo HTML5 parser.
|
216
|
+
* [lua-gumbo](https://gitlab.com/craigbarnes/lua-gumbo) -- a Lua binding for
|
217
|
+
the Gumbo HTML5 parser.
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'mkmf'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
$CFLAGS += " -std=c99"
|
6
|
+
$LDFLAGS.gsub!('-Wl,--no-undefined', '')
|
7
|
+
$warnflags = CONFIG['warnflags'] = '-Wall'
|
8
|
+
|
9
|
+
NG_SPEC = Gem::Specification.find_by_name('nokogiri', "= #{Nokogiri::VERSION}")
|
10
|
+
|
11
|
+
def download_headers
|
12
|
+
begin
|
13
|
+
require 'yaml'
|
14
|
+
|
15
|
+
dependencies = YAML.load_file(File.join(NG_SPEC.gem_dir, 'dependencies.yml'))
|
16
|
+
version = dependencies['libxml2']['version']
|
17
|
+
host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
|
18
|
+
path = File.join('ports', host, 'libxml2', version, 'include/libxml2')
|
19
|
+
return path if File.directory?(path)
|
20
|
+
|
21
|
+
# Make sure we're using the same version Nokogiri uses
|
22
|
+
dep_index = NG_SPEC.dependencies.index { |dep| dep.name == 'mini_portile2' and dep.type == :runtime }
|
23
|
+
return nil if dep_index.nil?
|
24
|
+
requirement = NG_SPEC.dependencies[dep_index].requirement.to_s
|
25
|
+
|
26
|
+
require 'rubygems'
|
27
|
+
gem 'mini_portile2', requirement
|
28
|
+
require 'mini_portile2'
|
29
|
+
p = MiniPortile::new('libxml2', version).tap do |r|
|
30
|
+
r.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
|
31
|
+
r.files = [{
|
32
|
+
url: "http://xmlsoft.org/sources/libxml2-#{r.version}.tar.gz",
|
33
|
+
sha256: dependencies['libxml2']['sha256']
|
34
|
+
}]
|
35
|
+
r.configure_options += [
|
36
|
+
"--without-python",
|
37
|
+
"--without-readline",
|
38
|
+
"--with-c14n",
|
39
|
+
"--with-debug",
|
40
|
+
"--with-threads"
|
41
|
+
]
|
42
|
+
end
|
43
|
+
p.download unless p.downloaded?
|
44
|
+
p.extract
|
45
|
+
p.configure unless p.configured?
|
46
|
+
system('make', '-C', "tmp/#{p.host}/ports/libxml2/#{version}/libxml2-#{version}/include/libxml", 'install-xmlincHEADERS')
|
47
|
+
path
|
48
|
+
rescue
|
49
|
+
puts 'failed to download/install headers'
|
50
|
+
nil
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
required = arg_config('--with-libxml2')
|
55
|
+
prohibited = arg_config('--without-libxml2')
|
56
|
+
if required and prohibited
|
57
|
+
abort "cannot use both --with-libxml2 and --without-libxml2"
|
58
|
+
end
|
59
|
+
|
60
|
+
have_libxml2 = false
|
61
|
+
have_ng = false
|
62
|
+
|
63
|
+
if !prohibited
|
64
|
+
if Nokogiri::VERSION_INFO.include?('libxml') and
|
65
|
+
Nokogiri::VERSION_INFO['libxml']['source'] == 'packaged'
|
66
|
+
# Nokogiri has libxml2 built in. Find the headers.
|
67
|
+
libxml2_path = File.join(Nokogiri::VERSION_INFO['libxml']['libxml2_path'],
|
68
|
+
'include/libxml2')
|
69
|
+
if find_header('libxml/tree.h', libxml2_path)
|
70
|
+
have_libxml2 = true
|
71
|
+
else
|
72
|
+
# Unfortunately, some versions of Nokogiri delete these files.
|
73
|
+
# https://github.com/sparklemotion/nokogiri/pull/1788
|
74
|
+
# Try to download them
|
75
|
+
libxml2_path = download_headers
|
76
|
+
unless libxml2_path.nil?
|
77
|
+
have_libxml2 = find_header('libxml/tree.h', libxml2_path)
|
78
|
+
end
|
79
|
+
end
|
80
|
+
else
|
81
|
+
# Nokogiri is compiled with system headers.
|
82
|
+
# Hack to work around broken mkmf on macOS
|
83
|
+
# (https://bugs.ruby-lang.org/issues/14992 fixed now)
|
84
|
+
if RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] == 'DYLD_LIBRARY_PATH'
|
85
|
+
RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] = 'DYLD_FALLBACK_LIBRARY_PATH'
|
86
|
+
end
|
87
|
+
|
88
|
+
pkg_config('libxml-2.0')
|
89
|
+
have_libxml2 = have_library('xml2', 'xmlNewDoc')
|
90
|
+
end
|
91
|
+
if required and !have_libxml2
|
92
|
+
abort "libxml2 required but could not be located"
|
93
|
+
end
|
94
|
+
|
95
|
+
if have_libxml2
|
96
|
+
# Find nokogiri.h
|
97
|
+
have_ng = find_header('nokogiri.h', File.join(NG_SPEC.gem_dir, 'ext/nokogiri'))
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
if have_libxml2 and have_ng
|
102
|
+
$CFLAGS += " -DNGLIB=1"
|
103
|
+
end
|
104
|
+
|
105
|
+
# Symlink gumbo-parser source files.
|
106
|
+
ext_dir = File.dirname(__FILE__)
|
107
|
+
gumbo_src = File.join(ext_dir, 'gumbo_src')
|
108
|
+
|
109
|
+
Dir.chdir(ext_dir) do
|
110
|
+
$srcs = Dir['*.c', '../../gumbo-parser/src/*.c']
|
111
|
+
end
|
112
|
+
$INCFLAGS << ' -I$(srcdir)/../../gumbo-parser/src'
|
113
|
+
$VPATH << '$(srcdir)/../../gumbo-parser/src'
|
114
|
+
|
115
|
+
create_makefile('nokogumbo/nokogumbo')
|
116
|
+
# vim: set sw=2 sts=2 ts=8 et:
|
@@ -2,7 +2,7 @@
|
|
2
2
|
// nokogumbo.c defines the following:
|
3
3
|
//
|
4
4
|
// class Nokogumbo
|
5
|
-
// def parse(utf8_string) # returns Nokogiri::
|
5
|
+
// def parse(utf8_string) # returns Nokogiri::HTML5::Document
|
6
6
|
// end
|
7
7
|
//
|
8
8
|
// Processing starts by calling gumbo_parse_with_options. The resulting
|
@@ -18,26 +18,29 @@
|
|
18
18
|
// methods are called instead, producing the equivalent functionality.
|
19
19
|
//
|
20
20
|
|
21
|
+
#include <assert.h>
|
21
22
|
#include <ruby.h>
|
22
23
|
#include "gumbo.h"
|
23
24
|
#include "error.h"
|
24
|
-
#include "parser.h"
|
25
25
|
|
26
26
|
// class constants
|
27
27
|
static VALUE Document;
|
28
|
-
static VALUE XMLSyntaxError;
|
29
28
|
|
30
29
|
#ifdef NGLIB
|
31
30
|
#include <nokogiri.h>
|
31
|
+
#include <xml_syntax_error.h>
|
32
32
|
#include <libxml/tree.h>
|
33
|
+
#include <libxml/HTMLtree.h>
|
33
34
|
|
34
35
|
#define NIL NULL
|
35
36
|
#define CONST_CAST (xmlChar const*)
|
36
37
|
#else
|
37
|
-
#define NIL
|
38
|
+
#define NIL Qnil
|
38
39
|
#define CONST_CAST
|
39
40
|
|
40
41
|
// more class constants
|
42
|
+
static VALUE cNokogiriXmlSyntaxError;
|
43
|
+
|
41
44
|
static VALUE Element;
|
42
45
|
static VALUE Text;
|
43
46
|
static VALUE CDATA;
|
@@ -45,11 +48,15 @@ static VALUE Comment;
|
|
45
48
|
|
46
49
|
// interned symbols
|
47
50
|
static VALUE new;
|
51
|
+
static VALUE attribute;
|
48
52
|
static VALUE set_attribute;
|
53
|
+
static VALUE remove_attribute;
|
49
54
|
static VALUE add_child;
|
50
55
|
static VALUE internal_subset;
|
51
56
|
static VALUE remove_;
|
52
57
|
static VALUE create_internal_subset;
|
58
|
+
static VALUE key_;
|
59
|
+
static VALUE node_name_;
|
53
60
|
|
54
61
|
// map libxml2 types to Ruby VALUE
|
55
62
|
#define xmlNodePtr VALUE
|
@@ -58,12 +65,10 @@ static VALUE create_internal_subset;
|
|
58
65
|
// redefine libxml2 API as Ruby function calls
|
59
66
|
#define xmlNewDocNode(doc, ns, name, content) \
|
60
67
|
rb_funcall(Element, new, 2, rb_str_new2(name), doc)
|
61
|
-
#define xmlNewProp(element, name, value) \
|
62
|
-
rb_funcall(element, set_attribute, 2, rb_str_new2(name), rb_str_new2(value))
|
63
68
|
#define xmlNewDocText(doc, text) \
|
64
69
|
rb_funcall(Text, new, 2, rb_str_new2(text), doc)
|
65
70
|
#define xmlNewCDataBlock(doc, content, length) \
|
66
|
-
rb_funcall(CDATA, new, 2, rb_str_new(content, length)
|
71
|
+
rb_funcall(CDATA, new, 2, doc, rb_str_new(content, length))
|
67
72
|
#define xmlNewDocComment(doc, text) \
|
68
73
|
rb_funcall(Comment, new, 2, doc, rb_str_new2(text))
|
69
74
|
#define xmlAddChild(element, node) \
|
@@ -77,11 +82,76 @@ static VALUE create_internal_subset;
|
|
77
82
|
#define Nokogiri_wrap_xml_document(klass, doc) \
|
78
83
|
doc
|
79
84
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
+
static VALUE find_dummy_key(VALUE collection) {
|
86
|
+
VALUE r_dummy = Qnil;
|
87
|
+
char dummy[5] = "a";
|
88
|
+
size_t len = 1;
|
89
|
+
while (len < sizeof dummy) {
|
90
|
+
r_dummy = rb_str_new(dummy, len);
|
91
|
+
if (rb_funcall(collection, key_, 1, r_dummy) == Qfalse)
|
92
|
+
return r_dummy;
|
93
|
+
for (size_t i = 0; ; ++i) {
|
94
|
+
if (dummy[i] == 0) {
|
95
|
+
dummy[i] = 'a';
|
96
|
+
++len;
|
97
|
+
break;
|
98
|
+
}
|
99
|
+
if (dummy[i] == 'z')
|
100
|
+
dummy[i] = 'a';
|
101
|
+
else {
|
102
|
+
++dummy[i];
|
103
|
+
break;
|
104
|
+
}
|
105
|
+
}
|
106
|
+
}
|
107
|
+
// This collection has 475254 elements?? Give up.
|
108
|
+
return Qnil;
|
109
|
+
}
|
110
|
+
|
111
|
+
static xmlNodePtr xmlNewProp(xmlNodePtr node, const char *name, const char *value) {
|
112
|
+
// Nokogiri::XML::Node#set_attribute calls xmlSetProp(node, name, value)
|
113
|
+
// which behaves roughly as
|
114
|
+
// if name is a QName prefix:local
|
115
|
+
// if node->doc has a namespace ns corresponding to prefix
|
116
|
+
// return xmlSetNsProp(node, ns, local, value)
|
117
|
+
// return xmlSetNsProp(node, NULL, name, value)
|
118
|
+
//
|
119
|
+
// If the prefix is "xml", then the namespace lookup will create it.
|
120
|
+
//
|
121
|
+
// By contrast, xmlNewProp does not do this parsing and creates an attribute
|
122
|
+
// with the name and value exactly as given. This is the behavior that we
|
123
|
+
// want.
|
124
|
+
//
|
125
|
+
// Thus, for attribute names like "xml:lang", #set_attribute will create an
|
126
|
+
// attribute with namespace "xml" and name "lang". This is incorrect for
|
127
|
+
// html elements (but correct for foreign elements).
|
128
|
+
//
|
129
|
+
// Work around this by inserting a dummy attribute and then changing the
|
130
|
+
// name, if needed.
|
131
|
+
|
132
|
+
// Can't use strchr since it's locale-sensitive.
|
133
|
+
size_t len = strlen(name);
|
134
|
+
VALUE r_name = rb_str_new(name, len);
|
135
|
+
if (memchr(name, ':', len) == NULL) {
|
136
|
+
// No colon.
|
137
|
+
return rb_funcall(node, set_attribute, 2, r_name, rb_str_new2(value));
|
138
|
+
}
|
139
|
+
// Find a dummy attribute string that doesn't already exist.
|
140
|
+
VALUE dummy = find_dummy_key(node);
|
141
|
+
if (dummy == Qnil)
|
142
|
+
return Qnil;
|
143
|
+
// Add the dummy attribute.
|
144
|
+
VALUE r_value = rb_funcall(node, set_attribute, 2, dummy, rb_str_new2(value));
|
145
|
+
if (r_value == Qnil)
|
146
|
+
return Qnil;
|
147
|
+
// Remove the old attribute, if it exists.
|
148
|
+
rb_funcall(node, remove_attribute, 1, r_name);
|
149
|
+
// Rename the dummy
|
150
|
+
VALUE attr = rb_funcall(node, attribute, 1, dummy);
|
151
|
+
if (attr == Qnil)
|
152
|
+
return Qnil;
|
153
|
+
rb_funcall(attr, node_name_, 1, r_name);
|
154
|
+
return attr;
|
85
155
|
}
|
86
156
|
#endif
|
87
157
|
|
@@ -90,30 +160,15 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node);
|
|
90
160
|
|
91
161
|
// Build a xmlNodePtr for a given GumboElement (recursively)
|
92
162
|
static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
|
93
|
-
//
|
94
|
-
xmlNodePtr element;
|
95
|
-
if (node->tag != GUMBO_TAG_UNKNOWN) {
|
96
|
-
element = xmlNewDocNode(document, NIL,
|
97
|
-
CONST_CAST gumbo_normalized_tagname(node->tag), NIL);
|
98
|
-
} else {
|
99
|
-
GumboStringPiece tag = node->original_tag;
|
100
|
-
gumbo_tag_from_original_text(&tag);
|
101
|
-
#ifdef _MSC_VER
|
102
|
-
char* name = alloca(tag.length+1);
|
103
|
-
#else
|
104
|
-
char name[tag.length+1];
|
105
|
-
#endif
|
106
|
-
strncpy(name, tag.data, tag.length);
|
107
|
-
name[tag.length] = '\0';
|
108
|
-
element = xmlNewDocNode(document, NIL, CONST_CAST name, NIL);
|
109
|
-
}
|
163
|
+
// create the given element
|
164
|
+
xmlNodePtr element = xmlNewDocNode(document, NIL, CONST_CAST node->name, NIL);
|
110
165
|
|
111
166
|
// add in the attributes
|
112
167
|
GumboVector* attrs = &node->attributes;
|
113
168
|
char *name = NULL;
|
114
|
-
|
115
|
-
char *ns;
|
116
|
-
for (
|
169
|
+
size_t namelen = 0;
|
170
|
+
const char *ns;
|
171
|
+
for (size_t i=0; i < attrs->length; i++) {
|
117
172
|
GumboAttribute *attr = attrs->data[i];
|
118
173
|
|
119
174
|
switch (attr->attr_namespace) {
|
@@ -156,7 +211,7 @@ static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
|
|
156
211
|
|
157
212
|
// add in the children
|
158
213
|
GumboVector* children = &node->children;
|
159
|
-
for (
|
214
|
+
for (size_t i=0; i < children->length; i++) {
|
160
215
|
xmlNodePtr node = walk_tree(document, children->data[i]);
|
161
216
|
if (node) xmlAddChild(element, node);
|
162
217
|
}
|
@@ -176,37 +231,89 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
|
|
176
231
|
return xmlNewDocText(document, CONST_CAST node->v.text.text);
|
177
232
|
case GUMBO_NODE_CDATA:
|
178
233
|
return xmlNewCDataBlock(document,
|
179
|
-
CONST_CAST node->v.text.
|
180
|
-
(int) node->v.text.
|
234
|
+
CONST_CAST node->v.text.text,
|
235
|
+
(int) strlen(node->v.text.text));
|
181
236
|
case GUMBO_NODE_COMMENT:
|
182
237
|
return xmlNewDocComment(document, CONST_CAST node->v.text.text);
|
183
238
|
}
|
184
239
|
}
|
185
240
|
|
241
|
+
// URI = system id
|
242
|
+
// external id = public id
|
243
|
+
#if NGLIB
|
244
|
+
static htmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public)
|
245
|
+
{
|
246
|
+
// These two libxml2 functions take the public and system ids in
|
247
|
+
// opposite orders.
|
248
|
+
htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
|
249
|
+
assert(doc);
|
250
|
+
if (dtd_name)
|
251
|
+
xmlCreateIntSubset(doc, CONST_CAST dtd_name, CONST_CAST public, CONST_CAST system);
|
252
|
+
return doc;
|
253
|
+
}
|
254
|
+
#else
|
255
|
+
// remove internal subset from newly created documents
|
256
|
+
static VALUE new_html_doc(const char *dtd_name, const char *system, const char *public) {
|
257
|
+
VALUE doc;
|
258
|
+
// If system and public are both NULL, Document#new is going to set default
|
259
|
+
// values for them so we're going to have to remove the internal subset
|
260
|
+
// which seems to leak memory in Nokogiri, so leak as little as possible.
|
261
|
+
if (system == NULL && public == NULL) {
|
262
|
+
doc = rb_funcall(Document, new, 2, /* URI */ Qnil, /* external_id */ rb_str_new("", 0));
|
263
|
+
rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0);
|
264
|
+
if (dtd_name) {
|
265
|
+
// We need to create an internal subset now.
|
266
|
+
rb_funcall(doc, create_internal_subset, 3, rb_str_new2(dtd_name), Qnil, Qnil);
|
267
|
+
}
|
268
|
+
} else {
|
269
|
+
assert(dtd_name);
|
270
|
+
// Rather than removing and creating the internal subset as we did above,
|
271
|
+
// just create and then rename one.
|
272
|
+
VALUE r_system = system ? rb_str_new2(system) : Qnil;
|
273
|
+
VALUE r_public = public ? rb_str_new2(public) : Qnil;
|
274
|
+
doc = rb_funcall(Document, new, 2, r_system, r_public);
|
275
|
+
rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1, rb_str_new2(dtd_name));
|
276
|
+
}
|
277
|
+
return doc;
|
278
|
+
}
|
279
|
+
#endif
|
280
|
+
|
186
281
|
// Parse a string using gumbo_parse into a Nokogiri document
|
187
|
-
static VALUE parse(VALUE self, VALUE string, VALUE
|
188
|
-
GumboOptions options;
|
189
|
-
|
190
|
-
options.
|
282
|
+
static VALUE parse(VALUE self, VALUE string, VALUE url, VALUE max_errors, VALUE max_depth) {
|
283
|
+
GumboOptions options = kGumboDefaultOptions;
|
284
|
+
options.max_errors = NUM2INT(max_errors);
|
285
|
+
options.max_tree_depth = NUM2INT(max_depth);
|
191
286
|
|
192
287
|
const char *input = RSTRING_PTR(string);
|
193
288
|
size_t input_len = RSTRING_LEN(string);
|
194
289
|
GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
290
|
+
|
291
|
+
const char *status_string = gumbo_status_to_string(output->status);
|
292
|
+
switch (output->status) {
|
293
|
+
case GUMBO_STATUS_OK:
|
294
|
+
break;
|
295
|
+
case GUMBO_STATUS_TREE_TOO_DEEP:
|
296
|
+
gumbo_destroy_output(output);
|
297
|
+
rb_raise(rb_eArgError, "%s", status_string);
|
298
|
+
case GUMBO_STATUS_OUT_OF_MEMORY:
|
299
|
+
gumbo_destroy_output(output);
|
300
|
+
rb_raise(rb_eNoMemError, "%s", status_string);
|
301
|
+
}
|
302
|
+
|
303
|
+
xmlDocPtr doc;
|
199
304
|
if (output->document->v.document.has_doctype) {
|
200
305
|
const char *name = output->document->v.document.name;
|
201
306
|
const char *public = output->document->v.document.public_identifier;
|
202
307
|
const char *system = output->document->v.document.system_identifier;
|
203
|
-
|
204
|
-
|
205
|
-
|
308
|
+
public = public[0] ? public : NULL;
|
309
|
+
system = system[0] ? system : NULL;
|
310
|
+
doc = new_html_doc(name, system, public);
|
311
|
+
} else {
|
312
|
+
doc = new_html_doc(NULL, NULL, NULL);
|
206
313
|
}
|
207
314
|
|
208
315
|
GumboVector *children = &output->document->v.document.children;
|
209
|
-
for (
|
316
|
+
for (size_t i=0; i < children->length; i++) {
|
210
317
|
GumboNode *child = children->data[i];
|
211
318
|
xmlNodePtr node = walk_tree(doc, child);
|
212
319
|
if (node) {
|
@@ -222,28 +329,20 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
|
|
222
329
|
// Add parse errors to rdoc.
|
223
330
|
if (output->errors.length) {
|
224
331
|
GumboVector *errors = &output->errors;
|
225
|
-
GumboParser parser = { ._options = &options };
|
226
332
|
GumboStringBuffer msg;
|
227
333
|
VALUE rerrors = rb_ary_new2(errors->length);
|
228
334
|
|
229
|
-
gumbo_string_buffer_init(&
|
230
|
-
for (
|
335
|
+
gumbo_string_buffer_init(&msg);
|
336
|
+
for (size_t i=0; i < errors->length; i++) {
|
231
337
|
GumboError *err = errors->data[i];
|
232
|
-
gumbo_string_buffer_clear(&
|
233
|
-
|
234
|
-
// See https://github.com/google/gumbo-parser/pull/371
|
235
|
-
// The bug occurs when the error starts with a newline (unless it's the
|
236
|
-
// first character in the input--but that shouldn't cause an error in
|
237
|
-
// the first place.
|
238
|
-
if (*err->original_text == '\n' && err->original_text != input)
|
239
|
-
--err->original_text;
|
240
|
-
gumbo_caret_diagnostic_to_string(&parser, err, input, &msg);
|
338
|
+
gumbo_string_buffer_clear(&msg);
|
339
|
+
gumbo_caret_diagnostic_to_string(err, input, input_len, &msg);
|
241
340
|
VALUE err_str = rb_str_new(msg.data, msg.length);
|
242
|
-
VALUE syntax_error = rb_class_new_instance(1, &err_str,
|
341
|
+
VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
|
243
342
|
rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
|
244
343
|
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
|
245
344
|
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
|
246
|
-
rb_iv_set(syntax_error, "@file",
|
345
|
+
rb_iv_set(syntax_error, "@file", url);
|
247
346
|
rb_iv_set(syntax_error, "@line", INT2NUM(err->position.line));
|
248
347
|
rb_iv_set(syntax_error, "@str1", Qnil);
|
249
348
|
rb_iv_set(syntax_error, "@str2", Qnil);
|
@@ -253,28 +352,28 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
|
|
253
352
|
rb_ary_push(rerrors, syntax_error);
|
254
353
|
}
|
255
354
|
rb_iv_set(rdoc, "@errors", rerrors);
|
256
|
-
gumbo_string_buffer_destroy(&
|
355
|
+
gumbo_string_buffer_destroy(&msg);
|
257
356
|
}
|
258
357
|
|
259
|
-
gumbo_destroy_output(
|
358
|
+
gumbo_destroy_output(output);
|
260
359
|
|
261
360
|
return rdoc;
|
262
361
|
}
|
263
362
|
|
264
363
|
// Initialize the Nokogumbo class and fetch constants we will use later
|
265
|
-
void
|
364
|
+
void Init_nokogumbo() {
|
266
365
|
rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
|
267
366
|
rb_require("nokogiri");
|
268
367
|
|
269
368
|
// class constants
|
270
369
|
VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
271
|
-
VALUE
|
272
|
-
Document = rb_const_get(
|
273
|
-
VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
274
|
-
XMLSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
|
370
|
+
VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
|
371
|
+
Document = rb_const_get(HTML5, rb_intern("Document"));
|
275
372
|
|
276
373
|
#ifndef NGLIB
|
277
374
|
// more class constants
|
375
|
+
VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
376
|
+
cNokogiriXmlSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
|
278
377
|
Element = rb_const_get(XML, rb_intern("Element"));
|
279
378
|
Text = rb_const_get(XML, rb_intern("Text"));
|
280
379
|
CDATA = rb_const_get(XML, rb_intern("CDATA"));
|
@@ -282,14 +381,18 @@ void Init_nokogumboc() {
|
|
282
381
|
|
283
382
|
// interned symbols
|
284
383
|
new = rb_intern("new");
|
384
|
+
attribute = rb_intern("attribute");
|
285
385
|
set_attribute = rb_intern("set_attribute");
|
386
|
+
remove_attribute = rb_intern("remove_attribute");
|
286
387
|
add_child = rb_intern("add_child_node_and_reparent_attrs");
|
287
388
|
internal_subset = rb_intern("internal_subset");
|
288
389
|
remove_ = rb_intern("remove");
|
289
390
|
create_internal_subset = rb_intern("create_internal_subset");
|
391
|
+
key_ = rb_intern("key?");
|
392
|
+
node_name_ = rb_intern("node_name=");
|
290
393
|
#endif
|
291
394
|
|
292
|
-
// define Nokogumbo
|
293
|
-
VALUE Gumbo =
|
294
|
-
rb_define_singleton_method(Gumbo, "parse", parse,
|
395
|
+
// define Nokogumbo module with a parse method
|
396
|
+
VALUE Gumbo = rb_define_module("Nokogumbo");
|
397
|
+
rb_define_singleton_method(Gumbo, "parse", parse, 4);
|
295
398
|
}
|