nokogumbo 1.5.0 → 2.0.0.pre.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +56 -0
- data/README.md +146 -22
- data/ext/nokogumbo/extconf.rb +116 -0
- data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
- data/gumbo-parser/src/ascii.c +33 -0
- data/gumbo-parser/src/ascii.h +31 -0
- data/gumbo-parser/src/attribute.c +26 -28
- data/gumbo-parser/src/attribute.h +3 -23
- data/gumbo-parser/src/char_ref.c +135 -2351
- data/gumbo-parser/src/char_ref.h +13 -29
- data/gumbo-parser/src/error.c +215 -133
- data/gumbo-parser/src/error.h +34 -49
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/gumbo.h +506 -304
- data/gumbo-parser/src/insertion_mode.h +4 -28
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +1989 -1431
- data/gumbo-parser/src/parser.h +6 -22
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +43 -50
- data/gumbo-parser/src/string_buffer.h +24 -40
- data/gumbo-parser/src/string_piece.c +39 -39
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/tag.c +186 -59
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_type.h +1 -25
- data/gumbo-parser/src/tokenizer.c +899 -495
- data/gumbo-parser/src/tokenizer.h +37 -37
- data/gumbo-parser/src/tokenizer_states.h +6 -22
- data/gumbo-parser/src/utf8.c +103 -86
- data/gumbo-parser/src/utf8.h +37 -41
- data/gumbo-parser/src/util.c +48 -38
- data/gumbo-parser/src/util.h +10 -40
- data/gumbo-parser/src/vector.c +45 -57
- data/gumbo-parser/src/vector.h +17 -39
- data/lib/nokogumbo.rb +10 -174
- data/lib/nokogumbo/html5.rb +250 -0
- data/lib/nokogumbo/html5/document.rb +37 -0
- data/lib/nokogumbo/html5/document_fragment.rb +46 -0
- data/lib/nokogumbo/version.rb +3 -0
- data/lib/nokogumbo/xml/node.rb +57 -0
- metadata +32 -19
- data/ext/nokogumboc/extconf.rb +0 -60
- data/gumbo-parser/src/char_ref.rl +0 -2554
- data/gumbo-parser/src/string_piece.h +0 -38
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -153
- data/gumbo-parser/src/tag_gperf.h +0 -105
- data/gumbo-parser/src/tag_sizes.h +0 -4
- data/gumbo-parser/src/tag_strings.h +0 -153
- data/gumbo-parser/visualc/include/strings.h +0 -4
- data/test-nokogumbo.rb +0 -190
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e0d434c0749d7922ba8f084c15ed7219ccbf0e07b715368ae846bc38e64aad17
|
4
|
+
data.tar.gz: 2770648e3e9e82d0ffb1877f1c06edc537688cf6a8405bc52dbdf5a6bb69bc1a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e6c3de49495bf55ccaa250e2a3275b6796b0f0565da2a930e3333d2a153f2a16312eb77cb28ca3e03c17720127c2ecc27a1f71cfd6acfd15407295c29973e9fb
|
7
|
+
data.tar.gz: e8ce6c80cb2327d2327f03c7e829156c1f0074ba4d6fce2b0d59305b80112b8fd5edc0932fad1fca13cb5f4bb6f2652fe52a2f090110aa76d06e1afbdebc334f
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
All notable changes to Nokogumbo will be documented in this file.
|
4
|
+
|
5
|
+
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
6
|
+
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
7
|
+
|
8
|
+
## [Unreleased]
|
9
|
+
### Added
|
10
|
+
- Experimental support for errors (it was supported in 1.5.0 but
|
11
|
+
undocumented).
|
12
|
+
- Added proper HTML5 serialization.
|
13
|
+
- Added option `:max_tree_depth` to control the maximum parse tree depth.
|
14
|
+
|
15
|
+
### Changed
|
16
|
+
- Integrated [Gumbo parser](https://github.com/google/gumbo-parser) into
|
17
|
+
Nokogumbo. A system version will not be used.
|
18
|
+
- The undocumented (but publicly mentioned) `:max_parse_errors` renamed to `:max_errors`;
|
19
|
+
`:max_parse_errors` is deprecated and will go away.
|
20
|
+
- The various `#parse` and `#fragment` (and `Nokogiri.HTML5`) methods return
|
21
|
+
`Nokogiri::HTML5::Document` and `Nokogiri::HTML5::DocumentFragment` classes
|
22
|
+
rather than `Nokogiri::HTML::Document` and
|
23
|
+
`Nokogiri::HTML::DocumentFragment`.
|
24
|
+
- Changed the top-level API to more closely match Nokogiri's while maintaining
|
25
|
+
backwards compatibility. The new APIs are
|
26
|
+
* `Nokogiri::HTML5(html, url = nil, encoding = nil, **options, &block)`
|
27
|
+
* `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, **options, &block)`
|
28
|
+
* `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, **options, &block)`
|
29
|
+
* `Nokogiri::HTML5.fragment(html, encoding = nil, **options)`
|
30
|
+
* `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **options)`
|
31
|
+
In all cases, `html` can be a string or an `IO` object (something that
|
32
|
+
responds to `#read`). The `url` parameter is entirely for error reporting,
|
33
|
+
as in Nokogiri. The `encoding` parameter only signals what encoding `html`
|
34
|
+
should have on input; the output `Document` or `DocumentFragment` will be in
|
35
|
+
UTF-8. Currently, the only option supported is `:max_errors`, which controls
|
36
|
+
the maximum number of errors reported by `#errors`.
|
37
|
+
|
38
|
+
### Deprecated
|
39
|
+
- `:max_parse_errors`; use `:max_errors`
|
40
|
+
|
41
|
+
### Removed
|
42
|
+
|
43
|
+
### Fixed
|
44
|
+
- Fixed documents failing to serialize (via `to_html`) if they contain certain
|
45
|
+
`meta` elements that set the `charset`.
|
46
|
+
- Documents are now properly marked as UTF-8 after parsing.
|
47
|
+
- Fixed `Nokogiri::HTML5.fragment` reporting an error due to a missing
|
48
|
+
`<!DOCTYPE html>`.
|
49
|
+
- Fixed crash when input contains U+0000 NULL bytes and error reporting is
|
50
|
+
enabled.
|
51
|
+
|
52
|
+
### Security
|
53
|
+
- The most recent, released version of Gumbo has a [potential security
|
54
|
+
issue](https://github.com/google/gumbo-parser/pull/375) that could result in
|
55
|
+
a cross-site scripting vulnerability. This has been fixed by integrating
|
56
|
+
Gumbo into Nokogumbo.
|
data/README.md
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
|
2
|
-
===========
|
1
|
+
# Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
|
3
2
|
|
4
3
|
Nokogumbo provides the ability for a Ruby program to invoke the
|
5
4
|
[Gumbo HTML5 parser](https://github.com/google/gumbo-parser#readme)
|
@@ -8,12 +7,11 @@ and to access the result as a
|
|
8
7
|
|
9
8
|
[![Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo)
|
10
9
|
|
11
|
-
Usage
|
12
|
-
-----
|
10
|
+
## Usage
|
13
11
|
|
14
12
|
```ruby
|
15
13
|
require 'nokogumbo'
|
16
|
-
doc = Nokogiri
|
14
|
+
doc = Nokogiri.HTML5(string)
|
17
15
|
```
|
18
16
|
|
19
17
|
An experimental _fragment_ method is also provided. While not HTML5
|
@@ -32,21 +30,150 @@ require 'nokogumbo'
|
|
32
30
|
doc = Nokogiri::HTML5.get(uri)
|
33
31
|
```
|
34
32
|
|
35
|
-
|
36
|
-
|
33
|
+
## Parsing options
|
34
|
+
The document and fragment parsing methods,
|
35
|
+
- `Nokogiri.HTML5(html, url = nil, encoding = nil, options = {})`
|
36
|
+
- `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {})`
|
37
|
+
- `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {})`
|
38
|
+
- `Nokogiri::HTML5.fragment(html, encoding = nil, options = {})`
|
39
|
+
- `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})`
|
40
|
+
support options that are different from Nokogiri's.
|
41
|
+
|
42
|
+
The two currently supported options are `:max_errors` and `:max_tree_depth`,
|
43
|
+
described below.
|
44
|
+
|
45
|
+
### Error reporting
|
46
|
+
Nokogumbo contains an experimental parse error reporting facility. By default,
|
47
|
+
no parse errors are reported but this can be configured by passing the
|
48
|
+
`:max_errors` option to `::parse` or `::fragment`.
|
49
|
+
|
37
50
|
```ruby
|
38
51
|
require 'nokogumbo'
|
39
|
-
|
52
|
+
doc = Nokogiri::HTML5.parse('Hi there!<body>', max_errors: 10)
|
53
|
+
doc.errors.each do |err|
|
54
|
+
puts err
|
55
|
+
end
|
56
|
+
```
|
57
|
+
|
58
|
+
This prints the following.
|
59
|
+
```
|
60
|
+
1:1: ERROR: @1:1: The doctype must be the first token in the document.
|
61
|
+
Hi there!<body>
|
62
|
+
^
|
63
|
+
1:10: ERROR: @1:10: That tag isn't allowed here Currently open tags: html, body..
|
64
|
+
Hi there!<body>
|
65
|
+
^
|
66
|
+
```
|
67
|
+
|
68
|
+
Using `max_errors: -1` results in an unlimited number of errors being
|
69
|
+
returned.
|
70
|
+
|
71
|
+
The errors returned by `#errors` are instances of
|
72
|
+
[`Nokogiri::XML::SyntaxError`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError).
|
73
|
+
|
74
|
+
### Maximum tree depth
|
75
|
+
The maximum depth of the DOM tree parsed by the various parsing methods is
|
76
|
+
configurable by the `:max_tree_depth` option. If the depth of the tree would
|
77
|
+
exceed this limit, then an
|
78
|
+
[ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is raised.
|
79
|
+
|
80
|
+
This limit (which defaults to `Nokogumbo::DEFAULT_MAX_TREE_DEPTH = 400`) can
|
81
|
+
be removed by giving the option `max_tree_depth: -1`.
|
82
|
+
|
83
|
+
``` ruby
|
84
|
+
html = '<!DOCTYPE html>' + '<div>' * 1000
|
85
|
+
doc = Nokogiri.HTML5(html)
|
86
|
+
# raises ArgumentError: Document tree depth limit exceeded
|
87
|
+
doc = Nokogiri.HTML5(html, max_tree_depth: -1)
|
40
88
|
```
|
41
89
|
|
42
|
-
|
90
|
+
## HTML Serialization
|
91
|
+
|
92
|
+
After parsing HTML, it may be serialized using any of the Nokogiri
|
93
|
+
[serialization
|
94
|
+
methods](https://www.rubydoc.info/gems/nokogiri/Nokogiri/XML/Node). In
|
95
|
+
particular, `#serialize`, `#to_html`, and `#to_s` will serialize a given node
|
96
|
+
and its children. (This is the equivalent of JavaScript's
|
97
|
+
`Element.outerHTML`.) Similarly, `#inner_html` will serialize the children of
|
98
|
+
a given node. (This is the equivalent of JavaScript's `Element.innerHTML`.)
|
99
|
+
|
100
|
+
``` ruby
|
101
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><span>Hello world!</span>")
|
102
|
+
puts doc.serialize
|
103
|
+
# Prints: <!DOCTYPE html><html><head></head><body><span>Hello world!</span></body></html>
|
104
|
+
```
|
105
|
+
|
106
|
+
Due to quirks in how HTML is parsed and serialized, it's possible for a DOM
|
107
|
+
tree to be serialized and then re-parsed, resulting in a different DOM.
|
108
|
+
Mostly, this happens with DOMs produced from invalid HTML. Unfortunately, even
|
109
|
+
valid HTML may not survive serialization and re-parsing.
|
110
|
+
|
111
|
+
In particular, a newline at the start of `pre`, `listing`, and `textarea`
|
112
|
+
elements is ignored by the parser.
|
113
|
+
|
114
|
+
``` ruby
|
115
|
+
doc = Nokogiri::HTML5(<<-EOF)
|
116
|
+
<!DOCTYPE html>
|
117
|
+
<pre>
|
118
|
+
Content</pre>
|
119
|
+
EOF
|
120
|
+
puts doc.at('/html/body/pre').serialize
|
121
|
+
# Prints: <pre>Content</pre>
|
43
122
|
```
|
123
|
+
|
124
|
+
In this case, the original HTML is semantically equivalent to the serialized
|
125
|
+
version. If the `pre`, `listing`, or `textarea` content starts with two
|
126
|
+
newlines, the first newline will be stripped on the first parse and the second
|
127
|
+
newline will be stripped on the second, leading to semantically different
|
128
|
+
DOMs. Passing the parameter `preserve_newline: true` will cause two or more
|
129
|
+
newlines to be preserved. (A single leading newline will still be removed.)
|
130
|
+
|
131
|
+
``` ruby
|
132
|
+
doc = Nokogiri::HTML5(<<-EOF)
|
133
|
+
<!DOCTYPE html>
|
134
|
+
<listing>
|
135
|
+
|
136
|
+
Content</listing>
|
137
|
+
EOF
|
138
|
+
puts doc.at('/html/body/listing').serialize(preserve_newline: true)
|
139
|
+
# Prints: <listing>
|
140
|
+
#
|
141
|
+
# Content</listing>
|
142
|
+
```
|
143
|
+
|
144
|
+
## Encodings
|
145
|
+
Nokogumbo always parses HTML using
|
146
|
+
[UTF-8](https://en.wikipedia.org/wiki/UTF-8); however, the encoding of the
|
147
|
+
input can be explicitly selected via the optional `encoding` parameter. This
|
148
|
+
is most useful when the input comes not from a string but from an IO object.
|
149
|
+
|
150
|
+
When serializing a document or node, the encoding of the output string can be
|
151
|
+
specified via the `:encoding` option. Characters that cannot be encoded in
|
152
|
+
the selected encoding will be encoded as [HTML numeric
|
153
|
+
entities](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references).
|
154
|
+
|
155
|
+
``` ruby
|
156
|
+
frag = Nokogiri::HTML5.fragment('<span>아는 길도 물어가라</span>')
|
157
|
+
html = frag.serialize(encoding: 'US-ASCII')
|
158
|
+
puts html
|
159
|
+
# Prints: <span>&#xc544;&#xb294; &#xae38;&#xb3c4; &#xbb3c;&#xc5b4;&#xac00;&#xb77c;</span>
|
160
|
+
frag = Nokogiri::HTML5.fragment(html)
|
161
|
+
puts frag.serialize
|
162
|
+
# Prints: <span>아는 길도 물어가라</span>
|
163
|
+
```
|
164
|
+
|
165
|
+
(There's a [bug](https://bugs.ruby-lang.org/issues/15033) in all current
|
166
|
+
versions of Ruby that can cause the entity encoding to fail. Of the mandated
|
167
|
+
supported encodings for HTML, the only encoding I'm aware of that has this bug
|
168
|
+
is `'ISO-2022-JP'`. I recommend avoiding this encoding.)
|
169
|
+
|
170
|
+
## Examples
|
171
|
+
```ruby
|
44
172
|
require 'nokogumbo'
|
45
|
-
Nokogiri::HTML5.
|
173
|
+
puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
|
46
174
|
```
|
47
175
|
|
48
|
-
Notes
|
49
|
-
-----
|
176
|
+
## Notes
|
50
177
|
|
51
178
|
* The `Nokogiri::HTML5.fragment` function takes a string and parses it
|
52
179
|
as an HTML5 document. The `<html>`, `<head>`, and `<body>` elements are
|
@@ -74,20 +201,17 @@ rules defined in the HTML5 specification for doing so.
|
|
74
201
|
* Instead of returning `unknown` as the element name for unknown tags, the
|
75
202
|
original tag name is returned verbatim.
|
76
203
|
|
77
|
-
|
78
|
-
parser will be downloaded and compiled into the Gem itself.
|
79
|
-
|
80
|
-
Installation
|
81
|
-
============
|
204
|
+
# Installation
|
82
205
|
|
83
|
-
git clone
|
206
|
+
git clone https://github.com/rubys/nokogumbo.git
|
84
207
|
cd nokogumbo
|
85
208
|
bundle install
|
86
209
|
rake gem
|
87
210
|
gem install pkg/nokogumbo*.gem
|
88
211
|
|
89
|
-
Related efforts
|
90
|
-
============
|
212
|
+
# Related efforts
|
91
213
|
|
92
|
-
* [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme)
|
93
|
-
for the Gumbo HTML5 parser.
|
214
|
+
* [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) -- a ruby binding
|
215
|
+
for the Gumbo HTML5 parser.
|
216
|
+
* [lua-gumbo](https://gitlab.com/craigbarnes/lua-gumbo) -- a lua binding for
|
217
|
+
the Gumbo HTML5 parser.
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'mkmf'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
$CFLAGS += " -std=c99"
|
6
|
+
$LDFLAGS.gsub!('-Wl,--no-undefined', '')
|
7
|
+
$warnflags = CONFIG['warnflags'] = '-Wall'
|
8
|
+
|
9
|
+
NG_SPEC = Gem::Specification.find_by_name('nokogiri', "= #{Nokogiri::VERSION}")
|
10
|
+
|
11
|
+
def download_headers
|
12
|
+
begin
|
13
|
+
require 'yaml'
|
14
|
+
|
15
|
+
dependencies = YAML.load_file(File.join(NG_SPEC.gem_dir, 'dependencies.yml'))
|
16
|
+
version = dependencies['libxml2']['version']
|
17
|
+
host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
|
18
|
+
path = File.join('ports', host, 'libxml2', version, 'include/libxml2')
|
19
|
+
return path if File.directory?(path)
|
20
|
+
|
21
|
+
# Make sure we're using the same version Nokogiri uses
|
22
|
+
dep_index = NG_SPEC.dependencies.index { |dep| dep.name == 'mini_portile2' and dep.type == :runtime }
|
23
|
+
return nil if dep_index.nil?
|
24
|
+
requirement = NG_SPEC.dependencies[dep_index].requirement.to_s
|
25
|
+
|
26
|
+
require 'rubygems'
|
27
|
+
gem 'mini_portile2', requirement
|
28
|
+
require 'mini_portile2'
|
29
|
+
p = MiniPortile::new('libxml2', version).tap do |r|
|
30
|
+
r.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
|
31
|
+
r.files = [{
|
32
|
+
url: "http://xmlsoft.org/sources/libxml2-#{r.version}.tar.gz",
|
33
|
+
sha256: dependencies['libxml2']['sha256']
|
34
|
+
}]
|
35
|
+
r.configure_options += [
|
36
|
+
"--without-python",
|
37
|
+
"--without-readline",
|
38
|
+
"--with-c14n",
|
39
|
+
"--with-debug",
|
40
|
+
"--with-threads"
|
41
|
+
]
|
42
|
+
end
|
43
|
+
p.download unless p.downloaded?
|
44
|
+
p.extract
|
45
|
+
p.configure unless p.configured?
|
46
|
+
system('make', '-C', "tmp/#{p.host}/ports/libxml2/#{version}/libxml2-#{version}/include/libxml", 'install-xmlincHEADERS')
|
47
|
+
path
|
48
|
+
rescue
|
49
|
+
puts 'failed to download/install headers'
|
50
|
+
nil
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
required = arg_config('--with-libxml2')
|
55
|
+
prohibited = arg_config('--without-libxml2')
|
56
|
+
if required and prohibited
|
57
|
+
abort "cannot use both --with-libxml2 and --without-libxml2"
|
58
|
+
end
|
59
|
+
|
60
|
+
have_libxml2 = false
|
61
|
+
have_ng = false
|
62
|
+
|
63
|
+
if !prohibited
|
64
|
+
if Nokogiri::VERSION_INFO.include?('libxml') and
|
65
|
+
Nokogiri::VERSION_INFO['libxml']['source'] == 'packaged'
|
66
|
+
# Nokogiri has libxml2 built in. Find the headers.
|
67
|
+
libxml2_path = File.join(Nokogiri::VERSION_INFO['libxml']['libxml2_path'],
|
68
|
+
'include/libxml2')
|
69
|
+
if find_header('libxml/tree.h', libxml2_path)
|
70
|
+
have_libxml2 = true
|
71
|
+
else
|
72
|
+
# Unfortunately, some versions of Nokogiri delete these files.
|
73
|
+
# https://github.com/sparklemotion/nokogiri/pull/1788
|
74
|
+
# Try to download them
|
75
|
+
libxml2_path = download_headers
|
76
|
+
unless libxml2_path.nil?
|
77
|
+
have_libxml2 = find_header('libxml/tree.h', libxml2_path)
|
78
|
+
end
|
79
|
+
end
|
80
|
+
else
|
81
|
+
# Nokogiri is compiled with system headers.
|
82
|
+
# Hack to work around broken mkmf on macOS
|
83
|
+
# (https://bugs.ruby-lang.org/issues/14992 fixed now)
|
84
|
+
if RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] == 'DYLD_LIBRARY_PATH'
|
85
|
+
RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] = 'DYLD_FALLBACK_LIBRARY_PATH'
|
86
|
+
end
|
87
|
+
|
88
|
+
pkg_config('libxml-2.0')
|
89
|
+
have_libxml2 = have_library('xml2', 'xmlNewDoc')
|
90
|
+
end
|
91
|
+
if required and !have_libxml2
|
92
|
+
abort "libxml2 required but could not be located"
|
93
|
+
end
|
94
|
+
|
95
|
+
if have_libxml2
|
96
|
+
# Find nokogiri.h
|
97
|
+
have_ng = find_header('nokogiri.h', File.join(NG_SPEC.gem_dir, 'ext/nokogiri'))
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
if have_libxml2 and have_ng
|
102
|
+
$CFLAGS += " -DNGLIB=1"
|
103
|
+
end
|
104
|
+
|
105
|
+
# Compile the gumbo-parser source files in place (added to $srcs, $INCFLAGS, and $VPATH).
|
106
|
+
ext_dir = File.dirname(__FILE__)
|
107
|
+
gumbo_src = File.join(ext_dir, 'gumbo_src')
|
108
|
+
|
109
|
+
Dir.chdir(ext_dir) do
|
110
|
+
$srcs = Dir['*.c', '../../gumbo-parser/src/*.c']
|
111
|
+
end
|
112
|
+
$INCFLAGS << ' -I$(srcdir)/../../gumbo-parser/src'
|
113
|
+
$VPATH << '$(srcdir)/../../gumbo-parser/src'
|
114
|
+
|
115
|
+
create_makefile('nokogumbo/nokogumbo')
|
116
|
+
# vim: set sw=2 sts=2 ts=8 et:
|
@@ -2,7 +2,7 @@
|
|
2
2
|
// nokogumbo.c defines the following:
|
3
3
|
//
|
4
4
|
// module Nokogumbo
|
5
|
-
// def parse(utf8_string) # returns Nokogiri::
|
5
|
+
// def self.parse(utf8_string, url, max_errors, max_depth) # returns Nokogiri::HTML5::Document
|
6
6
|
// end
|
7
7
|
//
|
8
8
|
// Processing starts by calling gumbo_parse_with_options. The resulting
|
@@ -18,26 +18,29 @@
|
|
18
18
|
// methods are called instead, producing the equivalent functionality.
|
19
19
|
//
|
20
20
|
|
21
|
+
#include <assert.h>
|
21
22
|
#include <ruby.h>
|
22
23
|
#include "gumbo.h"
|
23
24
|
#include "error.h"
|
24
|
-
#include "parser.h"
|
25
25
|
|
26
26
|
// class constants
|
27
27
|
static VALUE Document;
|
28
|
-
static VALUE XMLSyntaxError;
|
29
28
|
|
30
29
|
#ifdef NGLIB
|
31
30
|
#include <nokogiri.h>
|
31
|
+
#include <xml_syntax_error.h>
|
32
32
|
#include <libxml/tree.h>
|
33
|
+
#include <libxml/HTMLtree.h>
|
33
34
|
|
34
35
|
#define NIL NULL
|
35
36
|
#define CONST_CAST (xmlChar const*)
|
36
37
|
#else
|
37
|
-
#define NIL
|
38
|
+
#define NIL Qnil
|
38
39
|
#define CONST_CAST
|
39
40
|
|
40
41
|
// more class constants
|
42
|
+
static VALUE cNokogiriXmlSyntaxError;
|
43
|
+
|
41
44
|
static VALUE Element;
|
42
45
|
static VALUE Text;
|
43
46
|
static VALUE CDATA;
|
@@ -45,11 +48,15 @@ static VALUE Comment;
|
|
45
48
|
|
46
49
|
// interned symbols
|
47
50
|
static VALUE new;
|
51
|
+
static VALUE attribute;
|
48
52
|
static VALUE set_attribute;
|
53
|
+
static VALUE remove_attribute;
|
49
54
|
static VALUE add_child;
|
50
55
|
static VALUE internal_subset;
|
51
56
|
static VALUE remove_;
|
52
57
|
static VALUE create_internal_subset;
|
58
|
+
static VALUE key_;
|
59
|
+
static VALUE node_name_;
|
53
60
|
|
54
61
|
// map libxml2 types to Ruby VALUE
|
55
62
|
#define xmlNodePtr VALUE
|
@@ -58,12 +65,10 @@ static VALUE create_internal_subset;
|
|
58
65
|
// redefine libxml2 API as Ruby function calls
|
59
66
|
#define xmlNewDocNode(doc, ns, name, content) \
|
60
67
|
rb_funcall(Element, new, 2, rb_str_new2(name), doc)
|
61
|
-
#define xmlNewProp(element, name, value) \
|
62
|
-
rb_funcall(element, set_attribute, 2, rb_str_new2(name), rb_str_new2(value))
|
63
68
|
#define xmlNewDocText(doc, text) \
|
64
69
|
rb_funcall(Text, new, 2, rb_str_new2(text), doc)
|
65
70
|
#define xmlNewCDataBlock(doc, content, length) \
|
66
|
-
rb_funcall(CDATA, new, 2, rb_str_new(content, length)
|
71
|
+
rb_funcall(CDATA, new, 2, doc, rb_str_new(content, length))
|
67
72
|
#define xmlNewDocComment(doc, text) \
|
68
73
|
rb_funcall(Comment, new, 2, doc, rb_str_new2(text))
|
69
74
|
#define xmlAddChild(element, node) \
|
@@ -77,11 +82,76 @@ static VALUE create_internal_subset;
|
|
77
82
|
#define Nokogiri_wrap_xml_document(klass, doc) \
|
78
83
|
doc
|
79
84
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
+
static VALUE find_dummy_key(VALUE collection) {
|
86
|
+
VALUE r_dummy = Qnil;
|
87
|
+
char dummy[5] = "a";
|
88
|
+
size_t len = 1;
|
89
|
+
while (len < sizeof dummy) {
|
90
|
+
r_dummy = rb_str_new(dummy, len);
|
91
|
+
if (rb_funcall(collection, key_, 1, r_dummy) == Qfalse)
|
92
|
+
return r_dummy;
|
93
|
+
for (size_t i = 0; ; ++i) {
|
94
|
+
if (dummy[i] == 0) {
|
95
|
+
dummy[i] = 'a';
|
96
|
+
++len;
|
97
|
+
break;
|
98
|
+
}
|
99
|
+
if (dummy[i] == 'z')
|
100
|
+
dummy[i] = 'a';
|
101
|
+
else {
|
102
|
+
++dummy[i];
|
103
|
+
break;
|
104
|
+
}
|
105
|
+
}
|
106
|
+
}
|
107
|
+
// This collection has 475254 elements?? Give up.
|
108
|
+
return Qnil;
|
109
|
+
}
|
110
|
+
|
111
|
+
static xmlNodePtr xmlNewProp(xmlNodePtr node, const char *name, const char *value) {
|
112
|
+
// Nokogiri::XML::Node#set_attribute calls xmlSetProp(node, name, value)
|
113
|
+
// which behaves roughly as
|
114
|
+
// if name is a QName prefix:local
|
115
|
+
// if node->doc has a namespace ns corresponding to prefix
|
116
|
+
// return xmlSetNsProp(node, ns, local, value)
|
117
|
+
// return xmlSetNsProp(node, NULL, name, value)
|
118
|
+
//
|
119
|
+
// If the prefix is "xml", then the namespace lookup will create it.
|
120
|
+
//
|
121
|
+
// By contrast, xmlNewProp does not do this parsing and creates an attribute
|
122
|
+
// with the name and value exactly as given. This is the behavior that we
|
123
|
+
// want.
|
124
|
+
//
|
125
|
+
// Thus, for attribute names like "xml:lang", #set_attribute will create an
|
126
|
+
// attribute with namespace "xml" and name "lang". This is incorrect for
|
127
|
+
// html elements (but correct for foreign elements).
|
128
|
+
//
|
129
|
+
// Work around this by inserting a dummy attribute and then changing the
|
130
|
+
// name, if needed.
|
131
|
+
|
132
|
+
// Can't use strchr since it's locale-sensitive.
|
133
|
+
size_t len = strlen(name);
|
134
|
+
VALUE r_name = rb_str_new(name, len);
|
135
|
+
if (memchr(name, ':', len) == NULL) {
|
136
|
+
// No colon.
|
137
|
+
return rb_funcall(node, set_attribute, 2, r_name, rb_str_new2(value));
|
138
|
+
}
|
139
|
+
// Find a dummy attribute string that doesn't already exist.
|
140
|
+
VALUE dummy = find_dummy_key(node);
|
141
|
+
if (dummy == Qnil)
|
142
|
+
return Qnil;
|
143
|
+
// Add the dummy attribute.
|
144
|
+
VALUE r_value = rb_funcall(node, set_attribute, 2, dummy, rb_str_new2(value));
|
145
|
+
if (r_value == Qnil)
|
146
|
+
return Qnil;
|
147
|
+
// Remove the old attribute, if it exists.
|
148
|
+
rb_funcall(node, remove_attribute, 1, r_name);
|
149
|
+
// Rename the dummy
|
150
|
+
VALUE attr = rb_funcall(node, attribute, 1, dummy);
|
151
|
+
if (attr == Qnil)
|
152
|
+
return Qnil;
|
153
|
+
rb_funcall(attr, node_name_, 1, r_name);
|
154
|
+
return attr;
|
85
155
|
}
|
86
156
|
#endif
|
87
157
|
|
@@ -90,30 +160,15 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node);
|
|
90
160
|
|
91
161
|
// Build a xmlNodePtr for a given GumboElement (recursively)
|
92
162
|
static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
|
93
|
-
//
|
94
|
-
xmlNodePtr element;
|
95
|
-
if (node->tag != GUMBO_TAG_UNKNOWN) {
|
96
|
-
element = xmlNewDocNode(document, NIL,
|
97
|
-
CONST_CAST gumbo_normalized_tagname(node->tag), NIL);
|
98
|
-
} else {
|
99
|
-
GumboStringPiece tag = node->original_tag;
|
100
|
-
gumbo_tag_from_original_text(&tag);
|
101
|
-
#ifdef _MSC_VER
|
102
|
-
char* name = alloca(tag.length+1);
|
103
|
-
#else
|
104
|
-
char name[tag.length+1];
|
105
|
-
#endif
|
106
|
-
strncpy(name, tag.data, tag.length);
|
107
|
-
name[tag.length] = '\0';
|
108
|
-
element = xmlNewDocNode(document, NIL, CONST_CAST name, NIL);
|
109
|
-
}
|
163
|
+
// create the given element
|
164
|
+
xmlNodePtr element = xmlNewDocNode(document, NIL, CONST_CAST node->name, NIL);
|
110
165
|
|
111
166
|
// add in the attributes
|
112
167
|
GumboVector* attrs = &node->attributes;
|
113
168
|
char *name = NULL;
|
114
|
-
|
115
|
-
char *ns;
|
116
|
-
for (
|
169
|
+
size_t namelen = 0;
|
170
|
+
const char *ns;
|
171
|
+
for (size_t i=0; i < attrs->length; i++) {
|
117
172
|
GumboAttribute *attr = attrs->data[i];
|
118
173
|
|
119
174
|
switch (attr->attr_namespace) {
|
@@ -156,7 +211,7 @@ static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
|
|
156
211
|
|
157
212
|
// add in the children
|
158
213
|
GumboVector* children = &node->children;
|
159
|
-
for (
|
214
|
+
for (size_t i=0; i < children->length; i++) {
|
160
215
|
xmlNodePtr node = walk_tree(document, children->data[i]);
|
161
216
|
if (node) xmlAddChild(element, node);
|
162
217
|
}
|
@@ -176,37 +231,89 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
|
|
176
231
|
return xmlNewDocText(document, CONST_CAST node->v.text.text);
|
177
232
|
case GUMBO_NODE_CDATA:
|
178
233
|
return xmlNewCDataBlock(document,
|
179
|
-
CONST_CAST node->v.text.
|
180
|
-
(int) node->v.text.
|
234
|
+
CONST_CAST node->v.text.text,
|
235
|
+
(int) strlen(node->v.text.text));
|
181
236
|
case GUMBO_NODE_COMMENT:
|
182
237
|
return xmlNewDocComment(document, CONST_CAST node->v.text.text);
|
183
238
|
}
|
184
239
|
}
|
185
240
|
|
241
|
+
// URI = system id
|
242
|
+
// external id = public id
|
243
|
+
#if NGLIB
|
244
|
+
static htmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public)
|
245
|
+
{
|
246
|
+
// These two libxml2 functions take the public and system ids in
|
247
|
+
// opposite orders.
|
248
|
+
htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
|
249
|
+
assert(doc);
|
250
|
+
if (dtd_name)
|
251
|
+
xmlCreateIntSubset(doc, CONST_CAST dtd_name, CONST_CAST public, CONST_CAST system);
|
252
|
+
return doc;
|
253
|
+
}
|
254
|
+
#else
|
255
|
+
// remove internal subset from newly created documents
|
256
|
+
static VALUE new_html_doc(const char *dtd_name, const char *system, const char *public) {
|
257
|
+
VALUE doc;
|
258
|
+
// If system and public are both NULL, Document#new is going to set default
|
259
|
+
// values for them so we're going to have to remove the internal subset
|
260
|
+
// which seems to leak memory in Nokogiri, so leak as little as possible.
|
261
|
+
if (system == NULL && public == NULL) {
|
262
|
+
doc = rb_funcall(Document, new, 2, /* URI */ Qnil, /* external_id */ rb_str_new("", 0));
|
263
|
+
rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0);
|
264
|
+
if (dtd_name) {
|
265
|
+
// We need to create an internal subset now.
|
266
|
+
rb_funcall(doc, create_internal_subset, 3, rb_str_new2(dtd_name), Qnil, Qnil);
|
267
|
+
}
|
268
|
+
} else {
|
269
|
+
assert(dtd_name);
|
270
|
+
// Rather than removing and creating the internal subset as we did above,
|
271
|
+
// just create and then rename one.
|
272
|
+
VALUE r_system = system ? rb_str_new2(system) : Qnil;
|
273
|
+
VALUE r_public = public ? rb_str_new2(public) : Qnil;
|
274
|
+
doc = rb_funcall(Document, new, 2, r_system, r_public);
|
275
|
+
rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1, rb_str_new2(dtd_name));
|
276
|
+
}
|
277
|
+
return doc;
|
278
|
+
}
|
279
|
+
#endif
|
280
|
+
|
186
281
|
// Parse a string using gumbo_parse into a Nokogiri document
|
187
|
-
static VALUE parse(VALUE self, VALUE string, VALUE
|
188
|
-
GumboOptions options;
|
189
|
-
|
190
|
-
options.
|
282
|
+
static VALUE parse(VALUE self, VALUE string, VALUE url, VALUE max_errors, VALUE max_depth) {
|
283
|
+
GumboOptions options = kGumboDefaultOptions;
|
284
|
+
options.max_errors = NUM2INT(max_errors);
|
285
|
+
options.max_tree_depth = NUM2INT(max_depth);
|
191
286
|
|
192
287
|
const char *input = RSTRING_PTR(string);
|
193
288
|
size_t input_len = RSTRING_LEN(string);
|
194
289
|
GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
290
|
+
|
291
|
+
const char *status_string = gumbo_status_to_string(output->status);
|
292
|
+
switch (output->status) {
|
293
|
+
case GUMBO_STATUS_OK:
|
294
|
+
break;
|
295
|
+
case GUMBO_STATUS_TREE_TOO_DEEP:
|
296
|
+
gumbo_destroy_output(output);
|
297
|
+
rb_raise(rb_eArgError, "%s", status_string);
|
298
|
+
case GUMBO_STATUS_OUT_OF_MEMORY:
|
299
|
+
gumbo_destroy_output(output);
|
300
|
+
rb_raise(rb_eNoMemError, "%s", status_string);
|
301
|
+
}
|
302
|
+
|
303
|
+
xmlDocPtr doc;
|
199
304
|
if (output->document->v.document.has_doctype) {
|
200
305
|
const char *name = output->document->v.document.name;
|
201
306
|
const char *public = output->document->v.document.public_identifier;
|
202
307
|
const char *system = output->document->v.document.system_identifier;
|
203
|
-
|
204
|
-
|
205
|
-
|
308
|
+
public = public[0] ? public : NULL;
|
309
|
+
system = system[0] ? system : NULL;
|
310
|
+
doc = new_html_doc(name, system, public);
|
311
|
+
} else {
|
312
|
+
doc = new_html_doc(NULL, NULL, NULL);
|
206
313
|
}
|
207
314
|
|
208
315
|
GumboVector *children = &output->document->v.document.children;
|
209
|
-
for (
|
316
|
+
for (size_t i=0; i < children->length; i++) {
|
210
317
|
GumboNode *child = children->data[i];
|
211
318
|
xmlNodePtr node = walk_tree(doc, child);
|
212
319
|
if (node) {
|
@@ -222,28 +329,20 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
|
|
222
329
|
// Add parse errors to rdoc.
|
223
330
|
if (output->errors.length) {
|
224
331
|
GumboVector *errors = &output->errors;
|
225
|
-
GumboParser parser = { ._options = &options };
|
226
332
|
GumboStringBuffer msg;
|
227
333
|
VALUE rerrors = rb_ary_new2(errors->length);
|
228
334
|
|
229
|
-
gumbo_string_buffer_init(&
|
230
|
-
for (
|
335
|
+
gumbo_string_buffer_init(&msg);
|
336
|
+
for (size_t i=0; i < errors->length; i++) {
|
231
337
|
GumboError *err = errors->data[i];
|
232
|
-
gumbo_string_buffer_clear(&
|
233
|
-
|
234
|
-
// See https://github.com/google/gumbo-parser/pull/371
|
235
|
-
// The bug occurs when the error starts with a newline (unless it's the
|
236
|
-
// first character in the input--but that shouldn't cause an error in
|
237
|
-
// the first place.
|
238
|
-
if (*err->original_text == '\n' && err->original_text != input)
|
239
|
-
--err->original_text;
|
240
|
-
gumbo_caret_diagnostic_to_string(&parser, err, input, &msg);
|
338
|
+
gumbo_string_buffer_clear(&msg);
|
339
|
+
gumbo_caret_diagnostic_to_string(err, input, input_len, &msg);
|
241
340
|
VALUE err_str = rb_str_new(msg.data, msg.length);
|
242
|
-
VALUE syntax_error = rb_class_new_instance(1, &err_str,
|
341
|
+
VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
|
243
342
|
rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
|
244
343
|
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
|
245
344
|
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
|
246
|
-
rb_iv_set(syntax_error, "@file",
|
345
|
+
rb_iv_set(syntax_error, "@file", url);
|
247
346
|
rb_iv_set(syntax_error, "@line", INT2NUM(err->position.line));
|
248
347
|
rb_iv_set(syntax_error, "@str1", Qnil);
|
249
348
|
rb_iv_set(syntax_error, "@str2", Qnil);
|
@@ -253,28 +352,28 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
|
|
253
352
|
rb_ary_push(rerrors, syntax_error);
|
254
353
|
}
|
255
354
|
rb_iv_set(rdoc, "@errors", rerrors);
|
256
|
-
gumbo_string_buffer_destroy(&
|
355
|
+
gumbo_string_buffer_destroy(&msg);
|
257
356
|
}
|
258
357
|
|
259
|
-
gumbo_destroy_output(
|
358
|
+
gumbo_destroy_output(output);
|
260
359
|
|
261
360
|
return rdoc;
|
262
361
|
}
|
263
362
|
|
264
363
|
// Initialize the Nokogumbo class and fetch constants we will use later
|
265
|
-
void
|
364
|
+
void Init_nokogumbo() {
|
266
365
|
rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
|
267
366
|
rb_require("nokogiri");
|
268
367
|
|
269
368
|
// class constants
|
270
369
|
VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
271
|
-
VALUE
|
272
|
-
Document = rb_const_get(
|
273
|
-
VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
274
|
-
XMLSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
|
370
|
+
VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
|
371
|
+
Document = rb_const_get(HTML5, rb_intern("Document"));
|
275
372
|
|
276
373
|
#ifndef NGLIB
|
277
374
|
// more class constants
|
375
|
+
VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
376
|
+
cNokogiriXmlSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
|
278
377
|
Element = rb_const_get(XML, rb_intern("Element"));
|
279
378
|
Text = rb_const_get(XML, rb_intern("Text"));
|
280
379
|
CDATA = rb_const_get(XML, rb_intern("CDATA"));
|
@@ -282,14 +381,18 @@ void Init_nokogumboc() {
|
|
282
381
|
|
283
382
|
// interned symbols
|
284
383
|
new = rb_intern("new");
|
384
|
+
attribute = rb_intern("attribute");
|
285
385
|
set_attribute = rb_intern("set_attribute");
|
386
|
+
remove_attribute = rb_intern("remove_attribute");
|
286
387
|
add_child = rb_intern("add_child_node_and_reparent_attrs");
|
287
388
|
internal_subset = rb_intern("internal_subset");
|
288
389
|
remove_ = rb_intern("remove");
|
289
390
|
create_internal_subset = rb_intern("create_internal_subset");
|
391
|
+
key_ = rb_intern("key?");
|
392
|
+
node_name_ = rb_intern("node_name=");
|
290
393
|
#endif
|
291
394
|
|
292
|
-
// define Nokogumbo
|
293
|
-
VALUE Gumbo =
|
294
|
-
rb_define_singleton_method(Gumbo, "parse", parse,
|
395
|
+
// define Nokogumbo module with a parse method
|
396
|
+
VALUE Gumbo = rb_define_module("Nokogumbo");
|
397
|
+
rb_define_singleton_method(Gumbo, "parse", parse, 4);
|
295
398
|
}
|