nokogumbo 1.5.0 → 2.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +237 -26
- data/ext/nokogumbo/extconf.rb +144 -0
- data/ext/nokogumbo/nokogumbo.c +793 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +26 -28
- data/gumbo-parser/src/attribute.h +3 -23
- data/gumbo-parser/src/char_ref.c +5972 -6816
- data/gumbo-parser/src/char_ref.h +14 -45
- data/gumbo-parser/src/error.c +510 -163
- data/gumbo-parser/src/error.h +70 -147
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/gumbo.h +577 -305
- data/gumbo-parser/src/insertion_mode.h +4 -28
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +2922 -2228
- data/gumbo-parser/src/parser.h +6 -22
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +43 -50
- data/gumbo-parser/src/string_buffer.h +24 -40
- data/gumbo-parser/src/string_piece.c +39 -39
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/tag.c +186 -59
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +1 -25
- data/gumbo-parser/src/tokenizer.c +2128 -1562
- data/gumbo-parser/src/tokenizer.h +41 -52
- data/gumbo-parser/src/tokenizer_states.h +281 -45
- data/gumbo-parser/src/utf8.c +98 -123
- data/gumbo-parser/src/utf8.h +84 -52
- data/gumbo-parser/src/util.c +48 -38
- data/gumbo-parser/src/util.h +10 -40
- data/gumbo-parser/src/vector.c +45 -57
- data/gumbo-parser/src/vector.h +17 -39
- data/lib/nokogumbo.rb +18 -170
- data/lib/nokogumbo/html5.rb +252 -0
- data/lib/nokogumbo/html5/document.rb +53 -0
- data/lib/nokogumbo/html5/document_fragment.rb +62 -0
- data/lib/nokogumbo/html5/node.rb +72 -0
- data/lib/nokogumbo/version.rb +3 -0
- metadata +40 -21
- data/ext/nokogumboc/extconf.rb +0 -60
- data/ext/nokogumboc/nokogumbo.c +0 -295
- data/gumbo-parser/src/char_ref.rl +0 -2554
- data/gumbo-parser/src/string_piece.h +0 -38
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -153
- data/gumbo-parser/src/tag_gperf.h +0 -105
- data/gumbo-parser/src/tag_sizes.h +0 -4
- data/gumbo-parser/src/tag_strings.h +0 -153
- data/gumbo-parser/visualc/include/strings.h +0 -4
- data/test-nokogumbo.rb +0 -190
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a84b367d94046358f7844781b0f92cea51a75e052d54e35b53ab03602743f1b8
|
4
|
+
data.tar.gz: 8d96a5adfa701f658f7ba193ee96bb8a7e6901c1ff4d3fb2dad6f3e372ce66d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de2472c6ff89e3f0076a44ac13fa67688e82f909b265a2b70fe45225daf01aaf6059c6ca94f06e10ff94e10ac8a8f42b685e63f494849f04f3af56f337a73382
|
7
|
+
data.tar.gz: 3880defdaa15cb278236cf170d5727d1d73b14698f1ea41e7a7141da7a2fe8c3bafea19367196214c0dc0c1c27854602714d80abd30ecfd6be90f4277f3e33d7
|
data/README.md
CHANGED
@@ -1,23 +1,21 @@
|
|
1
|
-
Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
|
2
|
-
===========
|
1
|
+
# Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
|
3
2
|
|
4
|
-
Nokogumbo provides the ability for a Ruby program to invoke
|
5
|
-
[Gumbo HTML5 parser](https://github.com/
|
3
|
+
Nokogumbo provides the ability for a Ruby program to invoke
|
4
|
+
[our version of the Gumbo HTML5 parser](https://github.com/rubys/nokogumbo/tree/master/gumbo-parser/src)
|
6
5
|
and to access the result as a
|
7
6
|
[Nokogiri::HTML::Document](http://rdoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document).
|
8
7
|
|
9
|
-
[![Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo)
|
8
|
+
[![Travis-CI Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo)
|
9
|
+
[![Appveyor Build Status](https://ci.appveyor.com/api/projects/status/github/rubys/nokogumbo)](https://ci.appveyor.com/project/rubys/nokogumbo/branch/master)
|
10
10
|
|
11
|
-
Usage
|
12
|
-
-----
|
11
|
+
## Usage
|
13
12
|
|
14
13
|
```ruby
|
15
14
|
require 'nokogumbo'
|
16
|
-
doc = Nokogiri::HTML5(string)
|
15
|
+
doc = Nokogiri.HTML5(string)
|
17
16
|
```
|
18
17
|
|
19
|
-
|
20
|
-
compliant, it may be useful:
|
18
|
+
To parse an HTML fragment, a `fragment` method is provided.
|
21
19
|
|
22
20
|
```ruby
|
23
21
|
require 'nokogumbo'
|
@@ -32,21 +30,207 @@ require 'nokogumbo'
|
|
32
30
|
doc = Nokogiri::HTML5.get(uri)
|
33
31
|
```
|
34
32
|
|
35
|
-
|
36
|
-
|
33
|
+
## Parsing options
|
34
|
+
The document and fragment parsing methods,
|
35
|
+
- `Nokogiri.HTML5(html, url = nil, encoding = nil, options = {})`
|
36
|
+
- `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {})`
|
37
|
+
- `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {})`
|
38
|
+
- `Nokogiri::HTML5.fragment(html, encoding = nil, options = {})`
|
39
|
+
- `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})`
|
40
|
+
support options that are different from Nokogiri's.
|
41
|
+
|
42
|
+
The three currently supported options are `:max_errors`, `:max_tree_depth` and
|
43
|
+
`:max_attributes`, described below.
|
44
|
+
|
45
|
+
### Error reporting
|
46
|
+
Nokogumbo contains an experimental parse error reporting facility. By default,
|
47
|
+
no parse errors are reported but this can be configured by passing the
|
48
|
+
`:max_errors` option to `::parse` or `::fragment`.
|
49
|
+
|
37
50
|
```ruby
|
38
51
|
require 'nokogumbo'
|
39
|
-
|
52
|
+
doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
|
53
|
+
doc.errors.each do |err|
|
54
|
+
puts(err)
|
55
|
+
end
|
40
56
|
```
|
41
57
|
|
42
|
-
|
58
|
+
This prints the following.
|
43
59
|
```
|
60
|
+
1:1: ERROR: Expected a doctype token
|
61
|
+
<span/>Hi there!</span foo=bar />
|
62
|
+
^
|
63
|
+
1:1: ERROR: Start tag of nonvoid HTML element ends with '/>', use '>'.
|
64
|
+
<span/>Hi there!</span foo=bar />
|
65
|
+
^
|
66
|
+
1:17: ERROR: End tag ends with '/>', use '>'.
|
67
|
+
<span/>Hi there!</span foo=bar />
|
68
|
+
^
|
69
|
+
1:17: ERROR: End tag contains attributes.
|
70
|
+
<span/>Hi there!</span foo=bar />
|
71
|
+
^
|
72
|
+
```
|
73
|
+
|
74
|
+
Using `max_errors: -1` results in an unlimited number of errors being
|
75
|
+
returned.
|
76
|
+
|
77
|
+
The errors returned by `#errors` are instances of
|
78
|
+
[`Nokogiri::XML::SyntaxError`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError).
|
79
|
+
|
80
|
+
The [HTML
|
81
|
+
standard](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors)
|
82
|
+
defines a number of standard parse error codes. These error codes only cover
|
83
|
+
the "tokenization" stage of parsing HTML. The parse errors in the
|
84
|
+
"tree construction" stage do not have standardized error codes (yet).
|
85
|
+
|
86
|
+
As a convenience to Nokogumbo users, the defined error codes are available
|
87
|
+
via the
|
88
|
+
[`Nokogiri::XML::SyntaxError#str1`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError#str1-instance_method)
|
89
|
+
method.
|
90
|
+
|
91
|
+
```ruby
|
44
92
|
require 'nokogumbo'
|
45
|
-
Nokogiri::HTML5.parse(
|
93
|
+
doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
|
94
|
+
doc.errors.each do |err|
|
95
|
+
puts("#{err.line}:#{err.column}: #{err.str1}")
|
96
|
+
end
|
97
|
+
```
|
98
|
+
|
99
|
+
This prints the following.
|
100
|
+
```
|
101
|
+
1:1: generic-parser
|
102
|
+
1:1: non-void-html-element-start-tag-with-trailing-solidus
|
103
|
+
1:17: end-tag-with-trailing-solidus
|
104
|
+
1:17: end-tag-with-attributes
|
105
|
+
```
|
106
|
+
|
107
|
+
Note that the first error is `generic-parser` because it's an error from the
|
108
|
+
tree construction stage and doesn't have a standardized error code.
|
109
|
+
|
110
|
+
For the purposes of semantic versioning, the error messages, error locations,
|
111
|
+
and error codes are not part of Nokogumbo's public API. That is, these are
|
112
|
+
subject to change without Nokogumbo's major version number changing. These may
|
113
|
+
be stabilized in the future.
|
114
|
+
|
115
|
+
### Maximum tree depth
|
116
|
+
The maximum depth of the DOM tree parsed by the various parsing methods is
|
117
|
+
configurable by the `:max_tree_depth` option. If the depth of the tree would
|
118
|
+
exceed this limit, then an
|
119
|
+
[ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is thrown.
|
120
|
+
|
121
|
+
This limit (which defaults to `Nokogumbo::DEFAULT_MAX_TREE_DEPTH = 400`) can
|
122
|
+
be removed by giving the option `max_tree_depth: -1`.
|
123
|
+
|
124
|
+
``` ruby
|
125
|
+
html = '<!DOCTYPE html>' + '<div>' * 1000
|
126
|
+
doc = Nokogiri.HTML5(html)
|
127
|
+
# raises ArgumentError: Document tree depth limit exceeded
|
128
|
+
doc = Nokogiri.HTML5(html, max_tree_depth: -1)
|
46
129
|
```
|
47
130
|
|
48
|
-
|
49
|
-
|
131
|
+
### Attribute limit per element
|
132
|
+
The maximum number of attributes per DOM element is configurable by the
|
133
|
+
`:max_attributes` option. If a given element would exceed this limit, then an
|
134
|
+
[ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is thrown.
|
135
|
+
|
136
|
+
This limit (which defaults to `Nokogumbo::DEFAULT_MAX_ATTRIBUTES = 400`) can
|
137
|
+
be removed by giving the option `max_attributes: -1`.
|
138
|
+
|
139
|
+
``` ruby
|
140
|
+
html = '<!DOCTYPE html><div ' + (1..1000).map { |x| "attr-#{x}" }.join(' ') + '>'
|
141
|
+
# "<!DOCTYPE html><div attr-1 attr-2 attr-3 ... attr-1000>"
|
142
|
+
doc = Nokogiri.HTML5(html)
|
143
|
+
# raises ArgumentError: Attributes per element limit exceeded
|
144
|
+
doc = Nokogiri.HTML5(html, max_attributes: -1)
|
145
|
+
```
|
146
|
+
|
147
|
+
## HTML Serialization
|
148
|
+
|
149
|
+
After parsing HTML, it may be serialized using any of the Nokogiri
|
150
|
+
[serialization
|
151
|
+
methods](https://www.rubydoc.info/gems/nokogiri/Nokogiri/XML/Node). In
|
152
|
+
particular, `#serialize`, `#to_html`, and `#to_s` will serialize a given node
|
153
|
+
and its children. (This is the equivalent of JavaScript's
|
154
|
+
`Element.outerHTML`.) Similarly, `#inner_html` will serialize the children of
|
155
|
+
a given node. (This is the equivalent of JavaScript's `Element.innerHTML`.)
|
156
|
+
|
157
|
+
``` ruby
|
158
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><span>Hello world!</span>")
|
159
|
+
puts doc.serialize
|
160
|
+
# Prints: <!DOCTYPE html><html><head></head><body><span>Hello world!</span></body></html>
|
161
|
+
```
|
162
|
+
|
163
|
+
Due to quirks in how HTML is parsed and serialized, it's possible for a DOM
|
164
|
+
tree to be serialized and then re-parsed, resulting in a different DOM.
|
165
|
+
Mostly, this happens with DOMs produced from invalid HTML. Unfortunately, even
|
166
|
+
valid HTML may not survive serialization and re-parsing.
|
167
|
+
|
168
|
+
In particular, a newline at the start of `pre`, `listing`, and `textarea`
|
169
|
+
elements is ignored by the parser.
|
170
|
+
|
171
|
+
``` ruby
|
172
|
+
doc = Nokogiri::HTML5(<<-EOF)
|
173
|
+
<!DOCTYPE html>
|
174
|
+
<pre>
|
175
|
+
Content</pre>
|
176
|
+
EOF
|
177
|
+
puts doc.at('/html/body/pre').serialize
|
178
|
+
# Prints: <pre>Content</pre>
|
179
|
+
```
|
180
|
+
|
181
|
+
In this case, the original HTML is semantically equivalent to the serialized
|
182
|
+
version. If the `pre`, `listing`, or `textarea` content starts with two
|
183
|
+
newlines, the first newline will be stripped on the first parse and the second
|
184
|
+
newline will be stripped on the second, leading to semantically different
|
185
|
+
DOMs. Passing the parameter `preserve_newline: true` will cause two or more
|
186
|
+
newlines to be preserved. (A single leading newline will still be removed.)
|
187
|
+
|
188
|
+
``` ruby
|
189
|
+
doc = Nokogiri::HTML5(<<-EOF)
|
190
|
+
<!DOCTYPE html>
|
191
|
+
<listing>
|
192
|
+
|
193
|
+
Content</listing>
|
194
|
+
EOF
|
195
|
+
puts doc.at('/html/body/listing').serialize(preserve_newline: true)
|
196
|
+
# Prints: <listing>
|
197
|
+
#
|
198
|
+
# Content</listing>
|
199
|
+
```
|
200
|
+
|
201
|
+
## Encodings
|
202
|
+
Nokogumbo always parses HTML using
|
203
|
+
[UTF-8](https://en.wikipedia.org/wiki/UTF-8); however, the encoding of the
|
204
|
+
input can be explicitly selected via the optional `encoding` parameter. This
|
205
|
+
is most useful when the input comes not from a string but from an IO object.
|
206
|
+
|
207
|
+
When serializing a document or node, the encoding of the output string can be
|
208
|
+
specified via the `:encoding` option. Characters that cannot be encoded in
|
209
|
+
the selected encoding will be encoded as [HTML numeric
|
210
|
+
entities](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references).
|
211
|
+
|
212
|
+
``` ruby
|
213
|
+
frag = Nokogiri::HTML5.fragment('<span>아는 길도 물어가라</span>')
|
214
|
+
html = frag.serialize(encoding: 'US-ASCII')
|
215
|
+
puts html
|
216
|
+
# Prints: <span>&#xc544;&#xb294; &#xae38;&#xb3c4; &#xbb3c;&#xc5b4;&#xac00;&#xb77c;</span>
|
217
|
+
frag = Nokogiri::HTML5.fragment(html)
|
218
|
+
puts frag.serialize
|
219
|
+
# Prints: <span>아는 길도 물어가라</span>
|
220
|
+
```
|
221
|
+
|
222
|
+
(There's a [bug](https://bugs.ruby-lang.org/issues/15033) in all current
|
223
|
+
versions of Ruby that can cause the entity encoding to fail. Of the mandated
|
224
|
+
supported encodings for HTML, the only encoding I'm aware of that has this bug
|
225
|
+
is `'ISO-2022-JP'`. I recommend avoiding this encoding.)
|
226
|
+
|
227
|
+
## Examples
|
228
|
+
```ruby
|
229
|
+
require 'nokogumbo'
|
230
|
+
puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
|
231
|
+
```
|
232
|
+
|
233
|
+
## Notes
|
50
234
|
|
51
235
|
* The `Nokogiri::HTML5.fragment` function takes a string and parses it
|
52
236
|
as an HTML5 document. The `<html>`, `<head>`, and `<body>` elements are
|
@@ -74,20 +258,47 @@ rules defined in the HTML5 specification for doing so.
|
|
74
258
|
* Instead of returning `unknown` as the element name for unknown tags, the
|
75
259
|
original tag name is returned verbatim.
|
76
260
|
|
77
|
-
|
78
|
-
|
261
|
+
# Flavors of Nokogumbo
|
262
|
+
Nokogumbo uses libxml2, the XML library underlying Nokogiri, to speed up
|
263
|
+
parsing. If the libxml2 headers are not available, then Nokogumbo resorts to
|
264
|
+
using Nokogiri's Ruby API to construct the DOM tree.
|
265
|
+
|
266
|
+
Nokogiri can be configured to either use the system library version of libxml2
|
267
|
+
or use a bundled version. By default (as of Nokogiri version 1.8.4), Nokogiri
|
268
|
+
will use a bundled version.
|
269
|
+
|
270
|
+
To prevent differences between versions of libxml2, Nokogumbo will only use
|
271
|
+
libxml2 if the build process can find the exact same version used by Nokogiri.
|
272
|
+
This leads to three possibilities:
|
273
|
+
|
274
|
+
1. Nokogiri is compiled with the bundled libxml2. In this case, Nokogumbo will
|
275
|
+
(by default) use the same version of libxml2.
|
276
|
+
2. Nokogiri is compiled with the system libxml2. In this case, if the libxml2
|
277
|
+
headers are available, then Nokogumbo will (by default) use the system
|
278
|
+
version and headers.
|
279
|
+
3. Nokogiri is compiled with the system libxml2 but its headers aren't
|
280
|
+
available at build time for Nokogumbo. In this case, Nokogumbo will use the
|
281
|
+
slower Ruby API.
|
282
|
+
|
283
|
+
Using libxml2 can be required by passing `-- --with-libxml2` to `bundle exec
|
284
|
+
rake` or to `gem install`. Using libxml2 can be prohibited by instead passing
|
285
|
+
`-- --without-libxml2`.
|
286
|
+
|
287
|
+
Functionally, the only difference between using libxml2 or not is in the
|
288
|
+
behavior of `Nokogiri::XML::Node#line`. If it is used, then `#line` will
|
289
|
+
return the line number of the corresponding node. Otherwise, it will return 0.
|
79
290
|
|
80
|
-
Installation
|
81
|
-
============
|
291
|
+
# Installation
|
82
292
|
|
83
|
-
git clone
|
293
|
+
git clone https://github.com/rubys/nokogumbo.git
|
84
294
|
cd nokogumbo
|
85
295
|
bundle install
|
86
296
|
rake gem
|
87
297
|
gem install pkg/nokogumbo*.gem
|
88
298
|
|
89
|
-
Related efforts
|
90
|
-
============
|
299
|
+
# Related efforts
|
91
300
|
|
92
|
-
* [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme)
|
93
|
-
for the Gumbo HTML5 parser.
|
301
|
+
* [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) -- a ruby binding
|
302
|
+
for the Gumbo HTML5 parser.
|
303
|
+
* [lua-gumbo](https://gitlab.com/craigbarnes/lua-gumbo) -- a lua binding for
|
304
|
+
the Gumbo HTML5 parser.
|
@@ -0,0 +1,144 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'mkmf'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
$CFLAGS += " -std=c99"
|
7
|
+
$LDFLAGS.gsub!('-Wl,--no-undefined', '')
|
8
|
+
$DLDFLAGS.gsub!('-Wl,--no-undefined', '')
|
9
|
+
$warnflags = CONFIG['warnflags'] = '-Wall'
|
10
|
+
|
11
|
+
NG_SPEC = Gem::Specification.find_by_name('nokogiri', "= #{Nokogiri::VERSION}")
|
12
|
+
|
13
|
+
def download_headers
|
14
|
+
begin
|
15
|
+
require 'yaml'
|
16
|
+
|
17
|
+
dependencies = YAML.load_file(File.join(NG_SPEC.gem_dir, 'dependencies.yml'))
|
18
|
+
version = dependencies['libxml2']['version']
|
19
|
+
host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
|
20
|
+
path = File.join('ports', host, 'libxml2', version, 'include/libxml2')
|
21
|
+
return path if File.directory?(path)
|
22
|
+
|
23
|
+
# Make sure we're using the same version Nokogiri uses
|
24
|
+
dep_index = NG_SPEC.dependencies.index { |dep| dep.name == 'mini_portile2' and dep.type == :runtime }
|
25
|
+
return nil if dep_index.nil?
|
26
|
+
requirement = NG_SPEC.dependencies[dep_index].requirement.to_s
|
27
|
+
|
28
|
+
gem 'mini_portile2', requirement
|
29
|
+
require 'mini_portile2'
|
30
|
+
p = MiniPortile::new('libxml2', version).tap do |r|
|
31
|
+
r.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
|
32
|
+
r.files = [{
|
33
|
+
url: "http://xmlsoft.org/sources/libxml2-#{r.version}.tar.gz",
|
34
|
+
sha256: dependencies['libxml2']['sha256']
|
35
|
+
}]
|
36
|
+
r.configure_options += [
|
37
|
+
"--without-python",
|
38
|
+
"--without-readline",
|
39
|
+
"--with-c14n",
|
40
|
+
"--with-debug",
|
41
|
+
"--with-threads"
|
42
|
+
]
|
43
|
+
end
|
44
|
+
p.download unless p.downloaded?
|
45
|
+
p.extract
|
46
|
+
p.configure unless p.configured?
|
47
|
+
system('make', '-C', "tmp/#{p.host}/ports/libxml2/#{version}/libxml2-#{version}/include/libxml", 'install-xmlincHEADERS')
|
48
|
+
path
|
49
|
+
rescue
|
50
|
+
puts 'failed to download/install headers'
|
51
|
+
nil
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
required = arg_config('--with-libxml2')
|
56
|
+
prohibited = arg_config('--without-libxml2')
|
57
|
+
if required and prohibited
|
58
|
+
abort "cannot use both --with-libxml2 and --without-libxml2"
|
59
|
+
end
|
60
|
+
|
61
|
+
have_libxml2 = false
|
62
|
+
have_ng = false
|
63
|
+
|
64
|
+
def windows?
|
65
|
+
::RUBY_PLATFORM =~ /mingw|mswin/
|
66
|
+
end
|
67
|
+
|
68
|
+
def modern_nokogiri?
|
69
|
+
nokogiri_version = Gem::Version.new(Nokogiri::VERSION)
|
70
|
+
requirement = windows? ? ">= 1.11.2" : ">= 1.11.0.rc4"
|
71
|
+
Gem::Requirement.new(requirement).satisfied_by?(nokogiri_version)
|
72
|
+
end
|
73
|
+
|
74
|
+
if !prohibited
|
75
|
+
if modern_nokogiri?
|
76
|
+
append_cflags(Nokogiri::VERSION_INFO["nokogiri"]["cppflags"])
|
77
|
+
append_ldflags(Nokogiri::VERSION_INFO["nokogiri"]["ldflags"]) # may be nil for nokogiri pre-1.11.2
|
78
|
+
have_libxml2 = if Nokogiri::VERSION_INFO["nokogiri"]["ldflags"].empty?
|
79
|
+
have_header('libxml/tree.h')
|
80
|
+
else
|
81
|
+
have_func("xmlNewDoc", "libxml/tree.h")
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
if !have_libxml2
|
86
|
+
if Nokogiri::VERSION_INFO.include?('libxml') and
|
87
|
+
Nokogiri::VERSION_INFO['libxml']['source'] == 'packaged'
|
88
|
+
# Nokogiri has libxml2 built in. Find the headers.
|
89
|
+
libxml2_path = File.join(Nokogiri::VERSION_INFO['libxml']['libxml2_path'],
|
90
|
+
'include/libxml2')
|
91
|
+
if find_header('libxml/tree.h', libxml2_path)
|
92
|
+
have_libxml2 = true
|
93
|
+
else
|
94
|
+
# Unfortunately, some versions of Nokogiri delete these files.
|
95
|
+
# https://github.com/sparklemotion/nokogiri/pull/1788
|
96
|
+
# Try to download them
|
97
|
+
libxml2_path = download_headers
|
98
|
+
unless libxml2_path.nil?
|
99
|
+
have_libxml2 = find_header('libxml/tree.h', libxml2_path)
|
100
|
+
end
|
101
|
+
end
|
102
|
+
else
|
103
|
+
# Nokogiri is compiled with system headers.
|
104
|
+
# Hack to work around broken mkmf on macOS
|
105
|
+
# (https://bugs.ruby-lang.org/issues/14992 fixed now)
|
106
|
+
if RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] == 'DYLD_LIBRARY_PATH'
|
107
|
+
RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] = 'DYLD_FALLBACK_LIBRARY_PATH'
|
108
|
+
end
|
109
|
+
|
110
|
+
pkg_config('libxml-2.0')
|
111
|
+
have_libxml2 = have_library('xml2', 'xmlNewDoc')
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
if required and !have_libxml2
|
116
|
+
abort "libxml2 required but could not be located"
|
117
|
+
end
|
118
|
+
|
119
|
+
|
120
|
+
if have_libxml2
|
121
|
+
have_ng = have_header('nokogiri.h') || find_header('nokogiri.h', File.join(NG_SPEC.gem_dir, 'ext/nokogiri'))
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
if have_libxml2 and have_ng
|
126
|
+
$CFLAGS += " -DNGLIB=1"
|
127
|
+
end
|
128
|
+
|
129
|
+
# Symlink gumbo-parser source files.
|
130
|
+
ext_dir = File.dirname(__FILE__)
|
131
|
+
|
132
|
+
Dir.chdir(ext_dir) do
|
133
|
+
$srcs = Dir['*.c', '../../gumbo-parser/src/*.c']
|
134
|
+
$hdrs = Dir['*.h', '../../gumbo-parser/src/*.h']
|
135
|
+
end
|
136
|
+
$INCFLAGS << ' -I$(srcdir)/../../gumbo-parser/src'
|
137
|
+
$VPATH << '$(srcdir)/../../gumbo-parser/src'
|
138
|
+
|
139
|
+
create_makefile('nokogumbo/nokogumbo') do |conf|
|
140
|
+
conf.map! do |chunk|
|
141
|
+
chunk.gsub(/^HDRS = .*$/, "HDRS = #{$hdrs.map { |h| File.join('$(srcdir)', h)}.join(' ')}")
|
142
|
+
end
|
143
|
+
end
|
144
|
+
# vim: set sw=2 sts=2 ts=8 et:
|