nokogumbo 2.0.0.pre.alpha → 2.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +101 -14
- data/ext/nokogumbo/extconf.rb +7 -2
- data/ext/nokogumbo/nokogumbo.c +630 -235
- data/gumbo-parser/src/ascii.c +42 -0
- data/gumbo-parser/src/ascii.h +91 -7
- data/gumbo-parser/src/char_ref.c +5973 -4601
- data/gumbo-parser/src/char_ref.h +13 -28
- data/gumbo-parser/src/error.c +391 -126
- data/gumbo-parser/src/error.h +63 -125
- data/gumbo-parser/src/gumbo.h +74 -4
- data/gumbo-parser/src/parser.c +1161 -1025
- data/gumbo-parser/src/string_buffer.c +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/tokenizer.c +1440 -1278
- data/gumbo-parser/src/tokenizer.h +7 -18
- data/gumbo-parser/src/tokenizer_states.h +275 -23
- data/gumbo-parser/src/utf8.c +17 -59
- data/gumbo-parser/src/utf8.h +52 -16
- data/lib/nokogumbo.rb +3 -1
- data/lib/nokogumbo/html5.rb +17 -15
- data/lib/nokogumbo/html5/document.rb +19 -3
- data/lib/nokogumbo/html5/document_fragment.rb +36 -20
- data/lib/nokogumbo/{xml → html5}/node.rb +28 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +20 -14
- data/CHANGELOG.md +0 -56
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d5936b6ffffb20ec22609b2be5d2493919c81f7c8279a7c21fd8e95e457d0c5
|
4
|
+
data.tar.gz: 5259acf1b328d1097edcbb162433d72e69b084b66ed8bae28703bca53bd88b76
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6fdc59096ec69684932424b70bed03f05f6ea5324e9d3f70eee8481891a15ae9f3cf063fff7106cc38d2873d072d21169b58ed6f5b381b687aa5b1680eee35e6
|
7
|
+
data.tar.gz: 4d1d88817b4242b060e5a9ac74e36e9f23d38b153547b64d51a59d86daff44853915ba8bd07473902fe4d936cd53851d2594c3ac09541159863117eedf8339ab
|
data/README.md
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
# Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
|
2
2
|
|
3
|
-
Nokogumbo provides the ability for a Ruby program to invoke
|
4
|
-
[Gumbo HTML5 parser](https://github.com/
|
3
|
+
Nokogumbo provides the ability for a Ruby program to invoke
|
4
|
+
[our version of the Gumbo HTML5 parser](https://github.com/rubys/nokogumbo/tree/master/gumbo-parser/src)
|
5
5
|
and to access the result as a
|
6
6
|
[Nokogiri::HTML::Document](http://rdoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document).
|
7
7
|
|
8
|
-
[![Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo)
|
8
|
+
[![Travis-CI Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo)
|
9
|
+
[![Appveyor Build Status](https://ci.appveyor.com/api/projects/status/github/rubys/nokogumbo)](https://ci.appveyor.com/project/rubys/nokogumbo/branch/master)
|
9
10
|
|
10
11
|
## Usage
|
11
12
|
|
@@ -14,8 +15,7 @@ require 'nokogumbo'
|
|
14
15
|
doc = Nokogiri.HTML5(string)
|
15
16
|
```
|
16
17
|
|
17
|
-
|
18
|
-
compliant, it may be useful:
|
18
|
+
To parse an HTML fragment, a `fragment` method is provided.
|
19
19
|
|
20
20
|
```ruby
|
21
21
|
require 'nokogumbo'
|
@@ -39,8 +39,8 @@ The document and fragment parsing methods,
|
|
39
39
|
- `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})`
|
40
40
|
support options that are different from Nokogiri's.
|
41
41
|
|
42
|
-
The
|
43
|
-
described below.
|
42
|
+
The three currently supported options are `:max_errors`, `:max_tree_depth` and
|
43
|
+
`:max_attributes`, described below.
|
44
44
|
|
45
45
|
### Error reporting
|
46
46
|
Nokogumbo contains an experimental parse error reporting facility. By default,
|
@@ -49,20 +49,26 @@ no parse errors are reported but this can be configured by passing the
|
|
49
49
|
|
50
50
|
```ruby
|
51
51
|
require 'nokogumbo'
|
52
|
-
doc = Nokogiri::HTML5.parse('Hi there
|
52
|
+
doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
|
53
53
|
doc.errors.each do |err|
|
54
|
-
puts
|
54
|
+
puts(err)
|
55
55
|
end
|
56
56
|
```
|
57
57
|
|
58
58
|
This prints the following.
|
59
59
|
```
|
60
|
-
1:1: ERROR:
|
61
|
-
Hi there
|
60
|
+
1:1: ERROR: Expected a doctype token
|
61
|
+
<span/>Hi there!</span foo=bar />
|
62
62
|
^
|
63
|
-
1:
|
64
|
-
Hi there
|
65
|
-
|
63
|
+
1:1: ERROR: Start tag of nonvoid HTML element ends with '/>', use '>'.
|
64
|
+
<span/>Hi there!</span foo=bar />
|
65
|
+
^
|
66
|
+
1:17: ERROR: End tag ends with '/>', use '>'.
|
67
|
+
<span/>Hi there!</span foo=bar />
|
68
|
+
^
|
69
|
+
1:17: ERROR: End tag contains attributes.
|
70
|
+
<span/>Hi there!</span foo=bar />
|
71
|
+
^
|
66
72
|
```
|
67
73
|
|
68
74
|
Using `max_errors: -1` results in an unlimited number of errors being
|
@@ -71,6 +77,41 @@ returned.
|
|
71
77
|
The errors returned by `#errors` are instances of
|
72
78
|
[`Nokogiri::XML::SyntaxError`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError).
|
73
79
|
|
80
|
+
The [HTML
|
81
|
+
standard](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors)
|
82
|
+
defines a number of standard parse error codes. These error codes only cover
|
83
|
+
the "tokenization" stage of parsing HTML. The parse errors in the
|
84
|
+
"tree construction" stage do not have standardized error codes (yet).
|
85
|
+
|
86
|
+
As a convenience to Nokogumbo users, the defined error codes are available
|
87
|
+
via the
|
88
|
+
[`Nokogiri::XML::SyntaxError#str1`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError#str1-instance_method)
|
89
|
+
method.
|
90
|
+
|
91
|
+
```ruby
|
92
|
+
require 'nokogumbo'
|
93
|
+
doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
|
94
|
+
doc.errors.each do |err|
|
95
|
+
puts("#{err.line}:#{err.column}: #{err.str1}")
|
96
|
+
end
|
97
|
+
```
|
98
|
+
|
99
|
+
This prints the following.
|
100
|
+
```
|
101
|
+
1:1: generic-parser
|
102
|
+
1:1: non-void-html-element-start-tag-with-trailing-solidus
|
103
|
+
1:17: end-tag-with-trailing-solidus
|
104
|
+
1:17: end-tag-with-attributes
|
105
|
+
```
|
106
|
+
|
107
|
+
Note that the first error is `generic-parser` because it's an error from the
|
108
|
+
tree construction stage and doesn't have a standardized error code.
|
109
|
+
|
110
|
+
For the purposes of semantic versioning, the error messages, error locations,
|
111
|
+
and error codes are not part of Nokogumbo's public API. That is, these are
|
112
|
+
subject to change without Nokogumbo's major version number changing. These may
|
113
|
+
be stabilized in the future.
|
114
|
+
|
74
115
|
### Maximum tree depth
|
75
116
|
The maximum depth of the DOM tree parsed by the various parsing methods is
|
76
117
|
configurable by the `:max_tree_depth` option. If the depth of the tree would
|
@@ -87,6 +128,22 @@ doc = Nokogiri.HTML5(html)
|
|
87
128
|
doc = Nokogiri.HTML5(html, max_tree_depth: -1)
|
88
129
|
```
|
89
130
|
|
131
|
+
### Attribute limit per element
|
132
|
+
The maximum number of attributes per DOM element is configurable by the
|
133
|
+
`:max_attributes` option. If a given element would exceed this limit, then an
|
134
|
+
[ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is thrown.
|
135
|
+
|
136
|
+
This limit (which defaults to `Nokogumbo::DEFAULT_MAX_ATTRIBUTES = 400`) can
|
137
|
+
be removed by giving the option `max_attributes: -1`.
|
138
|
+
|
139
|
+
``` ruby
|
140
|
+
html = '<!DOCTYPE html><div ' + (1..1000).map { |x| "attr-#{x}" }.join(' ') + '>'
|
141
|
+
# "<!DOCTYPE html><div attr-1 attr-2 attr-3 ... attr-1000>"
|
142
|
+
doc = Nokogiri.HTML5(html)
|
143
|
+
# raises ArgumentError: Attributes per element limit exceeded
|
144
|
+
doc = Nokogiri.HTML5(html, max_attributes: -1)
|
145
|
+
```
|
146
|
+
|
90
147
|
## HTML Serialization
|
91
148
|
|
92
149
|
After parsing HTML, it may be serialized using any of the Nokogiri
|
@@ -201,6 +258,36 @@ rules defined in the HTML5 specification for doing so.
|
|
201
258
|
* Instead of returning `unknown` as the element name for unknown tags, the
|
202
259
|
original tag name is returned verbatim.
|
203
260
|
|
261
|
+
# Flavors of Nokogumbo
|
262
|
+
Nokogumbo uses libxml2, the XML library underlying Nokogiri, to speed up
|
263
|
+
parsing. If the libxml2 headers are not available, then Nokogumbo resorts to
|
264
|
+
using Nokogiri's Ruby API to construct the DOM tree.
|
265
|
+
|
266
|
+
Nokogiri can be configured to either use the system library version of libxml2
|
267
|
+
or use a bundled version. By default (as of Nokogiri version 1.8.4), Nokogiri
|
268
|
+
will use a bundled version.
|
269
|
+
|
270
|
+
To prevent differences between versions of libxml2, Nokogumbo will only use
|
271
|
+
libxml2 if the build process can find the exact same version used by Nokogiri.
|
272
|
+
This leads to three possibilities
|
273
|
+
|
274
|
+
1. Nokogiri is compiled with the bundled libxml2. In this case, Nokogumbo will
|
275
|
+
(by default) use the same version of libxml2.
|
276
|
+
2. Nokogiri is compiled with the system libxml2. In this case, if the libxml2
|
277
|
+
headers are available, then Nokogumbo will (by default) use the system
|
278
|
+
version and headers.
|
279
|
+
3. Nokogiri is compiled with the system libxml2 but its headers aren't
|
280
|
+
available at build time for Nokogumbo. In this case, Nokogumbo will use the
|
281
|
+
slower Ruby API.
|
282
|
+
|
283
|
+
Using libxml2 can be required by passing `-- --with-libxml2` to `bundle exec
|
284
|
+
rake` or to `gem install`. Using libxml2 can be prohibited by instead passing
|
285
|
+
`-- --without-libxml2`.
|
286
|
+
|
287
|
+
Functionally, the only difference between using libxml2 or not is in the
|
288
|
+
behavior of `Nokogiri::XML::Node#line`. If it is used, then `#line` will
|
289
|
+
return the line number of the corresponding node. Otherwise, it will return 0.
|
290
|
+
|
204
291
|
# Installation
|
205
292
|
|
206
293
|
git clone https://github.com/rubys/nokogumbo.git
|
data/ext/nokogumbo/extconf.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'rubygems'
|
1
2
|
require 'fileutils'
|
2
3
|
require 'mkmf'
|
3
4
|
require 'nokogiri'
|
@@ -23,7 +24,6 @@ def download_headers
|
|
23
24
|
return nil if dep_index.nil?
|
24
25
|
requirement = NG_SPEC.dependencies[dep_index].requirement.to_s
|
25
26
|
|
26
|
-
require 'rubygems'
|
27
27
|
gem 'mini_portile2', requirement
|
28
28
|
require 'mini_portile2'
|
29
29
|
p = MiniPortile::new('libxml2', version).tap do |r|
|
@@ -108,9 +108,14 @@ gumbo_src = File.join(ext_dir, 'gumbo_src')
|
|
108
108
|
|
109
109
|
Dir.chdir(ext_dir) do
|
110
110
|
$srcs = Dir['*.c', '../../gumbo-parser/src/*.c']
|
111
|
+
$hdrs = Dir['*.h', '../../gumbo-parser/src/*.h']
|
111
112
|
end
|
112
113
|
$INCFLAGS << ' -I$(srcdir)/../../gumbo-parser/src'
|
113
114
|
$VPATH << '$(srcdir)/../../gumbo-parser/src'
|
114
115
|
|
115
|
-
create_makefile('nokogumbo/nokogumbo')
|
116
|
+
create_makefile('nokogumbo/nokogumbo') do |conf|
|
117
|
+
conf.map! do |chunk|
|
118
|
+
chunk.gsub(/^HDRS = .*$/, "HDRS = #{$hdrs.map { |h| File.join('$(srcdir)', h)}.join(' ')}")
|
119
|
+
end
|
120
|
+
end
|
116
121
|
# vim: set sw=2 sts=2 ts=8 et:
|
data/ext/nokogumbo/nokogumbo.c
CHANGED
@@ -9,7 +9,7 @@
|
|
9
9
|
// document tree is then walked:
|
10
10
|
//
|
11
11
|
// * if Nokogiri and libxml2 headers are available at compile time,
|
12
|
-
// (
|
12
|
+
// (if NGLIB) then a parallel libxml2 tree is constructed, and the
|
13
13
|
// final document is then wrapped using Nokogiri_wrap_xml_document.
|
14
14
|
// This approach reduces memory and CPU requirements as Ruby objects
|
15
15
|
// are only built when necessary.
|
@@ -20,74 +20,111 @@
|
|
20
20
|
|
21
21
|
#include <assert.h>
|
22
22
|
#include <ruby.h>
|
23
|
+
#include <ruby/version.h>
|
24
|
+
|
23
25
|
#include "gumbo.h"
|
24
|
-
#include "error.h"
|
25
26
|
|
26
27
|
// class constants
|
27
28
|
static VALUE Document;
|
28
29
|
|
29
|
-
|
30
|
+
// Interned symbols
|
31
|
+
static ID internal_subset;
|
32
|
+
static ID parent;
|
33
|
+
|
34
|
+
/* Backwards compatibility to Ruby 2.1.0 */
|
35
|
+
#if RUBY_API_VERSION_CODE < 20200
|
36
|
+
#define ONIG_ESCAPE_UCHAR_COLLISION 1
|
37
|
+
#include <ruby/encoding.h>
|
38
|
+
|
39
|
+
static VALUE rb_utf8_str_new(const char *str, long length) {
|
40
|
+
return rb_enc_str_new(str, length, rb_utf8_encoding());
|
41
|
+
}
|
42
|
+
|
43
|
+
static VALUE rb_utf8_str_new_cstr(const char *str) {
|
44
|
+
return rb_enc_str_new_cstr(str, rb_utf8_encoding());
|
45
|
+
}
|
46
|
+
|
47
|
+
static VALUE rb_utf8_str_new_static(const char *str, long length) {
|
48
|
+
return rb_enc_str_new(str, length, rb_utf8_encoding());
|
49
|
+
}
|
50
|
+
#endif
|
51
|
+
|
52
|
+
#if NGLIB
|
30
53
|
#include <nokogiri.h>
|
31
|
-
#include <xml_syntax_error.h>
|
32
54
|
#include <libxml/tree.h>
|
33
55
|
#include <libxml/HTMLtree.h>
|
34
56
|
|
35
57
|
#define NIL NULL
|
36
|
-
#define CONST_CAST (xmlChar const*)
|
37
58
|
#else
|
38
59
|
#define NIL Qnil
|
39
|
-
#define CONST_CAST
|
40
60
|
|
41
|
-
//
|
61
|
+
// These are defined by nokogiri.h
|
42
62
|
static VALUE cNokogiriXmlSyntaxError;
|
63
|
+
static VALUE cNokogiriXmlElement;
|
64
|
+
static VALUE cNokogiriXmlText;
|
65
|
+
static VALUE cNokogiriXmlCData;
|
66
|
+
static VALUE cNokogiriXmlComment;
|
67
|
+
|
68
|
+
// Interned symbols.
|
69
|
+
static ID new;
|
70
|
+
static ID node_name_;
|
71
|
+
|
72
|
+
// Map libxml2 types to Ruby VALUE.
|
73
|
+
typedef VALUE xmlNodePtr;
|
74
|
+
typedef VALUE xmlDocPtr;
|
75
|
+
typedef VALUE xmlNsPtr;
|
76
|
+
typedef VALUE xmlDtdPtr;
|
77
|
+
typedef char xmlChar;
|
78
|
+
#define BAD_CAST
|
79
|
+
|
80
|
+
// Redefine libxml2 API as Ruby function calls.
|
81
|
+
static xmlNodePtr xmlNewDocNode(xmlDocPtr doc, xmlNsPtr ns, const xmlChar *name, const xmlChar *content) {
|
82
|
+
assert(ns == NIL && content == NULL);
|
83
|
+
return rb_funcall(cNokogiriXmlElement, new, 2, rb_utf8_str_new_cstr(name), doc);
|
84
|
+
}
|
85
|
+
|
86
|
+
static xmlNodePtr xmlNewDocText(xmlDocPtr doc, const xmlChar *content) {
|
87
|
+
VALUE str = rb_utf8_str_new_cstr(content);
|
88
|
+
return rb_funcall(cNokogiriXmlText, new, 2, str, doc);
|
89
|
+
}
|
90
|
+
|
91
|
+
static xmlNodePtr xmlNewCDataBlock(xmlDocPtr doc, const xmlChar *content, int len) {
|
92
|
+
VALUE str = rb_utf8_str_new(content, len);
|
93
|
+
// CDATA.new takes arguments in the opposite order from Text.new.
|
94
|
+
return rb_funcall(cNokogiriXmlCData, new, 2, doc, str);
|
95
|
+
}
|
96
|
+
|
97
|
+
static xmlNodePtr xmlNewDocComment(xmlDocPtr doc, const xmlChar *content) {
|
98
|
+
VALUE str = rb_utf8_str_new_cstr(content);
|
99
|
+
return rb_funcall(cNokogiriXmlComment, new, 2, doc, str);
|
100
|
+
}
|
101
|
+
|
102
|
+
static xmlNodePtr xmlAddChild(xmlNodePtr parent, xmlNodePtr cur) {
|
103
|
+
ID add_child;
|
104
|
+
CONST_ID(add_child, "add_child");
|
105
|
+
return rb_funcall(parent, add_child, 1, cur);
|
106
|
+
}
|
107
|
+
|
108
|
+
static void xmlSetNs(xmlNodePtr node, xmlNsPtr ns) {
|
109
|
+
ID namespace_;
|
110
|
+
CONST_ID(namespace_, "namespace=");
|
111
|
+
rb_funcall(node, namespace_, 1, ns);
|
112
|
+
}
|
113
|
+
|
114
|
+
static void xmlFreeDoc(xmlDocPtr doc) { }
|
43
115
|
|
44
|
-
static VALUE
|
45
|
-
|
46
|
-
|
47
|
-
static VALUE Comment;
|
48
|
-
|
49
|
-
// interned symbols
|
50
|
-
static VALUE new;
|
51
|
-
static VALUE attribute;
|
52
|
-
static VALUE set_attribute;
|
53
|
-
static VALUE remove_attribute;
|
54
|
-
static VALUE add_child;
|
55
|
-
static VALUE internal_subset;
|
56
|
-
static VALUE remove_;
|
57
|
-
static VALUE create_internal_subset;
|
58
|
-
static VALUE key_;
|
59
|
-
static VALUE node_name_;
|
60
|
-
|
61
|
-
// map libxml2 types to Ruby VALUE
|
62
|
-
#define xmlNodePtr VALUE
|
63
|
-
#define xmlDocPtr VALUE
|
64
|
-
|
65
|
-
// redefine libxml2 API as Ruby function calls
|
66
|
-
#define xmlNewDocNode(doc, ns, name, content) \
|
67
|
-
rb_funcall(Element, new, 2, rb_str_new2(name), doc)
|
68
|
-
#define xmlNewDocText(doc, text) \
|
69
|
-
rb_funcall(Text, new, 2, rb_str_new2(text), doc)
|
70
|
-
#define xmlNewCDataBlock(doc, content, length) \
|
71
|
-
rb_funcall(CDATA, new, 2, doc, rb_str_new(content, length))
|
72
|
-
#define xmlNewDocComment(doc, text) \
|
73
|
-
rb_funcall(Comment, new, 2, doc, rb_str_new2(text))
|
74
|
-
#define xmlAddChild(element, node) \
|
75
|
-
rb_funcall(element, add_child, 1, node)
|
76
|
-
#define xmlDocSetRootElement(doc, root) \
|
77
|
-
rb_funcall(doc, add_child, 1, root)
|
78
|
-
#define xmlCreateIntSubset(doc, name, external, system) \
|
79
|
-
rb_funcall(doc, create_internal_subset, 3, rb_str_new2(name), \
|
80
|
-
(external ? rb_str_new2(external) : Qnil), \
|
81
|
-
(system ? rb_str_new2(system) : Qnil));
|
82
|
-
#define Nokogiri_wrap_xml_document(klass, doc) \
|
83
|
-
doc
|
116
|
+
static VALUE Nokogiri_wrap_xml_document(VALUE klass, xmlDocPtr doc) {
|
117
|
+
return doc;
|
118
|
+
}
|
84
119
|
|
85
120
|
static VALUE find_dummy_key(VALUE collection) {
|
86
121
|
VALUE r_dummy = Qnil;
|
87
122
|
char dummy[5] = "a";
|
88
123
|
size_t len = 1;
|
124
|
+
ID key_;
|
125
|
+
CONST_ID(key_, "key?");
|
89
126
|
while (len < sizeof dummy) {
|
90
|
-
r_dummy =
|
127
|
+
r_dummy = rb_utf8_str_new(dummy, len);
|
91
128
|
if (rb_funcall(collection, key_, 1, r_dummy) == Qfalse)
|
92
129
|
return r_dummy;
|
93
130
|
for (size_t i = 0; ; ++i) {
|
@@ -105,10 +142,42 @@ static VALUE find_dummy_key(VALUE collection) {
|
|
105
142
|
}
|
106
143
|
}
|
107
144
|
// This collection has 475254 elements?? Give up.
|
108
|
-
|
145
|
+
rb_raise(rb_eArgError, "Failed to find a dummy key.");
|
109
146
|
}
|
110
147
|
|
111
|
-
|
148
|
+
// This should return an xmlAttrPtr, but we don't need it and it's easier to
|
149
|
+
// not get the result.
|
150
|
+
static void xmlNewNsProp (
|
151
|
+
xmlNodePtr node,
|
152
|
+
xmlNsPtr ns,
|
153
|
+
const xmlChar *name,
|
154
|
+
const xmlChar *value
|
155
|
+
) {
|
156
|
+
ID set_attribute;
|
157
|
+
CONST_ID(set_attribute, "set_attribute");
|
158
|
+
|
159
|
+
VALUE rvalue = rb_utf8_str_new_cstr(value);
|
160
|
+
|
161
|
+
if (RTEST(ns)) {
|
162
|
+
// This is an easy case, we have a namespace so it's enough to do
|
163
|
+
// node["#{ns.prefix}:#{name}"] = value
|
164
|
+
ID prefix;
|
165
|
+
CONST_ID(prefix, "prefix");
|
166
|
+
VALUE ns_prefix = rb_funcall(ns, prefix, 0);
|
167
|
+
VALUE qname = rb_sprintf("%" PRIsVALUE ":%s", ns_prefix, name);
|
168
|
+
rb_funcall(node, set_attribute, 2, qname, rvalue);
|
169
|
+
return;
|
170
|
+
}
|
171
|
+
|
172
|
+
size_t len = strlen(name);
|
173
|
+
VALUE rname = rb_utf8_str_new(name, len);
|
174
|
+
if (memchr(name, ':', len) == NULL) {
|
175
|
+
// This is the easiest case. There's no colon so we can do
|
176
|
+
// node[name] = value.
|
177
|
+
rb_funcall(node, set_attribute, 2, rname, rvalue);
|
178
|
+
return;
|
179
|
+
}
|
180
|
+
|
112
181
|
// Nokogiri::XML::Node#set_attribute calls xmlSetProp(node, name, value)
|
113
182
|
// which behaves roughly as
|
114
183
|
// if name is a QName prefix:local
|
@@ -118,7 +187,7 @@ static xmlNodePtr xmlNewProp(xmlNodePtr node, const char *name, const char *valu
|
|
118
187
|
//
|
119
188
|
// If the prefix is "xml", then the namespace lookup will create it.
|
120
189
|
//
|
121
|
-
// By contrast,
|
190
|
+
// By contrast, xmlNewNsProp does not do this parsing and creates an attribute
|
122
191
|
// with the name and value exactly as given. This is the behavior that we
|
123
192
|
// want.
|
124
193
|
//
|
@@ -129,169 +198,90 @@ static xmlNodePtr xmlNewProp(xmlNodePtr node, const char *name, const char *valu
|
|
129
198
|
// Work around this by inserting a dummy attribute and then changing the
|
130
199
|
// name, if needed.
|
131
200
|
|
132
|
-
// Can't use strchr since it's locale-sensitive.
|
133
|
-
size_t len = strlen(name);
|
134
|
-
VALUE r_name = rb_str_new(name, len);
|
135
|
-
if (memchr(name, ':', len) == NULL) {
|
136
|
-
// No colon.
|
137
|
-
return rb_funcall(node, set_attribute, 2, r_name, rb_str_new2(value));
|
138
|
-
}
|
139
201
|
// Find a dummy attribute string that doesn't already exist.
|
140
202
|
VALUE dummy = find_dummy_key(node);
|
141
|
-
if (dummy == Qnil)
|
142
|
-
return Qnil;
|
143
203
|
// Add the dummy attribute.
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
204
|
+
rb_funcall(node, set_attribute, 2, dummy, rvalue);
|
205
|
+
|
206
|
+
// Remove the old attribute, if it exists.
|
207
|
+
ID remove_attribute;
|
208
|
+
CONST_ID(remove_attribute, "remove_attribute");
|
209
|
+
rb_funcall(node, remove_attribute, 1, rname);
|
210
|
+
|
149
211
|
// Rename the dummy
|
212
|
+
ID attribute;
|
213
|
+
CONST_ID(attribute, "attribute");
|
150
214
|
VALUE attr = rb_funcall(node, attribute, 1, dummy);
|
151
|
-
|
152
|
-
return Qnil;
|
153
|
-
rb_funcall(attr, node_name_, 1, r_name);
|
154
|
-
return attr;
|
215
|
+
rb_funcall(attr, node_name_, 1, rname);
|
155
216
|
}
|
156
217
|
#endif
|
157
218
|
|
158
|
-
// Build a xmlNodePtr for a given GumboNode (recursively)
|
159
|
-
static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node);
|
160
|
-
|
161
|
-
// Build a xmlNodePtr for a given GumboElement (recursively)
|
162
|
-
static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
|
163
|
-
// create the given element
|
164
|
-
xmlNodePtr element = xmlNewDocNode(document, NIL, CONST_CAST node->name, NIL);
|
165
|
-
|
166
|
-
// add in the attributes
|
167
|
-
GumboVector* attrs = &node->attributes;
|
168
|
-
char *name = NULL;
|
169
|
-
size_t namelen = 0;
|
170
|
-
const char *ns;
|
171
|
-
for (size_t i=0; i < attrs->length; i++) {
|
172
|
-
GumboAttribute *attr = attrs->data[i];
|
173
|
-
|
174
|
-
switch (attr->attr_namespace) {
|
175
|
-
case GUMBO_ATTR_NAMESPACE_XLINK:
|
176
|
-
ns = "xlink:";
|
177
|
-
break;
|
178
|
-
|
179
|
-
case GUMBO_ATTR_NAMESPACE_XML:
|
180
|
-
ns = "xml:";
|
181
|
-
break;
|
182
|
-
|
183
|
-
case GUMBO_ATTR_NAMESPACE_XMLNS:
|
184
|
-
ns = "xmlns:";
|
185
|
-
if (!strcmp(attr->name, "xmlns")) ns = NULL;
|
186
|
-
break;
|
187
|
-
|
188
|
-
default:
|
189
|
-
ns = NULL;
|
190
|
-
}
|
191
|
-
|
192
|
-
if (ns) {
|
193
|
-
if (strlen(ns) + strlen(attr->name) + 1 > namelen) {
|
194
|
-
free(name);
|
195
|
-
name = NULL;
|
196
|
-
}
|
197
|
-
|
198
|
-
if (!name) {
|
199
|
-
namelen = strlen(ns) + strlen(attr->name) + 1;
|
200
|
-
name = malloc(namelen);
|
201
|
-
}
|
202
|
-
|
203
|
-
strcpy(name, ns);
|
204
|
-
strcat(name, attr->name);
|
205
|
-
xmlNewProp(element, CONST_CAST name, CONST_CAST attr->value);
|
206
|
-
} else {
|
207
|
-
xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
|
208
|
-
}
|
209
|
-
}
|
210
|
-
if (name) free(name);
|
211
|
-
|
212
|
-
// add in the children
|
213
|
-
GumboVector* children = &node->children;
|
214
|
-
for (size_t i=0; i < children->length; i++) {
|
215
|
-
xmlNodePtr node = walk_tree(document, children->data[i]);
|
216
|
-
if (node) xmlAddChild(element, node);
|
217
|
-
}
|
218
|
-
|
219
|
-
return element;
|
220
|
-
}
|
221
|
-
|
222
|
-
static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
|
223
|
-
switch (node->type) {
|
224
|
-
case GUMBO_NODE_DOCUMENT:
|
225
|
-
return NIL;
|
226
|
-
case GUMBO_NODE_ELEMENT:
|
227
|
-
case GUMBO_NODE_TEMPLATE:
|
228
|
-
return walk_element(document, &node->v.element);
|
229
|
-
case GUMBO_NODE_TEXT:
|
230
|
-
case GUMBO_NODE_WHITESPACE:
|
231
|
-
return xmlNewDocText(document, CONST_CAST node->v.text.text);
|
232
|
-
case GUMBO_NODE_CDATA:
|
233
|
-
return xmlNewCDataBlock(document,
|
234
|
-
CONST_CAST node->v.text.text,
|
235
|
-
(int) strlen(node->v.text.text));
|
236
|
-
case GUMBO_NODE_COMMENT:
|
237
|
-
return xmlNewDocComment(document, CONST_CAST node->v.text.text);
|
238
|
-
}
|
239
|
-
}
|
240
|
-
|
241
219
|
// URI = system id
|
242
220
|
// external id = public id
|
243
|
-
|
244
|
-
static htmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public)
|
221
|
+
static xmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public)
|
245
222
|
{
|
223
|
+
#if NGLIB
|
246
224
|
// These two libxml2 functions take the public and system ids in
|
247
225
|
// opposite orders.
|
248
226
|
htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
|
249
227
|
assert(doc);
|
250
228
|
if (dtd_name)
|
251
|
-
xmlCreateIntSubset(doc,
|
229
|
+
xmlCreateIntSubset(doc, BAD_CAST dtd_name, BAD_CAST public, BAD_CAST system);
|
252
230
|
return doc;
|
253
|
-
}
|
254
231
|
#else
|
255
|
-
// remove internal subset from newly created documents
|
256
|
-
static VALUE new_html_doc(const char *dtd_name, const char *system, const char *public) {
|
232
|
+
// remove internal subset from newly created documents
|
257
233
|
VALUE doc;
|
258
234
|
// If system and public are both NULL, Document#new is going to set default
|
259
235
|
// values for them so we're going to have to remove the internal subset
|
260
236
|
// which seems to leak memory in Nokogiri, so leak as little as possible.
|
261
237
|
if (system == NULL && public == NULL) {
|
262
|
-
|
263
|
-
|
238
|
+
ID remove;
|
239
|
+
CONST_ID(remove, "remove");
|
240
|
+
doc = rb_funcall(Document, new, 2, /* URI */ Qnil, /* external_id */ rb_utf8_str_new_static("", 0));
|
241
|
+
rb_funcall(rb_funcall(doc, internal_subset, 0), remove, 0);
|
264
242
|
if (dtd_name) {
|
265
243
|
// We need to create an internal subset now.
|
266
|
-
|
244
|
+
ID create_internal_subset;
|
245
|
+
CONST_ID(create_internal_subset, "create_internal_subset");
|
246
|
+
rb_funcall(doc, create_internal_subset, 3, rb_utf8_str_new_cstr(dtd_name), Qnil, Qnil);
|
267
247
|
}
|
268
248
|
} else {
|
269
249
|
assert(dtd_name);
|
270
250
|
// Rather than removing and creating the internal subset as we did above,
|
271
251
|
// just create and then rename one.
|
272
|
-
VALUE r_system = system ?
|
273
|
-
VALUE r_public = public ?
|
252
|
+
VALUE r_system = system ? rb_utf8_str_new_cstr(system) : Qnil;
|
253
|
+
VALUE r_public = public ? rb_utf8_str_new_cstr(public) : Qnil;
|
274
254
|
doc = rb_funcall(Document, new, 2, r_system, r_public);
|
275
|
-
rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1,
|
255
|
+
rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1, rb_utf8_str_new_cstr(dtd_name));
|
276
256
|
}
|
277
257
|
return doc;
|
278
|
-
}
|
279
258
|
#endif
|
259
|
+
}
|
280
260
|
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
261
|
+
static xmlNodePtr get_parent(xmlNodePtr node) {
|
262
|
+
#if NGLIB
|
263
|
+
return node->parent;
|
264
|
+
#else
|
265
|
+
if (!rb_respond_to(node, parent))
|
266
|
+
return Qnil;
|
267
|
+
return rb_funcall(node, parent, 0);
|
268
|
+
#endif
|
269
|
+
}
|
286
270
|
|
287
|
-
|
288
|
-
|
289
|
-
|
271
|
+
static GumboOutput *perform_parse(const GumboOptions *options, VALUE input) {
|
272
|
+
assert(RTEST(input));
|
273
|
+
Check_Type(input, T_STRING);
|
274
|
+
GumboOutput *output = gumbo_parse_with_options (
|
275
|
+
options,
|
276
|
+
RSTRING_PTR(input),
|
277
|
+
RSTRING_LEN(input)
|
278
|
+
);
|
290
279
|
|
291
280
|
const char *status_string = gumbo_status_to_string(output->status);
|
292
281
|
switch (output->status) {
|
293
282
|
case GUMBO_STATUS_OK:
|
294
283
|
break;
|
284
|
+
case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
|
295
285
|
case GUMBO_STATUS_TREE_TOO_DEEP:
|
296
286
|
gumbo_destroy_output(output);
|
297
287
|
rb_raise(rb_eArgError, "%s", status_string);
|
@@ -299,100 +289,505 @@ static VALUE parse(VALUE self, VALUE string, VALUE url, VALUE max_errors, VALUE
|
|
299
289
|
gumbo_destroy_output(output);
|
300
290
|
rb_raise(rb_eNoMemError, "%s", status_string);
|
301
291
|
}
|
292
|
+
return output;
|
293
|
+
}
|
302
294
|
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
295
|
+
static xmlNsPtr lookup_or_add_ns (
|
296
|
+
xmlDocPtr doc,
|
297
|
+
xmlNodePtr root,
|
298
|
+
const char *href,
|
299
|
+
const char *prefix
|
300
|
+
) {
|
301
|
+
#if NGLIB
|
302
|
+
xmlNsPtr ns = xmlSearchNs(doc, root, BAD_CAST prefix);
|
303
|
+
if (ns)
|
304
|
+
return ns;
|
305
|
+
return xmlNewNs(root, BAD_CAST href, BAD_CAST prefix);
|
306
|
+
#else
|
307
|
+
ID add_namespace_definition;
|
308
|
+
CONST_ID(add_namespace_definition, "add_namespace_definition");
|
309
|
+
VALUE rprefix = rb_utf8_str_new_cstr(prefix);
|
310
|
+
VALUE rhref = rb_utf8_str_new_cstr(href);
|
311
|
+
return rb_funcall(root, add_namespace_definition, 2, rprefix, rhref);
|
312
|
+
#endif
|
313
|
+
}
|
314
|
+
|
315
|
+
static void set_line(xmlNodePtr node, size_t line) {
|
316
|
+
#if NGLIB
|
317
|
+
// libxml2 uses 65535 to mean look elsewhere for the line number on some
|
318
|
+
// nodes.
|
319
|
+
if (line < 65535)
|
320
|
+
node->line = (unsigned short)line;
|
321
|
+
#else
|
322
|
+
// XXX: If Nokogiri gets a `#line=` method, we'll use that.
|
323
|
+
#endif
|
324
|
+
}
|
314
325
|
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
326
|
+
// Construct an XML tree rooted at xml_output_node from the Gumbo tree rooted
|
327
|
+
// at gumbo_node.
|
328
|
+
static void build_tree (
|
329
|
+
xmlDocPtr doc,
|
330
|
+
xmlNodePtr xml_output_node,
|
331
|
+
const GumboNode *gumbo_node
|
332
|
+
) {
|
333
|
+
xmlNodePtr xml_root = NIL;
|
334
|
+
xmlNodePtr xml_node = xml_output_node;
|
335
|
+
size_t child_index = 0;
|
336
|
+
|
337
|
+
while (true) {
|
338
|
+
assert(gumbo_node != NULL);
|
339
|
+
const GumboVector *children = gumbo_node->type == GUMBO_NODE_DOCUMENT?
|
340
|
+
&gumbo_node->v.document.children : &gumbo_node->v.element.children;
|
341
|
+
if (child_index >= children->length) {
|
342
|
+
// Move up the tree and to the next child.
|
343
|
+
if (xml_node == xml_output_node) {
|
344
|
+
// We've built as much of the tree as we can.
|
345
|
+
return;
|
346
|
+
}
|
347
|
+
child_index = gumbo_node->index_within_parent + 1;
|
348
|
+
gumbo_node = gumbo_node->parent;
|
349
|
+
xml_node = get_parent(xml_node);
|
350
|
+
// Children of fragments don't share the same root, so reset it and
|
351
|
+
// it'll be set below. In the non-fragment case, this will only happen
|
352
|
+
// after the html element has been finished at which point there are no
|
353
|
+
// further elements.
|
354
|
+
if (xml_node == xml_output_node)
|
355
|
+
xml_root = NIL;
|
356
|
+
continue;
|
357
|
+
}
|
358
|
+
const GumboNode *gumbo_child = children->data[child_index++];
|
359
|
+
xmlNodePtr xml_child;
|
360
|
+
|
361
|
+
switch (gumbo_child->type) {
|
362
|
+
case GUMBO_NODE_DOCUMENT:
|
363
|
+
abort(); // Bug in Gumbo.
|
364
|
+
|
365
|
+
case GUMBO_NODE_TEXT:
|
366
|
+
case GUMBO_NODE_WHITESPACE:
|
367
|
+
xml_child = xmlNewDocText(doc, BAD_CAST gumbo_child->v.text.text);
|
368
|
+
set_line(xml_child, gumbo_child->v.text.start_pos.line);
|
369
|
+
xmlAddChild(xml_node, xml_child);
|
370
|
+
break;
|
371
|
+
|
372
|
+
case GUMBO_NODE_CDATA:
|
373
|
+
xml_child = xmlNewCDataBlock(doc, BAD_CAST gumbo_child->v.text.text,
|
374
|
+
(int) strlen(gumbo_child->v.text.text));
|
375
|
+
set_line(xml_child, gumbo_child->v.text.start_pos.line);
|
376
|
+
xmlAddChild(xml_node, xml_child);
|
377
|
+
break;
|
378
|
+
|
379
|
+
case GUMBO_NODE_COMMENT:
|
380
|
+
xml_child = xmlNewDocComment(doc, BAD_CAST gumbo_child->v.text.text);
|
381
|
+
set_line(xml_child, gumbo_child->v.text.start_pos.line);
|
382
|
+
xmlAddChild(xml_node, xml_child);
|
383
|
+
break;
|
384
|
+
|
385
|
+
case GUMBO_NODE_TEMPLATE:
|
386
|
+
// XXX: Should create a template element and a new DocumentFragment
|
387
|
+
case GUMBO_NODE_ELEMENT:
|
388
|
+
{
|
389
|
+
xml_child = xmlNewDocNode(doc, NIL, BAD_CAST gumbo_child->v.element.name, NULL);
|
390
|
+
set_line(xml_child, gumbo_child->v.element.start_pos.line);
|
391
|
+
if (xml_root == NIL)
|
392
|
+
xml_root = xml_child;
|
393
|
+
xmlNsPtr ns = NIL;
|
394
|
+
switch (gumbo_child->v.element.tag_namespace) {
|
395
|
+
case GUMBO_NAMESPACE_HTML:
|
396
|
+
break;
|
397
|
+
case GUMBO_NAMESPACE_SVG:
|
398
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/svg", "svg");
|
399
|
+
break;
|
400
|
+
case GUMBO_NAMESPACE_MATHML:
|
401
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1998/Math/MathML", "math");
|
402
|
+
break;
|
403
|
+
}
|
404
|
+
if (ns != NIL)
|
405
|
+
xmlSetNs(xml_child, ns);
|
406
|
+
xmlAddChild(xml_node, xml_child);
|
407
|
+
|
408
|
+
// Add the attributes.
|
409
|
+
const GumboVector* attrs = &gumbo_child->v.element.attributes;
|
410
|
+
for (size_t i=0; i < attrs->length; i++) {
|
411
|
+
const GumboAttribute *attr = attrs->data[i];
|
412
|
+
|
413
|
+
switch (attr->attr_namespace) {
|
414
|
+
case GUMBO_ATTR_NAMESPACE_XLINK:
|
415
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1999/xlink", "xlink");
|
416
|
+
break;
|
417
|
+
|
418
|
+
case GUMBO_ATTR_NAMESPACE_XML:
|
419
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/XML/1998/namespace", "xml");
|
420
|
+
break;
|
421
|
+
|
422
|
+
case GUMBO_ATTR_NAMESPACE_XMLNS:
|
423
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/xmlns/", "xmlns");
|
424
|
+
break;
|
425
|
+
|
426
|
+
default:
|
427
|
+
ns = NIL;
|
428
|
+
}
|
429
|
+
xmlNewNsProp(xml_child, ns, BAD_CAST attr->name, BAD_CAST attr->value);
|
430
|
+
}
|
431
|
+
|
432
|
+
// Add children for this element.
|
433
|
+
child_index = 0;
|
434
|
+
gumbo_node = gumbo_child;
|
435
|
+
xml_node = xml_child;
|
436
|
+
}
|
324
437
|
}
|
325
438
|
}
|
439
|
+
}
|
326
440
|
|
327
|
-
|
441
|
+
// Turn every parse error recorded in `output` into an instance of
// cNokogiriXmlSyntaxError and store the resulting array in the @errors
// instance variable of `rdoc`. `input` is the text that was parsed (used to
// render the caret diagnostic) and `url` becomes each error's @file.
// @errors is left untouched when the parse produced no errors.
static void add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url) {
  const char *source = RSTRING_PTR(input);
  size_t source_len = RSTRING_LEN(input);
  const GumboVector *error_list = &output->errors;

  if (error_list->length == 0)
    return;

  VALUE ruby_errors = rb_ary_new2(error_list->length);
  for (size_t idx = 0; idx < error_list->length; idx++) {
    GumboError *gumbo_error = error_list->data[idx];
    GumboSourcePosition where = gumbo_error_position(gumbo_error);

    // Render the diagnostic into a heap buffer, copy it into a Ruby string
    // and release the buffer right away.
    char *diagnostic;
    size_t diagnostic_len =
      gumbo_caret_diagnostic_to_string(gumbo_error, source, source_len, &diagnostic);
    VALUE err_str = rb_utf8_str_new(diagnostic, diagnostic_len);
    free(diagnostic);

    VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);

    // @str1 carries the error code name when one is available.
    const char *code_name = gumbo_error_code(gumbo_error);
    VALUE str1;
    if (code_name)
      str1 = rb_utf8_str_new_static(code_name, strlen(code_name));
    else
      str1 = Qnil;

    rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
    rb_iv_set(syntax_error, "@code", INT2NUM(1));   // XML_ERR_INTERNAL_ERROR
    rb_iv_set(syntax_error, "@level", INT2NUM(2));  // XML_ERR_ERROR
    rb_iv_set(syntax_error, "@file", url);
    rb_iv_set(syntax_error, "@line", INT2NUM(where.line));
    rb_iv_set(syntax_error, "@str1", str1);
    rb_iv_set(syntax_error, "@str2", Qnil);
    rb_iv_set(syntax_error, "@str3", Qnil);
    rb_iv_set(syntax_error, "@int1", INT2NUM(0));
    rb_iv_set(syntax_error, "@column", INT2NUM(where.column));
    rb_ary_push(ruby_errors, syntax_error);
  }
  rb_iv_set(rdoc, "@errors", ruby_errors);
}
|
357
475
|
|
358
|
-
|
476
|
+
typedef struct {
|
477
|
+
GumboOutput *output;
|
478
|
+
VALUE input;
|
479
|
+
VALUE url_or_frag;
|
480
|
+
xmlDocPtr doc;
|
481
|
+
} ParseArgs;
|
482
|
+
|
483
|
+
static void parse_args_mark(void *parse_args) {
|
484
|
+
ParseArgs *args = parse_args;
|
485
|
+
rb_gc_mark_maybe(args->input);
|
486
|
+
rb_gc_mark_maybe(args->url_or_frag);
|
487
|
+
}
|
359
488
|
|
489
|
+
// Wrap a ParseArgs pointer. The underlying ParseArgs must outlive the
|
490
|
+
// wrapper.
|
491
|
+
static VALUE wrap_parse_args(ParseArgs *args) {
  // RUBY_NEVER_FREE is passed as the free function, so the wrapper does not
  // release `args`; parse_args_mark is registered as the mark function.
  VALUE wrapper = Data_Wrap_Struct(rb_cData, parse_args_mark, RUBY_NEVER_FREE, args);
  return wrapper;
}
|
494
|
+
|
495
|
+
// Returns the underlying ParseArgs wrapped by wrap_parse_args.
|
496
|
+
static ParseArgs *unwrap_parse_args(VALUE obj) {
  // Data_Get_Struct stores the wrapped pointer into `unwrapped`.
  ParseArgs *unwrapped;

  Data_Get_Struct(obj, ParseArgs, unwrapped);
  return unwrapped;
}
|
501
|
+
|
502
|
+
// rb_ensure cleanup callback shared by parse() and fragment(). Destroys the
// Gumbo output, drops the Ruby references held by the ParseArgs, and frees
// the libxml2 document if one is still recorded in the ParseArgs.
// Always returns Qnil.
static VALUE parse_cleanup(VALUE parse_args) {
  ParseArgs *state = unwrap_parse_args(parse_args);

  gumbo_destroy_output(state->output);

  // Make sure garbage collection doesn't mark the objects as being live based
  // on references from the ParseArgs. This may be unnecessary.
  state->input = Qnil;
  state->url_or_frag = Qnil;

  // A non-NIL doc means nobody took ownership of it, so release it here.
  if (state->doc == NIL)
    return Qnil;
  xmlFreeDoc(state->doc);
  return Qnil;
}
|
513
|
+
|
514
|
+
static VALUE parse_continue(VALUE parse_args);
|
515
|
+
|
516
|
+
// Parse a string using gumbo_parse into a Nokogiri document
|
517
|
+
static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) {
  // Start from Gumbo's defaults and apply the caller-supplied limits.
  GumboOptions parse_options = kGumboDefaultOptions;
  parse_options.max_attributes = NUM2INT(max_attributes);
  parse_options.max_errors = NUM2INT(max_errors);
  parse_options.max_tree_depth = NUM2INT(max_depth);

  // The ParseArgs lives on this stack frame; it outlives the wrapper's use
  // because rb_ensure returns before this function does.
  ParseArgs state;
  state.output = perform_parse(&parse_options, input);
  state.input = input;
  state.url_or_frag = url;
  state.doc = NIL;

  VALUE wrapped_state = wrap_parse_args(&state);

  // parse_cleanup runs whether or not parse_continue raises.
  return rb_ensure(parse_continue, wrapped_state, parse_cleanup, wrapped_state);
}
|
534
|
+
|
535
|
+
// Body of parse() run under rb_ensure: creates the libxml2 document, builds
// the tree from the Gumbo output, wraps the document as a Ruby object and
// attaches the parse errors to it. Returns the wrapped document.
static VALUE parse_continue(VALUE parse_args) {
  ParseArgs *args = unwrap_parse_args(parse_args);
  GumboOutput *output = args->output;

  // Without a doctype all three identifiers stay NULL. With one, empty
  // public/system identifiers are passed on as NULL as well.
  const char *doctype_name = NULL;
  const char *public_id = NULL;
  const char *system_id = NULL;
  if (output->document->v.document.has_doctype) {
    doctype_name = output->document->v.document.name;
    public_id = output->document->v.document.public_identifier;
    system_id = output->document->v.document.system_identifier;
    if (!public_id[0])
      public_id = NULL;
    if (!system_id[0])
      system_id = NULL;
  }
  xmlDocPtr doc = new_html_doc(doctype_name, system_id, public_id);

  args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
  build_tree(doc, (xmlNodePtr)doc, output->document);
  VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);
  args->doc = NIL; // The Ruby runtime now owns doc so don't delete it.

  add_errors(output, rdoc, args->input, args->url_or_frag);
  return rdoc;
}
|
362
556
|
|
363
|
-
|
557
|
+
// Map the namespace of a Ruby node object to a Gumbo namespace constant.
//
// Calls node.namespace; a nil namespace is treated as HTML. Otherwise the
// namespace's href is compared against the XHTML, MathML and SVG URIs.
//
// Returns GUMBO_NAMESPACE_HTML, GUMBO_NAMESPACE_MATHML or
// GUMBO_NAMESPACE_SVG for a recognized namespace. For any other URI it
// raises ArgumentError when require_known_ns is true and returns -1
// otherwise.
static int lookup_namespace(VALUE node, bool require_known_ns) {
  ID namespace, href;
  CONST_ID(namespace, "namespace");
  CONST_ID(href, "href");
  VALUE ns = rb_funcall(node, namespace, 0);

  if (NIL_P(ns))
    return GUMBO_NAMESPACE_HTML;
  ns = rb_funcall(ns, href, 0);
  assert(RTEST(ns));
  Check_Type(ns, T_STRING);

  const char *href_ptr = RSTRING_PTR(ns);
  size_t href_len = RSTRING_LEN(ns);
  // Length check first so memcmp never reads past either buffer.
#define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len))
  if (NAMESPACE_P("http://www.w3.org/1999/xhtml"))
    return GUMBO_NAMESPACE_HTML;
  if (NAMESPACE_P("http://www.w3.org/1998/Math/MathML"))
    return GUMBO_NAMESPACE_MATHML;
  if (NAMESPACE_P("http://www.w3.org/2000/svg"))
    return GUMBO_NAMESPACE_SVG;
#undef NAMESPACE_P
  if (require_known_ns) {
    // "%.*s" (precision) limits the read to href_len bytes; the previous
    // "%*s" only set a minimum field width and relied on NUL termination.
    rb_raise(rb_eArgError, "Unexpected namespace URI \"%.*s\"", (int)href_len, href_ptr);
  }
  return -1;
}
|
583
|
+
|
584
|
+
// Return the xmlNodePtr behind a Ruby node object. When NGLIB is set the
// pointer is unwrapped with Data_Get_Struct; otherwise the VALUE itself is
// returned unchanged.
static xmlNodePtr extract_xml_node(VALUE node) {
#if !NGLIB
  return node;
#else
  xmlNodePtr unwrapped;
  Data_Get_Struct(node, xmlNode, unwrapped);
  return unwrapped;
#endif
}
|
593
|
+
|
594
|
+
static VALUE fragment_continue(VALUE parse_args);
|
595
|
+
|
596
|
+
// Parse `tags` as an HTML fragment and add the result to `doc_fragment`.
//
// doc_fragment   - Ruby object responding to `document`; the parsed nodes and
//                  errors are attached to it by fragment_continue.
// tags           - The text to parse.
// ctx            - The fragment context: nil (treated as an HTML "body"), a
//                  String such as "div", "svg:path" or "math:mi", or a node
//                  object responding to `name`, `namespace`, `element?` and
//                  (optionally) `parent`.
// max_attributes, max_errors, max_depth - Integer limits copied into the
//                  Gumbo options; a non-negative max_depth is increased by
//                  one to account for the HTML element.
//
// Raises ArgumentError when a String context has an unknown namespace prefix
// or (via lookup_namespace) when a node context has an unknown namespace.
// Always returns Qnil.
static VALUE fragment (
  VALUE self,
  VALUE doc_fragment,
  VALUE tags,
  VALUE ctx,
  VALUE max_attributes,
  VALUE max_errors,
  VALUE max_depth
) {
  ID name = rb_intern_const("name");
  const char *ctx_tag;
  GumboNamespaceEnum ctx_ns;
  GumboQuirksModeEnum quirks_mode;
  bool form = false;
  const char *encoding = NULL;

  if (NIL_P(ctx)) {
    ctx_tag = "body";
    ctx_ns = GUMBO_NAMESPACE_HTML;
  } else if (TYPE(ctx) == T_STRING) {
    ctx_tag = StringValueCStr(ctx);
    ctx_ns = GUMBO_NAMESPACE_HTML;
    size_t len = RSTRING_LEN(ctx);
    const char *colon = memchr(ctx_tag, ':', len);
    if (colon) {
      // "prefix:tag" form: the prefix must be svg, html or math
      // (case-insensitive).
      const size_t prefix_len = (size_t)(colon - ctx_tag);
      if (prefix_len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0) {
        ctx_ns = GUMBO_NAMESPACE_SVG;
      } else if (prefix_len == 4 && st_strncasecmp(ctx_tag, "html", 4) == 0) {
        ctx_ns = GUMBO_NAMESPACE_HTML;
      } else if (prefix_len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0) {
        ctx_ns = GUMBO_NAMESPACE_MATHML;
      } else {
        // "%.*s" prints only the prefix; the previous "%*s" treated the
        // length as a minimum width and printed the whole context string.
        rb_raise(rb_eArgError, "Invalid context namespace '%.*s'", (int)prefix_len, ctx_tag);
      }
      ctx_tag = colon+1;
    } else {
      // For convenience, put 'svg' and 'math' in their namespaces.
      if (len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0)
        ctx_ns = GUMBO_NAMESPACE_SVG;
      else if (len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0)
        ctx_ns = GUMBO_NAMESPACE_MATHML;
    }

    // Check if it's a form.
    form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0;
  } else {
    ID element_ = rb_intern_const("element?");

    // Context fragment name.
    VALUE tag_name = rb_funcall(ctx, name, 0);
    assert(RTEST(tag_name));
    Check_Type(tag_name, T_STRING);
    ctx_tag = StringValueCStr(tag_name);

    // Context fragment namespace.
    ctx_ns = lookup_namespace(ctx, true);

    // Check for a form ancestor, including self.
    for (VALUE node = ctx;
         !NIL_P(node);
         node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) {
      if (!RTEST(rb_funcall(node, element_, 0)))
        continue;
      VALUE element_name = rb_funcall(node, name, 0);
      if (RSTRING_LEN(element_name) == 4
          && !st_strcasecmp(RSTRING_PTR(element_name), "form")
          && lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) {
        form = true;
        break;
      }
    }

    // Encoding.
    if (RSTRING_LEN(tag_name) == 14
        && !st_strcasecmp(ctx_tag, "annotation-xml")) {
      // rb_funcall requires the argument count before the arguments; it was
      // missing here, so the String was being passed as the count.
      VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
                             1,
                             rb_utf8_str_new_static("encoding", 8));
      if (RTEST(enc)) {
        Check_Type(enc, T_STRING);
        encoding = StringValueCStr(enc);
      }
    }
  }

  // Quirks mode.
  VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
  VALUE dtd = rb_funcall(doc, internal_subset, 0);
  if (NIL_P(dtd)) {
    quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
  } else {
    VALUE dtd_name = rb_funcall(dtd, name, 0);
    VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
    VALUE sysid = rb_funcall(dtd, rb_intern_const("system_id"), 0);
    quirks_mode = gumbo_compute_quirks_mode (
      NIL_P(dtd_name)? NULL:StringValueCStr(dtd_name),
      NIL_P(pubid)? NULL:StringValueCStr(pubid),
      NIL_P(sysid)? NULL:StringValueCStr(sysid)
    );
  }

  // Perform a fragment parse.
  int depth = NUM2INT(max_depth);
  GumboOptions options = kGumboDefaultOptions;
  options.max_attributes = NUM2INT(max_attributes);
  options.max_errors = NUM2INT(max_errors);
  // Add one to account for the HTML element.
  options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
  options.fragment_context = ctx_tag;
  options.fragment_namespace = ctx_ns;
  options.fragment_encoding = encoding;
  options.quirks_mode = quirks_mode;
  options.fragment_context_has_form_ancestor = form;

  GumboOutput *output = perform_parse(&options, tags);
  ParseArgs args = {
    .output = output,
    .input = tags,
    .url_or_frag = doc_fragment,
    .doc = (xmlDocPtr)extract_xml_node(doc),
  };
  VALUE parse_args = wrap_parse_args(&args);
  // parse_cleanup runs whether or not fragment_continue raises.
  rb_ensure(fragment_continue, parse_args, parse_cleanup, parse_args);
  return Qnil;
}
|
729
|
+
|
730
|
+
// Body of fragment() run under rb_ensure: builds the parsed nodes under the
// document fragment and attaches the parse errors to it, using "#fragment"
// as the errors' file name. Always returns Qnil.
static VALUE fragment_continue(VALUE parse_args) {
  ParseArgs *state = unwrap_parse_args(parse_args);
  GumboOutput *parsed = state->output;
  VALUE frag = state->url_or_frag;
  xmlDocPtr owner_doc = state->doc;

  // The Ruby runtime owns doc so make sure we don't delete it. This is done
  // before any call below so parse_cleanup never sees the pointer.
  state->doc = NIL;

  xmlNodePtr frag_node = extract_xml_node(frag);
  build_tree(owner_doc, frag_node, parsed->root);

  VALUE file_label = rb_utf8_str_new_static("#fragment", 9);
  add_errors(parsed, frag, state->input, file_label);
  return Qnil;
}
|
742
|
+
|
743
|
+
// Initialize the Nokogumbo class and fetch constants we will use later.
|
364
744
|
void Init_nokogumbo() {
|
365
|
-
rb_funcall(rb_mKernel,
|
745
|
+
rb_funcall(rb_mKernel, rb_intern_const("gem"), 1, rb_utf8_str_new_static("nokogiri", 8));
|
366
746
|
rb_require("nokogiri");
|
367
747
|
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
key_ = rb_intern("key?");
|
392
|
-
node_name_ = rb_intern("node_name=");
|
748
|
+
VALUE line_supported = Qtrue;
|
749
|
+
|
750
|
+
#if !NGLIB
|
751
|
+
// Class constants.
|
752
|
+
VALUE mNokogiri = rb_const_get(rb_cObject, rb_intern_const("Nokogiri"));
|
753
|
+
VALUE mNokogiriXml = rb_const_get(mNokogiri, rb_intern_const("XML"));
|
754
|
+
cNokogiriXmlSyntaxError = rb_const_get(mNokogiriXml, rb_intern_const("SyntaxError"));
|
755
|
+
rb_gc_register_mark_object(cNokogiriXmlSyntaxError);
|
756
|
+
cNokogiriXmlElement = rb_const_get(mNokogiriXml, rb_intern_const("Element"));
|
757
|
+
rb_gc_register_mark_object(cNokogiriXmlElement);
|
758
|
+
cNokogiriXmlText = rb_const_get(mNokogiriXml, rb_intern_const("Text"));
|
759
|
+
rb_gc_register_mark_object(cNokogiriXmlText);
|
760
|
+
cNokogiriXmlCData = rb_const_get(mNokogiriXml, rb_intern_const("CDATA"));
|
761
|
+
rb_gc_register_mark_object(cNokogiriXmlCData);
|
762
|
+
cNokogiriXmlComment = rb_const_get(mNokogiriXml, rb_intern_const("Comment"));
|
763
|
+
rb_gc_register_mark_object(cNokogiriXmlComment);
|
764
|
+
|
765
|
+
// Interned symbols.
|
766
|
+
new = rb_intern_const("new");
|
767
|
+
node_name_ = rb_intern_const("node_name=");
|
768
|
+
|
769
|
+
// #line is not supported (returns 0)
|
770
|
+
line_supported = Qfalse;
|
393
771
|
#endif
|
394
772
|
|
395
|
-
//
|
773
|
+
// Class constants.
|
774
|
+
VALUE HTML5 = rb_const_get(mNokogiri, rb_intern_const("HTML5"));
|
775
|
+
Document = rb_const_get(HTML5, rb_intern_const("Document"));
|
776
|
+
rb_gc_register_mark_object(Document);
|
777
|
+
|
778
|
+
// Interned symbols.
|
779
|
+
internal_subset = rb_intern_const("internal_subset");
|
780
|
+
parent = rb_intern_const("parent");
|
781
|
+
|
782
|
+
// Define Nokogumbo module with parse and fragment methods.
|
396
783
|
VALUE Gumbo = rb_define_module("Nokogumbo");
|
397
|
-
rb_define_singleton_method(Gumbo, "parse", parse,
|
784
|
+
rb_define_singleton_method(Gumbo, "parse", parse, 5);
|
785
|
+
rb_define_singleton_method(Gumbo, "fragment", fragment, 6);
|
786
|
+
|
787
|
+
// Add private constant for testing.
|
788
|
+
rb_define_const(Gumbo, "LINE_SUPPORTED", line_supported);
|
789
|
+
rb_funcall(Gumbo, rb_intern_const("private_constant"), 1,
|
790
|
+
rb_utf8_str_new_cstr("LINE_SUPPORTED"));
|
398
791
|
}
|
792
|
+
|
793
|
+
// vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|