nokogumbo 2.0.0.pre.alpha → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +81 -10
- data/ext/nokogumbo/extconf.rb +6 -1
- data/ext/nokogumbo/nokogumbo.c +579 -233
- data/gumbo-parser/src/ascii.c +42 -0
- data/gumbo-parser/src/ascii.h +91 -7
- data/gumbo-parser/src/char_ref.c +5973 -4601
- data/gumbo-parser/src/char_ref.h +13 -28
- data/gumbo-parser/src/error.c +376 -120
- data/gumbo-parser/src/error.h +63 -125
- data/gumbo-parser/src/gumbo.h +47 -4
- data/gumbo-parser/src/parser.c +849 -709
- data/gumbo-parser/src/string_buffer.c +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/tokenizer.c +1426 -1261
- data/gumbo-parser/src/tokenizer.h +5 -5
- data/gumbo-parser/src/tokenizer_states.h +275 -23
- data/gumbo-parser/src/utf8.c +12 -59
- data/gumbo-parser/src/utf8.h +51 -16
- data/lib/nokogumbo.rb +0 -1
- data/lib/nokogumbo/html5.rb +2 -1
- data/lib/nokogumbo/html5/document.rb +12 -1
- data/lib/nokogumbo/html5/document_fragment.rb +35 -20
- data/lib/nokogumbo/{xml → html5}/node.rb +28 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +16 -9
- data/CHANGELOG.md +0 -56
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 97aae1603382eb4357f4126f4c36f86841b930a04b46e0eb6a1c02c26c9e77a4
|
4
|
+
data.tar.gz: 614f1dc01be03d4ccb48b43d512faf80f556e8ddde4690fa34fbb9e420c36cb9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aabd005ec985f1a94b0b82195ce8547fabc3d3f97e672b9b03c174ddd131b534b6c2648803ccd0d1fdfc9224bab132cdbd2757271b5646fc3873649d1f509e26
|
7
|
+
data.tar.gz: 4e793d5436de772587f2abcdb54c1060a3ddbde3dfcad05539e823dbde3c67f0792d1589289323fe76d5baebe265a526aba71d5adcbeb8d2ca72559d789e7b14
|
data/README.md
CHANGED
@@ -5,7 +5,8 @@ Nokogumbo provides the ability for a Ruby program to invoke the
|
|
5
5
|
and to access the result as a
|
6
6
|
[Nokogiri::HTML::Document](http://rdoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document).
|
7
7
|
|
8
|
-
[![Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo)
|
8
|
+
[![Travis-CI Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo)
|
9
|
+
[![Appveyor Build Status](https://ci.appveyor.com/api/projects/status/github/rubys/nokogumbo)](https://ci.appveyor.com/project/rubys/nokogumbo/branch/master)
|
9
10
|
|
10
11
|
## Usage
|
11
12
|
|
@@ -14,8 +15,7 @@ require 'nokogumbo'
|
|
14
15
|
doc = Nokogiri.HTML5(string)
|
15
16
|
```
|
16
17
|
|
17
|
-
|
18
|
-
compliant, it may be useful:
|
18
|
+
To parse an HTML fragment, a `fragment` method is provided.
|
19
19
|
|
20
20
|
```ruby
|
21
21
|
require 'nokogumbo'
|
@@ -49,20 +49,26 @@ no parse errors are reported but this can be configured by passing the
|
|
49
49
|
|
50
50
|
```ruby
|
51
51
|
require 'nokogumbo'
|
52
|
-
doc = Nokogiri::HTML5.parse('Hi there
|
52
|
+
doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
|
53
53
|
doc.errors.each do |err|
|
54
|
-
puts
|
54
|
+
puts(err)
|
55
55
|
end
|
56
56
|
```
|
57
57
|
|
58
58
|
This prints the following.
|
59
59
|
```
|
60
|
-
1:1: ERROR:
|
61
|
-
Hi there
|
60
|
+
1:1: ERROR: Expected a doctype token
|
61
|
+
<span/>Hi there!</span foo=bar />
|
62
62
|
^
|
63
|
-
1:
|
64
|
-
Hi there
|
65
|
-
|
63
|
+
1:1: ERROR: Start tag of nonvoid HTML element ends with '/>', use '>'.
|
64
|
+
<span/>Hi there!</span foo=bar />
|
65
|
+
^
|
66
|
+
1:17: ERROR: End tag ends with '/>', use '>'.
|
67
|
+
<span/>Hi there!</span foo=bar />
|
68
|
+
^
|
69
|
+
1:17: ERROR: End tag contains attributes.
|
70
|
+
<span/>Hi there!</span foo=bar />
|
71
|
+
^
|
66
72
|
```
|
67
73
|
|
68
74
|
Using `max_errors: -1` results in an unlimited number of errors being
|
@@ -71,6 +77,41 @@ returned.
|
|
71
77
|
The errors returned by `#errors` are instances of
|
72
78
|
[`Nokogiri::XML::SyntaxError`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError).
|
73
79
|
|
80
|
+
The [HTML
|
81
|
+
standard](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors)
|
82
|
+
defines a number of standard parse error codes. These error codes only cover
|
83
|
+
the "tokenization" stage of parsing HTML. The parse errors in the
|
84
|
+
"tree construction" stage do not have standardized error codes (yet).
|
85
|
+
|
86
|
+
As a convenience to Nokogumbo users, the defined error codes are available
|
87
|
+
via the
|
88
|
+
[`Nokogiri::XML::SyntaxError#str1`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError#str1-instance_method)
|
89
|
+
method.
|
90
|
+
|
91
|
+
```ruby
|
92
|
+
require 'nokogumbo'
|
93
|
+
doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
|
94
|
+
doc.errors.each do |err|
|
95
|
+
puts("#{err.line}:#{err.column}: #{err.str1}")
|
96
|
+
end
|
97
|
+
```
|
98
|
+
|
99
|
+
This prints the following.
|
100
|
+
```
|
101
|
+
1:1: generic-parser
|
102
|
+
1:1: non-void-html-element-start-tag-with-trailing-solidus
|
103
|
+
1:17: end-tag-with-trailing-solidus
|
104
|
+
1:17: end-tag-with-attributes
|
105
|
+
```
|
106
|
+
|
107
|
+
Note that the first error is `generic-parser` because it's an error from the
|
108
|
+
tree construction stage and doesn't have a standardized error code.
|
109
|
+
|
110
|
+
For the purposes of semantic versioning, the error messages, error locations,
|
111
|
+
and error codes are not part of Nokogumbo's public API. That is, these are
|
112
|
+
subject to change without Nokogumbo's major version number changing. These may
|
113
|
+
be stabilized in the future.
|
114
|
+
|
74
115
|
### Maximum tree depth
|
75
116
|
The maximum depth of the DOM tree parsed by the various parsing methods is
|
76
117
|
configurable by the `:max_tree_depth` option. If the depth of the tree would
|
@@ -201,6 +242,36 @@ rules defined in the HTML5 specification for doing so.
|
|
201
242
|
* Instead of returning `unknown` as the element name for unknown tags, the
|
202
243
|
original tag name is returned verbatim.
|
203
244
|
|
245
|
+
# Flavors of Nokogumbo
|
246
|
+
Nokogumbo uses libxml2, the XML library underlying Nokogiri, to speed up
|
247
|
+
parsing. If the libxml2 headers are not available, then Nokogumbo resorts to
|
248
|
+
using Nokogiri's Ruby API to construct the DOM tree.
|
249
|
+
|
250
|
+
Nokogiri can be configured to either use the system library version of libxml2
|
251
|
+
or use a bundled version. By default (as of Nokogiri version 1.8.4), Nokogiri
|
252
|
+
will use a bundled version.
|
253
|
+
|
254
|
+
To prevent differences between versions of libxml2, Nokogumbo will only use
|
255
|
+
libxml2 if the build process can find the exact same version used by Nokogiri.
|
256
|
+
This leads to three possibilities:
|
257
|
+
|
258
|
+
1. Nokogiri is compiled with the bundled libxml2. In this case, Nokogumbo will
|
259
|
+
(by default) use the same version of libxml2.
|
260
|
+
2. Nokogiri is compiled with the system libxml2. In this case, if the libxml2
|
261
|
+
headers are available, then Nokogumbo will (by default) use the system
|
262
|
+
version and headers.
|
263
|
+
3. Nokogiri is compiled with the system libxml2 but its headers aren't
|
264
|
+
available at build time for Nokogumbo. In this case, Nokogumbo will use the
|
265
|
+
slower Ruby API.
|
266
|
+
|
267
|
+
Using libxml2 can be required by passing `-- --with-libxml2` to `bundle exec
|
268
|
+
rake` or to `gem install`. Using libxml2 can be prohibited by instead passing
|
269
|
+
`-- --without-libxml2`.
|
270
|
+
|
271
|
+
Functionally, the only difference between using libxml2 or not is in the
|
272
|
+
behavior of `Nokogiri::XML::Node#line`. If it is used, then `#line` will
|
273
|
+
return the line number of the corresponding node. Otherwise, it will return 0.
|
274
|
+
|
204
275
|
# Installation
|
205
276
|
|
206
277
|
git clone https://github.com/rubys/nokogumbo.git
|
data/ext/nokogumbo/extconf.rb
CHANGED
@@ -108,9 +108,14 @@ gumbo_src = File.join(ext_dir, 'gumbo_src')
|
|
108
108
|
|
109
109
|
Dir.chdir(ext_dir) do
|
110
110
|
$srcs = Dir['*.c', '../../gumbo-parser/src/*.c']
|
111
|
+
$hdrs = Dir['*.h', '../../gumbo-parser/src/*.h']
|
111
112
|
end
|
112
113
|
$INCFLAGS << ' -I$(srcdir)/../../gumbo-parser/src'
|
113
114
|
$VPATH << '$(srcdir)/../../gumbo-parser/src'
|
114
115
|
|
115
|
-
create_makefile('nokogumbo/nokogumbo')
|
116
|
+
create_makefile('nokogumbo/nokogumbo') do |conf|
|
117
|
+
conf.map! do |chunk|
|
118
|
+
chunk.gsub(/^HDRS = .*$/, "HDRS = #{$hdrs.map { |h| File.join('$(srcdir)', h)}.join(' ')}")
|
119
|
+
end
|
120
|
+
end
|
116
121
|
# vim: set sw=2 sts=2 ts=8 et:
|
data/ext/nokogumbo/nokogumbo.c
CHANGED
@@ -9,7 +9,7 @@
|
|
9
9
|
// document tree is then walked:
|
10
10
|
//
|
11
11
|
// * if Nokogiri and libxml2 headers are available at compile time,
|
12
|
-
// (
|
12
|
+
// (if NGLIB) then a parallel libxml2 tree is constructed, and the
|
13
13
|
// final document is then wrapped using Nokogiri_wrap_xml_document.
|
14
14
|
// This approach reduces memory and CPU requirements as Ruby objects
|
15
15
|
// are only built when necessary.
|
@@ -20,74 +20,110 @@
|
|
20
20
|
|
21
21
|
#include <assert.h>
|
22
22
|
#include <ruby.h>
|
23
|
+
#include <ruby/version.h>
|
24
|
+
|
23
25
|
#include "gumbo.h"
|
24
|
-
#include "error.h"
|
25
26
|
|
26
27
|
// class constants
|
27
28
|
static VALUE Document;
|
28
29
|
|
29
|
-
|
30
|
+
// Interned symbols
|
31
|
+
static ID internal_subset;
|
32
|
+
static ID parent;
|
33
|
+
|
34
|
+
/* Backwards compatibility to Ruby 2.1.0 */
|
35
|
+
#if RUBY_API_VERSION_CODE < 20200
|
36
|
+
#include <ruby/encoding.h>
|
37
|
+
|
38
|
+
static VALUE rb_utf8_str_new(const char *str, long length) {
|
39
|
+
return rb_enc_str_new(str, length, rb_utf8_encoding());
|
40
|
+
}
|
41
|
+
|
42
|
+
static VALUE rb_utf8_str_new_cstr(const char *str) {
|
43
|
+
return rb_enc_str_new_cstr(str, rb_utf8_encoding());
|
44
|
+
}
|
45
|
+
|
46
|
+
static VALUE rb_utf8_str_new_static(const char *str, long length) {
|
47
|
+
return rb_enc_str_new(str, length, rb_utf8_encoding());
|
48
|
+
}
|
49
|
+
#endif
|
50
|
+
|
51
|
+
#if NGLIB
|
30
52
|
#include <nokogiri.h>
|
31
|
-
#include <xml_syntax_error.h>
|
32
53
|
#include <libxml/tree.h>
|
33
54
|
#include <libxml/HTMLtree.h>
|
34
55
|
|
35
56
|
#define NIL NULL
|
36
|
-
#define CONST_CAST (xmlChar const*)
|
37
57
|
#else
|
38
58
|
#define NIL Qnil
|
39
|
-
#define CONST_CAST
|
40
59
|
|
41
|
-
//
|
60
|
+
// These are normally defined by nokogiri.h; it is not included in this branch, so declare them here.
|
42
61
|
static VALUE cNokogiriXmlSyntaxError;
|
62
|
+
static VALUE cNokogiriXmlElement;
|
63
|
+
static VALUE cNokogiriXmlText;
|
64
|
+
static VALUE cNokogiriXmlCData;
|
65
|
+
static VALUE cNokogiriXmlComment;
|
66
|
+
|
67
|
+
// Interned symbols.
|
68
|
+
static ID new;
|
69
|
+
static ID node_name_;
|
70
|
+
|
71
|
+
// Map libxml2 types to Ruby VALUE.
|
72
|
+
typedef VALUE xmlNodePtr;
|
73
|
+
typedef VALUE xmlDocPtr;
|
74
|
+
typedef VALUE xmlNsPtr;
|
75
|
+
typedef VALUE xmlDtdPtr;
|
76
|
+
typedef char xmlChar;
|
77
|
+
#define BAD_CAST
|
78
|
+
|
79
|
+
// Redefine libxml2 API as Ruby function calls.
|
80
|
+
static xmlNodePtr xmlNewDocNode(xmlDocPtr doc, xmlNsPtr ns, const xmlChar *name, const xmlChar *content) {
|
81
|
+
assert(ns == NIL && content == NULL);
|
82
|
+
return rb_funcall(cNokogiriXmlElement, new, 2, rb_utf8_str_new_cstr(name), doc);
|
83
|
+
}
|
84
|
+
|
85
|
+
static xmlNodePtr xmlNewDocText(xmlDocPtr doc, const xmlChar *content) {
|
86
|
+
VALUE str = rb_utf8_str_new_cstr(content);
|
87
|
+
return rb_funcall(cNokogiriXmlText, new, 2, str, doc);
|
88
|
+
}
|
89
|
+
|
90
|
+
static xmlNodePtr xmlNewCDataBlock(xmlDocPtr doc, const xmlChar *content, int len) {
|
91
|
+
VALUE str = rb_utf8_str_new(content, len);
|
92
|
+
// CDATA.new takes arguments in the opposite order from Text.new.
|
93
|
+
return rb_funcall(cNokogiriXmlCData, new, 2, doc, str);
|
94
|
+
}
|
95
|
+
|
96
|
+
static xmlNodePtr xmlNewDocComment(xmlDocPtr doc, const xmlChar *content) {
|
97
|
+
VALUE str = rb_utf8_str_new_cstr(content);
|
98
|
+
return rb_funcall(cNokogiriXmlComment, new, 2, doc, str);
|
99
|
+
}
|
100
|
+
|
101
|
+
static xmlNodePtr xmlAddChild(xmlNodePtr parent, xmlNodePtr cur) {
|
102
|
+
ID add_child;
|
103
|
+
CONST_ID(add_child, "add_child");
|
104
|
+
return rb_funcall(parent, add_child, 1, cur);
|
105
|
+
}
|
106
|
+
|
107
|
+
static void xmlSetNs(xmlNodePtr node, xmlNsPtr ns) {
|
108
|
+
ID namespace_;
|
109
|
+
CONST_ID(namespace_, "namespace=");
|
110
|
+
rb_funcall(node, namespace_, 1, ns);
|
111
|
+
}
|
43
112
|
|
44
|
-
static
|
45
|
-
|
46
|
-
static VALUE
|
47
|
-
|
48
|
-
|
49
|
-
// interned symbols
|
50
|
-
static VALUE new;
|
51
|
-
static VALUE attribute;
|
52
|
-
static VALUE set_attribute;
|
53
|
-
static VALUE remove_attribute;
|
54
|
-
static VALUE add_child;
|
55
|
-
static VALUE internal_subset;
|
56
|
-
static VALUE remove_;
|
57
|
-
static VALUE create_internal_subset;
|
58
|
-
static VALUE key_;
|
59
|
-
static VALUE node_name_;
|
60
|
-
|
61
|
-
// map libxml2 types to Ruby VALUE
|
62
|
-
#define xmlNodePtr VALUE
|
63
|
-
#define xmlDocPtr VALUE
|
64
|
-
|
65
|
-
// redefine libxml2 API as Ruby function calls
|
66
|
-
#define xmlNewDocNode(doc, ns, name, content) \
|
67
|
-
rb_funcall(Element, new, 2, rb_str_new2(name), doc)
|
68
|
-
#define xmlNewDocText(doc, text) \
|
69
|
-
rb_funcall(Text, new, 2, rb_str_new2(text), doc)
|
70
|
-
#define xmlNewCDataBlock(doc, content, length) \
|
71
|
-
rb_funcall(CDATA, new, 2, doc, rb_str_new(content, length))
|
72
|
-
#define xmlNewDocComment(doc, text) \
|
73
|
-
rb_funcall(Comment, new, 2, doc, rb_str_new2(text))
|
74
|
-
#define xmlAddChild(element, node) \
|
75
|
-
rb_funcall(element, add_child, 1, node)
|
76
|
-
#define xmlDocSetRootElement(doc, root) \
|
77
|
-
rb_funcall(doc, add_child, 1, root)
|
78
|
-
#define xmlCreateIntSubset(doc, name, external, system) \
|
79
|
-
rb_funcall(doc, create_internal_subset, 3, rb_str_new2(name), \
|
80
|
-
(external ? rb_str_new2(external) : Qnil), \
|
81
|
-
(system ? rb_str_new2(system) : Qnil));
|
82
|
-
#define Nokogiri_wrap_xml_document(klass, doc) \
|
83
|
-
doc
|
113
|
+
static void xmlFreeDoc(xmlDocPtr doc) { }
|
114
|
+
|
115
|
+
static VALUE Nokogiri_wrap_xml_document(VALUE klass, xmlDocPtr doc) {
|
116
|
+
return doc;
|
117
|
+
}
|
84
118
|
|
85
119
|
static VALUE find_dummy_key(VALUE collection) {
|
86
120
|
VALUE r_dummy = Qnil;
|
87
121
|
char dummy[5] = "a";
|
88
122
|
size_t len = 1;
|
123
|
+
ID key_;
|
124
|
+
CONST_ID(key_, "key?");
|
89
125
|
while (len < sizeof dummy) {
|
90
|
-
r_dummy =
|
126
|
+
r_dummy = rb_utf8_str_new(dummy, len);
|
91
127
|
if (rb_funcall(collection, key_, 1, r_dummy) == Qfalse)
|
92
128
|
return r_dummy;
|
93
129
|
for (size_t i = 0; ; ++i) {
|
@@ -105,10 +141,42 @@ static VALUE find_dummy_key(VALUE collection) {
|
|
105
141
|
}
|
106
142
|
}
|
107
143
|
// This collection has 475254 elements?? Give up.
|
108
|
-
|
144
|
+
rb_raise(rb_eArgError, "Failed to find a dummy key.");
|
109
145
|
}
|
110
146
|
|
111
|
-
|
147
|
+
// This should return an xmlAttrPtr, but we don't need it and it's easier to
|
148
|
+
// not get the result.
|
149
|
+
static void xmlNewNsProp (
|
150
|
+
xmlNodePtr node,
|
151
|
+
xmlNsPtr ns,
|
152
|
+
const xmlChar *name,
|
153
|
+
const xmlChar *value
|
154
|
+
) {
|
155
|
+
ID set_attribute;
|
156
|
+
CONST_ID(set_attribute, "set_attribute");
|
157
|
+
|
158
|
+
VALUE rvalue = rb_utf8_str_new_cstr(value);
|
159
|
+
|
160
|
+
if (RTEST(ns)) {
|
161
|
+
// This is an easy case, we have a namespace so it's enough to do
|
162
|
+
// node["#{ns.prefix}:#{name}"] = value
|
163
|
+
ID prefix;
|
164
|
+
CONST_ID(prefix, "prefix");
|
165
|
+
VALUE ns_prefix = rb_funcall(ns, prefix, 0);
|
166
|
+
VALUE qname = rb_sprintf("%" PRIsVALUE ":%s", ns_prefix, name);
|
167
|
+
rb_funcall(node, set_attribute, 2, qname, rvalue);
|
168
|
+
return;
|
169
|
+
}
|
170
|
+
|
171
|
+
size_t len = strlen(name);
|
172
|
+
VALUE rname = rb_utf8_str_new(name, len);
|
173
|
+
if (memchr(name, ':', len) == NULL) {
|
174
|
+
// This is the easiest case. There's no colon so we can do
|
175
|
+
// node[name] = value.
|
176
|
+
rb_funcall(node, set_attribute, 2, rname, rvalue);
|
177
|
+
return;
|
178
|
+
}
|
179
|
+
|
112
180
|
// Nokogiri::XML::Node#set_attribute calls xmlSetProp(node, name, value)
|
113
181
|
// which behaves roughly as
|
114
182
|
// if name is a QName prefix:local
|
@@ -118,7 +186,7 @@ static xmlNodePtr xmlNewProp(xmlNodePtr node, const char *name, const char *valu
|
|
118
186
|
//
|
119
187
|
// If the prefix is "xml", then the namespace lookup will create it.
|
120
188
|
//
|
121
|
-
// By contrast,
|
189
|
+
// By contrast, xmlNewNsProp does not do this parsing and creates an attribute
|
122
190
|
// with the name and value exactly as given. This is the behavior that we
|
123
191
|
// want.
|
124
192
|
//
|
@@ -129,164 +197,84 @@ static xmlNodePtr xmlNewProp(xmlNodePtr node, const char *name, const char *valu
|
|
129
197
|
// Work around this by inserting a dummy attribute and then changing the
|
130
198
|
// name, if needed.
|
131
199
|
|
132
|
-
// Can't use strchr since it's locale-sensitive.
|
133
|
-
size_t len = strlen(name);
|
134
|
-
VALUE r_name = rb_str_new(name, len);
|
135
|
-
if (memchr(name, ':', len) == NULL) {
|
136
|
-
// No colon.
|
137
|
-
return rb_funcall(node, set_attribute, 2, r_name, rb_str_new2(value));
|
138
|
-
}
|
139
200
|
// Find a dummy attribute string that doesn't already exist.
|
140
201
|
VALUE dummy = find_dummy_key(node);
|
141
|
-
if (dummy == Qnil)
|
142
|
-
return Qnil;
|
143
202
|
// Add the dummy attribute.
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
203
|
+
rb_funcall(node, set_attribute, 2, dummy, rvalue);
|
204
|
+
|
205
|
+
// Remove the old attribute, if it exists.
|
206
|
+
ID remove_attribute;
|
207
|
+
CONST_ID(remove_attribute, "remove_attribute");
|
208
|
+
rb_funcall(node, remove_attribute, 1, rname);
|
209
|
+
|
149
210
|
// Rename the dummy
|
211
|
+
ID attribute;
|
212
|
+
CONST_ID(attribute, "attribute");
|
150
213
|
VALUE attr = rb_funcall(node, attribute, 1, dummy);
|
151
|
-
|
152
|
-
return Qnil;
|
153
|
-
rb_funcall(attr, node_name_, 1, r_name);
|
154
|
-
return attr;
|
214
|
+
rb_funcall(attr, node_name_, 1, rname);
|
155
215
|
}
|
156
216
|
#endif
|
157
217
|
|
158
|
-
// Build a xmlNodePtr for a given GumboNode (recursively)
|
159
|
-
static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node);
|
160
|
-
|
161
|
-
// Build a xmlNodePtr for a given GumboElement (recursively)
|
162
|
-
static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
|
163
|
-
// create the given element
|
164
|
-
xmlNodePtr element = xmlNewDocNode(document, NIL, CONST_CAST node->name, NIL);
|
165
|
-
|
166
|
-
// add in the attributes
|
167
|
-
GumboVector* attrs = &node->attributes;
|
168
|
-
char *name = NULL;
|
169
|
-
size_t namelen = 0;
|
170
|
-
const char *ns;
|
171
|
-
for (size_t i=0; i < attrs->length; i++) {
|
172
|
-
GumboAttribute *attr = attrs->data[i];
|
173
|
-
|
174
|
-
switch (attr->attr_namespace) {
|
175
|
-
case GUMBO_ATTR_NAMESPACE_XLINK:
|
176
|
-
ns = "xlink:";
|
177
|
-
break;
|
178
|
-
|
179
|
-
case GUMBO_ATTR_NAMESPACE_XML:
|
180
|
-
ns = "xml:";
|
181
|
-
break;
|
182
|
-
|
183
|
-
case GUMBO_ATTR_NAMESPACE_XMLNS:
|
184
|
-
ns = "xmlns:";
|
185
|
-
if (!strcmp(attr->name, "xmlns")) ns = NULL;
|
186
|
-
break;
|
187
|
-
|
188
|
-
default:
|
189
|
-
ns = NULL;
|
190
|
-
}
|
191
|
-
|
192
|
-
if (ns) {
|
193
|
-
if (strlen(ns) + strlen(attr->name) + 1 > namelen) {
|
194
|
-
free(name);
|
195
|
-
name = NULL;
|
196
|
-
}
|
197
|
-
|
198
|
-
if (!name) {
|
199
|
-
namelen = strlen(ns) + strlen(attr->name) + 1;
|
200
|
-
name = malloc(namelen);
|
201
|
-
}
|
202
|
-
|
203
|
-
strcpy(name, ns);
|
204
|
-
strcat(name, attr->name);
|
205
|
-
xmlNewProp(element, CONST_CAST name, CONST_CAST attr->value);
|
206
|
-
} else {
|
207
|
-
xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
|
208
|
-
}
|
209
|
-
}
|
210
|
-
if (name) free(name);
|
211
|
-
|
212
|
-
// add in the children
|
213
|
-
GumboVector* children = &node->children;
|
214
|
-
for (size_t i=0; i < children->length; i++) {
|
215
|
-
xmlNodePtr node = walk_tree(document, children->data[i]);
|
216
|
-
if (node) xmlAddChild(element, node);
|
217
|
-
}
|
218
|
-
|
219
|
-
return element;
|
220
|
-
}
|
221
|
-
|
222
|
-
static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
|
223
|
-
switch (node->type) {
|
224
|
-
case GUMBO_NODE_DOCUMENT:
|
225
|
-
return NIL;
|
226
|
-
case GUMBO_NODE_ELEMENT:
|
227
|
-
case GUMBO_NODE_TEMPLATE:
|
228
|
-
return walk_element(document, &node->v.element);
|
229
|
-
case GUMBO_NODE_TEXT:
|
230
|
-
case GUMBO_NODE_WHITESPACE:
|
231
|
-
return xmlNewDocText(document, CONST_CAST node->v.text.text);
|
232
|
-
case GUMBO_NODE_CDATA:
|
233
|
-
return xmlNewCDataBlock(document,
|
234
|
-
CONST_CAST node->v.text.text,
|
235
|
-
(int) strlen(node->v.text.text));
|
236
|
-
case GUMBO_NODE_COMMENT:
|
237
|
-
return xmlNewDocComment(document, CONST_CAST node->v.text.text);
|
238
|
-
}
|
239
|
-
}
|
240
|
-
|
241
218
|
// URI = system id
|
242
219
|
// external id = public id
|
243
|
-
|
244
|
-
static htmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public)
|
220
|
+
static xmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public)
|
245
221
|
{
|
222
|
+
#if NGLIB
|
246
223
|
// These two libxml2 functions take the public and system ids in
|
247
224
|
// opposite orders.
|
248
225
|
htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
|
249
226
|
assert(doc);
|
250
227
|
if (dtd_name)
|
251
|
-
xmlCreateIntSubset(doc,
|
228
|
+
xmlCreateIntSubset(doc, BAD_CAST dtd_name, BAD_CAST public, BAD_CAST system);
|
252
229
|
return doc;
|
253
|
-
}
|
254
230
|
#else
|
255
|
-
// remove internal subset from newly created documents
|
256
|
-
static VALUE new_html_doc(const char *dtd_name, const char *system, const char *public) {
|
231
|
+
// remove internal subset from newly created documents
|
257
232
|
VALUE doc;
|
258
233
|
// If system and public are both NULL, Document#new is going to set default
|
259
234
|
// values for them so we're going to have to remove the internal subset
|
260
235
|
// which seems to leak memory in Nokogiri, so leak as little as possible.
|
261
236
|
if (system == NULL && public == NULL) {
|
262
|
-
|
263
|
-
|
237
|
+
ID remove;
|
238
|
+
CONST_ID(remove, "remove");
|
239
|
+
doc = rb_funcall(Document, new, 2, /* URI */ Qnil, /* external_id */ rb_utf8_str_new_static("", 0));
|
240
|
+
rb_funcall(rb_funcall(doc, internal_subset, 0), remove, 0);
|
264
241
|
if (dtd_name) {
|
265
242
|
// We need to create an internal subset now.
|
266
|
-
|
243
|
+
ID create_internal_subset;
|
244
|
+
CONST_ID(create_internal_subset, "create_internal_subset");
|
245
|
+
rb_funcall(doc, create_internal_subset, 3, rb_utf8_str_new_cstr(dtd_name), Qnil, Qnil);
|
267
246
|
}
|
268
247
|
} else {
|
269
248
|
assert(dtd_name);
|
270
249
|
// Rather than removing and creating the internal subset as we did above,
|
271
250
|
// just create and then rename one.
|
272
|
-
VALUE r_system = system ?
|
273
|
-
VALUE r_public = public ?
|
251
|
+
VALUE r_system = system ? rb_utf8_str_new_cstr(system) : Qnil;
|
252
|
+
VALUE r_public = public ? rb_utf8_str_new_cstr(public) : Qnil;
|
274
253
|
doc = rb_funcall(Document, new, 2, r_system, r_public);
|
275
|
-
rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1,
|
254
|
+
rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1, rb_utf8_str_new_cstr(dtd_name));
|
276
255
|
}
|
277
256
|
return doc;
|
278
|
-
}
|
279
257
|
#endif
|
258
|
+
}
|
280
259
|
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
260
|
+
static xmlNodePtr get_parent(xmlNodePtr node) {
|
261
|
+
#if NGLIB
|
262
|
+
return node->parent;
|
263
|
+
#else
|
264
|
+
if (!rb_respond_to(node, parent))
|
265
|
+
return Qnil;
|
266
|
+
return rb_funcall(node, parent, 0);
|
267
|
+
#endif
|
268
|
+
}
|
286
269
|
|
287
|
-
|
288
|
-
|
289
|
-
|
270
|
+
static GumboOutput *perform_parse(const GumboOptions *options, VALUE input) {
|
271
|
+
assert(RTEST(input));
|
272
|
+
Check_Type(input, T_STRING);
|
273
|
+
GumboOutput *output = gumbo_parse_with_options (
|
274
|
+
options,
|
275
|
+
RSTRING_PTR(input),
|
276
|
+
RSTRING_LEN(input)
|
277
|
+
);
|
290
278
|
|
291
279
|
const char *status_string = gumbo_status_to_string(output->status);
|
292
280
|
switch (output->status) {
|
@@ -299,100 +287,458 @@ static VALUE parse(VALUE self, VALUE string, VALUE url, VALUE max_errors, VALUE
|
|
299
287
|
gumbo_destroy_output(output);
|
300
288
|
rb_raise(rb_eNoMemError, "%s", status_string);
|
301
289
|
}
|
290
|
+
return output;
|
291
|
+
}
|
302
292
|
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
293
|
+
static xmlNsPtr lookup_or_add_ns (
|
294
|
+
xmlDocPtr doc,
|
295
|
+
xmlNodePtr root,
|
296
|
+
const char *href,
|
297
|
+
const char *prefix
|
298
|
+
) {
|
299
|
+
#if NGLIB
|
300
|
+
xmlNsPtr ns = xmlSearchNs(doc, root, BAD_CAST prefix);
|
301
|
+
if (ns)
|
302
|
+
return ns;
|
303
|
+
return xmlNewNs(root, BAD_CAST href, BAD_CAST prefix);
|
304
|
+
#else
|
305
|
+
ID add_namespace_definition;
|
306
|
+
CONST_ID(add_namespace_definition, "add_namespace_definition");
|
307
|
+
VALUE rprefix = rb_utf8_str_new_cstr(prefix);
|
308
|
+
VALUE rhref = rb_utf8_str_new_cstr(href);
|
309
|
+
return rb_funcall(root, add_namespace_definition, 2, rprefix, rhref);
|
310
|
+
#endif
|
311
|
+
}
|
312
|
+
|
313
|
+
static void set_line(xmlNodePtr node, size_t line) {
|
314
|
+
#if NGLIB
|
315
|
+
// libxml2 uses 65535 to mean look elsewhere for the line number on some
|
316
|
+
// nodes.
|
317
|
+
if (line < 65535)
|
318
|
+
node->line = (unsigned short)line;
|
319
|
+
#else
|
320
|
+
// XXX: If Nokogiri gets a `#line=` method, we'll use that.
|
321
|
+
#endif
|
322
|
+
}
|
314
323
|
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
+
// Construct an XML tree rooted at xml_output_node from the Gumbo tree rooted
|
325
|
+
// at gumbo_node.
|
326
|
+
static void build_tree (
|
327
|
+
xmlDocPtr doc,
|
328
|
+
xmlNodePtr xml_output_node,
|
329
|
+
const GumboNode *gumbo_node
|
330
|
+
) {
|
331
|
+
xmlNodePtr xml_root = NIL;
|
332
|
+
xmlNodePtr xml_node = xml_output_node;
|
333
|
+
size_t child_index = 0;
|
334
|
+
|
335
|
+
while (true) {
|
336
|
+
assert(gumbo_node != NULL);
|
337
|
+
const GumboVector *children = gumbo_node->type == GUMBO_NODE_DOCUMENT?
|
338
|
+
&gumbo_node->v.document.children : &gumbo_node->v.element.children;
|
339
|
+
if (child_index >= children->length) {
|
340
|
+
// Move up the tree and to the next child.
|
341
|
+
if (xml_node == xml_output_node) {
|
342
|
+
// We've built as much of the tree as we can.
|
343
|
+
return;
|
344
|
+
}
|
345
|
+
child_index = gumbo_node->index_within_parent + 1;
|
346
|
+
gumbo_node = gumbo_node->parent;
|
347
|
+
xml_node = get_parent(xml_node);
|
348
|
+
// Children of fragments don't share the same root, so reset it and
|
349
|
+
// it'll be set below. In the non-fragment case, this will only happen
|
350
|
+
// after the html element has been finished at which point there are no
|
351
|
+
// further elements.
|
352
|
+
if (xml_node == xml_output_node)
|
353
|
+
xml_root = NIL;
|
354
|
+
continue;
|
355
|
+
}
|
356
|
+
const GumboNode *gumbo_child = children->data[child_index++];
|
357
|
+
xmlNodePtr xml_child;
|
358
|
+
|
359
|
+
switch (gumbo_child->type) {
|
360
|
+
case GUMBO_NODE_DOCUMENT:
|
361
|
+
abort(); // Bug in Gumbo.
|
362
|
+
|
363
|
+
case GUMBO_NODE_TEXT:
|
364
|
+
case GUMBO_NODE_WHITESPACE:
|
365
|
+
xml_child = xmlNewDocText(doc, BAD_CAST gumbo_child->v.text.text);
|
366
|
+
set_line(xml_child, gumbo_child->v.text.start_pos.line);
|
367
|
+
xmlAddChild(xml_node, xml_child);
|
368
|
+
break;
|
369
|
+
|
370
|
+
case GUMBO_NODE_CDATA:
|
371
|
+
xml_child = xmlNewCDataBlock(doc, BAD_CAST gumbo_child->v.text.text,
|
372
|
+
(int) strlen(gumbo_child->v.text.text));
|
373
|
+
set_line(xml_child, gumbo_child->v.text.start_pos.line);
|
374
|
+
xmlAddChild(xml_node, xml_child);
|
375
|
+
break;
|
376
|
+
|
377
|
+
case GUMBO_NODE_COMMENT:
|
378
|
+
xml_child = xmlNewDocComment(doc, BAD_CAST gumbo_child->v.text.text);
|
379
|
+
set_line(xml_child, gumbo_child->v.text.start_pos.line);
|
380
|
+
xmlAddChild(xml_node, xml_child);
|
381
|
+
break;
|
382
|
+
|
383
|
+
case GUMBO_NODE_TEMPLATE:
|
384
|
+
// XXX: Should create a template element and a new DocumentFragment
|
385
|
+
case GUMBO_NODE_ELEMENT:
|
386
|
+
{
|
387
|
+
xml_child = xmlNewDocNode(doc, NIL, BAD_CAST gumbo_child->v.element.name, NULL);
|
388
|
+
set_line(xml_child, gumbo_child->v.text.start_pos.line);
|
389
|
+
if (xml_root == NIL)
|
390
|
+
xml_root = xml_child;
|
391
|
+
xmlNsPtr ns = NIL;
|
392
|
+
switch (gumbo_child->v.element.tag_namespace) {
|
393
|
+
case GUMBO_NAMESPACE_HTML:
|
394
|
+
break;
|
395
|
+
case GUMBO_NAMESPACE_SVG:
|
396
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/svg", "svg");
|
397
|
+
break;
|
398
|
+
case GUMBO_NAMESPACE_MATHML:
|
399
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1998/Math/MathML", "math");
|
400
|
+
break;
|
401
|
+
}
|
402
|
+
if (ns != NIL)
|
403
|
+
xmlSetNs(xml_child, ns);
|
404
|
+
xmlAddChild(xml_node, xml_child);
|
405
|
+
|
406
|
+
// Add the attributes.
|
407
|
+
const GumboVector* attrs = &gumbo_child->v.element.attributes;
|
408
|
+
for (size_t i=0; i < attrs->length; i++) {
|
409
|
+
const GumboAttribute *attr = attrs->data[i];
|
410
|
+
|
411
|
+
switch (attr->attr_namespace) {
|
412
|
+
case GUMBO_ATTR_NAMESPACE_XLINK:
|
413
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1999/xlink", "xlink");
|
414
|
+
break;
|
415
|
+
|
416
|
+
case GUMBO_ATTR_NAMESPACE_XML:
|
417
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/XML/1998/namespace", "xml");
|
418
|
+
break;
|
419
|
+
|
420
|
+
case GUMBO_ATTR_NAMESPACE_XMLNS:
|
421
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/xmlns/", "xmlns");
|
422
|
+
break;
|
423
|
+
|
424
|
+
default:
|
425
|
+
ns = NIL;
|
426
|
+
}
|
427
|
+
xmlNewNsProp(xml_child, ns, BAD_CAST attr->name, BAD_CAST attr->value);
|
428
|
+
}
|
429
|
+
|
430
|
+
// Add children for this element.
|
431
|
+
child_index = 0;
|
432
|
+
gumbo_node = gumbo_child;
|
433
|
+
xml_node = xml_child;
|
434
|
+
}
|
324
435
|
}
|
325
436
|
}
|
437
|
+
}
|
326
438
|
|
327
|
-
|
439
|
+
// Attach the parse errors recorded in `output` to `rdoc`.
//
// Each GumboError becomes an instance of cNokogiriXmlSyntaxError whose
// message is the caret diagnostic rendered against the parsed text. The
// resulting array is stored in rdoc's @errors instance variable; when there
// are no errors, @errors is not touched.
//
//   output - Gumbo parse result whose `errors` vector is reported.
//   rdoc   - Ruby object that receives @errors.
//   input  - Ruby string that was parsed (read via RSTRING_PTR/RSTRING_LEN).
//   url    - value stored in every error's @file.
static void add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url) {
  const GumboVector *errors = &output->errors;
  if (errors->length == 0)
    return;

  const char *source = RSTRING_PTR(input);
  size_t source_len = RSTRING_LEN(input);
  VALUE rerrors = rb_ary_new2(errors->length);

  for (size_t idx = 0; idx < errors->length; ++idx) {
    GumboError *err = errors->data[idx];
    GumboSourcePosition position = gumbo_error_position(err);

    // The diagnostic buffer is handed to us through `diagnostic`; copy it
    // into a Ruby string and release it with free() right away.
    char *diagnostic;
    size_t diagnostic_len = gumbo_caret_diagnostic_to_string(err, source, source_len, &diagnostic);
    VALUE err_str = rb_utf8_str_new(diagnostic, diagnostic_len);
    free(diagnostic);

    VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);

    // @str1 carries the error code string when Gumbo provides one.
    const char *error_code = gumbo_error_code(err);
    VALUE str1 = Qnil;
    if (error_code)
      str1 = rb_utf8_str_new_static(error_code, strlen(error_code));

    rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
    rb_iv_set(syntax_error, "@code", INT2NUM(1));   // XML_ERR_INTERNAL_ERROR
    rb_iv_set(syntax_error, "@level", INT2NUM(2));  // XML_ERR_ERROR
    rb_iv_set(syntax_error, "@file", url);
    rb_iv_set(syntax_error, "@line", INT2NUM(position.line));
    rb_iv_set(syntax_error, "@str1", str1);
    rb_iv_set(syntax_error, "@str2", Qnil);
    rb_iv_set(syntax_error, "@str3", Qnil);
    rb_iv_set(syntax_error, "@int1", INT2NUM(0));
    rb_iv_set(syntax_error, "@column", INT2NUM(position.column));
    rb_ary_push(rerrors, syntax_error);
  }

  rb_iv_set(rdoc, "@errors", rerrors);
}
|
473
|
+
|
474
|
+
typedef struct {
|
475
|
+
GumboOutput *output;
|
476
|
+
VALUE input;
|
477
|
+
VALUE url_or_frag;
|
478
|
+
xmlDocPtr doc;
|
479
|
+
} ParseArgs;
|
480
|
+
|
481
|
+
// Cleanup callback for rb_ensure: releases the resources tracked in a
// ParseArgs.
//
// rb_ensure invokes its callbacks with a single VALUE argument, so this
// function takes a VALUE (the ParseArgs pointer cast by the caller) rather
// than a ParseArgs* to keep the function pointer type compatible.
//
//   parse_args - a `ParseArgs *` passed through rb_ensure as a VALUE.
//
// Always destroys args->output, frees args->doc when it is not NIL, and
// returns Qnil.
static VALUE parse_cleanup(VALUE parse_args) {
  ParseArgs *args = (ParseArgs *)parse_args;

  gumbo_destroy_output(args->output);
  // doc is reset to NIL by the body callbacks once something else owns it.
  if (args->doc != NIL)
    xmlFreeDoc(args->doc);
  return Qnil;
}
|
487
|
+
|
488
|
+
|
489
|
+
// Body callback for rb_ensure; takes the ParseArgs pointer as a VALUE so its
// type matches what rb_ensure expects.
static VALUE parse_continue(VALUE parse_args);

// Parse a string using gumbo_parse into a Nokogiri document.
//
//   self       - receiver of the singleton method (unused).
//   input      - Ruby value holding the HTML to parse.
//   url        - stored as @file on every reported syntax error.
//   max_errors - Integer copied into GumboOptions.max_errors.
//   max_depth  - Integer copied into GumboOptions.max_tree_depth.
//
// Returns the wrapped document produced by parse_continue. The Gumbo output
// (and the xml document, if it was not yet handed to Ruby) is released by
// parse_cleanup even when parse_continue raises.
static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_errors, VALUE max_depth) {
  GumboOptions options = kGumboDefaultOptions;
  options.max_errors = NUM2INT(max_errors);
  options.max_tree_depth = NUM2INT(max_depth);

  GumboOutput *output = perform_parse(&options, input);
  ParseArgs args = {
    .output = output,
    .input = input,
    .url_or_frag = url,
    .doc = NIL,
  };
  return rb_ensure(parse_continue, (VALUE)&args, parse_cleanup, (VALUE)&args);
}

// Build the document tree from the Gumbo output in `parse_args`, wrap it in
// a Ruby Document object, attach the parse errors and return that object.
//
//   parse_args - a `ParseArgs *` passed through rb_ensure as a VALUE.
static VALUE parse_continue(VALUE parse_args) {
  ParseArgs *args = (ParseArgs *)parse_args;
  GumboOutput *output = args->output;
  xmlDocPtr doc;

  if (output->document->v.document.has_doctype) {
    const char *name = output->document->v.document.name;
    const char *public = output->document->v.document.public_identifier;
    const char *system = output->document->v.document.system_identifier;
    // Empty identifiers are passed on as NULL.
    public = public[0] ? public : NULL;
    system = system[0] ? system : NULL;
    doc = new_html_doc(name, system, public);
  } else {
    doc = new_html_doc(NULL, NULL, NULL);
  }

  args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
  build_tree(doc, (xmlNodePtr)doc, output->document);
  VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);
  args->doc = NIL; // The Ruby runtime now owns doc so don't delete it.
  add_errors(output, rdoc, args->input, args->url_or_frag);
  return rdoc;
}
|
362
527
|
|
363
|
-
|
528
|
+
// Map the namespace of a Ruby node object to a GumboNamespaceEnum value.
//
//   node             - Ruby object responding to `namespace`; the namespace
//                      object (when not nil) must respond to `href` and
//                      return a String.
//   require_known_ns - when true, an unrecognized namespace URI raises
//                      ArgumentError instead of returning -1.
//
// Returns GUMBO_NAMESPACE_HTML when the node has no namespace or the XHTML
// namespace, GUMBO_NAMESPACE_MATHML / GUMBO_NAMESPACE_SVG for those URIs, and
// -1 for any other URI when require_known_ns is false.
static int lookup_namespace(VALUE node, bool require_known_ns) {
  ID namespace, href;
  CONST_ID(namespace, "namespace");
  CONST_ID(href, "href");
  VALUE ns = rb_funcall(node, namespace, 0);

  if (NIL_P(ns))
    return GUMBO_NAMESPACE_HTML;
  ns = rb_funcall(ns, href, 0);
  assert(RTEST(ns));
  Check_Type(ns, T_STRING);

  const char *href_ptr = RSTRING_PTR(ns);
  size_t href_len = RSTRING_LEN(ns);
  // Exact, length-checked comparison against a string literal.
#define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len))
  if (NAMESPACE_P("http://www.w3.org/1999/xhtml"))
    return GUMBO_NAMESPACE_HTML;
  if (NAMESPACE_P("http://www.w3.org/1998/Math/MathML"))
    return GUMBO_NAMESPACE_MATHML;
  if (NAMESPACE_P("http://www.w3.org/2000/svg"))
    return GUMBO_NAMESPACE_SVG;
#undef NAMESPACE_P
  if (require_known_ns) {
    // "%.*s" (precision) limits output to href_len bytes; the previous "%*s"
    // only set a minimum field width and relied on a terminating NUL.
    rb_raise(rb_eArgError, "Unexpected namespace URI \"%.*s\"", (int)href_len, href_ptr);
  }
  return -1;
}
|
554
|
+
|
555
|
+
// Obtain the xmlNodePtr that corresponds to a Ruby node object.
//
// When NGLIB is non-zero the pointer is unwrapped from the Ruby object with
// Data_Get_Struct; otherwise the Ruby value itself is returned.
static xmlNodePtr extract_xml_node(VALUE node) {
#if NGLIB
  xmlNodePtr unwrapped;
  Data_Get_Struct(node, xmlNode, unwrapped);
  return unwrapped;
#else
  return node;
#endif
}
|
564
|
+
|
565
|
+
// Body callback for rb_ensure; takes the ParseArgs pointer as a VALUE so its
// type matches what rb_ensure expects.
static VALUE fragment_continue(VALUE parse_args);

// Parse `tags` as an HTML fragment and add the resulting nodes to
// `doc_fragment`.
//
//   self         - receiver of the singleton method (unused).
//   doc_fragment - Ruby fragment object; must respond to `document`. Parse
//                  errors are stored in its @errors.
//   tags         - Ruby value holding the markup to parse.
//   ctx          - fragment context: nil (treated as "body"), a String of
//                  the form "tag" or "prefix:tag" where prefix is one of
//                  svg/html/math (case-insensitive), or a node object.
//   max_errors   - Integer copied into GumboOptions.max_errors.
//   max_depth    - Integer tree depth limit; negative values are passed to
//                  Gumbo as -1.
//
// Raises ArgumentError for an unknown namespace prefix in a String context
// (or, via lookup_namespace, an unknown namespace URI on a node context).
// Returns Qnil.
static VALUE fragment (
  VALUE self,
  VALUE doc_fragment,
  VALUE tags,
  VALUE ctx,
  VALUE max_errors,
  VALUE max_depth
) {
  ID name = rb_intern_const("name");
  const char *ctx_tag;
  GumboNamespaceEnum ctx_ns;
  GumboQuirksModeEnum quirks_mode;
  bool form = false;
  const char *encoding = NULL;

  if (NIL_P(ctx)) {
    ctx_tag = "body";
    ctx_ns = GUMBO_NAMESPACE_HTML;
  } else if (TYPE(ctx) == T_STRING) {
    ctx_tag = StringValueCStr(ctx);
    ctx_ns = GUMBO_NAMESPACE_HTML;
    size_t len = RSTRING_LEN(ctx);
    const char *colon = memchr(ctx_tag, ':', len);
    if (colon) {
      // "prefix:tag" form: the prefix selects the namespace.
      size_t prefix_len = (size_t)(colon - ctx_tag);
      if (prefix_len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0) {
        ctx_ns = GUMBO_NAMESPACE_SVG;
      } else if (prefix_len == 4 && st_strncasecmp(ctx_tag, "html", 4) == 0) {
        ctx_ns = GUMBO_NAMESPACE_HTML;
      } else if (prefix_len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0) {
        ctx_ns = GUMBO_NAMESPACE_MATHML;
      } else {
        // "%.*s" prints only the prefix; "%*s" would print the whole
        // "prefix:tag" string because it sets a width, not a precision.
        rb_raise(rb_eArgError, "Invalid context namespace '%.*s'", (int)prefix_len, ctx_tag);
      }
      ctx_tag = colon+1;
    } else {
      // For convenience, put 'svg' and 'math' in their namespaces.
      if (len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0)
        ctx_ns = GUMBO_NAMESPACE_SVG;
      else if (len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0)
        ctx_ns = GUMBO_NAMESPACE_MATHML;
    }

    // Check if it's a form.
    form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0;
  } else {
    ID element_ = rb_intern_const("element?");

    // Context fragment name.
    VALUE tag_name = rb_funcall(ctx, name, 0);
    assert(RTEST(tag_name));
    Check_Type(tag_name, T_STRING);
    ctx_tag = StringValueCStr(tag_name);

    // Context fragment namespace.
    ctx_ns = lookup_namespace(ctx, true);

    // Check for a form ancestor, including self.
    for (VALUE node = ctx;
         !NIL_P(node);
         node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) {
      if (!RTEST(rb_funcall(node, element_, 0)))
        continue;
      VALUE element_name = rb_funcall(node, name, 0);
      // Only inspect the bytes when the name really is a String, and compare
      // by length instead of relying on a terminating NUL.
      if (TYPE(element_name) == T_STRING
          && RSTRING_LEN(element_name) == 4
          && !st_strncasecmp(RSTRING_PTR(element_name), "form", 4)
          && lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) {
        form = true;
        break;
      }
    }

    // Encoding.
    if (RSTRING_LEN(tag_name) == 14
        && !st_strcasecmp(ctx_tag, "annotation-xml")) {
      // rb_funcall takes the argument count before the arguments; it was
      // missing here, so the string VALUE was interpreted as the count.
      VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
                             1,
                             rb_utf8_str_new_static("encoding", 8));
      if (RTEST(enc)) {
        Check_Type(enc, T_STRING);
        encoding = StringValueCStr(enc);
      }
    }
  }

  // Quirks mode.
  VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
  VALUE dtd = rb_funcall(doc, internal_subset, 0);
  if (NIL_P(dtd)) {
    quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
  } else {
    VALUE dtd_name = rb_funcall(dtd, name, 0);
    VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
    VALUE sysid = rb_funcall(dtd, rb_intern_const("system_id"), 0);
    quirks_mode = gumbo_compute_quirks_mode (
      NIL_P(dtd_name)? NULL:StringValueCStr(dtd_name),
      NIL_P(pubid)? NULL:StringValueCStr(pubid),
      NIL_P(sysid)? NULL:StringValueCStr(sysid)
    );
  }

  // Perform a fragment parse.
  int depth = NUM2INT(max_depth);
  GumboOptions options = kGumboDefaultOptions;
  options.max_errors = NUM2INT(max_errors);
  // Add one to account for the HTML element.
  options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
  options.fragment_context = ctx_tag;
  options.fragment_namespace = ctx_ns;
  options.fragment_encoding = encoding;
  options.quirks_mode = quirks_mode;
  options.fragment_context_has_form_ancestor = form;

  GumboOutput *output = perform_parse(&options, tags);
  ParseArgs args = {
    .output = output,
    .input = tags,
    .url_or_frag = doc_fragment,
    .doc = (xmlDocPtr)extract_xml_node(doc),
  };
  rb_ensure(fragment_continue, (VALUE)&args, parse_cleanup, (VALUE)&args);
  return Qnil;
}

// Build the parsed fragment's nodes under the fragment object and attach the
// parse errors to it.
//
//   parse_args - a `ParseArgs *` passed through rb_ensure as a VALUE.
static VALUE fragment_continue(VALUE parse_args) {
  ParseArgs *args = (ParseArgs *)parse_args;
  GumboOutput *output = args->output;
  VALUE doc_fragment = args->url_or_frag;
  xmlDocPtr xml_doc = args->doc;

  args->doc = NIL; // The Ruby runtime owns doc so make sure we don't delete it.
  xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
  build_tree(xml_doc, xml_frag, output->root);
  add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
  return Qnil;
}
|
709
|
+
|
710
|
+
// Initialize the Nokogumbo class and fetch constants we will use later.
|
364
711
|
void Init_nokogumbo() {
|
365
|
-
rb_funcall(rb_mKernel, rb_intern("gem"), 1,
|
712
|
+
rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_utf8_str_new_static("nokogiri", 8));
|
366
713
|
rb_require("nokogiri");
|
367
714
|
|
368
|
-
// class constants
|
369
|
-
VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
370
|
-
VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
|
371
|
-
Document = rb_const_get(HTML5, rb_intern("Document"));
|
372
|
-
|
373
715
|
#ifndef NGLIB
|
374
|
-
//
|
375
|
-
VALUE
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
remove_attribute = rb_intern("remove_attribute");
|
387
|
-
add_child = rb_intern("add_child_node_and_reparent_attrs");
|
388
|
-
internal_subset = rb_intern("internal_subset");
|
389
|
-
remove_ = rb_intern("remove");
|
390
|
-
create_internal_subset = rb_intern("create_internal_subset");
|
391
|
-
key_ = rb_intern("key?");
|
392
|
-
node_name_ = rb_intern("node_name=");
|
716
|
+
// Class constants.
|
717
|
+
VALUE mNokogiri = rb_const_get(rb_cObject, rb_intern_const("Nokogiri"));
|
718
|
+
VALUE mNokogiriXml = rb_const_get(mNokogiri, rb_intern_const("XML"));
|
719
|
+
cNokogiriXmlSyntaxError = rb_const_get(mNokogiriXml, rb_intern_const("SyntaxError"));
|
720
|
+
cNokogiriXmlElement = rb_const_get(mNokogiriXml, rb_intern_const("Element"));
|
721
|
+
cNokogiriXmlText = rb_const_get(mNokogiriXml, rb_intern_const("Text"));
|
722
|
+
cNokogiriXmlCData = rb_const_get(mNokogiriXml, rb_intern_const("CDATA"));
|
723
|
+
cNokogiriXmlComment = rb_const_get(mNokogiriXml, rb_intern_const("Comment"));
|
724
|
+
|
725
|
+
// Interned symbols.
|
726
|
+
new = rb_intern_const("new");
|
727
|
+
node_name_ = rb_intern_const("node_name=");
|
393
728
|
#endif
|
394
729
|
|
395
|
-
//
|
730
|
+
// Class constants.
|
731
|
+
VALUE HTML5 = rb_const_get(mNokogiri, rb_intern_const("HTML5"));
|
732
|
+
Document = rb_const_get(HTML5, rb_intern_const("Document"));
|
733
|
+
|
734
|
+
// Interned symbols.
|
735
|
+
internal_subset = rb_intern_const("internal_subset");
|
736
|
+
parent = rb_intern_const("parent");
|
737
|
+
|
738
|
+
// Define Nokogumbo module with parse and fragment methods.
|
396
739
|
VALUE Gumbo = rb_define_module("Nokogumbo");
|
397
740
|
rb_define_singleton_method(Gumbo, "parse", parse, 4);
|
741
|
+
rb_define_singleton_method(Gumbo, "fragment", fragment, 5);
|
398
742
|
}
|
743
|
+
|
744
|
+
// vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|