nokogumbo 2.0.0.pre.alpha → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +81 -10
- data/ext/nokogumbo/extconf.rb +6 -1
- data/ext/nokogumbo/nokogumbo.c +579 -233
- data/gumbo-parser/src/ascii.c +42 -0
- data/gumbo-parser/src/ascii.h +91 -7
- data/gumbo-parser/src/char_ref.c +5973 -4601
- data/gumbo-parser/src/char_ref.h +13 -28
- data/gumbo-parser/src/error.c +376 -120
- data/gumbo-parser/src/error.h +63 -125
- data/gumbo-parser/src/gumbo.h +47 -4
- data/gumbo-parser/src/parser.c +849 -709
- data/gumbo-parser/src/string_buffer.c +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/tokenizer.c +1426 -1261
- data/gumbo-parser/src/tokenizer.h +5 -5
- data/gumbo-parser/src/tokenizer_states.h +275 -23
- data/gumbo-parser/src/utf8.c +12 -59
- data/gumbo-parser/src/utf8.h +51 -16
- data/lib/nokogumbo.rb +0 -1
- data/lib/nokogumbo/html5.rb +2 -1
- data/lib/nokogumbo/html5/document.rb +12 -1
- data/lib/nokogumbo/html5/document_fragment.rb +35 -20
- data/lib/nokogumbo/{xml → html5}/node.rb +28 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +16 -9
- data/CHANGELOG.md +0 -56
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 97aae1603382eb4357f4126f4c36f86841b930a04b46e0eb6a1c02c26c9e77a4
|
4
|
+
data.tar.gz: 614f1dc01be03d4ccb48b43d512faf80f556e8ddde4690fa34fbb9e420c36cb9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aabd005ec985f1a94b0b82195ce8547fabc3d3f97e672b9b03c174ddd131b534b6c2648803ccd0d1fdfc9224bab132cdbd2757271b5646fc3873649d1f509e26
|
7
|
+
data.tar.gz: 4e793d5436de772587f2abcdb54c1060a3ddbde3dfcad05539e823dbde3c67f0792d1589289323fe76d5baebe265a526aba71d5adcbeb8d2ca72559d789e7b14
|
data/README.md
CHANGED
@@ -5,7 +5,8 @@ Nokogumbo provides the ability for a Ruby program to invoke the
|
|
5
5
|
and to access the result as a
|
6
6
|
[Nokogiri::HTML::Document](http://rdoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document).
|
7
7
|
|
8
|
-
[Build Status](https://travis-ci.org/rubys/nokogumbo)
|
8
|
+
[Build Status](https://travis-ci.org/rubys/nokogumbo)
|
9
|
+
[AppVeyor Build Status](https://ci.appveyor.com/project/rubys/nokogumbo/branch/master)
|
9
10
|
|
10
11
|
## Usage
|
11
12
|
|
@@ -14,8 +15,7 @@ require 'nokogumbo'
|
|
14
15
|
doc = Nokogiri.HTML5(string)
|
15
16
|
```
|
16
17
|
|
17
|
-
|
18
|
-
compliant, it may be useful:
|
18
|
+
To parse an HTML fragment, a `fragment` method is provided.
|
19
19
|
|
20
20
|
```ruby
|
21
21
|
require 'nokogumbo'
|
@@ -49,20 +49,26 @@ no parse errors are reported but this can be configured by passing the
|
|
49
49
|
|
50
50
|
```ruby
|
51
51
|
require 'nokogumbo'
|
52
|
-
doc = Nokogiri::HTML5.parse('Hi there
|
52
|
+
doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
|
53
53
|
doc.errors.each do |err|
|
54
|
-
puts
|
54
|
+
puts(err)
|
55
55
|
end
|
56
56
|
```
|
57
57
|
|
58
58
|
This prints the following.
|
59
59
|
```
|
60
|
-
1:1: ERROR:
|
61
|
-
Hi there
|
60
|
+
1:1: ERROR: Expected a doctype token
|
61
|
+
<span/>Hi there!</span foo=bar />
|
62
62
|
^
|
63
|
-
1:
|
64
|
-
Hi there
|
65
|
-
|
63
|
+
1:1: ERROR: Start tag of nonvoid HTML element ends with '/>', use '>'.
|
64
|
+
<span/>Hi there!</span foo=bar />
|
65
|
+
^
|
66
|
+
1:17: ERROR: End tag ends with '/>', use '>'.
|
67
|
+
<span/>Hi there!</span foo=bar />
|
68
|
+
^
|
69
|
+
1:17: ERROR: End tag contains attributes.
|
70
|
+
<span/>Hi there!</span foo=bar />
|
71
|
+
^
|
66
72
|
```
|
67
73
|
|
68
74
|
Using `max_errors: -1` results in an unlimited number of errors being
|
@@ -71,6 +77,41 @@ returned.
|
|
71
77
|
The errors returned by `#errors` are instances of
|
72
78
|
[`Nokogiri::XML::SyntaxError`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError).
|
73
79
|
|
80
|
+
The [HTML
|
81
|
+
standard](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors)
|
82
|
+
defines a number of standard parse error codes. These error codes only cover
|
83
|
+
the "tokenization" stage of parsing HTML. The parse errors in the
|
84
|
+
"tree construction" stage do not have standardized error codes (yet).
|
85
|
+
|
86
|
+
As a convenience to Nokogumbo users, the defined error codes are available
|
87
|
+
via the
|
88
|
+
[`Nokogiri::XML::SyntaxError#str1`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError#str1-instance_method)
|
89
|
+
method.
|
90
|
+
|
91
|
+
```ruby
|
92
|
+
require 'nokogumbo'
|
93
|
+
doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
|
94
|
+
doc.errors.each do |err|
|
95
|
+
puts("#{err.line}:#{err.column}: #{err.str1}")
|
96
|
+
end
|
97
|
+
```
|
98
|
+
|
99
|
+
This prints the following.
|
100
|
+
```
|
101
|
+
1:1: generic-parser
|
102
|
+
1:1: non-void-html-element-start-tag-with-trailing-solidus
|
103
|
+
1:17: end-tag-with-trailing-solidus
|
104
|
+
1:17: end-tag-with-attributes
|
105
|
+
```
|
106
|
+
|
107
|
+
Note that the first error is `generic-parser` because it's an error from the
|
108
|
+
tree construction stage and doesn't have a standardized error code.
|
109
|
+
|
110
|
+
For the purposes of semantic versioning, the error messages, error locations,
|
111
|
+
and error codes are not part of Nokogumbo's public API. That is, these are
|
112
|
+
subject to change without Nokogumbo's major version number changing. These may
|
113
|
+
be stabilized in the future.
|
114
|
+
|
74
115
|
### Maximum tree depth
|
75
116
|
The maximum depth of the DOM tree parsed by the various parsing methods is
|
76
117
|
configurable by the `:max_tree_depth` option. If the depth of the tree would
|
@@ -201,6 +242,36 @@ rules defined in the HTML5 specification for doing so.
|
|
201
242
|
* Instead of returning `unknown` as the element name for unknown tags, the
|
202
243
|
original tag name is returned verbatim.
|
203
244
|
|
245
|
+
# Flavors of Nokogumbo
|
246
|
+
Nokogumbo uses libxml2, the XML library underlying Nokogiri, to speed up
|
247
|
+
parsing. If the libxml2 headers are not available, then Nokogumbo resorts to
|
248
|
+
using Nokogiri's Ruby API to construct the DOM tree.
|
249
|
+
|
250
|
+
Nokogiri can be configured to either use the system library version of libxml2
|
251
|
+
or use a bundled version. By default (as of Nokogiri version 1.8.4), Nokogiri
|
252
|
+
will use a bundled version.
|
253
|
+
|
254
|
+
To prevent differences between versions of libxml2, Nokogumbo will only use
|
255
|
+
libxml2 if the build process can find the exact same version used by Nokogiri.
|
256
|
+
This leads to three possibilities:
|
257
|
+
|
258
|
+
1. Nokogiri is compiled with the bundled libxml2. In this case, Nokogumbo will
|
259
|
+
(by default) use the same version of libxml2.
|
260
|
+
2. Nokogiri is compiled with the system libxml2. In this case, if the libxml2
|
261
|
+
headers are available, then Nokogumbo will (by default) use the system
|
262
|
+
version and headers.
|
263
|
+
3. Nokogiri is compiled with the system libxml2 but its headers aren't
|
264
|
+
available at build time for Nokogumbo. In this case, Nokogumbo will use the
|
265
|
+
slower Ruby API.
|
266
|
+
|
267
|
+
Using libxml2 can be required by passing `-- --with-libxml2` to `bundle exec
|
268
|
+
rake` or to `gem install`. Using libxml2 can be prohibited by instead passing
|
269
|
+
`-- --without-libxml2`.
|
270
|
+
|
271
|
+
Functionally, the only difference between using libxml2 or not is in the
|
272
|
+
behavior of `Nokogiri::XML::Node#line`. If libxml2 is used, then `#line` will
|
273
|
+
return the line number of the corresponding node. Otherwise, it will return 0.
|
274
|
+
|
204
275
|
# Installation
|
205
276
|
|
206
277
|
git clone https://github.com/rubys/nokogumbo.git
|
data/ext/nokogumbo/extconf.rb
CHANGED
@@ -108,9 +108,14 @@ gumbo_src = File.join(ext_dir, 'gumbo_src')
|
|
108
108
|
|
109
109
|
Dir.chdir(ext_dir) do
|
110
110
|
$srcs = Dir['*.c', '../../gumbo-parser/src/*.c']
|
111
|
+
$hdrs = Dir['*.h', '../../gumbo-parser/src/*.h']
|
111
112
|
end
|
112
113
|
$INCFLAGS << ' -I$(srcdir)/../../gumbo-parser/src'
|
113
114
|
$VPATH << '$(srcdir)/../../gumbo-parser/src'
|
114
115
|
|
115
|
-
create_makefile('nokogumbo/nokogumbo')
|
116
|
+
create_makefile('nokogumbo/nokogumbo') do |conf|
|
117
|
+
conf.map! do |chunk|
|
118
|
+
chunk.gsub(/^HDRS = .*$/, "HDRS = #{$hdrs.map { |h| File.join('$(srcdir)', h)}.join(' ')}")
|
119
|
+
end
|
120
|
+
end
|
116
121
|
# vim: set sw=2 sts=2 ts=8 et:
|
data/ext/nokogumbo/nokogumbo.c
CHANGED
@@ -9,7 +9,7 @@
|
|
9
9
|
// document tree is then walked:
|
10
10
|
//
|
11
11
|
// * if Nokogiri and libxml2 headers are available at compile time,
|
12
|
-
// (
|
12
|
+
// (if NGLIB) then a parallel libxml2 tree is constructed, and the
|
13
13
|
// final document is then wrapped using Nokogiri_wrap_xml_document.
|
14
14
|
// This approach reduces memory and CPU requirements as Ruby objects
|
15
15
|
// are only built when necessary.
|
@@ -20,74 +20,110 @@
|
|
20
20
|
|
21
21
|
#include <assert.h>
|
22
22
|
#include <ruby.h>
|
23
|
+
#include <ruby/version.h>
|
24
|
+
|
23
25
|
#include "gumbo.h"
|
24
|
-
#include "error.h"
|
25
26
|
|
26
27
|
// class constants
|
27
28
|
static VALUE Document;
|
28
29
|
|
29
|
-
|
30
|
+
// Interned symbols
|
31
|
+
static ID internal_subset;
|
32
|
+
static ID parent;
|
33
|
+
|
34
|
+
/* Backwards compatibility to Ruby 2.1.0 */
|
35
|
+
#if RUBY_API_VERSION_CODE < 20200
|
36
|
+
#include <ruby/encoding.h>
|
37
|
+
|
38
|
+
static VALUE rb_utf8_str_new(const char *str, long length) {
|
39
|
+
return rb_enc_str_new(str, length, rb_utf8_encoding());
|
40
|
+
}
|
41
|
+
|
42
|
+
static VALUE rb_utf8_str_new_cstr(const char *str) {
|
43
|
+
return rb_enc_str_new_cstr(str, rb_utf8_encoding());
|
44
|
+
}
|
45
|
+
|
46
|
+
static VALUE rb_utf8_str_new_static(const char *str, long length) {
|
47
|
+
return rb_enc_str_new(str, length, rb_utf8_encoding());
|
48
|
+
}
|
49
|
+
#endif
|
50
|
+
|
51
|
+
#if NGLIB
|
30
52
|
#include <nokogiri.h>
|
31
|
-
#include <xml_syntax_error.h>
|
32
53
|
#include <libxml/tree.h>
|
33
54
|
#include <libxml/HTMLtree.h>
|
34
55
|
|
35
56
|
#define NIL NULL
|
36
|
-
#define CONST_CAST (xmlChar const*)
|
37
57
|
#else
|
38
58
|
#define NIL Qnil
|
39
|
-
#define CONST_CAST
|
40
59
|
|
41
|
-
//
|
60
|
+
// These are defined by nokogiri.h
|
42
61
|
static VALUE cNokogiriXmlSyntaxError;
|
62
|
+
static VALUE cNokogiriXmlElement;
|
63
|
+
static VALUE cNokogiriXmlText;
|
64
|
+
static VALUE cNokogiriXmlCData;
|
65
|
+
static VALUE cNokogiriXmlComment;
|
66
|
+
|
67
|
+
// Interned symbols.
|
68
|
+
static ID new;
|
69
|
+
static ID node_name_;
|
70
|
+
|
71
|
+
// Map libxml2 types to Ruby VALUE.
|
72
|
+
typedef VALUE xmlNodePtr;
|
73
|
+
typedef VALUE xmlDocPtr;
|
74
|
+
typedef VALUE xmlNsPtr;
|
75
|
+
typedef VALUE xmlDtdPtr;
|
76
|
+
typedef char xmlChar;
|
77
|
+
#define BAD_CAST
|
78
|
+
|
79
|
+
// Redefine libxml2 API as Ruby function calls.
|
80
|
+
static xmlNodePtr xmlNewDocNode(xmlDocPtr doc, xmlNsPtr ns, const xmlChar *name, const xmlChar *content) {
|
81
|
+
assert(ns == NIL && content == NULL);
|
82
|
+
return rb_funcall(cNokogiriXmlElement, new, 2, rb_utf8_str_new_cstr(name), doc);
|
83
|
+
}
|
84
|
+
|
85
|
+
static xmlNodePtr xmlNewDocText(xmlDocPtr doc, const xmlChar *content) {
|
86
|
+
VALUE str = rb_utf8_str_new_cstr(content);
|
87
|
+
return rb_funcall(cNokogiriXmlText, new, 2, str, doc);
|
88
|
+
}
|
89
|
+
|
90
|
+
static xmlNodePtr xmlNewCDataBlock(xmlDocPtr doc, const xmlChar *content, int len) {
|
91
|
+
VALUE str = rb_utf8_str_new(content, len);
|
92
|
+
// CDATA.new takes arguments in the opposite order from Text.new.
|
93
|
+
return rb_funcall(cNokogiriXmlCData, new, 2, doc, str);
|
94
|
+
}
|
95
|
+
|
96
|
+
static xmlNodePtr xmlNewDocComment(xmlDocPtr doc, const xmlChar *content) {
|
97
|
+
VALUE str = rb_utf8_str_new_cstr(content);
|
98
|
+
return rb_funcall(cNokogiriXmlComment, new, 2, doc, str);
|
99
|
+
}
|
100
|
+
|
101
|
+
static xmlNodePtr xmlAddChild(xmlNodePtr parent, xmlNodePtr cur) {
|
102
|
+
ID add_child;
|
103
|
+
CONST_ID(add_child, "add_child");
|
104
|
+
return rb_funcall(parent, add_child, 1, cur);
|
105
|
+
}
|
106
|
+
|
107
|
+
static void xmlSetNs(xmlNodePtr node, xmlNsPtr ns) {
|
108
|
+
ID namespace_;
|
109
|
+
CONST_ID(namespace_, "namespace=");
|
110
|
+
rb_funcall(node, namespace_, 1, ns);
|
111
|
+
}
|
43
112
|
|
44
|
-
static
|
45
|
-
|
46
|
-
static VALUE
|
47
|
-
|
48
|
-
|
49
|
-
// interned symbols
|
50
|
-
static VALUE new;
|
51
|
-
static VALUE attribute;
|
52
|
-
static VALUE set_attribute;
|
53
|
-
static VALUE remove_attribute;
|
54
|
-
static VALUE add_child;
|
55
|
-
static VALUE internal_subset;
|
56
|
-
static VALUE remove_;
|
57
|
-
static VALUE create_internal_subset;
|
58
|
-
static VALUE key_;
|
59
|
-
static VALUE node_name_;
|
60
|
-
|
61
|
-
// map libxml2 types to Ruby VALUE
|
62
|
-
#define xmlNodePtr VALUE
|
63
|
-
#define xmlDocPtr VALUE
|
64
|
-
|
65
|
-
// redefine libxml2 API as Ruby function calls
|
66
|
-
#define xmlNewDocNode(doc, ns, name, content) \
|
67
|
-
rb_funcall(Element, new, 2, rb_str_new2(name), doc)
|
68
|
-
#define xmlNewDocText(doc, text) \
|
69
|
-
rb_funcall(Text, new, 2, rb_str_new2(text), doc)
|
70
|
-
#define xmlNewCDataBlock(doc, content, length) \
|
71
|
-
rb_funcall(CDATA, new, 2, doc, rb_str_new(content, length))
|
72
|
-
#define xmlNewDocComment(doc, text) \
|
73
|
-
rb_funcall(Comment, new, 2, doc, rb_str_new2(text))
|
74
|
-
#define xmlAddChild(element, node) \
|
75
|
-
rb_funcall(element, add_child, 1, node)
|
76
|
-
#define xmlDocSetRootElement(doc, root) \
|
77
|
-
rb_funcall(doc, add_child, 1, root)
|
78
|
-
#define xmlCreateIntSubset(doc, name, external, system) \
|
79
|
-
rb_funcall(doc, create_internal_subset, 3, rb_str_new2(name), \
|
80
|
-
(external ? rb_str_new2(external) : Qnil), \
|
81
|
-
(system ? rb_str_new2(system) : Qnil));
|
82
|
-
#define Nokogiri_wrap_xml_document(klass, doc) \
|
83
|
-
doc
|
113
|
+
static void xmlFreeDoc(xmlDocPtr doc) { }
|
114
|
+
|
115
|
+
static VALUE Nokogiri_wrap_xml_document(VALUE klass, xmlDocPtr doc) {
|
116
|
+
return doc;
|
117
|
+
}
|
84
118
|
|
85
119
|
static VALUE find_dummy_key(VALUE collection) {
|
86
120
|
VALUE r_dummy = Qnil;
|
87
121
|
char dummy[5] = "a";
|
88
122
|
size_t len = 1;
|
123
|
+
ID key_;
|
124
|
+
CONST_ID(key_, "key?");
|
89
125
|
while (len < sizeof dummy) {
|
90
|
-
r_dummy =
|
126
|
+
r_dummy = rb_utf8_str_new(dummy, len);
|
91
127
|
if (rb_funcall(collection, key_, 1, r_dummy) == Qfalse)
|
92
128
|
return r_dummy;
|
93
129
|
for (size_t i = 0; ; ++i) {
|
@@ -105,10 +141,42 @@ static VALUE find_dummy_key(VALUE collection) {
|
|
105
141
|
}
|
106
142
|
}
|
107
143
|
// This collection has 475254 elements?? Give up.
|
108
|
-
|
144
|
+
rb_raise(rb_eArgError, "Failed to find a dummy key.");
|
109
145
|
}
|
110
146
|
|
111
|
-
|
147
|
+
// This should return an xmlAttrPtr, but we don't need it and it's easier to
|
148
|
+
// not get the result.
|
149
|
+
static void xmlNewNsProp (
|
150
|
+
xmlNodePtr node,
|
151
|
+
xmlNsPtr ns,
|
152
|
+
const xmlChar *name,
|
153
|
+
const xmlChar *value
|
154
|
+
) {
|
155
|
+
ID set_attribute;
|
156
|
+
CONST_ID(set_attribute, "set_attribute");
|
157
|
+
|
158
|
+
VALUE rvalue = rb_utf8_str_new_cstr(value);
|
159
|
+
|
160
|
+
if (RTEST(ns)) {
|
161
|
+
// This is an easy case, we have a namespace so it's enough to do
|
162
|
+
// node["#{ns.prefix}:#{name}"] = value
|
163
|
+
ID prefix;
|
164
|
+
CONST_ID(prefix, "prefix");
|
165
|
+
VALUE ns_prefix = rb_funcall(ns, prefix, 0);
|
166
|
+
VALUE qname = rb_sprintf("%" PRIsVALUE ":%s", ns_prefix, name);
|
167
|
+
rb_funcall(node, set_attribute, 2, qname, rvalue);
|
168
|
+
return;
|
169
|
+
}
|
170
|
+
|
171
|
+
size_t len = strlen(name);
|
172
|
+
VALUE rname = rb_utf8_str_new(name, len);
|
173
|
+
if (memchr(name, ':', len) == NULL) {
|
174
|
+
// This is the easiest case. There's no colon so we can do
|
175
|
+
// node[name] = value.
|
176
|
+
rb_funcall(node, set_attribute, 2, rname, rvalue);
|
177
|
+
return;
|
178
|
+
}
|
179
|
+
|
112
180
|
// Nokogiri::XML::Node#set_attribute calls xmlSetProp(node, name, value)
|
113
181
|
// which behaves roughly as
|
114
182
|
// if name is a QName prefix:local
|
@@ -118,7 +186,7 @@ static xmlNodePtr xmlNewProp(xmlNodePtr node, const char *name, const char *valu
|
|
118
186
|
//
|
119
187
|
// If the prefix is "xml", then the namespace lookup will create it.
|
120
188
|
//
|
121
|
-
// By contrast,
|
189
|
+
// By contrast, xmlNewNsProp does not do this parsing and creates an attribute
|
122
190
|
// with the name and value exactly as given. This is the behavior that we
|
123
191
|
// want.
|
124
192
|
//
|
@@ -129,164 +197,84 @@ static xmlNodePtr xmlNewProp(xmlNodePtr node, const char *name, const char *valu
|
|
129
197
|
// Work around this by inserting a dummy attribute and then changing the
|
130
198
|
// name, if needed.
|
131
199
|
|
132
|
-
// Can't use strchr since it's locale-sensitive.
|
133
|
-
size_t len = strlen(name);
|
134
|
-
VALUE r_name = rb_str_new(name, len);
|
135
|
-
if (memchr(name, ':', len) == NULL) {
|
136
|
-
// No colon.
|
137
|
-
return rb_funcall(node, set_attribute, 2, r_name, rb_str_new2(value));
|
138
|
-
}
|
139
200
|
// Find a dummy attribute string that doesn't already exist.
|
140
201
|
VALUE dummy = find_dummy_key(node);
|
141
|
-
if (dummy == Qnil)
|
142
|
-
return Qnil;
|
143
202
|
// Add the dummy attribute.
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
203
|
+
rb_funcall(node, set_attribute, 2, dummy, rvalue);
|
204
|
+
|
205
|
+
// Remove the old attribute, if it exists.
|
206
|
+
ID remove_attribute;
|
207
|
+
CONST_ID(remove_attribute, "remove_attribute");
|
208
|
+
rb_funcall(node, remove_attribute, 1, rname);
|
209
|
+
|
149
210
|
// Rename the dummy
|
211
|
+
ID attribute;
|
212
|
+
CONST_ID(attribute, "attribute");
|
150
213
|
VALUE attr = rb_funcall(node, attribute, 1, dummy);
|
151
|
-
|
152
|
-
return Qnil;
|
153
|
-
rb_funcall(attr, node_name_, 1, r_name);
|
154
|
-
return attr;
|
214
|
+
rb_funcall(attr, node_name_, 1, rname);
|
155
215
|
}
|
156
216
|
#endif
|
157
217
|
|
158
|
-
// Build a xmlNodePtr for a given GumboNode (recursively)
|
159
|
-
static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node);
|
160
|
-
|
161
|
-
// Build a xmlNodePtr for a given GumboElement (recursively)
|
162
|
-
static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
|
163
|
-
// create the given element
|
164
|
-
xmlNodePtr element = xmlNewDocNode(document, NIL, CONST_CAST node->name, NIL);
|
165
|
-
|
166
|
-
// add in the attributes
|
167
|
-
GumboVector* attrs = &node->attributes;
|
168
|
-
char *name = NULL;
|
169
|
-
size_t namelen = 0;
|
170
|
-
const char *ns;
|
171
|
-
for (size_t i=0; i < attrs->length; i++) {
|
172
|
-
GumboAttribute *attr = attrs->data[i];
|
173
|
-
|
174
|
-
switch (attr->attr_namespace) {
|
175
|
-
case GUMBO_ATTR_NAMESPACE_XLINK:
|
176
|
-
ns = "xlink:";
|
177
|
-
break;
|
178
|
-
|
179
|
-
case GUMBO_ATTR_NAMESPACE_XML:
|
180
|
-
ns = "xml:";
|
181
|
-
break;
|
182
|
-
|
183
|
-
case GUMBO_ATTR_NAMESPACE_XMLNS:
|
184
|
-
ns = "xmlns:";
|
185
|
-
if (!strcmp(attr->name, "xmlns")) ns = NULL;
|
186
|
-
break;
|
187
|
-
|
188
|
-
default:
|
189
|
-
ns = NULL;
|
190
|
-
}
|
191
|
-
|
192
|
-
if (ns) {
|
193
|
-
if (strlen(ns) + strlen(attr->name) + 1 > namelen) {
|
194
|
-
free(name);
|
195
|
-
name = NULL;
|
196
|
-
}
|
197
|
-
|
198
|
-
if (!name) {
|
199
|
-
namelen = strlen(ns) + strlen(attr->name) + 1;
|
200
|
-
name = malloc(namelen);
|
201
|
-
}
|
202
|
-
|
203
|
-
strcpy(name, ns);
|
204
|
-
strcat(name, attr->name);
|
205
|
-
xmlNewProp(element, CONST_CAST name, CONST_CAST attr->value);
|
206
|
-
} else {
|
207
|
-
xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
|
208
|
-
}
|
209
|
-
}
|
210
|
-
if (name) free(name);
|
211
|
-
|
212
|
-
// add in the children
|
213
|
-
GumboVector* children = &node->children;
|
214
|
-
for (size_t i=0; i < children->length; i++) {
|
215
|
-
xmlNodePtr node = walk_tree(document, children->data[i]);
|
216
|
-
if (node) xmlAddChild(element, node);
|
217
|
-
}
|
218
|
-
|
219
|
-
return element;
|
220
|
-
}
|
221
|
-
|
222
|
-
static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
|
223
|
-
switch (node->type) {
|
224
|
-
case GUMBO_NODE_DOCUMENT:
|
225
|
-
return NIL;
|
226
|
-
case GUMBO_NODE_ELEMENT:
|
227
|
-
case GUMBO_NODE_TEMPLATE:
|
228
|
-
return walk_element(document, &node->v.element);
|
229
|
-
case GUMBO_NODE_TEXT:
|
230
|
-
case GUMBO_NODE_WHITESPACE:
|
231
|
-
return xmlNewDocText(document, CONST_CAST node->v.text.text);
|
232
|
-
case GUMBO_NODE_CDATA:
|
233
|
-
return xmlNewCDataBlock(document,
|
234
|
-
CONST_CAST node->v.text.text,
|
235
|
-
(int) strlen(node->v.text.text));
|
236
|
-
case GUMBO_NODE_COMMENT:
|
237
|
-
return xmlNewDocComment(document, CONST_CAST node->v.text.text);
|
238
|
-
}
|
239
|
-
}
|
240
|
-
|
241
218
|
// URI = system id
|
242
219
|
// external id = public id
|
243
|
-
|
244
|
-
static htmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public)
|
220
|
+
static xmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public)
|
245
221
|
{
|
222
|
+
#if NGLIB
|
246
223
|
// These two libxml2 functions take the public and system ids in
|
247
224
|
// opposite orders.
|
248
225
|
htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
|
249
226
|
assert(doc);
|
250
227
|
if (dtd_name)
|
251
|
-
xmlCreateIntSubset(doc,
|
228
|
+
xmlCreateIntSubset(doc, BAD_CAST dtd_name, BAD_CAST public, BAD_CAST system);
|
252
229
|
return doc;
|
253
|
-
}
|
254
230
|
#else
|
255
|
-
// remove internal subset from newly created documents
|
256
|
-
static VALUE new_html_doc(const char *dtd_name, const char *system, const char *public) {
|
231
|
+
// remove internal subset from newly created documents
|
257
232
|
VALUE doc;
|
258
233
|
// If system and public are both NULL, Document#new is going to set default
|
259
234
|
// values for them so we're going to have to remove the internal subset
|
260
235
|
// which seems to leak memory in Nokogiri, so leak as little as possible.
|
261
236
|
if (system == NULL && public == NULL) {
|
262
|
-
|
263
|
-
|
237
|
+
ID remove;
|
238
|
+
CONST_ID(remove, "remove");
|
239
|
+
doc = rb_funcall(Document, new, 2, /* URI */ Qnil, /* external_id */ rb_utf8_str_new_static("", 0));
|
240
|
+
rb_funcall(rb_funcall(doc, internal_subset, 0), remove, 0);
|
264
241
|
if (dtd_name) {
|
265
242
|
// We need to create an internal subset now.
|
266
|
-
|
243
|
+
ID create_internal_subset;
|
244
|
+
CONST_ID(create_internal_subset, "create_internal_subset");
|
245
|
+
rb_funcall(doc, create_internal_subset, 3, rb_utf8_str_new_cstr(dtd_name), Qnil, Qnil);
|
267
246
|
}
|
268
247
|
} else {
|
269
248
|
assert(dtd_name);
|
270
249
|
// Rather than removing and creating the internal subset as we did above,
|
271
250
|
// just create and then rename one.
|
272
|
-
VALUE r_system = system ?
|
273
|
-
VALUE r_public = public ?
|
251
|
+
VALUE r_system = system ? rb_utf8_str_new_cstr(system) : Qnil;
|
252
|
+
VALUE r_public = public ? rb_utf8_str_new_cstr(public) : Qnil;
|
274
253
|
doc = rb_funcall(Document, new, 2, r_system, r_public);
|
275
|
-
rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1,
|
254
|
+
rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1, rb_utf8_str_new_cstr(dtd_name));
|
276
255
|
}
|
277
256
|
return doc;
|
278
|
-
}
|
279
257
|
#endif
|
258
|
+
}
|
280
259
|
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
260
|
+
static xmlNodePtr get_parent(xmlNodePtr node) {
|
261
|
+
#if NGLIB
|
262
|
+
return node->parent;
|
263
|
+
#else
|
264
|
+
if (!rb_respond_to(node, parent))
|
265
|
+
return Qnil;
|
266
|
+
return rb_funcall(node, parent, 0);
|
267
|
+
#endif
|
268
|
+
}
|
286
269
|
|
287
|
-
|
288
|
-
|
289
|
-
|
270
|
+
static GumboOutput *perform_parse(const GumboOptions *options, VALUE input) {
|
271
|
+
assert(RTEST(input));
|
272
|
+
Check_Type(input, T_STRING);
|
273
|
+
GumboOutput *output = gumbo_parse_with_options (
|
274
|
+
options,
|
275
|
+
RSTRING_PTR(input),
|
276
|
+
RSTRING_LEN(input)
|
277
|
+
);
|
290
278
|
|
291
279
|
const char *status_string = gumbo_status_to_string(output->status);
|
292
280
|
switch (output->status) {
|
@@ -299,100 +287,458 @@ static VALUE parse(VALUE self, VALUE string, VALUE url, VALUE max_errors, VALUE
|
|
299
287
|
gumbo_destroy_output(output);
|
300
288
|
rb_raise(rb_eNoMemError, "%s", status_string);
|
301
289
|
}
|
290
|
+
return output;
|
291
|
+
}
|
302
292
|
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
293
|
+
static xmlNsPtr lookup_or_add_ns (
|
294
|
+
xmlDocPtr doc,
|
295
|
+
xmlNodePtr root,
|
296
|
+
const char *href,
|
297
|
+
const char *prefix
|
298
|
+
) {
|
299
|
+
#if NGLIB
|
300
|
+
xmlNsPtr ns = xmlSearchNs(doc, root, BAD_CAST prefix);
|
301
|
+
if (ns)
|
302
|
+
return ns;
|
303
|
+
return xmlNewNs(root, BAD_CAST href, BAD_CAST prefix);
|
304
|
+
#else
|
305
|
+
ID add_namespace_definition;
|
306
|
+
CONST_ID(add_namespace_definition, "add_namespace_definition");
|
307
|
+
VALUE rprefix = rb_utf8_str_new_cstr(prefix);
|
308
|
+
VALUE rhref = rb_utf8_str_new_cstr(href);
|
309
|
+
return rb_funcall(root, add_namespace_definition, 2, rprefix, rhref);
|
310
|
+
#endif
|
311
|
+
}
|
312
|
+
|
313
|
+
static void set_line(xmlNodePtr node, size_t line) {
|
314
|
+
#if NGLIB
|
315
|
+
// libxml2 uses 65535 to mean look elsewhere for the line number on some
|
316
|
+
// nodes.
|
317
|
+
if (line < 65535)
|
318
|
+
node->line = (unsigned short)line;
|
319
|
+
#else
|
320
|
+
// XXX: If Nokogiri gets a `#line=` method, we'll use that.
|
321
|
+
#endif
|
322
|
+
}
|
314
323
|
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
+
// Construct an XML tree rooted at xml_output_node from the Gumbo tree rooted
|
325
|
+
// at gumbo_node.
|
326
|
+
static void build_tree (
|
327
|
+
xmlDocPtr doc,
|
328
|
+
xmlNodePtr xml_output_node,
|
329
|
+
const GumboNode *gumbo_node
|
330
|
+
) {
|
331
|
+
xmlNodePtr xml_root = NIL;
|
332
|
+
xmlNodePtr xml_node = xml_output_node;
|
333
|
+
size_t child_index = 0;
|
334
|
+
|
335
|
+
while (true) {
|
336
|
+
assert(gumbo_node != NULL);
|
337
|
+
const GumboVector *children = gumbo_node->type == GUMBO_NODE_DOCUMENT?
|
338
|
+
&gumbo_node->v.document.children : &gumbo_node->v.element.children;
|
339
|
+
if (child_index >= children->length) {
|
340
|
+
// Move up the tree and to the next child.
|
341
|
+
if (xml_node == xml_output_node) {
|
342
|
+
// We've built as much of the tree as we can.
|
343
|
+
return;
|
344
|
+
}
|
345
|
+
child_index = gumbo_node->index_within_parent + 1;
|
346
|
+
gumbo_node = gumbo_node->parent;
|
347
|
+
xml_node = get_parent(xml_node);
|
348
|
+
// Children of fragments don't share the same root, so reset it and
|
349
|
+
// it'll be set below. In the non-fragment case, this will only happen
|
350
|
+
// after the html element has been finished at which point there are no
|
351
|
+
// further elements.
|
352
|
+
if (xml_node == xml_output_node)
|
353
|
+
xml_root = NIL;
|
354
|
+
continue;
|
355
|
+
}
|
356
|
+
const GumboNode *gumbo_child = children->data[child_index++];
|
357
|
+
xmlNodePtr xml_child;
|
358
|
+
|
359
|
+
switch (gumbo_child->type) {
|
360
|
+
case GUMBO_NODE_DOCUMENT:
|
361
|
+
abort(); // Bug in Gumbo.
|
362
|
+
|
363
|
+
case GUMBO_NODE_TEXT:
|
364
|
+
case GUMBO_NODE_WHITESPACE:
|
365
|
+
xml_child = xmlNewDocText(doc, BAD_CAST gumbo_child->v.text.text);
|
366
|
+
set_line(xml_child, gumbo_child->v.text.start_pos.line);
|
367
|
+
xmlAddChild(xml_node, xml_child);
|
368
|
+
break;
|
369
|
+
|
370
|
+
case GUMBO_NODE_CDATA:
|
371
|
+
xml_child = xmlNewCDataBlock(doc, BAD_CAST gumbo_child->v.text.text,
|
372
|
+
(int) strlen(gumbo_child->v.text.text));
|
373
|
+
set_line(xml_child, gumbo_child->v.text.start_pos.line);
|
374
|
+
xmlAddChild(xml_node, xml_child);
|
375
|
+
break;
|
376
|
+
|
377
|
+
case GUMBO_NODE_COMMENT:
|
378
|
+
xml_child = xmlNewDocComment(doc, BAD_CAST gumbo_child->v.text.text);
|
379
|
+
set_line(xml_child, gumbo_child->v.text.start_pos.line);
|
380
|
+
xmlAddChild(xml_node, xml_child);
|
381
|
+
break;
|
382
|
+
|
383
|
+
case GUMBO_NODE_TEMPLATE:
|
384
|
+
// XXX: Should create a template element and a new DocumentFragment
|
385
|
+
case GUMBO_NODE_ELEMENT:
|
386
|
+
{
|
387
|
+
xml_child = xmlNewDocNode(doc, NIL, BAD_CAST gumbo_child->v.element.name, NULL);
|
388
|
+
set_line(xml_child, gumbo_child->v.text.start_pos.line);
|
389
|
+
if (xml_root == NIL)
|
390
|
+
xml_root = xml_child;
|
391
|
+
xmlNsPtr ns = NIL;
|
392
|
+
switch (gumbo_child->v.element.tag_namespace) {
|
393
|
+
case GUMBO_NAMESPACE_HTML:
|
394
|
+
break;
|
395
|
+
case GUMBO_NAMESPACE_SVG:
|
396
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/svg", "svg");
|
397
|
+
break;
|
398
|
+
case GUMBO_NAMESPACE_MATHML:
|
399
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1998/Math/MathML", "math");
|
400
|
+
break;
|
401
|
+
}
|
402
|
+
if (ns != NIL)
|
403
|
+
xmlSetNs(xml_child, ns);
|
404
|
+
xmlAddChild(xml_node, xml_child);
|
405
|
+
|
406
|
+
// Add the attributes.
|
407
|
+
const GumboVector* attrs = &gumbo_child->v.element.attributes;
|
408
|
+
for (size_t i=0; i < attrs->length; i++) {
|
409
|
+
const GumboAttribute *attr = attrs->data[i];
|
410
|
+
|
411
|
+
switch (attr->attr_namespace) {
|
412
|
+
case GUMBO_ATTR_NAMESPACE_XLINK:
|
413
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1999/xlink", "xlink");
|
414
|
+
break;
|
415
|
+
|
416
|
+
case GUMBO_ATTR_NAMESPACE_XML:
|
417
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/XML/1998/namespace", "xml");
|
418
|
+
break;
|
419
|
+
|
420
|
+
case GUMBO_ATTR_NAMESPACE_XMLNS:
|
421
|
+
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/xmlns/", "xmlns");
|
422
|
+
break;
|
423
|
+
|
424
|
+
default:
|
425
|
+
ns = NIL;
|
426
|
+
}
|
427
|
+
xmlNewNsProp(xml_child, ns, BAD_CAST attr->name, BAD_CAST attr->value);
|
428
|
+
}
|
429
|
+
|
430
|
+
// Add children for this element.
|
431
|
+
child_index = 0;
|
432
|
+
gumbo_node = gumbo_child;
|
433
|
+
xml_node = xml_child;
|
434
|
+
}
|
324
435
|
}
|
325
436
|
}
|
437
|
+
}
|
326
438
|
|
327
|
-
|
439
|
+
// Convert the parse errors recorded in `output` into Nokogiri syntax error
// objects and store them on `rdoc` as the `@errors` instance variable.
//
// output: finished gumbo parse whose `errors` vector is read.
// rdoc:   Ruby object that receives `@errors`; left untouched when the parse
//         produced no errors.
// input:  Ruby String holding the text that was parsed; used to render the
//         caret diagnostic for each error.
// url:    value stored verbatim in each error's `@file`.
static void add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url) {
  const char *source_text = RSTRING_PTR(input);
  size_t source_len = RSTRING_LEN(input);
  const GumboVector *error_list = &output->errors;

  // Nothing to report: do not create @errors at all.
  if (!error_list->length) {
    return;
  }

  VALUE ruby_errors = rb_ary_new2(error_list->length);
  for (size_t idx = 0; idx < error_list->length; idx++) {
    GumboError *gumbo_err = error_list->data[idx];
    GumboSourcePosition where = gumbo_error_position(gumbo_err);

    // The diagnostic text is handed back in `diagnostic`; copy it into a
    // Ruby string and release the C buffer straight away.
    char *diagnostic;
    size_t diagnostic_len =
      gumbo_caret_diagnostic_to_string(gumbo_err, source_text, source_len, &diagnostic);
    VALUE message = rb_utf8_str_new(diagnostic, diagnostic_len);
    free(diagnostic);

    VALUE syntax_error = rb_class_new_instance(1, &message, cNokogiriXmlSyntaxError);

    // The error code, when present, is exposed through @str1.
    const char *code_name = gumbo_error_code(gumbo_err);
    VALUE code_value = Qnil;
    if (code_name) {
      code_value = rb_utf8_str_new_static(code_name, strlen(code_name));
    }

    rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
    rb_iv_set(syntax_error, "@code", INT2NUM(1));   // XML_ERR_INTERNAL_ERROR
    rb_iv_set(syntax_error, "@level", INT2NUM(2));  // XML_ERR_ERROR
    rb_iv_set(syntax_error, "@file", url);
    rb_iv_set(syntax_error, "@line", INT2NUM(where.line));
    rb_iv_set(syntax_error, "@str1", code_value);
    rb_iv_set(syntax_error, "@str2", Qnil);
    rb_iv_set(syntax_error, "@str3", Qnil);
    rb_iv_set(syntax_error, "@int1", INT2NUM(0));
    rb_iv_set(syntax_error, "@column", INT2NUM(where.column));
    rb_ary_push(ruby_errors, syntax_error);
  }
  rb_iv_set(rdoc, "@errors", ruby_errors);
}
|
473
|
+
|
474
|
+
typedef struct {
|
475
|
+
GumboOutput *output;
|
476
|
+
VALUE input;
|
477
|
+
VALUE url_or_frag;
|
478
|
+
xmlDocPtr doc;
|
479
|
+
} ParseArgs;
|
480
|
+
|
481
|
+
// Cleanup half of the rb_ensure pairs used by parse() and fragment().
// Always destroys the gumbo output; additionally frees args->doc when it is
// still set (the *_continue functions reset it to NIL once the document must
// no longer be freed here). Always returns Qnil.
static VALUE parse_cleanup(ParseArgs *args) {
  xmlDocPtr pending_doc = args->doc;

  gumbo_destroy_output(args->output);
  if (pending_doc != NIL) {
    xmlFreeDoc(pending_doc);
  }
  return Qnil;
}
|
487
|
+
|
488
|
+
|
489
|
+
static VALUE parse_continue(ParseArgs *args);
|
490
|
+
|
491
|
+
// Parse a string using gumbo_parse into a Nokogiri document
|
492
|
+
static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_errors, VALUE max_depth) {
|
493
|
+
GumboOptions options = kGumboDefaultOptions;
|
494
|
+
options.max_errors = NUM2INT(max_errors);
|
495
|
+
options.max_tree_depth = NUM2INT(max_depth);
|
357
496
|
|
358
|
-
|
497
|
+
GumboOutput *output = perform_parse(&options, input);
|
498
|
+
ParseArgs args = {
|
499
|
+
.output = output,
|
500
|
+
.input = input,
|
501
|
+
.url_or_frag = url,
|
502
|
+
.doc = NIL,
|
503
|
+
};
|
504
|
+
return rb_ensure(parse_continue, (VALUE)&args, parse_cleanup, (VALUE)&args);
|
505
|
+
}
|
359
506
|
|
507
|
+
// Body half of parse(): build the document tree from args->output, wrap it
// in a Ruby object, attach any parse errors and return the wrapped document.
//
// args->doc is set while this function still owns the freshly created
// document (so parse_cleanup frees it if an exception escapes) and is reset
// to NIL once the document has been wrapped.
static VALUE parse_continue(ParseArgs *args) {
  GumboOutput *output = args->output;
  const char *doctype_name = NULL;
  const char *public_id = NULL;
  const char *system_id = NULL;

  if (output->document->v.document.has_doctype) {
    doctype_name = output->document->v.document.name;
    public_id = output->document->v.document.public_identifier;
    system_id = output->document->v.document.system_identifier;
    // Empty identifiers are passed on as NULL.
    if (!public_id[0]) {
      public_id = NULL;
    }
    if (!system_id[0]) {
      system_id = NULL;
    }
  }

  xmlDocPtr doc = new_html_doc(doctype_name, system_id, public_id);
  args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
  build_tree(doc, (xmlNodePtr)doc, output->document);

  VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);
  args->doc = NIL; // The Ruby runtime now owns doc so don't delete it.

  add_errors(output, rdoc, args->input, args->url_or_frag);
  return rdoc;
}
|
362
527
|
|
363
|
-
|
528
|
+
// Map the namespace of a Ruby node object to a gumbo namespace constant.
//
// node:             Ruby object responding to `namespace`; the returned
//                   namespace object (when not nil) must respond to `href`
//                   and return a String.
// require_known_ns: when true, an unrecognised namespace URI raises
//                   ArgumentError instead of returning -1.
//
// Returns GUMBO_NAMESPACE_HTML when the node has no namespace or the XHTML
// namespace, GUMBO_NAMESPACE_MATHML / GUMBO_NAMESPACE_SVG for those URIs,
// and -1 for any other URI when require_known_ns is false.
static int lookup_namespace(VALUE node, bool require_known_ns) {
  ID namespace, href;
  CONST_ID(namespace, "namespace");
  CONST_ID(href, "href");
  VALUE ns = rb_funcall(node, namespace, 0);

  if (NIL_P(ns))
    return GUMBO_NAMESPACE_HTML;
  ns = rb_funcall(ns, href, 0);
  assert(RTEST(ns));
  Check_Type(ns, T_STRING);

  const char *href_ptr = RSTRING_PTR(ns);
  size_t href_len = RSTRING_LEN(ns);
  // Exact, length-checked comparison against a string literal.
#define NAMESPACE_P(uri) (href_len == sizeof(uri) - 1 && !memcmp(href_ptr, uri, href_len))
  if (NAMESPACE_P("http://www.w3.org/1999/xhtml"))
    return GUMBO_NAMESPACE_HTML;
  if (NAMESPACE_P("http://www.w3.org/1998/Math/MathML"))
    return GUMBO_NAMESPACE_MATHML;
  if (NAMESPACE_P("http://www.w3.org/2000/svg"))
    return GUMBO_NAMESPACE_SVG;
#undef NAMESPACE_P
  if (require_known_ns) {
    // "%.*s" (precision) limits the output to href_len bytes; the previous
    // "%*s" only set a minimum field width and did not bound the read.
    rb_raise(rb_eArgError, "Unexpected namespace URI \"%.*s\"", (int)href_len, href_ptr);
  }
  return -1;
}
|
554
|
+
|
555
|
+
// Turn a Ruby node object into the xmlNodePtr used by the tree-building code.
// When NGLIB is false the Ruby value itself is returned unchanged; otherwise
// the xmlNode struct wrapped by the Ruby object is fetched with
// Data_Get_Struct.
static xmlNodePtr extract_xml_node(VALUE node) {
#if !NGLIB
  return node;
#else
  xmlNodePtr wrapped_node;
  Data_Get_Struct(node, xmlNode, wrapped_node);
  return wrapped_node;
#endif
}
|
564
|
+
|
565
|
+
static VALUE fragment_continue(ParseArgs *args);
|
566
|
+
|
567
|
+
// Parse `tags` as an HTML fragment and add the result to `doc_fragment`.
//
// doc_fragment: Ruby document fragment; must respond to `document`.
// tags:         the markup to parse (handed to perform_parse).
// ctx:          fragment context. One of:
//                 nil     - the context element is an HTML "body";
//                 String  - "[prefix:]tag", where prefix is one of "svg",
//                           "html" or "math" (case-insensitive). A bare
//                           "svg" or "math" is placed in its own namespace;
//                 other   - a node object responding to `name`, `namespace`
//                           and `element?`.
// max_errors:   Integer copied into GumboOptions.max_errors.
// max_depth:    Integer tree depth limit; negative values are passed on
//               as -1, otherwise one is added for the HTML element.
//
// Raises ArgumentError when a String context carries an unknown namespace
// prefix, or when a node context lives in an unknown namespace.
// Always returns Qnil.
static VALUE fragment (
  VALUE self,
  VALUE doc_fragment,
  VALUE tags,
  VALUE ctx,
  VALUE max_errors,
  VALUE max_depth
) {
  ID name = rb_intern_const("name");
  const char *ctx_tag;
  GumboNamespaceEnum ctx_ns;
  GumboQuirksModeEnum quirks_mode;
  bool form = false;
  const char *encoding = NULL;

  if (NIL_P(ctx)) {
    ctx_tag = "body";
    ctx_ns = GUMBO_NAMESPACE_HTML;
  } else if (TYPE(ctx) == T_STRING) {
    ctx_tag = StringValueCStr(ctx);
    ctx_ns = GUMBO_NAMESPACE_HTML;
    size_t len = RSTRING_LEN(ctx);
    const char *colon = memchr(ctx_tag, ':', len);
    if (colon) {
      // The prefix length selects which namespace names are candidates.
      switch (colon - ctx_tag) {
      case 3:
        if (st_strncasecmp(ctx_tag, "svg", 3) != 0)
          goto error;
        ctx_ns = GUMBO_NAMESPACE_SVG;
        break;
      case 4:
        if (st_strncasecmp(ctx_tag, "html", 4) == 0)
          ctx_ns = GUMBO_NAMESPACE_HTML;
        else if (st_strncasecmp(ctx_tag, "math", 4) == 0)
          ctx_ns = GUMBO_NAMESPACE_MATHML;
        else
          goto error;
        break;
      default:
      error:
        // "%.*s" prints only the prefix before the colon. The previous
        // "%*s" treated the length as a minimum width and printed the whole
        // "prefix:tag" string.
        rb_raise(rb_eArgError, "Invalid context namespace '%.*s'", (int)(colon - ctx_tag), ctx_tag);
      }
      // Skip past "prefix:" so ctx_tag is the bare tag name.
      ctx_tag = colon+1;
    } else {
      // For convenience, put 'svg' and 'math' in their namespaces.
      if (len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0)
        ctx_ns = GUMBO_NAMESPACE_SVG;
      else if (len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0)
        ctx_ns = GUMBO_NAMESPACE_MATHML;
    }

    // Check if it's a form.
    form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0;
  } else {
    ID element_ = rb_intern_const("element?");

    // Context fragment name.
    VALUE tag_name = rb_funcall(ctx, name, 0);
    assert(RTEST(tag_name));
    Check_Type(tag_name, T_STRING);
    ctx_tag = StringValueCStr(tag_name);

    // Context fragment namespace.
    ctx_ns = lookup_namespace(ctx, true);

    // Check for a form ancestor, including self.
    for (VALUE node = ctx;
         !NIL_P(node);
         node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) {
      if (!RTEST(rb_funcall(node, element_, 0)))
        continue;
      VALUE element_name = rb_funcall(node, name, 0);
      if (RSTRING_LEN(element_name) == 4
          && !st_strcasecmp(RSTRING_PTR(element_name), "form")
          && lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) {
        form = true;
        break;
      }
    }

    // Encoding: only read from an "annotation-xml" context element.
    if (RSTRING_LEN(tag_name) == 14
        && !st_strcasecmp(ctx_tag, "annotation-xml")) {
      VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
                             rb_utf8_str_new_static("encoding", 8));
      if (RTEST(enc)) {
        Check_Type(enc, T_STRING);
        encoding = StringValueCStr(enc);
      }
    }
  }

  // Quirks mode: derived from the owning document's internal subset.
  VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
  VALUE dtd = rb_funcall(doc, internal_subset, 0);
  if (NIL_P(dtd)) {
    quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
  } else {
    VALUE dtd_name = rb_funcall(dtd, name, 0);
    VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
    VALUE sysid = rb_funcall(dtd, rb_intern_const("system_id"), 0);
    quirks_mode = gumbo_compute_quirks_mode (
      NIL_P(dtd_name)? NULL:StringValueCStr(dtd_name),
      NIL_P(pubid)? NULL:StringValueCStr(pubid),
      NIL_P(sysid)? NULL:StringValueCStr(sysid)
    );
  }

  // Perform a fragment parse.
  int depth = NUM2INT(max_depth);
  GumboOptions options = kGumboDefaultOptions;
  options.max_errors = NUM2INT(max_errors);
  // Add one to account for the HTML element.
  options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
  options.fragment_context = ctx_tag;
  options.fragment_namespace = ctx_ns;
  options.fragment_encoding = encoding;
  options.quirks_mode = quirks_mode;
  options.fragment_context_has_form_ancestor = form;

  GumboOutput *output = perform_parse(&options, tags);
  ParseArgs args = {
    .output = output,
    .input = tags,
    .url_or_frag = doc_fragment,
    .doc = (xmlDocPtr)extract_xml_node(doc),
  };
  // fragment_continue builds the tree; parse_cleanup always releases output.
  rb_ensure(fragment_continue, (VALUE)&args, parse_cleanup, (VALUE)&args);
  return Qnil;
}
|
697
|
+
|
698
|
+
// Body half of fragment(): build the parsed nodes under the document
// fragment stored in args->url_or_frag and attach any parse errors to it.
// Always returns Qnil.
static VALUE fragment_continue(ParseArgs *args) {
  xmlDocPtr owning_doc = args->doc;
  VALUE frag = args->url_or_frag;
  GumboOutput *parsed = args->output;

  // The Ruby runtime owns doc so make sure parse_cleanup doesn't delete it.
  args->doc = NIL;

  xmlNodePtr frag_node = extract_xml_node(frag);
  build_tree(owning_doc, frag_node, parsed->root);

  VALUE error_file = rb_utf8_str_new_static("#fragment", 9);
  add_errors(parsed, frag, args->input, error_file);
  return Qnil;
}
|
709
|
+
|
710
|
+
// Initialize the Nokogumbo class and fetch constants we will use later.
|
364
711
|
void Init_nokogumbo() {
|
365
|
-
rb_funcall(rb_mKernel, rb_intern("gem"), 1,
|
712
|
+
rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_utf8_str_new_static("nokogiri", 8));
|
366
713
|
rb_require("nokogiri");
|
367
714
|
|
368
|
-
// class constants
|
369
|
-
VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
370
|
-
VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
|
371
|
-
Document = rb_const_get(HTML5, rb_intern("Document"));
|
372
|
-
|
373
715
|
#ifndef NGLIB
|
374
|
-
//
|
375
|
-
VALUE
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
remove_attribute = rb_intern("remove_attribute");
|
387
|
-
add_child = rb_intern("add_child_node_and_reparent_attrs");
|
388
|
-
internal_subset = rb_intern("internal_subset");
|
389
|
-
remove_ = rb_intern("remove");
|
390
|
-
create_internal_subset = rb_intern("create_internal_subset");
|
391
|
-
key_ = rb_intern("key?");
|
392
|
-
node_name_ = rb_intern("node_name=");
|
716
|
+
// Class constants.
|
717
|
+
VALUE mNokogiri = rb_const_get(rb_cObject, rb_intern_const("Nokogiri"));
|
718
|
+
VALUE mNokogiriXml = rb_const_get(mNokogiri, rb_intern_const("XML"));
|
719
|
+
cNokogiriXmlSyntaxError = rb_const_get(mNokogiriXml, rb_intern_const("SyntaxError"));
|
720
|
+
cNokogiriXmlElement = rb_const_get(mNokogiriXml, rb_intern_const("Element"));
|
721
|
+
cNokogiriXmlText = rb_const_get(mNokogiriXml, rb_intern_const("Text"));
|
722
|
+
cNokogiriXmlCData = rb_const_get(mNokogiriXml, rb_intern_const("CDATA"));
|
723
|
+
cNokogiriXmlComment = rb_const_get(mNokogiriXml, rb_intern_const("Comment"));
|
724
|
+
|
725
|
+
// Interned symbols.
|
726
|
+
new = rb_intern_const("new");
|
727
|
+
node_name_ = rb_intern_const("node_name=");
|
393
728
|
#endif
|
394
729
|
|
395
|
-
//
|
730
|
+
// Class constants.
|
731
|
+
VALUE HTML5 = rb_const_get(mNokogiri, rb_intern_const("HTML5"));
|
732
|
+
Document = rb_const_get(HTML5, rb_intern_const("Document"));
|
733
|
+
|
734
|
+
// Interned symbols.
|
735
|
+
internal_subset = rb_intern_const("internal_subset");
|
736
|
+
parent = rb_intern_const("parent");
|
737
|
+
|
738
|
+
// Define Nokogumbo module with parse and fragment methods.
|
396
739
|
VALUE Gumbo = rb_define_module("Nokogumbo");
|
397
740
|
rb_define_singleton_method(Gumbo, "parse", parse, 4);
|
741
|
+
rb_define_singleton_method(Gumbo, "fragment", fragment, 5);
|
398
742
|
}
|
743
|
+
|
744
|
+
// vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|