nokogumbo 1.4.13 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +1 -1
- data/ext/nokogumboc/nokogumbo.c +9 -6
- data/lib/nokogumbo.rb +7 -7
- data/test-nokogumbo.rb +30 -4
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 96fa61565f78d5491e0b6d5b505cf936524745eb848b8b6584fc15e20c7ae35b
|
4
|
+
data.tar.gz: e5416f71bbe90323f04b8aad4dc48b28947e43a9eb46f446f8ca1444f519a07b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 676bf3585d38cd4ad5c72b8b3afd4952e248c747683ae1072dd43f6ce1ccd279177e4d0c75a9821ed76d32806333128152231349d8d113ae5d81279580b13004
|
7
|
+
data.tar.gz: 3459078d96977399e75551c4a3ee5623091f48569984b771e540ec111125f5af91e39a8d78cbd3ce9280326b1b9395dc4a0b0d7f0a72294876682cb9fe35e3d9
|
data/README.md
CHANGED
data/ext/nokogumboc/nokogumbo.c
CHANGED
@@ -184,11 +184,14 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
|
|
184
184
|
}
|
185
185
|
|
186
186
|
// Parse a string using gumbo_parse into a Nokogiri document
|
187
|
-
static VALUE parse(VALUE self, VALUE string) {
|
188
|
-
|
187
|
+
static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
|
188
|
+
GumboOptions options;
|
189
|
+
memcpy(&options, &kGumboDefaultOptions, sizeof options);
|
190
|
+
options.max_errors = NUM2INT(max_parse_errors);
|
191
|
+
|
189
192
|
const char *input = RSTRING_PTR(string);
|
190
193
|
size_t input_len = RSTRING_LEN(string);
|
191
|
-
GumboOutput *output = gumbo_parse_with_options(options, input, input_len);
|
194
|
+
GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
|
192
195
|
xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
|
193
196
|
#ifdef NGLIB
|
194
197
|
doc->type = XML_HTML_DOCUMENT_NODE;
|
@@ -219,7 +222,7 @@ static VALUE parse(VALUE self, VALUE string) {
|
|
219
222
|
// Add parse errors to rdoc.
|
220
223
|
if (output->errors.length) {
|
221
224
|
GumboVector *errors = &output->errors;
|
222
|
-
GumboParser parser = { ._options = options };
|
225
|
+
GumboParser parser = { ._options = &options };
|
223
226
|
GumboStringBuffer msg;
|
224
227
|
VALUE rerrors = rb_ary_new2(errors->length);
|
225
228
|
|
@@ -253,7 +256,7 @@ static VALUE parse(VALUE self, VALUE string) {
|
|
253
256
|
gumbo_string_buffer_destroy(&parser, &msg);
|
254
257
|
}
|
255
258
|
|
256
|
-
gumbo_destroy_output(options, output);
|
259
|
+
gumbo_destroy_output(&options, output);
|
257
260
|
|
258
261
|
return rdoc;
|
259
262
|
}
|
@@ -288,5 +291,5 @@ void Init_nokogumboc() {
|
|
288
291
|
|
289
292
|
// define Nokogumbo class with a singleton parse method
|
290
293
|
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
|
291
|
-
rb_define_singleton_method(Gumbo, "parse", parse, 1);
|
294
|
+
rb_define_singleton_method(Gumbo, "parse", parse, 2);
|
292
295
|
}
|
data/lib/nokogumbo.rb
CHANGED
@@ -4,14 +4,14 @@ require 'nokogumboc'
|
|
4
4
|
module Nokogiri
|
5
5
|
# Parse an HTML document. +string+ contains the document. +string+
|
6
6
|
# may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
|
7
|
-
def self.HTML5(string)
|
8
|
-
Nokogiri::HTML5.parse(string)
|
7
|
+
def self.HTML5(*args)
|
8
|
+
Nokogiri::HTML5.parse(*args)
|
9
9
|
end
|
10
10
|
|
11
11
|
module HTML5
|
12
12
|
# Parse an HTML document. +string+ contains the document. +string+
|
13
13
|
# may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
|
14
|
-
def self.parse(string)
|
14
|
+
def self.parse(string, options={})
|
15
15
|
if string.respond_to? :read
|
16
16
|
string = string.read
|
17
17
|
end
|
@@ -21,7 +21,7 @@ module Nokogiri
|
|
21
21
|
string = reencode(string)
|
22
22
|
end
|
23
23
|
|
24
|
-
Nokogumbo.parse(string.to_s)
|
24
|
+
Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 0)
|
25
25
|
end
|
26
26
|
|
27
27
|
# Fetch and parse a HTML document from the web, following redirects,
|
@@ -67,7 +67,7 @@ module Nokogiri
|
|
67
67
|
|
68
68
|
case response
|
69
69
|
when Net::HTTPSuccess
|
70
|
-
doc = parse(reencode(response.body, response['content-type']))
|
70
|
+
doc = parse(reencode(response.body, response['content-type']), options)
|
71
71
|
doc.instance_variable_set('@response', response)
|
72
72
|
doc.class.send(:attr_reader, :response)
|
73
73
|
doc
|
@@ -83,8 +83,8 @@ module Nokogiri
|
|
83
83
|
# while fragment is on the Gumbo TODO list, simulate it by doing
|
84
84
|
# a full document parse and ignoring the parent <html>, <head>, and <body>
|
85
85
|
# tags, and collecting up the children of each.
|
86
|
-
def self.fragment(string)
|
87
|
-
doc = parse(string)
|
86
|
+
def self.fragment(*args)
|
87
|
+
doc = parse(*args)
|
88
88
|
fragment = Nokogiri::HTML::DocumentFragment.new(doc)
|
89
89
|
|
90
90
|
if doc.children.length != 1 or doc.children.first.name != 'html'
|
data/test-nokogumbo.rb
CHANGED
@@ -78,7 +78,7 @@ class TestNokogumbo < Minitest::Test
|
|
78
78
|
end
|
79
79
|
|
80
80
|
def test_html5_doctype
|
81
|
-
doc =
|
81
|
+
doc = Nokogiri::HTML5.parse("<!DOCTYPE html><html></html>")
|
82
82
|
assert_match /<!DOCTYPE html>/, doc.to_html
|
83
83
|
end
|
84
84
|
|
@@ -126,17 +126,43 @@ class TestNokogumbo < Minitest::Test
|
|
126
126
|
end
|
127
127
|
|
128
128
|
def test_parse_errors
|
129
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>")
|
129
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 10)
|
130
130
|
assert_equal doc.errors.length, 2
|
131
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html>")
|
131
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 10)
|
132
132
|
assert_empty doc.errors
|
133
133
|
end
|
134
134
|
|
135
|
+
def test_max_parse_errors
|
136
|
+
# This document contains 2 parse errors, but we force limit to 1.
|
137
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 1)
|
138
|
+
assert_equal 1, doc.errors.length
|
139
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 1)
|
140
|
+
assert_empty doc.errors
|
141
|
+
end
|
142
|
+
|
143
|
+
def test_default_max_parse_errors
|
144
|
+
# This document contains 200 parse errors, but default limit is 0.
|
145
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html>" + "</p>" * 200)
|
146
|
+
assert_equal 0, doc.errors.length
|
147
|
+
end
|
148
|
+
|
135
149
|
def test_parse_fragment_errors
|
136
|
-
doc = Nokogiri::HTML5.fragment("<\r\n")
|
150
|
+
doc = Nokogiri::HTML5.fragment("<\r\n", max_parse_errors: 10)
|
137
151
|
refute_empty doc.errors
|
138
152
|
end
|
139
153
|
|
154
|
+
def test_fragment_max_parse_errors
|
155
|
+
# This fragment contains 3 parse errors, but we force limit to 1.
|
156
|
+
doc = Nokogiri::HTML5.fragment("<!-- -- --></a>", max_parse_errors: 1)
|
157
|
+
assert_equal 1, doc.errors.length
|
158
|
+
end
|
159
|
+
|
160
|
+
def test_fragment_default_max_parse_errors
|
161
|
+
# This fragment contains 201 parse errors, but default limit is 0.
|
162
|
+
doc = Nokogiri::HTML5.fragment("</p>" * 200)
|
163
|
+
assert_equal 0, doc.errors.length
|
164
|
+
end
|
165
|
+
|
140
166
|
private
|
141
167
|
|
142
168
|
def buffer
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.13
|
4
|
+
version: 1.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Ruby
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -90,7 +90,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
90
90
|
version: '0'
|
91
91
|
requirements: []
|
92
92
|
rubyforge_project:
|
93
|
-
rubygems_version: 2.
|
93
|
+
rubygems_version: 2.7.4
|
94
94
|
signing_key:
|
95
95
|
specification_version: 4
|
96
96
|
summary: Nokogiri interface to the Gumbo HTML5 parser
|