nokogumbo 1.4.13 → 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +1 -1
- data/ext/nokogumboc/nokogumbo.c +9 -6
- data/lib/nokogumbo.rb +7 -7
- data/test-nokogumbo.rb +30 -4
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 96fa61565f78d5491e0b6d5b505cf936524745eb848b8b6584fc15e20c7ae35b
|
4
|
+
data.tar.gz: e5416f71bbe90323f04b8aad4dc48b28947e43a9eb46f446f8ca1444f519a07b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 676bf3585d38cd4ad5c72b8b3afd4952e248c747683ae1072dd43f6ce1ccd279177e4d0c75a9821ed76d32806333128152231349d8d113ae5d81279580b13004
|
7
|
+
data.tar.gz: 3459078d96977399e75551c4a3ee5623091f48569984b771e540ec111125f5af91e39a8d78cbd3ce9280326b1b9395dc4a0b0d7f0a72294876682cb9fe35e3d9
|
data/README.md
CHANGED
data/ext/nokogumboc/nokogumbo.c
CHANGED
@@ -184,11 +184,14 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
|
|
184
184
|
}
|
185
185
|
|
186
186
|
// Parse a string using gumbo_parse into a Nokogiri document
|
187
|
-
static VALUE parse(VALUE self, VALUE string) {
|
188
|
-
|
187
|
+
static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
|
188
|
+
GumboOptions options;
|
189
|
+
memcpy(&options, &kGumboDefaultOptions, sizeof options);
|
190
|
+
options.max_errors = NUM2INT(max_parse_errors);
|
191
|
+
|
189
192
|
const char *input = RSTRING_PTR(string);
|
190
193
|
size_t input_len = RSTRING_LEN(string);
|
191
|
-
GumboOutput *output = gumbo_parse_with_options(options, input, input_len);
|
194
|
+
GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
|
192
195
|
xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
|
193
196
|
#ifdef NGLIB
|
194
197
|
doc->type = XML_HTML_DOCUMENT_NODE;
|
@@ -219,7 +222,7 @@ static VALUE parse(VALUE self, VALUE string) {
|
|
219
222
|
// Add parse errors to rdoc.
|
220
223
|
if (output->errors.length) {
|
221
224
|
GumboVector *errors = &output->errors;
|
222
|
-
GumboParser parser = { ._options = options };
|
225
|
+
GumboParser parser = { ._options = &options };
|
223
226
|
GumboStringBuffer msg;
|
224
227
|
VALUE rerrors = rb_ary_new2(errors->length);
|
225
228
|
|
@@ -253,7 +256,7 @@ static VALUE parse(VALUE self, VALUE string) {
|
|
253
256
|
gumbo_string_buffer_destroy(&parser, &msg);
|
254
257
|
}
|
255
258
|
|
256
|
-
gumbo_destroy_output(options, output);
|
259
|
+
gumbo_destroy_output(&options, output);
|
257
260
|
|
258
261
|
return rdoc;
|
259
262
|
}
|
@@ -288,5 +291,5 @@ void Init_nokogumboc() {
|
|
288
291
|
|
289
292
|
// define Nokogumbo class with a singleton parse method
|
290
293
|
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
|
291
|
-
rb_define_singleton_method(Gumbo, "parse", parse, 1);
|
294
|
+
rb_define_singleton_method(Gumbo, "parse", parse, 2);
|
292
295
|
}
|
data/lib/nokogumbo.rb
CHANGED
@@ -4,14 +4,14 @@ require 'nokogumboc'
|
|
4
4
|
module Nokogiri
|
5
5
|
# Parse an HTML document. +string+ contains the document. +string+
|
6
6
|
# may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
|
7
|
-
def self.HTML5(string)
|
8
|
-
Nokogiri::HTML5.parse(string)
|
7
|
+
def self.HTML5(*args)
|
8
|
+
Nokogiri::HTML5.parse(*args)
|
9
9
|
end
|
10
10
|
|
11
11
|
module HTML5
|
12
12
|
# Parse an HTML document. +string+ contains the document. +string+
|
13
13
|
# may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
|
14
|
-
def self.parse(string)
|
14
|
+
def self.parse(string, options={})
|
15
15
|
if string.respond_to? :read
|
16
16
|
string = string.read
|
17
17
|
end
|
@@ -21,7 +21,7 @@ module Nokogiri
|
|
21
21
|
string = reencode(string)
|
22
22
|
end
|
23
23
|
|
24
|
-
Nokogumbo.parse(string.to_s)
|
24
|
+
Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 0)
|
25
25
|
end
|
26
26
|
|
27
27
|
# Fetch and parse a HTML document from the web, following redirects,
|
@@ -67,7 +67,7 @@ module Nokogiri
|
|
67
67
|
|
68
68
|
case response
|
69
69
|
when Net::HTTPSuccess
|
70
|
-
doc = parse(reencode(response.body, response['content-type']))
|
70
|
+
doc = parse(reencode(response.body, response['content-type']), options)
|
71
71
|
doc.instance_variable_set('@response', response)
|
72
72
|
doc.class.send(:attr_reader, :response)
|
73
73
|
doc
|
@@ -83,8 +83,8 @@ module Nokogiri
|
|
83
83
|
# while fragment is on the Gumbo TODO list, simulate it by doing
|
84
84
|
# a full document parse and ignoring the parent <html>, <head>, and <body>
|
85
85
|
# tags, and collecting up the children of each.
|
86
|
-
def self.fragment(string)
|
87
|
-
doc = parse(string)
|
86
|
+
def self.fragment(*args)
|
87
|
+
doc = parse(*args)
|
88
88
|
fragment = Nokogiri::HTML::DocumentFragment.new(doc)
|
89
89
|
|
90
90
|
if doc.children.length != 1 or doc.children.first.name != 'html'
|
data/test-nokogumbo.rb
CHANGED
@@ -78,7 +78,7 @@ class TestNokogumbo < Minitest::Test
|
|
78
78
|
end
|
79
79
|
|
80
80
|
def test_html5_doctype
|
81
|
-
doc =
|
81
|
+
doc = Nokogiri::HTML5.parse("<!DOCTYPE html><html></html>")
|
82
82
|
assert_match /<!DOCTYPE html>/, doc.to_html
|
83
83
|
end
|
84
84
|
|
@@ -126,17 +126,43 @@ class TestNokogumbo < Minitest::Test
|
|
126
126
|
end
|
127
127
|
|
128
128
|
def test_parse_errors
|
129
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>")
|
129
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 10)
|
130
130
|
assert_equal doc.errors.length, 2
|
131
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html>")
|
131
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 10)
|
132
132
|
assert_empty doc.errors
|
133
133
|
end
|
134
134
|
|
135
|
+
def test_max_parse_errors
|
136
|
+
# This document contains 2 parse errors, but we force limit to 1.
|
137
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 1)
|
138
|
+
assert_equal 1, doc.errors.length
|
139
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 1)
|
140
|
+
assert_empty doc.errors
|
141
|
+
end
|
142
|
+
|
143
|
+
def test_default_max_parse_errors
|
144
|
+
# This document contains 200 parse errors, but default limit is 0.
|
145
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html>" + "</p>" * 200)
|
146
|
+
assert_equal 0, doc.errors.length
|
147
|
+
end
|
148
|
+
|
135
149
|
def test_parse_fragment_errors
|
136
|
-
doc = Nokogiri::HTML5.fragment("<\r\n")
|
150
|
+
doc = Nokogiri::HTML5.fragment("<\r\n", max_parse_errors: 10)
|
137
151
|
refute_empty doc.errors
|
138
152
|
end
|
139
153
|
|
154
|
+
def test_fragment_max_parse_errors
|
155
|
+
# This fragment contains 3 parse errors, but we force limit to 1.
|
156
|
+
doc = Nokogiri::HTML5.fragment("<!-- -- --></a>", max_parse_errors: 1)
|
157
|
+
assert_equal 1, doc.errors.length
|
158
|
+
end
|
159
|
+
|
160
|
+
def test_fragment_default_max_parse_errors
|
161
|
+
# This fragment contains 201 parse errors, but default limit is 0.
|
162
|
+
doc = Nokogiri::HTML5.fragment("</p>" * 200)
|
163
|
+
assert_equal 0, doc.errors.length
|
164
|
+
end
|
165
|
+
|
140
166
|
private
|
141
167
|
|
142
168
|
def buffer
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.13
|
4
|
+
version: 1.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Ruby
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -90,7 +90,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
90
90
|
version: '0'
|
91
91
|
requirements: []
|
92
92
|
rubyforge_project:
|
93
|
-
rubygems_version: 2.
|
93
|
+
rubygems_version: 2.7.4
|
94
94
|
signing_key:
|
95
95
|
specification_version: 4
|
96
96
|
summary: Nokogiri interface to the Gumbo HTML5 parser
|