nokogiri 1.11.0.rc3 → 1.11.0.rc4
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/LICENSE-DEPENDENCIES.md +1015 -947
- data/README.md +1 -1
- data/ext/nokogiri/depend +476 -357
- data/ext/nokogiri/extconf.rb +441 -321
- data/ext/nokogiri/html_document.c +79 -78
- data/ext/nokogiri/html_sax_parser_context.c +2 -2
- data/ext/nokogiri/nokogiri.c +34 -46
- data/ext/nokogiri/nokogiri.h +22 -26
- data/ext/nokogiri/xml_document.c +2 -2
- data/ext/nokogiri/xml_node.c +1 -1
- data/ext/nokogiri/xml_node_set.c +1 -1
- data/ext/nokogiri/xml_relax_ng.c +29 -11
- data/ext/nokogiri/xml_sax_parser.c +2 -7
- data/ext/nokogiri/xml_sax_parser_context.c +2 -2
- data/ext/nokogiri/xml_schema.c +55 -13
- data/ext/nokogiri/xml_xpath_context.c +80 -4
- data/ext/nokogiri/xslt_stylesheet.c +1 -4
- data/lib/nokogiri.rb +1 -1
- data/lib/nokogiri/css/parser.rb +3 -3
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/xpath_visitor.rb +70 -42
- data/lib/nokogiri/html/document.rb +12 -26
- data/lib/nokogiri/version.rb +2 -149
- data/lib/nokogiri/version/constant.rb +5 -0
- data/lib/nokogiri/version/info.rb +182 -0
- data/lib/nokogiri/xml/document.rb +17 -7
- data/lib/nokogiri/xml/document_fragment.rb +4 -6
- data/lib/nokogiri/xml/node.rb +50 -27
- data/lib/nokogiri/xml/parse_options.rb +6 -0
- data/lib/nokogiri/xml/relax_ng.rb +6 -2
- data/lib/nokogiri/xml/schema.rb +12 -4
- data/lib/nokogiri/xml/searchable.rb +3 -1
- data/patches/libxml2/0006-htmlParseComment-treat-as-if-it-closed-the-comment.patch +73 -0
- data/patches/libxml2/0007-use-new-htmlParseLookupCommentEnd-to-find-comment-en.patch +103 -0
- data/patches/libxml2/0008-use-glibc-strlen.patch +53 -0
- metadata +34 -22
@@ -210,8 +210,10 @@ module Nokogiri
|
|
210
210
|
end
|
211
211
|
|
212
212
|
def xpath_query_from_css_rule(rule, ns)
|
213
|
+
visitor = Nokogiri::CSS::XPathVisitorOptimallyUseBuiltins.new
|
213
214
|
self.class::IMPLIED_XPATH_CONTEXTS.map do |implied_xpath_context|
|
214
|
-
CSS.xpath_for(rule.to_s, :prefix => implied_xpath_context, :ns => ns)
|
215
|
+
CSS.xpath_for(rule.to_s, {:prefix => implied_xpath_context, :ns => ns,
|
216
|
+
:visitor => visitor})
|
215
217
|
end.join(" | ")
|
216
218
|
end
|
217
219
|
|
@@ -0,0 +1,73 @@
|
|
1
|
+
From 4f51a6d2b1755ce5b36c524c215aad70d864ac1d Mon Sep 17 00:00:00 2001
|
2
|
+
From: Mike Dalessio <mike.dalessio@gmail.com>
|
3
|
+
Date: Mon, 3 Aug 2020 17:36:05 -0400
|
4
|
+
Subject: [PATCH 1/2] htmlParseComment: treat `--!>` as if it closed the
|
5
|
+
comment
|
6
|
+
|
7
|
+
See guidance provided on incorrectly-closed comments here:
|
8
|
+
|
9
|
+
https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment
|
10
|
+
---
|
11
|
+
HTMLparser.c | 28 ++++++++++++++++++++--------
|
12
|
+
1 file changed, 20 insertions(+), 8 deletions(-)
|
13
|
+
|
14
|
+
diff --git a/HTMLparser.c b/HTMLparser.c
|
15
|
+
index 7b6d689..4d43479 100644
|
16
|
+
--- a/HTMLparser.c
|
17
|
+
+++ b/HTMLparser.c
|
18
|
+
@@ -3300,6 +3300,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
|
19
|
+
int q, ql;
|
20
|
+
int r, rl;
|
21
|
+
int cur, l;
|
22
|
+
+ int next, nl;
|
23
|
+
xmlParserInputState state;
|
24
|
+
|
25
|
+
/*
|
26
|
+
@@ -3332,6 +3333,21 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
|
27
|
+
while (IS_CHAR(cur) &&
|
28
|
+
((cur != '>') ||
|
29
|
+
(r != '-') || (q != '-'))) {
|
30
|
+
+ NEXTL(l);
|
31
|
+
+ next = CUR_CHAR(nl);
|
32
|
+
+ if (next == 0) {
|
33
|
+
+ SHRINK;
|
34
|
+
+ GROW;
|
35
|
+
+ next = CUR_CHAR(nl);
|
36
|
+
+ }
|
37
|
+
+
|
38
|
+
+ if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
|
39
|
+
+ htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
|
40
|
+
+ "Comment incorrectly closed by '--!>'", NULL, NULL);
|
41
|
+
+ cur = '>';
|
42
|
+
+ break;
|
43
|
+
+ }
|
44
|
+
+
|
45
|
+
if (len + 5 >= size) {
|
46
|
+
xmlChar *tmp;
|
47
|
+
|
48
|
+
@@ -3345,18 +3361,14 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
|
49
|
+
}
|
50
|
+
buf = tmp;
|
51
|
+
}
|
52
|
+
- COPY_BUF(ql,buf,len,q);
|
53
|
+
+ COPY_BUF(ql,buf,len,q);
|
54
|
+
+
|
55
|
+
q = r;
|
56
|
+
ql = rl;
|
57
|
+
r = cur;
|
58
|
+
rl = l;
|
59
|
+
- NEXTL(l);
|
60
|
+
- cur = CUR_CHAR(l);
|
61
|
+
- if (cur == 0) {
|
62
|
+
- SHRINK;
|
63
|
+
- GROW;
|
64
|
+
- cur = CUR_CHAR(l);
|
65
|
+
- }
|
66
|
+
+ cur = next;
|
67
|
+
+ l = nl;
|
68
|
+
}
|
69
|
+
buf[len] = 0;
|
70
|
+
if (IS_CHAR(cur)) {
|
71
|
+
--
|
72
|
+
2.25.1
|
73
|
+
|
@@ -0,0 +1,103 @@
|
|
1
|
+
From b20d746fa7cbb74716171bc49d836af99927e41e Mon Sep 17 00:00:00 2001
|
2
|
+
From: Mike Dalessio <mike.dalessio@gmail.com>
|
3
|
+
Date: Sun, 11 Oct 2020 14:15:37 -0400
|
4
|
+
Subject: [PATCH 2/2] use new htmlParseLookupCommentEnd to find comment ends
|
5
|
+
|
6
|
+
Note that the caret in error messages generated during comment parsing
|
7
|
+
may have moved by one byte.
|
8
|
+
|
9
|
+
See guidance provided on incorrectly-closed comments here:
|
10
|
+
|
11
|
+
https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment
|
12
|
+
---
|
13
|
+
HTMLparser.c | 46 +++++++++++++++++++++++++++++++++++++---------
|
14
|
+
1 file changed, 37 insertions(+), 9 deletions(-)
|
15
|
+
|
16
|
+
diff --git a/HTMLparser.c b/HTMLparser.c
|
17
|
+
index 4d43479..000dc3d 100644
|
18
|
+
--- a/HTMLparser.c
|
19
|
+
+++ b/HTMLparser.c
|
20
|
+
@@ -5331,6 +5331,39 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
|
21
|
+
return (-1);
|
22
|
+
}
|
23
|
+
|
24
|
+
+/**
|
25
|
+
+ * htmlParseLookupCommentEnd:
|
26
|
+
+ * @ctxt: an HTML parser context
|
27
|
+
+ *
|
28
|
+
+ * Try to find a comment end tag in the input stream
|
29
|
+
+ * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
|
30
|
+
+ * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
|
31
|
+
+ * This function has a side effect of (possibly) incrementing ctxt->checkIndex
|
32
|
+
+ * to avoid rescanning sequences of bytes, it DOES change the state of the
|
33
|
+
+ * parser, do not use liberally.
|
34
|
+
+ * This wraps to htmlParseLookupSequence()
|
35
|
+
+ *
|
36
|
+
+ * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
|
37
|
+
+ */
|
38
|
+
+static int
|
39
|
+
+htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
|
40
|
+
+{
|
41
|
+
+ int mark = 0;
|
42
|
+
+ int cur = CUR_PTR - BASE_PTR;
|
43
|
+
+
|
44
|
+
+ while (mark >= 0) {
|
45
|
+
+ mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 1, 1);
|
46
|
+
+ if ((mark < 0) ||
|
47
|
+
+ (NXT(mark+2) == '>') ||
|
48
|
+
+ ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
|
49
|
+
+ return mark;
|
50
|
+
+ }
|
51
|
+
+ ctxt->checkIndex = cur + mark + 1;
|
52
|
+
+ }
|
53
|
+
+ return mark;
|
54
|
+
+}
|
55
|
+
+
|
56
|
+
+
|
57
|
+
/**
|
58
|
+
* htmlParseTryOrFinish:
|
59
|
+
* @ctxt: an HTML parser context
|
60
|
+
@@ -5507,8 +5540,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
61
|
+
cur = in->cur[0];
|
62
|
+
if ((cur == '<') && (next == '!') &&
|
63
|
+
(in->cur[2] == '-') && (in->cur[3] == '-')) {
|
64
|
+
- if ((!terminate) &&
|
65
|
+
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
|
66
|
+
+ if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
|
67
|
+
goto done;
|
68
|
+
#ifdef DEBUG_PUSH
|
69
|
+
xmlGenericError(xmlGenericErrorContext,
|
70
|
+
@@ -5567,8 +5599,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
71
|
+
next = in->cur[1];
|
72
|
+
if ((cur == '<') && (next == '!') &&
|
73
|
+
(in->cur[2] == '-') && (in->cur[3] == '-')) {
|
74
|
+
- if ((!terminate) &&
|
75
|
+
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
|
76
|
+
+ if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
|
77
|
+
goto done;
|
78
|
+
#ifdef DEBUG_PUSH
|
79
|
+
xmlGenericError(xmlGenericErrorContext,
|
80
|
+
@@ -5614,8 +5645,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
81
|
+
next = in->cur[1];
|
82
|
+
if ((cur == '<') && (next == '!') &&
|
83
|
+
(in->cur[2] == '-') && (in->cur[3] == '-')) {
|
84
|
+
- if ((!terminate) &&
|
85
|
+
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
|
86
|
+
+ if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
|
87
|
+
goto done;
|
88
|
+
#ifdef DEBUG_PUSH
|
89
|
+
xmlGenericError(xmlGenericErrorContext,
|
90
|
+
@@ -5871,9 +5901,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
91
|
+
htmlParseDocTypeDecl(ctxt);
|
92
|
+
} else if ((cur == '<') && (next == '!') &&
|
93
|
+
(in->cur[2] == '-') && (in->cur[3] == '-')) {
|
94
|
+
- if ((!terminate) &&
|
95
|
+
- (htmlParseLookupSequence(
|
96
|
+
- ctxt, '-', '-', '>', 1, 1) < 0))
|
97
|
+
+ if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
|
98
|
+
goto done;
|
99
|
+
#ifdef DEBUG_PUSH
|
100
|
+
xmlGenericError(xmlGenericErrorContext,
|
101
|
+
--
|
102
|
+
2.25.1
|
103
|
+
|
@@ -0,0 +1,53 @@
|
|
1
|
+
From c94172d2a4451368530db2186190d70be8a1d9e5 Mon Sep 17 00:00:00 2001
|
2
|
+
From: Ilya Zub <ilya@serpapi.com>
|
3
|
+
Date: Wed, 23 Dec 2020 12:45:29 +0200
|
4
|
+
Subject: Use glibc strlen to speed up xmlStrlen
|
5
|
+
MIME-Version: 1.0
|
6
|
+
Content-Type: text/plain; charset=UTF-8
|
7
|
+
Content-Transfer-Encoding: 8bit
|
8
|
+
|
9
|
+
xmlStrlen (entire HTML file): 926171.936981 μs
|
10
|
+
glibc_xmlStrlen (entire HTML file): 36905.903992 μs
|
11
|
+
delta (xmlStrlen ÷ glibc_xmlStrlen): 25.094584 times
|
12
|
+
|
13
|
+
xmlStrlen (average string): 57479.204010 μs
|
14
|
+
glibc_xmlStrlen (average string): 5802.069000 μs
|
15
|
+
delta (xmlStrlen ÷ glibc_xmlStrlen): 9.905937 times
|
16
|
+
|
17
|
+
xmlStrlen (bigger string): 388056.315979 μs
|
18
|
+
glibc_xmlStrlen (bigger string): 12797.856995 μs
|
19
|
+
delta (xmlStrlen ÷ glibc_xmlStrlen): 30.318382 times
|
20
|
+
|
21
|
+
xmlStrlen (smallest string): 15870.046021 μs
|
22
|
+
glibc_xmlStrlen (smallest string): 6282.208984 μs
|
23
|
+
delta (xmlStrlen ÷ glibc_xmlStrlen): 2.527903 times
|
24
|
+
|
25
|
+
See https://gitlab.gnome.org/GNOME/libxml2/-/issues/212 for reference.
|
26
|
+
---
|
27
|
+
xmlstring.c | 9 ++-------
|
28
|
+
1 file changed, 2 insertions(+), 7 deletions(-)
|
29
|
+
|
30
|
+
diff --git a/xmlstring.c b/xmlstring.c
|
31
|
+
index e8a1e45d..df247dff 100644
|
32
|
+
--- a/xmlstring.c
|
33
|
+
+++ b/xmlstring.c
|
34
|
+
@@ -423,14 +423,9 @@ xmlStrsub(const xmlChar *str, int start, int len) {
|
35
|
+
|
36
|
+
int
|
37
|
+
xmlStrlen(const xmlChar *str) {
|
38
|
+
- int len = 0;
|
39
|
+
-
|
40
|
+
if (str == NULL) return(0);
|
41
|
+
- while (*str != 0) { /* non input consuming */
|
42
|
+
- str++;
|
43
|
+
- len++;
|
44
|
+
- }
|
45
|
+
- return(len);
|
46
|
+
+
|
47
|
+
+ return strlen((const char*)str);
|
48
|
+
}
|
49
|
+
|
50
|
+
/**
|
51
|
+
--
|
52
|
+
2.29.2
|
53
|
+
|
metadata
CHANGED
@@ -1,21 +1,35 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogiri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.11.0.rc3
|
4
|
+
version: 1.11.0.rc4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
- Aaron Patterson
|
8
7
|
- Mike Dalessio
|
8
|
+
- Aaron Patterson
|
9
|
+
- John Shahid
|
9
10
|
- Yoko Harada
|
10
|
-
- Tim Elliott
|
11
11
|
- Akinori MUSHA
|
12
|
-
- John Shahid
|
13
12
|
- Lars Kanis
|
13
|
+
- Tim Elliott
|
14
14
|
autorequire:
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
|
-
date: 2020-
|
17
|
+
date: 2020-12-29 00:00:00.000000000 Z
|
18
18
|
dependencies:
|
19
|
+
- !ruby/object:Gem::Dependency
|
20
|
+
name: racc
|
21
|
+
requirement: !ruby/object:Gem::Requirement
|
22
|
+
requirements:
|
23
|
+
- - "~>"
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: '1.4'
|
26
|
+
type: :runtime
|
27
|
+
prerelease: false
|
28
|
+
version_requirements: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - "~>"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '1.4'
|
19
33
|
- !ruby/object:Gem::Dependency
|
20
34
|
name: mini_portile2
|
21
35
|
requirement: !ruby/object:Gem::Requirement
|
@@ -36,14 +50,14 @@ dependencies:
|
|
36
50
|
requirements:
|
37
51
|
- - "~>"
|
38
52
|
- !ruby/object:Gem::Version
|
39
|
-
version: '0.
|
53
|
+
version: '0.41'
|
40
54
|
type: :development
|
41
55
|
prerelease: false
|
42
56
|
version_requirements: !ruby/object:Gem::Requirement
|
43
57
|
requirements:
|
44
58
|
- - "~>"
|
45
59
|
- !ruby/object:Gem::Version
|
46
|
-
version: '0.
|
60
|
+
version: '0.41'
|
47
61
|
- !ruby/object:Gem::Dependency
|
48
62
|
name: hoe
|
49
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -149,19 +163,19 @@ dependencies:
|
|
149
163
|
- !ruby/object:Gem::Version
|
150
164
|
version: '5.8'
|
151
165
|
- !ruby/object:Gem::Dependency
|
152
|
-
name:
|
166
|
+
name: minitest-reporters
|
153
167
|
requirement: !ruby/object:Gem::Requirement
|
154
168
|
requirements:
|
155
169
|
- - "~>"
|
156
170
|
- !ruby/object:Gem::Version
|
157
|
-
version: 1.4
|
171
|
+
version: '1.4'
|
158
172
|
type: :development
|
159
173
|
prerelease: false
|
160
174
|
version_requirements: !ruby/object:Gem::Requirement
|
161
175
|
requirements:
|
162
176
|
- - "~>"
|
163
177
|
- !ruby/object:Gem::Version
|
164
|
-
version: 1.4
|
178
|
+
version: '1.4'
|
165
179
|
- !ruby/object:Gem::Dependency
|
166
180
|
name: rake
|
167
181
|
requirement: !ruby/object:Gem::Requirement
|
@@ -196,14 +210,14 @@ dependencies:
|
|
196
210
|
requirements:
|
197
211
|
- - "~>"
|
198
212
|
- !ruby/object:Gem::Version
|
199
|
-
version: '1.
|
213
|
+
version: '1.1'
|
200
214
|
type: :development
|
201
215
|
prerelease: false
|
202
216
|
version_requirements: !ruby/object:Gem::Requirement
|
203
217
|
requirements:
|
204
218
|
- - "~>"
|
205
219
|
- !ruby/object:Gem::Version
|
206
|
-
version: '1.
|
220
|
+
version: '1.1'
|
207
221
|
- !ruby/object:Gem::Dependency
|
208
222
|
name: rexical
|
209
223
|
requirement: !ruby/object:Gem::Requirement
|
@@ -270,14 +284,7 @@ description: |-
|
|
270
284
|
Nokogiri (鋸) is an HTML, XML, SAX, and Reader parser. Among
|
271
285
|
Nokogiri's many features is the ability to search documents via XPath
|
272
286
|
or CSS3 selectors.
|
273
|
-
email:
|
274
|
-
- aaronp@rubyforge.org
|
275
|
-
- mike.dalessio@gmail.com
|
276
|
-
- yokolet@gmail.com
|
277
|
-
- tle@holymonkey.com
|
278
|
-
- knu@idaemons.org
|
279
|
-
- jvshahid@gmail.com
|
280
|
-
- lars@greiz-reinsdorf.de
|
287
|
+
email: nokogiri-talk@googlegroups.com
|
281
288
|
executables:
|
282
289
|
- nokogiri
|
283
290
|
extensions:
|
@@ -420,6 +427,8 @@ files:
|
|
420
427
|
- lib/nokogiri/jruby/dependencies.rb
|
421
428
|
- lib/nokogiri/syntax_error.rb
|
422
429
|
- lib/nokogiri/version.rb
|
430
|
+
- lib/nokogiri/version/constant.rb
|
431
|
+
- lib/nokogiri/version/info.rb
|
423
432
|
- lib/nokogiri/xml.rb
|
424
433
|
- lib/nokogiri/xml/attr.rb
|
425
434
|
- lib/nokogiri/xml/attribute_decl.rb
|
@@ -465,6 +474,9 @@ files:
|
|
465
474
|
- patches/libxml2/0003-Update-entities-to-remove-handling-of-ssi.patch
|
466
475
|
- patches/libxml2/0004-libxml2.la-is-in-top_builddir.patch
|
467
476
|
- patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch
|
477
|
+
- patches/libxml2/0006-htmlParseComment-treat-as-if-it-closed-the-comment.patch
|
478
|
+
- patches/libxml2/0007-use-new-htmlParseLookupCommentEnd-to-find-comment-en.patch
|
479
|
+
- patches/libxml2/0008-use-glibc-strlen.patch
|
468
480
|
- ports/archives/libxml2-2.9.10.tar.gz
|
469
481
|
- ports/archives/libxslt-1.1.34.tar.gz
|
470
482
|
homepage: https://nokogiri.org
|
@@ -486,14 +498,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
486
498
|
requirements:
|
487
499
|
- - ">="
|
488
500
|
- !ruby/object:Gem::Version
|
489
|
-
version: 2.
|
501
|
+
version: 2.5.0
|
490
502
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
491
503
|
requirements:
|
492
504
|
- - ">"
|
493
505
|
- !ruby/object:Gem::Version
|
494
506
|
version: 1.3.1
|
495
507
|
requirements: []
|
496
|
-
rubygems_version: 3.1.
|
508
|
+
rubygems_version: 3.1.4
|
497
509
|
signing_key:
|
498
510
|
specification_version: 4
|
499
511
|
summary: Nokogiri (鋸) is an HTML, XML, SAX, and Reader parser
|