nokogiri 1.11.0.rc1 → 1.11.1
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/LICENSE-DEPENDENCIES.md +1015 -947
- data/README.md +164 -92
- data/ext/nokogiri/depend +476 -357
- data/ext/nokogiri/extconf.rb +467 -326
- data/ext/nokogiri/html_document.c +79 -78
- data/ext/nokogiri/html_sax_parser_context.c +4 -2
- data/ext/nokogiri/html_sax_push_parser.c +14 -8
- data/ext/nokogiri/nokogiri.c +37 -46
- data/ext/nokogiri/nokogiri.h +25 -17
- data/ext/nokogiri/test_global_handlers.c +41 -0
- data/ext/nokogiri/xml_document.c +8 -3
- data/ext/nokogiri/xml_io.c +8 -6
- data/ext/nokogiri/xml_node.c +1 -1
- data/ext/nokogiri/xml_node_set.c +1 -1
- data/ext/nokogiri/xml_reader.c +6 -17
- data/ext/nokogiri/xml_relax_ng.c +29 -11
- data/ext/nokogiri/xml_sax_parser.c +2 -7
- data/ext/nokogiri/xml_sax_parser_context.c +4 -2
- data/ext/nokogiri/xml_sax_push_parser.c +2 -0
- data/ext/nokogiri/xml_schema.c +84 -13
- data/ext/nokogiri/xml_syntax_error.c +23 -0
- data/ext/nokogiri/xml_syntax_error.h +15 -3
- data/ext/nokogiri/xml_xpath_context.c +80 -4
- data/ext/nokogiri/xslt_stylesheet.c +1 -4
- data/lib/nokogiri.rb +20 -3
- data/lib/nokogiri/css/parser.rb +62 -62
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +38 -36
- data/lib/nokogiri/css/xpath_visitor.rb +70 -42
- data/lib/nokogiri/html/document.rb +12 -26
- data/lib/nokogiri/version.rb +2 -148
- data/lib/nokogiri/version/constant.rb +5 -0
- data/lib/nokogiri/version/info.rb +182 -0
- data/lib/nokogiri/xml/builder.rb +2 -2
- data/lib/nokogiri/xml/document.rb +17 -7
- data/lib/nokogiri/xml/document_fragment.rb +4 -6
- data/lib/nokogiri/xml/node.rb +562 -238
- data/lib/nokogiri/xml/parse_options.rb +6 -0
- data/lib/nokogiri/xml/relax_ng.rb +6 -2
- data/lib/nokogiri/xml/schema.rb +12 -4
- data/lib/nokogiri/xml/searchable.rb +24 -16
- data/patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch +32 -0
- data/patches/libxml2/0006-htmlParseComment-treat-as-if-it-closed-the-comment.patch +73 -0
- data/patches/libxml2/0007-use-new-htmlParseLookupCommentEnd-to-find-comment-en.patch +103 -0
- data/patches/libxml2/0008-use-glibc-strlen.patch +53 -0
- data/patches/libxml2/0009-avoid-isnan-isinf.patch +81 -0
- metadata +84 -114
@@ -73,6 +73,8 @@ module Nokogiri
|
|
73
73
|
DEFAULT_XML = RECOVER | NONET
|
74
74
|
# the default options used for parsing HTML documents
|
75
75
|
DEFAULT_HTML = RECOVER | NOERROR | NOWARNING | NONET
|
76
|
+
# the default options used for parsing XML schemas
|
77
|
+
DEFAULT_SCHEMA = NONET
|
76
78
|
|
77
79
|
attr_accessor :options
|
78
80
|
def initialize options = STRICT
|
@@ -107,6 +109,10 @@ module Nokogiri
|
|
107
109
|
@options & RECOVER == STRICT
|
108
110
|
end
|
109
111
|
|
112
|
+
def ==(other)
|
113
|
+
other.to_i == to_i
|
114
|
+
end
|
115
|
+
|
110
116
|
alias :to_i :options
|
111
117
|
|
112
118
|
def inspect
|
@@ -5,8 +5,8 @@ module Nokogiri
|
|
5
5
|
###
|
6
6
|
# Create a new Nokogiri::XML::RelaxNG document from +string_or_io+.
|
7
7
|
# See Nokogiri::XML::RelaxNG for an example.
|
8
|
-
def RelaxNG string_or_io
|
9
|
-
RelaxNG.new(string_or_io)
|
8
|
+
def RelaxNG(string_or_io, options = ParseOptions::DEFAULT_SCHEMA)
|
9
|
+
RelaxNG.new(string_or_io, options)
|
10
10
|
end
|
11
11
|
end
|
12
12
|
|
@@ -27,6 +27,10 @@ module Nokogiri
|
|
27
27
|
# end
|
28
28
|
#
|
29
29
|
# The list of errors are Nokogiri::XML::SyntaxError objects.
|
30
|
+
#
|
31
|
+
# NOTE: RelaxNG input is always treated as TRUSTED documents, meaning that they will cause the
|
32
|
+
# underlying parsing libraries to access network resources. This is counter to Nokogiri's
|
33
|
+
# "untrusted by default" security policy, but is a limitation of the underlying libraries.
|
30
34
|
class RelaxNG < Nokogiri::XML::Schema
|
31
35
|
end
|
32
36
|
end
|
data/lib/nokogiri/xml/schema.rb
CHANGED
@@ -5,8 +5,8 @@ module Nokogiri
|
|
5
5
|
###
|
6
6
|
# Create a new Nokogiri::XML::Schema object using a +string_or_io+
|
7
7
|
# object.
|
8
|
-
def Schema string_or_io
|
9
|
-
Schema.new(string_or_io)
|
8
|
+
def Schema(string_or_io, options = ParseOptions::DEFAULT_SCHEMA)
|
9
|
+
Schema.new(string_or_io, options)
|
10
10
|
end
|
11
11
|
end
|
12
12
|
|
@@ -27,15 +27,23 @@ module Nokogiri
|
|
27
27
|
# end
|
28
28
|
#
|
29
29
|
# The list of errors are Nokogiri::XML::SyntaxError objects.
|
30
|
+
#
|
31
|
+
# NOTE: As of v1.11.0, Schema treats inputs as UNTRUSTED by default, and so external entities
|
32
|
+
# are not resolved from the network (`http://` or `ftp://`). Previously, parsing treated
|
33
|
+
# documents as "trusted" by default which was counter to Nokogiri's "untrusted by default"
|
34
|
+
# security policy. If a document is trusted, then the caller may turn off the NONET option via
|
35
|
+
# the ParseOptions to re-enable external entity resolution over a network connection.
|
30
36
|
class Schema
|
31
37
|
# Errors while parsing the schema file
|
32
38
|
attr_accessor :errors
|
39
|
+
# The Nokogiri::XML::ParseOptions used to parse the schema
|
40
|
+
attr_accessor :parse_options
|
33
41
|
|
34
42
|
###
|
35
43
|
# Create a new Nokogiri::XML::Schema object using a +string_or_io+
|
36
44
|
# object.
|
37
|
-
def self.new string_or_io
|
38
|
-
from_document Nokogiri::XML(string_or_io)
|
45
|
+
def self.new string_or_io, options = ParseOptions::DEFAULT_SCHEMA
|
46
|
+
from_document(Nokogiri::XML(string_or_io), options)
|
39
47
|
end
|
40
48
|
|
41
49
|
###
|
@@ -12,7 +12,9 @@ module Nokogiri
|
|
12
12
|
# Regular expression used by Searchable#search to determine if a query
|
13
13
|
# string is CSS or XPath
|
14
14
|
LOOKS_LIKE_XPATH = /^(\.\/|\/|\.\.|\.$)/
|
15
|
-
|
15
|
+
|
16
|
+
# @!group Searching via XPath or CSS Queries
|
17
|
+
|
16
18
|
###
|
17
19
|
# call-seq: search *paths, [namespace-bindings, xpath-variable-bindings, custom-handler-class]
|
18
20
|
#
|
@@ -46,7 +48,7 @@ module Nokogiri
|
|
46
48
|
# )
|
47
49
|
#
|
48
50
|
# See Searchable#xpath and Searchable#css for further usage help.
|
49
|
-
def search *args
|
51
|
+
def search(*args)
|
50
52
|
paths, handler, ns, binds = extract_params(args)
|
51
53
|
|
52
54
|
xpaths = paths.map(&:to_s).map do |path|
|
@@ -55,6 +57,7 @@ module Nokogiri
|
|
55
57
|
|
56
58
|
xpath(*(xpaths + [ns, handler, binds].compact))
|
57
59
|
end
|
60
|
+
|
58
61
|
alias :/ :search
|
59
62
|
|
60
63
|
###
|
@@ -64,9 +67,10 @@ module Nokogiri
|
|
64
67
|
# result. +paths+ must be one or more XPath or CSS queries.
|
65
68
|
#
|
66
69
|
# See Searchable#search for more information.
|
67
|
-
def at *args
|
70
|
+
def at(*args)
|
68
71
|
search(*args).first
|
69
72
|
end
|
73
|
+
|
70
74
|
alias :% :at
|
71
75
|
|
72
76
|
###
|
@@ -102,7 +106,7 @@ module Nokogiri
|
|
102
106
|
# found in an XML document, where tags names are case-sensitive
|
103
107
|
# (e.g., "H1" is distinct from "h1").
|
104
108
|
#
|
105
|
-
def css *args
|
109
|
+
def css(*args)
|
106
110
|
rules, handler, ns, _ = extract_params(args)
|
107
111
|
|
108
112
|
css_internal self, rules, handler, ns
|
@@ -115,7 +119,7 @@ module Nokogiri
|
|
115
119
|
# match. +rules+ must be one or more CSS selectors.
|
116
120
|
#
|
117
121
|
# See Searchable#css for more information.
|
118
|
-
def at_css *args
|
122
|
+
def at_css(*args)
|
119
123
|
css(*args).first
|
120
124
|
end
|
121
125
|
|
@@ -149,7 +153,7 @@ module Nokogiri
|
|
149
153
|
# end
|
150
154
|
# }.new)
|
151
155
|
#
|
152
|
-
def xpath *args
|
156
|
+
def xpath(*args)
|
153
157
|
paths, handler, ns, binds = extract_params(args)
|
154
158
|
|
155
159
|
xpath_internal self, paths, handler, ns, binds
|
@@ -162,17 +166,19 @@ module Nokogiri
|
|
162
166
|
# match. +paths+ must be one or more XPath queries.
|
163
167
|
#
|
164
168
|
# See Searchable#xpath for more information.
|
165
|
-
def at_xpath *args
|
169
|
+
def at_xpath(*args)
|
166
170
|
xpath(*args).first
|
167
171
|
end
|
168
172
|
|
173
|
+
# @!endgroup
|
174
|
+
|
169
175
|
private
|
170
176
|
|
171
|
-
def css_internal node, rules, handler, ns
|
177
|
+
def css_internal(node, rules, handler, ns)
|
172
178
|
xpath_internal node, css_rules_to_xpath(rules, ns), handler, ns, nil
|
173
179
|
end
|
174
180
|
|
175
|
-
def xpath_internal node, paths, handler, ns, binds
|
181
|
+
def xpath_internal(node, paths, handler, ns, binds)
|
176
182
|
document = node.document
|
177
183
|
return NodeSet.new(document) unless document
|
178
184
|
|
@@ -187,12 +193,12 @@ module Nokogiri
|
|
187
193
|
end
|
188
194
|
end
|
189
195
|
|
190
|
-
def xpath_impl node, path, handler, ns, binds
|
196
|
+
def xpath_impl(node, path, handler, ns, binds)
|
191
197
|
ctx = XPathContext.new(node)
|
192
198
|
ctx.register_namespaces(ns)
|
193
|
-
path = path.gsub(/xmlns:/, ' :') unless Nokogiri.uses_libxml?
|
199
|
+
path = path.gsub(/xmlns:/, " :") unless Nokogiri.uses_libxml?
|
194
200
|
|
195
|
-
binds.each do |key,value|
|
201
|
+
binds.each do |key, value|
|
196
202
|
ctx.register_variable key.to_s, value
|
197
203
|
end if binds
|
198
204
|
|
@@ -203,13 +209,15 @@ module Nokogiri
|
|
203
209
|
rules.map { |rule| xpath_query_from_css_rule(rule, ns) }
|
204
210
|
end
|
205
211
|
|
206
|
-
def xpath_query_from_css_rule rule, ns
|
212
|
+
def xpath_query_from_css_rule(rule, ns)
|
213
|
+
visitor = Nokogiri::CSS::XPathVisitorOptimallyUseBuiltins.new
|
207
214
|
self.class::IMPLIED_XPATH_CONTEXTS.map do |implied_xpath_context|
|
208
|
-
CSS.xpath_for(rule.to_s, :prefix => implied_xpath_context, :ns => ns
|
209
|
-
|
215
|
+
CSS.xpath_for(rule.to_s, {:prefix => implied_xpath_context, :ns => ns,
|
216
|
+
:visitor => visitor})
|
217
|
+
end.join(" | ")
|
210
218
|
end
|
211
219
|
|
212
|
-
def extract_params params # :nodoc:
|
220
|
+
def extract_params(params) # :nodoc:
|
213
221
|
handler = params.find do |param|
|
214
222
|
![Hash, String, Symbol].include?(param.class)
|
215
223
|
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
From 0e1a49c8907645d2e155f0d89d4d9895ac5112b5 Mon Sep 17 00:00:00 2001
|
2
|
+
From: Zhipeng Xie <xiezhipeng1@huawei.com>
|
3
|
+
Date: Thu, 12 Dec 2019 17:30:55 +0800
|
4
|
+
Subject: [PATCH] Fix infinite loop in xmlStringLenDecodeEntities
|
5
|
+
|
6
|
+
When ctxt->instate == XML_PARSER_EOF,xmlParseStringEntityRef
|
7
|
+
return NULL which cause a infinite loop in xmlStringLenDecodeEntities
|
8
|
+
|
9
|
+
Found with libFuzzer.
|
10
|
+
|
11
|
+
Signed-off-by: Zhipeng Xie <xiezhipeng1@huawei.com>
|
12
|
+
---
|
13
|
+
parser.c | 3 ++-
|
14
|
+
1 file changed, 2 insertions(+), 1 deletion(-)
|
15
|
+
|
16
|
+
diff --git a/parser.c b/parser.c
|
17
|
+
index d1c3196..a34bb6c 100644
|
18
|
+
--- a/parser.c
|
19
|
+
+++ b/parser.c
|
20
|
+
@@ -2646,7 +2646,8 @@ xmlStringLenDecodeEntities(xmlParserCtxtPtr ctxt, const xmlChar *str, int len,
|
21
|
+
else
|
22
|
+
c = 0;
|
23
|
+
while ((c != 0) && (c != end) && /* non input consuming loop */
|
24
|
+
- (c != end2) && (c != end3)) {
|
25
|
+
+ (c != end2) && (c != end3) &&
|
26
|
+
+ (ctxt->instate != XML_PARSER_EOF)) {
|
27
|
+
|
28
|
+
if (c == 0) break;
|
29
|
+
if ((c == '&') && (str[1] == '#')) {
|
30
|
+
--
|
31
|
+
2.17.1
|
32
|
+
|
@@ -0,0 +1,73 @@
|
|
1
|
+
From 4f51a6d2b1755ce5b36c524c215aad70d864ac1d Mon Sep 17 00:00:00 2001
|
2
|
+
From: Mike Dalessio <mike.dalessio@gmail.com>
|
3
|
+
Date: Mon, 3 Aug 2020 17:36:05 -0400
|
4
|
+
Subject: [PATCH 1/2] htmlParseComment: treat `--!>` as if it closed the
|
5
|
+
comment
|
6
|
+
|
7
|
+
See guidance provided on incorrectly-closed comments here:
|
8
|
+
|
9
|
+
https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment
|
10
|
+
---
|
11
|
+
HTMLparser.c | 28 ++++++++++++++++++++--------
|
12
|
+
1 file changed, 20 insertions(+), 8 deletions(-)
|
13
|
+
|
14
|
+
diff --git a/HTMLparser.c b/HTMLparser.c
|
15
|
+
index 7b6d689..4d43479 100644
|
16
|
+
--- a/HTMLparser.c
|
17
|
+
+++ b/HTMLparser.c
|
18
|
+
@@ -3300,6 +3300,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
|
19
|
+
int q, ql;
|
20
|
+
int r, rl;
|
21
|
+
int cur, l;
|
22
|
+
+ int next, nl;
|
23
|
+
xmlParserInputState state;
|
24
|
+
|
25
|
+
/*
|
26
|
+
@@ -3332,6 +3333,21 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
|
27
|
+
while (IS_CHAR(cur) &&
|
28
|
+
((cur != '>') ||
|
29
|
+
(r != '-') || (q != '-'))) {
|
30
|
+
+ NEXTL(l);
|
31
|
+
+ next = CUR_CHAR(nl);
|
32
|
+
+ if (next == 0) {
|
33
|
+
+ SHRINK;
|
34
|
+
+ GROW;
|
35
|
+
+ next = CUR_CHAR(nl);
|
36
|
+
+ }
|
37
|
+
+
|
38
|
+
+ if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
|
39
|
+
+ htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
|
40
|
+
+ "Comment incorrectly closed by '--!>'", NULL, NULL);
|
41
|
+
+ cur = '>';
|
42
|
+
+ break;
|
43
|
+
+ }
|
44
|
+
+
|
45
|
+
if (len + 5 >= size) {
|
46
|
+
xmlChar *tmp;
|
47
|
+
|
48
|
+
@@ -3345,18 +3361,14 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
|
49
|
+
}
|
50
|
+
buf = tmp;
|
51
|
+
}
|
52
|
+
- COPY_BUF(ql,buf,len,q);
|
53
|
+
+ COPY_BUF(ql,buf,len,q);
|
54
|
+
+
|
55
|
+
q = r;
|
56
|
+
ql = rl;
|
57
|
+
r = cur;
|
58
|
+
rl = l;
|
59
|
+
- NEXTL(l);
|
60
|
+
- cur = CUR_CHAR(l);
|
61
|
+
- if (cur == 0) {
|
62
|
+
- SHRINK;
|
63
|
+
- GROW;
|
64
|
+
- cur = CUR_CHAR(l);
|
65
|
+
- }
|
66
|
+
+ cur = next;
|
67
|
+
+ l = nl;
|
68
|
+
}
|
69
|
+
buf[len] = 0;
|
70
|
+
if (IS_CHAR(cur)) {
|
71
|
+
--
|
72
|
+
2.25.1
|
73
|
+
|
@@ -0,0 +1,103 @@
|
|
1
|
+
From b20d746fa7cbb74716171bc49d836af99927e41e Mon Sep 17 00:00:00 2001
|
2
|
+
From: Mike Dalessio <mike.dalessio@gmail.com>
|
3
|
+
Date: Sun, 11 Oct 2020 14:15:37 -0400
|
4
|
+
Subject: [PATCH 2/2] use new htmlParseLookupCommentEnd to find comment ends
|
5
|
+
|
6
|
+
Note that the caret in error messages generated during comment parsing
|
7
|
+
may have moved by one byte.
|
8
|
+
|
9
|
+
See guidance provided on incorrectly-closed comments here:
|
10
|
+
|
11
|
+
https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment
|
12
|
+
---
|
13
|
+
HTMLparser.c | 46 +++++++++++++++++++++++++++++++++++++---------
|
14
|
+
1 file changed, 37 insertions(+), 9 deletions(-)
|
15
|
+
|
16
|
+
diff --git a/HTMLparser.c b/HTMLparser.c
|
17
|
+
index 4d43479..000dc3d 100644
|
18
|
+
--- a/HTMLparser.c
|
19
|
+
+++ b/HTMLparser.c
|
20
|
+
@@ -5331,6 +5331,39 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
|
21
|
+
return (-1);
|
22
|
+
}
|
23
|
+
|
24
|
+
+/**
|
25
|
+
+ * htmlParseLookupCommentEnd:
|
26
|
+
+ * @ctxt: an HTML parser context
|
27
|
+
+ *
|
28
|
+
+ * Try to find a comment end tag in the input stream
|
29
|
+
+ * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
|
30
|
+
+ * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
|
31
|
+
+ * This function has a side effect of (possibly) incrementing ctxt->checkIndex
|
32
|
+
+ * to avoid rescanning sequences of bytes, it DOES change the state of the
|
33
|
+
+ * parser, do not use liberally.
|
34
|
+
+ * This wraps to htmlParseLookupSequence()
|
35
|
+
+ *
|
36
|
+
+ * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
|
37
|
+
+ */
|
38
|
+
+static int
|
39
|
+
+htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
|
40
|
+
+{
|
41
|
+
+ int mark = 0;
|
42
|
+
+ int cur = CUR_PTR - BASE_PTR;
|
43
|
+
+
|
44
|
+
+ while (mark >= 0) {
|
45
|
+
+ mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 1, 1);
|
46
|
+
+ if ((mark < 0) ||
|
47
|
+
+ (NXT(mark+2) == '>') ||
|
48
|
+
+ ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
|
49
|
+
+ return mark;
|
50
|
+
+ }
|
51
|
+
+ ctxt->checkIndex = cur + mark + 1;
|
52
|
+
+ }
|
53
|
+
+ return mark;
|
54
|
+
+}
|
55
|
+
+
|
56
|
+
+
|
57
|
+
/**
|
58
|
+
* htmlParseTryOrFinish:
|
59
|
+
* @ctxt: an HTML parser context
|
60
|
+
@@ -5507,8 +5540,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
61
|
+
cur = in->cur[0];
|
62
|
+
if ((cur == '<') && (next == '!') &&
|
63
|
+
(in->cur[2] == '-') && (in->cur[3] == '-')) {
|
64
|
+
- if ((!terminate) &&
|
65
|
+
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
|
66
|
+
+ if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
|
67
|
+
goto done;
|
68
|
+
#ifdef DEBUG_PUSH
|
69
|
+
xmlGenericError(xmlGenericErrorContext,
|
70
|
+
@@ -5567,8 +5599,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
71
|
+
next = in->cur[1];
|
72
|
+
if ((cur == '<') && (next == '!') &&
|
73
|
+
(in->cur[2] == '-') && (in->cur[3] == '-')) {
|
74
|
+
- if ((!terminate) &&
|
75
|
+
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
|
76
|
+
+ if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
|
77
|
+
goto done;
|
78
|
+
#ifdef DEBUG_PUSH
|
79
|
+
xmlGenericError(xmlGenericErrorContext,
|
80
|
+
@@ -5614,8 +5645,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
81
|
+
next = in->cur[1];
|
82
|
+
if ((cur == '<') && (next == '!') &&
|
83
|
+
(in->cur[2] == '-') && (in->cur[3] == '-')) {
|
84
|
+
- if ((!terminate) &&
|
85
|
+
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
|
86
|
+
+ if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
|
87
|
+
goto done;
|
88
|
+
#ifdef DEBUG_PUSH
|
89
|
+
xmlGenericError(xmlGenericErrorContext,
|
90
|
+
@@ -5871,9 +5901,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
91
|
+
htmlParseDocTypeDecl(ctxt);
|
92
|
+
} else if ((cur == '<') && (next == '!') &&
|
93
|
+
(in->cur[2] == '-') && (in->cur[3] == '-')) {
|
94
|
+
- if ((!terminate) &&
|
95
|
+
- (htmlParseLookupSequence(
|
96
|
+
- ctxt, '-', '-', '>', 1, 1) < 0))
|
97
|
+
+ if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
|
98
|
+
goto done;
|
99
|
+
#ifdef DEBUG_PUSH
|
100
|
+
xmlGenericError(xmlGenericErrorContext,
|
101
|
+
--
|
102
|
+
2.25.1
|
103
|
+
|
@@ -0,0 +1,53 @@
|
|
1
|
+
From c94172d2a4451368530db2186190d70be8a1d9e5 Mon Sep 17 00:00:00 2001
|
2
|
+
From: Ilya Zub <ilya@serpapi.com>
|
3
|
+
Date: Wed, 23 Dec 2020 12:45:29 +0200
|
4
|
+
Subject: Use glibc strlen to speed up xmlStrlen
|
5
|
+
MIME-Version: 1.0
|
6
|
+
Content-Type: text/plain; charset=UTF-8
|
7
|
+
Content-Transfer-Encoding: 8bit
|
8
|
+
|
9
|
+
xmlStrlen (entire HTML file): 926171.936981 μs
|
10
|
+
glibc_xmlStrlen (entire HTML file): 36905.903992 μs
|
11
|
+
delta (xmlStrlen ÷ glibc_xmlStrlen): 25.094584 times
|
12
|
+
|
13
|
+
xmlStrlen (average string): 57479.204010 μs
|
14
|
+
glibc_xmlStrlen (average string): 5802.069000 μs
|
15
|
+
delta (xmlStrlen ÷ glibc_xmlStrlen): 9.905937 times
|
16
|
+
|
17
|
+
xmlStrlen (bigger string): 388056.315979 μs
|
18
|
+
glibc_xmlStrlen (bigger string): 12797.856995 μs
|
19
|
+
delta (xmlStrlen ÷ glibc_xmlStrlen): 30.318382 times
|
20
|
+
|
21
|
+
xmlStrlen (smallest string): 15870.046021 μs
|
22
|
+
glibc_xmlStrlen (smallest string): 6282.208984 μs
|
23
|
+
delta (xmlStrlen ÷ glibc_xmlStrlen): 2.527903 times
|
24
|
+
|
25
|
+
See https://gitlab.gnome.org/GNOME/libxml2/-/issues/212 for reference.
|
26
|
+
---
|
27
|
+
xmlstring.c | 9 ++-------
|
28
|
+
1 file changed, 2 insertions(+), 7 deletions(-)
|
29
|
+
|
30
|
+
diff --git a/xmlstring.c b/xmlstring.c
|
31
|
+
index e8a1e45d..df247dff 100644
|
32
|
+
--- a/xmlstring.c
|
33
|
+
+++ b/xmlstring.c
|
34
|
+
@@ -423,14 +423,9 @@ xmlStrsub(const xmlChar *str, int start, int len) {
|
35
|
+
|
36
|
+
int
|
37
|
+
xmlStrlen(const xmlChar *str) {
|
38
|
+
- int len = 0;
|
39
|
+
-
|
40
|
+
if (str == NULL) return(0);
|
41
|
+
- while (*str != 0) { /* non input consuming */
|
42
|
+
- str++;
|
43
|
+
- len++;
|
44
|
+
- }
|
45
|
+
- return(len);
|
46
|
+
+
|
47
|
+
+ return strlen((const char*)str);
|
48
|
+
}
|
49
|
+
|
50
|
+
/**
|
51
|
+
--
|
52
|
+
2.29.2
|
53
|
+
|