RubyGems - nokogiri - Versions diffs - 1.11.0.rc1 → 1.11.1 - Mend

nokogiri 1.11.0.rc1 → 1.11.1

Potentially problematic release.

This version of nokogiri might be problematic. Click here for more details.

Files changed (49) hide show

checksums.yaml +4 -4
data/Gemfile +3 -0
data/LICENSE-DEPENDENCIES.md +1015 -947
data/README.md +164 -92
data/ext/nokogiri/depend +476 -357
data/ext/nokogiri/extconf.rb +467 -326
data/ext/nokogiri/html_document.c +79 -78
data/ext/nokogiri/html_sax_parser_context.c +4 -2
data/ext/nokogiri/html_sax_push_parser.c +14 -8
data/ext/nokogiri/nokogiri.c +37 -46
data/ext/nokogiri/nokogiri.h +25 -17
data/ext/nokogiri/test_global_handlers.c +41 -0
data/ext/nokogiri/xml_document.c +8 -3
data/ext/nokogiri/xml_io.c +8 -6
data/ext/nokogiri/xml_node.c +1 -1
data/ext/nokogiri/xml_node_set.c +1 -1
data/ext/nokogiri/xml_reader.c +6 -17
data/ext/nokogiri/xml_relax_ng.c +29 -11
data/ext/nokogiri/xml_sax_parser.c +2 -7
data/ext/nokogiri/xml_sax_parser_context.c +4 -2
data/ext/nokogiri/xml_sax_push_parser.c +2 -0
data/ext/nokogiri/xml_schema.c +84 -13
data/ext/nokogiri/xml_syntax_error.c +23 -0
data/ext/nokogiri/xml_syntax_error.h +15 -3
data/ext/nokogiri/xml_xpath_context.c +80 -4
data/ext/nokogiri/xslt_stylesheet.c +1 -4
data/lib/nokogiri.rb +20 -3
data/lib/nokogiri/css/parser.rb +62 -62
data/lib/nokogiri/css/parser.y +2 -2
data/lib/nokogiri/css/parser_extras.rb +38 -36
data/lib/nokogiri/css/xpath_visitor.rb +70 -42
data/lib/nokogiri/html/document.rb +12 -26
data/lib/nokogiri/version.rb +2 -148
data/lib/nokogiri/version/constant.rb +5 -0
data/lib/nokogiri/version/info.rb +182 -0
data/lib/nokogiri/xml/builder.rb +2 -2
data/lib/nokogiri/xml/document.rb +17 -7
data/lib/nokogiri/xml/document_fragment.rb +4 -6
data/lib/nokogiri/xml/node.rb +562 -238
data/lib/nokogiri/xml/parse_options.rb +6 -0
data/lib/nokogiri/xml/relax_ng.rb +6 -2
data/lib/nokogiri/xml/schema.rb +12 -4
data/lib/nokogiri/xml/searchable.rb +24 -16
data/patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch +32 -0
data/patches/libxml2/0006-htmlParseComment-treat-as-if-it-closed-the-comment.patch +73 -0
data/patches/libxml2/0007-use-new-htmlParseLookupCommentEnd-to-find-comment-en.patch +103 -0
data/patches/libxml2/0008-use-glibc-strlen.patch +53 -0
data/patches/libxml2/0009-avoid-isnan-isinf.patch +81 -0
metadata +84 -114

data/lib/nokogiri/xml/parse_options.rb CHANGED

@@ -73,6 +73,8 @@ module Nokogiri
       DEFAULT_XML  = RECOVER | NONET
       # the default options used for parsing HTML documents
       DEFAULT_HTML = RECOVER | NOERROR | NOWARNING | NONET
+      # the default options used for parsing XML schemas
+      DEFAULT_SCHEMA = NONET
       attr_accessor :options
       def initialize options = STRICT
@@ -107,6 +109,10 @@ module Nokogiri
         @options & RECOVER == STRICT
       end
+      def ==(other)
+        other.to_i == to_i
+      end
       alias :to_i :options
       def inspect

data/lib/nokogiri/xml/relax_ng.rb CHANGED

@@ -5,8 +5,8 @@ module Nokogiri
       ###
       # Create a new Nokogiri::XML::RelaxNG document from +string_or_io+.
       # See Nokogiri::XML::RelaxNG for an example.
-      def RelaxNG string_or_io
-        RelaxNG.new(string_or_io)
+      def RelaxNG(string_or_io, options = ParseOptions::DEFAULT_SCHEMA)
+        RelaxNG.new(string_or_io, options)
       end
     end
@@ -27,6 +27,10 @@ module Nokogiri
     #   end
     #
     # The list of errors are Nokogiri::XML::SyntaxError objects.
+    #
+    # NOTE: RelaxNG inputs are always treated as TRUSTED documents, meaning that they will cause the
+    # underlying parsing libraries to access network resources. This is counter to Nokogiri's
+    # "untrusted by default" security policy, but is a limitation of the underlying libraries.
     class RelaxNG < Nokogiri::XML::Schema
     end
   end

data/lib/nokogiri/xml/schema.rb CHANGED

@@ -5,8 +5,8 @@ module Nokogiri
       ###
       # Create a new Nokogiri::XML::Schema object using a +string_or_io+
       # object.
-      def Schema string_or_io
-        Schema.new(string_or_io)
+      def Schema(string_or_io, options = ParseOptions::DEFAULT_SCHEMA)
+        Schema.new(string_or_io, options)
       end
     end
@@ -27,15 +27,23 @@ module Nokogiri
     #   end
     #
     # The list of errors are Nokogiri::XML::SyntaxError objects.
+    #
+    # NOTE: As of v1.11.0, Schema treats inputs as UNTRUSTED by default, and so external entities
+    # are not resolved from the network (`http://` or `ftp://`). Previously, parsing treated
+    # documents as "trusted" by default which was counter to Nokogiri's "untrusted by default"
+    # security policy. If a document is trusted, then the caller may turn off the NONET option via
+    # the ParseOptions to re-enable external entity resolution over a network connection.
     class Schema
       # Errors while parsing the schema file
       attr_accessor :errors
+      # The Nokogiri::XML::ParseOptions used to parse the schema
+      attr_accessor :parse_options
       ###
       # Create a new Nokogiri::XML::Schema object using a +string_or_io+
       # object.
-      def self.new string_or_io
-        from_document Nokogiri::XML(string_or_io)
+      def self.new string_or_io, options = ParseOptions::DEFAULT_SCHEMA
+        from_document(Nokogiri::XML(string_or_io), options)
       end
       ###

data/lib/nokogiri/xml/searchable.rb CHANGED

@@ -12,7 +12,9 @@ module Nokogiri
       # Regular expression used by Searchable#search to determine if a query
       # string is CSS or XPath
       LOOKS_LIKE_XPATH = /^(\.\/|\/|\.\.|\.$)/
+      # @!group Searching via XPath or CSS Queries
       ###
       # call-seq: search *paths, [namespace-bindings, xpath-variable-bindings, custom-handler-class]
       #
@@ -46,7 +48,7 @@ module Nokogiri
       #   )
       #
       # See Searchable#xpath and Searchable#css for further usage help.
-      def search *args
+      def search(*args)
         paths, handler, ns, binds = extract_params(args)
         xpaths = paths.map(&:to_s).map do |path|
@@ -55,6 +57,7 @@ module Nokogiri
         xpath(*(xpaths + [ns, handler, binds].compact))
       end
       alias :/ :search
       ###
@@ -64,9 +67,10 @@ module Nokogiri
       # result. +paths+ must be one or more XPath or CSS queries.
       #
       # See Searchable#search for more information.
-      def at *args
+      def at(*args)
         search(*args).first
       end
       alias :% :at
       ###
@@ -102,7 +106,7 @@ module Nokogiri
       # found in an XML document, where tags names are case-sensitive
       # (e.g., "H1" is distinct from "h1").
       #
-      def css *args
+      def css(*args)
         rules, handler, ns, _ = extract_params(args)
         css_internal self, rules, handler, ns
@@ -115,7 +119,7 @@ module Nokogiri
       # match. +rules+ must be one or more CSS selectors.
       #
       # See Searchable#css for more information.
-      def at_css *args
+      def at_css(*args)
         css(*args).first
       end
@@ -149,7 +153,7 @@ module Nokogiri
       #     end
       #   }.new)
       #
-      def xpath *args
+      def xpath(*args)
         paths, handler, ns, binds = extract_params(args)
         xpath_internal self, paths, handler, ns, binds
@@ -162,17 +166,19 @@ module Nokogiri
       # match. +paths+ must be one or more XPath queries.
       #
       # See Searchable#xpath for more information.
-      def at_xpath *args
+      def at_xpath(*args)
         xpath(*args).first
       end
+      # @!endgroup
       private
-      def css_internal node, rules, handler, ns
+      def css_internal(node, rules, handler, ns)
         xpath_internal node, css_rules_to_xpath(rules, ns), handler, ns, nil
       end
-      def xpath_internal node, paths, handler, ns, binds
+      def xpath_internal(node, paths, handler, ns, binds)
         document = node.document
         return NodeSet.new(document) unless document
@@ -187,12 +193,12 @@ module Nokogiri
         end
       end
-      def xpath_impl node, path, handler, ns, binds
+      def xpath_impl(node, path, handler, ns, binds)
         ctx = XPathContext.new(node)
         ctx.register_namespaces(ns)
-        path = path.gsub(/xmlns:/, ' :') unless Nokogiri.uses_libxml?
+        path = path.gsub(/xmlns:/, " :") unless Nokogiri.uses_libxml?
-        binds.each do |key,value|
+        binds.each do |key, value|
           ctx.register_variable key.to_s, value
         end if binds
@@ -203,13 +209,15 @@ module Nokogiri
         rules.map { |rule| xpath_query_from_css_rule(rule, ns) }
       end
-      def xpath_query_from_css_rule rule, ns
+      def xpath_query_from_css_rule(rule, ns)
+        visitor = Nokogiri::CSS::XPathVisitorOptimallyUseBuiltins.new
         self.class::IMPLIED_XPATH_CONTEXTS.map do |implied_xpath_context|
-          CSS.xpath_for(rule.to_s, :prefix => implied_xpath_context, :ns => ns)
-        end.join(' | ')
+          CSS.xpath_for(rule.to_s, {:prefix => implied_xpath_context, :ns => ns,
+                                    :visitor => visitor})
+        end.join(" | ")
       end
-      def extract_params params # :nodoc:
+      def extract_params(params) # :nodoc:
         handler = params.find do |param|
           ![Hash, String, Symbol].include?(param.class)
         end

data/patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch ADDED

@@ -0,0 +1,32 @@
+From 0e1a49c8907645d2e155f0d89d4d9895ac5112b5 Mon Sep 17 00:00:00 2001
+From: Zhipeng Xie <xiezhipeng1@huawei.com>
+Date: Thu, 12 Dec 2019 17:30:55 +0800
+Subject: [PATCH] Fix infinite loop in xmlStringLenDecodeEntities
+When ctxt->instate == XML_PARSER_EOF, xmlParseStringEntityRef
+returns NULL, which causes an infinite loop in xmlStringLenDecodeEntities.
+Found with libFuzzer.
+Signed-off-by: Zhipeng Xie <xiezhipeng1@huawei.com>
+---
+ parser.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+diff --git a/parser.c b/parser.c
+index d1c3196..a34bb6c 100644
+--- a/parser.c
++++ b/parser.c
+@@ -2646,7 +2646,8 @@ xmlStringLenDecodeEntities(xmlParserCtxtPtr ctxt, const xmlChar *str, int len,
+     else
+         c = 0;
+     while ((c != 0) && (c != end) && /* non input consuming loop */
+-	   (c != end2) && (c != end3)) {
++           (c != end2) && (c != end3) &&
++           (ctxt->instate != XML_PARSER_EOF)) {
+ 	if (c == 0) break;
+         if ((c == '&') && (str[1] == '#')) {
+--
+2.17.1

data/patches/libxml2/0006-htmlParseComment-treat-as-if-it-closed-the-comment.patch ADDED

@@ -0,0 +1,73 @@
+From 4f51a6d2b1755ce5b36c524c215aad70d864ac1d Mon Sep 17 00:00:00 2001
+From: Mike Dalessio <mike.dalessio@gmail.com>
+Date: Mon, 3 Aug 2020 17:36:05 -0400
+Subject: [PATCH 1/2] htmlParseComment: treat `--!>` as if it closed the
+ comment
+See guidance provided on incorrectly-closed comments here:
+https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment
+---
+ HTMLparser.c | 28 ++++++++++++++++++++--------
+ 1 file changed, 20 insertions(+), 8 deletions(-)
+diff --git a/HTMLparser.c b/HTMLparser.c
+index 7b6d689..4d43479 100644
+--- a/HTMLparser.c
++++ b/HTMLparser.c
+@@ -3300,6 +3300,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
+     int q, ql;
+     int r, rl;
+     int cur, l;
++    int next, nl;
+     xmlParserInputState state;
+     /*
+@@ -3332,6 +3333,21 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
+     while (IS_CHAR(cur) &&
+            ((cur != '>') ||
+ 	    (r != '-') || (q != '-'))) {
++	NEXTL(l);
++	next = CUR_CHAR(nl);
++	if (next == 0) {
++	    SHRINK;
++	    GROW;
++	    next = CUR_CHAR(nl);
++	}
++
++	if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
++	  htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
++		       "Comment incorrectly closed by '--!>'", NULL, NULL);
++	  cur = '>';
++	  break;
++	}
++
+ 	if (len + 5 >= size) {
+ 	    xmlChar *tmp;
+@@ -3345,18 +3361,14 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
+ 	    }
+ 	    buf = tmp;
+ 	}
+-	COPY_BUF(ql,buf,len,q);
++        COPY_BUF(ql,buf,len,q);
++
+ 	q = r;
+ 	ql = rl;
+ 	r = cur;
+ 	rl = l;
+-	NEXTL(l);
+-	cur = CUR_CHAR(l);
+-	if (cur == 0) {
+-	    SHRINK;
+-	    GROW;
+-	    cur = CUR_CHAR(l);
+-	}
++	cur = next;
++	l = nl;
+     }
+     buf[len] = 0;
+     if (IS_CHAR(cur)) {
+--
+2.25.1

data/patches/libxml2/0007-use-new-htmlParseLookupCommentEnd-to-find-comment-en.patch ADDED

@@ -0,0 +1,103 @@
+From b20d746fa7cbb74716171bc49d836af99927e41e Mon Sep 17 00:00:00 2001
+From: Mike Dalessio <mike.dalessio@gmail.com>
+Date: Sun, 11 Oct 2020 14:15:37 -0400
+Subject: [PATCH 2/2] use new htmlParseLookupCommentEnd to find comment ends
+Note that the caret in error messages generated during comment parsing
+may have moved by one byte.
+See guidance provided on incorrectly-closed comments here:
+https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment
+---
+ HTMLparser.c | 46 +++++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 37 insertions(+), 9 deletions(-)
+diff --git a/HTMLparser.c b/HTMLparser.c
+index 4d43479..000dc3d 100644
+--- a/HTMLparser.c
++++ b/HTMLparser.c
+@@ -5331,6 +5331,39 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
+     return (-1);
+ }
++/**
++ * htmlParseLookupCommentEnd:
++ * @ctxt: an HTML parser context
++ *
++ * Try to find a comment end tag in the input stream
++ * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
++ * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
++ * This function has a side effect of (possibly) incrementing ctxt->checkIndex
+ * to avoid rescanning sequences of bytes; it DOES change the state of the
+ * parser, so do not use liberally.
+ * This wraps htmlParseLookupSequence().
++ *
++ * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
++ */
++static int
++htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
++{
++    int mark = 0;
++    int cur = CUR_PTR - BASE_PTR;
++
++    while (mark >= 0) {
++	mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 1, 1);
++	if ((mark < 0) ||
++	    (NXT(mark+2) == '>') ||
++	    ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
++	    return mark;
++	}
++	ctxt->checkIndex = cur + mark + 1;
++    }
++    return mark;
++}
++
++
+ /**
+  * htmlParseTryOrFinish:
+  * @ctxt:  an HTML parser context
+@@ -5507,8 +5540,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
+ 		cur = in->cur[0];
+ 	        if ((cur == '<') && (next == '!') &&
+ 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
+-		    if ((!terminate) &&
+-		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
++		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
+ 			goto done;
+ #ifdef DEBUG_PUSH
+ 		    xmlGenericError(xmlGenericErrorContext,
+@@ -5567,8 +5599,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
+ 		next = in->cur[1];
+ 		if ((cur == '<') && (next == '!') &&
+ 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
+-		    if ((!terminate) &&
+-		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
++		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
+ 			goto done;
+ #ifdef DEBUG_PUSH
+ 		    xmlGenericError(xmlGenericErrorContext,
+@@ -5614,8 +5645,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
+ 		next = in->cur[1];
+ 	        if ((cur == '<') && (next == '!') &&
+ 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
+-		    if ((!terminate) &&
+-		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
++		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
+ 			goto done;
+ #ifdef DEBUG_PUSH
+ 		    xmlGenericError(xmlGenericErrorContext,
+@@ -5871,9 +5901,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
+ 			htmlParseDocTypeDecl(ctxt);
+ 		    } else if ((cur == '<') && (next == '!') &&
+ 			(in->cur[2] == '-') && (in->cur[3] == '-')) {
+-			if ((!terminate) &&
+-			    (htmlParseLookupSequence(
+-				ctxt, '-', '-', '>', 1, 1) < 0))
++			if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
+ 			    goto done;
+ #ifdef DEBUG_PUSH
+ 			xmlGenericError(xmlGenericErrorContext,
+--
+2.25.1

data/patches/libxml2/0008-use-glibc-strlen.patch ADDED

@@ -0,0 +1,53 @@
+From c94172d2a4451368530db2186190d70be8a1d9e5 Mon Sep 17 00:00:00 2001
+From: Ilya Zub <ilya@serpapi.com>
+Date: Wed, 23 Dec 2020 12:45:29 +0200
+Subject: Use glibc strlen to speed up xmlStrlen
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+xmlStrlen (entire HTML file): 926171.936981 μs
+glibc_xmlStrlen (entire HTML file): 36905.903992 μs
+delta (xmlStrlen ÷ glibc_xmlStrlen): 25.094584 times
+xmlStrlen (average string): 57479.204010 μs
+glibc_xmlStrlen (average string): 5802.069000 μs
+delta (xmlStrlen ÷ glibc_xmlStrlen): 9.905937 times
+xmlStrlen (bigger string): 388056.315979 μs
+glibc_xmlStrlen (bigger string): 12797.856995 μs
+delta (xmlStrlen ÷ glibc_xmlStrlen): 30.318382 times
+xmlStrlen (smallest string): 15870.046021 μs
+glibc_xmlStrlen (smallest string): 6282.208984 μs
+delta (xmlStrlen ÷ glibc_xmlStrlen): 2.527903 times
+See https://gitlab.gnome.org/GNOME/libxml2/-/issues/212 for reference.
+---
+ xmlstring.c | 9 ++-------
+ 1 file changed, 2 insertions(+), 7 deletions(-)
+diff --git a/xmlstring.c b/xmlstring.c
+index e8a1e45d..df247dff 100644
+--- a/xmlstring.c
++++ b/xmlstring.c
+@@ -423,14 +423,9 @@ xmlStrsub(const xmlChar *str, int start, int len) {
+ int
+ xmlStrlen(const xmlChar *str) {
+-    int len = 0;
+-
+     if (str == NULL) return(0);
+-    while (*str != 0) { /* non input consuming */
+-        str++;
+-        len++;
+-    }
+-    return(len);
++
++    return strlen((const char*)str);
+ }
+ /**
+--
+2.29.2