RubyGems - nokogumbo - Versions diffs - 2.0.0 → 2.0.5 - Mend

nokogumbo 2.0.0 → 2.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

checksums.yaml +4 -4
data/README.md +20 -4
data/ext/nokogumbo/extconf.rb +50 -27
data/ext/nokogumbo/nokogumbo.c +63 -14
data/gumbo-parser/src/error.c +17 -8
data/gumbo-parser/src/gumbo.h +27 -0
data/gumbo-parser/src/parser.c +476 -480
data/gumbo-parser/src/tokenizer.c +24 -27
data/gumbo-parser/src/tokenizer.h +2 -13
data/gumbo-parser/src/utf8.c +5 -0
data/gumbo-parser/src/utf8.h +1 -0
data/lib/nokogumbo.rb +22 -9
data/lib/nokogumbo/html5.rb +15 -14
data/lib/nokogumbo/html5/document.rb +7 -2
data/lib/nokogumbo/html5/document_fragment.rb +2 -1
data/lib/nokogumbo/version.rb +1 -1
metadata +3 -4

data/gumbo-parser/src/tokenizer.c CHANGED Viewed

@@ -20,10 +20,7 @@
  Coding conventions specific to this file:
  1. Functions that fill in a token should be named emit_*, and should be
-    followed immediately by a return from the tokenizer (true if no error
-    occurred, false if an error occurred). Sometimes the emit functions
-    themselves return a boolean so that they can be combined with the return
-    statement; in this case, they should match this convention.
+    followed immediately by a return from the tokenizer.
  2. Functions that shuffle data from temporaries to final API structures
     should be named finish_*, and be called just before the tokenizer exits the
     state that accumulates the temporary.
@@ -141,10 +138,6 @@ typedef struct GumboInternalTokenizerState {
   // text tokens emitted will be GUMBO_TOKEN_CDATA.
   bool _is_in_cdata;
-  // A flag indicating whether the tokenizer has seen a parse error since the
-  // last token was emitted.
-  bool _parse_error;
   // Certain states (notably character references) may emit two character tokens
   // at once, but the contract for lex() fills in only one token at a time. The
   // extra character is buffered here, and then this is checked on entry to
@@ -207,7 +200,6 @@ static void tokenizer_add_parse_error (
   GumboErrorType type
 ) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
-  tokenizer->_parse_error = true;
   GumboError* error = gumbo_add_error(parser);
   if (!error) {
     return;
@@ -228,7 +220,6 @@ static void tokenizer_add_char_ref_error (
   int codepoint
 ) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
-  tokenizer->_parse_error = true;
   GumboError* error = gumbo_add_error(parser);
   if (!error)
     return;
@@ -248,7 +239,6 @@ static void tokenizer_add_token_parse_error (
   GumboErrorType type
 ) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
-  tokenizer->_parse_error = true;
   GumboError* error = gumbo_add_error(parser);
   if (!error)
     return;
@@ -732,7 +722,10 @@ static void copy_over_original_tag_text (
   original_text->data = tag_state->_original_text;
   original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
                           tag_state->_original_text;
-  if (original_text->data[original_text->length - 1] == '\r') {
+  if (
+    original_text->length
+    && original_text->data[original_text->length - 1] == '\r'
+  ) {
     // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
     // appended to the end of original text even when it's really the first part
     // of the next character. If we detect this situation, shrink the length of
@@ -770,7 +763,6 @@ static void finish_tag_name(GumboParser* parser) {
 // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
 static void add_duplicate_attr_error(GumboParser* parser) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
-  tokenizer->_parse_error = true;
   GumboError* error = gumbo_add_error(parser);
   if (!error) {
     return;
@@ -788,17 +780,26 @@ static void add_duplicate_attr_error(GumboParser* parser) {
 // the attribute's name. The attribute's value starts out as the empty string
 // (following the "Boolean attributes" section of the spec) and is only
 // overwritten on finish_attribute_value(). If the attribute has already been
-// specified, the new attribute is dropped, a parse error is added, and the
-// function returns false. Otherwise, this returns true.
-static bool finish_attribute_name(GumboParser* parser) {
+// specified, the new attribute is dropped and a parse error is added
+static void finish_attribute_name(GumboParser* parser) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
   GumboTagState* tag_state = &tokenizer->_tag_state;
+  GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
+  int max_attributes = parser->_options->max_attributes;
+  if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) {
+    parser->_output->status = GUMBO_STATUS_TOO_MANY_ATTRIBUTES;
+    gumbo_debug("Attributes limit exceeded.\n");
+    reinitialize_tag_buffer(parser);
+    tag_state->_drop_next_attr_value = true;
+    return;
+  }
   // May've been set by a previous attribute without a value; reset it here.
   tag_state->_drop_next_attr_value = false;
   assert(tag_state->_attributes.data);
   assert(tag_state->_attributes.capacity);
-  GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
   for (unsigned int i = 0; i < attributes->length; ++i) {
     GumboAttribute* attr = attributes->data[i];
     if (
@@ -813,7 +814,7 @@ static bool finish_attribute_name(GumboParser* parser) {
       add_duplicate_attr_error(parser);
       reinitialize_tag_buffer(parser);
       tag_state->_drop_next_attr_value = true;
-      return false;
+      return;
     }
   }
@@ -835,7 +836,6 @@ static bool finish_attribute_name(GumboParser* parser) {
   );
   gumbo_vector_add(attr, attributes);
   reinitialize_tag_buffer(parser);
-  return true;
 }
 // Finishes an attribute value. This sets the value of the most recently added
@@ -881,7 +881,6 @@ void gumbo_tokenizer_state_init (
   tokenizer->_reconsume_current_input = false;
   tokenizer->_is_adjusted_current_node_foreign = false;
   tokenizer->_is_in_cdata = false;
-  tokenizer->_parse_error = false;
   tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
   tokenizer->_tag_state._name = NULL;
@@ -891,9 +890,9 @@ void gumbo_tokenizer_state_init (
   mark_tag_state_as_empty(&tokenizer->_tag_state);
-  tokenizer->_token_start = text;
   utf8iterator_init(parser, text, text_length, &tokenizer->_input);
   utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
+  tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
   doc_type_state_init(parser);
 }
@@ -3373,7 +3372,7 @@ static GumboLexerStateFunction dispatch_table[] = {
   [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
 };
-bool gumbo_lex(GumboParser* parser, GumboToken* output) {
+void gumbo_lex(GumboParser* parser, GumboToken* output) {
   // Because of the spec requirements that...
   //
   // 1. Tokens be handled immediately by the parser upon emission.
@@ -3398,15 +3397,13 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
     // isn't consumed twice.
     tokenizer->_reconsume_current_input = false;
     tokenizer->_buffered_emit_char = kGumboNoChar;
-    return true;
+    return;
   }
   if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
-    // Return no error.
-    return true;
+    return;
   }
-  tokenizer->_parse_error = false;
   while (1) {
     assert(!tokenizer->_resume_pos);
     assert(tokenizer->_buffered_emit_char == kGumboNoChar);
@@ -3420,7 +3417,7 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
     tokenizer->_reconsume_current_input = false;
     if (result == EMIT_TOKEN)
-      return !tokenizer->_parse_error;
+      return;
     if (should_advance) {
       utf8iterator_next(&tokenizer->_input);

data/gumbo-parser/src/tokenizer.h CHANGED Viewed

@@ -93,19 +93,8 @@ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
 );
 // Lexes a single token from the specified buffer, filling the output with the
-// parsed GumboToken data structure. Returns true for a successful
-// tokenization, false if a parse error occurs.
-//
-// Example:
-//   struct GumboInternalParser parser;
-//   GumboToken output;
-//   gumbo_tokenizer_state_init(&parser, text, strlen(text));
-//   while (gumbo_lex(&parser, &output)) {
-//     ...do stuff with output.
-//     gumbo_token_destroy(&token);
-//   }
-//   gumbo_tokenizer_state_destroy(&parser);
-bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
+// parsed GumboToken data structure.
+void gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
 // Frees the internally-allocated pointers within a GumboToken. Note that this
 // doesn't free the token itself, since oftentimes it will be allocated on the

data/gumbo-parser/src/utf8.c CHANGED Viewed

@@ -193,6 +193,11 @@ void utf8iterator_init (
   iter->_pos.offset = 0;
   iter->_parser = parser;
   read_char(iter);
+  if (iter->_current == kUtf8BomChar) {
+    iter->_start += iter->_width;
+    iter->_pos.offset += iter->_width;
+    read_char(iter);
+  }
 }
 void utf8iterator_next(Utf8Iterator* iter) {

data/gumbo-parser/src/utf8.h CHANGED Viewed

@@ -31,6 +31,7 @@ struct GumboInternalParser;
 // Unicode replacement char.
 #define kUtf8ReplacementChar 0xFFFD
+#define kUtf8BomChar 0xFEFF
 #define kUtf8MaxChar 0x10FFFF
 typedef struct GumboInternalUtf8Iterator {

data/lib/nokogumbo.rb CHANGED Viewed

@@ -1,14 +1,27 @@
 require 'nokogiri'
-require 'nokogumbo/version'
-require 'nokogumbo/html5'
-require 'nokogumbo/nokogumbo'
+if ((defined?(Nokogiri::HTML5) && Nokogiri::HTML5.respond_to?(:parse)) &&
+    (defined?(Nokogiri::Gumbo) && Nokogiri::Gumbo.respond_to?(:parse)) &&
+    !(ENV.key?("NOKOGUMBO_IGNORE_NOKOGIRI_HTML5") && ENV["NOKOGUMBO_IGNORE_NOKOGIRI_HTML5"] != "false"))
+  warn "NOTE: nokogumbo: Using Nokogiri::HTML5 provided by Nokogiri. See https://github.com/sparklemotion/nokogiri/issues/2205 for more information."
+  ::Nokogumbo = ::Nokogiri::Gumbo
+else
+  require 'nokogumbo/html5'
+  require 'nokogumbo/nokogumbo'
-module Nokogumbo
-  # The default maximum number of errors for parsing a document or a fragment.
-  DEFAULT_MAX_ERRORS = 0
+  module Nokogumbo
+    # The default maximum number of attributes per element.
+    DEFAULT_MAX_ATTRIBUTES = 400
-  # The default maximum depth of the DOM tree produced by parsing a document
-  # or fragment.
-  DEFAULT_MAX_TREE_DEPTH = 400
+    # The default maximum number of errors for parsing a document or a fragment.
+    DEFAULT_MAX_ERRORS = 0
+    # The default maximum depth of the DOM tree produced by parsing a document
+    # or fragment.
+    DEFAULT_MAX_TREE_DEPTH = 400
+  end
 end
+require 'nokogumbo/version'

data/lib/nokogumbo/html5.rb CHANGED Viewed

@@ -19,7 +19,7 @@ module Nokogiri
     # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
     def self.parse(string, url = nil, encoding = nil, **options, &block)
-      Document.parse(string, url, encoding, options, &block)
+      Document.parse(string, url, encoding, **options, &block)
     end
     # Parse a fragment from +string+. Convenience method for
@@ -92,19 +92,20 @@ module Nokogiri
         if encoding.nil?
           string = string.read
         else
-        string = string.read(encoding: encoding)
+          string = string.read(encoding: encoding)
         end
       else
         # Otherwise the string has the given encoding.
-        if encoding && string.respond_to?(:force_encoding)
+        string = string.to_s
+        if encoding
           string = string.dup
           string.force_encoding(encoding)
         end
       end
-      # convert to UTF-8 (Ruby 1.9+)
-      if string.respond_to?(:encoding) && string.encoding != Encoding::UTF_8
-        string = reencode(string.dup)
+      # convert to UTF-8
+      if string.encoding != Encoding::UTF_8
+        string = reencode(string)
       end
       string
     end
@@ -123,18 +124,17 @@ module Nokogiri
     # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
     #
     def self.reencode(body, content_type=nil)
-      return body unless body.respond_to? :encoding
       if body.encoding == Encoding::ASCII_8BIT
         encoding = nil
         # look for a Byte Order Mark (BOM)
-        if body[0..1] == "\xFE\xFF"
-          encoding = 'utf-16be'
-        elsif body[0..1] == "\xFF\xFE"
-          encoding = 'utf-16le'
-        elsif body[0..2] == "\xEF\xBB\xBF"
-          encoding = 'utf-8'
+        initial_bytes = body[0..2].bytes
+        if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
+          encoding = Encoding::UTF_8
+        elsif initial_bytes[0..1] == [0xFE, 0xFF]
+          encoding = Encoding::UTF_16BE
+        elsif initial_bytes[0..1] == [0xFF, 0xFE]
+          encoding = Encoding::UTF_16LE
         end
         # look for a charset in a content-encoding header
@@ -154,6 +154,7 @@ module Nokogiri
         encoding ||= Encoding::ISO_8859_1
         # change the encoding to match the detected or inferred encoding
+        body = body.dup
         begin
           body.force_encoding(encoding)
         rescue ArgumentError

data/lib/nokogumbo/html5/document.rb CHANGED Viewed

@@ -12,6 +12,9 @@ module Nokogiri
         if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
           url ||= string_or_io.path
         end
+        unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
+          raise ArgumentError.new("not a string or IO object")
+        end
         do_parse(string_or_io, url, encoding, options)
       end
@@ -21,7 +24,8 @@ module Nokogiri
       end
       def self.read_memory(string, url = nil, encoding = nil, **options)
-        do_parse(string.to_s, url, encoding, options)
+        raise ArgumentError.new("string object doesn't respond to :to_str") unless string.respond_to?(:to_str)
+        do_parse(string, url, encoding, options)
       end
       def fragment(tags = nil)
@@ -37,9 +41,10 @@ module Nokogiri
       private
       def self.do_parse(string_or_io, url, encoding, options)
         string = HTML5.read_and_encode(string_or_io, encoding)
+        max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
         max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
         max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
-        doc = Nokogumbo.parse(string.to_s, url, max_errors, max_depth)
+        doc = Nokogumbo.parse(string, url, max_attributes, max_errors, max_depth)
         doc.encoding = 'UTF-8'
         doc
       end

data/lib/nokogumbo/html5/document_fragment.rb CHANGED Viewed

@@ -12,10 +12,11 @@ module Nokogiri
         self.errors = []
         return self unless tags
+        max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
         max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
         max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
         tags = Nokogiri::HTML5.read_and_encode(tags, nil)
-        Nokogumbo.fragment(self, tags, ctx, max_errors, max_depth)
+        Nokogumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
       end
       def serialize(options = {}, &block)

data/lib/nokogumbo/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Nokogumbo
-  VERSION = "2.0.0"
+  VERSION = "2.0.5"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: nokogumbo
 version: !ruby/object:Gem::Version
-  version: 2.0.0
+  version: 2.0.5
 platform: ruby
 authors:
 - Sam Ruby
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-10-04 00:00:00.000000000 Z
+date: 2021-03-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -109,8 +109,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.6
+rubygems_version: 3.1.4
 signing_key:
 specification_version: 4
 summary: Nokogiri interface to the Gumbo HTML5 parser